1 21 package au.id.jericho.lib.html; 22 23 import java.util.*; 24 25 43 public abstract class CharacterReference extends Segment { 44 int codePoint; 45 46 51 public static final int INVALID_CODE_POINT=-1; 52 53 57 static final int MAX_CODE_POINT=0x10FFFF; 58 59 60 private static final int TAB_LENGTH=4; 61 62 CharacterReference(final Source source, final int begin, final int end, final int codePoint) { 63 super(source,begin,end); 64 this.codePoint=codePoint; 65 } 66 67 71 public int getCodePoint() { 72 return codePoint; 73 } 74 75 84 public char getChar() { 85 return (char)codePoint; 86 } 87 88 104 public boolean isTerminated() { 105 return source.charAt(end-1)==';'; 106 } 107 108 129 public static String encode(final CharSequence unencodedText) { 130 if (unencodedText==null) return null; 131 return appendEncode(new StringBuffer (unencodedText.length()*2),unencodedText,false).toString(); 132 } 133 134 159 public static String encodeWithWhiteSpaceFormatting(final CharSequence unencodedText) { 160 if (unencodedText==null) return null; 161 return appendEncode(new StringBuffer (unencodedText.length()*2),unencodedText,true).toString(); 162 } 163 164 183 public static String decode(final CharSequence encodedText) { 184 return decode(encodedText,false); 185 } 186 187 207 public static String decode(final CharSequence encodedText, final boolean insideAttributeValue) { 208 if (encodedText==null) return null; 209 final String encodedString=encodedText.toString(); 210 final int pos=encodedString.indexOf('&'); 211 if (pos==-1) return encodedString; 212 return appendDecode(new StringBuffer (encodedString.length()),encodedString,pos,insideAttributeValue).toString(); 213 } 214 215 228 public static String decodeCollapseWhiteSpace(final CharSequence text) { 229 return decode(appendCollapseWhiteSpace(new StringBuffer (text.length()),text)); 230 } 231 232 244 public static String reencode(final CharSequence encodedText) { 245 return encode(decode(encodedText,true)); 246 } 247 248 265 public abstract String getCharacterReferenceString(); 266 267 288 public static String getCharacterReferenceString(final int codePoint) { 289 String characterReferenceString=null; 290 if (codePoint!=CharacterEntityReference._apos) characterReferenceString=CharacterEntityReference.getCharacterReferenceString(codePoint); 291 if (characterReferenceString==null) characterReferenceString=NumericCharacterReference.getCharacterReferenceString(codePoint); 292 return characterReferenceString; 293 } 294 295 309 public String getDecimalCharacterReferenceString() { 310 return getDecimalCharacterReferenceString(codePoint); 311 } 312 313 326 public static String getDecimalCharacterReferenceString(final int codePoint) { 327 return appendDecimalCharacterReferenceString(new StringBuffer (),codePoint).toString(); 328 } 329 330 344 public String getHexadecimalCharacterReferenceString() { 345 return getHexadecimalCharacterReferenceString(codePoint); 346 } 347 348 361 public static String getHexadecimalCharacterReferenceString(final int codePoint) { 362 return appendHexadecimalCharacterReferenceString(new StringBuffer (),codePoint).toString(); 363 } 364 365 378 public String getUnicodeText() { 379 return getUnicodeText(codePoint); 380 } 381 382 393 public static String getUnicodeText(final int codePoint) { 394 return appendUnicodeText(new StringBuffer (),codePoint).toString(); 395 } 396 397 static final StringBuffer appendUnicodeText(final StringBuffer sb, final int codePoint) { 398 sb.append("U+"); 399 final String hex=Integer.toString(codePoint,16).toUpperCase(); 400 for (int i=4-hex.length(); i>0; i--) sb.append('0'); 401 sb.append(hex); 402 return sb; 403 } 404 405 427 public static CharacterReference parse(final CharSequence characterReferenceText) { 428 return construct(new Source(characterReferenceText.toString()),0,Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL); 429 } 430 431 450 public static int getCodePointFromCharacterReferenceString(final CharSequence characterReferenceText) { 451 final CharacterReference characterReference=parse(characterReferenceText); 452 return (characterReference!=null) ? characterReference.getCodePoint() : INVALID_CODE_POINT; 453 } 454 455 467 public static final boolean requiresEncoding(final char ch) { 468 return ch>127 || (CharacterEntityReference.getName(ch)!=null && (ch!='\'' || Config.IsApostropheEncoded)); 469 } 470 471 static StringBuffer appendEncode(final StringBuffer sb, CharSequence unencodedText, final boolean whiteSpaceFormatting) { 472 if (unencodedText==null) return sb; 473 int beginPos=0; 474 int endPos=unencodedText.length(); 475 if (unencodedText instanceof Segment) { 476 final Segment segment=(Segment)unencodedText; 478 final int segmentOffset=segment.getBegin(); 479 beginPos=segmentOffset; 480 endPos+=segmentOffset; 481 unencodedText=segment.source.string; 482 } 483 final boolean isApostropheEncoded=Config.IsApostropheEncoded; 484 for (int i=beginPos; i<endPos; i++) { 485 char ch=unencodedText.charAt(i); 486 final String characterEntityReferenceName=CharacterEntityReference.getName(ch); 487 if (characterEntityReferenceName!=null) { 488 if (ch=='\'') { 489 if (isApostropheEncoded) 490 sb.append("'"); 491 else 492 sb.append(ch); 493 } else { 494 CharacterEntityReference.appendCharacterReferenceString(sb,characterEntityReferenceName); 495 } 496 } else if (ch>127) { 497 appendDecimalCharacterReferenceString(sb,ch); 498 } else if (!(whiteSpaceFormatting && isWhiteSpace(ch))) { 499 sb.append(ch); 500 } else { 501 int spaceCount; 503 int nexti=i+1; 504 if (ch!=' ') { 505 if (ch!='\t') { 506 if (ch=='\r' && nexti<endPos && unencodedText.charAt(nexti)=='\n') i++; sb.append("<br />"); continue; 510 } else { 511 spaceCount=TAB_LENGTH; 512 } 513 } else { 514 spaceCount=1; 515 } 516 while (nexti<endPos) { 517 ch=unencodedText.charAt(nexti); 518 if (ch==' ') 519 spaceCount+=1; 520 else if (ch=='\t') 521 spaceCount+=TAB_LENGTH; 522 else 523 break; 524 nexti++; 525 } 526 if (spaceCount==1) { 527 sb.append(' '); 529 continue; 530 } 531 if (spaceCount%2==1) sb.append(' '); while (spaceCount>=2) { 533 sb.append(" "); spaceCount-=2; 535 } 536 i=nexti-1; } 539 } 540 return sb; 541 } 542 543 static CharacterReference findPreviousOrNext(final Source source, final int pos, final boolean previous) { 544 return findPreviousOrNext(source,pos,Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL,previous); 545 } 546 547 private static CharacterReference findPreviousOrNext(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings, final boolean previous) { 548 final ParseText parseText=source.getParseText(); 549 pos=previous ? parseText.lastIndexOf('&',pos) : parseText.indexOf('&',pos); 550 while (pos!=-1) { 551 final CharacterReference characterReference=construct(source,pos,unterminatedCharacterReferenceSettings); 552 if (characterReference!=null) return characterReference; 553 pos=previous ? parseText.lastIndexOf('&',pos-1) : parseText.indexOf('&',pos+1); 554 } 555 return null; 556 } 557 558 static final StringBuffer appendHexadecimalCharacterReferenceString(final StringBuffer sb, final int codePoint) { 559 return sb.append("&#x").append(Integer.toString(codePoint,16)).append(';'); 560 } 561 562 static final StringBuffer appendDecimalCharacterReferenceString(final StringBuffer sb, final int codePoint) { 563 return sb.append("&#").append(codePoint).append(';'); 564 } 565 566 private static CharacterReference construct(final Source source, final int begin, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) { 567 try { 568 if (source.getParseText().charAt(begin)!='&') return null; 569 return (source.getParseText().charAt(begin+1)=='#') 570 ? NumericCharacterReference.construct(source,begin,unterminatedCharacterReferenceSettings) 571 : CharacterEntityReference.construct(source,begin,unterminatedCharacterReferenceSettings.characterEntityReferenceMaxCodePoint); 572 } catch (IndexOutOfBoundsException ex) { 573 return null; 574 } 575 } 576 577 private static StringBuffer appendDecode(final StringBuffer sb, final String encodedString, int pos, final boolean insideAttributeValue) { 578 final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings=Config.CurrentCompatibilityMode.getUnterminatedCharacterReferenceSettings(insideAttributeValue); 579 int lastEnd=0; 580 final Source source=new Source(encodedString); 581 while (true) { 582 final CharacterReference characterReference=findPreviousOrNext(source,pos,unterminatedCharacterReferenceSettings,false); 583 if (characterReference==null) break; 584 if (lastEnd!=characterReference.getBegin()) Util.appendTo(sb,encodedString,lastEnd,characterReference.getBegin()); 585 sb.append((char)characterReference.codePoint); 586 pos=lastEnd=characterReference.getEnd(); 587 } 588 if (lastEnd!=encodedString.length()) Util.appendTo(sb,encodedString,lastEnd,encodedString.length()); 589 return sb; 590 } 591 } 592 | Popular Tags |