package org.lobobrowser.html.parser;

import java.io.*;
import java.util.*;
import org.w3c.dom.html2.*;
import org.w3c.dom.Node;
import org.w3c.dom.Element;
import org.w3c.dom.Document;
import org.xml.sax.ErrorHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.lobobrowser.html.*;
import org.lobobrowser.html.io.*;
import org.mozilla.javascript.*;

/**
 * A lenient, hand-written HTML parser that reads characters from a reader and
 * appends the resulting text, comment and element nodes to a DOM node.
 * <p>
 * An instance keeps tokenizer state in fields (see {@link #justReadTagBegin}
 * and friends) that is mutated by every read helper, so a single instance must
 * not be used to parse two inputs at the same time.
 */
public class HtmlParser {
    private final HTMLDocument document;
    private final UserAgentContext ucontext;
    private final ErrorHandler errorHandler;
    private final String publicId;
    private final String systemId;

    /** Entity name (without '&amp;' and ';') to the character it decodes to. */
    private static final Map<String, Character> ENTITIES = new HashMap<String, Character>();

    /** Upper-case tag name to parsing rules; tags that are absent use the defaults. */
    private static final Map<String, ElementInfo> ELEMENT_INFOS = new HashMap<String, ElementInfo>();

    /**
     * User-data key that this parser sets to {@link Boolean#TRUE} on a node
     * while it is appending to that node, and back to {@link Boolean#FALSE}
     * once it is done with it.
     */
    public static final String MODIFYING_KEY = "cobra.suspend";

    /** Code point of the first entry of {@link #LATIN1_ENTITY_NAMES}. */
    private static final int LATIN1_FIRST_CODE = 160;

    /**
     * Entity names for the code points 160 through 255, in code point order:
     * the name at index {@code i} decodes to {@code (char) (160 + i)}.
     */
    private static final String[] LATIN1_ENTITY_NAMES = {
        "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
        "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
        "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
        "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
        "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
        "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
        "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
        "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
        "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
        "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
        "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
        "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
    };

    static {
        Map<String, Character> entities = ENTITIES;
        entities.put("amp", Character.valueOf('&'));
        entities.put("lt", Character.valueOf('<'));
        entities.put("gt", Character.valueOf('>'));
        entities.put("quot", Character.valueOf('"'));

        // Typographic quotes use their real code points (U+2018/U+2019 and
        // U+201C/U+201D) rather than ASCII look-alikes.
        entities.put("lsquo", Character.valueOf((char) 8216));
        entities.put("rsquo", Character.valueOf((char) 8217));
        entities.put("ldquo", Character.valueOf((char) 8220));
        entities.put("rdquo", Character.valueOf((char) 8221));

        // Deliberately kept as the plain ASCII slash.
        entities.put("frasl", Character.valueOf((char) 47));
        entities.put("ndash", Character.valueOf((char) 8211));
        entities.put("mdash", Character.valueOf((char) 8212));
        entities.put("bull", Character.valueOf((char) 8226));
        entities.put("hellip", Character.valueOf((char) 8230));
        entities.put("euro", Character.valueOf((char) 8364));
        entities.put("trade", Character.valueOf((char) 8482));

        for (int i = 0; i < LATIN1_ENTITY_NAMES.length; i++) {
            entities.put(LATIN1_ENTITY_NAMES[i], Character.valueOf((char) (LATIN1_FIRST_CODE + i)));
        }
        // Alternative names for three of the Latin-1 characters.
        entities.put("brkbar", Character.valueOf((char) 166));
        entities.put("die", Character.valueOf((char) 168));
        entities.put("hibar", Character.valueOf((char) 175));

        Map<String, ElementInfo> elementInfos = ELEMENT_INFOS;

        elementInfos.put("NOSCRIPT", new ElementInfo(true, ElementInfo.END_ELEMENT_REQUIRED, null, true));

        ElementInfo optionalEndElement = new ElementInfo(true, ElementInfo.END_ELEMENT_OPTIONAL);
        ElementInfo forbiddenEndElement = new ElementInfo(false, ElementInfo.END_ELEMENT_FORBIDDEN);
        ElementInfo onlyText = new ElementInfo(false, ElementInfo.END_ELEMENT_REQUIRED);

        Set<String> tableCellStopElements = new HashSet<String>();
        tableCellStopElements.add("TH");
        tableCellStopElements.add("TD");
        tableCellStopElements.add("TR");
        ElementInfo tableCellElement = new ElementInfo(true, ElementInfo.END_ELEMENT_OPTIONAL, tableCellStopElements);

        Set<String> headStopElements = new HashSet<String>();
        headStopElements.add("BODY");
        headStopElements.add("DIV");
        headStopElements.add("SPAN");
        headStopElements.add("TABLE");
        ElementInfo headElement = new ElementInfo(true, ElementInfo.END_ELEMENT_OPTIONAL, headStopElements);

        Set<String> optionStopElements = new HashSet<String>();
        optionStopElements.add("OPTION");
        optionStopElements.add("SELECT");
        ElementInfo optionElement = new ElementInfo(true, ElementInfo.END_ELEMENT_OPTIONAL, optionStopElements);

        Set<String> paragraphStopElements = new HashSet<String>();
        paragraphStopElements.add("P");
        paragraphStopElements.add("DIV");
        paragraphStopElements.add("TABLE");
        paragraphStopElements.add("PRE");
        paragraphStopElements.add("UL");
        paragraphStopElements.add("OL");
        ElementInfo paragraphElement = new ElementInfo(true, ElementInfo.END_ELEMENT_OPTIONAL, paragraphStopElements);

        elementInfos.put("SCRIPT", onlyText);
        elementInfos.put("STYLE", onlyText);
        elementInfos.put("TEXTAREA", onlyText);
        elementInfos.put("IMG", forbiddenEndElement);
        elementInfos.put("META", forbiddenEndElement);
        elementInfos.put("LINK", forbiddenEndElement);
        elementInfos.put("BASE", forbiddenEndElement);
        elementInfos.put("INPUT", forbiddenEndElement);
        elementInfos.put("FRAME", forbiddenEndElement);
        elementInfos.put("BR", forbiddenEndElement);
        elementInfos.put("HR", forbiddenEndElement);
        elementInfos.put("EMBED", forbiddenEndElement);
        elementInfos.put("SPACER", forbiddenEndElement);

        elementInfos.put("P", paragraphElement);
        elementInfos.put("LI", optionalEndElement);
        elementInfos.put("DT", optionalEndElement);
        elementInfos.put("DD", optionalEndElement);
        elementInfos.put("TR", optionalEndElement);
        elementInfos.put("TH", tableCellElement);
        elementInfos.put("TD", tableCellElement);
        elementInfos.put("HEAD", headElement);
        elementInfos.put("OPTION", optionElement);
    }

    /**
     * Creates a parser without a user agent context.
     *
     * @param context      not used by this implementation
     * @param document     document used to create nodes
     * @param errorHandler receives bad-entity errors; may be {@code null}
     * @param publicId     public id reported in error locations
     * @param systemId     system id reported in error locations
     */
    public HtmlParser(HtmlParserContext context, HTMLDocument document, ErrorHandler errorHandler, String publicId, String systemId) {
        this((UserAgentContext) null, document, errorHandler, publicId, systemId);
    }

    /**
     * Creates a parser without a user agent context.
     *
     * @param document     document used to create nodes
     * @param errorHandler receives bad-entity errors; may be {@code null}
     * @param publicId     public id reported in error locations
     * @param systemId     system id reported in error locations
     */
    public HtmlParser(HTMLDocument document, ErrorHandler errorHandler, String publicId, String systemId) {
        this((UserAgentContext) null, document, errorHandler, publicId, systemId);
    }

    /**
     * Creates a parser.
     *
     * @param ucontext     consulted for {@code isScriptingEnabled()} when a
     *                     NOSCRIPT element is met; {@code null} is treated as
     *                     scripting enabled
     * @param document     document used to create nodes
     * @param errorHandler receives bad-entity errors; may be {@code null}
     * @param publicId     public id reported in error locations
     * @param systemId     system id reported in error locations
     */
    public HtmlParser(UserAgentContext ucontext, HTMLDocument document, ErrorHandler errorHandler, String publicId, String systemId) {
        this.ucontext = ucontext;
        this.document = document;
        this.errorHandler = errorHandler;
        this.publicId = publicId;
        this.systemId = systemId;
    }

    /**
     * Creates a parser with no error handler and no public/system id.
     *
     * @param ucontext see the five-argument constructor
     * @param document document used to create nodes
     */
    public HtmlParser(UserAgentContext ucontext, HTMLDocument document) {
        this(ucontext, document, null, null, null);
    }

    /**
     * Parses a stream decoded as ISO-8859-1 into the parser's document.
     *
     * @param in the HTML input
     * @throws IOException  if reading fails
     * @throws SAXException if the error handler throws or parsing aborts
     */
    public void parse(InputStream in) throws IOException, SAXException, UnsupportedEncodingException {
        this.parse(in, "ISO-8859-1");
    }

    /**
     * Parses a stream decoded with the given charset into the parser's document.
     *
     * @param in      the HTML input
     * @param charset name of the charset used to decode {@code in}
     * @throws UnsupportedEncodingException if {@code charset} is not supported
     * @throws IOException                  if reading fails
     * @throws SAXException                 if the error handler throws or parsing aborts
     */
    public void parse(InputStream in, String charset) throws IOException, SAXException, UnsupportedEncodingException {
        WritableLineReader reader = new WritableLineReader(new InputStreamReader(in, charset));
        this.parse(reader);
    }

    /**
     * Parses character input into the parser's document.
     *
     * @param reader the HTML input; wrapped in a {@link LineNumberReader}
     */
    public void parse(Reader reader) throws IOException, SAXException {
        this.parse(new LineNumberReader(reader));
    }

    /**
     * Parses character input into the parser's document.
     *
     * @param reader the HTML input
     */
    public void parse(LineNumberReader reader) throws IOException, SAXException {
        Document doc = this.document;
        this.parse(reader, doc);
    }

    /**
     * Parses character input, appending the resulting nodes to {@code parent}.
     *
     * @param reader the HTML input; wrapped in a {@link LineNumberReader}
     * @param parent node that receives the parsed children
     */
    public void parse(Reader reader, Node parent) throws IOException, SAXException {
        this.parse(new LineNumberReader(reader), parent);
    }

    /**
     * Parses character input, appending the resulting nodes to {@code parent}.
     * {@link #MODIFYING_KEY} is set to TRUE on {@code parent} for the duration
     * of the call and to FALSE afterwards, even on failure.
     *
     * @param reader the HTML input
     * @param parent node that receives the parsed children
     * @throws SAXException if a stop signal escapes the top level or the error
     *                      handler throws
     */
    public void parse(LineNumberReader reader, Node parent) throws IOException, SAXException {
        try {
            parent.setUserData(MODIFYING_KEY, Boolean.TRUE, null);
            try {
                while (this.parseToken(parent, reader, null, new LinkedList()) != TOKEN_EOD) {
                    // Keep consuming tokens until the end of the input.
                }
            } catch (StopException se) {
                throw new SAXException("Unexpected flow exception", se);
            }
        } finally {
            parent.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
        }
    }

    private static final int TOKEN_EOD = 0;
    private static final int TOKEN_COMMENT = 1;
    private static final int TOKEN_TEXT = 2;
    private static final int TOKEN_BEGIN_ELEMENT = 3;
    private static final int TOKEN_END_ELEMENT = 4;
    private static final int TOKEN_FULL_ELEMENT = 5;
    private static final int TOKEN_BAD = 6;

    /** Upper-case name of the tag handled by the most recent token, or null after plain text. */
    private String normalLastTag = null;

    /** True when the last character consumed was the '&lt;' that opens a tag. */
    private boolean justReadTagBegin = false;

    /** True when the last character consumed was the '&gt;' that closes a tag. */
    private boolean justReadTagEnd = false;

    /** True when the tag just closed ended with "/&gt;". */
    private boolean justReadEmptyElement = false;

    /**
     * Reads one token (leading text plus at most one tag) and appends what it
     * produces to {@code parent}. For an element with children this recurses
     * until the element is closed.
     *
     * @param parent    node that receives the new children
     * @param reader    the input
     * @param stopTags  upper-case tag names that must not be opened at this
     *                  level; meeting one throws {@link StopException} carrying
     *                  the already-created element. May be {@code null}.
     * @param ancestors upper-case names of the currently open elements,
     *                  innermost first
     * @return one of the {@code TOKEN_*} constants
     * @throws StopException when a tag listed in {@code stopTags} is opened
     */
    private final int parseToken(Node parent, LineNumberReader reader, Set stopTags, LinkedList ancestors) throws IOException, StopException, SAXException {
        // NOTE: Set/LinkedList stay raw here because ElementInfo.stopTags is
        // declared outside this file.
        Document doc = this.document;
        StringBuffer textSb = this.readUpToTagBegin(reader);
        if (textSb == null) {
            return TOKEN_EOD;
        }
        if (textSb.length() != 0) {
            int textLine = reader.getLineNumber();
            StringBuffer decText = this.entityDecode(textSb, textLine);
            parent.appendChild(doc.createTextNode(decText.toString()));
        }
        if (!this.justReadTagBegin) {
            this.normalLastTag = null;
            return TOKEN_TEXT;
        }
        String tag = this.readTag(reader);
        if (tag == null) {
            return TOKEN_EOD;
        }
        // Locale-independent: the default locale must not change tag lookups
        // (e.g. Turkish maps 'i' to a dotted capital I).
        String normalTag = tag.toUpperCase(Locale.ENGLISH);
        try {
            if (tag.startsWith("!")) {
                if ("!--".equals(tag)) {
                    int commentLine = reader.getLineNumber();
                    StringBuffer comment = this.passEndOfComment(reader);
                    StringBuffer decText = this.entityDecode(comment, commentLine);
                    parent.appendChild(doc.createComment(decText.toString()));
                    return TOKEN_COMMENT;
                }
                // Any other "<!...>" declaration is skipped.
                this.passEndOfTag(reader);
                return TOKEN_BAD;
            }
            if (tag.startsWith("/")) {
                normalTag = normalTag.substring(1);
                this.passEndOfTag(reader);
                return TOKEN_END_ELEMENT;
            }
            Element element = doc.createElement(tag);
            element.setUserData(MODIFYING_KEY, Boolean.TRUE, null);
            try {
                if (!this.justReadTagEnd) {
                    while (this.readAttribute(reader, element)) {
                        // Keep reading attributes until the tag is closed.
                    }
                }
                if (stopTags != null && stopTags.contains(normalTag)) {
                    // An enclosing element is implicitly closed by this tag;
                    // hand the new element up to whoever owns that element.
                    throw new StopException(element);
                }
                parent.appendChild(element);
                if (!this.justReadEmptyElement) {
                    ElementInfo einfo = ELEMENT_INFOS.get(normalTag);
                    int endTagType = einfo == null ? ElementInfo.END_ELEMENT_REQUIRED : einfo.endElementType;
                    if (endTagType != ElementInfo.END_ELEMENT_FORBIDDEN) {
                        boolean childrenOk = einfo == null ? true : einfo.childElementOk;
                        Set newStopSet = einfo == null ? null : einfo.stopTags;
                        if (newStopSet == null) {
                            if (endTagType == ElementInfo.END_ELEMENT_OPTIONAL) {
                                newStopSet = Collections.singleton(normalTag);
                            }
                        }
                        if (stopTags != null) {
                            if (newStopSet != null) {
                                Set newStopSet2 = new HashSet();
                                newStopSet2.addAll(stopTags);
                                newStopSet2.addAll(newStopSet);
                                newStopSet = newStopSet2;
                            } else {
                                newStopSet = endTagType == ElementInfo.END_ELEMENT_REQUIRED ? null : stopTags;
                            }
                        }
                        ancestors.addFirst(normalTag);
                        try {
                            for (;;) {
                                try {
                                    int token;
                                    if (einfo != null && einfo.noScriptElement) {
                                        UserAgentContext ucontext = this.ucontext;
                                        if (ucontext == null || ucontext.isScriptingEnabled()) {
                                            // Content is skipped up to the end tag.
                                            token = this.parseForEndTag(parent, reader, tag, false);
                                        } else {
                                            token = this.parseToken(element, reader, newStopSet, ancestors);
                                        }
                                    } else {
                                        token = childrenOk
                                                ? this.parseToken(element, reader, newStopSet, ancestors)
                                                : this.parseForEndTag(element, reader, tag, true);
                                    }
                                    if (token == TOKEN_END_ELEMENT) {
                                        String closedTag = this.normalLastTag;
                                        if (normalTag.equals(closedTag)) {
                                            return TOKEN_FULL_ELEMENT;
                                        }
                                        ElementInfo closeTagInfo = ELEMENT_INFOS.get(closedTag);
                                        if (closeTagInfo == null || closeTagInfo.endElementType != ElementInfo.END_ELEMENT_FORBIDDEN) {
                                            // If the end tag closes an outer open element, propagate
                                            // it upwards; otherwise it is ignored.
                                            Iterator i = ancestors.iterator();
                                            if (i.hasNext()) {
                                                i.next(); // skip this element itself
                                                while (i.hasNext()) {
                                                    String normalAncestorTag = (String) i.next();
                                                    if (closedTag.equals(normalAncestorTag)) {
                                                        // Keeps normalLastTag intact in the finally below.
                                                        normalTag = closedTag;
                                                        return TOKEN_END_ELEMENT;
                                                    }
                                                }
                                            }
                                        }
                                    } else if (token == TOKEN_EOD) {
                                        return TOKEN_EOD;
                                    }
                                } catch (StopException se) {
                                    // A child tag implicitly closed this element. The element
                                    // carried by the exception becomes the current one.
                                    Element newElement = se.getElement();
                                    tag = newElement.getTagName();
                                    normalTag = tag.toUpperCase(Locale.ENGLISH);
                                    if (stopTags != null && stopTags.contains(normalTag)) {
                                        throw se;
                                    }
                                    einfo = ELEMENT_INFOS.get(normalTag);
                                    endTagType = einfo == null ? ElementInfo.END_ELEMENT_REQUIRED : einfo.endElementType;
                                    childrenOk = einfo == null ? true : einfo.childElementOk;
                                    newStopSet = einfo == null ? null : einfo.stopTags;
                                    if (newStopSet == null) {
                                        if (endTagType == ElementInfo.END_ELEMENT_OPTIONAL) {
                                            newStopSet = Collections.singleton(normalTag);
                                        }
                                    }
                                    if (stopTags != null && newStopSet != null) {
                                        Set newStopSet2 = new HashSet();
                                        newStopSet2.addAll(stopTags);
                                        newStopSet2.addAll(newStopSet);
                                        newStopSet = newStopSet2;
                                    }
                                    ancestors.removeFirst();
                                    ancestors.addFirst(normalTag);
                                    element.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
                                    element = newElement;
                                    parent.appendChild(element);
                                    if (this.justReadEmptyElement) {
                                        return TOKEN_BEGIN_ELEMENT;
                                    }
                                }
                            }
                        } finally {
                            ancestors.removeFirst();
                        }
                    }
                }
                return TOKEN_BEGIN_ELEMENT;
            } finally {
                element.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
            }
        } finally {
            this.normalLastTag = normalTag;
        }
    }

    /**
     * Reads text up to, and including, the next '&lt;'.
     *
     * @return the text before the '&lt;' (possibly empty); at end of input the
     *         remaining text, or {@code null} if nothing was left to read
     */
    private final StringBuffer readUpToTagBegin(LineNumberReader reader) throws IOException, SAXException {
        StringBuffer sb = null;
        int intCh;
        while ((intCh = reader.read()) != -1) {
            char ch = (char) intCh;
            if (ch == '<') {
                this.justReadTagBegin = true;
                this.justReadTagEnd = false;
                this.justReadEmptyElement = false;
                if (sb == null) {
                    sb = new StringBuffer(0);
                }
                return sb;
            }
            if (sb == null) {
                sb = new StringBuffer();
            }
            sb.append(ch);
        }
        this.justReadTagBegin = false;
        this.justReadTagEnd = false;
        this.justReadEmptyElement = false;
        return sb;
    }

    /**
     * Consumes raw text until the end tag {@code </tagName>} (matched
     * case-insensitively, surrounding whitespace ignored) or the end of input.
     * The text is not entity-decoded.
     *
     * @param parent      receives the text node when {@code addTextNode} is true
     * @param reader      the input
     * @param tagName     name of the element being closed
     * @param addTextNode whether non-empty text is appended to {@code parent}
     * @return {@code TOKEN_END_ELEMENT} if the end tag was found, otherwise
     *         {@code TOKEN_EOD}
     */
    private final int parseForEndTag(Node parent, LineNumberReader reader, String tagName, boolean addTextNode) throws IOException {
        StringBuffer sb = new StringBuffer();
        int intCh = reader.read();
        while (intCh != -1) {
            if (intCh != '<') {
                sb.append((char) intCh);
                intCh = reader.read();
                continue;
            }
            intCh = reader.read();
            if (intCh != '/') {
                // A literal '<'. The character after it is examined by the next
                // iteration, so that "<</tag>" still finds the end tag.
                sb.append('<');
                continue;
            }
            StringBuffer candidate = new StringBuffer();
            boolean closed = false;
            while ((intCh = reader.read()) != -1) {
                if (intCh == '>') {
                    closed = true;
                    break;
                }
                candidate.append((char) intCh);
            }
            if (closed) {
                String thisTag = candidate.toString().trim();
                if (thisTag.equalsIgnoreCase(tagName)) {
                    this.justReadTagBegin = false;
                    this.justReadTagEnd = true;
                    this.justReadEmptyElement = false;
                    this.normalLastTag = thisTag.toUpperCase(Locale.ENGLISH);
                    if (addTextNode) {
                        this.appendTextNode(parent, sb);
                    }
                    return HtmlParser.TOKEN_END_ELEMENT;
                }
            }
            // Some other (or unterminated) end tag: it is part of the text.
            sb.append("</");
            sb.append(candidate);
            if (closed) {
                sb.append('>');
                intCh = reader.read();
            }
            // Otherwise intCh is -1 and the loop ends.
        }
        this.justReadTagBegin = false;
        this.justReadTagEnd = false;
        this.justReadEmptyElement = false;
        if (addTextNode) {
            this.appendTextNode(parent, sb);
        }
        return HtmlParser.TOKEN_EOD;
    }

    /** Appends {@code text} to {@code parent} as a text node unless it is empty. */
    private void appendTextNode(Node parent, StringBuffer text) {
        if (text.length() != 0) {
            parent.appendChild(this.document.createTextNode(text.toString()));
        }
    }

    /**
     * Reads a tag name; the opening '&lt;' has already been consumed. A leading
     * '/' or '!' is part of the returned name, and a comment opener is returned
     * as exactly {@code "!--"}.
     *
     * @return the tag name as written; may be empty
     */
    private final String readTag(LineNumberReader reader) throws IOException {
        StringBuffer sb = new StringBuffer();
        int chInt = reader.read();
        if (chInt != -1) {
            boolean cont = true;
            char ch = (char) chInt;
            if (ch == '!') {
                sb.append('!');
                chInt = reader.read();
                if (chInt != -1) {
                    ch = (char) chInt;
                    if (ch == '-') {
                        sb.append('-');
                        chInt = reader.read();
                        if (chInt != -1) {
                            ch = (char) chInt;
                            if (ch == '-') {
                                // "<!--": stop right here so the comment body is untouched.
                                sb.append('-');
                                cont = false;
                            }
                        } else {
                            cont = false;
                        }
                    }
                } else {
                    cont = false;
                }
            } else if (ch == '/') {
                sb.append(ch);
                chInt = reader.read();
                if (chInt != -1) {
                    ch = (char) chInt;
                } else {
                    cont = false;
                }
            }
            if (cont) {
                boolean lastCharSlash = false;
                for (;;) {
                    if (Character.isWhitespace(ch)) {
                        break;
                    } else if (ch == '>') {
                        this.justReadTagEnd = true;
                        this.justReadTagBegin = false;
                        this.justReadEmptyElement = lastCharSlash;
                        return sb.toString();
                    } else if (ch == '/') {
                        // Held back: it is only part of the name if more name follows.
                        lastCharSlash = true;
                    } else {
                        if (lastCharSlash) {
                            sb.append('/');
                        }
                        lastCharSlash = false;
                        sb.append(ch);
                    }
                    chInt = reader.read();
                    if (chInt == -1) {
                        break;
                    }
                    ch = (char) chInt;
                }
            }
        }
        if (sb.length() > 0) {
            this.justReadTagEnd = false;
            this.justReadTagBegin = false;
            this.justReadEmptyElement = false;
        }
        return sb.toString();
    }

    /**
     * Reads a comment body; the opening {@code <!--} has already been consumed.
     * The comment ends at two or more dashes followed by optional whitespace
     * and '&gt;'. Dashes beyond the closing pair (as in {@code --->}) belong to
     * the body, and whitespace between the closing dashes and '&gt;' is dropped.
     * At end of input everything read so far is returned.
     *
     * @return the comment text without its delimiters
     */
    private final StringBuffer passEndOfComment(LineNumberReader reader) throws IOException {
        if (this.justReadTagEnd) {
            return new StringBuffer(0);
        }
        StringBuffer sb = new StringBuffer();
        // Consecutive dashes read but not yet known to be content or terminator.
        int pendingDashes = 0;
        // Whitespace read after at least two pending dashes; null when none.
        StringBuffer pendingSpace = null;
        int chInt;
        while ((chInt = reader.read()) != -1) {
            char ch = (char) chInt;
            if (ch == '-') {
                if (pendingSpace != null) {
                    // "-- -": the earlier dashes and whitespace were content.
                    appendDashes(sb, pendingDashes);
                    sb.append(pendingSpace);
                    pendingSpace = null;
                    pendingDashes = 0;
                }
                pendingDashes++;
            } else if (ch == '>' && pendingDashes >= 2) {
                appendDashes(sb, pendingDashes - 2);
                this.justReadTagBegin = false;
                this.justReadTagEnd = true;
                return sb;
            } else if (pendingDashes >= 2 && Character.isWhitespace(ch)) {
                if (pendingSpace == null) {
                    pendingSpace = new StringBuffer();
                }
                pendingSpace.append(ch);
            } else {
                appendDashes(sb, pendingDashes);
                pendingDashes = 0;
                if (pendingSpace != null) {
                    sb.append(pendingSpace);
                    pendingSpace = null;
                }
                sb.append(ch);
            }
        }
        // Unterminated comment: keep what was read.
        appendDashes(sb, pendingDashes);
        if (pendingSpace != null) {
            sb.append(pendingSpace);
        }
        if (sb.length() > 0) {
            this.justReadTagBegin = false;
            this.justReadTagEnd = false;
        }
        return sb;
    }

    /** Appends {@code count} dash characters to {@code sb}. */
    private static void appendDashes(StringBuffer sb, int count) {
        for (int i = 0; i < count; i++) {
            sb.append('-');
        }
    }

    /**
     * Skips input up to and including the next '&gt;', unless the current tag
     * has already been closed.
     */
    private final void passEndOfTag(Reader reader) throws IOException {
        if (this.justReadTagEnd) {
            return;
        }
        boolean readSomething = false;
        for (;;) {
            int chInt = reader.read();
            if (chInt == -1) {
                break;
            }
            readSomething = true;
            if ((char) chInt == '>') {
                this.justReadTagEnd = true;
                this.justReadTagBegin = false;
                return;
            }
        }
        if (readSomething) {
            this.justReadTagBegin = false;
            this.justReadTagEnd = false;
        }
    }

    /**
     * Reads the next attribute of the tag being parsed and sets it on
     * {@code element}. A name without a value is stored with its own name as
     * the value. Values may be unquoted or quoted with single or double quotes,
     * and are entity-decoded.
     *
     * @return {@code true} if more attributes may follow; {@code false} once
     *         the tag has been closed or the input has ended
     */
    private final boolean readAttribute(LineNumberReader reader, Element element) throws IOException, SAXException {
        if (this.justReadTagEnd) {
            return false;
        }

        // Phase 1: read the name, up to '='.
        StringBuffer attributeName = null;
        boolean blankFound = false;
        boolean lastCharSlash = false;
        for (;;) {
            int chInt = reader.read();
            if (chInt == -1) {
                setBooleanAttribute(element, attributeName);
                this.justReadTagBegin = false;
                this.justReadTagEnd = false;
                this.justReadEmptyElement = false;
                return false;
            }
            char ch = (char) chInt;
            if (ch == '=') {
                break;
            } else if (ch == '>') {
                setBooleanAttribute(element, attributeName);
                this.justReadTagBegin = false;
                this.justReadTagEnd = true;
                this.justReadEmptyElement = lastCharSlash;
                return false;
            } else if (ch == '/') {
                blankFound = true;
                lastCharSlash = true;
            } else if (Character.isWhitespace(ch)) {
                lastCharSlash = false;
                blankFound = true;
            } else {
                lastCharSlash = false;
                if (blankFound) {
                    // A new name starts: the previous one had no value.
                    blankFound = false;
                    if (attributeName != null && attributeName.length() != 0) {
                        setBooleanAttribute(element, attributeName);
                        attributeName.setLength(0);
                    }
                }
                if (attributeName == null) {
                    attributeName = new StringBuffer(6);
                }
                attributeName.append(ch);
            }
        }

        // Phase 2: skip to the first character of the value.
        StringBuffer attributeValue = null;
        int openQuote = -1;
        // Slashes seen immediately before the current character. They either
        // belong to "/>" or are the start of an unquoted value.
        int pendingSlashes = 0;
        for (;;) {
            int chInt = reader.read();
            if (chInt == -1) {
                break;
            }
            char ch = (char) chInt;
            if (ch == '>') {
                setBooleanAttribute(element, attributeName);
                this.justReadTagBegin = false;
                this.justReadTagEnd = true;
                this.justReadEmptyElement = pendingSlashes > 0;
                return false;
            } else if (ch == '/') {
                pendingSlashes++;
            } else if (Character.isWhitespace(ch)) {
                pendingSlashes = 0;
            } else {
                if (ch == '"') {
                    openQuote = '"';
                } else if (ch == '\'') {
                    openQuote = '\'';
                } else {
                    openQuote = -1;
                    attributeValue = new StringBuffer(6);
                    // Keep leading slashes of an unquoted value (href=/index.html).
                    for (int i = 0; i < pendingSlashes; i++) {
                        attributeValue.append('/');
                    }
                    attributeValue.append(ch);
                }
                break;
            }
        }

        // Phase 3: read the rest of the value.
        for (;;) {
            int chInt = reader.read();
            if (chInt == -1) {
                break;
            }
            char ch = (char) chInt;
            if (openQuote != -1 && ch == openQuote) {
                this.storeAttribute(element, attributeName, attributeValue, reader.getLineNumber());
                this.justReadTagBegin = false;
                this.justReadTagEnd = false;
                return true;
            } else if (openQuote == -1 && ch == '>') {
                this.storeAttribute(element, attributeName, attributeValue, reader.getLineNumber());
                this.justReadTagBegin = false;
                this.justReadTagEnd = true;
                // A '/' right before '>' is part of the unquoted value.
                this.justReadEmptyElement = false;
                return false;
            } else if (openQuote == -1 && Character.isWhitespace(ch)) {
                this.storeAttribute(element, attributeName, attributeValue, reader.getLineNumber());
                this.justReadTagBegin = false;
                this.justReadTagEnd = false;
                return true;
            } else {
                if (attributeValue == null) {
                    attributeValue = new StringBuffer(6);
                }
                attributeValue.append(ch);
            }
        }
        // End of input inside the value: store what was read.
        this.justReadTagBegin = false;
        this.justReadTagEnd = false;
        this.storeAttribute(element, attributeName, attributeValue, reader.getLineNumber());
        return false;
    }

    /** Sets a value-less attribute, using its name as the value; no-op for a null or empty name. */
    private static void setBooleanAttribute(Element element, StringBuffer name) {
        if (name != null && name.length() != 0) {
            String nameStr = name.toString();
            element.setAttribute(nameStr, nameStr);
        }
    }

    /**
     * Sets an attribute to the entity-decoded value, or to {@code null} when no
     * value was read; no-op when {@code name} is null.
     */
    private void storeAttribute(Element element, StringBuffer name, StringBuffer value, int lineNumber) throws SAXException {
        if (name == null) {
            return;
        }
        String nameStr = name.toString();
        if (value == null) {
            element.setAttribute(nameStr, null);
        } else {
            element.setAttribute(nameStr, this.entityDecode(value, lineNumber).toString());
        }
    }

    /**
     * Replaces character references in {@code rawText}. Named entities unknown
     * to {@link #ENTITIES} are left as written. A malformed numeric reference
     * is reported to the error handler (if any) and decodes to the NUL
     * character.
     *
     * @param rawText    text that may contain "&amp;...;" references
     * @param lineNumber line reported with errors
     * @return {@code rawText} itself when it contains no '&amp;', otherwise a
     *         new buffer
     */
    private final StringBuffer entityDecode(StringBuffer rawText, int lineNumber) throws org.xml.sax.SAXException {
        int startIdx = 0;
        StringBuffer sb = null;
        for (;;) {
            int ampIdx = rawText.indexOf("&", startIdx);
            if (ampIdx == -1) {
                if (sb == null) {
                    return rawText;
                }
                sb.append(rawText.substring(startIdx));
                return sb;
            }
            if (sb == null) {
                sb = new StringBuffer();
            }
            sb.append(rawText.substring(startIdx, ampIdx));
            int colonIdx = rawText.indexOf(";", ampIdx);
            if (colonIdx == -1) {
                // No terminator anywhere after it: a literal ampersand.
                sb.append('&');
                startIdx = ampIdx + 1;
                continue;
            }
            String spec = rawText.substring(ampIdx + 1, colonIdx);
            if (spec.startsWith("#")) {
                String number = spec.substring(1).toLowerCase(Locale.ENGLISH);
                int decimal;
                try {
                    if (number.startsWith("x")) {
                        decimal = Integer.parseInt(number.substring(1), 16);
                    } else {
                        decimal = Integer.parseInt(number);
                    }
                } catch (NumberFormatException nfe) {
                    if (this.errorHandler != null) {
                        this.errorHandler.error(new SAXParseException("Bad entity: " + spec, this.getLocator(lineNumber, 0)));
                    }
                    decimal = 0;
                }
                if (Character.isSupplementaryCodePoint(decimal)) {
                    // Above U+FFFF: needs a surrogate pair, a char cast would truncate.
                    sb.appendCodePoint(decimal);
                } else {
                    sb.append((char) decimal);
                }
            } else {
                int chInt = this.getEntityChar(spec);
                if (chInt == -1) {
                    sb.append('&');
                    sb.append(spec);
                    sb.append(';');
                } else {
                    sb.append((char) chInt);
                }
            }
            startIdx = colonIdx + 1;
        }
    }

    /** Builds an error location from this parser's public/system id. */
    private final Locator getLocator(int lineNumber, int columnNumber) {
        return new LocatorImpl(this.publicId, this.systemId, lineNumber, columnNumber);
    }

    /**
     * Looks up a named entity, first as written and then in lower case.
     *
     * @param spec entity name without '&amp;' and ';'
     * @return the character, or -1 if the name is unknown
     */
    private final int getEntityChar(String spec) {
        Character c = ENTITIES.get(spec);
        if (c == null) {
            // Locale-independent so that e.g. "IEXCL" matches under any default locale.
            c = ENTITIES.get(spec.toLowerCase(Locale.ENGLISH));
            if (c == null) {
                return -1;
            }
        }
        return (int) c.charValue();
    }
}