1 11 12 package org.cyberneko.html; 13 14 import java.io.EOFException ; 15 import java.io.FileInputStream ; 16 import java.io.FilterInputStream ; 17 import java.io.InputStream ; 18 import java.io.InputStreamReader ; 19 import java.io.IOException ; 20 import java.io.PushbackReader ; 21 import java.io.Reader ; 22 import java.io.UnsupportedEncodingException ; 23 import java.lang.reflect.InvocationTargetException ; 24 import java.lang.reflect.Method ; 25 import java.net.URL ; 26 import java.util.Stack ; 27 28 import org.apache.xerces.util.EncodingMap; 29 import org.apache.xerces.util.NamespaceSupport; 30 import org.apache.xerces.util.URI; 31 import org.apache.xerces.util.XMLAttributesImpl; 32 import org.apache.xerces.util.XMLResourceIdentifierImpl; 33 import org.apache.xerces.util.XMLStringBuffer; 34 import org.apache.xerces.xni.Augmentations; 35 import org.apache.xerces.xni.NamespaceContext; 36 import org.apache.xerces.xni.QName; 37 import org.apache.xerces.xni.XMLAttributes; 38 import org.apache.xerces.xni.XMLDocumentHandler; 39 import org.apache.xerces.xni.XMLLocator; 40 import org.apache.xerces.xni.XMLResourceIdentifier; 41 import org.apache.xerces.xni.XMLString; 42 import org.apache.xerces.xni.XNIException; 43 import org.apache.xerces.xni.parser.XMLComponentManager; 44 import org.apache.xerces.xni.parser.XMLConfigurationException; 45 import org.apache.xerces.xni.parser.XMLDocumentScanner; 46 import org.apache.xerces.xni.parser.XMLInputSource; 47 48 89 public class HTMLScanner 90 implements XMLDocumentScanner, XMLLocator, HTMLComponent { 91 92 96 98 99 public static final String HTML_4_01_STRICT_PUBID = "-//W3C//DTD HTML 4.01//EN"; 100 101 102 public static final String HTML_4_01_STRICT_SYSID = "http://www.w3.org/TR/html4/strict.dtd"; 103 104 106 107 public static final String HTML_4_01_TRANSITIONAL_PUBID = "-//W3C//DTD HTML 4.01 Transitional//EN"; 108 109 110 public static final String HTML_4_01_TRANSITIONAL_SYSID = "http://www.w3.org/TR/html4/loose.dtd"; 111 112 114 115 public static final String HTML_4_01_FRAMESET_PUBID = "-//W3C//DTD HTML 4.01 Frameset//EN"; 116 117 118 public static final String HTML_4_01_FRAMESET_SYSID = "http://www.w3.org/TR/html4/frameset.dtd"; 119 120 122 123 protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations"; 124 125 126 protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors"; 127 128 129 public static final String NOTIFY_CHAR_REFS = "http://apache.org/xml/features/scanner/notify-char-refs"; 130 131 144 public static final String NOTIFY_XML_BUILTIN_REFS = "http://apache.org/xml/features/scanner/notify-builtin-refs"; 145 146 153 public static final String NOTIFY_HTML_BUILTIN_REFS = "http://cyberneko.org/html/features/scanner/notify-builtin-refs"; 154 155 156 public static final String FIX_MSWINDOWS_REFS = "http://cyberneko.org/html/features/scanner/fix-mswindows-refs"; 157 158 162 public static final String SCRIPT_STRIP_COMMENT_DELIMS = "http://cyberneko.org/html/features/scanner/script/strip-comment-delims"; 163 164 168 public static final String SCRIPT_STRIP_CDATA_DELIMS = "http://cyberneko.org/html/features/scanner/script/strip-cdata-delims"; 169 170 174 public static final String STYLE_STRIP_COMMENT_DELIMS = "http://cyberneko.org/html/features/scanner/style/strip-comment-delims"; 175 176 180 public static final String STYLE_STRIP_CDATA_DELIMS = "http://cyberneko.org/html/features/scanner/style/strip-cdata-delims"; 181 182 186 public static final String IGNORE_SPECIFIED_CHARSET = "http://cyberneko.org/html/features/scanner/ignore-specified-charset"; 187 188 189 public static final String CDATA_SECTIONS = "http://cyberneko.org/html/features/scanner/cdata-sections"; 190 191 192 public static final String OVERRIDE_DOCTYPE = "http://cyberneko.org/html/features/override-doctype"; 193 194 195 public static final String INSERT_DOCTYPE = "http://cyberneko.org/html/features/insert-doctype"; 196 197 198 private static final String [] RECOGNIZED_FEATURES = { 199 AUGMENTATIONS, 200 REPORT_ERRORS, 201 NOTIFY_CHAR_REFS, 202 NOTIFY_XML_BUILTIN_REFS, 203 NOTIFY_HTML_BUILTIN_REFS, 204 FIX_MSWINDOWS_REFS, 205 SCRIPT_STRIP_CDATA_DELIMS, 206 SCRIPT_STRIP_COMMENT_DELIMS, 207 STYLE_STRIP_CDATA_DELIMS, 208 STYLE_STRIP_COMMENT_DELIMS, 209 IGNORE_SPECIFIED_CHARSET, 210 CDATA_SECTIONS, 211 OVERRIDE_DOCTYPE, 212 INSERT_DOCTYPE, 213 }; 214 215 216 private static final Boolean [] RECOGNIZED_FEATURES_DEFAULTS = { 217 null, 218 null, 219 Boolean.FALSE, 220 Boolean.FALSE, 221 Boolean.FALSE, 222 Boolean.FALSE, 223 Boolean.FALSE, 224 Boolean.FALSE, 225 Boolean.FALSE, 226 Boolean.FALSE, 227 Boolean.FALSE, 228 Boolean.FALSE, 229 Boolean.FALSE, 230 Boolean.FALSE, 231 }; 232 233 235 236 protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems"; 237 238 239 protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs"; 240 241 242 protected static final String DEFAULT_ENCODING = "http://cyberneko.org/html/properties/default-encoding"; 243 244 245 protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter"; 246 247 248 protected static final String DOCTYPE_PUBID = "http://cyberneko.org/html/properties/doctype/pubid"; 249 250 251 protected static final String DOCTYPE_SYSID = "http://cyberneko.org/html/properties/doctype/sysid"; 252 253 254 private static final String [] RECOGNIZED_PROPERTIES = { 255 NAMES_ELEMS, 256 NAMES_ATTRS, 257 DEFAULT_ENCODING, 258 ERROR_REPORTER, 259 DOCTYPE_PUBID, 260 DOCTYPE_SYSID, 261 }; 262 263 264 private static final Object [] RECOGNIZED_PROPERTIES_DEFAULTS = { 265 null, 266 null, 267 "Windows-1252", 268 null, 269 HTML_4_01_TRANSITIONAL_PUBID, 270 HTML_4_01_TRANSITIONAL_SYSID, 271 }; 272 273 275 276 protected static final short STATE_CONTENT = 0; 277 278 279 protected static final short STATE_MARKUP_BRACKET = 1; 280 281 282 protected static final short STATE_START_DOCUMENT = 10; 283 284 285 protected static final short STATE_END_DOCUMENT = 11; 286 287 289 290 protected static final short NAMES_NO_CHANGE = 0; 291 292 293 protected static final short NAMES_UPPERCASE = 1; 294 295 296 protected static final short NAMES_LOWERCASE = 2; 297 298 300 301 protected static final int DEFAULT_BUFFER_SIZE = 2048; 302 303 305 306 private static final boolean DEBUG_SCANNER = false; 307 308 309 private static final boolean DEBUG_SCANNER_STATE = false; 310 311 312 private static final boolean DEBUG_BUFFER = false; 313 314 315 private static final boolean DEBUG_CHARSET = false; 316 317 318 protected static final boolean DEBUG_CALLBACKS = false; 319 320 322 323 protected static final HTMLEventInfo SYNTHESIZED_ITEM = 324 new HTMLEventInfo.SynthesizedItem(); 325 326 330 332 333 protected boolean fAugmentations; 334 335 336 protected boolean fReportErrors; 337 338 339 protected boolean fNotifyCharRefs; 340 341 342 protected boolean fNotifyXmlBuiltinRefs; 343 344 345 protected boolean fNotifyHtmlBuiltinRefs; 346 347 348 protected boolean fFixWindowsCharRefs; 349 350 351 protected boolean fScriptStripCDATADelims; 352 353 354 protected boolean fScriptStripCommentDelims; 355 356 357 protected boolean fStyleStripCDATADelims; 358 359 360 protected boolean fStyleStripCommentDelims; 361 362 363 protected boolean fIgnoreSpecifiedCharset; 364 365 366 protected boolean fCDATASections; 367 368 369 protected boolean fOverrideDoctype; 370 371 372 protected boolean fInsertDoctype; 373 374 376 377 protected short fNamesElems; 378 379 380 protected short fNamesAttrs; 381 382 383 protected String fDefaultIANAEncoding; 384 385 386 protected HTMLErrorReporter fErrorReporter; 387 388 389 protected String fDoctypePubid; 390 391 392 protected String fDoctypeSysid; 393 394 396 397 protected int fBeginLineNumber; 398 399 400 protected int fBeginColumnNumber; 401 402 403 protected int fEndLineNumber; 404 405 406 protected int fEndColumnNumber; 407 408 410 411 protected PlaybackInputStream fByteStream; 412 413 414 protected CurrentEntity fCurrentEntity; 415 416 417 protected final Stack fCurrentEntityStack = new Stack (); 418 419 420 protected Scanner fScanner; 421 422 423 protected short fScannerState; 424 425 426 protected XMLDocumentHandler fDocumentHandler; 427 428 429 protected String fIANAEncoding; 430 431 432 protected String fJavaEncoding; 433 434 435 protected boolean fIso8859Encoding; 436 437 438 protected int fElementCount; 439 440 441 protected int fElementDepth; 442 443 445 446 protected Scanner fContentScanner = new ContentScanner(); 447 448 453 protected SpecialScanner fSpecialScanner = new SpecialScanner(); 454 455 457 458 protected final XMLString fString = new XMLString(); 459 460 461 protected final XMLStringBuffer fStringBuffer = new XMLStringBuffer(1024); 462 463 464 private final XMLStringBuffer fStringBuffer2 = new XMLStringBuffer(1024); 465 466 467 private final XMLStringBuffer fNonNormAttr = new XMLStringBuffer(128); 468 469 470 private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations(); 471 472 473 private final LocationItem fLocationItem = new LocationItem(); 474 475 476 private final boolean[] fSingleBoolean = { false }; 477 478 479 private final XMLResourceIdentifierImpl fResourceId = new XMLResourceIdentifierImpl(); 480 481 485 498 public void pushInputSource(XMLInputSource inputSource) { 499 Reader reader = inputSource.getCharacterStream(); 500 if (reader == null) { 501 throw new IllegalArgumentException ("pushed input source has no reader"); 502 } 503 fCurrentEntityStack.push(fCurrentEntity); 504 String encoding = inputSource.getEncoding(); 505 String publicId = inputSource.getPublicId(); 506 String baseSystemId = inputSource.getBaseSystemId(); 507 String literalSystemId = inputSource.getSystemId(); 508 String expandedSystemId = expandSystemId(literalSystemId, baseSystemId); 509 fCurrentEntity = new CurrentEntity(reader, encoding, 510 publicId, baseSystemId, 511 literalSystemId, expandedSystemId); 512 } 514 524 public void cleanup(boolean closeall) { 525 int size = fCurrentEntityStack.size(); 526 if (size > 0) { 527 if (fCurrentEntity != null) { 529 try { 530 fCurrentEntity.stream.close(); 531 } 532 catch (IOException e) { 533 } 535 } 536 for (int i = closeall ? 0 : 1; i < size; i++) { 538 fCurrentEntity = (CurrentEntity)fCurrentEntityStack.pop(); 539 try { 540 fCurrentEntity.stream.close(); 541 } 542 catch (IOException e) { 543 } 545 } 546 } 547 else if (closeall && fCurrentEntity != null) { 548 try { 549 fCurrentEntity.stream.close(); 550 } 551 catch (IOException e) { 552 } 554 } 555 } 557 561 562 public String getEncoding() { 563 return fCurrentEntity != null ? fCurrentEntity.encoding : null; 564 } 566 567 public String getPublicId() { 568 return fCurrentEntity != null ? fCurrentEntity.publicId : null; 569 } 571 572 public String getBaseSystemId() { 573 return fCurrentEntity != null ? fCurrentEntity.baseSystemId : null; 574 } 576 577 public String getLiteralSystemId() { 578 return fCurrentEntity != null ? fCurrentEntity.literalSystemId : null; 579 } 581 582 public String getExpandedSystemId() { 583 return fCurrentEntity != null ? fCurrentEntity.expandedSystemId : null; 584 } 586 587 public int getLineNumber() { 588 return fCurrentEntity != null ? fCurrentEntity.lineNumber : -1; 589 } 591 592 public int getColumnNumber() { 593 return fCurrentEntity != null ? fCurrentEntity.columnNumber : -1; 594 } 596 600 601 public Boolean getFeatureDefault(String featureId) { 602 int length = RECOGNIZED_FEATURES != null ? RECOGNIZED_FEATURES.length : 0; 603 for (int i = 0; i < length; i++) { 604 if (RECOGNIZED_FEATURES[i].equals(featureId)) { 605 return RECOGNIZED_FEATURES_DEFAULTS[i]; 606 } 607 } 608 return null; 609 } 611 612 public Object getPropertyDefault(String propertyId) { 613 int length = RECOGNIZED_PROPERTIES != null ? RECOGNIZED_PROPERTIES.length : 0; 614 for (int i = 0; i < length; i++) { 615 if (RECOGNIZED_PROPERTIES[i].equals(propertyId)) { 616 return RECOGNIZED_PROPERTIES_DEFAULTS[i]; 617 } 618 } 619 return null; 620 } 622 626 627 public String [] getRecognizedFeatures() { 628 return RECOGNIZED_FEATURES; 629 } 631 632 public String [] getRecognizedProperties() { 633 return RECOGNIZED_PROPERTIES; 634 } 636 637 public void reset(XMLComponentManager manager) 638 throws XMLConfigurationException { 639 640 fAugmentations = manager.getFeature(AUGMENTATIONS); 642 fReportErrors = manager.getFeature(REPORT_ERRORS); 643 fNotifyCharRefs = manager.getFeature(NOTIFY_CHAR_REFS); 644 fNotifyXmlBuiltinRefs = manager.getFeature(NOTIFY_XML_BUILTIN_REFS); 645 fNotifyHtmlBuiltinRefs = manager.getFeature(NOTIFY_HTML_BUILTIN_REFS); 646 fFixWindowsCharRefs = manager.getFeature(FIX_MSWINDOWS_REFS); 647 fScriptStripCDATADelims = manager.getFeature(SCRIPT_STRIP_CDATA_DELIMS); 648 fScriptStripCommentDelims = manager.getFeature(SCRIPT_STRIP_COMMENT_DELIMS); 649 fStyleStripCDATADelims = manager.getFeature(STYLE_STRIP_CDATA_DELIMS); 650 fStyleStripCommentDelims = manager.getFeature(STYLE_STRIP_COMMENT_DELIMS); 651 fIgnoreSpecifiedCharset = manager.getFeature(IGNORE_SPECIFIED_CHARSET); 652 fCDATASections = manager.getFeature(CDATA_SECTIONS); 653 fOverrideDoctype = manager.getFeature(OVERRIDE_DOCTYPE); 654 fInsertDoctype = manager.getFeature(INSERT_DOCTYPE); 655 656 fNamesElems = getNamesValue(String.valueOf(manager.getProperty(NAMES_ELEMS))); 658 fNamesAttrs = getNamesValue(String.valueOf(manager.getProperty(NAMES_ATTRS))); 659 fDefaultIANAEncoding = String.valueOf(manager.getProperty(DEFAULT_ENCODING)); 660 fErrorReporter = (HTMLErrorReporter)manager.getProperty(ERROR_REPORTER); 661 fDoctypePubid = String.valueOf(manager.getProperty(DOCTYPE_PUBID)); 662 fDoctypeSysid = String.valueOf(manager.getProperty(DOCTYPE_SYSID)); 663 664 } 666 667 public void setFeature(String featureId, boolean state) 668 throws XMLConfigurationException { 669 670 if (featureId.equals(AUGMENTATIONS)) { 671 fAugmentations = state; 672 } 673 else if (featureId.equals(IGNORE_SPECIFIED_CHARSET)) { 674 fIgnoreSpecifiedCharset = state; 675 } 676 else if (featureId.equals(NOTIFY_CHAR_REFS)) { 677 fNotifyCharRefs = state; 678 } 679 else if (featureId.equals(NOTIFY_XML_BUILTIN_REFS)) { 680 fNotifyXmlBuiltinRefs = state; 681 } 682 else if (featureId.equals(NOTIFY_HTML_BUILTIN_REFS)) { 683 fNotifyHtmlBuiltinRefs = state; 684 } 685 else if (featureId.equals(FIX_MSWINDOWS_REFS)) { 686 fFixWindowsCharRefs = state; 687 } 688 else if (featureId.equals(SCRIPT_STRIP_CDATA_DELIMS)) { 689 fScriptStripCDATADelims = state; 690 } 691 else if (featureId.equals(SCRIPT_STRIP_COMMENT_DELIMS)) { 692 fScriptStripCommentDelims = state; 693 } 694 else if (featureId.equals(STYLE_STRIP_CDATA_DELIMS)) { 695 fStyleStripCDATADelims = state; 696 } 697 else if (featureId.equals(STYLE_STRIP_COMMENT_DELIMS)) { 698 fStyleStripCommentDelims = state; 699 } 700 else if (featureId.equals(IGNORE_SPECIFIED_CHARSET)) { 701 fIgnoreSpecifiedCharset = state; 702 } 703 704 } 706 707 public void setProperty(String propertyId, Object value) 708 throws XMLConfigurationException { 709 710 if (propertyId.equals(NAMES_ELEMS)) { 711 fNamesElems = getNamesValue(String.valueOf(value)); 712 return; 713 } 714 715 if (propertyId.equals(NAMES_ATTRS)) { 716 fNamesAttrs = getNamesValue(String.valueOf(value)); 717 return; 718 } 719 720 if (propertyId.equals(DEFAULT_ENCODING)) { 721 fDefaultIANAEncoding = String.valueOf(value); 722 return; 723 } 724 725 } 727 731 732 public void setInputSource(XMLInputSource source) throws IOException { 733 734 fElementCount = 0; 736 fElementDepth = -1; 737 fByteStream = null; 738 fCurrentEntityStack.removeAllElements(); 739 740 fBeginLineNumber = 1; 741 fBeginColumnNumber = 1; 742 fEndLineNumber = fBeginLineNumber; 743 fEndColumnNumber = fBeginColumnNumber; 744 745 fIANAEncoding = fDefaultIANAEncoding; 747 fJavaEncoding = fIANAEncoding; 748 749 String encoding = source.getEncoding(); 751 String publicId = source.getPublicId(); 752 String baseSystemId = source.getBaseSystemId(); 753 String literalSystemId = source.getSystemId(); 754 String expandedSystemId = expandSystemId(literalSystemId, baseSystemId); 755 756 Reader reader = source.getCharacterStream(); 758 if (reader == null) { 759 InputStream inputStream = source.getByteStream(); 760 if (inputStream == null) { 761 URL url = new URL (expandedSystemId); 762 inputStream = url.openStream(); 763 } 764 fByteStream = new PlaybackInputStream(inputStream); 765 String [] encodings = new String [2]; 766 if (encoding == null) { 767 fByteStream.detectEncoding(encodings); 768 } 769 else { 770 encodings[0] = encoding; 771 } 772 if (encodings[0] == null) { 773 encodings[0] = fDefaultIANAEncoding; 774 if (fReportErrors) { 775 fErrorReporter.reportWarning("HTML1000", null); 776 } 777 } 778 if (encodings[1] == null) { 779 encodings[1] = EncodingMap.getIANA2JavaMapping(encodings[0].toUpperCase()); 780 if (encodings[1] == null) { 781 encodings[1] = encodings[0]; 782 if (fReportErrors) { 783 fErrorReporter.reportWarning("HTML1001", new Object []{encodings[0]}); 784 } 785 } 786 } 787 fIANAEncoding = encodings[0]; 788 fJavaEncoding = encodings[1]; 789 790 fIso8859Encoding = fIANAEncoding == null 791 || fIANAEncoding.toUpperCase().startsWith("ISO-8859") 792 || fIANAEncoding.equalsIgnoreCase(fDefaultIANAEncoding); 793 encoding = fIANAEncoding; 794 reader = new InputStreamReader (fByteStream, fJavaEncoding); 795 } 796 fCurrentEntity = new CurrentEntity(reader, encoding, 797 publicId, baseSystemId, 798 literalSystemId, expandedSystemId); 799 800 setScanner(fContentScanner); 802 setScannerState(STATE_START_DOCUMENT); 803 804 } 806 807 public boolean scanDocument(boolean complete) throws XNIException, IOException { 808 do { 809 if (!fScanner.scan(complete)) { 810 return false; 811 } 812 } while (complete); 813 return true; 814 } 816 817 public void setDocumentHandler(XMLDocumentHandler handler) { 818 fDocumentHandler = handler; 819 } 821 823 824 public XMLDocumentHandler getDocumentHandler() { 825 return fDocumentHandler; 826 } 828 832 833 protected static String getValue(XMLAttributes attrs, String aname) { 834 int length = attrs != null ? attrs.getLength() : 0; 835 for (int i = 0; i < length; i++) { 836 if (attrs.getQName(i).equalsIgnoreCase(aname)) { 837 return attrs.getValue(i); 838 } 839 } 840 return null; 841 } 843 856 public static String expandSystemId(String systemId, String baseSystemId) { 857 858 if (systemId == null || systemId.length() == 0) { 860 return systemId; 861 } 862 try { 864 URI uri = new URI(systemId); 865 if (uri != null) { 866 return systemId; 867 } 868 } 869 catch (URI.MalformedURIException e) { 870 } 872 String id = fixURI(systemId); 874 875 URI base = null; 877 URI uri = null; 878 try { 879 if (baseSystemId == null || baseSystemId.length() == 0 || 880 baseSystemId.equals(systemId)) { 881 String dir; 882 try { 883 dir = fixURI(System.getProperty("user.dir")); 884 } 885 catch (SecurityException se) { 886 dir = ""; 887 } 888 if (!dir.endsWith("/")) { 889 dir = dir + "/"; 890 } 891 base = new URI("file", "", dir, null, null); 892 } 893 else { 894 try { 895 base = new URI(fixURI(baseSystemId)); 896 } 897 catch (URI.MalformedURIException e) { 898 String dir; 899 try { 900 dir = fixURI(System.getProperty("user.dir")); 901 } 902 catch (SecurityException se) { 903 dir = ""; 904 } 905 if (baseSystemId.indexOf(':') != -1) { 906 base = new URI("file", "", fixURI(baseSystemId), null, null); 909 } 910 else { 911 if (!dir.endsWith("/")) { 912 dir = dir + "/"; 913 } 914 dir = dir + fixURI(baseSystemId); 915 base = new URI("file", "", dir, null, null); 916 } 917 } 918 } 919 uri = new URI(base, id); 921 } 922 catch (URI.MalformedURIException e) { 923 } 925 926 if (uri == null) { 927 return systemId; 928 } 929 return uri.toString(); 930 931 } 933 940 protected static String fixURI(String str) { 941 942 str = str.replace(java.io.File.separatorChar, '/'); 944 945 if (str.length() >= 2) { 947 char ch1 = str.charAt(1); 948 if (ch1 == ':') { 950 char ch0 = Character.toUpperCase(str.charAt(0)); 951 if (ch0 >= 'A' && ch0 <= 'Z') { 952 str = "/" + str; 953 } 954 } 955 else if (ch1 == '/' && str.charAt(0) == '/') { 957 str = "file:" + str; 958 } 959 } 960 961 return str; 963 964 } 966 967 protected static final String modifyName(String name, short mode) { 968 switch (mode) { 969 case NAMES_UPPERCASE: return name.toUpperCase(); 970 case NAMES_LOWERCASE: return name.toLowerCase(); 971 } 972 return name; 973 } 975 982 protected static final short getNamesValue(String value) { 983 if (value.equals("lower")) { 984 return NAMES_LOWERCASE; 985 } 986 if (value.equals("upper")) { 987 return NAMES_UPPERCASE; 988 } 989 return NAMES_NO_CHANGE; 990 } 992 998 protected int fixWindowsCharacter(int origChar) { 999 1000 switch(origChar) { 1001 case 130: return 8218; 1002 case 131: return 402; 1003 case 132: return 8222; 1004 case 133: return 8230; 1005 case 134: return 8224; 1006 case 135: return 8225; 1007 case 136: return 710; 1008 case 137: return 8240; 1009 case 138: return 352; 1010 case 139: return 8249; 1011 case 140: return 338; 1012 case 145: return 8216; 1013 case 146: return 8217; 1014 case 147: return 8220; 1015 case 148: return 8221; 1016 case 149: return 8226; 1017 case 150: return 8211; 1018 case 151: return 8212; 1019 case 152: return 732; 1020 case 153: return 8482; 1021 case 154: return 353; 1022 case 155: return 8250; 1023 case 156: return 339; 1024 case 159: return 376; 1025 } 1026 return origChar; 1027 } 1029 1033 1035 1036 protected int read() throws IOException { 1037 if (DEBUG_BUFFER) { 1038 System.out.print("(read: "); 1039 printBuffer(); 1040 System.out.println(); 1041 } 1042 if (fCurrentEntity.offset == fCurrentEntity.length) { 1043 if (load(0) == -1) { 1044 if (DEBUG_BUFFER) { 1045 System.out.println(")read: -> -1"); 1046 } 1047 return -1; 1048 } 1049 } 1050 int c = fCurrentEntity.buffer[fCurrentEntity.offset++]; 1051 fCurrentEntity.columnNumber++; 1052 if (DEBUG_BUFFER) { 1053 System.out.print(")read: "); 1054 printBuffer(); 1055 System.out.print(" -> "); 1056 System.out.print(c); 1057 System.out.println(); 1058 } 1059 return c; 1060 } 1062 1068 protected int load(int offset) throws IOException { 1069 if (DEBUG_BUFFER) { 1070 System.out.print("(load: "); 1071 printBuffer(); 1072 System.out.println(); 1073 } 1074 if (offset == fCurrentEntity.buffer.length) { 1076 int adjust = fCurrentEntity.buffer.length / 4; 1077 char[] array = new char[fCurrentEntity.buffer.length + adjust]; 1078 System.arraycopy(fCurrentEntity.buffer, 0, array, 0, fCurrentEntity.length); 1079 fCurrentEntity.buffer = array; 1080 } 1081 int count = fCurrentEntity.stream.read(fCurrentEntity.buffer, offset, fCurrentEntity.buffer.length - offset); 1083 fCurrentEntity.length = count != -1 ? count + offset : offset; 1084 fCurrentEntity.offset = offset; 1085 if (DEBUG_BUFFER) { 1086 System.out.print(")load: "); 1087 printBuffer(); 1088 System.out.print(" -> "); 1089 System.out.print(count); 1090 System.out.println(); 1091 } 1092 return count; 1093 } 1095 1097 1098 protected void setScanner(Scanner scanner) { 1099 fScanner = scanner; 1100 if (DEBUG_SCANNER) { 1101 System.out.print("$$$ setScanner("); 1102 System.out.print(scanner!=null?scanner.getClass().getName():"null"); 1103 System.out.println(");"); 1104 } 1105 } 1107 1108 protected void setScannerState(short state) { 1109 fScannerState = state; 1110 if (DEBUG_SCANNER_STATE) { 1111 System.out.print("$$$ setScannerState("); 1112 switch (fScannerState) { 1113 case STATE_CONTENT: { System.out.print("STATE_CONTENT"); break; } 1114 case STATE_MARKUP_BRACKET: { System.out.print("STATE_MARKUP_BRACKET"); break; } 1115 case STATE_START_DOCUMENT: { System.out.print("STATE_START_DOCUMENT"); break; } 1116 case STATE_END_DOCUMENT: { System.out.print("STATE_END_DOCUMENT"); break; } 1117 } 1118 System.out.println(");"); 1119 } 1120 } 1122 1124 1125 protected void scanDoctype() throws IOException { 1126 String root = null; 1127 String pubid = null; 1128 String sysid = null; 1129 1130 if (skipSpaces()) { 1131 root = scanName(); 1132 if (root == null) { 1133 if (fReportErrors) { 1134 fErrorReporter.reportError("HTML1014", null); 1135 } 1136 } 1137 else { 1138 root = modifyName(root, fNamesElems); 1139 } 1140 if (skipSpaces()) { 1141 if (skip("PUBLIC", false)) { 1142 skipSpaces(); 1143 pubid = scanLiteral(); 1144 if (skipSpaces()) { 1145 sysid = scanLiteral(); 1146 } 1147 } 1148 else if (skip("SYSTEM", false)) { 1149 skipSpaces(); 1150 sysid = scanLiteral(); 1151 } 1152 } 1153 } 1154 int c; 1155 while ((c = read()) != -1) { 1156 if (c == '<') { 1157 fCurrentEntity.offset--; 1158 fCurrentEntity.columnNumber--; 1159 break; 1160 } 1161 if (c == '>') { 1162 break; 1163 } 1164 if (c == '[') { 1165 skipMarkup(true); 1166 break; 1167 } 1168 } 1169 1170 if (fDocumentHandler != null) { 1171 if (fOverrideDoctype) { 1172 pubid = fDoctypePubid; 1173 sysid = fDoctypeSysid; 1174 } 1175 fEndLineNumber = fCurrentEntity.lineNumber; 1176 fEndColumnNumber = fCurrentEntity.columnNumber; 1177 fDocumentHandler.doctypeDecl(root, pubid, sysid, locationAugs()); 1178 } 1179 1180 } 1182 1183 protected String scanLiteral() throws IOException { 1184 int quote = read(); 1185 if (quote == '\'' || quote == '"') { 1186 StringBuffer str = new StringBuffer (); 1187 int c; 1188 while ((c = read()) != -1) { 1189 if (c == quote) { 1190 break; 1191 } 1192 if (c == '\r' || c == '\n') { 1193 fCurrentEntity.offset--; 1194 fCurrentEntity.columnNumber--; 1195 skipNewlines(); 1198 str.append(' '); 1199 } 1200 else if (c == '<') { 1201 fCurrentEntity.offset--; 1202 fCurrentEntity.columnNumber--; 1203 break; 1204 } 1205 else { 1206 str.append((char)c); 1207 } 1208 } 1209 if (c == -1) { 1210 if (fReportErrors) { 1211 fErrorReporter.reportError("HTML1007", null); 1212 } 1213 throw new EOFException (); 1214 } 1215 return str.toString(); 1216 } 1217 else { 1218 fCurrentEntity.offset--; 1219 fCurrentEntity.columnNumber--; 1220 } 1221 return null; 1222 } 1224 1225 protected String scanName() throws IOException { 1226 if (DEBUG_BUFFER) { 1227 System.out.print("(scanName: "); 1228 printBuffer(); 1229 System.out.println(); 1230 } 1231 if (fCurrentEntity.offset == fCurrentEntity.length) { 1232 if (load(0) == -1) { 1233 if (DEBUG_BUFFER) { 1234 System.out.print(")scanName: "); 1235 printBuffer(); 1236 System.out.println(" -> null"); 1237 } 1238 return null; 1239 } 1240 } 1241 int offset = fCurrentEntity.offset; 1242 while (true) { 1243 while (fCurrentEntity.offset < fCurrentEntity.length) { 1244 char c = fCurrentEntity.buffer[fCurrentEntity.offset]; 1245 if (!Character.isLetterOrDigit(c) && 1246 !(c == '-' || c == '.' || c == ':' || c == '_')) { 1247 break; 1248 } 1249 fCurrentEntity.offset++; 1250 fCurrentEntity.columnNumber++; 1251 } 1252 if (fCurrentEntity.offset == fCurrentEntity.length) { 1253 int length = fCurrentEntity.length - offset; 1254 System.arraycopy(fCurrentEntity.buffer, offset, fCurrentEntity.buffer, 0, length); 1255 int count = load(length); 1256 offset = 0; 1257 if (count == -1) { 1258 break; 1259 } 1260 } 1261 else { 1262 break; 1263 } 1264 } 1265 int length = fCurrentEntity.offset - offset; 1266 String name = length > 0 ? new String (fCurrentEntity.buffer, offset, length) : null; 1267 if (DEBUG_BUFFER) { 1268 System.out.print(")scanName: "); 1269 printBuffer(); 1270 System.out.print(" -> \""); 1271 System.out.print(name); 1272 System.out.println('"'); 1273 } 1274 return name; 1275 } 1277 1278 protected int scanEntityRef(XMLStringBuffer str, boolean content) 1279 throws IOException { 1280 str.clear(); 1281 str.append('&'); 1282 while (true) { 1283 int c = read(); 1284 if (c == ';') { 1285 str.append(';'); 1286 break; 1287 } 1288 if (c == -1) { 1289 if (fReportErrors) { 1290 fErrorReporter.reportWarning("HTML1004", null); 1291 } 1292 if (content && fDocumentHandler != null && fElementCount >= fElementDepth) { 1293 fEndLineNumber = fCurrentEntity.lineNumber; 1294 fEndColumnNumber = fCurrentEntity.columnNumber; 1295 fDocumentHandler.characters(str, locationAugs()); 1296 } 1297 return -1; 1298 } 1299 if (!Character.isLetterOrDigit((char)c) && c != '#') { 1300 if (fReportErrors) { 1301 fErrorReporter.reportWarning("HTML1004", null); 1302 } 1303 fCurrentEntity.offset--; 1304 fCurrentEntity.columnNumber--; 1305 if (content && fDocumentHandler != null && fElementCount >= fElementDepth) { 1306 fEndLineNumber = fCurrentEntity.lineNumber; 1307 fEndColumnNumber = fCurrentEntity.columnNumber; 1308 fDocumentHandler.characters(str, locationAugs()); 1309 } 1310 return -1; 1311 } 1312 str.append((char)c); 1313 } 1314 if (str.length == 1) { 1315 if (content && fDocumentHandler != null && fElementCount >= fElementDepth) { 1316 fEndLineNumber = fCurrentEntity.lineNumber; 1317 fEndColumnNumber = fCurrentEntity.columnNumber; 1318 fDocumentHandler.characters(str, locationAugs()); 1319 } 1320 return -1; 1321 } 1322 1323 String name = str.toString().substring(1, str.length-1); 1324 if (name.startsWith("#")) { 1325 int value = -1; 1326 try { 1327 if (name.startsWith("#x")) { 1328 value = Integer.parseInt(name.substring(2), 16); 1329 } 1330 else { 1331 value = Integer.parseInt(name.substring(1)); 1332 } 1333 1334 if (fFixWindowsCharRefs && fIso8859Encoding) { 1335 value = fixWindowsCharacter(value); 1336 } 1337 if (content && fDocumentHandler != null && fElementCount >= fElementDepth) { 1338 fEndLineNumber = fCurrentEntity.lineNumber; 1339 fEndColumnNumber = fCurrentEntity.columnNumber; 1340 if (fNotifyCharRefs) { 1341 XMLResourceIdentifier id = resourceId(); 1342 String encoding = null; 1343 fDocumentHandler.startGeneralEntity(name, id, encoding, locationAugs()); 1344 } 1345 str.clear(); 1346 str.append((char)value); 1347 fDocumentHandler.characters(str, locationAugs()); 1348 if (fNotifyCharRefs) { 1349 fDocumentHandler.endGeneralEntity(name, locationAugs()); 1350 } 1351 } 1352 } 1353 catch (NumberFormatException e) { 1354 if (fReportErrors) { 1355 fErrorReporter.reportError("HTML1005", new Object []{name}); 1356 } 1357 if (content && fDocumentHandler != null && fElementCount >= fElementDepth) { 1358 fEndLineNumber = fCurrentEntity.lineNumber; 1359 fEndColumnNumber = fCurrentEntity.columnNumber; 1360 fDocumentHandler.characters(str, locationAugs()); 1361 } 1362 } 1363 return value; 1364 } 1365 1366 int c = HTMLEntities.get(name); 1367 if (c == -1) { 1368 if (fReportErrors) { 1369 fErrorReporter.reportWarning("HTML1006", new Object []{name}); 1370 } 1371 if (content && fDocumentHandler != null && fElementCount >= fElementDepth) { 1372 fEndLineNumber = fCurrentEntity.lineNumber; 1373 fEndColumnNumber = fCurrentEntity.columnNumber; 1374 fDocumentHandler.characters(str, locationAugs()); 1375 } 1376 return -1; 1377 } 1378 if (content && fDocumentHandler != null && fElementCount >= fElementDepth) { 1379 fEndLineNumber = fCurrentEntity.lineNumber; 1380 fEndColumnNumber = fCurrentEntity.columnNumber; 1381 boolean notify = fNotifyHtmlBuiltinRefs || (fNotifyXmlBuiltinRefs && builtinXmlRef(name)); 1382 if (notify) { 1383 XMLResourceIdentifier id = resourceId(); 1384 String encoding = null; 1385 fDocumentHandler.startGeneralEntity(name, id, encoding, locationAugs()); 1386 } 1387 str.clear(); 1388 str.append((char)c); 1389 fDocumentHandler.characters(str, locationAugs()); 1390 if (notify) { 1391 fDocumentHandler.endGeneralEntity(name, locationAugs()); 1392 } 1393 } 1394 return c; 1395 1396 } 1398 1399 protected boolean skip(String s, boolean caseSensitive) throws IOException { 1400 int length = s != null ? s.length() : 0; 1401 for (int i = 0; i < length; i++) { 1402 if (fCurrentEntity.offset == fCurrentEntity.length) { 1403 System.arraycopy(fCurrentEntity.buffer, fCurrentEntity.offset - i, fCurrentEntity.buffer, 0, i); 1404 if (load(i) == -1) { 1405 fCurrentEntity.offset = 0; 1406 return false; 1407 } 1408 } 1409 char c0 = s.charAt(i); 1410 char c1 = fCurrentEntity.buffer[fCurrentEntity.offset++]; 1411 fCurrentEntity.columnNumber++; 1412 if (!caseSensitive) { 1413 c0 = Character.toUpperCase(c0); 1414 c1 = Character.toUpperCase(c1); 1415 } 1416 if (c0 != c1) { 1417 fCurrentEntity.offset -= i + 1; 1418 return false; 1419 } 1420 } 1421 return true; 1422 } 1424 1425 protected boolean skipMarkup(boolean balance) throws IOException { 1426 if (DEBUG_BUFFER) { 1427 System.out.print("(skipMarkup: "); 1428 printBuffer(); 1429 System.out.println(); 1430 } 1431 int depth = 1; 1432 boolean slashgt = false; 1433 OUTER: while (true) { 1434 if (fCurrentEntity.offset == fCurrentEntity.length) { 1435 if (load(0) == -1) { 1436 break OUTER; 1437 } 1438 } 1439 while (fCurrentEntity.offset < fCurrentEntity.length) { 1440 char c = fCurrentEntity.buffer[fCurrentEntity.offset++]; 1441 fCurrentEntity.columnNumber++; 1442 if (balance && c == '<') { 1443 depth++; 1444 } 1445 else if (c == '>') { 1446 depth--; 1447 if (depth == 0) { 1448 break OUTER; 1449 } 1450 } 1451 else if (c == '/') { 1452 if (fCurrentEntity.offset == fCurrentEntity.length) { 1453 if (load(0) == -1) { 1454 break OUTER; 1455 } 1456 } 1457 c = fCurrentEntity.buffer[fCurrentEntity.offset++]; 1458 fCurrentEntity.columnNumber++; 1459 if (c == '>') { 1460 slashgt = true; 1461 depth--; 1462 if (depth == 0) { 1463 break OUTER; 1464 } 1465 } 1466 else { 1467 fCurrentEntity.offset--; 1468 fCurrentEntity.columnNumber--; 1469 } 1470 } 1471 else if (c == '\r' || c == '\n') { 1472 skipNewlines(); 1473 } 1474 } 1475 } 1476 if (DEBUG_BUFFER) { 1477 System.out.print(")skipMarkup: "); 1478 printBuffer(); 1479 System.out.print(" -> "+slashgt); 1480 System.out.println(); 1481 } 1482 return slashgt; 1483 } 1485 1486 protected boolean skipSpaces() throws IOException { 1487 if (DEBUG_BUFFER) { 1488 System.out.print("(skipSpaces: "); 1489 printBuffer(); 1490 System.out.println(); 1491 } 1492 boolean spaces = false; 1493 while (true) { 1494 if (fCurrentEntity.offset == fCurrentEntity.length) { 1495 if (load(0) == -1) { 1496 break; 1497 } 1498 } 1499 char c = fCurrentEntity.buffer[fCurrentEntity.offset]; 1500 if (!Character.isSpace(c)) { 1501 break; 1502 } 1503 spaces = true; 1504 if (c == '\r' || c == '\n') { 1505 skipNewlines(); 1506 continue; 1507 } 1508 fCurrentEntity.offset++; 1509 fCurrentEntity.columnNumber++; 1510 } 1511 if (DEBUG_BUFFER) { 1512 System.out.print(")skipSpaces: "); 1513 printBuffer(); 1514 System.out.print(" -> "); 1515 System.out.print(spaces); 1516 System.out.println(); 1517 } 1518 return spaces; 1519 } 1521 1522 protected int skipNewlines() throws IOException { 1523 return skipNewlines(Integer.MAX_VALUE); 1524 } 1526 1527 protected int skipNewlines(int maxlines) throws IOException { 1528 if (DEBUG_BUFFER) { 1529 System.out.print("(skipNewlines: "); 1530 printBuffer(); 1531 System.out.println(); 1532 } 1533 if (fCurrentEntity.offset == fCurrentEntity.length) { 1534 if (load(0) == -1) { 1535 if (DEBUG_BUFFER) { 1536 System.out.print(")skipNewlines: "); 1537 printBuffer(); 1538 System.out.println(); 1539 } 1540 return 0; 1541 } 1542 } 1543 char c = fCurrentEntity.buffer[fCurrentEntity.offset]; 1544 int newlines = 0; 1545 int offset = fCurrentEntity.offset; 1546 if (c == '\n' || c == '\r') { 1547 do { 1548 c = fCurrentEntity.buffer[fCurrentEntity.offset++]; 1549 if (c == '\r') { 1550 newlines++; 1551 if (fCurrentEntity.offset == fCurrentEntity.length) { 1552 offset = 0; 1553 fCurrentEntity.offset = newlines; 1554 if (load(newlines) == -1) { 1555 break; 1556 } 1557 } 1558 if (fCurrentEntity.buffer[fCurrentEntity.offset] == '\n') { 1559 fCurrentEntity.offset++; 1560 offset++; 1561 } 1562 } 1563 else if (c == '\n') { 1564 newlines++; 1565 if (fCurrentEntity.offset == fCurrentEntity.length) { 1566 offset = 0; 1567 fCurrentEntity.offset = newlines; 1568 if (load(newlines) == -1) { 1569 break; 1570 } 1571 } 1572 } 1573 else { 1574 fCurrentEntity.offset--; 1575 break; 1576 } 1577 } while (newlines < maxlines && 1578 fCurrentEntity.offset < fCurrentEntity.length - 1); 1579 fCurrentEntity.lineNumber += newlines; 1580 fCurrentEntity.columnNumber = 1; 1581 } 1582 if (DEBUG_BUFFER) { 1583 System.out.print(")skipNewlines: "); 1584 printBuffer(); 1585 System.out.print(" -> "); 1586 System.out.print(newlines); 1587 System.out.println(); 1588 } 1589 return newlines; 1590 } 1592 1594 1595 protected final Augmentations locationAugs() { 1596 HTMLAugmentations augs = null; 1597 if (fAugmentations) { 1598 fLocationItem.setValues(fBeginLineNumber, fBeginColumnNumber, 1599 fEndLineNumber, fEndColumnNumber); 1600 augs = fInfosetAugs; 1601 augs.removeAllItems(); 1602 augs.putItem(AUGMENTATIONS, fLocationItem); 1603 } 1604 return augs; 1605 } 1607 1608 protected final Augmentations synthesizedAugs() { 1609 HTMLAugmentations augs = null; 1610 if (fAugmentations) { 1611 augs = fInfosetAugs; 1612 augs.removeAllItems(); 1613 augs.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM); 1614 } 1615 return augs; 1616 } 1618 1619 protected final XMLResourceIdentifier resourceId() { 1620 1621 fResourceId.clear(); 1622 return fResourceId; 1623 1629 } 1631 1635 1636 protected static boolean builtinXmlRef(String name) { 1637 return name.equals("amp") || name.equals("lt") || name.equals("gt") || 1638 name.equals("quot") || name.equals("apos"); 1639 } 1641 1645 1646 private void printBuffer() { 1647 if (DEBUG_BUFFER) { 1648 System.out.print('['); 1649 System.out.print(fCurrentEntity.length); 1650 System.out.print(' '); 1651 System.out.print(fCurrentEntity.offset); 1652 if (fCurrentEntity.length > 0) { 1653 System.out.print(" \""); 1654 for (int i = 0; i < fCurrentEntity.length; i++) { 1655 if (i == fCurrentEntity.offset) { 1656 System.out.print('^'); 1657 } 1658 char c = fCurrentEntity.buffer[i]; 1659 switch (c) { 1660 case '\r': { 1661 System.out.print("\\r"); 1662 break; 1663 } 1664 case '\n': { 1665 System.out.print("\\n"); 1666 break; 1667 } 1668 case '\t': { 1669 System.out.print("\\t"); 1670 break; 1671 } 1672 case '"': { 1673 System.out.print("\\\""); 1674 break; 1675 } 1676 default: { 1677 System.out.print(c); 1678 } 1679 } 1680 } 1681 if (fCurrentEntity.offset == fCurrentEntity.length) { 1682 System.out.print('^'); 1683 } 1684 System.out.print('"'); 1685 } 1686 System.out.print(']'); 1687 } 1688 } 1690 1694 1699 public interface Scanner { 1700 1701 1705 1716 public boolean scan(boolean complete) throws IOException ; 1717 1718 } 1720 1724 1729 public static class CurrentEntity { 1730 1731 1735 1736 public Reader stream; 1737 1738 1739 public String encoding; 1740 1741 1742 public String publicId; 1743 1744 1745 public String baseSystemId; 1746 1747 1748 public String literalSystemId; 1749 1750 1751 public String expandedSystemId; 1752 1753 1754 public int lineNumber = 1; 1755 1756 1757 public int columnNumber = 1; 1758 1759 1761 1762 public char[] buffer = new char[DEFAULT_BUFFER_SIZE]; 1763 1764 1765 public int offset = 0; 1766 1767 1768 public int length = 0; 1769 1770 1774 1775 public CurrentEntity(Reader stream, String encoding, 1776 String publicId, String baseSystemId, 1777 String literalSystemId, String expandedSystemId) { 1778 this.stream = stream; 1779 this.encoding = encoding; 1780 this.publicId = publicId; 1781 this.baseSystemId = baseSystemId; 1782 this.literalSystemId = literalSystemId; 1783 this.expandedSystemId = expandedSystemId; 1784 } 1786 } 1788 1793 public class ContentScanner 1794 implements Scanner { 1795 1796 1800 1802 1803 private final QName fQName = new QName(); 1804 1805 1806 private final XMLAttributesImpl fAttributes = new XMLAttributesImpl(); 1807 1808 1812 1813 public boolean scan(boolean complete) throws IOException { 1814 boolean next; 1815 do { 1816 try { 1817 next = false; 1818 switch (fScannerState) { 1819 case STATE_CONTENT: { 1820 fBeginLineNumber = fCurrentEntity.lineNumber; 1821 fBeginColumnNumber = fCurrentEntity.columnNumber; 1822 int c = read(); 1823 if (c == '<') { 1824 setScannerState(STATE_MARKUP_BRACKET); 1825 next = true; 1826 } 1827 else if (c == '&') { 1828 scanEntityRef(fStringBuffer, true); 1829 } 1830 else if (c == -1) { 1831 throw new EOFException (); 1832 } 1833 else { 1834 fCurrentEntity.offset--; 1835 fCurrentEntity.columnNumber--; 1836 scanCharacters(); 1837 } 1838 break; 1839 } 1840 case STATE_MARKUP_BRACKET: { 1841 int c = read(); 1842 if (c == '!') { 1843 if (skip("--", false)) { 1844 scanComment(); 1845 } 1846 else if (skip("[CDATA[", false)) { 1847 scanCDATA(); 1848 } 1849 else if (skip("DOCTYPE", false)) { 1850 scanDoctype(); 1851 } 1852 else { 1853 if (fReportErrors) { 1854 fErrorReporter.reportError("HTML1002", null); 1855 } 1856 skipMarkup(true); 1857 } 1858 } 1859 else if (c == '?') { 1860 scanPI(); 1861 } 1862 else if (c == '/') { 1863 scanEndElement(); 1864 } 1865 else if (c == -1) { 1866 if (fReportErrors) { 1867 fErrorReporter.reportError("HTML1003", null); 1868 } 1869 if (fDocumentHandler != null && fElementCount >= fElementDepth) { 1870 fStringBuffer.clear(); 1871 fStringBuffer.append('<'); 1872 fDocumentHandler.characters(fStringBuffer, null); 1873 } 1874 throw new EOFException (); 1875 } 1876 else { 1877 fCurrentEntity.offset--; 1878 fCurrentEntity.columnNumber--; 1879 fElementCount++; 1880 fSingleBoolean[0] = false; 1881 String ename = scanStartElement(fSingleBoolean); 1882 if (ename != null && !fSingleBoolean[0] && 1883 HTMLElements.getElement(ename).isSpecial()) { 1884 setScanner(fSpecialScanner.setElementName(ename)); 1885 setScannerState(STATE_CONTENT); 1886 return true; 1887 } 1888 } 1889 setScannerState(STATE_CONTENT); 1890 break; 1891 } 1892 case STATE_START_DOCUMENT: { 1893 if (fDocumentHandler != null && fElementCount >= fElementDepth) { 1894 if (DEBUG_CALLBACKS) { 1895 System.out.println("startDocument()"); 1896 } 1897 XMLLocator locator = HTMLScanner.this; 1898 String encoding = fIANAEncoding; 1899 Augmentations augs = locationAugs(); 1900 try { 1901 Class cls = fDocumentHandler.getClass(); 1905 Class [] types = { 1906 XMLLocator.class, String .class, 1907 NamespaceContext.class, Augmentations.class 1908 }; 1909 Method method = cls.getMethod("startDocument", types); 1910 NamespaceContext nscontext = new NamespaceSupport(); 1911 Object [] params = { 1912 locator, encoding, 1913 nscontext, augs 1914 }; 1915 method.invoke(fDocumentHandler, params); 1916 } 1917 catch (IllegalAccessException e) { 1918 throw new XNIException(e); 1919 } 1920 catch (InvocationTargetException e) { 1921 throw new XNIException(e); 1922 } 1923 catch (NoSuchMethodException e) { 1924 try { 1925 Class cls = fDocumentHandler.getClass(); 1929 Class [] types = { 1930 XMLLocator.class, String .class, Augmentations.class 1931 }; 1932 Method method = cls.getMethod("startDocument", types); 1933 Object [] params = { 1934 locator, encoding, augs 1935 }; 1936 method.invoke(fDocumentHandler, params); 1937 } 1938 catch (IllegalAccessException ex) { 1939 throw new XNIException(ex); 1941 } 1942 catch (InvocationTargetException ex) { 1943 throw new XNIException(ex); 1945 } 1946 catch (NoSuchMethodException ex) { 1947 throw new XNIException(ex); 1949 } 1950 } 1951 } 1952 if (fInsertDoctype && fDocumentHandler != null) { 1953 String root = HTMLElements.getElement(HTMLElements.HTML).name; 1954 root = modifyName(root, fNamesElems); 1955 String pubid = fDoctypePubid; 1956 String sysid = fDoctypeSysid; 1957 fDocumentHandler.doctypeDecl(root, pubid, sysid, 1958 synthesizedAugs()); 1959 } 1960 setScannerState(STATE_CONTENT); 1961 break; 1962 } 1963 case STATE_END_DOCUMENT: { 1964 if (fDocumentHandler != null && fElementCount >= fElementDepth) { 1965 if (DEBUG_CALLBACKS) { 1966 System.out.println("endDocument()"); 1967 } 1968 fEndLineNumber = fCurrentEntity.lineNumber; 1969 fEndColumnNumber = fCurrentEntity.columnNumber; 1970 fDocumentHandler.endDocument(locationAugs()); 1971 } 1972 return false; 1973 } 1974 default: { 1975 throw new RuntimeException ("unknown scanner state: "+fScannerState); 1976 } 1977 } 1978 } 1979 catch (EOFException e) { 1980 if (fCurrentEntityStack.empty()) { 1981 setScannerState(STATE_END_DOCUMENT); 1982 } 1983 else { 1984 fCurrentEntity = (CurrentEntity)fCurrentEntityStack.pop(); 1985 } 1986 next = true; 1987 } 1988 } while (next || complete); 1989 return true; 1990 } 1992 1996 1997 protected void scanCharacters() throws IOException { 1998 if (DEBUG_BUFFER) { 1999 System.out.print("(scanCharacters: "); 2000 printBuffer(); 2001 System.out.println(); 2002 } 2003 int newlines = skipNewlines(); 2004 if (newlines == 0 && fCurrentEntity.offset == fCurrentEntity.length) { 2005 if (DEBUG_BUFFER) { 2006 System.out.print(")scanCharacters: "); 2007 printBuffer(); 2008 System.out.println(); 2009 } 2010 return; 2011 } 2012 char c; 2013 int offset = fCurrentEntity.offset - newlines; 2014 for (int i = offset; i < fCurrentEntity.offset; i++) { 2015 fCurrentEntity.buffer[i] = '\n'; 2016 } 2017 while (fCurrentEntity.offset < fCurrentEntity.length) { 2018 c = fCurrentEntity.buffer[fCurrentEntity.offset]; 2019 if (c == '<' || c == '&' || c == '\n' || c == '\r') { 2020 break; 2021 } 2022 fCurrentEntity.offset++; 2023 fCurrentEntity.columnNumber++; 2024 } 2025 if (fCurrentEntity.offset > offset && 2026 fDocumentHandler != null && fElementCount >= fElementDepth) { 2027 fString.setValues(fCurrentEntity.buffer, offset, fCurrentEntity.offset - offset); 2028 if (DEBUG_CALLBACKS) { 2029 System.out.println("characters("+fString+")"); 2030 } 2031 fEndLineNumber = fCurrentEntity.lineNumber; 2032 fEndColumnNumber = fCurrentEntity.columnNumber; 2033 fDocumentHandler.characters(fString, locationAugs()); 2034 } 2035 if (DEBUG_BUFFER) { 2036 System.out.print(")scanCharacters: "); 2037 printBuffer(); 2038 System.out.println(); 2039 } 2040 } 2042 2043 protected void scanCDATA() throws IOException { 2044 if (DEBUG_BUFFER) { 2045 System.out.print("(scanCDATA: "); 2046 printBuffer(); 2047 System.out.println(); 2048 } 2049 fStringBuffer.clear(); 2050 if (fCDATASections) { 2051 if (fDocumentHandler != null && fElementCount >= fElementDepth) { 2052 fEndLineNumber = fCurrentEntity.lineNumber; 2053 fEndColumnNumber = fCurrentEntity.columnNumber; 2054 if (DEBUG_CALLBACKS) { 2055 System.out.println("startCDATA()"); 2056 } 2057 fDocumentHandler.startCDATA(locationAugs()); 2058 } 2059 } 2060 else { 2061 fStringBuffer.append("[CDATA["); 2062 } 2063 boolean eof = scanMarkupContent(fStringBuffer, ']'); 2064 if (!fCDATASections) { 2065 fStringBuffer.append("]]"); 2066 } 2067 if (fDocumentHandler != null && fElementCount >= fElementDepth) { 2068 fEndLineNumber = fCurrentEntity.lineNumber; 2069 fEndColumnNumber = fCurrentEntity.columnNumber; 2070 if (fCDATASections) { 2071 if (DEBUG_CALLBACKS) { 2072 System.out.println("characters("+fStringBuffer+")"); 2073 } 2074 fDocumentHandler.characters(fStringBuffer, locationAugs()); 2075 if (DEBUG_CALLBACKS) { 2076 System.out.println("endCDATA()"); 2077 } 2078 fDocumentHandler.endCDATA(locationAugs()); 2079 } 2080 else { 2081 if (DEBUG_CALLBACKS) { 2082 System.out.println("comment("+fStringBuffer+")"); 2083 } 2084 fDocumentHandler.comment(fStringBuffer, locationAugs()); 2085 } 2086 } 2087 if (DEBUG_BUFFER) { 2088 System.out.print(")scanCDATA: "); 2089 printBuffer(); 2090 System.out.println(); 2091 } 2092 if (eof) { 2093 throw new EOFException (); 2094 } 2095 } 2097 2098 protected void scanComment() throws IOException { 2099 if (DEBUG_BUFFER) { 2100 System.out.print("(scanComment: "); 2101 printBuffer(); 2102 System.out.println(); 2103 } 2104 fStringBuffer.clear(); 2105 boolean eof = scanMarkupContent(fStringBuffer, '-'); 2106 if (fDocumentHandler != null && fElementCount >= fElementDepth) { 2107 if (DEBUG_CALLBACKS) { 2108 System.out.println("comment("+fStringBuffer+")"); 2109 } 2110 fEndLineNumber = fCurrentEntity.lineNumber; 2111 fEndColumnNumber = fCurrentEntity.columnNumber; 2112 fDocumentHandler.comment(fStringBuffer, locationAugs()); 2113 } 2114 if (DEBUG_BUFFER) { 2115 System.out.print(")scanComment: "); 2116 printBuffer(); 2117 System.out.println(); 2118 } 2119 if (eof) { 2120 throw new EOFException (); 2121 } 2122 } 2124 2125 protected boolean scanMarkupContent(XMLStringBuffer buffer, 2126 char cend) throws IOException { 2127 int c = -1; 2128 OUTER: while (true) { 2129 c = read(); 2130 if (c == cend) { 2131 int count = 1; 2132 while (true) { 2133 c = read(); 2134 if (c == cend) { 2135 count++; 2136 continue; 2137 } 2138 break; 2139 } 2140 if (c == -1) { 2141 if (fReportErrors) { 2142 fErrorReporter.reportError("HTML1007", null); 2143 } 2144 break OUTER; 2145 } 2146 if (count < 2) { 2147 buffer.append(cend); 2148 fCurrentEntity.offset--; 2150 fCurrentEntity.columnNumber--; 2151 continue; 2153 } 2154 if (c != '>') { 2155 for (int i = 0; i < count; i++) { 2156 buffer.append(cend); 2157 } 2158 fCurrentEntity.offset--; 2159 fCurrentEntity.columnNumber--; 2160 continue; 2161 } 2162 for (int i = 0; i < count - 2; i++) { 2163 buffer.append(cend); 2164 } 2165 break; 2166 } 2167 else if (c == '\n' || c == '\r') { 2168 fCurrentEntity.offset--; 2169 fCurrentEntity.columnNumber--; 2170 int newlines = skipNewlines(); 2171 for (int i = 0; i < newlines; i++) { 2172 buffer.append('\n'); 2173 } 2174 continue; 2175 } 2176 else if (c == -1) { 2177 if (fReportErrors) { 2178 fErrorReporter.reportError("HTML1007", null); 2179 } 2180 break; 2181 } 2182 buffer.append((char)c); 2183 } 2184 return c == -1; 2185 } 2187 2188 protected void scanPI() throws IOException { 2189 if (DEBUG_BUFFER) { 2190 System.out.print("(scanPI: "); 2191 printBuffer(); 2192 System.out.println(); 2193 } 2194 if (fReportErrors) { 2195 fErrorReporter.reportWarning("HTML1008", null); 2196 } 2197 2198 String target = scanName(); 2200 if (target != null && !target.equalsIgnoreCase("xml")) { 2201 while (true) { 2202 int c = read(); 2203 if (c == '\r' || c == '\n') { 2204 fCurrentEntity.lineNumber++; 2205 fCurrentEntity.columnNumber = 1; 2206 if (c == '\r') { 2207 c = read(); 2208 if (c != '\n') { 2209 fCurrentEntity.offset--; 2210 } 2211 } 2212 continue; 2213 } 2214 if (c == -1) { 2215 break; 2216 } 2217 if (c != ' ' && c != '\t') { 2218 fCurrentEntity.offset--; 2219 fCurrentEntity.columnNumber--; 2220 break; 2221 } 2222 } 2223 fStringBuffer.clear(); 2224 while (true) { 2225 int c = read(); 2226 if (c == '?' || c == '/') { 2227 char c0 = (char)c; 2228 c = read(); 2229 if (c == '>') { 2230 break; 2231 } 2232 else { 2233 fStringBuffer.append(c0); 2234 fCurrentEntity.offset--; 2235 fCurrentEntity.columnNumber--; 2236 continue; 2237 } 2238 } 2239 else if (c == '\r' || c == '\n') { 2240 fStringBuffer.append('\n'); 2241 fCurrentEntity.lineNumber++; 2242 fCurrentEntity.columnNumber = 1; 2243 if (c == '\r') { 2244 c = read(); 2245 if (c != '\n') { 2246 fCurrentEntity.offset--; 2247 } 2248 } 2249 continue; 2250 } 2251 else if (c == -1) { 2252 break; 2253 } 2254 else { 2255 fStringBuffer.append((char)c); 2256 } 2257 } 2258 XMLString data = fStringBuffer; 2259 if (fDocumentHandler != null) { 2260 fEndLineNumber = fCurrentEntity.lineNumber; 2261 fEndColumnNumber = fCurrentEntity.columnNumber; 2262 fDocumentHandler.processingInstruction(target, data, locationAugs()); 2263 } 2264 } 2265 2266 else { 2268 int beginLineNumber = fBeginLineNumber; 2269 int beginColumnNumber = fBeginColumnNumber; 2270 fAttributes.removeAllAttributes(); 2271 int aindex = 0; 2272 while (scanPseudoAttribute(fAttributes)) { 2273 fAttributes.getName(aindex,fQName); 2274 fQName.rawname = fQName.rawname.toLowerCase(); 2275 fAttributes.setName(aindex,fQName); 2276 aindex++; 2277 } 2278 if (fDocumentHandler != null) { 2279 String version = fAttributes.getValue("version"); 2280 String encoding = fAttributes.getValue("encoding"); 2281 String standalone = fAttributes.getValue("standalone"); 2282 2283 fBeginLineNumber = beginLineNumber; 2284 fBeginColumnNumber = beginColumnNumber; 2285 fEndLineNumber = fCurrentEntity.lineNumber; 2286 fEndColumnNumber = fCurrentEntity.columnNumber; 2287 fDocumentHandler.xmlDecl(version, encoding, standalone, 2288 locationAugs()); 2289 } 2290 } 2291 2292 if (DEBUG_BUFFER) { 2293 System.out.print(")scanPI: "); 2294 printBuffer(); 2295 System.out.println(); 2296 } 2297 } 2299 2305 protected String scanStartElement(boolean[] empty) throws IOException { 2306 String ename = scanName(); 2307 int length = ename != null ? ename.length() : 0; 2308 int c = length > 0 ? ename.charAt(0) : -1; 2309 if (length == 0 || !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) { 2310 if (fReportErrors) { 2311 fErrorReporter.reportError("HTML1009", null); 2312 } 2313 if (fDocumentHandler != null && fElementCount >= fElementDepth) { 2314 fStringBuffer.clear(); 2315 fStringBuffer.append('<'); 2316 if (length > 0) { 2317 fStringBuffer.append(ename); 2318 } 2319 fDocumentHandler.characters(fStringBuffer, null); 2320 } 2321 return null; 2322 } 2323 ename = modifyName(ename, fNamesElems); 2324 fAttributes.removeAllAttributes(); 2325 boolean print = false; 2326 int beginLineNumber = fBeginLineNumber; 2327 int beginColumnNumber = fBeginColumnNumber; 2328 while (scanAttribute(fAttributes, empty)) { 2329 } 2331 fBeginLineNumber = beginLineNumber; 2332 fBeginColumnNumber = beginColumnNumber; 2333 if (fByteStream != null && fElementDepth == -1) { 2334 if (ename.equalsIgnoreCase("META")) { 2335 if (DEBUG_CHARSET) { 2336 System.out.println("+++ <META>"); 2337 } 2338 String httpEquiv = getValue(fAttributes, "http-equiv"); 2339 if (httpEquiv != null && httpEquiv.equalsIgnoreCase("content-type")) { 2340 if (DEBUG_CHARSET) { 2341 System.out.println("+++ @content-type: \""+httpEquiv+'"'); 2342 } 2343 String content = getValue(fAttributes, "content"); 2344 int index1 = content != null ? content.toLowerCase().indexOf("charset=") : -1; 2345 if (index1 != -1 && !fIgnoreSpecifiedCharset) { 2346 int index2 = content.indexOf(';', index1); 2347 String charset = index2 != -1 ? content.substring(index1+8, index2) : content.substring(index1+8); 2348 try { 2349 String ianaEncoding = charset; 2350 String javaEncoding = EncodingMap.getIANA2JavaMapping(ianaEncoding.toUpperCase()); 2351 if (DEBUG_CHARSET) { 2352 System.out.println("+++ ianaEncoding: "+ianaEncoding); 2353 System.out.println("+++ javaEncoding: "+javaEncoding); 2354 } 2355 if (javaEncoding == null) { 2356 javaEncoding = ianaEncoding; 2357 if (fReportErrors) { 2358 fErrorReporter.reportError("HTML1001", new Object []{ianaEncoding}); 2359 } 2360 } 2361 fIso8859Encoding = ianaEncoding == null 2362 || ianaEncoding.toUpperCase().startsWith("ISO-8859") 2363 || ianaEncoding.equalsIgnoreCase(fDefaultIANAEncoding); 2364 fCurrentEntity.stream = new InputStreamReader (fByteStream, javaEncoding); 2365 fByteStream.playback(); 2366 fElementDepth = fElementCount; 2367 fElementCount = 0; 2368 fCurrentEntity.offset = fCurrentEntity.length = 0; 2369 fCurrentEntity.lineNumber = 1; 2370 fCurrentEntity.columnNumber = 1; 2371 } 2372 catch (UnsupportedEncodingException e) { 2373 if (fReportErrors) { 2374 fErrorReporter.reportError("HTML1010", new Object []{charset}); 2375 } 2376 fByteStream.clear(); 2380 } 2381 } 2382 } 2383 } 2384 else if (ename.equalsIgnoreCase("BODY")) { 2385 fByteStream.clear(); 2386 } 2387 else { 2388 HTMLElements.Element element = HTMLElements.getElement(ename); 2389 if (element.parent != null && element.parent.length > 0) { 2390 if (element.parent[0].code == HTMLElements.BODY) { 2391 fByteStream.clear(); 2392 } 2393 } 2394 } 2395 } 2396 if (fDocumentHandler != null && fElementCount >= fElementDepth) { 2397 fQName.setValues(null, ename, ename, null); 2398 if (DEBUG_CALLBACKS) { 2399 System.out.println("startElement("+fQName+','+fAttributes+")"); 2400 } 2401 fEndLineNumber = fCurrentEntity.lineNumber; 2402 fEndColumnNumber = fCurrentEntity.columnNumber; 2403 if (empty[0]) { 2404 fDocumentHandler.emptyElement(fQName, fAttributes, locationAugs()); 2405 } 2406 else { 2407 fDocumentHandler.startElement(fQName, fAttributes, locationAugs()); 2408 } 2409 } 2410 return ename; 2411 } 2413 2421 protected boolean scanAttribute(XMLAttributesImpl attributes, 2422 boolean[] empty) 2423 throws IOException { 2424 return scanAttribute(attributes,empty,'/'); 2425 } 2427 2432 protected boolean scanPseudoAttribute(XMLAttributesImpl attributes) 2433 throws IOException { 2434 return scanAttribute(attributes,fSingleBoolean,'?'); 2435 } 2437 2447 protected boolean scanAttribute(XMLAttributesImpl attributes, 2448 boolean[] empty, char endc) 2449 throws IOException { 2450 boolean skippedSpaces = skipSpaces(); 2451 fBeginLineNumber = fCurrentEntity.lineNumber; 2452 fBeginColumnNumber = fCurrentEntity.columnNumber; 2453 int c = read(); 2454 if (c == -1) { 2455 if (fReportErrors) { 2456 fErrorReporter.reportError("HTML1007", null); 2457 } 2458 throw new EOFException (); 2459 } 2460 if (c == '>') { 2461 return false; 2462 } 2463 fCurrentEntity.offset--; 2464 fCurrentEntity.columnNumber--; 2465 String aname = scanName(); 2466 if (aname == null) { 2467 if (fReportErrors) { 2468 fErrorReporter.reportError("HTML1011", null); 2469 } 2470 empty[0] = skipMarkup(false); 2471 return false; 2472 } 2473 if (!skippedSpaces && fReportErrors) { 2474 fErrorReporter.reportError("HTML1013", new Object [] { aname }); 2475 } 2476 aname = modifyName(aname, fNamesAttrs); 2477 skipSpaces(); 2478 c = read(); 2479 if (c == -1) { 2480 if (fReportErrors) { 2481 fErrorReporter.reportError("HTML1007", null); 2482 } 2483 throw new EOFException (); 2484 } 2485 if (c == '/' || c == '>') { 2486 fQName.setValues(null, aname, aname, null); 2487 attributes.addAttribute(fQName, "CDATA", ""); 2488 attributes.setSpecified(attributes.getLength()-1, true); 2489 if (fAugmentations) { 2490 addLocationItem(attributes, attributes.getLength() - 1); 2491 } 2492 if (c == '/') { 2493 fCurrentEntity.offset--; 2494 fCurrentEntity.columnNumber--; 2495 empty[0] = skipMarkup(false); 2496 } 2497 return false; 2498 } 2499 2516 if (c == '=') { 2517 skipSpaces(); 2518 c = read(); 2519 if (c == -1) { 2520 if (fReportErrors) { 2521 fErrorReporter.reportError("HTML1007", null); 2522 } 2523 throw new EOFException (); 2524 } 2525 if (c == '>') { 2527 fQName.setValues(null, aname, aname, null); 2528 attributes.addAttribute(fQName, "CDATA", ""); 2529 attributes.setSpecified(attributes.getLength()-1, true); 2530 if (fAugmentations) { 2531 addLocationItem(attributes, attributes.getLength() - 1); 2532 } 2533 return false; 2534 } 2535 fStringBuffer.clear(); 2536 fNonNormAttr.clear(); 2537 if (c != '\'' && c != '"') { 2538 fCurrentEntity.offset--; 2539 fCurrentEntity.columnNumber--; 2540 while (true) { 2541 c = read(); 2542 if (Character.isSpace((char)c) || c == '>') { 2544 fCurrentEntity.offset--; 2546 fCurrentEntity.columnNumber--; 2547 break; 2548 } 2549 if (c == -1) { 2550 if (fReportErrors) { 2551 fErrorReporter.reportError("HTML1007", null); 2552 } 2553 throw new EOFException (); 2554 } 2555 if (c == '&') { 2556 int ce = scanEntityRef(fStringBuffer2, false); 2557 if (ce != -1) { 2558 fStringBuffer.append((char)ce); 2559 } 2560 else { 2561 fStringBuffer.append(fStringBuffer2); 2562 } 2563 fNonNormAttr.append(fStringBuffer2); 2564 } 2565 else { 2566 fStringBuffer.append((char)c); 2567 fNonNormAttr.append((char)c); 2568 } 2569 } 2570 fQName.setValues(null, aname, aname, null); 2571 String avalue = fStringBuffer.toString(); 2572 attributes.addAttribute(fQName, "CDATA", avalue); 2573 2574 int lastattr = attributes.getLength()-1; 2575 attributes.setSpecified(lastattr, true); 2576 attributes.setNonNormalizedValue(lastattr, fNonNormAttr.toString()); 2577 if (fAugmentations) { 2578 addLocationItem(attributes, attributes.getLength() - 1); 2579 } 2580 return true; 2581 } 2582 char quote = (char)c; 2583 do { 2584 c = read(); 2585 if (c == -1) { 2586 if (fReportErrors) { 2587 fErrorReporter.reportError("HTML1007", null); 2588 } 2589 throw new EOFException (); 2590 } 2591 if (c == '&') { 2592 int ce = scanEntityRef(fStringBuffer2, false); 2593 if (ce != -1) { 2594 fStringBuffer.append((char)ce); 2595 } 2596 else { 2597 fStringBuffer.append(fStringBuffer2); 2598 } 2599 fNonNormAttr.append(fStringBuffer2); 2600 } 2601 else if (c == '\t') { 2602 fStringBuffer.append(' '); 2603 fNonNormAttr.append('\t'); 2604 } 2605 else if (c == '\r' || c == '\n') { 2606 fCurrentEntity.lineNumber++; 2607 fCurrentEntity.columnNumber = 0; 2608 if (c == '\r') { 2609 int c2 = read(); 2610 if (c2 != '\n') { 2611 fCurrentEntity.offset--; 2612 fCurrentEntity.columnNumber--; 2613 } 2614 else { 2615 fNonNormAttr.append('\r'); 2616 c = c2; 2617 } 2618 } 2619 fStringBuffer.append(' '); 2620 fNonNormAttr.append((char)c); 2621 } 2622 else if (c != quote) { 2623 fStringBuffer.append((char)c); 2624 fNonNormAttr.append((char)c); 2625 } 2626 } while (c != quote); 2627 fQName.setValues(null, aname, aname, null); 2628 String avalue = fStringBuffer.toString(); 2629 attributes.addAttribute(fQName, "CDATA", avalue); 2630 2631 int lastattr = attributes.getLength()-1; 2632 attributes.setSpecified(lastattr, true); 2633 attributes.setNonNormalizedValue(lastattr, fNonNormAttr.toString()); 2634 if (fAugmentations) { 2635 addLocationItem(attributes, attributes.getLength() - 1); 2636 } 2637 } 2638 else { 2639 fQName.setValues(null, aname, aname, null); 2640 attributes.addAttribute(fQName, "CDATA", ""); 2641 attributes.setSpecified(attributes.getLength()-1, true); 2642 fCurrentEntity.offset--; 2643 fCurrentEntity.columnNumber--; 2644 if (fAugmentations) { 2645 addLocationItem(attributes, attributes.getLength() - 1); 2646 } 2647 } 2648 return true; 2649 } 2651 2652 protected void addLocationItem(XMLAttributes attributes, int index) { 2653 fEndLineNumber = fCurrentEntity.lineNumber; 2654 fEndColumnNumber = fCurrentEntity.columnNumber; 2655 LocationItem locationItem = new LocationItem(); 2656 locationItem.setValues(fBeginLineNumber, fBeginColumnNumber, 2657 fEndLineNumber, fEndColumnNumber); 2658 Augmentations augs = attributes.getAugmentations(index); 2659 augs.putItem(AUGMENTATIONS, locationItem); 2660 } 2662 2663 protected void scanEndElement() throws IOException { 2664 String ename = scanName(); 2665 if (fReportErrors && ename == null) { 2666 fErrorReporter.reportError("HTML1012", null); 2667 } 2668 skipMarkup(false); 2669 if (ename != null) { 2670 ename = modifyName(ename, fNamesElems); 2671 if (fDocumentHandler != null && fElementCount >= fElementDepth) { 2672 fQName.setValues(null, ename, ename, null); 2673 if (DEBUG_CALLBACKS) { 2674 System.out.println("endElement("+fQName+")"); 2675 } 2676 fEndLineNumber = fCurrentEntity.lineNumber; 2677 fEndColumnNumber = fCurrentEntity.columnNumber; 2678 fDocumentHandler.endElement(fQName, locationAugs()); 2679 } 2680 } 2681 } 2683 } 2685 2692 public class SpecialScanner 2693 implements Scanner { 2694 2695 2699 2700 protected String fElementName; 2701 2702 2703 protected boolean fScript; 2704 2705 2706 protected boolean fStyle; 2707 2708 2709 protected boolean fTextarea; 2710 2711 2713 2714 private final QName fQName = new QName(); 2715 2716 2717 private final XMLStringBuffer fStringBuffer = new XMLStringBuffer(); 2718 2719 2723 2724 public Scanner setElementName(String ename) { 2725 fElementName = ename; 2726 fScript = fElementName.equalsIgnoreCase("SCRIPT"); 2727 fStyle = fElementName.equalsIgnoreCase("STYLE"); 2728 fTextarea = fElementName.equalsIgnoreCase("TEXTAREA"); 2729 return this; 2730 } 2732 2736 2737 public boolean scan(boolean complete) throws IOException { 2738 boolean next; 2739 do { 2740 try { 2741 next = false; 2742 int delimiter = -1; 2743 switch (fScannerState) { 2744 case STATE_CONTENT: { 2745 fBeginLineNumber = fCurrentEntity.lineNumber; 2746 fBeginColumnNumber = fCurrentEntity.columnNumber; 2747 int c = read(); 2748 if (c == '<') { 2749 setScannerState(STATE_MARKUP_BRACKET); 2750 continue; 2751 } 2752 if (c == '&') { 2753 if (fTextarea) { 2754 scanEntityRef(fStringBuffer, true); 2755 continue; 2756 } 2757 fStringBuffer.clear(); 2758 fStringBuffer.append('&'); 2759 } 2760 else if (c == -1) { 2761 if (fReportErrors) { 2762 fErrorReporter.reportError("HTML1007", null); 2763 } 2764 throw new EOFException (); 2765 } 2766 else { 2767 fCurrentEntity.offset--; 2768 fCurrentEntity.columnNumber--; 2769 fStringBuffer.clear(); 2770 } 2771 scanCharacters(fStringBuffer, -1); 2772 break; 2773 } case STATE_MARKUP_BRACKET: { 2775 int c = read(); 2776 if (c == '!') { 2777 if (skip("--", false)) { 2778 fStringBuffer.clear(); 2779 boolean strip = (fScript && fScriptStripCommentDelims) || 2780 (fStyle && fStyleStripCommentDelims); 2781 if (strip) { 2782 do { 2783 c = read(); 2784 if (c == '\r' || c == '\n') { 2785 fCurrentEntity.columnNumber--; 2786 fCurrentEntity.offset--; 2787 break; 2788 } 2789 } while (c != -1); 2790 skipNewlines(1); 2791 delimiter = '-'; 2792 } 2793 else { 2794 fStringBuffer.append("<!--"); 2795 } 2796 } 2797 else if (skip("[CDATA[", false)) { 2798 fStringBuffer.clear(); 2799 boolean strip = (fScript && fScriptStripCDATADelims) || 2800 (fStyle && fStyleStripCDATADelims); 2801 if (strip) { 2802 do { 2803 c = read(); 2804 if (c == '\r' || c == '\n') { 2805 fCurrentEntity.columnNumber--; 2806 fCurrentEntity.offset--; 2807 break; 2808 } 2809 } while (c != -1); 2810 skipNewlines(1); 2811 delimiter = ']'; 2812 } 2813 else { 2814 fStringBuffer.append("<![CDATA["); 2815 } 2816 } 2817 } 2818 else if (c == '/') { 2819 String ename = scanName(); 2820 if (ename != null) { 2821 if (ename.equalsIgnoreCase(fElementName)) { 2822 if (read() == '>') { 2823 ename = modifyName(ename, fNamesElems); 2824 if (fDocumentHandler != null && fElementCount >= fElementDepth) { 2825 fQName.setValues(null, ename, ename, null); 2826 if (DEBUG_CALLBACKS) { 2827 System.out.println("endElement("+fQName+")"); 2828 } 2829 fEndLineNumber = fCurrentEntity.lineNumber; 2830 fEndColumnNumber = fCurrentEntity.columnNumber; 2831 fDocumentHandler.endElement(fQName, locationAugs()); 2832 } 2833 setScanner(fContentScanner); 2834 setScannerState(STATE_CONTENT); 2835 return true; 2836 } 2837 else { 2838 fCurrentEntity.offset--; 2839 fCurrentEntity.columnNumber--; 2840 } 2841 } 2842 fStringBuffer.clear(); 2843 fStringBuffer.append("</"); 2844 fStringBuffer.append(ename); 2845 } 2846 else { 2847 fStringBuffer.clear(); 2848 fStringBuffer.append("</"); 2849 } 2850 } 2851 else { 2852 fStringBuffer.clear(); 2853 fStringBuffer.append('<'); 2854 fStringBuffer.append((char)c); 2855 } 2856 scanCharacters(fStringBuffer, delimiter); 2857 setScannerState(STATE_CONTENT); 2858 break; 2859 } } } catch (EOFException e) { 2863 setScanner(fContentScanner); 2864 if (fCurrentEntityStack.empty()) { 2865 setScannerState(STATE_END_DOCUMENT); 2866 } 2867 else { 2868 fCurrentEntity = (CurrentEntity)fCurrentEntityStack.pop(); 2869 setScannerState(STATE_CONTENT); 2870 } 2871 return true; 2872 } 2873 } while (next || complete); 2875 return true; 2876 } 2878 2882 2883 protected void scanCharacters(XMLStringBuffer buffer, 2884 int delimiter) throws IOException { 2885 if (DEBUG_BUFFER) { 2886 System.out.print("(scanCharacters, delimiter="+delimiter+": "); 2887 printBuffer(); 2888 System.out.println(); 2889 } 2890 boolean strip = (fScript && fScriptStripCommentDelims) || 2891 (fScript && fScriptStripCDATADelims) || 2892 (fStyle && fStyleStripCommentDelims) || 2893 (fStyle && fStyleStripCDATADelims); 2894 while (true) { 2895 int c = read(); 2896 if (c == -1 || (delimiter == -1 && (c == '<' || c == '&'))) { 2897 if (c != -1) { 2898 fCurrentEntity.offset--; 2899 fCurrentEntity.columnNumber--; 2900 } 2901 break; 2902 } 2903 else if (c == '\r' || c == '\n') { 2905 fCurrentEntity.offset--; 2906 fCurrentEntity.columnNumber--; 2907 int newlines = skipNewlines(); 2908 for (int i = 0; i < newlines; i++) { 2909 buffer.append('\n'); 2910 } 2911 } 2912 else if (delimiter != -1 && c == (char)delimiter) { 2913 int count = 0; 2914 do { 2915 count++; 2916 c = read(); 2917 } while (c == (char)delimiter); 2918 for (int i = strip && c == '>' ? 2 : 0; i < count; i++) { 2919 buffer.append((char)delimiter); 2920 } 2921 if (c == -1 || (count >= 2 && c == '>')) { 2922 if (!strip) { 2923 buffer.append((char)c); 2924 } 2925 break; 2926 } 2927 fCurrentEntity.offset--; 2928 fCurrentEntity.columnNumber--; 2929 } 2930 else { 2931 buffer.append((char)c); 2932 if (c == '\n') { 2933 fCurrentEntity.columnNumber = 1; 2934 fCurrentEntity.lineNumber++; 2935 } 2936 } 2937 } 2938 if (buffer.length > 0 && fDocumentHandler != null && fElementCount >= fElementDepth) { 2939 if (DEBUG_CALLBACKS) { 2940 System.out.println("characters("+buffer+")"); 2941 } 2942 fEndLineNumber = fCurrentEntity.lineNumber; 2943 fEndColumnNumber = fCurrentEntity.columnNumber; 2944 fDocumentHandler.characters(buffer, locationAugs()); 2945 } 2946 if (DEBUG_BUFFER) { 2947 System.out.print(")scanCharacters: "); 2948 printBuffer(); 2949 System.out.println(); 2950 } 2951 } 2953 } 2955 2978 public static class PlaybackInputStream 2979 extends FilterInputStream { 2980 2981 2985 2986 private static final boolean DEBUG_PLAYBACK = false; 2987 2988 2992 2994 2995 protected boolean fPlayback = false; 2996 2997 2998 protected boolean fCleared = false; 2999 3000 3001 protected boolean fDetected = false; 3002 3003 3005 3006 protected byte[] fByteBuffer = new byte[1024]; 3007 3008 3009 protected int fByteOffset = 0; 3010 3011 3012 protected int fByteLength = 0; 3013 3014 3015 public int fPushbackOffset = 0; 3016 3017 3018 public int fPushbackLength = 0; 3019 3020 3024 3025 public PlaybackInputStream(InputStream in) { 3026 super(in); 3027 } 3029 3033 3034 public void detectEncoding(String [] encodings) throws IOException { 3035 if (fDetected) { 3036 throw new IOException ("Should not detect encoding twice."); 3037 } 3038 fDetected = true; 3039 int b1 = read(); 3040 if (b1 == -1) { 3041 return; 3042 } 3043 int b2 = read(); 3044 if (b2 == -1) { 3045 fPushbackLength = 1; 3046 return; 3047 } 3048 if (b1 == 0xEF && b2 == 0xBB) { 3050 int b3 = read(); 3051 if (b3 == 0xBF) { 3052 fPushbackOffset = 3; 3053 encodings[0] = "UTF-8"; 3054 encodings[1] = "UTF8"; 3055 return; 3056 } 3057 fPushbackLength = 3; 3058 } 3059 if (b1 == 0xFF && b2 == 0xFE) { 3061 encodings[0] = "UTF-16"; 3062 encodings[1] = "UnicodeLittleUnmarked"; 3063 return; 3064 } 3065 else if (b1 == 0xFE && b2 == 0xFF) { 3067 encodings[0] = "UTF-16"; 3068 encodings[1] = "UnicodeBigUnmarked"; 3069 return; 3070 } 3071 fPushbackLength = 2; 3073 } 3075 3076 public void playback() { 3077 fPlayback = true; 3078 } 3080 3088 public void clear() { 3089 if (!fPlayback) { 3090 fCleared = true; 3091 fByteBuffer = null; 3092 } 3093 } 3095 3099 3100 public int read() throws IOException { 3101 if (DEBUG_PLAYBACK) { 3102 System.out.println("(read"); 3103 } 3104 if (fPushbackOffset < fPushbackLength) { 3105 return fByteBuffer[fPushbackOffset++]; 3106 } 3107 if (fCleared) { 3108 return in.read(); 3109 } 3110 if (fPlayback) { 3111 int c = fByteBuffer[fByteOffset++]; 3112 if (fByteOffset == fByteLength) { 3113 fCleared = true; 3114 fByteBuffer = null; 3115 } 3116 if (DEBUG_PLAYBACK) { 3117 System.out.println(")read -> "+(char)c); 3118 } 3119 return c; 3120 } 3121 int c = in.read(); 3122 if (c != -1) { 3123 if (fByteLength == fByteBuffer.length) { 3124 byte[] newarray = new byte[fByteLength + 1024]; 3125 System.arraycopy(fByteBuffer, 0, newarray, 0, fByteLength); 3126 fByteBuffer = newarray; 3127 } 3128 fByteBuffer[fByteLength++] = (byte)c; 3129 } 3130 if (DEBUG_PLAYBACK) { 3131 System.out.println(")read -> "+(char)c); 3132 } 3133 return c; 3134 } 3136 3137 public int read(byte[] array) throws IOException { 3138 return read(array, 0, array.length); 3139 } 3141 3142 public int read(byte[] array, int offset, int length) throws IOException { 3143 if (DEBUG_PLAYBACK) { 3144 System.out.println(")read("+offset+','+length+')'); 3145 } 3146 if (fPushbackOffset < fPushbackLength) { 3147 int count = fPushbackLength - fPushbackOffset; 3148 if (count > length) { 3149 count = length; 3150 } 3151 System.arraycopy(fByteBuffer, fPushbackOffset, array, offset, count); 3152 fPushbackOffset += count; 3153 return count; 3154 } 3155 if (fCleared) { 3156 return in.read(array, offset, length); 3157 } 3158 if (fPlayback) { 3159 if (fByteOffset + length > fByteLength) { 3160 length = fByteLength - fByteOffset; 3161 } 3162 System.arraycopy(fByteBuffer, fByteOffset, array, offset, length); 3163 fByteOffset += length; 3164 if (fByteOffset == fByteLength) { 3165 fCleared = true; 3166 fByteBuffer = null; 3167 } 3168 return length; 3169 } 3170 int count = in.read(array, offset, length); 3171 if (count != -1) { 3172 if (fByteLength + count > fByteBuffer.length) { 3173 byte[] newarray = new byte[fByteLength + count + 512]; 3174 System.arraycopy(fByteBuffer, 0, newarray, 0, fByteLength); 3175 fByteBuffer = newarray; 3176 } 3177 System.arraycopy(array, offset, fByteBuffer, fByteLength, count); 3178 fByteLength += count; 3179 } 3180 if (DEBUG_PLAYBACK) { 3181 System.out.println(")read("+offset+','+length+") -> "+count); 3182 } 3183 return count; 3184 } 3186 } 3188 3193 protected static class LocationItem 3194 implements HTMLEventInfo { 3195 3196 3200 3201 protected int fBeginLineNumber; 3202 3203 3204 protected int fBeginColumnNumber; 3205 3206 3207 protected int fEndLineNumber; 3208 3209 3210 protected int fEndColumnNumber; 3211 3212 3216 3217 public void setValues(int beginLine, int beginColumn, 3218 int endLine, int endColumn) { 3219 fBeginLineNumber = beginLine; 3220 fBeginColumnNumber = beginColumn; 3221 fEndLineNumber = endLine; 3222 fEndColumnNumber = endColumn; 3223 } 3225 3229 3231 3232 public int getBeginLineNumber() { 3233 return fBeginLineNumber; 3234 } 3236 3237 public int getBeginColumnNumber() { 3238 return fBeginColumnNumber; 3239 } 3241 3242 public int getEndLineNumber() { 3243 return fEndLineNumber; 3244 } 3246 3247 public int getEndColumnNumber() { 3248 return fEndColumnNumber; 3249 } 3251 3253 3254 public boolean isSynthesized() { 3255 return false; 3256 } 3258 3262 3263 public String toString() { 3264 StringBuffer str = new StringBuffer (); 3265 str.append(fBeginLineNumber); 3266 str.append(':'); 3267 str.append(fBeginColumnNumber); 3268 str.append(':'); 3269 str.append(fEndLineNumber); 3270 str.append(':'); 3271 str.append(fEndColumnNumber); 3272 return str.toString(); 3273 } 3275 } 3277} | Popular Tags |