| 1 7 25 26 42 43 package com.icl.saxon.aelfred; 44 45 import java.io.BufferedInputStream; 46 import java.io.CharConversionException; 47 import java.io.EOFException; 48 import java.io.InputStream; 49 import java.io.InputStreamReader; 50 import java.io.IOException; 51 import java.io.Reader; 52 import java.net.URL; 53 import java.net.URLConnection; 54 import java.util.Enumeration; 55 import java.util.Hashtable; 56 import java.util.Stack; 57 58 import org.xml.sax.SAXException; 59 60 61 63 74 final class XmlParser 75 { 76 private final static boolean USE_CHEATS = true; 78 79 private final static int DEFAULT_ATTR_COUNT = 23; 81 82 83 87 88 93 XmlParser () 95 { 96 cleanupVariables (); 97 } 98 99 100 105 void setHandler (SAXDriver handler) 107 { 108 this.handler = handler; 109 } 110 111 112 129 void doParse ( 131 String systemId, 132 String publicId, 133 Reader reader, 134 InputStream stream, 135 String encoding 136 ) throws Exception 137 { 138 if (handler == null) 139 throw new IllegalStateException ("no callback handler"); 140 141 basePublicId = publicId; 142 baseURI = systemId; 143 baseReader = reader; 144 baseInputStream = stream; 145 146 initializeVariables (); 147 148 setInternalEntity ("amp", "&"); 152 setInternalEntity ("lt", "<"); 153 setInternalEntity ("gt", ">"); 154 setInternalEntity ("apos", "'"); 155 setInternalEntity ("quot", """); 156 157 handler.startDocument (); 158 159 pushURL ("[document]", basePublicId, baseURI, 160 baseReader, baseInputStream, encoding, false); 161 162 try { 163 parseDocument (); 164 handler.endDocument (); 165 } finally { 166 if (baseReader != null) 167 try { baseReader.close (); 168 } catch (IOException e) { } 169 if (baseInputStream != null) 170 try { baseInputStream.close (); 171 } catch (IOException e) { } 172 if (is != null) 173 try { is.close (); 174 } catch (IOException e) { } 175 if (reader != null) 176 try { 177 reader.close (); 178 } catch (IOException e) { 179 } 180 cleanupVariables (); 181 } 182 } 183 184 185 189 193 197 public final static int CONTENT_UNDECLARED = 0; 198 199 203 public final static int CONTENT_ANY = 1; 204 205 209 public final static int CONTENT_EMPTY = 2; 210 211 215 public final static int CONTENT_MIXED = 3; 216 217 221 public final static int CONTENT_ELEMENTS = 4; 222 223 224 228 232 public final static int ENTITY_UNDECLARED = 0; 233 234 238 public final static int ENTITY_INTERNAL = 1; 239 240 244 public final static int ENTITY_NDATA = 2; 245 246 250 public final static int ENTITY_TEXT = 3; 251 252 253 257 261 public final static int ATTRIBUTE_UNDECLARED = 0; 262 263 267 public final static int ATTRIBUTE_CDATA = 1; 268 269 273 public final static int ATTRIBUTE_ID = 2; 274 275 279 public final static int ATTRIBUTE_IDREF = 3; 280 281 285 public final static int ATTRIBUTE_IDREFS = 4; 286 287 291 public final static int ATTRIBUTE_ENTITY = 5; 292 293 297 public final static int ATTRIBUTE_ENTITIES = 6; 298 299 303 public final static int ATTRIBUTE_NMTOKEN = 7; 304 305 309 public final static int ATTRIBUTE_NMTOKENS = 8; 310 311 315 public final static int ATTRIBUTE_ENUMERATED = 9; 316 317 321 public final static int ATTRIBUTE_NOTATION = 10; 322 323 324 329 332 private static Hashtable attributeTypeHash; 333 static { 334 attributeTypeHash = new Hashtable (13); 335 attributeTypeHash.put ("CDATA", new Integer (ATTRIBUTE_CDATA)); 336 attributeTypeHash.put ("ID", new Integer (ATTRIBUTE_ID)); 337 attributeTypeHash.put ("IDREF", new Integer (ATTRIBUTE_IDREF)); 338 attributeTypeHash.put ("IDREFS", new Integer (ATTRIBUTE_IDREFS)); 339 attributeTypeHash.put ("ENTITY", new Integer (ATTRIBUTE_ENTITY)); 340 attributeTypeHash.put ("ENTITIES", new Integer (ATTRIBUTE_ENTITIES)); 341 attributeTypeHash.put ("NMTOKEN", new Integer (ATTRIBUTE_NMTOKEN)); 342 attributeTypeHash.put ("NMTOKENS", new Integer (ATTRIBUTE_NMTOKENS)); 343 attributeTypeHash.put ("NOTATION", new Integer (ATTRIBUTE_NOTATION)); 344 } 345 346 347 private final static int ENCODING_EXTERNAL = 0; 351 private final static int ENCODING_UTF_8 = 1; 352 private final static int ENCODING_ISO_8859_1 = 2; 353 private final static int ENCODING_UCS_2_12 = 3; 354 private final static int ENCODING_UCS_2_21 = 4; 355 private final static int ENCODING_UCS_4_1234 = 5; 356 private final static int ENCODING_UCS_4_4321 = 6; 357 private final static int ENCODING_UCS_4_2143 = 7; 358 private final static int ENCODING_UCS_4_3412 = 8; 359 private final static int ENCODING_ASCII = 9; 360 361 362 366 370 public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30; 371 372 377 public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31; 378 379 383 public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32; 384 385 389 public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33; 390 391 396 public final static int ATTRIBUTE_DEFAULT_FIXED = 34; 397 398 399 private final static int INPUT_NONE = 0; 403 private final static int INPUT_INTERNAL = 1; 404 private final static int INPUT_STREAM = 3; 405 private final static int INPUT_BUFFER = 4; 406 private final static int INPUT_READER = 5; 407 408 409 private final static int LIT_ENTITY_REF = 2; 414 private final static int LIT_NORMALIZE = 4; 416 private final static int LIT_ATTRIBUTE = 8; 418 private final static int LIT_DISABLE_PE = 16; 420 private final static int LIT_DISABLE_CREF = 32; 422 private final static int LIT_DISABLE_EREF = 64; 424 private final static int LIT_ENTITY_CHECK = 128; 426 private final static int LIT_PUBID = 256; 428 429 private final static int CONTEXT_NORMAL = 0; 434 private final static int CONTEXT_LITERAL = 1; 435 436 437 441 442 449 private void error (String message, String textFound, String textExpected) 450 throws SAXException 451 { 452 if (textFound != null) { 453 message = message + " (found \"" + textFound + "\")"; 454 } 455 if (textExpected != null) { 456 message = message + " (expected \"" + textExpected + "\")"; 457 } 458 String uri = null; 459 460 if (externalEntity != null) { 461 uri = externalEntity.getURL ().toString (); 462 } 463 handler.error (message, uri, line, column); 464 465 throw new SAXException (message); 467 } 468 469 470 475 private void error (String message, char textFound, String textExpected) 476 throws SAXException 477 { 478 error (message, new Character (textFound).toString (), textExpected); 479 } 480 481 482 private void error (String message) 483 throws SAXException 484 { 485 error (message, null, null); 486 } 487 488 489 493 494 504 private void parseDocument () 505 throws Exception 506 { 507 try { parseProlog (); 509 require ('<', "document prolog"); 510 parseElement (); 511 } catch (EOFException ee) { error("premature end of file", "[EOF]", null); 513 } 514 515 try { 516 parseMisc (); char c = readCh (); error ("unexpected characters after document end", c, null); 519 } catch (EOFException e) { 520 return; 521 } 522 } 523 524 525 532 private void parseComment () 533 throws Exception 534 { 535 char c; 536 boolean saved = expandPE; 537 538 expandPE = false; 539 parseUntil ("--"); 540 require ('>', "-- in comment"); 541 expandPE = saved; 542 handler.comment (dataBuffer, 0, dataBufferPos); 543 dataBufferPos = 0; 544 } 545 546 547 557 private void parsePI () 558 throws SAXException, IOException 559 { 560 String name; 561 boolean saved = expandPE; 562 563 expandPE = false; 564 name = readNmtoken (true); 565 if ("xml".equalsIgnoreCase (name)) 566 error ("Illegal processing instruction target", name, null); 567 if (!tryRead ("?>")) { 568 requireWhitespace (); 569 parseUntil ("?>"); 570 } 571 expandPE = saved; 572 handler.processingInstruction (name, dataBufferToString ()); 573 } 574 575 576 586 private void parseCDSect () 587 throws Exception 588 { 589 parseUntil ("]]>"); 590 dataBufferFlush (); 591 } 592 593 594 608 private void parseProlog () 609 throws Exception 610 { 611 parseMisc (); 612 613 if (tryRead ("<!DOCTYPE")) { 614 parseDoctypedecl (); 615 parseMisc (); 616 } 617 } 618 619 620 638 private String parseXMLDecl (boolean ignoreEncoding) 639 throws SAXException, IOException 640 { 641 String version; 642 String encodingName = null; 643 String standalone = null; 644 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; 645 646 require ("version", "XML declaration"); 648 parseEq (); 649 version = readLiteral (flags); 650 if (!version.equals ("1.0")) { 651 error ("unsupported XML version", version, "1.0"); 652 } 653 654 boolean white = tryWhitespace (); 656 if (tryRead ("encoding")) { 657 if (!white) 658 error ("whitespace required before 'encoding='"); 659 parseEq (); 660 encodingName = readLiteral (flags); 661 if (!ignoreEncoding) 662 setupDecoding (encodingName); 663 } 664 665 if (encodingName != null) 667 white = tryWhitespace (); 668 if (tryRead ("standalone")) { 669 if (!white) 670 error ("whitespace required before 'standalone='"); 671 parseEq (); 672 standalone = readLiteral (flags); 673 if (! ("yes".equals (standalone) || "no".equals (standalone))) 674 error ("standalone flag must be 'yes' or 'no'"); 675 } 676 677 skipWhitespace (); 678 require ("?>", "XML declaration"); 679 680 return encodingName; 681 } 682 683 684 697 private String parseTextDecl (boolean ignoreEncoding) 698 throws SAXException, IOException 699 { 700 String encodingName = null; 701 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; 702 703 if (tryRead ("version")) { 705 String version; 706 parseEq (); 707 version = readLiteral (flags); 708 if (!version.equals ("1.0")) { 709 error ("unsupported XML version", version, "1.0"); 710 } 711 requireWhitespace (); 712 } 713 714 715 require ("encoding", "XML text declaration"); 717 parseEq (); 718 encodingName = readLiteral (flags); 719 if (!ignoreEncoding) 720 setupDecoding (encodingName); 721 722 skipWhitespace (); 723 require ("?>", "XML text declaration"); 724 725 return encodingName; 726 } 727 728 729 745 private void setupDecoding (String encodingName) 746 throws SAXException, IOException 747 { 748 encodingName = encodingName.toUpperCase (); 749 750 754 757 if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) { 759 if (encodingName.equals ("ISO-8859-1") 760 || encodingName.equals ("8859_1") 761 || encodingName.equals ("ISO8859_1") 762 ) { 763 encoding = ENCODING_ISO_8859_1; 764 return; 765 } else if (encodingName.equals ("US-ASCII") 766 || encodingName.equals ("ASCII")) { 767 encoding = ENCODING_ASCII; 768 return; 769 } else if (encodingName.equals ("UTF-8") 770 || encodingName.equals ("UTF8")) { 771 encoding = ENCODING_UTF_8; 772 return; 773 } else if (encoding != ENCODING_EXTERNAL) { 774 throw new EncodingException (encodingName); 776 } 777 } 780 781 if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) { 783 if (!(encodingName.equals ("ISO-10646-UCS-2") 784 || encodingName.equals ("UTF-16") 785 || encodingName.equals ("UTF-16BE") 786 || encodingName.equals ("UTF-16LE"))) 787 error ("unsupported Unicode encoding", 788 encodingName, 789 "UTF-16"); 790 return; 791 } 792 793 if (encoding == ENCODING_UCS_4_1234 795 || encoding == ENCODING_UCS_4_4321 796 || encoding == ENCODING_UCS_4_2143 797 || encoding == ENCODING_UCS_4_3412) { 798 if (!encodingName.equals ("ISO-10646-UCS-4")) 799 error ("unsupported 32-bit encoding", 800 encodingName, 801 "ISO-10646-UCS-4"); 802 return; 803 } 804 805 809 if (encodingName.equals ("UTF-16BE")) { 810 encoding = ENCODING_UCS_2_12; 811 return; 812 } 813 if (encodingName.equals ("UTF-16LE")) { 814 encoding = ENCODING_UCS_2_21; 815 return; 816 } 817 818 822 if (encodingName.equals ("UTF-16") 823 || encodingName.equals ("ISO-10646-UCS-2")) 824 encodingName = "Unicode"; 825 827 reader = new InputStreamReader (is, encodingName); 828 sourceType = INPUT_READER; 829 } 830 831 832 839 private void parseMisc () 840 throws Exception 841 { 842 while (true) { 843 skipWhitespace (); 844 if (tryRead ("<?")) { 845 parsePI (); 846 } else if (tryRead ("<!--")) { 847 parseComment (); 848 } else { 849 return; 850 } 851 } 852 } 853 854 855 863 private void parseDoctypedecl () 864 throws Exception 865 { 866 String doctypeName, ids[]; 867 868 requireWhitespace (); 870 doctypeName = readNmtoken (true); 871 872 skipWhitespace (); 874 ids = readExternalIds (false); 875 876 handler.doctypeDecl (doctypeName, ids [0], ids [1]); 878 879 skipWhitespace (); 881 if (tryRead ('[')) { 882 883 while (true) { 885 expandPE = true; 886 skipWhitespace (); 887 expandPE = false; 888 if (tryRead (']')) { 889 break; } else { 891 peIsError = expandPE = true; 893 parseMarkupdecl (); 894 peIsError = expandPE = false; 895 } 896 } 897 } 898 899 if (ids [1] != null) { 901 pushURL ("[external subset]", ids [0], ids [1], null, null, null, false); 902 903 while (true) { 905 expandPE = true; 906 skipWhitespace (); 907 expandPE = false; 908 if (tryRead ('>')) { 909 break; 910 } else { 911 expandPE = true; 912 parseMarkupdecl (); 913 expandPE = false; 914 } 915 } 916 } else { 917 skipWhitespace (); 919 require ('>', "internal DTD subset"); 920 } 921 922 handler.endDoctype (); 924 expandPE = false; 925 } 926 927 928 939 private void parseMarkupdecl () 940 throws Exception 941 { 942 if (tryRead ("<!ELEMENT")) { 943 parseElementdecl (); 944 } else if (tryRead ("<!ATTLIST")) { 945 parseAttlistDecl (); 946 } else if (tryRead ("<!ENTITY")) { 947 parseEntityDecl (); 948 } else if (tryRead ("<!NOTATION")) { 949 parseNotationDecl (); 950 } else if (tryRead ("<?")) { 951 parsePI (); 952 } else if (tryRead ("<!--")) { 953 parseComment (); 954 } else if (tryRead ("<![")) { 955 if (inputStack.size () > 0) 956 parseConditionalSect (); 957 else 958 error ("conditional sections illegal in internal subset"); 959 } else { 960 error ("expected markup declaration"); 961 } 962 } 963 964 965 976 private void parseElement () 977 throws Exception 978 { 979 String gi; 980 char c; 981 int oldElementContent = currentElementContent; 982 String oldElement = currentElement; 983 Object element []; 984 985 tagAttributePos = 0; 988 989 gi = readNmtoken (true); 991 992 currentElement = gi; 994 element = (Object []) elementInfo.get (gi); 995 currentElementContent = getContentType (element, CONTENT_ANY); 996 997 boolean white = tryWhitespace (); 1000 c = readCh (); 1001 while (c != '/' && c != '>') { 1002 unread (c); 1003 if (!white) 1004 error ("need whitespace between attributes"); 1005 parseAttribute (gi); 1006 white = tryWhitespace (); 1007 c = readCh (); 1008 } 1009 1010 Enumeration atts = declaredAttributes (element); 1012 if (atts != null) { 1013 String aname; 1014loop: 1015 while (atts.hasMoreElements ()) { 1016 aname = (String) atts.nextElement (); 1017 for (int i = 0; i < tagAttributePos; i++) { 1019 if (tagAttributes [i] == aname) { 1020 continue loop; 1021 } 1022 } 1023 String defaultVal = getAttributeExpandedValue (gi, aname); 1025 if (defaultVal!=null) { 1026 handler.attribute (aname, defaultVal, false); 1027 } 1028 } 1029 } 1030 1031 switch (c) { 1035 case '>': 1036 handler.startElement (gi); 1037 parseContent (); 1038 break; 1039 case '/': 1040 require ('>', "empty element tag"); 1041 handler.startElement (gi); 1042 handler.endElement (gi); 1043 break;
|