KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > icl > saxon > aelfred > XmlParser


// AElfred XML Parser. This version of the AElfred parser is
// derived from the original Microstar distribution, with additional
// bug fixes by Michael Kay, and selected enhancements and further
// bug fixes from the version produced by David Brownell.
//

/*
 * $Id: XmlParser.java,v 1.8 2001/06/06 17:57:44 dbrownell Exp $
 * Copyright (C) 1999-2001 David Brownell
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */


//
// Copyright (c) 1997, 1998 by Microstar Software Ltd.
// From Microstar's README (the entire original license):
//
// AElfred is free for both commercial and non-commercial use and
// redistribution, provided that Microstar's copyright and disclaimer are
// retained intact. You are free to modify AElfred for your own use and
// to redistribute AElfred with your modifications, provided that the
// modifications are clearly documented.
//
// This program is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// merchantability or fitness for a particular purpose. Please use it AT
// YOUR OWN RISK.
//

42
43 package com.icl.saxon.aelfred;
44
45 import java.io.BufferedInputStream;
46 import java.io.CharConversionException;
47 import java.io.EOFException;
48 import java.io.InputStream;
49 import java.io.InputStreamReader;
50 import java.io.IOException;
51 import java.io.Reader;
52 import java.net.URL;
53 import java.net.URLConnection;
54 import java.util.Enumeration;
55 import java.util.Hashtable;
56 import java.util.Stack;
57
58 import org.xml.sax.SAXException;
59
60
61 // $Id: XmlParser.java,v 1.19 2000/02/26 04:30:20 mojo Exp $
62

/**
 * Parse XML documents and return parse events through call-backs.
 * Use the <code>SAXDriver</code> class as your entry point, as the
 * internal parser interfaces are subject to change.
 *
 * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
 * (version 1.2a with bugfixes)
 * @author Updated by David Brownell &lt;david-b@pacbell.net&gt;
 * @version $Date: 2001/06/06 17:57:44 $
 * @see SAXDriver
 */

74 final class XmlParser
75 {
76     // parse from buffer, avoiding slow per-character readCh()
77
private final static boolean USE_CHEATS = true;
78
79     // don't waste too much space in hashtables
80
private final static int DEFAULT_ATTR_COUNT = 23;
81
82
//////////////////////////////////////////////////////////////////////
// Constructors.
////////////////////////////////////////////////////////////////////////
86

87
/**
 * Construct a new parser with no associated handler.
 * The only work done here is a call to <code>cleanupVariables()</code>.
 * @see #setHandler
 * @see #doParse
 */
// package private
XmlParser ()
{
    cleanupVariables ();
}
98
99
100     /**
101      * Set the handler that will receive parsing events.
102      * @param handler The handler to receive callback events.
103      * @see #parse
104      */

105     // package private
106
void setHandler (SAXDriver handler)
107     {
108         this.handler = handler;
109     }
110
111
112     /**
113      * Parse an XML document from the character stream, byte stream, or URI
114      * that you provide (in that order of preference). Any URI that you
115      * supply will become the base URI for resolving relative URI, and may
116      * be used to acquire a reader or byte stream.
117      *
118      * <p>You may parse more than one document, but that must be done
119      * sequentially. Only one thread at a time may use this parser.
120      *
121      * @param systemId The URI of the document; should never be null,
122      * but may be so iff a reader <em>or</em> a stream is provided.
123      * @param publicId The public identifier of the document, or null.
124      * @param reader A character stream; must be null if stream isn't.
125      * @param stream A byte input stream; must be null if reader isn't.
126      * @param encoding The suggested encoding, or null if unknown.
127      * @exception java.lang.Exception Basically SAXException or IOException
128      */

129     // package private
130
void doParse (
131     String systemId,
132     String publicId,
133     Reader reader,
134     InputStream stream,
135     String encoding
136     ) throws Exception
137     {
138     if (handler == null)
139         throw new IllegalStateException ("no callback handler");
140
141     basePublicId = publicId;
142     baseURI = systemId;
143     baseReader = reader;
144     baseInputStream = stream;
145
146     initializeVariables ();
147
148     // predeclare the built-in entities here (replacement texts)
149
// we don't need to intern(), since we're guaranteed literals
150
// are always (globally) interned.
151
setInternalEntity ("amp", "&#38;");
152     setInternalEntity ("lt", "&#60;");
153     setInternalEntity ("gt", "&#62;");
154     setInternalEntity ("apos", "&#39;");
155     setInternalEntity ("quot", "&#34;");
156
157     handler.startDocument ();
158
159     pushURL ("[document]", basePublicId, baseURI,
160         baseReader, baseInputStream, encoding, false);
161
162     try {
163         parseDocument ();
164         handler.endDocument ();
165     } finally {
166         if (baseReader != null)
167         try { baseReader.close ();
168         } catch (IOException e) { /* ignore */ }
169         if (baseInputStream != null)
170         try { baseInputStream.close ();
171         } catch (IOException e) { /* ignore */ }
172         if (is != null)
173         try { is.close ();
174         } catch (IOException e) { /* ignore */ }
175         if (reader != null)
176         try {
177             reader.close ();
178         } catch (IOException e) { /* ignore */
179         }
180         cleanupVariables ();
181     }
182     }
183
184
////////////////////////////////////////////////////////////////////////
// Constants.
////////////////////////////////////////////////////////////////////////
188

189     //
190
// Constants for element content type.
191
//
192

193     /**
194      * Constant: an element has not been declared.
195      * @see #getElementContentType
196      */

197     public final static int CONTENT_UNDECLARED = 0;
198
199     /**
200      * Constant: the element has a content model of ANY.
201      * @see #getElementContentType
202      */

203     public final static int CONTENT_ANY = 1;
204
205     /**
206      * Constant: the element has declared content of EMPTY.
207      * @see #getElementContentType
208      */

209     public final static int CONTENT_EMPTY = 2;
210
211     /**
212      * Constant: the element has mixed content.
213      * @see #getElementContentType
214      */

215     public final static int CONTENT_MIXED = 3;
216
217     /**
218      * Constant: the element has element content.
219      * @see #getElementContentType
220      */

221     public final static int CONTENT_ELEMENTS = 4;
222
223
224     //
225
// Constants for the entity type.
226
//
227

228     /**
229      * Constant: the entity has not been declared.
230      * @see #getEntityType
231      */

232     public final static int ENTITY_UNDECLARED = 0;
233
234     /**
235      * Constant: the entity is internal.
236      * @see #getEntityType
237      */

238     public final static int ENTITY_INTERNAL = 1;
239
240     /**
241      * Constant: the entity is external, non-parseable data.
242      * @see #getEntityType
243      */

244     public final static int ENTITY_NDATA = 2;
245
246     /**
247      * Constant: the entity is external XML data.
248      * @see #getEntityType
249      */

250     public final static int ENTITY_TEXT = 3;
251
252
253     //
254
// Constants for attribute type.
255
//
256

257     /**
258      * Constant: the attribute has not been declared for this element type.
259      * @see #getAttributeType
260      */

261     public final static int ATTRIBUTE_UNDECLARED = 0;
262
263     /**
264      * Constant: the attribute value is a string value.
265      * @see #getAttributeType
266      */

267     public final static int ATTRIBUTE_CDATA = 1;
268
269     /**
270      * Constant: the attribute value is a unique identifier.
271      * @see #getAttributeType
272      */

273     public final static int ATTRIBUTE_ID = 2;
274
275     /**
276      * Constant: the attribute value is a reference to a unique identifier.
277      * @see #getAttributeType
278      */

279     public final static int ATTRIBUTE_IDREF = 3;
280
281     /**
282      * Constant: the attribute value is a list of ID references.
283      * @see #getAttributeType
284      */

285     public final static int ATTRIBUTE_IDREFS = 4;
286
287     /**
288      * Constant: the attribute value is the name of an entity.
289      * @see #getAttributeType
290      */

291     public final static int ATTRIBUTE_ENTITY = 5;
292
293     /**
294      * Constant: the attribute value is a list of entity names.
295      * @see #getAttributeType
296      */

297     public final static int ATTRIBUTE_ENTITIES = 6;
298
299     /**
300      * Constant: the attribute value is a name token.
301      * @see #getAttributeType
302      */

303     public final static int ATTRIBUTE_NMTOKEN = 7;
304
305     /**
306      * Constant: the attribute value is a list of name tokens.
307      * @see #getAttributeType
308      */

309     public final static int ATTRIBUTE_NMTOKENS = 8;
310
311     /**
312      * Constant: the attribute value is a token from an enumeration.
313      * @see #getAttributeType
314      */

315     public final static int ATTRIBUTE_ENUMERATED = 9;
316
317     /**
318      * Constant: the attribute is the name of a notation.
319      * @see #getAttributeType
320      */

321     public final static int ATTRIBUTE_NOTATION = 10;
322
323
324     //
325
// When the class is loaded, populate the hash table of
326
// attribute types.
327
//
328

329     /**
330      * Hash table of attribute types.
331      */

332     private static Hashtable attributeTypeHash;
333     static {
334     attributeTypeHash = new Hashtable (13);
335     attributeTypeHash.put ("CDATA", new Integer (ATTRIBUTE_CDATA));
336     attributeTypeHash.put ("ID", new Integer (ATTRIBUTE_ID));
337     attributeTypeHash.put ("IDREF", new Integer (ATTRIBUTE_IDREF));
338     attributeTypeHash.put ("IDREFS", new Integer (ATTRIBUTE_IDREFS));
339     attributeTypeHash.put ("ENTITY", new Integer (ATTRIBUTE_ENTITY));
340     attributeTypeHash.put ("ENTITIES", new Integer (ATTRIBUTE_ENTITIES));
341     attributeTypeHash.put ("NMTOKEN", new Integer (ATTRIBUTE_NMTOKEN));
342     attributeTypeHash.put ("NMTOKENS", new Integer (ATTRIBUTE_NMTOKENS));
343     attributeTypeHash.put ("NOTATION", new Integer (ATTRIBUTE_NOTATION));
344     }
345
346
347     //
348
// Constants for supported encodings. "external" is just a flag.
349
//
350
private final static int ENCODING_EXTERNAL = 0;
351     private final static int ENCODING_UTF_8 = 1;
352     private final static int ENCODING_ISO_8859_1 = 2;
353     private final static int ENCODING_UCS_2_12 = 3;
354     private final static int ENCODING_UCS_2_21 = 4;
355     private final static int ENCODING_UCS_4_1234 = 5;
356     private final static int ENCODING_UCS_4_4321 = 6;
357     private final static int ENCODING_UCS_4_2143 = 7;
358     private final static int ENCODING_UCS_4_3412 = 8;
359     private final static int ENCODING_ASCII = 9;
360
361
362     //
363
// Constants for attribute default value.
364
//
365

366     /**
367      * Constant: the attribute is not declared.
368      * @see #getAttributeDefaultValueType
369      */

370     public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;
371
372     /**
373      * Constant: the attribute has a literal default value specified.
374      * @see #getAttributeDefaultValueType
375      * @see #getAttributeDefaultValue
376      */

377     public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;
378
379     /**
380      * Constant: the attribute was declared #IMPLIED.
381      * @see #getAttributeDefaultValueType
382      */

383     public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;
384
385     /**
386      * Constant: the attribute was declared #REQUIRED.
387      * @see #getAttributeDefaultValueType
388      */

389     public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;
390
391     /**
392      * Constant: the attribute was declared #FIXED.
393      * @see #getAttributeDefaultValueType
394      * @see #getAttributeDefaultValue
395      */

396     public final static int ATTRIBUTE_DEFAULT_FIXED = 34;
397
398
399     //
400
// Constants for input.
401
//
402
private final static int INPUT_NONE = 0;
403     private final static int INPUT_INTERNAL = 1;
404     private final static int INPUT_STREAM = 3;
405     private final static int INPUT_BUFFER = 4;
406     private final static int INPUT_READER = 5;
407
408
409     //
410
// Flags for reading literals.
411
//
412
// expand general entity refs (attribute values in dtd and content)
413
private final static int LIT_ENTITY_REF = 2;
414     // normalize this value (space chars) (attributes, public ids)
415
private final static int LIT_NORMALIZE = 4;
416     // literal is an attribute value
417
private final static int LIT_ATTRIBUTE = 8;
418     // don't expand parameter entities
419
private final static int LIT_DISABLE_PE = 16;
420     // don't expand [or parse] character refs
421
private final static int LIT_DISABLE_CREF = 32;
422     // don't parse general entity refs
423
private final static int LIT_DISABLE_EREF = 64;
424     // don't expand general entities, but make sure we _could_
425
private final static int LIT_ENTITY_CHECK = 128;
426     // literal is a public ID value
427
private final static int LIT_PUBID = 256;
428
429     //
430
// Flags affecting PE handling in DTDs (if expandPE is true).
431
// PEs expand with space padding, except inside literals.
432
//
433
private final static int CONTEXT_NORMAL = 0;
434     private final static int CONTEXT_LITERAL = 1;
435
436
//////////////////////////////////////////////////////////////////////
// Error reporting.
//////////////////////////////////////////////////////////////////////
440

441
442     /**
443      * Report an error.
444      * @param message The error message.
445      * @param textFound The text that caused the error (or null).
446      * @see SAXDriver#error
447      * @see #line
448      */

449     private void error (String message, String textFound, String textExpected)
450     throws SAXException
451     {
452     if (textFound != null) {
453         message = message + " (found \"" + textFound + "\")";
454     }
455     if (textExpected != null) {
456         message = message + " (expected \"" + textExpected + "\")";
457     }
458     String uri = null;
459
460     if (externalEntity != null) {
461         uri = externalEntity.getURL ().toString ();
462     }
463     handler.error (message, uri, line, column);
464
465     // "can't happen"
466
throw new SAXException (message);
467     }
468
469
470     /**
471      * Report a serious error.
472      * @param message The error message.
473      * @param textFound The text that caused the error (or null).
474      */

475     private void error (String message, char textFound, String textExpected)
476     throws SAXException
477     {
478     error (message, new Character (textFound).toString (), textExpected);
479     }
480
481     /** Report typical case fatal errors. */
482     private void error (String message)
483     throws SAXException
484     {
485     error (message, null, null);
486     }
487
488
//////////////////////////////////////////////////////////////////////
// Major syntactic productions.
//////////////////////////////////////////////////////////////////////
492

493
494     /**
495      * Parse an XML document.
496      * <pre>
497      * [1] document ::= prolog element Misc*
498      * </pre>
499      * <p>This is the top-level parsing function for a single XML
500      * document. As a minimum, a well-formed document must have
501      * a document element, and a valid document must have a prolog
502      * (one with doctype) as well.
503      */

504     private void parseDocument ()
505     throws Exception
506     {
507         try { // added by MHK
508
parseProlog ();
509             require ('<', "document prolog");
510             parseElement ();
511         } catch (EOFException ee) { // added by MHK
512
error("premature end of file", "[EOF]", null);
513         }
514         
515         try {
516             parseMisc (); //skip all white, PIs, and comments
517
char c = readCh (); //if this doesn't throw an exception...
518
error ("unexpected characters after document end", c, null);
519         } catch (EOFException e) {
520             return;
521         }
522     }
523
524
525     /**
526      * Skip a comment.
527      * <pre>
528      * [15] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
529      * </pre>
530      * <p> (The <code>&lt;!--</code> has already been read.)
531      */

532     private void parseComment ()
533     throws Exception
534     {
535     char c;
536     boolean saved = expandPE;
537
538     expandPE = false;
539     parseUntil ("--");
540     require ('>', "-- in comment");
541     expandPE = saved;
542     handler.comment (dataBuffer, 0, dataBufferPos);
543     dataBufferPos = 0;
544     }
545
546
547     /**
548      * Parse a processing instruction and do a call-back.
549      * <pre>
550      * [16] PI ::= '&lt;?' PITarget
551      * (S (Char* - (Char* '?&gt;' Char*)))?
552      * '?&gt;'
553      * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') )
554      * </pre>
555      * <p> (The <code>&lt;?</code> has already been read.)
556      */

557     private void parsePI ()
558     throws SAXException, IOException
559     {
560     String name;
561     boolean saved = expandPE;
562
563     expandPE = false;
564     name = readNmtoken (true);
565     if ("xml".equalsIgnoreCase (name))
566         error ("Illegal processing instruction target", name, null);
567     if (!tryRead ("?>")) {
568         requireWhitespace ();
569         parseUntil ("?>");
570     }
571     expandPE = saved;
572     handler.processingInstruction (name, dataBufferToString ());
573     }
574
575
576     /**
577      * Parse a CDATA section.
578      * <pre>
579      * [18] CDSect ::= CDStart CData CDEnd
580      * [19] CDStart ::= '&lt;![CDATA['
581      * [20] CData ::= (Char* - (Char* ']]&gt;' Char*))
582      * [21] CDEnd ::= ']]&gt;'
583      * </pre>
584      * <p> (The '&lt;![CDATA[' has already been read.)
585      */

586     private void parseCDSect ()
587     throws Exception
588     {
589     parseUntil ("]]>");
590     dataBufferFlush ();
591     }
592
593
594     /**
595      * Parse the prolog of an XML document.
596      * <pre>
597      * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
598      * </pre>
599      * <p>There are a couple of tricks here. First, it is necessary to
600      * declare the XML default attributes after the DTD (if present)
601      * has been read. [??] Second, it is not possible to expand general
602      * references in attribute value literals until after the entire
603      * DTD (if present) has been parsed.
604      * <p>We do not look for the XML declaration here, because it was
605      * handled by pushURL ().
606      * @see pushURL
607      */

608     private void parseProlog ()
609     throws Exception
610     {
611     parseMisc ();
612
613     if (tryRead ("<!DOCTYPE")) {
614         parseDoctypedecl ();
615         parseMisc ();
616     }
617     }
618
619
620     /**
621      * Parse the XML declaration.
622      * <pre>
623      * [23] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
624      * [24] VersionInfo ::= S 'version' Eq
625      * ("'" VersionNum "'" | '"' VersionNum '"' )
626      * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*
627      * [32] SDDecl ::= S 'standalone' Eq
628      * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' )
629      * [80] EncodingDecl ::= S 'encoding' Eq
630      * ( "'" EncName "'" | "'" EncName "'" )
631      * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
632      * </pre>
633      * <p> (The <code>&lt;?xml</code> and whitespace have already been read.)
634      * @return the encoding in the declaration, uppercased; or null
635      * @see #parseTextDecl
636      * @see #setupDecoding
637      */

638     private String parseXMLDecl (boolean ignoreEncoding)
639     throws SAXException, IOException
640     {
641     String version;
642     String encodingName = null;
643     String standalone = null;
644     int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
645
646     // Read the version.
647
require ("version", "XML declaration");
648     parseEq ();
649     version = readLiteral (flags);
650     if (!version.equals ("1.0")) {
651         error ("unsupported XML version", version, "1.0");
652     }
653
654     // Try reading an encoding declaration.
655
boolean white = tryWhitespace ();
656     if (tryRead ("encoding")) {
657         if (!white)
658         error ("whitespace required before 'encoding='");
659         parseEq ();
660         encodingName = readLiteral (flags);
661         if (!ignoreEncoding)
662         setupDecoding (encodingName);
663     }
664
665     // Try reading a standalone declaration
666
if (encodingName != null)
667         white = tryWhitespace ();
668     if (tryRead ("standalone")) {
669         if (!white)
670         error ("whitespace required before 'standalone='");
671         parseEq ();
672         standalone = readLiteral (flags);
673         if (! ("yes".equals (standalone) || "no".equals (standalone)))
674         error ("standalone flag must be 'yes' or 'no'");
675     }
676
677     skipWhitespace ();
678     require ("?>", "XML declaration");
679
680     return encodingName;
681     }
682
683
684     /**
685      * Parse a text declaration.
686      * <pre>
687      * [79] TextDecl ::= '&lt;?xml' VersionInfo? EncodingDecl S? '?&gt;'
688      * [80] EncodingDecl ::= S 'encoding' Eq
689      * ( '"' EncName '"' | "'" EncName "'" )
690      * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
691      * </pre>
692      * <p> (The <code>&lt;?xml</code>' and whitespace have already been read.)
693      * @return the encoding in the declaration, uppercased; or null
694      * @see #parseXMLDecl
695      * @see #setupDecoding
696      */

697     private String parseTextDecl (boolean ignoreEncoding)
698     throws SAXException, IOException
699     {
700     String encodingName = null;
701     int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
702
703     // Read an optional version.
704
if (tryRead ("version")) {
705         String version;
706         parseEq ();
707         version = readLiteral (flags);
708         if (!version.equals ("1.0")) {
709         error ("unsupported XML version", version, "1.0");
710         }
711         requireWhitespace ();
712     }
713
714
715     // Read the encoding.
716
require ("encoding", "XML text declaration");
717     parseEq ();
718     encodingName = readLiteral (flags);
719     if (!ignoreEncoding)
720         setupDecoding (encodingName);
721
722     skipWhitespace ();
723     require ("?>", "XML text declaration");
724
725     return encodingName;
726     }
727
728
729     /**
730      * Sets up internal state so that we can decode an entity using the
731      * specified encoding. This is used when we start to read an entity
732      * and we have been given knowledge of its encoding before we start to
733      * read any data (e.g. from a SAX input source or from a MIME type).
734      *
735      * <p> It is also used after autodetection, at which point only very
736      * limited adjustments to the encoding may be used (switching between
737      * related builtin decoders).
738      *
739      * @param encodingName The name of the encoding specified by the user.
740      * @exception IOException if the encoding isn't supported either
741      * internally to this parser, or by the hosting JVM.
742      * @see #parseXMLDecl
743      * @see #parseTextDecl
744      */

745     private void setupDecoding (String encodingName)
746     throws SAXException, IOException
747     {
748     encodingName = encodingName.toUpperCase ();
749
750     // ENCODING_EXTERNAL indicates an encoding that wasn't
751
// autodetected ... we can use builtin decoders, or
752
// ones from the JVM (InputStreamReader).
753

754     // Otherwise we can only tweak what was autodetected, and
755
// only for single byte (ASCII derived) builtin encodings.
756

757     // ASCII-derived encodings
758
if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) {
759         if (encodingName.equals ("ISO-8859-1")
760                 || encodingName.equals ("8859_1")
761                 || encodingName.equals ("ISO8859_1")
762           ) {
763             encoding = ENCODING_ISO_8859_1;
764             return;
765         } else if (encodingName.equals ("US-ASCII")
766                 || encodingName.equals ("ASCII")) {
767             encoding = ENCODING_ASCII;
768             return;
769         } else if (encodingName.equals ("UTF-8")
770                 || encodingName.equals ("UTF8")) {
771             encoding = ENCODING_UTF_8;
772             return;
773         } else if (encoding != ENCODING_EXTERNAL) {
774             // used to start with a new reader ...
775
throw new EncodingException (encodingName);
776         }
777         // else fallthrough ...
778
// it's ASCII-ish and something other than a builtin
779
}
780
781     // Unicode and such
782
if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) {
783         if (!(encodingName.equals ("ISO-10646-UCS-2")
784             || encodingName.equals ("UTF-16")
785             || encodingName.equals ("UTF-16BE")
786             || encodingName.equals ("UTF-16LE")))
787         error ("unsupported Unicode encoding",
788                encodingName,
789                "UTF-16");
790         return;
791     }
792
793     // four byte encodings
794
if (encoding == ENCODING_UCS_4_1234
795         || encoding == ENCODING_UCS_4_4321
796         || encoding == ENCODING_UCS_4_2143
797         || encoding == ENCODING_UCS_4_3412) {
798         if (!encodingName.equals ("ISO-10646-UCS-4"))
799         error ("unsupported 32-bit encoding",
800                encodingName,
801                "ISO-10646-UCS-4");
802         return;
803     }
804
805     // assert encoding == ENCODING_EXTERNAL
806
// if (encoding != ENCODING_EXTERNAL)
807
// throw new RuntimeException ("encoding = " + encoding);
808

809     if (encodingName.equals ("UTF-16BE")) {
810         encoding = ENCODING_UCS_2_12;
811         return;
812     }
813     if (encodingName.equals ("UTF-16LE")) {
814         encoding = ENCODING_UCS_2_21;
815         return;
816     }
817
818     // We couldn't use the builtin decoders at all. But we can try to
819
// create a reader, since we haven't messed up buffering. Tweak
820
// the encoding name if necessary.
821

822     if (encodingName.equals ("UTF-16")
823         || encodingName.equals ("ISO-10646-UCS-2"))
824         encodingName = "Unicode";
825     // Ignoring all the EBCDIC aliases here
826

827     reader = new InputStreamReader (is, encodingName);
828     sourceType = INPUT_READER;
829     }
830
831
832     /**
833      * Parse miscellaneous markup outside the document element and DOCTYPE
834      * declaration.
835      * <pre>
836      * [27] Misc ::= Comment | PI | S
837      * </pre>
838      */

839     private void parseMisc ()
840     throws Exception
841     {
842     while (true) {
843         skipWhitespace ();
844         if (tryRead ("<?")) {
845         parsePI ();
846         } else if (tryRead ("<!--")) {
847         parseComment ();
848         } else {
849         return;
850         }
851     }
852     }
853
854
855     /**
856      * Parse a document type declaration.
857      * <pre>
858      * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
859      * ('[' (markupdecl | PEReference | S)* ']' S?)? '&gt;'
860      * </pre>
861      * <p> (The <code>&lt;!DOCTYPE</code> has already been read.)
862      */

863     private void parseDoctypedecl ()
864     throws Exception
865     {
866     String doctypeName, ids[];
867
868     // Read the document type name.
869
requireWhitespace ();
870     doctypeName = readNmtoken (true);
871
872     // Read the External subset's IDs
873
skipWhitespace ();
874     ids = readExternalIds (false);
875
876     // report (a) declaration of name, (b) lexical info (ids)
877
handler.doctypeDecl (doctypeName, ids [0], ids [1]);
878
879     // Internal subset is parsed first, if present
880
skipWhitespace ();
881     if (tryRead ('[')) {
882
883         // loop until the subset ends
884
while (true) {
885         expandPE = true;
886         skipWhitespace ();
887         expandPE = false;
888         if (tryRead (']')) {
889             break; // end of subset
890
} else {
891             // WFC, PEs in internal subset (only between decls)
892
peIsError = expandPE = true;
893             parseMarkupdecl ();
894             peIsError = expandPE = false;
895         }
896         }
897     }
898
899     // Read the external subset, if any
900
if (ids [1] != null) {
901         pushURL ("[external subset]", ids [0], ids [1], null, null, null, false);
902
903         // Loop until we end up back at '>'
904
while (true) {
905         expandPE = true;
906         skipWhitespace ();
907         expandPE = false;
908         if (tryRead ('>')) {
909             break;
910         } else {
911             expandPE = true;
912             parseMarkupdecl ();
913             expandPE = false;
914         }
915         }
916     } else {
917         // No external subset.
918
skipWhitespace ();
919         require ('>', "internal DTD subset");
920     }
921
922     // done dtd
923
handler.endDoctype ();
924     expandPE = false;
925     }
926
927
928     /**
929      * Parse a markup declaration in the internal or external DTD subset.
930      * <pre>
931      * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
932      * | NotationDecl | PI | Comment
933      * [30] extSubsetDecl ::= (markupdecl | conditionalSect
934      * | PEReference | S) *
935      * </pre>
936      * <p> Reading toplevel PE references is handled as a lexical issue
937      * by the caller, as is whitespace.
938      */

939     private void parseMarkupdecl ()
940     throws Exception
941     {
942     if (tryRead ("<!ELEMENT")) {
943         parseElementdecl ();
944     } else if (tryRead ("<!ATTLIST")) {
945         parseAttlistDecl ();
946     } else if (tryRead ("<!ENTITY")) {
947         parseEntityDecl ();
948     } else if (tryRead ("<!NOTATION")) {
949         parseNotationDecl ();
950     } else if (tryRead ("<?")) {
951         parsePI ();
952     } else if (tryRead ("<!--")) {
953         parseComment ();
954     } else if (tryRead ("<![")) {
955         if (inputStack.size () > 0)
956         parseConditionalSect ();
957         else
958         error ("conditional sections illegal in internal subset");
959     } else {
960         error ("expected markup declaration");
961     }
962     }
963
964
965     /**
966      * Parse an element, with its tags.
967      * <pre>
968      * [39] element ::= EmptyElementTag | STag content ETag
969      * [40] STag ::= '&lt;' Name (S Attribute)* S? '&gt;'
970      * [44] EmptyElementTag ::= '&lt;' Name (S Attribute)* S? '/&gt;'
971      * </pre>
972      * <p> (The '&lt;' has already been read.)
973      * <p>NOTE: this method actually chains onto parseContent (), if necessary,
974      * and parseContent () will take care of calling parseETag ().
975      */

976     private void parseElement ()
977     throws Exception
978     {
979     String gi;
980     char c;
981     int oldElementContent = currentElementContent;
982     String oldElement = currentElement;
983     Object element [];
984
985     // This is the (global) counter for the
986
// array of specified attributes.
987
tagAttributePos = 0;
988
989     // Read the element type name.
990
gi = readNmtoken (true);
991
992     // Determine the current content type.
993
currentElement = gi;
994     element = (Object []) elementInfo.get (gi);
995     currentElementContent = getContentType (element, CONTENT_ANY);
996
997     // Read the attributes, if any.
998
// After this loop, "c" is the closing delimiter.
999
boolean white = tryWhitespace ();
1000    c = readCh ();
1001    while (c != '/' && c != '>') {
1002        unread (c);
1003        if (!white)
1004        error ("need whitespace between attributes");
1005        parseAttribute (gi);
1006        white = tryWhitespace ();
1007        c = readCh ();
1008    }
1009
1010    // Supply any defaulted attributes.
1011
Enumeration atts = declaredAttributes (element);
1012    if (atts != null) {
1013        String aname;
1014loop:
1015        while (atts.hasMoreElements ()) {
1016            aname = (String) atts.nextElement ();
1017            // See if it was specified.
1018
for (int i = 0; i < tagAttributePos; i++) {
1019                if (tagAttributes [i] == aname) {
1020                continue loop;
1021                }
1022            }
1023            // I guess not...
1024
String defaultVal = getAttributeExpandedValue (gi, aname);
1025            if (defaultVal!=null) {
1026                handler.attribute (aname, defaultVal, false);
1027            }
1028        }
1029    }
1030
1031    // Figure out if this is a start tag
1032
// or an empty element, and dispatch an
1033
// event accordingly.
1034
switch (c) {
1035    case '>':
1036        handler.startElement (gi);
1037        parseContent ();
1038        break;
1039    case '/':
1040        require ('>', "empty element tag");
1041        handler.startElement (gi);
1042        handler.endElement (gi);
1043        break;