XmlParser


1   // AElfred XML Parser. This version of the AElfred parser is
2   // derived from the original Microstar distribution, with additional
3   // bug fixes by Michael Kay, and selected enhancements and further
4   // bug fixes from the version produced by David Brownell.
5   //
6   
7   /*
8    * $Id: XmlParser.java,v 1.8 2001/06/06 17:57:44 dbrownell Exp $
9    * Copyright (C) 1999-2001 David Brownell
10   * 
11   * This program is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU General Public License as published by
13   * the Free Software Foundation; either version 2 of the License, or
14   * (at your option) any later version.
15   * 
16   * This program is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU General Public License for more details.
20   * 
21   * You should have received a copy of the GNU General Public License
22   * along with this program; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  
26  //
27  // Copyright (c) 1997, 1998 by Microstar Software Ltd.
28  // From Microstar's README (the entire original license):
29  //
30  // AElfred is free for both commercial and non-commercial use and
31  // redistribution, provided that Microstar's copyright and disclaimer are
32  // retained intact.  You are free to modify AElfred for your own use and
33  // to redistribute AElfred with your modifications, provided that the
34  // modifications are clearly documented.
35  //
36  // This program is distributed in the hope that it will be useful, but
37  // WITHOUT ANY WARRANTY; without even the implied warranty of
38  // merchantability or fitness for a particular purpose.  Please use it AT
39  // YOUR OWN RISK.
40  //
41  
42  
43  package net.sf.saxon.aelfred;
44  
45  import java.io.BufferedInputStream;
46  import java.io.CharConversionException;
47  import java.io.EOFException;
48  import java.io.InputStream;
49  import java.io.InputStreamReader;
50  import java.io.IOException;
51  import java.io.Reader;
52  import java.net.URL;
53  import java.net.URLConnection;
54  import java.util.Enumeration;
55  import java.util.Hashtable;
56  import java.util.Stack;
57  
58  import org.xml.sax.SAXException;
59  
60  
61  // $Id: XmlParser.java,v 1.19 2000/02/26 04:30:20 mojo Exp $
62  
63  /**
64   * Parse XML documents and return parse events through call-backs.
65   * Use the <code>SAXDriver</code> class as your entry point, as the
66   * internal parser interfaces are subject to change.
67   *
68   * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
69   *  (version 1.2a with bugfixes)
70   * @author Updated by David Brownell &lt;david-b@pacbell.net&gt;
71   * @version $Date: 2001/06/06 17:57:44 $
72   * @see SAXDriver
73   */
74  final class XmlParser
75  {
76      // parse from buffer, avoiding slow per-character readCh()
77      private final static boolean USE_CHEATS = true;
78  
79      // don't waste too much space in hashtables 
80      private final static int DEFAULT_ATTR_COUNT = 23;
81  
82  
83      //////////////////////////////////////////////////////////////////////
84      // Constructors.
85      ////////////////////////////////////////////////////////////////////////
86  
87  
/**
 * Create a parser that has no handler attached yet.  The constructor
 * only resets the parser's working state through cleanupVariables();
 * a handler must be registered before doParse() is called, otherwise
 * doParse() throws IllegalStateException.
 * @see #setHandler
 * @see #doParse
 */
// package private
XmlParser ()
{
    this.cleanupVariables ();
}
98  
99  
100     /**
101      * Set the handler that will receive parsing events.
102      * @param handler The handler to receive callback events.
103      * @see #parse
104      */
105     // package private
106     void setHandler (SAXDriver handler)
107     {
108         this.handler = handler;
109     }
110 
111 
112     /**
113      * Parse an XML document from the character stream, byte stream, or URI
114      * that you provide (in that order of preference).  Any URI that you
115      * supply will become the base URI for resolving relative URI, and may
116      * be used to acquire a reader or byte stream.
117      *
118      * <p>You may parse more than one document, but that must be done
119      * sequentially.  Only one thread at a time may use this parser.
120      *
121      * @param systemId The URI of the document; should never be null,
122      *  but may be so iff a reader <em>or</em> a stream is provided.
123      * @param publicId The public identifier of the document, or null.
124      * @param reader A character stream; must be null if stream isn't.
125      * @param stream A byte input stream; must be null if reader isn't.
126      * @param encoding The suggested encoding, or null if unknown.
127      * @exception java.lang.Exception Basically SAXException or IOException
128      */
129     // package private 
130     void doParse (
131     String      systemId,
132     String      publicId,
133     Reader      reader,
134     InputStream stream,
135     String      encoding
136     ) throws Exception
137     {
138     if (handler == null)
139         throw new IllegalStateException ("no callback handler");
140 
141     basePublicId = publicId;
142     baseURI = systemId;
143     baseReader = reader;
144     baseInputStream = stream;
145 
146     initializeVariables ();
147 
148     // predeclare the built-in entities here (replacement texts)
149     // we don't need to intern(), since we're guaranteed literals
150     // are always (globally) interned.
151     setInternalEntity ("amp", "&#38;");
152     setInternalEntity ("lt", "&#60;");
153     setInternalEntity ("gt", "&#62;");
154     setInternalEntity ("apos", "&#39;");
155     setInternalEntity ("quot", "&#34;");
156 
157     handler.startDocument ();
158 
159     pushURL ("[document]", basePublicId, baseURI,
160         baseReader, baseInputStream, encoding, false);
161 
162     try {
163         parseDocument ();
164         handler.endDocument ();
165     } finally {
166         if (baseReader != null)
167         try { baseReader.close ();
168         } catch (IOException e) { /* ignore */ }
169         if (baseInputStream != null)
170         try { baseInputStream.close ();
171         } catch (IOException e) { /* ignore */ }
172         if (is != null)
173         try { is.close ();
174         } catch (IOException e) { /* ignore */ }
175         if (reader != null)
176         try {
177             reader.close ();
178         } catch (IOException e) { /* ignore */
179         }
180         cleanupVariables ();
181     }
182     }
183 
184 
185     ////////////////////////////////////////////////////////////////////////
186     // Constants.
187     ////////////////////////////////////////////////////////////////////////
188 
189     //
190     // Constants for element content type.
191     //
192 
193     /**
194      * Constant: an element has not been declared.
195      * @see #getElementContentType
196      */
197     public final static int CONTENT_UNDECLARED = 0;
198 
199     /**
200      * Constant: the element has a content model of ANY.
201      * @see #getElementContentType
202      */
203     public final static int CONTENT_ANY = 1;
204 
205     /**
206      * Constant: the element has declared content of EMPTY.
207      * @see #getElementContentType
208      */
209     public final static int CONTENT_EMPTY = 2;
210 
211     /**
212      * Constant: the element has mixed content.
213      * @see #getElementContentType
214      */
215     public final static int CONTENT_MIXED = 3;
216 
217     /**
218      * Constant: the element has element content.
219      * @see #getElementContentType
220      */
221     public final static int CONTENT_ELEMENTS = 4;
222 
223 
224     //
225     // Constants for the entity type.
226     //
227 
228     /**
229      * Constant: the entity has not been declared.
230      * @see #getEntityType
231      */
232     public final static int ENTITY_UNDECLARED = 0;
233 
234     /**
235      * Constant: the entity is internal.
236      * @see #getEntityType
237      */
238     public final static int ENTITY_INTERNAL = 1;
239 
240     /**
241      * Constant: the entity is external, non-parseable data.
242      * @see #getEntityType
243      */
244     public final static int ENTITY_NDATA = 2;
245 
246     /**
247      * Constant: the entity is external XML data.
248      * @see #getEntityType
249      */
250     public final static int ENTITY_TEXT = 3;
251 
252 
253     //
254     // Constants for attribute type.
255     //
256 
257     /**
258      * Constant: the attribute has not been declared for this element type.
259      * @see #getAttributeType
260      */
261     public final static int ATTRIBUTE_UNDECLARED = 0;
262 
263     /**
264      * Constant: the attribute value is a string value.
265      * @see #getAttributeType
266      */
267     public final static int ATTRIBUTE_CDATA = 1;
268 
269     /**
270      * Constant: the attribute value is a unique identifier.
271      * @see #getAttributeType
272      */
273     public final static int ATTRIBUTE_ID = 2;
274 
275     /**
276      * Constant: the attribute value is a reference to a unique identifier.
277      * @see #getAttributeType
278      */
279     public final static int ATTRIBUTE_IDREF = 3;
280 
281     /**
282      * Constant: the attribute value is a list of ID references.
283      * @see #getAttributeType
284      */
285     public final static int ATTRIBUTE_IDREFS = 4;
286 
287     /**
288      * Constant: the attribute value is the name of an entity.
289      * @see #getAttributeType
290      */
291     public final static int ATTRIBUTE_ENTITY = 5;
292 
293     /**
294      * Constant: the attribute value is a list of entity names.
295      * @see #getAttributeType
296      */
297     public final static int ATTRIBUTE_ENTITIES = 6;
298 
299     /**
300      * Constant: the attribute value is a name token.
301      * @see #getAttributeType
302      */
303     public final static int ATTRIBUTE_NMTOKEN = 7;
304 
305     /**
306      * Constant: the attribute value is a list of name tokens.
307      * @see #getAttributeType
308      */
309     public final static int ATTRIBUTE_NMTOKENS = 8;
310 
311     /**
312      * Constant: the attribute value is a token from an enumeration.
313      * @see #getAttributeType
314      */
315     public final static int ATTRIBUTE_ENUMERATED = 9;
316 
317     /**
318      * Constant: the attribute is the name of a notation.
319      * @see #getAttributeType
320      */
321     public final static int ATTRIBUTE_NOTATION = 10;
322 
323 
324     //
325     // When the class is loaded, populate the hash table of
326     // attribute types.
327     //
328 
329     /**
330      * Hash table of attribute types.
331      */
332     private static Hashtable attributeTypeHash;
333     static {
334     attributeTypeHash = new Hashtable (13);
335     attributeTypeHash.put ("CDATA", new Integer (ATTRIBUTE_CDATA));
336     attributeTypeHash.put ("ID", new Integer (ATTRIBUTE_ID));
337     attributeTypeHash.put ("IDREF", new Integer (ATTRIBUTE_IDREF));
338     attributeTypeHash.put ("IDREFS", new Integer (ATTRIBUTE_IDREFS));
339     attributeTypeHash.put ("ENTITY", new Integer (ATTRIBUTE_ENTITY));
340     attributeTypeHash.put ("ENTITIES", new Integer (ATTRIBUTE_ENTITIES));
341     attributeTypeHash.put ("NMTOKEN", new Integer (ATTRIBUTE_NMTOKEN));
342     attributeTypeHash.put ("NMTOKENS", new Integer (ATTRIBUTE_NMTOKENS));
343     attributeTypeHash.put ("NOTATION", new Integer (ATTRIBUTE_NOTATION));
344     }
345 
346 
347     //
348     // Constants for supported encodings.  "external" is just a flag.
349     //
350     private final static int ENCODING_EXTERNAL = 0;
351     private final static int ENCODING_UTF_8 = 1;
352     private final static int ENCODING_ISO_8859_1 = 2;
353     private final static int ENCODING_UCS_2_12 = 3;
354     private final static int ENCODING_UCS_2_21 = 4;
355     private final static int ENCODING_UCS_4_1234 = 5;
356     private final static int ENCODING_UCS_4_4321 = 6;
357     private final static int ENCODING_UCS_4_2143 = 7;
358     private final static int ENCODING_UCS_4_3412 = 8;
359     private final static int ENCODING_ASCII = 9;
360 
361 
362     //
363     // Constants for attribute default value.
364     //
365 
366     /**
367      * Constant: the attribute is not declared.
368      * @see #getAttributeDefaultValueType
369      */
370     public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;
371 
372     /**
373      * Constant: the attribute has a literal default value specified.
374      * @see #getAttributeDefaultValueType
375      * @see #getAttributeDefaultValue
376      */
377     public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;
378 
379     /**
380      * Constant: the attribute was declared #IMPLIED.
381      * @see #getAttributeDefaultValueType
382      */
383     public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;
384 
385     /**
386      * Constant: the attribute was declared #REQUIRED.
387      * @see #getAttributeDefaultValueType
388      */
389     public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;
390 
391     /**
392      * Constant: the attribute was declared #FIXED.
393      * @see #getAttributeDefaultValueType
394      * @see #getAttributeDefaultValue
395      */
396     public final static int ATTRIBUTE_DEFAULT_FIXED = 34;
397 
398 
399     //
400     // Constants for input.
401     //
402     private final static int INPUT_NONE = 0;
403     private final static int INPUT_INTERNAL = 1;
404     private final static int INPUT_STREAM = 3;
405     private final static int INPUT_BUFFER = 4;
406     private final static int INPUT_READER = 5;
407 
408 
409     //
410     // Flags for reading literals.
411     //
412     // expand general entity refs (attribute values in dtd and content)
413     private final static int LIT_ENTITY_REF = 2;
414     // normalize this value (space chars) (attributes, public ids)
415     private final static int LIT_NORMALIZE = 4;
416     // literal is an attribute value 
417     private final static int LIT_ATTRIBUTE = 8;
418     // don't expand parameter entities
419     private final static int LIT_DISABLE_PE = 16;
420     // don't expand [or parse] character refs
421     private final static int LIT_DISABLE_CREF = 32;
422     // don't parse general entity refs
423     private final static int LIT_DISABLE_EREF = 64;
424     // don't expand general entities, but make sure we _could_
425     private final static int LIT_ENTITY_CHECK = 128;
426     // literal is a public ID value 
427     private final static int LIT_PUBID = 256;
428 
429     //
430     // Flags affecting PE handling in DTDs (if expandPE is true).
431     // PEs expand with space padding, except inside literals.
432     //
433     private final static int CONTEXT_NORMAL = 0;
434     private final static int CONTEXT_LITERAL = 1;
435 
436 
437     //////////////////////////////////////////////////////////////////////
438     // Error reporting.
439     //////////////////////////////////////////////////////////////////////
440 
441 
442     /**
443      * Report an error.
444      * @param message The error message.
445      * @param textFound The text that caused the error (or null).
446      * @see SAXDriver#error
447      * @see #line
448      */
449     private void error (String message, String textFound, String textExpected)
450     throws SAXException
451     {
452     if (textFound != null) {
453         message = message + " (found \"" + textFound + "\")";
454     }
455     if (textExpected != null) {
456         message = message + " (expected \"" + textExpected + "\")";
457     }
458     String uri = null;
459 
460     if (externalEntity != null) {
461         uri = externalEntity.getURL ().toString ();
462     }
463     handler.error (message, uri, line, column);
464 
465     // "can't happen"
466     throw new SAXException (message);
467     }
468 
469 
470     /**
471      * Report a serious error.
472      * @param message The error message.
473      * @param textFound The text that caused the error (or null).
474      */
475     private void error (String message, char textFound, String textExpected)
476     throws SAXException
477     {
478     error (message, new Character (textFound).toString (), textExpected);
479     }
480 
481     /** Report typical case fatal errors. */
482     private void error (String message)
483     throws SAXException
484     {
485     error (message, null, null);
486     }
487 
488 
489     //////////////////////////////////////////////////////////////////////
490     // Major syntactic productions.
491     //////////////////////////////////////////////////////////////////////
492 
493 
494     /**
495      * Parse an XML document.
496      * <pre>
497      * [1] document ::= prolog element Misc*
498      * </pre>
499      * <p>This is the top-level parsing function for a single XML
500      * document.  As a minimum, a well-formed document must have
501      * a document element, and a valid document must have a prolog
502      * (one with doctype) as well.
503      */
504     private void parseDocument ()
505     throws Exception
506     {
507         try {                                       // added by MHK
508             parseProlog ();
509             require ('<', "document prolog");
510             parseElement ();
511         } catch (EOFException ee) {                 // added by MHK
512             error("premature end of file", "[EOF]", null);
513         }
514         
515         try {
516             parseMisc ();   //skip all white, PIs, and comments
517             char c = readCh ();    //if this doesn't throw an exception...
518             error ("unexpected characters after document end", c, null);
519         } catch (EOFException e) {
520             return;
521         }
522     }
523 
524 
525     /**
526      * Skip a comment.
527      * <pre>
528      * [15] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
529      * </pre>
530      * <p> (The <code>&lt;!--</code> has already been read.)
531      */
532     private void parseComment ()
533     throws Exception
534     {
535     char c;
536     boolean saved = expandPE;
537 
538     expandPE = false;
539     parseUntil ("--");
540     require ('>', "-- in comment");
541     expandPE = saved;
542     handler.comment (dataBuffer, 0, dataBufferPos);
543     dataBufferPos = 0;
544     }
545 
546 
547     /**
548      * Parse a processing instruction and do a call-back.
549      * <pre>
550      * [16] PI ::= '&lt;?' PITarget
551      *      (S (Char* - (Char* '?&gt;' Char*)))?
552      *      '?&gt;'
553      * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') )
554      * </pre>
555      * <p> (The <code>&lt;?</code> has already been read.)
556      */
557     private void parsePI ()
558     throws SAXException, IOException
559     {
560     String name;
561     boolean saved = expandPE;
562 
563     expandPE = false;
564     name = readNmtoken (true);
565     if ("xml".equalsIgnoreCase (name))
566         error ("Illegal processing instruction target", name, null);
567     if (!tryRead ("?>")) {
568         requireWhitespace ();
569         parseUntil ("?>");
570     }
571     expandPE = saved;
572     handler.processingInstruction (name, dataBufferToString ());
573     }
574 
575 
576     /**
577      * Parse a CDATA section.
578      * <pre>
579      * [18] CDSect ::= CDStart CData CDEnd
580      * [19] CDStart ::= '&lt;![CDATA['
581      * [20] CData ::= (Char* - (Char* ']]&gt;' Char*))
582      * [21] CDEnd ::= ']]&gt;'
583      * </pre>
584      * <p> (The '&lt;![CDATA[' has already been read.)
585      */
586     private void parseCDSect ()
587     throws Exception
588     {
589     parseUntil ("]]>");
590     dataBufferFlush ();
591     }
592 
593 
594     /**
595      * Parse the prolog of an XML document.
596      * <pre>
597      * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
598      * </pre>
599      * <p>There are a couple of tricks here.  First, it is necessary to
600      * declare the XML default attributes after the DTD (if present)
601      * has been read. [??]  Second, it is not possible to expand general
602      * references in attribute value literals until after the entire
603      * DTD (if present) has been parsed.
604      * <p>We do not look for the XML declaration here, because it was
605      * handled by pushURL ().
606      * @see pushURL
607      */
608     private void parseProlog ()
609     throws Exception
610     {
611     parseMisc ();
612 
613     if (tryRead ("<!DOCTYPE")) {
614         parseDoctypedecl ();
615         parseMisc ();
616     }
617     }
618 
619 
620     /**
621      * Parse the XML declaration.
622      * <pre>
623      * [23] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
624      * [24] VersionInfo ::= S 'version' Eq
625      *      ("'" VersionNum "'" | '"' VersionNum '"' )
626      * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*
627      * [32] SDDecl ::= S 'standalone' Eq
628      *      ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' )
629      * [80] EncodingDecl ::= S 'encoding' Eq
630      *      ( "'" EncName "'" | "'" EncName "'" )
631      * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
632      * </pre>
633      * <p> (The <code>&lt;?xml</code> and whitespace have already been read.)
634      * @return the encoding in the declaration, uppercased; or null
635      * @see #parseTextDecl
636      * @see #setupDecoding
637      */
638     private String parseXMLDecl (boolean ignoreEncoding)
639     throws SAXException, IOException
640     {
641     String  version;
642     String  encodingName = null;
643     String  standalone = null;
644     int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
645 
646     // Read the version.
647     require ("version", "XML declaration");
648     parseEq ();
649     version = readLiteral (flags);
650     if (!version.equals ("1.0")) {
651         error ("unsupported XML version", version, "1.0");
652     }
653 
654     // Try reading an encoding declaration.
655     boolean white = tryWhitespace ();
656     if (tryRead ("encoding")) {
657         if (!white)
658         error ("whitespace required before 'encoding='");
659         parseEq ();
660         encodingName = readLiteral (flags);
661         if (!ignoreEncoding)
662         setupDecoding (encodingName);
663     }
664 
665     // Try reading a standalone declaration
666     if (encodingName != null)
667         white = tryWhitespace ();
668     if (tryRead ("standalone")) {
669         if (!white)
670         error ("whitespace required before 'standalone='");
671         parseEq ();
672         standalone = readLiteral (flags);
673         if (! ("yes".equals (standalone) || "no".equals (standalone)))
674         error ("standalone flag must be 'yes' or 'no'");
675     }
676 
677     skipWhitespace ();
678     require ("?>", "XML declaration");
679 
680     return encodingName;
681     }
682 
683 
684     /**
685      * Parse a text declaration.
686      * <pre>
687      * [79] TextDecl ::= '&lt;?xml' VersionInfo? EncodingDecl S? '?&gt;'
688      * [80] EncodingDecl ::= S 'encoding' Eq
689      *      ( '"' EncName '"' | "'" EncName "'" )
690      * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
691      * </pre>
692      * <p> (The <code>&lt;?xml</code>' and whitespace have already been read.)
693      * @return the encoding in the declaration, uppercased; or null
694      * @see #parseXMLDecl
695      * @see #setupDecoding
696      */
697     private String parseTextDecl (boolean ignoreEncoding)
698     throws SAXException, IOException
699     {
700     String  encodingName = null;
701     int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
702 
703     // Read an optional version.
704     if (tryRead ("version")) {
705         String version;
706         parseEq ();
707         version = readLiteral (flags);
708         if (!version.equals ("1.0")) {
709         error ("unsupported XML version", version, "1.0");
710         }
711         requireWhitespace ();
712     }
713 
714 
715     // Read the encoding.
716     require ("encoding", "XML text declaration");
717     parseEq ();
718     encodingName = readLiteral (flags);
719     if (!ignoreEncoding)
720         setupDecoding (encodingName);
721 
722     skipWhitespace ();
723     require ("?>", "XML text declaration");
724 
725     return encodingName;
726     }
727 
728 
729     /**
730      * Sets up internal state so that we can decode an entity using the
731      * specified encoding.  This is used when we start to read an entity
732      * and we have been given knowledge of its encoding before we start to
733      * read any data (e.g. from a SAX input source or from a MIME type).
734      *
735      * <p> It is also used after autodetection, at which point only very
736      * limited adjustments to the encoding may be used (switching between
737      * related builtin decoders).
738      *
739      * @param encodingName The name of the encoding specified by the user.
740      * @exception IOException if the encoding isn't supported either
741      *  internally to this parser, or by the hosting JVM.
742      * @see #parseXMLDecl
743      * @see #parseTextDecl
744      */
745     private void setupDecoding (String encodingName)
746     throws SAXException, IOException
747     {
748     encodingName = encodingName.toUpperCase ();
749 
750     // ENCODING_EXTERNAL indicates an encoding that wasn't
751     // autodetected ... we can use builtin decoders, or
752     // ones from the JVM (InputStreamReader).
753 
754     // Otherwise we can only tweak what was autodetected, and
755     // only for single byte (ASCII derived) builtin encodings.
756 
757     // ASCII-derived encodings
758     if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) {
759         if (encodingName.equals ("ISO-8859-1")
760                 || encodingName.equals ("8859_1")
761                 || encodingName.equals ("ISO8859_1")
762           ) {
763             encoding = ENCODING_ISO_8859_1;
764             return;
765         } else if (encodingName.equals ("US-ASCII")
766                 || encodingName.equals ("ASCII")) {
767             encoding = ENCODING_ASCII;
768             return;
769         } else if (encodingName.equals ("UTF-8")
770                 || encodingName.equals ("UTF8")) {
771             encoding = ENCODING_UTF_8;
772             return;
773         } else if (encoding != ENCODING_EXTERNAL) {
774             // used to start with a new reader ...
775             throw new EncodingException (encodingName);
776         }
777         // else fallthrough ...
778         // it's ASCII-ish and something other than a builtin
779     }
780 
781     // Unicode and such
782     if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) {
783         if (!(encodingName.equals ("ISO-10646-UCS-2")
784             || encodingName.equals ("UTF-16")
785             || encodingName.equals ("UTF-16BE")
786             || encodingName.equals ("UTF-16LE")))
787         error ("unsupported Unicode encoding",
788                encodingName,
789                "UTF-16");
790         return;
791     }
792 
793     // four byte encodings
794     if (encoding == ENCODING_UCS_4_1234
795         || encoding == ENCODING_UCS_4_4321
796         || encoding == ENCODING_UCS_4_2143
797         || encoding == ENCODING_UCS_4_3412) {
798         if (!encodingName.equals ("ISO-10646-UCS-4"))
799         error ("unsupported 32-bit encoding",
800                encodingName,
801                "ISO-10646-UCS-4");
802         return;
803     }
804 
805     // assert encoding == ENCODING_EXTERNAL
806     // if (encoding != ENCODING_EXTERNAL)
807     //     throw new RuntimeException ("encoding = " + encoding);
808 
809     if (encodingName.equals ("UTF-16BE")) {
810         encoding = ENCODING_UCS_2_12;
811         return;
812     }
813     if (encodingName.equals ("UTF-16LE")) {
814         encoding = ENCODING_UCS_2_21;
815         return;
816     }
817 
818     // We couldn't use the builtin decoders at all.  But we can try to
819     // create a reader, since we haven't messed up buffering.  Tweak
820     // the encoding name if necessary.
821 
822     if (encodingName.equals ("UTF-16")
823         || encodingName.equals ("ISO-10646-UCS-2"))
824         encodingName = "Unicode";
825     // Ignoring all the EBCDIC aliases here
826 
827     reader = new InputStreamReader (is, encodingName);
828     sourceType = INPUT_READER;
829     }
830 
831 
832     /**
833      * Parse miscellaneous markup outside the document element and DOCTYPE
834      * declaration.
835      * <pre>
836      * [27] Misc ::= Comment | PI | S
837      * </pre>
838      */
839     private void parseMisc ()
840     throws Exception
841     {
842     while (true) {
843         skipWhitespace ();
844         if (tryRead ("<?")) {
845         parsePI ();
846         } else if (tryRead ("<!--")) {
847         parseComment ();
848         } else {
849         return;
850         }
851     }
852     }
853 
854 
855     /**
856      * Parse a document type declaration.
857      * <pre>
858      * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
859      *      ('[' (markupdecl | PEReference | S)* ']' S?)? '&gt;'
860      * </pre>
861      * <p> (The <code>&lt;!DOCTYPE</code> has already been read.)
862      */
863     private void parseDoctypedecl ()
864     throws Exception
865     {
866     String doctypeName, ids[];
867 
868     // Read the document type name.
869     requireWhitespace ();
870     doctypeName = readNmtoken (true);
871 
872     // Read the External subset's IDs
873     skipWhitespace ();
874     ids = readExternalIds (false);
875 
876     // report (a) declaration of name, (b) lexical info (ids)
877     handler.doctypeDecl (doctypeName, ids [0], ids [1]);
878 
879     // Internal subset is parsed first, if present
880     skipWhitespace ();
881     if (tryRead ('[')) {
882 
883         // loop until the subset ends
884         while (true) {
885         expandPE = true;
886         skipWhitespace ();
887         expandPE = false;
888         if (tryRead (']')) {
889             break;      // end of subset
890         } else {
891             // WFC, PEs in internal subset (only between decls)
892             peIsError = expandPE = true;
893             parseMarkupdecl ();
894             peIsError = expandPE = false;
895         }
896         }
897     }
898 
899     // Read the external subset, if any
900     if (ids [1] != null) {
901         pushURL ("[external subset]", ids [0], ids [1], null, null, null, false);
902 
903         // Loop until we end up back at '>'
904         while (true) {
905         expandPE = true;
906         skipWhitespace ();
907         expandPE = false;
908         if (tryRead ('>')) {
909             break;
910         } else {
911             expandPE = true;
912             parseMarkupdecl ();
913             expandPE = false;
914         }
915         }
916     } else {
917         // No external subset.
918         skipWhitespace ();
919         require ('>', "internal DTD subset");
920     }
921 
922     // done dtd
923     handler.endDoctype ();
924     expandPE = false;
925     }
926 
927 
928     /**
929      * Parse a markup declaration in the internal or external DTD subset.
930      * <pre>
931      * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
932      *      | NotationDecl | PI | Comment
933      * [30] extSubsetDecl ::= (markupdecl | conditionalSect
934      *      | PEReference | S) *
935      * </pre>
936      * <p> Reading toplevel PE references is handled as a lexical issue
937      * by the caller, as is whitespace.
938      */
939     private void parseMarkupdecl ()
940     throws Exception
941     {
942     if (tryRead ("<!ELEMENT")) {
943         parseElementdecl ();
944     } else if (tryRead ("<!ATTLIST")) {
945         parseAttlistDecl ();
946     } else if (tryRead ("<!ENTITY")) {
947         parseEntityDecl ();
948     } else if (tryRead ("<!NOTATION")) {
949         parseNotationDecl ();
950     } else if (tryRead ("<?")) {
951         parsePI ();
952     } else if (tryRead ("<!--")) {
953         parseComment ();
954     } else if (tryRead ("<![")) {
955         if (inputStack.size () > 0)
956         parseConditionalSect ();
957         else
958         error ("conditional sections illegal in internal subset");
959     } else {
960         error ("expected markup declaration");
961     }
962     }
963 
964 
965     /**
966      * Parse an element, with its tags.
967      * <pre>
968      * [39] element ::= EmptyElementTag | STag content ETag
969      * [40] STag ::= '&lt;' Name (S Attribute)* S? '&gt;'
970      * [44] EmptyElementTag ::= '&lt;' Name (S Attribute)* S? '/&gt;'
971      * </pre>
972      * <p> (The '&lt;' has already been read.)
973      * <p>NOTE: this method actually chains onto parseContent (), if necessary,
974      * and parseContent () will take care of calling parseETag ().
975      */
976     private void parseElement ()
977     throws Exception
978     {
979     String  gi;
980     char    c;
981     int oldElementContent = currentElementContent;
982     String  oldElement = currentElement;
983     Object  element [];
984 
985     // This is the (global) counter for the
986     // array of specified attributes.
987     tagAttributePos = 0;
988 
989     // Read the element type name.
990     gi = readNmtoken (true);
991 
992     // Determine the current content type.
993     currentElement = gi;
994     element = (Object []) elementInfo.get (gi);
995     currentElementContent = getContentType (element, CONTENT_ANY);
996 
997     // Read the attributes, if any.
998     // After this loop, "c" is the closing delimiter.
999     boolean white = tryWhitespace ();
1000    c = readCh ();
1001    while (c != '/' && c != '>') {
1002        unread (c);
1003        if (!white)
1004        error ("need whitespace between attributes");
1005        parseAttribute (gi);
1006        white = tryWhitespace ();
1007        c = readCh ();
1008    }
1009
1010    // Supply any defaulted attributes.
1011    Enumeration atts = declaredAttributes (element);
1012    if (atts != null) {
1013        String aname;
1014loop:
1015        while (atts.hasMoreElements ()) {
1016            aname = (String) atts.nextElement ();
1017            // See if it was specified.
1018            for (int i = 0; i < tagAttributePos; i++) {
1019                if (tagAttributes [i] == aname) {
1020                continue loop;
1021                }
1022            }
1023            // I guess not...
1024            String defaultVal = getAttributeExpandedValue (gi, aname);
1025            if (defaultVal!=null) {
1026                handler.attribute (aname, defaultVal, false);
1027            }
1028        }
1029    }
1030
1031    // Figure out if this is a start tag
1032    // or an empty element, and dispatch an
1033    // event accordingly.
1034    switch (c) {
1035    case '>':
1036        handler.startElement (gi);
1037        parseContent ();
1038        break;
1039    case '/':
1040        require ('>', "empty element tag");
1041        handler.startElement (gi);
1042        handler.endElement (gi);
1043        break;
1044    }
1045
1046    // Restore the previous state.
1047    currentElement = oldElement;
1048    currentElementContent = oldElementContent;
1049    }
1050
1051
1052    /**
1053     * Parse an attribute assignment.
1054     * <pre>
1055     * [41] Attribute ::= Name Eq AttValue
1056     * </pre>
1057     * @param name The name of the attribute's element.
1058     * @see SAXDriver#attribute
1059     */
1060    private void parseAttribute (String name)
1061    throws Exception
1062    {
1063    String aname;
1064    int type;
1065    String value;
1066    int flags = LIT_ATTRIBUTE |  LIT_ENTITY_REF;
1067
1068    // Read the attribute name.
1069    aname = readNmtoken (true);
1070    type = getAttributeType (name, aname);
1071
1072    // Parse '='
1073    parseEq ();
1074
1075    // Read the value, normalizing whitespace
1076    // unless it is CDATA.
1077    if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) {
1078        value = readLiteral (flags);
1079    } else {
1080        value = readLiteral (flags | LIT_NORMALIZE);
1081    }
1082
1083    // WFC: no duplicate attributes
1084    for (int i = 0; i < tagAttributePos; i++)
1085        if (aname.equals (tagAttributes [i]))
1086        error ("duplicate attribute", aname, null);
1087    
1088        // Above check is almost redundant; the SAXDriver performs a more
1089        // rigorous check that the expanded-names of the attributes are distinct. However,
1090        // the check is needed here to spot duplicate xmlns:xx attributes. - MHK
1091
1092    // Inform the handler about the
1093    // attribute.
1094    handler.attribute (aname, value, true);
1095    dataBufferPos = 0;
1096
1097    // Note that the attribute has been
1098    // specified.
1099    if (tagAttributePos == tagAttributes.length) {
1100        String newAttrib[] = new String [tagAttributes.length * 2];
1101        System.arraycopy (tagAttributes, 0, newAttrib, 0, tagAttributePos);
1102        tagAttributes = newAttrib;
1103    }
1104    tagAttributes [tagAttributePos++] = aname;
1105    }
1106
1107
1108    /**
1109     * Parse an equals sign surrounded by optional whitespace.
1110     * <pre>
1111     * [25] Eq ::= S? '=' S?
1112     * </pre>
1113     */
1114    private void parseEq ()
1115    throws SAXException, IOException
1116    {
1117    skipWhitespace ();
1118    require ('=', "attribute name");
1119    skipWhitespace ();
1120    }
1121
1122
1123    /**
1124     * Parse an end tag.
1125     * <pre>
1126     * [42] ETag ::= '</' Name S? '>'
1127     * </pre>
1128     * <p>NOTE: parseContent () chains to here, we already read the
1129     * "&lt;/".
1130     */
1131    private void parseETag ()
1132    throws Exception
1133    {
1134    require (currentElement, "element end tag");
1135    skipWhitespace ();
1136    require ('>', "name in end tag");
1137    handler.endElement (currentElement);
1138    // not re-reporting any SAXException re bogus end tags,
1139    // even though that diagnostic might be clearer ...
1140    }
1141
1142
1143    /**
1144     * Parse the content of an element.
1145     * <pre>
1146     * [43] content ::= (element | CharData | Reference
1147     *      | CDSect | PI | Comment)*
1148     * [67] Reference ::= EntityRef | CharRef
1149     * </pre>
1150     * <p> NOTE: consumes ETtag.
1151     */
1152    private void parseContent ()
1153    throws Exception
1154    {
1155    char c;
1156    while (true) {
1157        //switch (currentElementContent) {
1158        //    case CONTENT_ANY:
1159        //    case CONTENT_MIXED:
1160        //    case CONTENT_UNDECLARED:    // this line added by MHK 24 May 2000
1161        //    case CONTENT_EMPTY:         // this line added by MHK 8 Sept 2000
1162        //        parseCharData ();
1163        //        break;
1164        //    case CONTENT_ELEMENTS:
1165        //        //parseWhitespace ();   // removed MHK 27 May 2001. The problem is that
1166        //                                // with element content, the text should be whitespace
1167        //                                // but if the document is invalid it might not be.
1168        //                                // Replaced with....
1169        //        parseCharData();        // This processes any char data, but still reports
1170        //                                // it as ignorable white space if within element content.
1171        //        break;
1172        //}
1173        
1174        parseCharData();    // parse it the same way regardless of content type
1175                            // because it might not be valid anyway
1176
1177        // Handle delimiters
1178        c = readCh ();
1179        switch (c) {
1180        case '&':           // Found "&"
1181
1182            c = readCh ();
1183            if (c == '#') {
1184                parseCharRef ();
1185            } else {
1186                unread (c);
1187                parseEntityRef (true);
1188            }
1189            break;
1190
1191        case '<':           // Found "<"
1192            dataBufferFlush ();
1193            c = readCh ();
1194            switch (c) {
1195              case '!':             // Found "<!"
1196                c = readCh ();
1197                switch (c) {
1198                  case '-':         // Found "<!-"
1199                    require ('-', "start of comment");
1200                    parseComment ();
1201                    break;
1202                  case '[':         // Found "<!["
1203                    require ("CDATA[", "CDATA section");
1204                    handler.startCDATA ();
1205                    inCDATA = true;
1206                    parseCDSect ();
1207                    inCDATA = false;
1208                    handler.endCDATA ();
1209                    break;
1210                  default:
1211                    error ("expected comment or CDATA section", c, null);
1212                    break;
1213                }
1214                break;
1215
1216              case '?':         // Found "<?"
1217                parsePI ();
1218                break;
1219
1220              case '/':         // Found "</"
1221                parseETag ();
1222                return;
1223
1224              default:      // Found "<" followed by something else
1225                unread (c);
1226                parseElement ();
1227                break;
1228            }
1229            }
1230        }
1231    }
1232
1233
1234    /**
1235     * Parse an element type declaration.
1236     * <pre>
1237     * [45] elementdecl ::= '&lt;!ELEMENT' S Name S contentspec S? '&gt;'
1238     * </pre>
1239     * <p> NOTE: the '&lt;!ELEMENT' has already been read.
1240     */
1241    private void parseElementdecl ()
1242    throws Exception
1243    {
1244    String name;
1245
1246    requireWhitespace ();
1247    // Read the element type name.
1248    name = readNmtoken (true);
1249
1250    requireWhitespace ();
1251    // Read the content model.
1252    parseContentspec (name);
1253
1254    skipWhitespace ();
1255    require ('>', "element declaration");
1256    }
1257
1258
1259    /**
1260     * Content specification.
1261     * <pre>
1262     * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1263     * </pre>
1264     */
1265    private void parseContentspec (String name)
1266    throws Exception
1267    {
1268    if (tryRead ("EMPTY")) {
1269        setElement (name, CONTENT_EMPTY, null, null);
1270        return;
1271    } else if (tryRead ("ANY")) {
1272        setElement (name, CONTENT_ANY, null, null);
1273        return;
1274    } else {
1275        require ('(', "element name");
1276        dataBufferAppend ('(');
1277        skipWhitespace ();
1278        if (tryRead ("#PCDATA")) {
1279        dataBufferAppend ("#PCDATA");
1280        parseMixed ();
1281        setElement (name, CONTENT_MIXED, dataBufferToString (), null);
1282        } else {
1283        parseElements ();
1284        setElement (name, CONTENT_ELEMENTS,
1285            dataBufferToString (), null);
1286        }
1287    }
1288    }
1289
1290
1291    /**
1292     * Parse an element-content model.
1293     * <pre>
1294     * [47] elements ::= (choice | seq) ('?' | '*' | '+')?
1295     * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
1296     * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
1297     * </pre>
1298     *
1299     * <p> NOTE: the opening '(' and S have already been read.
1300     */
1301    private void parseElements ()
1302    throws Exception
1303    {
1304    char c;
1305    char sep;
1306
1307    // Parse the first content particle
1308    skipWhitespace ();
1309    parseCp ();
1310
1311    // Check for end or for a separator.
1312    skipWhitespace ();
1313    c = readCh ();
1314    switch (c) {
1315    case ')':
1316        dataBufferAppend (')');
1317        c = readCh ();
1318        switch (c) {
1319        case '*':
1320        case '+':
1321        case '?':
1322        dataBufferAppend (c);
1323        break;
1324        default:
1325        unread (c);
1326        }
1327        return;
1328    case ',':           // Register the separator.
1329    case '|':
1330        sep = c;
1331        dataBufferAppend (c);
1332        break;
1333    default:
1334        error ("bad separator in content model", c, null);
1335        return;
1336    }
1337
1338    // Parse the rest of the content model.
1339    while (true) {
1340        skipWhitespace ();
1341        parseCp ();
1342        skipWhitespace ();
1343        c = readCh ();
1344        if (c == ')') {
1345        dataBufferAppend (')');
1346        break;
1347        } else if (c != sep) {
1348        error ("bad separator in content model", c, null);
1349        return;
1350        } else {
1351        dataBufferAppend (c);
1352        }
1353    }
1354
1355    // Check for the occurrence indicator.
1356    c = readCh ();
1357    switch (c) {
1358    case '?':
1359    case '*':
1360    case '+':
1361        dataBufferAppend (c);
1362        return;
1363    default:
1364        unread (c);
1365        return;
1366    }
1367    }
1368
1369
1370    /**
1371     * Parse a content particle.
1372     * <pre>
1373     * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1374     * </pre>
1375     */
1376    private void parseCp ()
1377    throws Exception
1378    {
1379    if (tryRead ('(')) {
1380        dataBufferAppend ('(');
1381        parseElements ();
1382    } else {
1383        dataBufferAppend (readNmtoken (true));
1384        char c = readCh ();
1385        switch (c) {
1386        case '?':
1387        case '*':
1388        case '+':
1389        dataBufferAppend (c);
1390        break;
1391        default:
1392        unread (c);
1393        break;
1394        }
1395    }
1396    }
1397
1398
1399    /**
1400     * Parse mixed content.
1401     * <pre>
1402     * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
1403     *        | '(' S? ('#PCDATA') S? ')'
1404     * </pre>
1405     */
1406    private void parseMixed ()
1407    throws Exception
1408    {
1409
1410    // Check for PCDATA alone.
1411    skipWhitespace ();
1412    if (tryRead (')')) {
1413        dataBufferAppend (")*");
1414        tryRead ('*');
1415        return;
1416    }
1417
1418    // Parse mixed content.
1419    skipWhitespace ();
1420    while (!tryRead (")*")) {
1421        require ('|', "alternative");
1422        dataBufferAppend ('|');
1423        skipWhitespace ();
1424        dataBufferAppend (readNmtoken (true));
1425        skipWhitespace ();
1426    }
1427    dataBufferAppend (")*");
1428    }
1429
1430
1431    /**
1432     * Parse an attribute list declaration.
1433     * <pre>
1434     * [52] AttlistDecl ::= '&lt;!ATTLIST' S Name AttDef* S? '&gt;'
1435     * </pre>
1436     * <p>NOTE: the '&lt;!ATTLIST' has already been read.
1437     */
1438    private void parseAttlistDecl ()
1439    throws Exception
1440    {
1441    String elementName;
1442
1443    requireWhitespace ();
1444    elementName = readNmtoken (true);
1445    boolean white = tryWhitespace ();
1446    while (!tryRead ('>')) {
1447        if (!white)
1448        error ("whitespace required before attribute definition");
1449        parseAttDef (elementName);
1450        white = tryWhitespace ();
1451    }
1452    }
1453
1454
1455    /**
1456     * Parse a single attribute definition.
1457     * <pre>
1458     * [53] AttDef ::= S Name S AttType S DefaultDecl
1459     * </pre>
1460     */
1461    private void parseAttDef (String elementName)
1462    throws Exception
1463    {
1464    String name;
1465    int type;
1466    String enum = null;
1467
1468    // Read the attribute name.
1469    name = readNmtoken (true);
1470
1471    // Read the attribute type.
1472    requireWhitespace ();
1473    type = readAttType ();
1474
1475    // Get the string of enumerated values
1476    // if necessary.
1477    if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) {
1478        enum = dataBufferToString ();
1479    }
1480
1481    // Read the default value.
1482    requireWhitespace ();
1483    parseDefault (elementName, name, type, enum);
1484    }
1485
1486
1487    /**
1488     * Parse the attribute type.
1489     * <pre>
1490     * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1491     * [55] StringType ::= 'CDATA'
1492     * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
1493     *      | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
1494     * [57] EnumeratedType ::= NotationType | Enumeration
1495     * </pre>
1496     */
1497    private int readAttType ()
1498    throws Exception
1499    {
1500    if (tryRead ('(')) {
1501        parseEnumeration (false);
1502        return ATTRIBUTE_ENUMERATED;
1503    } else {
1504        String typeString = readNmtoken (true);
1505        if (typeString.equals ("NOTATION")) {
1506        parseNotationType ();
1507        }
1508        Integer type = (Integer) attributeTypeHash.get (typeString);
1509        if (type == null) {
1510        error ("illegal attribute type", typeString, null);
1511        return ATTRIBUTE_UNDECLARED;
1512        } else {
1513        return type.intValue ();
1514        }
1515    }
1516    }
1517
1518
1519    /**
1520     * Parse an enumeration.
1521     * <pre>
1522     * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
1523     * </pre>
1524     * <p>NOTE: the '(' has already been read.
1525     */
1526    private void parseEnumeration (boolean isNames)
1527    throws Exception
1528    {
1529    dataBufferAppend ('(');
1530
1531    // Read the first token.
1532    skipWhitespace ();
1533    dataBufferAppend (readNmtoken (isNames));
1534    // Read the remaining tokens.
1535    skipWhitespace ();
1536    while (!tryRead (')')) {
1537        require ('|', "enumeration value");
1538        dataBufferAppend ('|');
1539        skipWhitespace ();
1540        dataBufferAppend (readNmtoken (isNames));
1541        skipWhitespace ();
1542    }
1543    dataBufferAppend (')');
1544    }
1545
1546
1547    /**
1548     * Parse a notation type for an attribute.
1549     * <pre>
1550     * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks
1551     *      (S? '|' S? name)* S? ')'
1552     * </pre>
1553     * <p>NOTE: the 'NOTATION' has already been read
1554     */
1555    private void parseNotationType ()
1556    throws Exception
1557    {
1558    requireWhitespace ();
1559    require ('(', "NOTATION");
1560
1561    parseEnumeration (true);
1562    }
1563
1564
1565    /**
1566     * Parse the default value for an attribute.
1567     * <pre>
1568     * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
1569     *      | (('#FIXED' S)? AttValue)
1570     * </pre>
1571     */
1572    private void parseDefault (
1573    String elementName,
1574    String name,
1575    int type,
1576    String enum
1577    ) throws Exception
1578    {
1579    int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
1580    String  value = null;
1581    int flags = LIT_ATTRIBUTE | LIT_DISABLE_CREF | LIT_ENTITY_CHECK | LIT_DISABLE_PE;
1582                                                                   // ^^^^^^^^^^^^^^
1583                                                                   // added MHK 20 Mar 2002
1584
1585    // Note: char refs not checked here, and input not normalized,
1586    // since it's done correctly later when we actually expand any
1587    // entity refs.  We ought to report char ref syntax errors now,
1588    // but don't.  Cost: unused defaults mean unreported WF errs.
1589    
1590    // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace
1591    // chars to spaces (doesn't matter when that's done if it doesn't
1592    // interfere with char refs expanding to whitespace).
1593
1594    if (tryRead ('#')) {
1595        if (tryRead ("FIXED")) {
1596        valueType = ATTRIBUTE_DEFAULT_FIXED;
1597        requireWhitespace ();
1598        value = readLiteral (flags);
1599        } else if (tryRead ("REQUIRED")) {
1600        valueType = ATTRIBUTE_DEFAULT_REQUIRED;
1601        } else if (tryRead ("IMPLIED")) {
1602        valueType = ATTRIBUTE_DEFAULT_IMPLIED;
1603        } else {
1604        error ("illegal keyword for attribute default value");
1605        }
1606    } else
1607        value = readLiteral (flags);
1608    setAttribute (elementName, name, type, enum, value, valueType);
1609    }
1610
1611
1612    /**
1613     * Parse a conditional section.
1614     * <pre>
1615     * [61] conditionalSect ::= includeSect || ignoreSect
1616     * [62] includeSect ::= '&lt;![' S? 'INCLUDE' S? '['
1617     *      extSubsetDecl ']]&gt;'
1618     * [63] ignoreSect ::= '&lt;![' S? 'IGNORE' S? '['
1619     *      ignoreSectContents* ']]&gt;'
1620     * [64] ignoreSectContents ::= Ignore
1621     *      ('&lt;![' ignoreSectContents* ']]&gt;' Ignore )*
1622     * [65] Ignore ::= Char* - (Char* ( '&lt;![' | ']]&gt;') Char* )
1623     * </pre>
1624     * <p> NOTE: the '&gt;![' has already been read.
1625     */
1626    private void parseConditionalSect ()
1627    throws Exception
1628    {
1629    skipWhitespace ();
1630    if (tryRead ("INCLUDE")) {
1631        skipWhitespace ();
1632        require ('[', "INCLUDE");
1633        skipWhitespace ();
1634        while (!tryRead ("]]>")) {
1635        parseMarkupdecl ();
1636        skipWhitespace ();
1637        }
1638    } else if (tryRead ("IGNORE")) {
1639        skipWhitespace ();
1640        require ('[', "IGNORE");
1641        int nesting = 1;
1642        char c;
1643        expandPE = false;
1644        for (int nest = 1; nest > 0;) {
1645        c = readCh ();
1646        switch (c) {
1647        case '<':
1648            if (tryRead ("![")) {
1649            nest++;
1650            }
1651        case ']':
1652            if (tryRead ("]>")) {
1653            nest--;
1654            }
1655        }
1656        }
1657        expandPE = true;
1658    } else {
1659        error ("conditional section must begin with INCLUDE or IGNORE");
1660    }
1661    }
1662
1663
1664    /**
1665     * Read and interpret a character reference.
1666     * <pre>
1667     * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
1668     * </pre>
1669     * <p>NOTE: the '&#' has already been read.
1670     */
1671    private void parseCharRef ()
1672    throws SAXException, IOException
1673    {
1674    int value = 0;
1675    char c;
1676
1677    if (tryRead ('x')) {
1678loop1:
1679        while (true) {
1680        c = readCh ();
1681        switch (c) {
1682        case '0':
1683        case '1':
1684        case '2':
1685        case '3':
1686        case '4':
1687        case '5':
1688        case '6':
1689        case '7':
1690        case '8':
1691        case '9':
1692        case 'a':
1693        case 'A':
1694        case 'b':
1695        case 'B':
1696        case 'c':
1697        case 'C':
1698        case 'd':
1699        case 'D':
1700        case 'e':
1701        case 'E':
1702        case 'f':
1703        case 'F':
1704            value *= 16;
1705            value += Integer.parseInt (new Character (c).toString (),
1706                    16);
1707            break;
1708        case ';':
1709            break loop1;
1710        default:
1711            error ("illegal character in character reference", c, null);
1712            break loop1;
1713        }
1714        }
1715    } else {
1716loop2:
1717        while (true) {
1718        c = readCh ();
1719        switch (c) {
1720        case '0':
1721        case '1':
1722        case '2':
1723        case '3':
1724        case '4':
1725        case '5':
1726        case '6':
1727        case '7':
1728        case '8':
1729        case '9':
1730            value *= 10;
1731            value += Integer.parseInt (new Character (c).toString (),
1732                    10);
1733            break;
1734        case ';':
1735            break loop2;
1736        default:
1737            error ("illegal character in character reference", c, null);
1738            break loop2;
1739        }
1740        }
1741    }
1742
1743    // check for character refs being legal XML
1744    if ((value < 0x0020
1745        && ! (value == '\n' || value == '\t' || value == '\r'))
1746        || (value >= 0xD800 && value <= 0xDFFF)
1747        || value == 0xFFFE || value == 0xFFFF
1748        || value > 0x0010ffff)
1749        error ("illegal XML character reference U+"
1750            + Integer.toHexString (value));
1751
1752    // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
1753    //  (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
1754    if (value <= 0x0000ffff) {
1755        // no surrogates needed
1756        dataBufferAppend ((char) value);
1757    } else if (value <= 0x0010ffff) {
1758        value -= 0x10000;
1759        // > 16 bits, surrogate needed
1760        dataBufferAppend ((char) (0xd800 | (value >> 10)));
1761        dataBufferAppend ((char) (0xdc00 | (value & 0x0003ff)));
1762    } else {
1763        // too big for surrogate
1764        error ("character reference " + value + " is too large for UTF-16",
1765           new Integer (value).toString (), null);
1766    }
1767    }
1768
1769
1770    /**
1771     * Parse and expand an entity reference.
1772     * <pre>
1773     * [68] EntityRef ::= '&' Name ';'
1774     * </pre>
1775     * <p>NOTE: the '&amp;' has already been read.
1776     * @param externalAllowed External entities are allowed here.
1777     */
1778    private void parseEntityRef (boolean externalAllowed)
1779    throws SAXException, IOException
1780    {
1781    String name;
1782
1783    name = readNmtoken (true);
1784    require (';', "entity reference");
1785    switch (getEntityType (name)) {
1786    case ENTITY_UNDECLARED:
1787        error ("reference to undeclared entity", name, null);
1788        break;
1789    case ENTITY_INTERNAL:
1790        pushString (name, getEntityValue (name));
1791        break;
1792    case ENTITY_TEXT:
1793        if (externalAllowed) {
1794        pushURL (name, getEntityPublicId (name),
1795             getEntitySystemId (name),
1796             null, null, null, true);
1797        } else {
1798        error ("reference to external entity in attribute value.",
1799            name, null);
1800        }
1801        break;
1802    case ENTITY_NDATA:
1803        if (externalAllowed) {
1804        error ("unparsed entity reference in content", name, null);
1805        } else {
1806        error ("reference to external entity in attribute value.",
1807            name, null);
1808        }
1809        break;
1810    }
1811    }
1812
1813
1814    /**
1815     * Parse and expand a parameter entity reference.
1816     * <pre>
1817     * [69] PEReference ::= '%' Name ';'
1818     * </pre>
1819     * <p>NOTE: the '%' has already been read.
1820     */
1821    private void parsePEReference ()
1822    throws SAXException, IOException
1823    {
1824    String name;
1825
1826    name = "%" + readNmtoken (true);
1827    require (';', "parameter entity reference");
1828    switch (getEntityType (name)) {
1829    case ENTITY_UNDECLARED:
1830        // this is a validity problem, not a WFC violation ... but
1831        // we should disable handling of all subsequent declarations
1832        // unless this is a standalone document
1833        // warn ("reference to undeclared parameter entity", name, null);
1834
1835        break;
1836    case ENTITY_INTERNAL:
1837        if (inLiteral)
1838        pushString (name, getEntityValue (name));
1839        else
1840        pushString (name, ' ' + getEntityValue (name) + ' ');
1841        break;
1842    case ENTITY_TEXT:
1843        if (!inLiteral)
1844            pushString (null, " ");
1845        pushURL (name, getEntityPublicId (name),
1846             getEntitySystemId (name),
1847             null, null, null, true);
1848        if (!inLiteral)
1849            pushString (null, " ");
1850        break;
1851    }
1852    }
1853
1854    /**
1855     * Parse an entity declaration.
1856     * <pre>
1857     * [70] EntityDecl ::= GEDecl | PEDecl
1858     * [71] GEDecl ::= '&lt;!ENTITY' S Name S EntityDef S? '&gt;'
1859     * [72] PEDecl ::= '&lt;!ENTITY' S '%' S Name S PEDef S? '&gt;'
1860     * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
1861     * [74] PEDef ::= EntityValue | ExternalID
1862     * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1863     *         | 'PUBLIC' S PubidLiteral S SystemLiteral
1864     * [76] NDataDecl ::= S 'NDATA' S Name
1865     * </pre>
1866     * <p>NOTE: the '&lt;!ENTITY' has already been read.
1867     */
1868    private void parseEntityDecl ()
1869    throws Exception
1870    {
1871    boolean peFlag = false;
1872
1873    // Check for a parameter entity.
1874    expandPE = false;
1875    requireWhitespace ();
1876    if (tryRead ('%')) {
1877        peFlag = true;
1878        requireWhitespace ();
1879    }
1880    expandPE = true;
1881
1882    // Read the entity name, and prepend
1883    // '%' if necessary.
1884    String name = readNmtoken (true);
1885    if (peFlag) {
1886        name = "%" + name;
1887    }
1888
1889    // Read the entity value.
1890    requireWhitespace ();
1891    char c = readCh ();
1892    unread (c);
1893    if (c == '"' || c == '\'') {
1894        // Internal entity ... replacement text has expanded refs
1895        // to characters and PEs, but not to general entities
1896        String value = readLiteral (0);
1897        setInternalEntity (name, value);
1898    } else {
1899        // Read the external IDs
1900        String[] ids = readExternalIds (false);
1901        if (ids [1] == null) {
1902            error ("system identifer missing", name, null);
1903        }
1904
1905        // Check for NDATA declaration.
1906        boolean white = tryWhitespace ();
1907        if (!peFlag && tryRead ("NDATA")) {
1908            if (!white)
1909                error ("whitespace required before NDATA");
1910            requireWhitespace ();
1911            String notationName = readNmtoken (true);
1912            setExternalDataEntity (name, ids [0], ids [1], notationName);
1913        } else {
1914            setExternalTextEntity (name, ids [0], ids [1]);
1915        }
1916    }
1917
1918    // Finish the declaration.
1919    skipWhitespace ();
1920    require ('>', "NDATA");
1921    }
1922
1923
1924    /**
1925     * Parse a notation declaration.
1926     * <pre>
1927     * [82] NotationDecl ::= '&lt;!NOTATION' S Name S
1928     *      (ExternalID | PublicID) S? '&gt;'
1929     * [83] PublicID ::= 'PUBLIC' S PubidLiteral
1930     * </pre>
1931     * <P>NOTE: the '&lt;!NOTATION' has already been read.
1932     */
1933    private void parseNotationDecl ()
1934    throws Exception
1935    {
1936    String nname, ids[];
1937
1938
1939    requireWhitespace ();
1940    nname = readNmtoken (true);
1941
1942    requireWhitespace ();
1943
1944    // Read the external identifiers.
1945    ids = readExternalIds (true);
1946    if (ids [0] == null && ids [1] == null) {
1947        error ("external identifer missing", nname, null);
1948    }
1949
1950    // Register the notation.
1951    setNotation (nname, ids [0], ids [1]);
1952
1953    skipWhitespace ();
1954    require ('>', "notation declaration");
1955    }
1956
1957
1958    /**
1959     * Parse character data into the data buffer, stopping before the
1960     * next '&lt;' or '&amp;'; a ']]&gt;' sequence is reported as an error.
1961     * <pre>[14] CharData ::= [^&lt;&amp;]* - ([^&lt;&amp;]* ']]&gt;' [^&lt;&amp;]*)</pre>
1962     * <p>The fast path scans the read buffer in place; if no terminator
1963     * is found there, the data is re-read one character at a time. */
1964    private void parseCharData ()
1965    throws Exception
1966    {
1967    char c;
1968
1969    // Start with a little cheat -- in most
1970    // cases, the entire sequence of
1971    // character data will already be in
1972    // the readBuffer; if not, fall through to
1973    // the normal approach.
1974    if (USE_CHEATS) {
1975        int lineAugment = 0;    // newlines seen so far in this scan
1976        int columnAugment = 0;  // characters counted since the last newline
1977
1978loop:
1979        for (int i = readBufferPos; i < readBufferLength; i++) {
1980
1981        switch (c = readBuffer [i]) {
1982        case '\n':
1983            lineAugment++;
1984            columnAugment = 0;
1985            break;
1986        case '&':   // either terminator ends the run of character data
1987        case '<':
1988            int start = readBufferPos;  // first character of the run
1989            columnAugment++;
1990            readBufferPos = i;          // the terminator itself stays unread
1991            if (lineAugment > 0) {
1992                line += lineAugment;
1993                column = columnAugment;
1994            } else {
1995                column += columnAugment;
1996            }
1997            dataBufferAppend (readBuffer, start, i - start);    // copy the whole run at once
1998            return;
1999        case ']':
2000            // XXX missing two end-of-buffer cases (this scan cannot look past readBufferLength)
2001            if ((i + 2) < readBufferLength) {
2002                if (readBuffer [i + 1] == ']'
2003                    && readBuffer [i + 2] == '>') {
2004                    error ("character data may not contain ']]>'");
2005                }
2006            }
2007            columnAugment++;
2008            break;
2009        default:
2010            if (c < 0x0020 || c > 0xFFFD)   // '\n', '\r' and '\t' have their own cases
2011            error ("illegal XML character U+"
2012                + Integer.toHexString (c));
2013            // FALLTHROUGH
2014        case '\r':
2015        case '\t':
2016            columnAugment++;
2017        }
2018        }
2019    }
2020
2021    // OK, the cheat didn't work; start over
2022    // and do it by the book.
2023    // (The scan above only updates parser state on the path that returns.)
2024    int closeSquareBracketCount = 0;    // consecutive ']' characters just read
2025    while (true) {
2026        c = readCh ();
2027        switch (c) {
2028        case '<':
2029        case '&':
2030            unread (c);     // push the terminator back for the caller
2031            return;
2032        case ']':
2033            closeSquareBracketCount++;
2034            dataBufferAppend(c);
2035            break;
2036        case '>':
2037            if (closeSquareBracketCount>=2) {
2038                // we've hit ']]>'
2039                error ("']]>' is not allowed here");
2040                break;
2041            }
2042            // fall-through: a '>' not preceded by "]]" is ordinary data
2043        default:
2044            closeSquareBracketCount=0;
2045            dataBufferAppend (c);
2046            break;
2047        }
2048    }
2049    }
2050
2051
2052    //////////////////////////////////////////////////////////////////////
2053    // High-level reading and scanning methods.
2054    //////////////////////////////////////////////////////////////////////
2055
2056    /**
2057     * Require whitespace characters.
2058     */
2059    private void requireWhitespace ()
2060    throws SAXException, IOException
2061    {
2062    char c = readCh ();
2063    if (isWhitespace (c)) {
2064        skipWhitespace ();
2065    } else {
2066        error ("whitespace required", c, null);
2067    }
2068    }
2069
2070
2071    /**
2072     * Copies a (possibly empty) run of whitespace into the data buffer and
2073     * pushes back the character that ended it.  No longer used - MHK.
2074     */
2075    private void parseWhitespace ()
2076    throws Exception
2077    {
2078        char next;
2079        for (next = readCh (); isWhitespace (next); next = readCh ()) {
2080            dataBufferAppend (next);
2081        }
2082        unread (next);
2083    }
2084
2085
2086    /**
2087     * Skip whitespace characters.
2088     * <pre>
2089     * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2090     * </pre>
2091     */
2092    private void skipWhitespace ()
2093    throws SAXException, IOException
2094    {
2095    // Start with a little cheat.  Most of
2096    // the time, the white space will fall
2097    // within the current read buffer; if
2098    // not, then fall through.
2099    if (USE_CHEATS) {
2100        int lineAugment = 0;
2101        int columnAugment = 0;
2102
2103loop:
2104        for (int i = readBufferPos; i < readBufferLength; i++) {
2105        switch (readBuffer [i]) {
2106        case ' ':
2107        case '\t':
2108        case '\r':
2109            columnAugment++;
2110            break;
2111        case '\n':
2112            lineAugment++;
2113            columnAugment = 0;
2114            break;
2115        case '%':
2116            if (expandPE)
2117            break loop;
2118            // else fall through...
2119        default:
2120            readBufferPos = i;
2121            if (lineAugment > 0) {
2122            line += lineAugment;
2123            column = columnAugment;
2124            } else {
2125            column += columnAugment;
2126            }
2127            return;
2128        }
2129        }
2130    }
2131
2132    // OK, do it by the book.
2133    char c = readCh ();
2134    while (isWhitespace (c)) {
2135        c = readCh ();
2136    }
2137    unread (c);
2138    }
2139
2140
2141    /**
2142     * Read a name or (when parsing an enumeration) name token.
2143     * <pre>[5] Name ::= (Letter | '_' | ':') (NameChar)*
2144     * [7] Nmtoken ::= (NameChar)+</pre>
2145     * @param isName true to also apply the name-start check to the first character
2146     * @return the token, as returned by <code>intern</code>
2147     */
2148    private String readNmtoken (boolean isName)
2149    throws SAXException, IOException
2150    {
2151    char c;
2152
2153    if (USE_CHEATS) {   // fast path: scan the token directly in readBuffer
2154loop:
2155        for (int i = readBufferPos; i < readBufferLength; i++) {
2156        c = readBuffer [i];
2157        switch (c) {
2158          case '%':
2159            if (expandPE)
2160            break loop;     // leave the fast path; the loop further down starts over
2161            // else fall through...
2162
2163            // What may legitimately come AFTER a name/nmtoken?
2164          case '<': case '>': case '&':
2165          case ',': case '|': case '*': case '+': case '?':
2166          case ')':
2167          case '=':
2168          case '\'': case '"':
2169          case '[':
2170          case ' ': case '\t': case '\r': case '\n':
2171          case ';':
2172          case '/':
2173            int start = readBufferPos;
2174            if (i == start)     // delimiter at the very first position: empty token
2175            error ("name expected", readBuffer [i], null);
2176            readBufferPos = i;  // the delimiter itself is not consumed
2177            return intern (readBuffer, start, i - start);
2178
2179          default:
2180            // punt on exact tests from Appendix A; approximate
2181            // them using the Unicode ID start/part rules
2182            if (i == readBufferPos && isName) {
2183            if (!Character.isUnicodeIdentifierStart (c)
2184                && c != ':' && c != '_')
2185                error ("Not a name start character, U+"
2186                  + Integer.toHexString (c));
2187            } else if (!Character.isUnicodeIdentifierPart (c)
2188                && c != '-' && c != ':' && c != '_' && c != '.'
2189                && !isExtender (c))
2190            error ("Not a name character, U+"
2191                + Integer.toHexString (c));
2192        }
2193        }
2194    }
2195    // Slow path: readBufferPos was not moved above, so re-read via readCh ().
2196    nameBufferPos = 0;
2197
2198    // Accumulate characters in nameBuffer until a delimiter is read.
2199loop:
2200    while (true) {
2201        c = readCh ();
2202        switch (c) {
2203        case '%':
2204        case '<': case '>': case '&':
2205        case ',': case '|': case '*': case '+': case '?':
2206        case ')':
2207        case '=':
2208        case '\'': case '"':
2209        case '[':
2210        case ' ': case '\t': case '\n': case '\r':
2211        case ';':
2212        case '/':
2213        unread (c);     // the delimiter stays in the input
2214        if (nameBufferPos == 0) {
2215            error ("name expected");
2216        }
2217        // punt on exact tests from Appendix A, but approximate them
2218        if (isName
2219            && !Character.isUnicodeIdentifierStart (
2220                nameBuffer [0])
2221            && ":_".indexOf (nameBuffer [0]) == -1)
2222            error ("Not a name start character, U+"
2223                  + Integer.toHexString (nameBuffer [0]));
2224        String s = intern (nameBuffer, 0, nameBufferPos);
2225        nameBufferPos = 0;
2226        return s;
2227        default:
2228        // punt on exact tests from Appendix A, but approximate them
2229        // (the first character of a Name is checked at the delimiter instead)
2230        if ((nameBufferPos != 0 || !isName)
2231            && !Character.isUnicodeIdentifierPart (c)
2232            && ":-_.".indexOf (c) == -1
2233            && !isExtender (c))
2234            error ("Not a name character, U+"
2235                + Integer.toHexString (c));
2236        if (nameBufferPos >= nameBuffer.length)     // name buffer full: grow it
2237            nameBuffer =
2238            (char[]) extendArray (nameBuffer,
2239                    nameBuffer.length, nameBufferPos);
2240        nameBuffer [nameBufferPos++] = c;
2241        }
2242    }
2243    }
2244
2245    private static boolean isExtender (char c)
2246    {
2247        // [88] Extender: the three contiguous ranges first, then the isolated code points.
2248        if ((c >= 0x3031 && c <= 0x3035) || (c >= 0x309d && c <= 0x309e)
2249            || (c >= 0x30fc && c <= 0x30fe))
2250            return true;
2251        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
2252            || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005;
2253    }
2254
2255
2256    /**
2257     * Read a literal.  With matching single or double quotes as
2258     * delimiters (and not embedded!) this is used to parse:
2259     * <pre>
2260     *  [9] EntityValue ::= ... ([^%&amp;] | PEReference | Reference)* ...
2261     *  [10] AttValue ::= ... ([^&lt;&amp;] | Reference)* ...
2262     *  [11] SystemLiteral ::= ... (URLchar - "'")* ...
2263     *  [12] PubidLiteral ::= ... (PubidChar - "'")* ...
2264     * </pre>
2265     * as well as the quoted strings in XML and text declarations
2266     * (for version, encoding, and standalone) which have their own constraints.
2267     * @param flags bitwise OR of the LIT_* constants selecting normalization and reference handling
2268     * @return the literal's value, taken from the data buffer; null if no opening delimiter was found */
2269    private String readLiteral (int flags)
2270    throws SAXException, IOException
2271    {
2272    char    delim, c;
2273    int startLine = line;       // quoted in the end-of-input message
2274    boolean saved = expandPE;   // restored once the literal has been read
2275
2276    // Find the first delimiter.
2277    delim = readCh ();
2278    if (delim != '"' && delim != '\'' && delim != (char) 0) {   // NOTE(review): U+0000 is accepted too; the reason is not visible here
2279        error ("expected '\"' or \"'\"", delim, null);
2280        return null;
2281    }
2282    inLiteral = true;
2283    if ((flags & LIT_DISABLE_PE) != 0)
2284        expandPE = false;
2285
2286    // Each level of input source has its own buffer; remember
2287    // ours, so we won't read the ending delimiter from any
2288    // other input source, regardless of entity processing.
2289    char ourBuf [] = readBuffer;
2290
2291    // Read the literal.
2292    try {
2293        c = readCh ();
2294loop:
2295        while (! (c == delim && readBuffer == ourBuf)) {    // stop only at a delimiter read from our own buffer
2296        switch (c) {
2297            // attributes and public ids are normalized
2298            // in almost the same ways
2299        case '\n':
2300        case '\r':
2301            if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0)
2302            c = ' ';
2303            break;
2304        case '\t':
2305            if ((flags & LIT_ATTRIBUTE) != 0)
2306            c = ' ';
2307            break;
2308        case '&':
2309            c = readCh ();
2310            // Char refs are expanded immediately, except for
2311            // all the cases where it's deferred.
2312            if (c == '#') {
2313            if ((flags & LIT_DISABLE_CREF) != 0) {
2314                dataBufferAppend ('&');
2315                continue;   // c is still '#', so the next pass appends it as plain data
2316            }
2317            parseCharRef ();
2318
2319            // It looks like an entity ref ...
2320            } else {
2321            unread (c);
2322            // Expand it?
2323            if ((flags & LIT_ENTITY_REF) > 0) {
2324                parseEntityRef (false);
2325
2326            // Is it just data?
2327            } else if ((flags & LIT_DISABLE_EREF) != 0) {
2328                dataBufferAppend ('&');
2329
2330            // OK, it will be an entity ref -- expanded later.
2331            } else {
2332                String name = readNmtoken (true);
2333                require (';', "entity reference");
2334                if ((flags & LIT_ENTITY_CHECK) != 0
2335                    && getEntityType (name) ==
2336                        ENTITY_UNDECLARED) {
2337                            // Possibly a validity error, shouldn't report it?
2338                            error ("General entity '" + name
2339                                + "' must be declared before use");
2340                }
2341                dataBufferAppend ('&');     // store the reference text unexpanded
2342                dataBufferAppend (name);
2343                dataBufferAppend (';');
2344            }
2345            }
2346            c = readCh ();
2347            continue loop;  // the reference is dealt with; skip the append after the switch
2348
2349        case '<':
2350            // and why?  Perhaps so "&foo;" expands the same
2351            // inside and outside an attribute?
2352            if ((flags & LIT_ATTRIBUTE) != 0)
2353            error ("attribute values may not contain '<'");
2354            break;
2355
2356        // We don't worry about case '%' and PE refs, readCh does.
2357
2358        default:
2359            break;
2360        }
2361        dataBufferAppend (c);   // c may have been replaced by ' ' above
2362        c = readCh ();
2363        }
2364    } catch (EOFException e) {
2365        error ("end of input while looking for delimiter (started on line "
2366           + startLine + ')', null, new Character (delim).toString ());
2367    }
2368    inLiteral = false;  // not reached when any other exception leaves the try block
2369    expandPE = saved;
2370
2371    // Normalise whitespace if necessary.
2372    if ((flags & LIT_NORMALIZE) > 0) {
2373        dataBufferNormalize ();
2374    }
2375
2376    // Return the value.
2377    return dataBufferToString ();
2378    }
2379
2380
2381    /**
2382     * Try reading external identifiers.
2383     * A system identifier is not required for notations.
2384     * @param inNotation Are we in a notation?
2385     * @return A two-member String array containing the identifiers.
2386     */
2387    private String[] readExternalIds (boolean inNotation)
2388    throws Exception
2389    {
2390    char    c;
2391    String  ids[] = new String [2];
2392    int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
2393
2394    if (tryRead ("PUBLIC")) {
2395        requireWhitespace ();
2396        ids [0] = readLiteral (LIT_NORMALIZE | LIT_PUBID | flags);
2397        if (inNotation) {
2398        skipWhitespace ();
2399        c = readCh ();
2400        unread (c);
2401        if (c == '"' || c == '\'') {
2402            ids [1] = readLiteral (flags);
2403        }
2404        } else {
2405        requireWhitespace ();
2406        ids [1] = readLiteral (flags);
2407        }
2408
2409        for (int i = 0; i < ids [0].length (); i++) {
2410        c = ids [0].charAt (i);
2411        if (c >= 'a' && c <= 'z')
2412            continue;
2413        if (c >= 'A' && c <= 'Z')
2414            continue;
2415        if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf (c) != -1)
2416            continue;
2417        error ("illegal PUBLIC id character U+"
2418            + Integer.toHexString (c));
2419        }
2420    } else if (tryRead ("SYSTEM")) {
2421        requireWhitespace ();
2422        ids [1] = readLiteral (flags);
2423    }
2424
2425    // XXX should normalize system IDs as follows:
2426    // - Convert to UTF-8
2427    // - Map reserved and non-ASCII characters to %HH
2428
2429    return ids;
2430    }
2431
2432
2433    /**
2434     * Test if a character is whitespace.
2435     * <pre>
2436     * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2437     * </pre>
2438     * @param c The character to test.
2439     * @return true if the character is whitespace.
2440     */
2441    private final boolean isWhitespace (char c)
2442    {
2443    if (c > 0x20)
2444        return false;
2445    if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d)
2446        return true;
2447    return false;   // illegal ...
2448    }
2449
2450
2451    //////////////////////////////////////////////////////////////////////
2452    // Utility routines.
2453    //////////////////////////////////////////////////////////////////////
2454
2455
2456    /**
2457     * Add a character to the data buffer.
2458     */
2459    private void dataBufferAppend (char c)
2460    {
2461    // Expand buffer if necessary.
2462    if (dataBufferPos >= dataBuffer.length)
2463        dataBuffer =
2464        (char[]) extendArray (dataBuffer,
2465            dataBuffer.length, dataBufferPos);
2466    dataBuffer [dataBufferPos++] = c;
2467    }
2468
2469
2470    /** Appends every character of a string to the data buffer
2471     *  by delegating to the char-array overload. */
2472    private void dataBufferAppend (String s)
2473    {
2474        final char chars [] = s.toCharArray ();
2475        dataBufferAppend (chars, 0, chars.length);
2476    }
2477
2478
2479    /**
2480     * Append (part of) a character array to the data buffer.
2481     */
2482    private void dataBufferAppend (char ch[], int start, int length)
2483    {
2484    dataBuffer = (char[])
2485        extendArray (dataBuffer, dataBuffer.length,
2486                    dataBufferPos + length);
2487
2488    System.arraycopy (ch, start, dataBuffer, dataBufferPos, length);
2489    dataBufferPos += length;
2490    }
2491
2492
2493    /**
2494     * Normalise spaces in the data buffer.
2495     */
2496    private void dataBufferNormalize ()
2497    {
2498    int i = 0;
2499    int j = 0;
2500    int end = dataBufferPos;
2501
2502    // Skip spaces at the start.
2503    while (j < end && dataBuffer [j] == ' ') {
2504        j++;
2505    }
2506
2507    // Skip whitespace at the end.
2508    while (end > j && dataBuffer [end - 1] == ' ') {
2509        end --;
2510    }
2511
2512    // Start copying to the left.
2513    while (j < end) {
2514
2515        char c = dataBuffer [j++];
2516
2517        // Normalise all other whitespace to
2518        // a single space.
2519        if (c == ' ') {
2520        while (j < end && dataBuffer [j++] == ' ') {}
2521
2522        dataBuffer [i++] = ' ';
2523        dataBuffer [i++] = dataBuffer [j - 1];
2524        } else {
2525        dataBuffer [i++] = c;
2526        }
2527    }
2528
2529    // The new length is <= the old one.
2530    dataBufferPos = i;
2531    }
2532
2533
2534    /**
2535     * Convert the data buffer to a string.
2536     */
2537    private String dataBufferToString ()
2538    {
2539    String s = new String (dataBuffer, 0, dataBufferPos);
2540    dataBufferPos = 0;
2541    return s;
2542    }
2543
2544
2545    /**
2546     * Hands any buffered character data to the handler and empties the
2547     * buffer.  Inside element-only content (and outside CDATA sections)
2548     * an all-whitespace buffer is reported as ignorable whitespace;
2549     * everything else is reported as ordinary character data.
2550     */
2551    private void dataBufferFlush ()
2552    throws SAXException
2553    {
2554        if (dataBufferPos <= 0) {
2555            return;             // nothing buffered
2556        }
2557
2558        boolean ignorable =
2559            currentElementContent == CONTENT_ELEMENTS && !inCDATA;
2560
2561        // Element-only content is not guaranteed to hold only
2562        // whitespace, so look at every character before deciding.
2563        for (int k = 0; ignorable && k < dataBufferPos; k++) {
2564            ignorable = isWhitespace (dataBuffer [k]);
2565        }
2566
2567        if (ignorable)
2568            handler.ignorableWhitespace (dataBuffer, 0, dataBufferPos);
2569        else
2570            handler.charData (dataBuffer, 0, dataBufferPos);
2571        dataBufferPos = 0;
2572    }
2573
2574
2575    /**
2576     * Require a string to appear, or throw an exception.
2577     * <p><em>Precondition:</em> Entity expansion is not required.
2578     * <p><em>Precondition:</em> data buffer has no characters that
2579     * will get sent to the application.
2580     */
2581    private void require (String delim, String context)
2582    throws SAXException, IOException
2583    {
2584    int length = delim.length ();
2585    char    ch [];
2586        
2587    if (length < dataBuffer.length) {
2588        ch = dataBuffer;
2589        delim.getChars (0, length, ch, 0);
2590    } else
2591        ch = delim.toCharArray ();
2592
2593    if (USE_CHEATS
2594        && length <= (readBufferLength - readBufferPos)) {
2595        int offset = readBufferPos;
2596
2597        for (int i = 0; i < length; i++, offset++)
2598        if (ch [i] != readBuffer [offset])
2599            error ("unexpected characters in " + context, null, delim);
2600        readBufferPos = offset;
2601        
2602    } else {
2603        for (int i = 0; i < length; i++)
2604        require (ch [i], delim);
2605    }
2606    }
2607
2608
2609    /**
2610     * Require a character to appear, or throw an exception.
2611     */
2612    private void require (char delim, String after)
2613    throws SAXException, IOException
2614    {
2615    char c = readCh ();
2616
2617    if (c != delim) {
2618        error ("unexpected character after " + after, c, delim+"");
2619    }
2620    }
2621
2622
2623    /**
2624     * Create an interned string from a character array.
2625     * &AElig;lfred uses this method to create an interned version
2626     * of all names and name tokens, so that it can test equality
2627     * with <code>==</code> instead of <code>String.equals ()</code>.
2628     *
2629     * <p>This is much more efficient than constructing a non-interned
2630     * string first, and then interning it.
2631     *
2632     * @param ch an array of characters for building the string.
2633     * @param start the starting position in the array.
2634     * @param length the number of characters to place in the string.
2635     * @return an interned string.
2636     * @see #intern (String)
2637     * @see java.lang.String#intern
2638     */
2639    public String intern (char ch[], int start, int length)
2640    {
2641    int index = 0;
2642    int hash = 0;
2643    Object  bucket [];
2644
2645    // Generate a hash code.
2646    for (int i = start; i < start + length; i++)
2647        hash = 31 * hash + ch [i];
2648    hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH;
2649
2650    // Get the bucket -- consists of {array,String} pairs
2651    if ((bucket = symbolTable [hash]) == null) {
2652        // first string in this bucket
2653        bucket = new Object [8];
2654
2655    // Search for a matching tuple, and
2656    // return the string if we find one.
2657    } else {
2658        while (index < bucket.length) {
2659        char chFound [] = (char []) bucket [index];
2660
2661        // Stop when we hit a null index.
2662        if (chFound == null)
2663            break;
2664
2665        // If they're the same length, check for a match.
2666        if (chFound.length == length) {
2667            for (int i = 0; i < chFound.length; i++) {
2668            // continue search on failure
2669            if (ch [start + i] != chFound [i]) {
2670                break;
2671            } else if (i == length - 1) {
2672                // That's it, we have a match!
2673                return (String) bucket [index + 1];
2674            }
2675            }
2676        }
2677        index += 2;
2678        }
2679        // Not found -- we'll have to add it.
2680
2681        // Do we have to grow the bucket?
2682        bucket = (Object []) extendArray (bucket, bucket.length, index);
2683    }
2684    symbolTable [hash] = bucket;
2685
2686    // OK, add it to the end of the bucket -- "local" interning.
2687    // Intern "globally" to let applications share interning benefits.
2688    String s = new String (ch, start, length).intern ();
2689    bucket [index] = s.toCharArray ();
2690    bucket [index + 1] = s;
2691    return s;
2692    }
2693
2694
2695    /**
2696     * Ensure the capacity of an array, allocating a new one if
2697     * necessary.  Usually called only a handful of times.
2698     */
2699    private Object extendArray (Object array, int currentSize, int requiredSize)
2700    {
2701    if (requiredSize < currentSize) {
2702        return array;
2703    } else {
2704        Object newArray = null;
2705        int newSize = currentSize * 2;
2706
2707        if (newSize <= requiredSize)
2708        newSize = requiredSize + 1;
2709
2710        if (array instanceof char[])
2711        newArray = new char [newSize];
2712        else if (array instanceof Object[])
2713        newArray = new Object [newSize];
2714        else
2715        throw new RuntimeException ();
2716
2717        System.arraycopy (array, 0, newArray, 0, currentSize);
2718        return newArray;
2719    }
2720    }
2721
2722
2723    //////////////////////////////////////////////////////////////////////
2724    // XML query routines.
2725    //////////////////////////////////////////////////////////////////////
2726
2727
2728    //
2729    // Elements
2730    //
2731
2732    /**
2733     * Get the declared elements for an XML document.
2734     * <p>The results will be valid only after the DTD (if any) has been
2735     * parsed.
2736     * @return An enumeration of all element types declared for this
2737     *   document (as Strings).
2738     * @see #getElementContentType
2739     * @see #getElementContentModel
2740     */
2741    public Enumeration declaredElements ()
2742    {
2743    return elementInfo.keys ();
2744    }
2745
2746
2747    /**
2748     * Look up the content type of an element.
2749     * @param element element info vector
2750     * @param defaultType value for null vector
2751     * @return An integer constant representing the content type.
2752     * @see #CONTENT_UNDECLARED
2753     * @see #CONTENT_ANY
2754     * @see #CONTENT_EMPTY
2755     * @see #CONTENT_MIXED
2756     * @see #CONTENT_ELEMENTS
2757     */
2758    private int getContentType (Object element [], int defaultType)
2759    {
2760    int retval;
2761
2762    if (element == null)
2763        return defaultType;
2764    retval = ((Integer) element [0]).intValue ();
2765    if (retval == CONTENT_UNDECLARED)
2766        retval = defaultType;
2767    return retval;
2768    }
2769
2770
2771    /**
2772     * Look up the content type of an element.
2773     * @param name The element type name.
2774     * @return An integer constant representing the content type.
2775     * @see #getElementContentModel
2776     * @see #CONTENT_UNDECLARED
2777     * @see #CONTENT_ANY
2778     * @see #CONTENT_EMPTY
2779     * @see #CONTENT_MIXED
2780     * @see #CONTENT_ELEMENTS
2781     */
2782    public int getElementContentType (String name)
2783    {
2784    Object element [] = (Object []) elementInfo.get (name);
2785    return getContentType (element, CONTENT_UNDECLARED);
2786    }
2787
2788
2789    /**
2790     * Look up the content model of an element.
2791     * <p>The result will always be null unless the content type is
2792     * CONTENT_ELEMENTS or CONTENT_MIXED.
2793     * @param name The element type name.
2794     * @return The normalised content model, as a string.
2795     * @see #getElementContentType
2796     */
2797    public String getElementContentModel (String name)
2798    {
2799    Object element[] = (Object[]) elementInfo.get (name);
2800    if (element == null) {
2801        return null;
2802    } else {
2803        return (String) element [1];
2804    }
2805    }
2806
2807
2808    /**
2809     * Records (or updates) the information kept for an element type.
2810     * Record layout:
2811     * <pre>
2812     *  [0] content type, as an Integer
2813     *  [1] content model (mixed, elements only)
2814     *  [2] attribute hash table
2815     * </pre>
2816     * @param name the element type name (the table key)
2817     * @param contentType content type, or CONTENT_UNDECLARED for an ATTLIST
2818     * @param contentModel content model string to store in slot 1
2819     * @param attributes attribute table; may be null
2820     */
2821    private void setElement (String name, int contentType,
2822              String contentModel, Hashtable attributes)
2823    throws Exception
2824    {
2825        final Object known [] = (Object []) elementInfo.get (name);
2826
2827        if (known == null) {
2828            // First <!ELEMENT ...> or <!ATTLIST ...> seen for this
2829            // type: create its record from scratch.
2830            final Object fresh [] = {
2831                new Integer (contentType), contentModel, attributes
2832            };
2833            elementInfo.put (name, fresh);
2834        } else if (contentType == CONTENT_UNDECLARED) {
2835            // Not an <!ELEMENT ...> declaration: attach the attribute
2836            // table, if one was supplied.
2837            if (attributes != null)
2838                known [2] = attributes;
2839        } else if (((Integer) known [0]).intValue () == CONTENT_UNDECLARED) {
2840            // An <!ELEMENT ...> following an associated <!ATTLIST ...>:
2841            // fill in the content information.
2842            known [0] = new Integer (contentType);
2843            known [1] = contentModel;
2844        } else {
2845            // VC: Unique Element Type Declaration
2846            //verror ("multiple declarations for element type: " + name);
2847        }
2848    }
2849
2850
2851    /**
2852     * Fetches the attribute hash table of the named element type, or
2853     * null if the type is unknown.  The table lives in slot [2].
2854     */
2855    private Hashtable getElementAttributes (String name)
2856    {
2857        final Object info [] = (Object[]) elementInfo.get (name);
2858
2859        if (info != null) {
2860            return (Hashtable) info [2];
2861        }
2862        return null;
2863    }
2864
2865
2866
2867    //
2868    // Attributes
2869    //
2870
2871    /**
2872     * Get the declared attributes for an element type.
2873     * @param elname The name of the element type.
2874     * @return An Enumeration of all the attributes declared for
2875     *   a specific element type.  The results will be valid only
2876     *   after the DTD (if any) has been parsed.
2877     * @see #getAttributeType
2878     * @see #getAttributeEnumeration
2879     * @see #getAttributeDefaultValueType
2880     * @see #getAttributeDefaultValue
2881     * @see #getAttributeExpandedValue
2882     */
2883    private Enumeration declaredAttributes (Object element [])
2884    {
2885    Hashtable attlist;
2886
2887    if (element == null)
2888        return null;
2889    if ((attlist = (Hashtable) element [2]) == null)
2890        return null;
2891    return attlist.keys ();
2892    }
2893
2894    /**
2895     * Get the declared attributes for an element type.
2896     * @param elname The name of the element type.
2897     * @return An Enumeration of all the attributes declared for
2898     *   a specific element type.  The results will be valid only
2899     *   after the DTD (if any) has been parsed.
2900     * @see #getAttributeType
2901     * @see #getAttributeEnumeration
2902     * @see #getAttributeDefaultValueType
2903     * @see #getAttributeDefaultValue
2904     * @see #getAttributeExpandedValue
2905     */
2906    public Enumeration declaredAttributes (String elname)
2907    {
2908    return declaredAttributes ((Object []) elementInfo.get (elname));
2909    }
2910
2911
2912    /**
2913     * Retrieve the declared type of an attribute.
2914     * @param name The name of the associated element.
2915     * @param aname The name of the attribute.
2916     * @return An integer constant representing the attribute type.
2917     * @see #ATTRIBUTE_UNDECLARED
2918     * @see #ATTRIBUTE_CDATA
2919     * @see #ATTRIBUTE_ID
2920     * @see #ATTRIBUTE_IDREF
2921     * @see #ATTRIBUTE_IDREFS
2922     * @see #ATTRIBUTE_ENTITY
2923     * @see #ATTRIBUTE_ENTITIES
2924     * @see #ATTRIBUTE_NMTOKEN
2925     * @see #ATTRIBUTE_NMTOKENS
2926     * @see #ATTRIBUTE_ENUMERATED
2927     * @see #ATTRIBUTE_NOTATION
2928     */
2929    public int getAttributeType (String name, String aname)
2930    {
2931    Object attribute[] = getAttribute (name, aname);
2932    if (attribute == null) {
2933        return ATTRIBUTE_UNDECLARED;
2934    } else {
2935        return ((Integer) attribute [0]).intValue ();
2936    }
2937    }
2938
2939
2940    /**
2941     * Retrieve the allowed values for an enumerated attribute type.
2942     * @param name The name of the associated element.
2943     * @param aname The name of the attribute.
2944     * @return A string containing the token list.
2945     * @see #ATTRIBUTE_ENUMERATED
2946     * @see #ATTRIBUTE_NOTATION
2947     */
2948    public String getAttributeEnumeration (String name, String aname)
2949    {
2950    Object attribute[] = getAttribute (name, aname);
2951    if (attribute == null) {
2952        return null;
2953    } else {
2954        return (String) attribute [3];
2955    }
2956    }
2957
2958
2959    /**
2960     * Retrieve the default value of a declared attribute.
2961     * @param name The name of the associated element.
2962     * @param aname The name of the attribute.
2963     * @return The default value, or null if the attribute was
2964     *   #IMPLIED or simply undeclared and unspecified.
2965     * @see #getAttributeExpandedValue
2966     */
2967    public String getAttributeDefaultValue (String name, String aname)
2968    {
2969    Object attribute[] = getAttribute (name, aname);
2970    if (attribute == null) {
2971        return null;
2972    } else {
2973        return (String) attribute [1];
2974    }
2975    }
2976
2977
2978    /**
2979     * Retrieve the expanded value of a declared attribute.
2980     * <p>General entities (and char refs) will be expanded (once).
2981     * @param name The name of the associated element.
2982     * @param aname The name of the attribute.
2983     * @return The expanded default value, or null if the attribute was
2984     *   #IMPLIED or simply undeclared
2985     * @see #getAttributeDefaultValue
2986     */
2987    public String getAttributeExpandedValue (String name, String aname)
2988    throws Exception
2989    {
2990    Object attribute[] = getAttribute (name, aname);
2991
2992    if (attribute == null) {
2993        return null;
2994    } else if (attribute [4] == null && attribute [1] != null) {
2995        // we MUST use the same buf for both quotes else the literal
2996        // can't be properly terminated
2997        char buf [] = new char [1];
2998        int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE;
2999        int type = getAttributeType (name, aname);
3000
3001        if (type != ATTRIBUTE_CDATA && type != ATTRIBUTE_UNDECLARED)
3002        flags |= LIT_NORMALIZE;
3003        buf [0] = '"';
3004        pushCharArray (null, buf, 0, 1);
3005        pushString (null, (String) attribute [1]);
3006        pushCharArray (null, buf, 0, 1);
3007        attribute [4] = readLiteral (flags);
3008    }
3009    return (String) attribute [4];
3010    }
3011
3012
3013    /**
3014     * Retrieve the default value type of a declared attribute.
3015     * @see #ATTRIBUTE_DEFAULT_SPECIFIED
3016     * @see #ATTRIBUTE_DEFAULT_IMPLIED
3017     * @see #ATTRIBUTE_DEFAULT_REQUIRED
3018     * @see #ATTRIBUTE_DEFAULT_FIXED
3019     */
3020    public int getAttributeDefaultValueType (String name, String aname)
3021    {
3022    Object attribute[] = getAttribute (name, aname);
3023    if (attribute == null) {
3024        return ATTRIBUTE_DEFAULT_UNDECLARED;
3025    } else {
3026        return ((Integer) attribute [2]).intValue ();
3027    }
3028    }
3029
3030
3031    /**
3032     * Register an attribute declaration for later retrieval.
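3033     * Stored as a five-slot array:
3034     * - [0] Integer type, [1] String default value,
3035     * - [2] Integer value type, [3] String enumeration,
3036     * - [4] expanded default value (null until first requested)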
3037     */
3038    private void setAttribute (String elName, String name, int type,
3039            String enumeration,
3040            String value, int valueType)
3041    throws Exception
3042    {
3043    Hashtable attlist;
3044
3045    // Create a new hashtable if necessary.
3046    attlist = getElementAttributes (elName);
3047    if (attlist == null) {
3048        attlist = new Hashtable ();
3049    }
3050
3051    // ignore multiple attribute declarations!
3052    if (attlist.get (name) != null) {
3053        // warn ...
3054        return;
3055    } else {
3056        Object[] attribute = new Object [5];
3057        attribute [0] = new Integer (type);
3058        attribute [1] = value;
3059        attribute [2] = new Integer (valueType);
3060        attribute [3] = enumeration;
3061        attribute [4] = null;
3062        attlist.put (name, attribute);
3063
3064        // save; but don't overwrite any existing <!ELEMENT ...>
3065        setElement (elName, CONTENT_UNDECLARED, null, attlist);
3066    }
3067    }
3068
3069
3070    /**
3071     * Retrieve the five-member array representing an
3072     * attribute declaration.
3073     */
3074    private Object[] getAttribute (String elName, String name)
3075    {
3076        Hashtable attlist = getElementAttributes (elName);
3077        if (attlist == null) {
3078            return null;
3079        }
3080
3081        return (Object[]) attlist.get (name);
3082    }
3083
3084
3085    //
3086    // Entities
3087    //
3088
3089    /**
3090     * Get declared entities.
3091     * @return An Enumeration of all the entities declared for
3092     *   this XML document.  The results will be valid only
3093     *   after the DTD (if any) has been parsed.
3094     * @see #getEntityType
3095     * @see #getEntityPublicId
3096     * @see #getEntitySystemId
3097     * @see #getEntityValue
3098     * @see #getEntityNotationName
3099     */
3100    public Enumeration declaredEntities ()
3101    {
3102    return entityInfo.keys ();
3103    }
3104
3105
3106    /**
3107     * Find the type of an entity.
3108     * @return An integer constant representing the entity type.
3109     * @see #ENTITY_UNDECLARED
3110     * @see #ENTITY_INTERNAL
3111     * @see #ENTITY_NDATA
3112     * @see #ENTITY_TEXT
3113     */
3114    public int getEntityType (String ename)
3115    {
3116    Object entity[] = (Object[]) entityInfo.get (ename);
3117    if (entity == null) {
3118        return ENTITY_UNDECLARED;
3119    } else {
3120        return ((Integer) entity [0]).intValue ();
3121    }
3122    }
3123
3124
3125    /**
3126     * Return an external entity's public identifier, if any.
3127     * @param ename The name of the external entity.
3128     * @return The entity's public identifier, or null if the
3129     *   entity was not declared, if it is not an
3130     *   external entity, or if no public identifier was
3131     *   provided.
3132     * @see #getEntityType
3133     */
3134    public String getEntityPublicId (String ename)
3135    {
3136    Object entity[] = (Object[]) entityInfo.get (ename);
3137    if (entity == null) {
3138        return null;
3139    } else {
3140        return (String) entity [1];
3141    }
3142    }
3143
3144
3145    /**
3146     * Return an external entity's system identifier.
3147     * @param ename The name of the external entity.
3148     * @return The entity's system identifier, or null if the
3149     *   entity was not declared, or if it is not an
3150     *   external entity. Change made by MHK: The system identifier
3151     *   is returned as an absolute URL, resolved relative to the entity
3152     *   it was contained in.
3153     * @see #getEntityType
3154     */
3155    public String getEntitySystemId (String ename) 
3156    {
3157        Object entity[] = (Object[]) entityInfo.get (ename);
3158        if (entity == null) {
3159            return null;
3160        } else {
3161            try {
3162                String relativeURI = (String)entity [2];
3163                URL baseURI = (URL)entity [5];
3164                if (baseURI==null) return relativeURI;
3165                URL absoluteURI = new URL( baseURI, relativeURI );
3166                return absoluteURI.toString();
3167            } catch (IOException err) {
3168                // ignore the exception, a user entity resolver may be able
3169                // to do something; if not, the error will be caught later
3170                return (String)entity [2];
3171            }
3172        }
3173    }
3174
3175
3176    /**
3177     * Return the value of an internal entity.
3178     * @param ename The name of the internal entity.
3179     * @return The entity's value, or null if the entity was
3180     *   not declared, or if it is not an internal entity.
3181     * @see #getEntityType
3182     */
3183    public String getEntityValue (String ename)
3184    {
3185    Object entity[] = (Object[]) entityInfo.get (ename);
3186    if (entity == null) {
3187        return null;
3188    } else {
3189        return (String) entity [3];
3190    }
3191    }
3192
3193
3194    /**
3195     * Get the notation name associated with an NDATA entity.
3196     * @param eName The NDATA entity name.
3197     * @return The associated notation name, or null if the
3198     *   entity was not declared, or if it is not an
3199     *   NDATA entity.
3200     * @see #getEntityType
3201     */
3202    public String getEntityNotationName (String eName)
3203    {
3204    Object entity[] = (Object[]) entityInfo.get (eName);
3205    if (entity == null) {
3206        return null;
3207    } else {
3208        return (String) entity [4];
3209    }
3210    }
3211
3212
3213    /**
3214     * Register an entity declaration for later retrieval.
3215     */
3216    private void setInternalEntity (String eName, String value)
3217    {
3218    setEntity (eName, ENTITY_INTERNAL, null, null, value, null);
3219    }
3220
3221
3222    /**
3223     * Register an external data entity.
3224     */
3225    private void setExternalDataEntity (String eName, String pubid,
3226                 String sysid, String nName)
3227    {
3228    setEntity (eName, ENTITY_NDATA, pubid, sysid, null, nName);
3229    }
3230
3231
3232    /**
3233     * Register an external text entity.
3234     */
3235    private void setExternalTextEntity (String eName,
3236            String pubid, String sysid)
3237    {
3238    setEntity (eName, ENTITY_TEXT, pubid, sysid, null, null);
3239    }
3240
3241
3242    /**
3243     * Register an entity declaration for later retrieval.
3244     */
3245    private void setEntity (String eName, int eClass,
3246             String pubid, String sysid,
3247             String value, String nName)
3248    {
3249    Object entity[];
3250
3251    if (entityInfo.get (eName) == null) {
3252        entity = new Object [6];
3253        entity [0] = new Integer (eClass);
3254        entity [1] = pubid;
3255        entity [2] = sysid;
3256        entity [3] = value;
3257        entity [4] = nName;
3258        entity [5] = (externalEntity == null ? null : externalEntity.getURL());    
3259                        // added MHK: provides base URI for resolution
3260
3261        entityInfo.put (eName, entity);
3262    }
3263    }
3264
3265
3266    //
3267    // Notations.
3268    //
3269
3270    /**
3271     * Get declared notations.
3272     * @return An Enumeration of all the notations declared for
3273     *   this XML document.  The results will be valid only
3274     *   after the DTD (if any) has been parsed.
3275     * @see #getNotationPublicId
3276     * @see #getNotationSystemId
3277     */
3278    public Enumeration declaredNotations ()
3279    {
3280    return notationInfo.keys ();
3281    }
3282
3283
3284    /**
3285     * Look up the public identifier for a notation.
3286     * You will normally use this method to look up a notation
3287     * that was provided as an attribute value or for an NDATA entity.
3288     * @param nname The name of the notation.
3289     * @return A string containing the public identifier, or null
3290     *   if none was provided or if no such notation was
3291     *   declared.
3292     * @see #getNotationSystemId
3293     */
3294    public String getNotationPublicId (String nname)
3295    {
3296    Object notation[] = (Object[]) notationInfo.get (nname);
3297    if (notation == null) {
3298        return null;
3299    } else {
3300        return (String) notation [0];
3301    }
3302    }
3303
3304
3305    /**
3306     * Look up the system identifier for a notation.
3307     * You will normally use this method to look up a notation
3308     * that was provided as an attribute value or for an NDATA entity.
3309     * @param nname The name of the notation.
3310     * @return A string containing the system identifier, or null
3311     *   if no such notation was declared.
3312     * @see #getNotationPublicId
3313     */
3314    public String getNotationSystemId (String nname)
3315    {
3316    Object notation[] = (Object[]) notationInfo.get (nname);
3317    if (notation == null) {
3318        return null;
3319    } else {
3320        return (String) notation [1];
3321    }
3322    }
3323
3324
3325    /**
3326     * Register a notation declaration for later retrieval.
3327     * Format:
3328     * - public id
3329     * - system id
3330     */
3331    private void setNotation (String nname, String pubid, String sysid)
3332    throws Exception
3333    {
3334    Object notation[];
3335
3336    if (notationInfo.get (nname) == null) {
3337        notation = new Object [2];
3338        notation [0] = pubid;
3339        notation [1] = sysid;
3340        notationInfo.put (nname, notation);
3341    } else {
3342        // VC: Unique Notation Name
3343        // (it's not fatal)
3344    }
3345    }
3346
3347
3348    //
3349    // Location.
3350    //
3351
3352
3353    /**
3354     * Return the current line number.
3355     */
3356    public int getLineNumber ()
3357    {
3358    return line;
3359    }
3360
3361
3362    /**
3363     * Return the current column number.
3364     */
3365    public int getColumnNumber ()
3366    {
3367    return column;
3368    }
3369
3370
3371    //////////////////////////////////////////////////////////////////////
3372    // High-level I/O.
3373    //////////////////////////////////////////////////////////////////////
3374
3375
3376    /**
3377     * Read a single character from the readBuffer.
3378     * <p>The readDataChunk () method maintains the buffer.
3379     * <p>If we hit the end of an entity, try to pop the stack and
3380     * keep going.
3381     * <p> (This approach doesn't really enforce XML's rules about
3382     * entity boundaries, but this is not currently a validating
3383     * parser).
3384     * <p>This routine also attempts to keep track of the current
3385     * position in external entities, but it's not entirely accurate.
3386     * @return The next available input character.
3387     * @see #unread (char)
3388     * @see #unread (String)
3389     * @see #readDataChunk
3390     * @see #readBuffer
3391     * @see #line
3392     * @return The next character from the current input source.
3393     */
3394    private char readCh ()
3395    throws SAXException, IOException
3396    {
3397
3398    // As long as there's nothing in the
3399    // read buffer, try reading more data
3400    // (for an external entity) or popping
3401    // the entity stack (for either).
3402    while (readBufferPos >= readBufferLength) {
3403        switch (sourceType) {
3404        case INPUT_READER:
3405        case INPUT_STREAM:
3406        readDataChunk ();
3407        while (readBufferLength < 1) {
3408            popInput ();
3409            if (readBufferLength < 1) {
3410            readDataChunk ();
3411            }
3412        }
3413        break;
3414
3415        default:
3416
3417        popInput ();
3418        break;
3419        }
3420    }
3421
3422    char c = readBuffer [readBufferPos++];
3423
3424    if (c == '\n') {
3425        line++;
3426        column = 0;
3427    } else {
3428        if (c == '<') {
3429        /* the most common  return to parseContent () .. NOP */ ;
3430        } else if ((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD)
3431        error ("illegal XML character U+"
3432            + Integer.toHexString (c));
3433
3434        // If we're in the DTD and in a context where PEs get expanded,
3435        // do so ... 1/14/2000 errata identify those contexts.  There
3436        // are also spots in the internal subset where PE refs are fatal
3437        // errors, hence yet another flag.
3438        else if (c == '%' && expandPE) {
3439        if (peIsError && entityStack.size()==1)
3440            // not an error if PE reference is in an external PE called from internal subset
3441            error ("PE reference within declaration in internal subset.");
3442        parsePEReference ();
3443        return readCh ();
3444        }
3445        column++;
3446    }
3447
3448    return c;
3449    }
3450
3451
3452    /**
3453     * Push a single character back onto the current input stream.
3454     * <p>This method usually pushes the character back onto
3455     * the readBuffer, while the unread (String) method treats the
3456     * string as a new internal entity.
3457     * <p>I don't think that this would ever be called with 
3458     * readBufferPos = 0, because the methods always read a character
3459     * before unreading it, but just in case, I've added a boundary
3460     * condition.
3461     * @param c The character to push back.
3462     * @see #readCh
3463     * @see #unread (String)
3464     * @see #unread (char[])
3465     * @see #readBuffer
3466     */
3467    private void unread (char c)
3468    throws SAXException
3469    {
3470    // Normal condition.
3471    if (c == '\n') {
3472        line--;
3473        column = -1;
3474    }
3475    if (readBufferPos > 0) {
3476        readBuffer [--readBufferPos] = c;
3477    } else {
3478        pushString (null, new Character (c).toString ());
3479    }
3480    }
3481
3482
3483    /**
3484     * Push a char array back onto the current input stream.
3485     * <p>NOTE: you must <em>never</em> push back characters that you
3486     * haven't actually read: use pushString () instead.
3487     * @see #readCh
3488     * @see #unread (char)
3489     * @see #unread (String)
3490     * @see #readBuffer
3491     * @see #pushString
3492     */
3493    private void unread (char ch[], int length)
3494    throws SAXException
3495    {
3496    for (int i = 0; i < length; i++) {
3497        if (ch [i] == '\n') {
3498        line--;
3499        column = -1;
3500        }
3501    }
3502    if (length < readBufferPos) {
3503        readBufferPos -= length;
3504    } else {
3505        pushCharArray (null, ch, 0, length);
3506        sourceType = INPUT_BUFFER;
3507    }
3508    }
3509
3510
3511    /**
3512     * Push a new external input source.
3513     * The source will be some kind of parsed entity, such as a PE
3514     * (including the external DTD subset) or content for the body.
3515     * <p>TODO: Right now, this method always attempts to autodetect
3516     * the encoding; in the future, it should allow the caller to 
3517     * request an encoding explicitly, and it should also look at the
3518     * headers with an HTTP connection.
3519     * @param url The java.net.URL object for the entity.
3520     * @see SAXDriver#resolveEntity
3521     * @see #pushString
3522     * @see #sourceType
3523     * @see #pushInput
3524     * @see #detectEncoding
3525     * @see #sourceType
3526     * @see #readBuffer
3527     */
3528    private void pushURL (
3529    String      ename,
3530    String      publicId,
3531    String      systemId,
3532    Reader      reader,
3533    InputStream stream,
3534    String      encoding,
3535    boolean     isAbsolute
3536    ) throws SAXException, IOException
3537    {
3538    boolean ignoreEncoding = false;
3539
3540    // Push the existing status.
3541    pushInput (ename);
3542
3543    // Create a new read buffer.
3544    // (Note the four-character margin)
3545    readBuffer = new char [READ_BUFFER_MAX + 4];
3546    readBufferPos = 0;
3547    readBufferLength = 0;
3548    readBufferOverflow = -1;
3549    is = null;
3550    line = 1;
3551    column = 0;
3552    currentByteCount = 0;
3553
3554    if (!isAbsolute) {
3555
3556        // Make any system ID (URI/URL) absolute.  There's one case
3557        // where it may be null:  parser was invoked without providing
3558        // one, e.g. since the XML data came from a memory buffer.
3559        try {
3560            if (systemId != null && externalEntity != null) {
3561                systemId = new URL (externalEntity.getURL (), systemId).toString ();
3562            } else if (baseURI != null) {
3563                systemId = new URL (new URL (baseURI), systemId).toString ();
3564                // throws IOException if couldn't create new URL
3565            }
3566        } catch(java.io.IOException err) {
3567            popInput();
3568            error("Invalid URL " + systemId + " (" + err.getMessage() + ")");
3569        }
3570    }
3571
3572    // See if the application wants to
3573    // redirect the system ID and/or
3574    // supply its own character stream.
3575    if (reader == null && stream == null && systemId != null) {
3576        Object input = null;
3577        try {
3578            input = handler.resolveEntity (publicId, systemId);
3579        } catch (java.io.IOException err) {
3580            popInput();
3581            error("Failure resolving entity " + systemId + " (" + err.getMessage() + ")");
3582        }
3583        if (input != null) {
3584            if (input instanceof String) {
3585                systemId = (String) input;
3586                isAbsolute = true;
3587            } else if (input instanceof InputStream) {
3588                stream = (InputStream) input;
3589            } else if (input instanceof Reader) {
3590                reader = (Reader) input;
3591            }
3592        } 
3593    }
3594    
3595    // Start the entity.
3596    if (systemId != null) {
3597        handler.startExternalEntity (systemId);
3598    } else {
3599        handler.startExternalEntity ("[unidentified data stream]");
3600    }
3601
3602    // If there's an explicit character stream, just
3603    // ignore encoding declarations.
3604    if (reader != null) {
3605        sourceType = INPUT_READER;
3606        this.reader = reader;
3607        tryEncodingDecl (true);
3608        return;
3609    }
3610    
3611    // Else we handle the conversion, and need to ensure
3612    // it's done right.
3613    sourceType = INPUT_STREAM;
3614    if (stream != null) {       
3615        is = stream;
3616    } else {
3617        // We have to open our own stream to the URL.
3618        URL url = new URL (systemId);
3619        try {
3620            externalEntity = url.openConnection ();
3621            externalEntity.connect ();
3622            is = externalEntity.getInputStream ();
3623        } catch (java.io.IOException err) {
3624            try {
3625                popInput();
3626            } catch (Exception err2) {}
3627            error("Cannot read input file " + err.getMessage());
3628        }
3629    }
3630
3631    // If we get to here, there must be
3632    // an InputStream available.
3633    if (!is.markSupported ()) {
3634        is = new BufferedInputStream (is);
3635    }
3636
3637    // Get any external encoding label.
3638    if (encoding == null && externalEntity != null) {
3639        // External labels can be untrustworthy; filesystems in
3640        // particular often have the wrong default for content
3641        // that wasn't locally originated.  Those we autodetect.
3642        if (!"file".equals (externalEntity.getURL ().getProtocol ())) {
3643        int temp;
3644
3645        // application/xml;charset=something;otherAttr=...
3646        // ... with many variants on 'something'
3647        encoding = externalEntity.getContentType ();
3648
3649        // MHK code (fix for Saxon 5.5.1/007): protect against encoding==null
3650        if (encoding==null) {
3651            temp = -1;
3652        } else {
3653            temp = encoding.indexOf ("charset");
3654        }
3655
3656        // RFC 2376 sez MIME text defaults to ASCII, but since the
3657        // JDK will create a MIME type out of thin air, we always
3658        // autodetect when there's no explicit charset attribute.
3659        if (temp < 0)
3660            encoding = null;    // autodetect
3661        else {
3662            temp = encoding.indexOf ('=', temp + 7); 
3663            encoding = encoding.substring (temp+1);   // +1 added by MHK 2 April 2001
3664            if ((temp = encoding.indexOf (';')) > 0)
3665            encoding = encoding.substring (0, temp);
3666
3667            // attributes can have comment fields (RFC 822)
3668            if ((temp = encoding.indexOf ('(')) > 0)
3669            encoding = encoding.substring (0, temp);
3670            // ... and values may be quoted
3671            if ((temp = encoding.indexOf ('"')) > 0)
3672            encoding = encoding.substring (temp + 1,
3673                encoding.indexOf ('"', temp + 2));
3674            encoding.trim ();
3675        }
3676        }
3677    }
3678
3679    // if we got an external encoding label, use it ...
3680    if (encoding != null) {
3681        this.encoding = ENCODING_EXTERNAL;
3682        setupDecoding (encoding);
3683        ignoreEncoding = true;
3684    
3685    // ... else autodetect
3686    } else {
3687        detectEncoding ();
3688        ignoreEncoding = false;
3689    }
3690    is.mark(100);
3691
3692    // Read any XML or text declaration.
3693    try {
3694        tryEncodingDecl (ignoreEncoding);
3695    } catch (EncodingException x) {
3696        encoding = x.getMessage ();
3697
3698        // if we don't handle the declared encoding,
3699        // try letting a JVM InputStreamReader do it
3700        try {
3701        if (sourceType != INPUT_STREAM)
3702            throw x;
3703
3704        is.reset ();
3705        readBufferPos = 0;
3706        readBufferLength = 0;
3707        readBufferOverflow = -1;
3708        line = 1;
3709        currentByteCount = column = 0;
3710
3711        sourceType = INPUT_READER;
3712        this.reader = new InputStreamReader (is, encoding);
3713        is = null;
3714
3715        tryEncodingDecl (true);
3716
3717        } catch (IOException e) {
3718        error ("unsupported text encoding",
3719               encoding,
3720               null);
3721        }
3722    }
3723    }
3724
3725
3726    /**
3727     * Check for an encoding declaration.  This is the second part of the
3728     * XML encoding autodetection algorithm, relying on detectEncoding to
3729     * get to the point that this part can read any encoding declaration
3730     * in the document (using only US-ASCII characters).
3731     *
3732     * <p> Because this part starts to fill parser buffers with this data,
3733     * it's tricky to switch to a reader so that Java's built-in decoders can be
3734     * used for the character encodings that aren't built in to this parser
3735     * (such as EUC-JP, KOI8-R, Big5, etc).
3736     *
3737     * @return any encoding in the declaration, uppercased; or null
3738     * @see #detectEncoding
3739     */
3740    private String tryEncodingDecl (boolean ignoreEncoding)
3741    throws SAXException, IOException
3742    {
3743    // Read the XML/text declaration.
3744    if (tryRead ("<?xml")) {
3745        dataBufferFlush ();
3746        if (tryWhitespace ()) {
3747        if (inputStack.size () > 0) {
3748            return parseTextDecl (ignoreEncoding);
3749        } else {
3750            return parseXMLDecl (ignoreEncoding);
3751        }
3752        } else {
3753        unread ("xml".toCharArray (), 3);
3754        parsePI ();
3755        }
3756    }
3757    return null;
3758    }
3759
3760
3761    /**
3762     * Attempt to detect the encoding of an entity.
3763     * <p>The trick here (as suggested in the XML standard) is that
3764     * any entity not in UTF-8, or in UCS-2 with a byte-order mark, 
3765     * <b>must</b> begin with an XML declaration or an encoding
3766     * declaration; we simply have to look for "&lt;?xml" in various
3767     * encodings.
3768     * <p>This method has no way to distinguish among 8-bit encodings.
3769     * Instead, it sets up for UTF-8, then (possibly) revises its assumption
3770     * later in setupDecoding ().  Any ASCII-derived 8-bit encoding
3771     * should work, but most will be rejected later by setupDecoding ().
3772     * <p>I don't currently detect EBCDIC, since I'm concerned that it
3773     * could also be a valid UTF-8 sequence; I'll have to do more checking
3774     * later.
3775     * <p>MHK Nov 2001: modified to handle a BOM on UTF-8 files, which is
3776     * allowed by XML 2nd edition, and generated when Windows Notepad does
3777     * "save as UTF-8".
3778     * @see #tryEncoding (byte[], byte, byte, byte, byte)
3779     * @see #tryEncoding (byte[], byte, byte)
3780     * @see #setupDecoding
3781     * @see #read8bitEncodingDeclaration
3782     */
3783    private void detectEncoding ()
3784    throws SAXException, IOException
3785    {
3786    byte signature[] = new byte [4];
3787
3788    // Read the first four bytes for
3789    // autodetection.
3790    is.mark (4);
3791    is.read (signature);
3792    is.reset ();
3793
3794    //
3795    // FIRST:  four byte encodings (who uses these?)
3796    //
3797    if (tryEncoding (signature, (byte) 0x00, (byte) 0x00,
3798              (byte) 0x00, (byte) 0x3c)) {
3799        // UCS-4 must begin with "<?xml"
3800        // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234)
3801        encoding = ENCODING_UCS_4_1234;
3802
3803    } else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x00,
3804                 (byte) 0x00, (byte) 0x00)) {
3805        // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321)
3806        encoding = ENCODING_UCS_4_4321;
3807
3808    } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x00,
3809                 (byte) 0x3c, (byte) 0x00)) {
3810        // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143)
3811        encoding = ENCODING_UCS_4_2143;
3812
3813    } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x3c,
3814                 (byte) 0x00, (byte) 0x00)) {
3815        // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421)
3816        encoding = ENCODING_UCS_4_3412;
3817
3818        // 00 00 fe ff UCS_4_1234 (with BOM)
3819        // ff fe 00 00 UCS_4_4321 (with BOM)
3820    }
3821    
3822    // SECOND: three byte signature:
3823    // look for UTF-8 byte order mark 3C 3F 78, allowed by XML 1.0 2nd edition
3824    
3825    else if (tryEncoding (signature, (byte)0xef, (byte)0xbb, (byte)0xbf)) {
3826        encoding = ENCODING_UTF_8;
3827        is.read(); is.read(); is.read();
3828    }
3829
3830    //
3831    // THIRD:  two byte encodings
3832    // note ... with 1/14/2000 errata the XML spec identifies some
3833    // more "broken UTF-16" autodetection cases, with no XML decl,
3834    // which we don't handle here (that's legal too).
3835    //
3836    else if (tryEncoding (signature, (byte) 0xfe, (byte) 0xff)) {
3837        // UCS-2 with a byte-order marker. (UTF-16)
3838        // 0xfe 0xff: UCS-2, big-endian (12)
3839        encoding = ENCODING_UCS_2_12;
3840        is.read (); is.read ();
3841
3842    } else if (tryEncoding (signature, (byte) 0xff, (byte) 0xfe)) {
3843        // UCS-2 with a byte-order marker. (UTF-16)
3844        // 0xff 0xfe: UCS-2, little-endian (21)
3845        encoding = ENCODING_UCS_2_21;
3846        is.read (); is.read ();
3847
3848    } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x3c,
3849                 (byte) 0x00, (byte) 0x3f)) {
3850        // UTF-16-BE (otherwise, malformed UTF-16)
3851        // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark
3852        encoding = ENCODING_UCS_2_12;
3853        error ("no byte-order mark for UCS-2 entity");
3854
3855    } else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x00,
3856                 (byte) 0x3f, (byte) 0x00)) {
3857        // UTF-16-LE (otherwise, malformed UTF-16)
3858        // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark
3859        encoding = ENCODING_UCS_2_21;
3860        error ("no byte-order mark for UCS-2 entity");
3861    }
3862
3863    //
3864    // THIRD:  ASCII-derived encodings, fixed and variable lengths
3865    //
3866    else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x3f,
3867                   (byte) 0x78, (byte) 0x6d)) {
3868        // ASCII derived
3869        // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING)
3870        encoding = ENCODING_UTF_8;
3871        read8bitEncodingDeclaration ();
3872
3873    } else {
3874        // 4c 6f a7 94 ... we don't understand EBCDIC flavors
3875        // ... but we COULD at least kick in some fixed code page
3876
3877        // (default) UTF-8 without encoding/XML declaration
3878        encoding = ENCODING_UTF_8;
3879    }
3880    }
3881
3882
3883    /**
3884     * Check for a four-byte signature.
3885     * <p>Utility routine for detectEncoding ().
3886     * <p>Always looks for some part of "<?XML" in a specific encoding.
3887     * @param sig The first four bytes read.
3888     * @param b1 The first byte of the signature
3889     * @param b2 The second byte of the signature
3890     * @param b3 The third byte of the signature
3891     * @param b4 The fourth byte of the signature
3892     * @see #detectEncoding
3893     */
3894    private static boolean tryEncoding (
3895    byte sig[], byte b1, byte b2, byte b3, byte b4)
3896    {
3897    return (sig [0] == b1 && sig [1] == b2
3898        && sig [2] == b3 && sig [3] == b4);
3899    }
3900
3901
3902    /**
3903     * Check for a two-byte signature.
3904     * <p>Looks for a UCS-2 byte-order mark.
3905     * <p>Utility routine for detectEncoding ().
3906     * @param sig The first four bytes read.
3907     * @param b1 The first byte of the signature
3908     * @param b2 The second byte of the signature
3909     * @see #detectEncoding
3910     */
3911    private static boolean tryEncoding (byte sig[], byte b1, byte b2)
3912    {
3913    return ((sig [0] == b1) && (sig [1] == b2));
3914    }
3915
3916    /**
3917     * Check for a three-byte signature.
3918     * <p>Looks for a UTF-8 byte-order mark.
3919     * <p>Utility routine for detectEncoding ().
3920     * @param sig The first four bytes read.
3921     * @param b1 The first byte of the signature
3922     * @param b2 The second byte of the signature
3923     * @param b3 The second byte of the signature
3924     * @see #detectEncoding
3925     */
3926    private static boolean tryEncoding (byte sig[], byte b1, byte b2, byte b3)
3927    {
3928    return ((sig [0] == b1) && (sig [1] == b2) && (sig [2] == b3));
3929    }
3930
3931    /**
3932     * This method pushes a string back onto input.
3933     * <p>It is useful either as the expansion of an internal entity, 
3934     * or for backtracking during the parse.
3935     * <p>Call pushCharArray () to do the actual work.
3936     * @param s The string to push back onto input.
3937     * @see #pushCharArray
3938     */
3939    private void pushString (String ename, String s)
3940    throws SAXException
3941    {
3942    char ch[] = s.toCharArray ();
3943    pushCharArray (ename, ch, 0, ch.length);
3944    }
3945
3946
3947    /**
3948     * Push a new internal input source.
3949     * <p>This method is useful for expanding an internal entity,
3950     * or for unreading a string of characters.  It creates a new
3951     * readBuffer containing the characters in the array, instead
3952     * of characters converted from an input byte stream.
3953     * @param ch The char array to push.
3954     * @see #pushString
3955     * @see #pushURL
3956     * @see #readBuffer
3957     * @see #sourceType
3958     * @see #pushInput
3959     */
3960    private void pushCharArray (String ename, char ch[], int start, int length)
3961    throws SAXException
3962    {
3963    // Push the existing status
3964    pushInput (ename);
3965    sourceType = INPUT_INTERNAL;
3966    readBuffer = ch;
3967    readBufferPos = start;
3968    readBufferLength = length;
3969    readBufferOverflow = -1;
3970    }
3971
3972
3973    /**
3974     * Save the current input source onto the stack.
3975     * <p>This method saves all of the global variables associated with
3976     * the current input source, so that they can be restored when a new
3977     * input source has finished.  It also tests for entity recursion.
3978     * <p>The method saves the following global variables onto a stack
3979     * using a fixed-length array:
3980     * <ol>
3981     * <li>sourceType
3982     * <li>externalEntity
3983     * <li>readBuffer
3984     * <li>readBufferPos
3985     * <li>readBufferLength
3986     * <li>line
3987     * <li>encoding
3988     * </ol>
3989     * @param ename The name of the entity (if any) causing the new input.
3990     * @see #popInput
3991     * @see #sourceType
3992     * @see #externalEntity
3993     * @see #readBuffer
3994     * @see #readBufferPos
3995     * @see #readBufferLength
3996     * @see #line
3997     * @see #encoding
3998     */
3999    private void pushInput (String ename)
4000    throws SAXException
4001    {
4002    Object input[] = new Object [12];
4003
4004    // Check for entity recursion.
4005    if (ename != null) {
4006        Enumeration entities = entityStack.elements ();
4007        while (entities.hasMoreElements ()) {
4008        String e = (String) entities.nextElement ();
4009        if (e == ename) {
4010            error ("recursive reference to entity", ename, null);
4011        }
4012        }
4013    }
4014    entityStack.push (ename);
4015
4016    // Don't bother if there is no current input.
4017    if (sourceType == INPUT_NONE) {
4018        return;
4019    }
4020
4021    // Set up a snapshot of the current
4022    // input source.
4023    input [0] = new Integer (sourceType);
4024    input [1] = externalEntity;
4025    input [2] = readBuffer;
4026    input [3] = new Integer (readBufferPos);
4027    input [4] = new Integer (readBufferLength);
4028    input [5] = new Integer (line);
4029    input [6] = new Integer (encoding);
4030    input [7] = new Integer (readBufferOverflow);
4031    input [8] = is;
4032    input [9] = new Integer (currentByteCount);
4033    input [10] = new Integer (column);
4034    input [11] = reader;
4035
4036    // Push it onto the stack.
4037    inputStack.push (input);
4038    }
4039
4040
4041    /**
4042     * Restore a previous input source.
4043     * <p>This method restores all of the global variables associated with
4044     * the current input source.
4045     * @exception java.io.EOFException
4046     *    If there are no more entries on the input stack.
4047     * @see #pushInput
4048     * @see #sourceType
4049     * @see #externalEntity
4050     * @see #readBuffer
4051     * @see #readBufferPos
4052     * @see #readBufferLength
4053     * @see #line
4054     * @see #encoding
4055     */
4056    private void popInput ()
4057    throws SAXException, IOException
4058    {
4059    String uri;
4060
4061    if (externalEntity != null)
4062        uri = externalEntity.getURL ().toString ();
4063    else
4064        uri = baseURI;
4065
4066    switch (sourceType) {
4067    case INPUT_STREAM:
4068        if (is!=null) {
4069            if (uri != null) {
4070                handler.endExternalEntity (baseURI);
4071            }
4072            is.close ();
4073        }
4074        break;
4075    case INPUT_READER:
4076        if (reader != null) {
4077            if (uri != null) {
4078                handler.endExternalEntity (baseURI);
4079            }
4080            reader.close ();
4081        }
4082        break;
4083    }
4084
4085    // Throw an EOFException if there
4086    // is nothing else to pop.
4087    if (inputStack.isEmpty ()) {
4088        throw new EOFException ("no more input");
4089    } 
4090    
4091    Object[] input = (Object[]) inputStack.pop ();
4092    entityStack.pop ();
4093
4094    sourceType = ((Integer) input [0]).intValue ();
4095    externalEntity = (URLConnection) input [1];
4096    readBuffer = (char[]) input [2];
4097    readBufferPos = ((Integer) input [3]).intValue ();
4098    readBufferLength = ((Integer) input [4]).intValue ();
4099    line = ((Integer) input [5]).intValue ();
4100    encoding = ((Integer) input [6]).intValue ();
4101    readBufferOverflow = ((Integer) input [7]).intValue ();
4102    is = (InputStream) input [8];
4103    currentByteCount = ((Integer) input [9]).intValue ();
4104    column = ((Integer) input [10]).intValue ();
4105    reader = (Reader) input [11];
4106    }
4107
4108
4109    /**
4110     * Return true if we can read the expected character.
4111     * <p>Note that the character will be removed from the input stream
4112     * on success, but will be put back on failure.  Do not attempt to
4113     * read the character again if the method succeeds.
4114     * @param delim The character that should appear next.  For a
4115     *        insensitive match, you must supply this in upper-case.
4116     * @return true if the character was successfully read, or false if
4117     *   it was not.
4118     * @see #tryRead (String)
4119     */
4120    private boolean tryRead (char delim)
4121    throws SAXException, IOException
4122    {
4123    char c;
4124
4125    // Read the character
4126    c = readCh ();
4127
4128    // Test for a match, and push the character
4129    // back if the match fails.
4130    if (c == delim) {
4131        return true;
4132    } else {
4133        unread (c);
4134        return false;
4135    }
4136    }
4137
4138
4139    /**
4140     * Return true if we can read the expected string.
4141     * <p>This is simply a convenience method.
4142     * <p>Note that the string will be removed from the input stream
4143     * on success, but will be put back on failure.  Do not attempt to
4144     * read the string again if the method succeeds.
4145     * <p>This method will push back a character rather than an
4146     * array whenever possible (probably the majority of cases).
4147     * <p><b>NOTE:</b> This method currently has a hard-coded limit
4148     * of 100 characters for the delimiter.
4149     * @param delim The string that should appear next.
4150     * @return true if the string was successfully read, or false if
4151     *   it was not.
4152     * @see #tryRead (char)
4153     */
4154    private boolean tryRead (String delim)
4155    throws SAXException, IOException
4156    {
4157    char ch[] = delim.toCharArray ();
4158    char c;
4159
4160    // Compare the input, character-
4161    // by character.
4162
4163    for (int i = 0; i < ch.length; i++) {
4164        c = readCh ();
4165        if (c != ch [i]) {
4166        unread (c);
4167        if (i != 0) {
4168            unread (ch, i);
4169        }
4170        return false;
4171        }
4172    }
4173    return true;
4174    }
4175
4176
4177
4178    /**
4179     * Return true if we can read some whitespace.
4180     * <p>This is simply a convenience method.
4181     * <p>This method will push back a character rather than an
4182     * array whenever possible (probably the majority of cases).
4183     * @return true if whitespace was found.
4184     */
4185    private boolean tryWhitespace ()
4186    throws SAXException, IOException
4187    {
4188    char c;
4189    c = readCh ();
4190    if (isWhitespace (c)) {
4191        skipWhitespace ();
4192        return true;
4193    } else {
4194        unread (c);
4195        return false;
4196    }
4197    }
4198
4199
4200    /**
4201     * Read all data until we find the specified string.
4202     * This is useful for scanning CDATA sections and PIs.
4203     * <p>This is inefficient right now, since it calls tryRead ()
4204     * for every character.
4205     * @param delim The string delimiter
4206     * @see #tryRead (String, boolean)
4207     * @see #readCh
4208     */
4209    private void parseUntil (String delim)
4210    throws SAXException, IOException
4211    {
4212    char c;
4213    int startLine = line;
4214
4215    try {
4216        while (!tryRead (delim)) {
4217        c = readCh ();
4218        dataBufferAppend (c);
4219        }
4220    } catch (EOFException e) {
4221        error ("end of input while looking for delimiter "
4222        + "(started on line " + startLine
4223        + ')', null, delim);
4224    }
4225    }
4226
4227
4228    /**
4229     * Read just the encoding declaration (or XML declaration) at the 
4230     * start of an external entity.
4231     * When this method is called, we know that the declaration is
4232     * present (or appears to be).  We also know that the entity is
4233     * in some sort of ASCII-derived 8-bit encoding.
4234     * The idea of this is to let us read what the 8-bit encoding is
4235     * before we've committed to converting any more of the file; the
4236     * XML or encoding declaration must be in 7-bit ASCII, so we're
4237     * safe as long as we don't go past it.
4238     */
4239    private void read8bitEncodingDeclaration ()
4240    throws SAXException, IOException
4241    {
4242    int ch;
4243    readBufferPos = readBufferLength = 0;
4244
4245    while (true) {
4246        ch = is.read ();
4247        readBuffer [readBufferLength++] = (char) ch;
4248        switch (ch) {
4249          case (int) '>':
4250        return;
4251          case - 1:
4252        error ("end of file before end of XML or encoding declaration.",
4253               null, "?>");
4254        }
4255        if (readBuffer.length == readBufferLength)
4256        error ("unfinished XML or encoding declaration");
4257    }
4258    }
4259
4260
4261    //////////////////////////////////////////////////////////////////////
4262    // Low-level I/O.
4263    //////////////////////////////////////////////////////////////////////
4264
4265
4266    /**
4267     * Read a chunk of data from an external input source.
4268     * <p>This is simply a front-end that fills the rawReadBuffer
4269     * with bytes, then calls the appropriate encoding handler.
4270     * @see #encoding
4271     * @see #rawReadBuffer
4272     * @see #readBuffer
4273     * @see #filterCR
4274     * @see #copyUtf8ReadBuffer
4275     * @see #copyIso8859_1ReadBuffer
4276     * @see #copyUcs_2ReadBuffer
4277     * @see #copyUcs_4ReadBuffer
4278     */
4279    private void readDataChunk ()
4280    throws SAXException, IOException
4281    {
4282    int count;
4283
4284    // See if we have any overflow (filterCR sets for CR at end)
4285    if (readBufferOverflow > -1) {
4286        readBuffer [0] = (char) readBufferOverflow;
4287        readBufferOverflow = -1;
4288        readBufferPos = 1;
4289        sawCR = true;
4290    } else {
4291        readBufferPos = 0;
4292        sawCR = false;
4293    }
4294
4295    // input from a character stream.
4296    if (sourceType == INPUT_READER) {
4297        count = reader.read (readBuffer,
4298                readBufferPos, READ_BUFFER_MAX - readBufferPos);
4299        if (count < 0)
4300        readBufferLength = readBufferPos;
4301        else
4302        readBufferLength = readBufferPos + count;
4303        if (readBufferLength > 0)
4304        filterCR (count >= 0);
4305        sawCR = false;
4306        return;
4307    }
4308
4309    // Read as many bytes as possible into the raw buffer.
4310    count = is.read (rawReadBuffer, 0, READ_BUFFER_MAX);
4311
4312    // Dispatch to an encoding-specific reader method to populate
4313    // the readBuffer.  In most parser speed profiles, these routines
4314    // show up at the top of the CPU usage chart.
4315    if (count > 0) {
4316        switch (encoding) {
4317          // one byte builtins
4318          case ENCODING_ASCII:
4319        copyIso8859_1ReadBuffer (count, (char) 0x0080);
4320        break;
4321          case ENCODING_UTF_8:
4322        copyUtf8ReadBuffer (count);
4323        break;
4324          case ENCODING_ISO_8859_1:
4325        copyIso8859_1ReadBuffer (count, (char) 0);
4326        break;
4327
4328          // two byte builtins
4329          case ENCODING_UCS_2_12:
4330        copyUcs2ReadBuffer (count, 8, 0);
4331        break;
4332          case ENCODING_UCS_2_21:
4333        copyUcs2ReadBuffer (count, 0, 8);
4334        break;
4335
4336          // four byte builtins
4337          case ENCODING_UCS_4_1234:
4338        copyUcs4ReadBuffer (count, 24, 16, 8, 0);
4339        break;
4340          case ENCODING_UCS_4_4321:
4341        copyUcs4ReadBuffer (count, 0, 8, 16, 24);
4342        break;
4343          case ENCODING_UCS_4_2143:
4344        copyUcs4ReadBuffer (count, 16, 24, 0, 8);
4345        break;
4346          case ENCODING_UCS_4_3412:
4347        copyUcs4ReadBuffer (count, 8, 0, 24, 16);
4348        break;
4349        }
4350    } else
4351        readBufferLength = readBufferPos;
4352
4353    readBufferPos = 0;
4354
4355    // Filter out all carriage returns if we've seen any
4356    // (including any saved from a previous read)
4357    if (sawCR) {
4358        filterCR (count >= 0);
4359        sawCR = false;
4360
4361        // must actively report EOF, lest some CRs get lost.
4362        if (readBufferLength == 0 && count >= 0)
4363        readDataChunk ();
4364    }
4365
4366    if (count > 0)
4367        currentByteCount += count;
4368    }
4369
4370
4371    /**
4372     * Filter carriage returns in the read buffer.
4373     * CRLF becomes LF; CR becomes LF.
4374     * @param moreData true iff more data might come from the same source
4375     * @see #readDataChunk
4376     * @see #readBuffer
4377     * @see #readBufferOverflow
4378     */
4379    private void filterCR (boolean moreData)
4380    {
4381    int i, j;
4382
4383    readBufferOverflow = -1;
4384
4385loop:
4386    for (i = j = readBufferPos; j < readBufferLength; i++, j++) {
4387        switch (readBuffer [j]) {
4388        case '\r':
4389        if (j == readBufferLength - 1) {
4390            if (moreData) {
4391            readBufferOverflow = '\r';
4392            readBufferLength--;
4393            } else  // CR at end of buffer
4394            readBuffer [i++] = '\n';
4395            break loop;
4396        } else if (readBuffer [j + 1] == '\n') {
4397            j++;
4398        }
4399        readBuffer [i] = '\n';
4400        break;
4401
4402        case '\n':
4403        default:
4404        readBuffer [i] = readBuffer [j];
4405        break;
4406        }
4407    }
4408    readBufferLength = i;
4409    }
4410
4411    /**
4412     * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
4413     * <p>When readDataChunk () calls this method, the raw bytes are in 
4414     * rawReadBuffer, and the final characters will appear in 
4415     * readBuffer.
4416     * @param count The number of bytes to convert.
4417     * @see #readDataChunk
4418     * @see #rawReadBuffer
4419     * @see #readBuffer
4420     * @see #getNextUtf8Byte
4421     */
4422    private void copyUtf8ReadBuffer (int count)
4423    throws SAXException, IOException
4424    {
4425    int i = 0;
4426    int j = readBufferPos;
4427    int b1;
4428    char    c = 0;
4429
4430    /*
4431    // check once, so the runtime won't (if it's smart enough)
4432    if (count < 0 || count > rawReadBuffer.length)
4433        throw new ArrayIndexOutOfBoundsException (Integer.toString (count));
4434    */
4435
4436    while (i < count) {
4437        b1 = rawReadBuffer [i++];
4438
4439        // Determine whether we are dealing
4440        // with a one-, two-, three-, or four-
4441        // byte sequence.
4442        if (b1 < 0) {
4443        if ((b1 & 0xe0) == 0xc0) {
4444            // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
4445            c = (char) (((b1 & 0x1f) << 6)
4446                | getNextUtf8Byte (i++, count));
4447        } else if ((b1 & 0xf0) == 0xe0) {
4448            // 3-byte sequence:
4449            // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
4450            // most CJKV characters
4451            c = (char) (((b1 & 0x0f) << 12) |
4452                   (getNextUtf8Byte (i++, count) << 6) |
4453                   getNextUtf8Byte (i++, count));
4454        } else if ((b1 & 0xf8) == 0xf0) {
4455            // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
4456            //     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
4457            // (uuuuu = wwww + 1)
4458            // "Surrogate Pairs" ... from the "Astral Planes"
4459            int iso646 = b1 & 07;
4460            iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
4461            iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
4462            iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
4463
4464            if (iso646 <= 0xffff) {
4465            c = (char) iso646;
4466            } else {
4467            if (iso646 > 0x0010ffff)
4468                encodingError (
4469                "UTF-8 value out of range for Unicode",
4470                iso646, 0);
4471            iso646 -= 0x010000;
4472            readBuffer [j++] = (char) (0xd800 | (iso646 >> 10));
4473            readBuffer [j++] = (char) (0xdc00 | (iso646 & 0x03ff));
4474            continue;
4475            }
4476        } else {
4477            // The five and six byte encodings aren't supported;
4478            // they exceed the Unicode (and XML) range.
4479            encodingError (
4480                "invalid UTF-8 byte (check the XML declaration)",
4481                0xff & b1, i);
4482            // NOTREACHED
4483            c = 0;
4484        }
4485        } else {
4486        // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
4487        // (US-ASCII character, "common" case, one branch to here)
4488        c = (char) b1;
4489        }
4490        readBuffer [j++] = c;
4491        if (c == '\r')
4492        sawCR = true;
4493    }
4494    // How many characters have we read?
4495    readBufferLength = j;
4496    }
4497
4498
4499    /**
4500     * Return the next byte value in a UTF-8 sequence.
4501     * If it is not possible to get a byte from the current
4502     * entity, throw an exception.
4503     * @param pos The current position in the rawReadBuffer.
4504     * @param count The number of bytes in the rawReadBuffer
4505     * @return The significant six bits of a non-initial byte in
4506     *   a UTF-8 sequence.
4507     * @exception EOFException If the sequence is incomplete.
4508     */
4509    private int getNextUtf8Byte (int pos, int count)
4510    throws SAXException, IOException
4511    {
4512    int val;
4513
4514    // Take a character from the buffer
4515    // or from the actual input stream.
4516    if (pos < count) {
4517        val = rawReadBuffer [pos];
4518    } else {
4519        val = is.read ();
4520        if (val == -1) {
4521        encodingError ("unfinished multi-byte UTF-8 sequence at EOF",
4522            -1, pos);
4523        }
4524    }
4525
4526    // Check for the correct bits at the start.
4527    if ((val & 0xc0) != 0x80) {
4528        encodingError ("bad continuation of multi-byte UTF-8 sequence",
4529            val, pos + 1);
4530    }
4531
4532    // Return the significant bits.
4533    return (val & 0x3f);
4534    }
4535
4536
4537    /**
4538     * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into
4539     * UTF-16 characters.
4540     *
4541     * <p>When readDataChunk () calls this method, the raw bytes are in 
4542     * rawReadBuffer, and the final characters will appear in 
4543     * readBuffer.
4544     *
4545     * @param count The number of bytes to convert.
4546     * @param mask For ASCII conversion, 0x7f; else, 0xff.
4547     * @see #readDataChunk
4548     * @see #rawReadBuffer
4549     * @see #readBuffer
4550     */
4551    private void copyIso8859_1ReadBuffer (int count, char mask)
4552    throws IOException
4553    {
4554    int i, j;
4555    for (i = 0, j = readBufferPos; i < count; i++, j++) {
4556        char c = (char) (rawReadBuffer [i] & 0xff);
4557        if ((c & mask) != 0)
4558        throw new CharConversionException ("non-ASCII character U+"
4559                            + Integer.toHexString (c));
4560        readBuffer [j] = c;
4561        if (c == '\r') {
4562        sawCR = true;
4563        }
4564    }
4565    readBufferLength = j;
4566    }
4567
4568
4569    /**
4570     * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters
4571     * (as used in Java string manipulation).
4572     *
4573     * <p>When readDataChunk () calls this method, the raw bytes are in 
4574     * rawReadBuffer, and the final characters will appear in 
4575     * readBuffer.
4576     * @param count The number of bytes to convert.
4577     * @param shift1 The number of bits to shift byte 1.
4578     * @param shift2 The number of bits to shift byte 2
4579     * @see #readDataChunk
4580     * @see #rawReadBuffer
4581     * @see #readBuffer
4582     */
4583    private void copyUcs2ReadBuffer (int count, int shift1, int shift2)
4584    throws SAXException
4585    {
4586    int j = readBufferPos;
4587
4588    if (count > 0 && (count % 2) != 0) {
4589        encodingError ("odd number of bytes in UCS-2 encoding", -1, count);
4590    }
4591    // The loops are faster with less internal brancing; hence two
4592    if (shift1 == 0) {  // "UTF-16-LE"
4593        for (int i = 0; i < count; i += 2) {
4594        char c = (char) (rawReadBuffer [i + 1] << 8);
4595        c |= 0xff & rawReadBuffer [i];
4596        readBuffer [j++] = c;
4597        if (c == '\r')
4598            sawCR = true;
4599        }
4600    } else {    // "UTF-16-BE"
4601        for (int i = 0; i < count; i += 2) {
4602        char c = (char) (rawReadBuffer [i] << 8);
4603        c |= 0xff & rawReadBuffer [i + 1];
4604        readBuffer [j++] = c;
4605        if (c == '\r')
4606            sawCR = true;
4607        }
4608    }
4609    readBufferLength = j;
4610    }
4611
4612
4613    /**
4614     * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters.
4615     *
4616     * <p>When readDataChunk () calls this method, the raw bytes are in 
4617     * rawReadBuffer, and the final characters will appear in 
4618     * readBuffer.
4619     * <p>Java has Unicode chars, and this routine uses surrogate pairs
4620     * for ISO-10646 values between 0x00010000 and 0x000fffff.  An
4621     * exception is thrown if the ISO-10646 character has no Unicode
4622     * representation.
4623     *
4624     * @param count The number of bytes to convert.
4625     * @param shift1 The number of bits to shift byte 1.
4626     * @param shift2 The number of bits to shift byte 2
4627     * @param shift3 The number of bits to shift byte 2
4628     * @param shift4 The number of bits to shift byte 2
4629     * @see #readDataChunk
4630     * @see #rawReadBuffer
4631     * @see #readBuffer
4632     */
4633    private void copyUcs4ReadBuffer (int count, int shift1, int shift2,
4634                  int shift3, int shift4)
4635    throws SAXException
4636    {
4637    int j = readBufferPos;
4638
4639    if (count > 0 && (count % 4) != 0) {
4640        encodingError (
4641            "number of bytes in UCS-4 encoding not divisible by 4",
4642            -1, count);
4643    }
4644    for (int i = 0; i < count; i += 4) {
4645        int value = (((rawReadBuffer [i] & 0xff) << shift1) |
4646              ((rawReadBuffer [i + 1] & 0xff) << shift2) |
4647              ((rawReadBuffer [i + 2] & 0xff) << shift3) |
4648              ((rawReadBuffer [i + 3] & 0xff) << shift4));
4649        if (value < 0x0000ffff) {
4650        readBuffer [j++] = (char) value;
4651        if (value == (int) '\r') {
4652            sawCR = true;
4653        }
4654        } else if (value < 0x0010ffff) {
4655        value -= 0x010000;
4656        readBuffer [j++] = (char) (0xd8 | ((value >> 10) & 0x03ff));
4657        readBuffer [j++] = (char) (0xdc | (value & 0x03ff));
4658        } else {
4659        encodingError ("UCS-4 value out of range for Unicode",
4660                   value, i);
4661        }
4662    }
4663    readBufferLength = j;
4664    }
4665
4666
4667    /**
4668     * Report a character encoding error.
4669     */
4670    private void encodingError (String message, int value, int offset)
4671    throws SAXException
4672    {
4673    String uri;
4674
4675    if (value != -1) {
4676        message = message + " (code: 0x" +
4677              Integer.toHexString (value) + ')';
4678    }
4679    if (externalEntity != null) {
4680        uri = externalEntity.getURL ().toString ();
4681    } else {
4682        uri = baseURI;
4683    }
4684    handler.error (message, uri, -1, offset + currentByteCount);
4685    }
4686
4687
4688    //////////////////////////////////////////////////////////////////////
4689    // Local Variables.
4690    //////////////////////////////////////////////////////////////////////
4691
4692    /**
4693     * Re-initialize the variables for each parse.
4694     */
4695    private void initializeVariables ()
4696    {
4697    // First line
4698    line = 1;
4699    column = 0;
4700
4701    // Set up the buffers for data and names
4702    dataBufferPos = 0;
4703    dataBuffer = new char [DATA_BUFFER_INITIAL];
4704    nameBufferPos = 0;
4705    nameBuffer = new char [NAME_BUFFER_INITIAL];
4706
4707    // Set up the DTD hash tables
4708    elementInfo = new Hashtable ();
4709    entityInfo = new Hashtable ();
4710    notationInfo = new Hashtable ();
4711
4712    // Set up the variables for the current
4713    // element context.
4714    currentElement = null;
4715    currentElementContent = CONTENT_UNDECLARED;
4716
4717    // Set up the input variables
4718    sourceType = INPUT_NONE;
4719    inputStack = new Stack ();
4720    entityStack = new Stack ();
4721    externalEntity = null;
4722    tagAttributePos = 0;
4723    tagAttributes = new String [100];
4724    rawReadBuffer = new byte [READ_BUFFER_MAX];
4725    readBufferOverflow = -1;
4726
4727    inLiteral = false;
4728    expandPE = false;
4729    peIsError = false;
4730
4731    inCDATA = false;
4732
4733    symbolTable = new Object [SYMBOL_TABLE_LENGTH][];
4734    }
4735
4736
4737    /**
4738     * Clean up after the parse to allow some garbage collection.
4739     */
4740    private void cleanupVariables ()
4741    {
4742    dataBuffer = null;
4743    nameBuffer = null;
4744
4745    elementInfo = null;
4746    entityInfo = null;
4747    notationInfo = null;
4748
4749    currentElement = null;
4750
4751    inputStack = null;
4752    entityStack = null;
4753    externalEntity = null;
4754
4755    tagAttributes = null;
4756    rawReadBuffer = null;
4757
4758    symbolTable = null;
4759    }
4760
4761    /* used to restart reading with some InputStreamReader */
4762    static class EncodingException extends IOException
4763    {
4764    EncodingException (String encoding) { super (encoding); }
4765    }
4766
4767    //
4768    // The current XML handler interface; receives the error reports
4769    // and the external-entity notifications.
4770    private SAXDriver   handler;
4771
4772    //
4773    // I/O information (saved and restored by pushInput/popInput).
4774    //
4775    private Reader  reader;     // current reader
4776    private InputStream is;         // current input stream
4777    private int     line;       // current line number
4778    private int     column;     // current column number
4779    private int     sourceType;     // type of input source
4780    private Stack   inputStack;     // stack of input sources
4781    private URLConnection externalEntity; // current external entity
4782    private int     encoding;   // current character encoding
4783    private int     currentByteCount; // bytes read from current source
4784
4785    //
4786    // Buffers for decoded but unparsed character input.
4787    //
4788    private char    readBuffer [];
4789    private int     readBufferPos;
4790    private int     readBufferLength;
4791    private int     readBufferOverflow;  // overflow from last data chunk (-1 if none).
4792
4793
4794    //
4795    // Buffer for undecoded raw byte input.
4796    //
4797    private final static int READ_BUFFER_MAX = 16384;
4798    private byte    rawReadBuffer [];
4799
4800
4801    //
4802    // Buffer for parsed character data.
4803    //
4804    private static int DATA_BUFFER_INITIAL = 4096;
4805    private char    dataBuffer [];
4806    private int     dataBufferPos;
4807
4808    //
4809    // Buffer for parsed names.
4810    //
4811    private static int NAME_BUFFER_INITIAL = 1024;
4812    private char    nameBuffer [];
4813    private int     nameBufferPos;
4814
4815
4816    //
4817    // Hashtables for DTD information on elements, entities, and notations.
4818    //
4819    private Hashtable   elementInfo;
4820    private Hashtable   entityInfo;
4821    private Hashtable   notationInfo;
4822
4823
4824    //
4825    // Element type currently in force.
4826    //
4827    private String  currentElement;
4828    private int     currentElementContent;
4829
4830    //
4831    // Base external identifiers for resolution.
4832    //
4833    private String  basePublicId;
4834    private String  baseURI;
4835    private int     baseEncoding;
4836    private Reader  baseReader;
4837    private InputStream baseInputStream;
4838    private char    baseInputBuffer [];
4839    private int     baseInputBufferStart;
4840    private int     baseInputBufferLength;
4841
4842    //
4843    // Stack of entity names, to detect recursion.
4844    // Entries may be null, for input pushed without an entity name.
4845    private Stack   entityStack;
4846
4847    //
4848    // PE expansion is enabled in most chunks of the DTD, not all.
4849    // When it's enabled, literals are treated differently.
4850    //
4851    private boolean inLiteral;
4852    private boolean expandPE;
4853    private boolean peIsError;
4854
4855    //
4856    // Symbol table, for caching interned names.
4857    //
4858    private final static int SYMBOL_TABLE_LENGTH = 1087;
4859    private Object  symbolTable [][];
4860
4861    //
4862    // Attributes found in the current start tag.
4863    //
4864    private String  tagAttributes [];
4865    private int     tagAttributePos;
4866
4867    //
4868    // Utility flag: have we noticed a CR while reading the last
4869    // data chunk?  If so, we will have to go back and normalise
4870    // CR or CR/LF line ends.
4871    //
4872    private boolean sawCR;
4873
4874    //
4875    // Utility flag: are we in CDATA?  If so, whitespace isn't ignorable.
4876    // 
4877    private boolean inCDATA;
4878}
4879
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Java Books Remove Frame
Popular Tags