XmlParser


1   // AElfred XML Parser. This version of the AElfred parser is
2   // derived from the original Microstar distribution, with additional
3   // bug fixes by Michael Kay, and selected enhancements and further
4   // bug fixes from the version produced by David Brownell.
5   //
6   
7   /*
8    * $Id: XmlParser.java,v 1.8 2001/06/06 17:57:44 dbrownell Exp $
9    * Copyright (C) 1999-2001 David Brownell
10   * 
11   * This program is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU General Public License as published by
13   * the Free Software Foundation; either version 2 of the License, or
14   * (at your option) any later version.
15   * 
16   * This program is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU General Public License for more details.
20   * 
21   * You should have received a copy of the GNU General Public License
22   * along with this program; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  
26  //
27  // Copyright (c) 1997, 1998 by Microstar Software Ltd.
28  // From Microstar's README (the entire original license):
29  //
30  // AElfred is free for both commercial and non-commercial use and
31  // redistribution, provided that Microstar's copyright and disclaimer are
32  // retained intact.  You are free to modify AElfred for your own use and
33  // to redistribute AElfred with your modifications, provided that the
34  // modifications are clearly documented.
35  //
36  // This program is distributed in the hope that it will be useful, but
37  // WITHOUT ANY WARRANTY; without even the implied warranty of
38  // merchantability or fitness for a particular purpose.  Please use it AT
39  // YOUR OWN RISK.
40  //
41  
42  
43  package com.icl.saxon.aelfred;
44  
45  import java.io.BufferedInputStream;
46  import java.io.CharConversionException;
47  import java.io.EOFException;
48  import java.io.InputStream;
49  import java.io.InputStreamReader;
50  import java.io.IOException;
51  import java.io.Reader;
52  import java.net.URL;
53  import java.net.URLConnection;
54  import java.util.Enumeration;
55  import java.util.Hashtable;
56  import java.util.Stack;
57  
58  import org.xml.sax.SAXException;
59  
60  
61  // $Id: XmlParser.java,v 1.19 2000/02/26 04:30:20 mojo Exp $
62  
63  /**
64   * Parse XML documents and return parse events through call-backs.
65   * Use the <code>SAXDriver</code> class as your entry point, as the
66   * internal parser interfaces are subject to change.
67   *
68   * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
69   *  (version 1.2a with bugfixes)
70   * @author Updated by David Brownell &lt;david-b@pacbell.net&gt;
71   * @version $Date: 2001/06/06 17:57:44 $
72   * @see SAXDriver
73   */
74  final class XmlParser
75  {
76      // parse from buffer, avoiding slow per-character readCh()
77      private final static boolean USE_CHEATS = true;
78  
79      // don't waste too much space in hashtables 
80      private final static int DEFAULT_ATTR_COUNT = 23;
81  
82  
83      //////////////////////////////////////////////////////////////////////
84      // Constructors.
85      ////////////////////////////////////////////////////////////////////////
86  
87  
88      /**
89       * Construct a new parser with no associated handler.
90       * @see #setHandler
91       * @see #parse
92       */
93      // package private
94      XmlParser ()
95      {
96          cleanupVariables ();
97      }
98  
99  
100     /**
101      * Set the handler that will receive parsing events.
102      * @param handler The handler to receive callback events.
103      * @see #parse
104      */
105     // package private
106     void setHandler (SAXDriver handler)
107     {
108         this.handler = handler;
109     }
110 
111 
112     /**
113      * Parse an XML document from the character stream, byte stream, or URI
114      * that you provide (in that order of preference).  Any URI that you
115      * supply will become the base URI for resolving relative URI, and may
116      * be used to acquire a reader or byte stream.
117      *
118      * <p>You may parse more than one document, but that must be done
119      * sequentially.  Only one thread at a time may use this parser.
120      *
121      * @param systemId The URI of the document; should never be null,
122      *  but may be so iff a reader <em>or</em> a stream is provided.
123      * @param publicId The public identifier of the document, or null.
124      * @param reader A character stream; must be null if stream isn't.
125      * @param stream A byte input stream; must be null if reader isn't.
126      * @param encoding The suggested encoding, or null if unknown.
127      * @exception java.lang.Exception Basically SAXException or IOException
128      */
129     // package private 
130     void doParse (
131     String      systemId,
132     String      publicId,
133     Reader      reader,
134     InputStream stream,
135     String      encoding
136     ) throws Exception
137     {
138     if (handler == null)
139         throw new IllegalStateException ("no callback handler");
140 
141     basePublicId = publicId;
142     baseURI = systemId;
143     baseReader = reader;
144     baseInputStream = stream;
145 
146     initializeVariables ();
147 
148     // predeclare the built-in entities here (replacement texts)
149     // we don't need to intern(), since we're guaranteed literals
150     // are always (globally) interned.
151     setInternalEntity ("amp", "&#38;");
152     setInternalEntity ("lt", "&#60;");
153     setInternalEntity ("gt", "&#62;");
154     setInternalEntity ("apos", "&#39;");
155     setInternalEntity ("quot", "&#34;");
156 
157     handler.startDocument ();
158 
159     pushURL ("[document]", basePublicId, baseURI,
160         baseReader, baseInputStream, encoding, false);
161 
162     try {
163         parseDocument ();
164         handler.endDocument ();
165     } finally {
166         if (baseReader != null)
167         try { baseReader.close ();
168         } catch (IOException e) { /* ignore */ }
169         if (baseInputStream != null)
170         try { baseInputStream.close ();
171         } catch (IOException e) { /* ignore */ }
172         if (is != null)
173         try { is.close ();
174         } catch (IOException e) { /* ignore */ }
175         if (reader != null)
176         try {
177             reader.close ();
178         } catch (IOException e) { /* ignore */
179         }
180         cleanupVariables ();
181     }
182     }
183 
184 
185     ////////////////////////////////////////////////////////////////////////
186     // Constants.
187     ////////////////////////////////////////////////////////////////////////
188 
189     //
190     // Constants for element content type.
191     //
192 
193     /**
194      * Constant: an element has not been declared.
195      * @see #getElementContentType
196      */
197     public final static int CONTENT_UNDECLARED = 0;
198 
199     /**
200      * Constant: the element has a content model of ANY.
201      * @see #getElementContentType
202      */
203     public final static int CONTENT_ANY = 1;
204 
205     /**
206      * Constant: the element has declared content of EMPTY.
207      * @see #getElementContentType
208      */
209     public final static int CONTENT_EMPTY = 2;
210 
211     /**
212      * Constant: the element has mixed content.
213      * @see #getElementContentType
214      */
215     public final static int CONTENT_MIXED = 3;
216 
217     /**
218      * Constant: the element has element content.
219      * @see #getElementContentType
220      */
221     public final static int CONTENT_ELEMENTS = 4;
222 
223 
224     //
225     // Constants for the entity type.
226     //
227 
228     /**
229      * Constant: the entity has not been declared.
230      * @see #getEntityType
231      */
232     public final static int ENTITY_UNDECLARED = 0;
233 
234     /**
235      * Constant: the entity is internal.
236      * @see #getEntityType
237      */
238     public final static int ENTITY_INTERNAL = 1;
239 
240     /**
241      * Constant: the entity is external, non-parseable data.
242      * @see #getEntityType
243      */
244     public final static int ENTITY_NDATA = 2;
245 
246     /**
247      * Constant: the entity is external XML data.
248      * @see #getEntityType
249      */
250     public final static int ENTITY_TEXT = 3;
251 
252 
253     //
254     // Constants for attribute type.
255     //
256 
257     /**
258      * Constant: the attribute has not been declared for this element type.
259      * @see #getAttributeType
260      */
261     public final static int ATTRIBUTE_UNDECLARED = 0;
262 
263     /**
264      * Constant: the attribute value is a string value.
265      * @see #getAttributeType
266      */
267     public final static int ATTRIBUTE_CDATA = 1;
268 
269     /**
270      * Constant: the attribute value is a unique identifier.
271      * @see #getAttributeType
272      */
273     public final static int ATTRIBUTE_ID = 2;
274 
275     /**
276      * Constant: the attribute value is a reference to a unique identifier.
277      * @see #getAttributeType
278      */
279     public final static int ATTRIBUTE_IDREF = 3;
280 
281     /**
282      * Constant: the attribute value is a list of ID references.
283      * @see #getAttributeType
284      */
285     public final static int ATTRIBUTE_IDREFS = 4;
286 
287     /**
288      * Constant: the attribute value is the name of an entity.
289      * @see #getAttributeType
290      */
291     public final static int ATTRIBUTE_ENTITY = 5;
292 
293     /**
294      * Constant: the attribute value is a list of entity names.
295      * @see #getAttributeType
296      */
297     public final static int ATTRIBUTE_ENTITIES = 6;
298 
299     /**
300      * Constant: the attribute value is a name token.
301      * @see #getAttributeType
302      */
303     public final static int ATTRIBUTE_NMTOKEN = 7;
304 
305     /**
306      * Constant: the attribute value is a list of name tokens.
307      * @see #getAttributeType
308      */
309     public final static int ATTRIBUTE_NMTOKENS = 8;
310 
311     /**
312      * Constant: the attribute value is a token from an enumeration.
313      * @see #getAttributeType
314      */
315     public final static int ATTRIBUTE_ENUMERATED = 9;
316 
317     /**
318      * Constant: the attribute is the name of a notation.
319      * @see #getAttributeType
320      */
321     public final static int ATTRIBUTE_NOTATION = 10;
322 
323 
324     //
325     // When the class is loaded, populate the hash table of
326     // attribute types.
327     //
328 
329     /**
330      * Hash table of attribute types.
331      */
332     private static Hashtable attributeTypeHash;
333     static {
334     attributeTypeHash = new Hashtable (13);
335     attributeTypeHash.put ("CDATA", new Integer (ATTRIBUTE_CDATA));
336     attributeTypeHash.put ("ID", new Integer (ATTRIBUTE_ID));
337     attributeTypeHash.put ("IDREF", new Integer (ATTRIBUTE_IDREF));
338     attributeTypeHash.put ("IDREFS", new Integer (ATTRIBUTE_IDREFS));
339     attributeTypeHash.put ("ENTITY", new Integer (ATTRIBUTE_ENTITY));
340     attributeTypeHash.put ("ENTITIES", new Integer (ATTRIBUTE_ENTITIES));
341     attributeTypeHash.put ("NMTOKEN", new Integer (ATTRIBUTE_NMTOKEN));
342     attributeTypeHash.put ("NMTOKENS", new Integer (ATTRIBUTE_NMTOKENS));
343     attributeTypeHash.put ("NOTATION", new Integer (ATTRIBUTE_NOTATION));
344     }
345 
346 
347     //
348     // Constants for supported encodings.  "external" is just a flag.
349     //
350     private final static int ENCODING_EXTERNAL = 0;
351     private final static int ENCODING_UTF_8 = 1;
352     private final static int ENCODING_ISO_8859_1 = 2;
353     private final static int ENCODING_UCS_2_12 = 3;
354     private final static int ENCODING_UCS_2_21 = 4;
355     private final static int ENCODING_UCS_4_1234 = 5;
356     private final static int ENCODING_UCS_4_4321 = 6;
357     private final static int ENCODING_UCS_4_2143 = 7;
358     private final static int ENCODING_UCS_4_3412 = 8;
359     private final static int ENCODING_ASCII = 9;
360 
361 
362     //
363     // Constants for attribute default value.
364     //
365 
366     /**
367      * Constant: the attribute is not declared.
368      * @see #getAttributeDefaultValueType
369      */
370     public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;
371 
372     /**
373      * Constant: the attribute has a literal default value specified.
374      * @see #getAttributeDefaultValueType
375      * @see #getAttributeDefaultValue
376      */
377     public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;
378 
379     /**
380      * Constant: the attribute was declared #IMPLIED.
381      * @see #getAttributeDefaultValueType
382      */
383     public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;
384 
385     /**
386      * Constant: the attribute was declared #REQUIRED.
387      * @see #getAttributeDefaultValueType
388      */
389     public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;
390 
391     /**
392      * Constant: the attribute was declared #FIXED.
393      * @see #getAttributeDefaultValueType
394      * @see #getAttributeDefaultValue
395      */
396     public final static int ATTRIBUTE_DEFAULT_FIXED = 34;
397 
398 
399     //
400     // Constants for input.
401     //
402     private final static int INPUT_NONE = 0;
403     private final static int INPUT_INTERNAL = 1;
404     private final static int INPUT_STREAM = 3;
405     private final static int INPUT_BUFFER = 4;
406     private final static int INPUT_READER = 5;
407 
408 
409     //
410     // Flags for reading literals.
411     //
412     // expand general entity refs (attribute values in dtd and content)
413     private final static int LIT_ENTITY_REF = 2;
414     // normalize this value (space chars) (attributes, public ids)
415     private final static int LIT_NORMALIZE = 4;
416     // literal is an attribute value 
417     private final static int LIT_ATTRIBUTE = 8;
418     // don't expand parameter entities
419     private final static int LIT_DISABLE_PE = 16;
420     // don't expand [or parse] character refs
421     private final static int LIT_DISABLE_CREF = 32;
422     // don't parse general entity refs
423     private final static int LIT_DISABLE_EREF = 64;
424     // don't expand general entities, but make sure we _could_
425     private final static int LIT_ENTITY_CHECK = 128;
426     // literal is a public ID value 
427     private final static int LIT_PUBID = 256;
428 
429     //
430     // Flags affecting PE handling in DTDs (if expandPE is true).
431     // PEs expand with space padding, except inside literals.
432     //
433     private final static int CONTEXT_NORMAL = 0;
434     private final static int CONTEXT_LITERAL = 1;
435 
436 
437     //////////////////////////////////////////////////////////////////////
438     // Error reporting.
439     //////////////////////////////////////////////////////////////////////
440 
441 
442     /**
443      * Report an error.
444      * @param message The error message.
445      * @param textFound The text that caused the error (or null).
446      * @see SAXDriver#error
447      * @see #line
448      */
449     private void error (String message, String textFound, String textExpected)
450     throws SAXException
451     {
452     if (textFound != null) {
453         message = message + " (found \"" + textFound + "\")";
454     }
455     if (textExpected != null) {
456         message = message + " (expected \"" + textExpected + "\")";
457     }
458     String uri = null;
459 
460     if (externalEntity != null) {
461         uri = externalEntity.getURL ().toString ();
462     }
463     handler.error (message, uri, line, column);
464 
465     // "can't happen"
466     throw new SAXException (message);
467     }
468 
469 
470     /**
471      * Report a serious error.
472      * @param message The error message.
473      * @param textFound The text that caused the error (or null).
474      */
475     private void error (String message, char textFound, String textExpected)
476     throws SAXException
477     {
478     error (message, new Character (textFound).toString (), textExpected);
479     }
480 
481     /** Report typical case fatal errors. */
482     private void error (String message)
483     throws SAXException
484     {
485     error (message, null, null);
486     }
487 
488 
489     //////////////////////////////////////////////////////////////////////
490     // Major syntactic productions.
491     //////////////////////////////////////////////////////////////////////
492 
493 
494     /**
495      * Parse an XML document.
496      * <pre>
497      * [1] document ::= prolog element Misc*
498      * </pre>
499      * <p>This is the top-level parsing function for a single XML
500      * document.  As a minimum, a well-formed document must have
501      * a document element, and a valid document must have a prolog
502      * (one with doctype) as well.
503      */
504     private void parseDocument ()
505     throws Exception
506     {
507         try {                                       // added by MHK
508             parseProlog ();
509             require ('<', "document prolog");
510             parseElement ();
511         } catch (EOFException ee) {                 // added by MHK
512             error("premature end of file", "[EOF]", null);
513         }
514         
515         try {
516             parseMisc ();   //skip all white, PIs, and comments
517             char c = readCh ();    //if this doesn't throw an exception...
518             error ("unexpected characters after document end", c, null);
519         } catch (EOFException e) {
520             return;
521         }
522     }
523 
524 
525     /**
526      * Skip a comment.
527      * <pre>
528      * [15] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
529      * </pre>
530      * <p> (The <code>&lt;!--</code> has already been read.)
531      */
532     private void parseComment ()
533     throws Exception
534     {
535     char c;
536     boolean saved = expandPE;
537 
538     expandPE = false;
539     parseUntil ("--");
540     require ('>', "-- in comment");
541     expandPE = saved;
542     handler.comment (dataBuffer, 0, dataBufferPos);
543     dataBufferPos = 0;
544     }
545 
546 
547     /**
548      * Parse a processing instruction and do a call-back.
549      * <pre>
550      * [16] PI ::= '&lt;?' PITarget
551      *      (S (Char* - (Char* '?&gt;' Char*)))?
552      *      '?&gt;'
553      * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') )
554      * </pre>
555      * <p> (The <code>&lt;?</code> has already been read.)
556      */
557     private void parsePI ()
558     throws SAXException, IOException
559     {
560     String name;
561     boolean saved = expandPE;
562 
563     expandPE = false;
564     name = readNmtoken (true);
565     if ("xml".equalsIgnoreCase (name))
566         error ("Illegal processing instruction target", name, null);
567     if (!tryRead ("?>")) {
568         requireWhitespace ();
569         parseUntil ("?>");
570     }
571     expandPE = saved;
572     handler.processingInstruction (name, dataBufferToString ());
573     }
574 
575 
576     /**
577      * Parse a CDATA section.
578      * <pre>
579      * [18] CDSect ::= CDStart CData CDEnd
580      * [19] CDStart ::= '&lt;![CDATA['
581      * [20] CData ::= (Char* - (Char* ']]&gt;' Char*))
582      * [21] CDEnd ::= ']]&gt;'
583      * </pre>
584      * <p> (The '&lt;![CDATA[' has already been read.)
585      */
586     private void parseCDSect ()
587     throws Exception
588     {
589     parseUntil ("]]>");
590     dataBufferFlush ();
591     }
592 
593 
594     /**
595      * Parse the prolog of an XML document.
596      * <pre>
597      * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
598      * </pre>
599      * <p>There are a couple of tricks here.  First, it is necessary to
600      * declare the XML default attributes after the DTD (if present)
601      * has been read. [??]  Second, it is not possible to expand general
602      * references in attribute value literals until after the entire
603      * DTD (if present) has been parsed.
604      * <p>We do not look for the XML declaration here, because it was
605      * handled by pushURL ().
606      * @see pushURL
607      */
608     private void parseProlog ()
609     throws Exception
610     {
611     parseMisc ();
612 
613     if (tryRead ("<!DOCTYPE")) {
614         parseDoctypedecl ();
615         parseMisc ();
616     }
617     }
618 
619 
620     /**
621      * Parse the XML declaration.
622      * <pre>
623      * [23] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
624      * [24] VersionInfo ::= S 'version' Eq
625      *      ("'" VersionNum "'" | '"' VersionNum '"' )
626      * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*
627      * [32] SDDecl ::= S 'standalone' Eq
628      *      ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' )
629      * [80] EncodingDecl ::= S 'encoding' Eq
630      *      ( "'" EncName "'" | "'" EncName "'" )
631      * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
632      * </pre>
633      * <p> (The <code>&lt;?xml</code> and whitespace have already been read.)
634      * @return the encoding in the declaration, uppercased; or null
635      * @see #parseTextDecl
636      * @see #setupDecoding
637      */
638     private String parseXMLDecl (boolean ignoreEncoding)
639     throws SAXException, IOException
640     {
641     String  version;
642     String  encodingName = null;
643     String  standalone = null;
644     int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
645 
646     // Read the version.
647     require ("version", "XML declaration");
648     parseEq ();
649     version = readLiteral (flags);
650     if (!version.equals ("1.0")) {
651         error ("unsupported XML version", version, "1.0");
652     }
653 
654     // Try reading an encoding declaration.
655     boolean white = tryWhitespace ();
656     if (tryRead ("encoding")) {
657         if (!white)
658         error ("whitespace required before 'encoding='");
659         parseEq ();
660         encodingName = readLiteral (flags);
661         if (!ignoreEncoding)
662         setupDecoding (encodingName);
663     }
664 
665     // Try reading a standalone declaration
666     if (encodingName != null)
667         white = tryWhitespace ();
668     if (tryRead ("standalone")) {
669         if (!white)
670         error ("whitespace required before 'standalone='");
671         parseEq ();
672         standalone = readLiteral (flags);
673         if (! ("yes".equals (standalone) || "no".equals (standalone)))
674         error ("standalone flag must be 'yes' or 'no'");
675     }
676 
677     skipWhitespace ();
678     require ("?>", "XML declaration");
679 
680     return encodingName;
681     }
682 
683 
684     /**
685      * Parse a text declaration.
686      * <pre>
687      * [79] TextDecl ::= '&lt;?xml' VersionInfo? EncodingDecl S? '?&gt;'
688      * [80] EncodingDecl ::= S 'encoding' Eq
689      *      ( '"' EncName '"' | "'" EncName "'" )
690      * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
691      * </pre>
692      * <p> (The <code>&lt;?xml</code>' and whitespace have already been read.)
693      * @return the encoding in the declaration, uppercased; or null
694      * @see #parseXMLDecl
695      * @see #setupDecoding
696      */
697     private String parseTextDecl (boolean ignoreEncoding)
698     throws SAXException, IOException
699     {
700     String  encodingName = null;
701     int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
702 
703     // Read an optional version.
704     if (tryRead ("version")) {
705         String version;
706         parseEq ();
707         version = readLiteral (flags);
708         if (!version.equals ("1.0")) {
709         error ("unsupported XML version", version, "1.0");
710         }
711         requireWhitespace ();
712     }
713 
714 
715     // Read the encoding.
716     require ("encoding", "XML text declaration");
717     parseEq ();
718     encodingName = readLiteral (flags);
719     if (!ignoreEncoding)
720         setupDecoding (encodingName);
721 
722     skipWhitespace ();
723     require ("?>", "XML text declaration");
724 
725     return encodingName;
726     }
727 
728 
729     /**
730      * Sets up internal state so that we can decode an entity using the
731      * specified encoding.  This is used when we start to read an entity
732      * and we have been given knowledge of its encoding before we start to
733      * read any data (e.g. from a SAX input source or from a MIME type).
734      *
735      * <p> It is also used after autodetection, at which point only very
736      * limited adjustments to the encoding may be used (switching between
737      * related builtin decoders).
738      *
739      * @param encodingName The name of the encoding specified by the user.
740      * @exception IOException if the encoding isn't supported either
741      *  internally to this parser, or by the hosting JVM.
742      * @see #parseXMLDecl
743      * @see #parseTextDecl
744      */
745     private void setupDecoding (String encodingName)
746     throws SAXException, IOException
747     {
748     encodingName = encodingName.toUpperCase ();
749 
750     // ENCODING_EXTERNAL indicates an encoding that wasn't
751     // autodetected ... we can use builtin decoders, or
752     // ones from the JVM (InputStreamReader).
753 
754     // Otherwise we can only tweak what was autodetected, and
755     // only for single byte (ASCII derived) builtin encodings.
756 
757     // ASCII-derived encodings
758     if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) {
759         if (encodingName.equals ("ISO-8859-1")
760                 || encodingName.equals ("8859_1")
761                 || encodingName.equals ("ISO8859_1")
762           ) {
763             encoding = ENCODING_ISO_8859_1;
764             return;
765         } else if (encodingName.equals ("US-ASCII")
766                 || encodingName.equals ("ASCII")) {
767             encoding = ENCODING_ASCII;
768             return;
769         } else if (encodingName.equals ("UTF-8")
770                 || encodingName.equals ("UTF8")) {
771             encoding = ENCODING_UTF_8;
772             return;
773         } else if (encoding != ENCODING_EXTERNAL) {
774             // used to start with a new reader ...
775             throw new EncodingException (encodingName);
776         }
777         // else fallthrough ...
778         // it's ASCII-ish and something other than a builtin
779     }
780 
781     // Unicode and such
782     if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) {
783         if (!(encodingName.equals ("ISO-10646-UCS-2")
784             || encodingName.equals ("UTF-16")
785             || encodingName.equals ("UTF-16BE")
786             || encodingName.equals ("UTF-16LE")))
787         error ("unsupported Unicode encoding",
788                encodingName,
789                "UTF-16");
790         return;
791     }
792 
793     // four byte encodings
794     if (encoding == ENCODING_UCS_4_1234
795         || encoding == ENCODING_UCS_4_4321
796         || encoding == ENCODING_UCS_4_2143
797         || encoding == ENCODING_UCS_4_3412) {
798         if (!encodingName.equals ("ISO-10646-UCS-4"))
799         error ("unsupported 32-bit encoding",
800                encodingName,
801                "ISO-10646-UCS-4");
802         return;
803     }
804 
805     // assert encoding == ENCODING_EXTERNAL
806     // if (encoding != ENCODING_EXTERNAL)
807     //     throw new RuntimeException ("encoding = " + encoding);
808 
809     if (encodingName.equals ("UTF-16BE")) {
810         encoding = ENCODING_UCS_2_12;
811         return;
812     }
813     if (encodingName.equals ("UTF-16LE")) {
814         encoding = ENCODING_UCS_2_21;
815         return;
816     }
817 
818     // We couldn't use the builtin decoders at all.  But we can try to
819     // create a reader, since we haven't messed up buffering.  Tweak
820     // the encoding name if necessary.
821 
822     if (encodingName.equals ("UTF-16")
823         || encodingName.equals ("ISO-10646-UCS-2"))
824         encodingName = "Unicode";
825     // Ignoring all the EBCDIC aliases here
826 
827     reader = new InputStreamReader (is, encodingName);
828     sourceType = INPUT_READER;
829     }
830 
831 
832     /**
833      * Parse miscellaneous markup outside the document element and DOCTYPE
834      * declaration.
835      * <pre>
836      * [27] Misc ::= Comment | PI | S
837      * </pre>
838      */
839     private void parseMisc ()
840     throws Exception
841     {
842     while (true) {
843         skipWhitespace ();
844         if (tryRead ("<?")) {
845         parsePI ();
846         } else if (tryRead ("<!--")) {
847         parseComment ();
848         } else {
849         return;
850         }
851     }
852     }
853 
854 
855     /**
856      * Parse a document type declaration.
857      * <pre>
858      * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
859      *      ('[' (markupdecl | PEReference | S)* ']' S?)? '&gt;'
860      * </pre>
861      * <p> (The <code>&lt;!DOCTYPE</code> has already been read.)
862      */
863     private void parseDoctypedecl ()
864     throws Exception
865     {
866     String doctypeName, ids[];
867 
868     // Read the document type name.
869     requireWhitespace ();
870     doctypeName = readNmtoken (true);
871 
872     // Read the External subset's IDs
873     skipWhitespace ();
874     ids = readExternalIds (false);
875 
876     // report (a) declaration of name, (b) lexical info (ids)
877     handler.doctypeDecl (doctypeName, ids [0], ids [1]);
878 
879     // Internal subset is parsed first, if present
880     skipWhitespace ();
881     if (tryRead ('[')) {
882 
883         // loop until the subset ends
884         while (true) {
885         expandPE = true;
886         skipWhitespace ();
887         expandPE = false;
888         if (tryRead (']')) {
889             break;      // end of subset
890         } else {
891             // WFC, PEs in internal subset (only between decls)
892             peIsError = expandPE = true;
893             parseMarkupdecl ();
894             peIsError = expandPE = false;
895         }
896         }
897     }
898 
899     // Read the external subset, if any
900     if (ids [1] != null) {
901         pushURL ("[external subset]", ids [0], ids [1], null, null, null, false);
902 
903         // Loop until we end up back at '>'
904         while (true) {
905         expandPE = true;
906         skipWhitespace ();
907         expandPE = false;
908         if (tryRead ('>')) {
909             break;
910         } else {
911             expandPE = true;
912             parseMarkupdecl ();
913             expandPE = false;
914         }
915         }
916     } else {
917         // No external subset.
918         skipWhitespace ();
919         require ('>', "internal DTD subset");
920     }
921 
922     // done dtd
923     handler.endDoctype ();
924     expandPE = false;
925     }
926 
927 
928     /**
929      * Parse a markup declaration in the internal or external DTD subset.
930      * <pre>
931      * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
932      *      | NotationDecl | PI | Comment
933      * [30] extSubsetDecl ::= (markupdecl | conditionalSect
934      *      | PEReference | S) *
935      * </pre>
936      * <p> Reading toplevel PE references is handled as a lexical issue
937      * by the caller, as is whitespace.
938      */
939     private void parseMarkupdecl ()
940     throws Exception
941     {
942     if (tryRead ("<!ELEMENT")) {
943         parseElementdecl ();
944     } else if (tryRead ("<!ATTLIST")) {
945         parseAttlistDecl ();
946     } else if (tryRead ("<!ENTITY")) {
947         parseEntityDecl ();
948     } else if (tryRead ("<!NOTATION")) {
949         parseNotationDecl ();
950     } else if (tryRead ("<?")) {
951         parsePI ();
952     } else if (tryRead ("<!--")) {
953         parseComment ();
954     } else if (tryRead ("<![")) {
955         if (inputStack.size () > 0)
956         parseConditionalSect ();
957         else
958         error ("conditional sections illegal in internal subset");
959     } else {
960         error ("expected markup declaration");
961     }
962     }
963 
964 
965     /**
966      * Parse an element, with its tags.
967      * <pre>
968      * [39] element ::= EmptyElementTag | STag content ETag
969      * [40] STag ::= '&lt;' Name (S Attribute)* S? '&gt;'
970      * [44] EmptyElementTag ::= '&lt;' Name (S Attribute)* S? '/&gt;'
971      * </pre>
972      * <p> (The '&lt;' has already been read.)
973      * <p>NOTE: this method actually chains onto parseContent (), if necessary,
974      * and parseContent () will take care of calling parseETag ().
975      */
976     private void parseElement ()
977     throws Exception
978     {
979     String  gi;
980     char    c;
981     int oldElementContent = currentElementContent;
982     String  oldElement = currentElement;
983     Object  element [];
984 
985     // This is the (global) counter for the
986     // array of specified attributes.
987     tagAttributePos = 0;
988 
989     // Read the element type name.
990     gi = readNmtoken (true);
991 
992     // Determine the current content type.
993     currentElement = gi;
994     element = (Object []) elementInfo.get (gi);
995     currentElementContent = getContentType (element, CONTENT_ANY);
996 
997     // Read the attributes, if any.
998     // After this loop, "c" is the closing delimiter.
999     boolean white = tryWhitespace ();
1000    c = readCh ();
1001    while (c != '/' && c != '>') {
1002        unread (c);
1003        if (!white)
1004        error ("need whitespace between attributes");
1005        parseAttribute (gi);
1006        white = tryWhitespace ();
1007        c = readCh ();
1008    }
1009
1010    // Supply any defaulted attributes.
1011    Enumeration atts = declaredAttributes (element);
1012    if (atts != null) {
1013        String aname;
1014loop:
1015        while (atts.hasMoreElements ()) {
1016            aname = (String) atts.nextElement ();
1017            // See if it was specified.
1018            for (int i = 0; i < tagAttributePos; i++) {
1019                if (tagAttributes [i] == aname) {
1020                continue loop;
1021                }
1022            }
1023            // I guess not...
1024            String defaultVal = getAttributeExpandedValue (gi, aname);
1025            if (defaultVal!=null) {
1026                handler.attribute (aname, defaultVal, false);
1027            }
1028        }
1029    }
1030
1031    // Figure out if this is a start tag
1032    // or an empty element, and dispatch an
1033    // event accordingly.
1034    switch (c) {
1035    case '>':
1036        handler.startElement (gi);
1037        parseContent ();
1038        break;
1039    case '/':
1040        require ('>', "empty element tag");
1041        handler.startElement (gi);
1042        handler.endElement (gi);
1043        break;
1044    }
1045
1046    // Restore the previous state.
1047    currentElement = oldElement;
1048    currentElementContent = oldElementContent;
1049    }
1050
1051
1052    /**
1053     * Parse an attribute assignment.
1054     * <pre>
1055     * [41] Attribute ::= Name Eq AttValue
1056     * </pre>
1057     * @param name The name of the attribute's element.
1058     * @see SAXDriver#attribute
1059     */
1060    private void parseAttribute (String name)
1061    throws Exception
1062    {
1063    String aname;
1064    int type;
1065    String value;
1066    int flags = LIT_ATTRIBUTE |  LIT_ENTITY_REF;
1067
1068    // Read the attribute name.
1069    aname = readNmtoken (true);
1070    type = getAttributeType (name, aname);
1071
1072    // Parse '='
1073    parseEq ();
1074
1075    // Read the value, normalizing whitespace
1076    // unless it is CDATA.
1077    if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) {
1078        value = readLiteral (flags);
1079    } else {
1080        value = readLiteral (flags | LIT_NORMALIZE);
1081    }
1082
1083    // WFC: no duplicate attributes
1084    for (int i = 0; i < tagAttributePos; i++)
1085        if (aname.equals (tagAttributes [i]))
1086        error ("duplicate attribute", aname, null);
1087
1088    // Inform the handler about the
1089    // attribute.
1090    handler.attribute (aname, value, true);
1091    dataBufferPos = 0;
1092
1093    // Note that the attribute has been
1094    // specified.
1095    if (tagAttributePos == tagAttributes.length) {
1096        String newAttrib[] = new String [tagAttributes.length * 2];
1097        System.arraycopy (tagAttributes, 0, newAttrib, 0, tagAttributePos);
1098        tagAttributes = newAttrib;
1099    }
1100    tagAttributes [tagAttributePos++] = aname;
1101    }
1102
1103
1104    /**
1105     * Parse an equals sign surrounded by optional whitespace.
1106     * <pre>
1107     * [25] Eq ::= S? '=' S?
1108     * </pre>
1109     */
1110    private void parseEq ()
1111    throws SAXException, IOException
1112    {
1113    skipWhitespace ();
1114    require ('=', "attribute name");
1115    skipWhitespace ();
1116    }
1117
1118
1119    /**
1120     * Parse an end tag.
1121     * <pre>
1122     * [42] ETag ::= '</' Name S? '>'
1123     * </pre>
1124     * <p>NOTE: parseContent () chains to here, we already read the
1125     * "&lt;/".
1126     */
1127    private void parseETag ()
1128    throws Exception
1129    {
1130    require (currentElement, "element end tag");
1131    skipWhitespace ();
1132    require ('>', "name in end tag");
1133    handler.endElement (currentElement);
1134    // not re-reporting any SAXException re bogus end tags,
1135    // even though that diagnostic might be clearer ...
1136    }
1137
1138
1139    /**
1140     * Parse the content of an element.
1141     * <pre>
1142     * [43] content ::= (element | CharData | Reference
1143     *      | CDSect | PI | Comment)*
1144     * [67] Reference ::= EntityRef | CharRef
1145     * </pre>
1146     * <p> NOTE: consumes ETtag.
1147     */
1148    private void parseContent ()
1149    throws Exception
1150    {
1151    char c;
1152    while (true) {
1153        //switch (currentElementContent) {
1154        //    case CONTENT_ANY:
1155        //    case CONTENT_MIXED:
1156        //    case CONTENT_UNDECLARED:    // this line added by MHK 24 May 2000
1157        //    case CONTENT_EMPTY:         // this line added by MHK 8 Sept 2000
1158        //        parseCharData ();
1159        //        break;
1160        //    case CONTENT_ELEMENTS:
1161        //        //parseWhitespace ();   // removed MHK 27 May 2001. The problem is that
1162        //                                // with element content, the text should be whitespace
1163        //                                // but if the document is invalid it might not be.
1164        //                                // Replaced with....
1165        //        parseCharData();        // This processes any char data, but still reports
1166        //                                // it as ignorable white space if within element content.
1167        //        break;
1168        //}
1169        
1170        parseCharData();    // parse it the same way regardless of content type
1171                            // because it might not be valid anyway
1172
1173        // Handle delimiters
1174        c = readCh ();
1175        switch (c) {
1176        case '&':           // Found "&"
1177
1178            c = readCh ();
1179            if (c == '#') {
1180                parseCharRef ();
1181            } else {
1182                unread (c);
1183                parseEntityRef (true);
1184            }
1185            break;
1186
1187        case '<':           // Found "<"
1188            dataBufferFlush ();
1189            c = readCh ();
1190            switch (c) {
1191              case '!':             // Found "<!"
1192                c = readCh ();
1193                switch (c) {
1194                  case '-':         // Found "<!-"
1195                    require ('-', "start of comment");
1196                    parseComment ();
1197                    break;
1198                  case '[':         // Found "<!["
1199                    require ("CDATA[", "CDATA section");
1200                    handler.startCDATA ();
1201                    inCDATA = true;
1202                    parseCDSect ();
1203                    inCDATA = false;
1204                    handler.endCDATA ();
1205                    break;
1206                  default:
1207                    error ("expected comment or CDATA section", c, null);
1208                    break;
1209                }
1210                break;
1211
1212              case '?':         // Found "<?"
1213                parsePI ();
1214                break;
1215
1216              case '/':         // Found "</"
1217                parseETag ();
1218                return;
1219
1220              default:      // Found "<" followed by something else
1221                unread (c);
1222                parseElement ();
1223                break;
1224            }
1225            }
1226        }
1227    }
1228
1229
1230    /**
1231     * Parse an element type declaration.
1232     * <pre>
1233     * [45] elementdecl ::= '&lt;!ELEMENT' S Name S contentspec S? '&gt;'
1234     * </pre>
1235     * <p> NOTE: the '&lt;!ELEMENT' has already been read.
1236     */
1237    private void parseElementdecl ()
1238    throws Exception
1239    {
1240    String name;
1241
1242    requireWhitespace ();
1243    // Read the element type name.
1244    name = readNmtoken (true);
1245
1246    requireWhitespace ();
1247    // Read the content model.
1248    parseContentspec (name);
1249
1250    skipWhitespace ();
1251    require ('>', "element declaration");
1252    }
1253
1254
1255    /**
1256     * Content specification.
1257     * <pre>
1258     * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1259     * </pre>
1260     */
1261    private void parseContentspec (String name)
1262    throws Exception
1263    {
1264    if (tryRead ("EMPTY")) {
1265        setElement (name, CONTENT_EMPTY, null, null);
1266        return;
1267    } else if (tryRead ("ANY")) {
1268        setElement (name, CONTENT_ANY, null, null);
1269        return;
1270    } else {
1271        require ('(', "element name");
1272        dataBufferAppend ('(');
1273        skipWhitespace ();
1274        if (tryRead ("#PCDATA")) {
1275        dataBufferAppend ("#PCDATA");
1276        parseMixed ();
1277        setElement (name, CONTENT_MIXED, dataBufferToString (), null);
1278        } else {
1279        parseElements ();
1280        setElement (name, CONTENT_ELEMENTS,
1281            dataBufferToString (), null);
1282        }
1283    }
1284    }
1285
1286
1287    /**
1288     * Parse an element-content model.
1289     * <pre>
1290     * [47] elements ::= (choice | seq) ('?' | '*' | '+')?
1291     * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
1292     * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
1293     * </pre>
1294     *
1295     * <p> NOTE: the opening '(' and S have already been read.
1296     */
1297    private void parseElements ()
1298    throws Exception
1299    {
1300    char c;
1301    char sep;
1302
1303    // Parse the first content particle
1304    skipWhitespace ();
1305    parseCp ();
1306
1307    // Check for end or for a separator.
1308    skipWhitespace ();
1309    c = readCh ();
1310    switch (c) {
1311    case ')':
1312        dataBufferAppend (')');
1313        c = readCh ();
1314        switch (c) {
1315        case '*':
1316        case '+':
1317        case '?':
1318        dataBufferAppend (c);
1319        break;
1320        default:
1321        unread (c);
1322        }
1323        return;
1324    case ',':           // Register the separator.
1325    case '|':
1326        sep = c;
1327        dataBufferAppend (c);
1328        break;
1329    default:
1330        error ("bad separator in content model", c, null);
1331        return;
1332    }
1333
1334    // Parse the rest of the content model.
1335    while (true) {
1336        skipWhitespace ();
1337        parseCp ();
1338        skipWhitespace ();
1339        c = readCh ();
1340        if (c == ')') {
1341        dataBufferAppend (')');
1342        break;
1343        } else if (c != sep) {
1344        error ("bad separator in content model", c, null);
1345        return;
1346        } else {
1347        dataBufferAppend (c);
1348        }
1349    }
1350
1351    // Check for the occurrence indicator.
1352    c = readCh ();
1353    switch (c) {
1354    case '?':
1355    case '*':
1356    case '+':
1357        dataBufferAppend (c);
1358        return;
1359    default:
1360        unread (c);
1361        return;
1362    }
1363    }
1364
1365
1366    /**
1367     * Parse a content particle.
1368     * <pre>
1369     * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1370     * </pre>
1371     */
1372    private void parseCp ()
1373    throws Exception
1374    {
1375    if (tryRead ('(')) {
1376        dataBufferAppend ('(');
1377        parseElements ();
1378    } else {
1379        dataBufferAppend (readNmtoken (true));
1380        char c = readCh ();
1381        switch (c) {
1382        case '?':
1383        case '*':
1384        case '+':
1385        dataBufferAppend (c);
1386        break;
1387        default:
1388        unread (c);
1389        break;
1390        }
1391    }
1392    }
1393
1394
1395    /**
1396     * Parse mixed content.
1397     * <pre>
1398     * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
1399     *        | '(' S? ('#PCDATA') S? ')'
1400     * </pre>
1401     */
1402    private void parseMixed ()
1403    throws Exception
1404    {
1405
1406    // Check for PCDATA alone.
1407    skipWhitespace ();
1408    if (tryRead (')')) {
1409        dataBufferAppend (")*");
1410        tryRead ('*');
1411        return;
1412    }
1413
1414    // Parse mixed content.
1415    skipWhitespace ();
1416    while (!tryRead (")*")) {
1417        require ('|', "alternative");
1418        dataBufferAppend ('|');
1419        skipWhitespace ();
1420        dataBufferAppend (readNmtoken (true));
1421        skipWhitespace ();
1422    }
1423    dataBufferAppend (")*");
1424    }
1425
1426
1427    /**
1428     * Parse an attribute list declaration.
1429     * <pre>
1430     * [52] AttlistDecl ::= '&lt;!ATTLIST' S Name AttDef* S? '&gt;'
1431     * </pre>
1432     * <p>NOTE: the '&lt;!ATTLIST' has already been read.
1433     */
1434    private void parseAttlistDecl ()
1435    throws Exception
1436    {
1437    String elementName;
1438
1439    requireWhitespace ();
1440    elementName = readNmtoken (true);
1441    boolean white = tryWhitespace ();
1442    while (!tryRead ('>')) {
1443        if (!white)
1444        error ("whitespace required before attribute definition");
1445        parseAttDef (elementName);
1446        white = tryWhitespace ();
1447    }
1448    }
1449
1450
1451    /**
1452     * Parse a single attribute definition.
1453     * <pre>
1454     * [53] AttDef ::= S Name S AttType S DefaultDecl
1455     * </pre>
1456     */
1457    private void parseAttDef (String elementName)
1458    throws Exception
1459    {
1460    String name;
1461    int type;
1462    String enum = null;
1463
1464    // Read the attribute name.
1465    name = readNmtoken (true);
1466
1467    // Read the attribute type.
1468    requireWhitespace ();
1469    type = readAttType ();
1470
1471    // Get the string of enumerated values
1472    // if necessary.
1473    if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) {
1474        enum = dataBufferToString ();
1475    }
1476
1477    // Read the default value.
1478    requireWhitespace ();
1479    parseDefault (elementName, name, type, enum);
1480    }
1481
1482
1483    /**
1484     * Parse the attribute type.
1485     * <pre>
1486     * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1487     * [55] StringType ::= 'CDATA'
1488     * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
1489     *      | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
1490     * [57] EnumeratedType ::= NotationType | Enumeration
1491     * </pre>
1492     */
1493    private int readAttType ()
1494    throws Exception
1495    {
1496    if (tryRead ('(')) {
1497        parseEnumeration (false);
1498        return ATTRIBUTE_ENUMERATED;
1499    } else {
1500        String typeString = readNmtoken (true);
1501        if (typeString.equals ("NOTATION")) {
1502        parseNotationType ();
1503        }
1504        Integer type = (Integer) attributeTypeHash.get (typeString);
1505        if (type == null) {
1506        error ("illegal attribute type", typeString, null);
1507        return ATTRIBUTE_UNDECLARED;
1508        } else {
1509        return type.intValue ();
1510        }
1511    }
1512    }
1513
1514
1515    /**
1516     * Parse an enumeration.
1517     * <pre>
1518     * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
1519     * </pre>
1520     * <p>NOTE: the '(' has already been read.
1521     */
1522    private void parseEnumeration (boolean isNames)
1523    throws Exception
1524    {
1525    dataBufferAppend ('(');
1526
1527    // Read the first token.
1528    skipWhitespace ();
1529    dataBufferAppend (readNmtoken (isNames));
1530    // Read the remaining tokens.
1531    skipWhitespace ();
1532    while (!tryRead (')')) {
1533        require ('|', "enumeration value");
1534        dataBufferAppend ('|');
1535        skipWhitespace ();
1536        dataBufferAppend (readNmtoken (isNames));
1537        skipWhitespace ();
1538    }
1539    dataBufferAppend (')');
1540    }
1541
1542
1543    /**
1544     * Parse a notation type for an attribute.
1545     * <pre>
1546     * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks
1547     *      (S? '|' S? name)* S? ')'
1548     * </pre>
1549     * <p>NOTE: the 'NOTATION' has already been read
1550     */
1551    private void parseNotationType ()
1552    throws Exception
1553    {
1554    requireWhitespace ();
1555    require ('(', "NOTATION");
1556
1557    parseEnumeration (true);
1558    }
1559
1560
1561    /**
1562     * Parse the default value for an attribute.
1563     * <pre>
1564     * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
1565     *      | (('#FIXED' S)? AttValue)
1566     * </pre>
1567     */
1568    private void parseDefault (
1569    String elementName,
1570    String name,
1571    int type,
1572    String enum
1573    ) throws Exception
1574    {
1575    int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
1576    String  value = null;
1577    int flags = LIT_ATTRIBUTE | LIT_DISABLE_CREF | LIT_ENTITY_CHECK | LIT_DISABLE_PE;
1578                                                                   // ^^^^^^^^^^^^^^
1579                                                                   // added MHK 20 Mar 2002
1580
1581    // Note: char refs not checked here, and input not normalized,
1582    // since it's done correctly later when we actually expand any
1583    // entity refs.  We ought to report char ref syntax errors now,
1584    // but don't.  Cost: unused defaults mean unreported WF errs.
1585    
1586    // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace
1587    // chars to spaces (doesn't matter when that's done if it doesn't
1588    // interfere with char refs expanding to whitespace).
1589
1590    if (tryRead ('#')) {
1591        if (tryRead ("FIXED")) {
1592        valueType = ATTRIBUTE_DEFAULT_FIXED;
1593        requireWhitespace ();
1594        value = readLiteral (flags);
1595        } else if (tryRead ("REQUIRED")) {
1596        valueType = ATTRIBUTE_DEFAULT_REQUIRED;
1597        } else if (tryRead ("IMPLIED")) {
1598        valueType = ATTRIBUTE_DEFAULT_IMPLIED;
1599        } else {
1600        error ("illegal keyword for attribute default value");
1601        }
1602    } else
1603        value = readLiteral (flags);
1604    setAttribute (elementName, name, type, enum, value, valueType);
1605    }
1606
1607
1608    /**
1609     * Parse a conditional section.
1610     * <pre>
1611     * [61] conditionalSect ::= includeSect || ignoreSect
1612     * [62] includeSect ::= '&lt;![' S? 'INCLUDE' S? '['
1613     *      extSubsetDecl ']]&gt;'
1614     * [63] ignoreSect ::= '&lt;![' S? 'IGNORE' S? '['
1615     *      ignoreSectContents* ']]&gt;'
1616     * [64] ignoreSectContents ::= Ignore
1617     *      ('&lt;![' ignoreSectContents* ']]&gt;' Ignore )*
1618     * [65] Ignore ::= Char* - (Char* ( '&lt;![' | ']]&gt;') Char* )
1619     * </pre>
1620     * <p> NOTE: the '&gt;![' has already been read.
1621     */
1622    private void parseConditionalSect ()
1623    throws Exception
1624    {
1625    skipWhitespace ();
1626    if (tryRead ("INCLUDE")) {
1627        skipWhitespace ();
1628        require ('[', "INCLUDE");
1629        skipWhitespace ();
1630        while (!tryRead ("]]>")) {
1631        parseMarkupdecl ();
1632        skipWhitespace ();
1633        }
1634    } else if (tryRead ("IGNORE")) {
1635        skipWhitespace ();
1636        require ('[', "IGNORE");
1637        int nesting = 1;
1638        char c;
1639        expandPE = false;
1640        for (int nest = 1; nest > 0;) {
1641        c = readCh ();
1642        switch (c) {
1643        case '<':
1644            if (tryRead ("![")) {
1645            nest++;
1646            }
1647        case ']':
1648            if (tryRead ("]>")) {
1649            nest--;
1650            }
1651        }
1652        }
1653        expandPE = true;
1654    } else {
1655        error ("conditional section must begin with INCLUDE or IGNORE");
1656    }
1657    }
1658
1659
1660    /**
1661     * Read and interpret a character reference.
1662     * <pre>
1663     * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
1664     * </pre>
1665     * <p>NOTE: the '&#' has already been read.
1666     */
1667    private void parseCharRef ()
1668    throws SAXException, IOException
1669    {
1670    int value = 0;
1671    char c;
1672
1673    if (tryRead ('x')) {
1674loop1:
1675        while (true) {
1676        c = readCh ();
1677        switch (c) {
1678        case '0':
1679        case '1':
1680        case '2':
1681        case '3':
1682        case '4':
1683        case '5':
1684        case '6':
1685        case '7':
1686        case '8':
1687        case '9':
1688        case 'a':
1689        case 'A':
1690        case 'b':
1691        case 'B':
1692        case 'c':
1693        case 'C':
1694        case 'd':
1695        case 'D':
1696        case 'e':
1697        case 'E':
1698        case 'f':
1699        case 'F':
1700            value *= 16;
1701            value += Integer.parseInt (new Character (c).toString (),
1702                    16);
1703            break;
1704        case ';':
1705            break loop1;
1706        default:
1707            error ("illegal character in character reference", c, null);
1708            break loop1;
1709        }
1710        }
1711    } else {
1712loop2:
1713        while (true) {
1714        c = readCh ();
1715        switch (c) {
1716        case '0':
1717        case '1':
1718        case '2':
1719        case '3':
1720        case '4':
1721        case '5':
1722        case '6':
1723        case '7':
1724        case '8':
1725        case '9':
1726            value *= 10;
1727            value += Integer.parseInt (new Character (c).toString (),
1728                    10);
1729            break;
1730        case ';':
1731            break loop2;
1732        default:
1733            error ("illegal character in character reference", c, null);
1734            break loop2;
1735        }
1736        }
1737    }
1738
1739    // check for character refs being legal XML
1740    if ((value < 0x0020
1741        && ! (value == '\n' || value == '\t' || value == '\r'))
1742        || (value >= 0xD800 && value <= 0xDFFF)
1743        || value == 0xFFFE || value == 0xFFFF
1744        || value > 0x0010ffff)
1745        error ("illegal XML character reference U+"
1746            + Integer.toHexString (value));
1747
1748    // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
1749    //  (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
1750    if (value <= 0x0000ffff) {
1751        // no surrogates needed
1752        dataBufferAppend ((char) value);
1753    } else if (value <= 0x0010ffff) {
1754        value -= 0x10000;
1755        // > 16 bits, surrogate needed
1756        dataBufferAppend ((char) (0xd800 | (value >> 10)));
1757        dataBufferAppend ((char) (0xdc00 | (value & 0x0003ff)));
1758    } else {
1759        // too big for surrogate
1760        error ("character reference " + value + " is too large for UTF-16",
1761           new Integer (value).toString (), null);
1762    }
1763    }
1764
1765
1766    /**
1767     * Parse and expand an entity reference.
1768     * <pre>
1769     * [68] EntityRef ::= '&' Name ';'
1770     * </pre>
1771     * <p>NOTE: the '&amp;' has already been read.
1772     * @param externalAllowed External entities are allowed here.
1773     */
1774    private void parseEntityRef (boolean externalAllowed)
1775    throws SAXException, IOException
1776    {
1777    String name;
1778
1779    name = readNmtoken (true);
1780    require (';', "entity reference");
1781    switch (getEntityType (name)) {
1782    case ENTITY_UNDECLARED:
1783        error ("reference to undeclared entity", name, null);
1784        break;
1785    case ENTITY_INTERNAL:
1786        pushString (name, getEntityValue (name));
1787        break;
1788    case ENTITY_TEXT:
1789        if (externalAllowed) {
1790        pushURL (name, getEntityPublicId (name),
1791             getEntitySystemId (name),
1792             null, null, null, true);
1793        } else {
1794        error ("reference to external entity in attribute value.",
1795            name, null);
1796        }
1797        break;
1798    case ENTITY_NDATA:
1799        if (externalAllowed) {
1800        error ("unparsed entity reference in content", name, null);
1801        } else {
1802        error ("reference to external entity in attribute value.",
1803            name, null);
1804        }
1805        break;
1806    }
1807    }
1808
1809
1810    /**
1811     * Parse and expand a parameter entity reference.
1812     * <pre>
1813     * [69] PEReference ::= '%' Name ';'
1814     * </pre>
1815     * <p>NOTE: the '%' has already been read.
1816     */
1817    private void parsePEReference ()
1818    throws SAXException, IOException
1819    {
1820    String name;
1821
1822    name = "%" + readNmtoken (true);
1823    require (';', "parameter entity reference");
1824    switch (getEntityType (name)) {
1825    case ENTITY_UNDECLARED:
1826        // this is a validity problem, not a WFC violation ... but
1827        // we should disable handling of all subsequent declarations
1828        // unless this is a standalone document
1829        // warn ("reference to undeclared parameter entity", name, null);
1830
1831        break;
1832    case ENTITY_INTERNAL:
1833        if (inLiteral)
1834        pushString (name, getEntityValue (name));
1835        else
1836        pushString (name, ' ' + getEntityValue (name) + ' ');
1837        break;
1838    case ENTITY_TEXT:
1839        if (!inLiteral)
1840            pushString (null, " ");
1841        pushURL (name, getEntityPublicId (name),
1842             getEntitySystemId (name),
1843             null, null, null, true);
1844        if (!inLiteral)
1845            pushString (null, " ");
1846        break;
1847    }
1848    }
1849
1850    /**
1851     * Parse an entity declaration.
1852     * <pre>
1853     * [70] EntityDecl ::= GEDecl | PEDecl
1854     * [71] GEDecl ::= '&lt;!ENTITY' S Name S EntityDef S? '&gt;'
1855     * [72] PEDecl ::= '&lt;!ENTITY' S '%' S Name S PEDef S? '&gt;'
1856     * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
1857     * [74] PEDef ::= EntityValue | ExternalID
1858     * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1859     *         | 'PUBLIC' S PubidLiteral S SystemLiteral
1860     * [76] NDataDecl ::= S 'NDATA' S Name
1861     * </pre>
1862     * <p>NOTE: the '&lt;!ENTITY' has already been read.
1863     */
1864    private void parseEntityDecl ()
1865    throws Exception
1866    {
1867    boolean peFlag = false;
1868
1869    // Check for a parameter entity.
1870    expandPE = false;
1871    requireWhitespace ();
1872    if (tryRead ('%')) {
1873        peFlag = true;
1874        requireWhitespace ();
1875    }
1876    expandPE = true;
1877
1878    // Read the entity name, and prepend
1879    // '%' if necessary.
1880    String name = readNmtoken (true);
1881    if (peFlag) {
1882        name = "%" + name;
1883    }
1884
1885    // Read the entity value.
1886    requireWhitespace ();
1887    char c = readCh ();
1888    unread (c);
1889    if (c == '"' || c == '\'') {
1890        // Internal entity ... replacement text has expanded refs
1891        // to characters and PEs, but not to general entities
1892        String value = readLiteral (0);
1893        setInternalEntity (name, value);
1894    } else {
1895        // Read the external IDs
1896        String[] ids = readExternalIds (false);
1897        if (ids [1] == null) {
1898            error ("system identifer missing", name, null);
1899        }
1900
1901        // Check for NDATA declaration.
1902        boolean white = tryWhitespace ();
1903        if (!peFlag && tryRead ("NDATA")) {
1904            if (!white)
1905                error ("whitespace required before NDATA");
1906            requireWhitespace ();
1907            String notationName = readNmtoken (true);
1908            setExternalDataEntity (name, ids [0], ids [1], notationName);
1909        } else {
1910            setExternalTextEntity (name, ids [0], ids [1]);
1911        }
1912    }
1913
1914    // Finish the declaration.
1915    skipWhitespace ();
1916    require ('>', "NDATA");
1917    }
1918
1919
1920    /**
1921     * Parse a notation declaration.
1922     * <pre>
1923     * [82] NotationDecl ::= '&lt;!NOTATION' S Name S
1924     *      (ExternalID | PublicID) S? '&gt;'
1925     * [83] PublicID ::= 'PUBLIC' S PubidLiteral
1926     * </pre>
1927     * <P>NOTE: the '&lt;!NOTATION' has already been read.
1928     */
1929    private void parseNotationDecl ()
1930    throws Exception
1931    {
1932    String nname, ids[];
1933
1934
1935    requireWhitespace ();
1936    nname = readNmtoken (true);
1937
1938    requireWhitespace ();
1939
1940    // Read the external identifiers.
1941    ids = readExternalIds (true);
1942    if (ids [0] == null && ids [1] == null) {
1943        error ("external identifer missing", nname, null);
1944    }
1945
1946    // Register the notation.
1947    setNotation (nname, ids [0], ids [1]);
1948
1949    skipWhitespace ();
1950    require ('>', "notation declaration");
1951    }
1952
1953
1954    /**
1955     * Parse character data.
1956     * <pre>
1957     * [14] CharData ::= [^&lt;&amp;]* - ([^&lt;&amp;]* ']]&gt;' [^&lt;&amp;]*)
1958     * </pre>
1959     */
1960    private void parseCharData ()
1961    throws Exception
1962    {
1963    char c;
1964
1965    // Start with a little cheat -- in most
1966    // cases, the entire sequence of
1967    // character data will already be in
1968    // the readBuffer; if not, fall through to
1969    // the normal approach.
1970    if (USE_CHEATS) {
1971        int lineAugment = 0;
1972        int columnAugment = 0;
1973
1974loop:
1975        for (int i = readBufferPos; i < readBufferLength; i++) {
1976
1977        switch (c = readBuffer [i]) {
1978        case '\n':
1979            lineAugment++;
1980            columnAugment = 0;
1981            break;
1982        case '&':
1983        case '<':
1984            int start = readBufferPos;
1985            columnAugment++;
1986            readBufferPos = i;
1987            if (lineAugment > 0) {
1988                line += lineAugment;
1989                column = columnAugment;
1990            } else {
1991                column += columnAugment;
1992            }
1993            dataBufferAppend (readBuffer, start, i - start);
1994            return;
1995        case ']':
1996            // XXX missing two end-of-buffer cases
1997            if ((i + 2) < readBufferLength) {
1998                if (readBuffer [i + 1] == ']'
1999                    && readBuffer [i + 2] == '>') {
2000                    error ("character data may not contain ']]>'");
2001                }
2002            }
2003            columnAugment++;
2004            break;
2005        default:
2006            if (c < 0x0020 || c > 0xFFFD)
2007            error ("illegal XML character U+"
2008                + Integer.toHexString (c));
2009            // FALLTHROUGH
2010        case '\r':
2011        case '\t':
2012            columnAugment++;
2013        }
2014        }
2015    }
2016
2017    // OK, the cheat didn't work; start over
2018    // and do it by the book.
2019
2020    int closeSquareBracketCount = 0;
2021    while (true) {
2022        c = readCh ();
2023        switch (c) {
2024        case '<':
2025        case '&':
2026            unread (c);
2027            return;
2028        case ']':
2029            closeSquareBracketCount++;
2030            dataBufferAppend(c);
2031            break;
2032        case '>':
2033            if (closeSquareBracketCount>=2) {
2034                // we've hit ']]>'
2035                error ("']]>' is not allowed here");
2036                break;
2037            }
2038            // fall-through                
2039        default:
2040            closeSquareBracketCount=0;
2041            dataBufferAppend (c);
2042            break;
2043        }
2044    }
2045    }
2046
2047
2048    //////////////////////////////////////////////////////////////////////
2049    // High-level reading and scanning methods.
2050    //////////////////////////////////////////////////////////////////////
2051
2052    /**
2053     * Require whitespace characters.
2054     */
2055    private void requireWhitespace ()
2056    throws SAXException, IOException
2057    {
2058    char c = readCh ();
2059    if (isWhitespace (c)) {
2060        skipWhitespace ();
2061    } else {
2062        error ("whitespace required", c, null);
2063    }
2064    }
2065
2066
2067    /**
2068     * Parse whitespace characters, and leave them in the data buffer.
2069     */
2070    private void parseWhitespace ()     // method no longer used - MHK
2071    throws Exception
2072    {
2073        char c = readCh ();
2074        while (isWhitespace (c)) {
2075            dataBufferAppend (c);
2076            c = readCh ();
2077        }
2078        unread (c);
2079    }
2080
2081
2082    /**
2083     * Skip whitespace characters.
2084     * <pre>
2085     * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2086     * </pre>
2087     */
2088    private void skipWhitespace ()
2089    throws SAXException, IOException
2090    {
2091    // Start with a little cheat.  Most of
2092    // the time, the white space will fall
2093    // within the current read buffer; if
2094    // not, then fall through.
2095    if (USE_CHEATS) {
2096        int lineAugment = 0;
2097        int columnAugment = 0;
2098
2099loop:
2100        for (int i = readBufferPos; i < readBufferLength; i++) {
2101        switch (readBuffer [i]) {
2102        case ' ':
2103        case '\t':
2104        case '\r':
2105            columnAugment++;
2106            break;
2107        case '\n':
2108            lineAugment++;
2109            columnAugment = 0;
2110            break;
2111        case '%':
2112            if (expandPE)
2113            break loop;
2114            // else fall through...
2115        default:
2116            readBufferPos = i;
2117            if (lineAugment > 0) {
2118            line += lineAugment;
2119            column = columnAugment;
2120            } else {
2121            column += columnAugment;
2122            }
2123            return;
2124        }
2125        }
2126    }
2127
2128    // OK, do it by the book.
2129    char c = readCh ();
2130    while (isWhitespace (c)) {
2131        c = readCh ();
2132    }
2133    unread (c);
2134    }
2135
2136
2137    /**
2138     * Read a name or (when parsing an enumeration) name token.
2139     * <pre>
2140     * [5] Name ::= (Letter | '_' | ':') (NameChar)*
2141     * [7] Nmtoken ::= (NameChar)+
2142     * </pre>
2143     */
2144    private String readNmtoken (boolean isName)
2145    throws SAXException, IOException
2146    {
2147    char c;
2148
2149    if (USE_CHEATS) {
2150loop:
2151        for (int i = readBufferPos; i < readBufferLength; i++) {
2152        c = readBuffer [i];
2153        switch (c) {
2154          case '%':
2155            if (expandPE)
2156            break loop;
2157            // else fall through...
2158
2159            // What may legitimately come AFTER a name/nmtoken?
2160          case '<': case '>': case '&':
2161          case ',': case '|': case '*': case '+': case '?':
2162          case ')':
2163          case '=':
2164          case '\'': case '"':
2165          case '[':
2166          case ' ': case '\t': case '\r': case '\n':
2167          case ';':
2168          case '/':
2169            int start = readBufferPos;
2170            if (i == start)
2171            error ("name expected", readBuffer [i], null);
2172            readBufferPos = i;
2173            return intern (readBuffer, start, i - start);
2174
2175          default:
2176            // punt on exact tests from Appendix A; approximate
2177            // them using the Unicode ID start/part rules
2178            if (i == readBufferPos && isName) {
2179            if (!Character.isUnicodeIdentifierStart (c)
2180                && c != ':' && c != '_')
2181                error ("Not a name start character, U+"
2182                  + Integer.toHexString (c));
2183            } else if (!Character.isUnicodeIdentifierPart (c)
2184                && c != '-' && c != ':' && c != '_' && c != '.'
2185                && !isExtender (c))
2186            error ("Not a name character, U+"
2187                + Integer.toHexString (c));
2188        }
2189        }
2190    }
2191
2192    nameBufferPos = 0;
2193
2194    // Read the first character.
2195loop:
2196    while (true) {
2197        c = readCh ();
2198        switch (c) {
2199        case '%':
2200        case '<': case '>': case '&':
2201        case ',': case '|': case '*': case '+': case '?':
2202        case ')':
2203        case '=':
2204        case '\'': case '"':
2205        case '[':
2206        case ' ': case '\t': case '\n': case '\r':
2207        case ';':
2208        case '/':
2209        unread (c);
2210        if (nameBufferPos == 0) {
2211            error ("name expected");
2212        }
2213        // punt on exact tests from Appendix A, but approximate them
2214        if (isName
2215            && !Character.isUnicodeIdentifierStart (
2216                nameBuffer [0])
2217            && ":_".indexOf (nameBuffer [0]) == -1)
2218            error ("Not a name start character, U+"
2219                  + Integer.toHexString (nameBuffer [0]));
2220        String s = intern (nameBuffer, 0, nameBufferPos);
2221        nameBufferPos = 0;
2222        return s;
2223        default:
2224        // punt on exact tests from Appendix A, but approximate them
2225
2226        if ((nameBufferPos != 0 || !isName)
2227            && !Character.isUnicodeIdentifierPart (c)
2228            && ":-_.".indexOf (c) == -1
2229            && !isExtender (c))
2230            error ("Not a name character, U+"
2231                + Integer.toHexString (c));
2232        if (nameBufferPos >= nameBuffer.length)
2233            nameBuffer =
2234            (char[]) extendArray (nameBuffer,
2235                    nameBuffer.length, nameBufferPos);
2236        nameBuffer [nameBufferPos++] = c;
2237        }
2238    }
2239    }
2240
2241    private static boolean isExtender (char c)
2242    {
2243    // [88] Extender ::= ...
2244    return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
2245           || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
2246           || (c >= 0x3031 && c <= 0x3035)
2247           || (c >= 0x309d && c <= 0x309e)
2248           || (c >= 0x30fc && c <= 0x30fe);
2249    }
2250
2251
2252    /**
2253     * Read a literal.  With matching single or double quotes as
2254     * delimiters (and not embedded!) this is used to parse:
2255     * <pre>
2256     *  [9] EntityValue ::= ... ([^%&amp;] | PEReference | Reference)* ...
2257     *  [10] AttValue ::= ... ([^<&] | Reference)* ...
2258     *  [11] SystemLiteral ::= ... (URLchar - "'")* ...
2259     *  [12] PubidLiteral ::= ... (PubidChar - "'")* ...
2260     * </pre>
2261     * as well as the quoted strings in XML and text declarations
2262     * (for version, encoding, and standalone) which have their
2263     * own constraints.
2264     */
2265    private String readLiteral (int flags)
2266    throws SAXException, IOException
2267    {
2268    char    delim, c;
2269    int startLine = line;
2270    boolean saved = expandPE;
2271
2272    // Find the first delimiter.
2273    delim = readCh ();
2274    if (delim != '"' && delim != '\'' && delim != (char) 0) {
2275        error ("expected '\"' or \"'\"", delim, null);
2276        return null;
2277    }
2278    inLiteral = true;
2279    if ((flags & LIT_DISABLE_PE) != 0)
2280        expandPE = false;
2281
2282    // Each level of input source has its own buffer; remember
2283    // ours, so we won't read the ending delimiter from any
2284    // other input source, regardless of entity processing.
2285    char ourBuf [] = readBuffer;
2286
2287    // Read the literal.
2288    try {
2289        c = readCh ();
2290loop:
2291        while (! (c == delim && readBuffer == ourBuf)) {
2292        switch (c) {
2293            // attributes and public ids are normalized
2294            // in almost the same ways
2295        case '\n':
2296        case '\r':
2297            if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0)
2298            c = ' ';
2299            break;
2300        case '\t':
2301            if ((flags & LIT_ATTRIBUTE) != 0)
2302            c = ' ';
2303            break;
2304        case '&':
2305            c = readCh ();
2306            // Char refs are expanded immediately, except for
2307            // all the cases where it's deferred.
2308            if (c == '#') {
2309            if ((flags & LIT_DISABLE_CREF) != 0) {
2310                dataBufferAppend ('&');
2311                continue;
2312            }
2313            parseCharRef ();
2314
2315            // It looks like an entity ref ...
2316            } else {
2317            unread (c);
2318            // Expand it?
2319            if ((flags & LIT_ENTITY_REF) > 0) {
2320                parseEntityRef (false);
2321
2322            // Is it just data?
2323            } else if ((flags & LIT_DISABLE_EREF) != 0) {
2324                dataBufferAppend ('&');
2325
2326            // OK, it will be an entity ref -- expanded later.
2327            } else {
2328                String name = readNmtoken (true);
2329                require (';', "entity reference");
2330                if ((flags & LIT_ENTITY_CHECK) != 0
2331                    && getEntityType (name) ==
2332                        ENTITY_UNDECLARED) {
2333                            // Possibly a validity error, shouldn't report it?
2334                            error ("General entity '" + name
2335                                + "' must be declared before use");
2336                }
2337                dataBufferAppend ('&');
2338                dataBufferAppend (name);
2339                dataBufferAppend (';');
2340            }
2341            }
2342            c = readCh ();
2343            continue loop;
2344
2345        case '<':
2346            // and why?  Perhaps so "&foo;" expands the same
2347            // inside and outside an attribute?
2348            if ((flags & LIT_ATTRIBUTE) != 0)
2349            error ("attribute values may not contain '<'");
2350            break;
2351
2352        // We don't worry about case '%' and PE refs, readCh does.
2353
2354        default:
2355            break;
2356        }
2357        dataBufferAppend (c);
2358        c = readCh ();
2359        }
2360    } catch (EOFException e) {
2361        error ("end of input while looking for delimiter (started on line "
2362           + startLine + ')', null, new Character (delim).toString ());
2363    }
2364    inLiteral = false;
2365    expandPE = saved;
2366
2367    // Normalise whitespace if necessary.
2368    if ((flags & LIT_NORMALIZE) > 0) {
2369        dataBufferNormalize ();
2370    }
2371
2372    // Return the value.
2373    return dataBufferToString ();
2374    }
2375
2376
2377    /**
2378     * Try reading external identifiers.
2379     * A system identifier is not required for notations.
2380     * @param inNotation Are we in a notation?
2381     * @return A two-member String array containing the identifiers.
2382     */
2383    private String[] readExternalIds (boolean inNotation)
2384    throws Exception
2385    {
2386    char    c;
2387    String  ids[] = new String [2];
2388    int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
2389
2390    if (tryRead ("PUBLIC")) {
2391        requireWhitespace ();
2392        ids [0] = readLiteral (LIT_NORMALIZE | LIT_PUBID | flags);
2393        if (inNotation) {
2394        skipWhitespace ();
2395        c = readCh ();
2396        unread (c);
2397        if (c == '"' || c == '\'') {
2398            ids [1] = readLiteral (flags);
2399        }
2400        } else {
2401        requireWhitespace ();
2402        ids [1] = readLiteral (flags);
2403        }
2404
2405        for (int i = 0; i < ids [0].length (); i++) {
2406        c = ids [0].charAt (i);
2407        if (c >= 'a' && c <= 'z')
2408            continue;
2409        if (c >= 'A' && c <= 'Z')
2410            continue;
2411        if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf (c) != -1)
2412            continue;
2413        error ("illegal PUBLIC id character U+"
2414            + Integer.toHexString (c));
2415        }
2416    } else if (tryRead ("SYSTEM")) {
2417        requireWhitespace ();
2418        ids [1] = readLiteral (flags);
2419    }
2420
2421    // XXX should normalize system IDs as follows:
2422    // - Convert to UTF-8
2423    // - Map reserved and non-ASCII characters to %HH
2424
2425    return ids;
2426    }
2427
2428
2429    /**
2430     * Test if a character is whitespace.
2431     * <pre>
2432     * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2433     * </pre>
2434     * @param c The character to test.
2435     * @return true if the character is whitespace.
2436     */
2437    private final boolean isWhitespace (char c)
2438    {
2439    if (c > 0x20)
2440        return false;
2441    if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d)
2442        return true;
2443    return false;   // illegal ...
2444    }
2445
2446
2447    //////////////////////////////////////////////////////////////////////
2448    // Utility routines.
2449    //////////////////////////////////////////////////////////////////////
2450
2451
2452    /**
2453     * Add a character to the data buffer.
2454     */
2455    private void dataBufferAppend (char c)
2456    {
2457    // Expand buffer if necessary.
2458    if (dataBufferPos >= dataBuffer.length)
2459        dataBuffer =
2460        (char[]) extendArray (dataBuffer,
2461            dataBuffer.length, dataBufferPos);
2462    dataBuffer [dataBufferPos++] = c;
2463    }
2464
2465
2466    /**
2467     * Add a string to the data buffer.
2468     */
2469    private void dataBufferAppend (String s)
2470    {
2471    dataBufferAppend (s.toCharArray (), 0, s.length ());
2472    }
2473
2474
2475    /**
2476     * Append (part of) a character array to the data buffer.
2477     */
2478    private void dataBufferAppend (char ch[], int start, int length)
2479    {
2480    dataBuffer = (char[])
2481        extendArray (dataBuffer, dataBuffer.length,
2482                    dataBufferPos + length);
2483
2484    System.arraycopy (ch, start, dataBuffer, dataBufferPos, length);
2485    dataBufferPos += length;
2486    }
2487
2488
2489    /**
2490     * Normalise spaces in the data buffer.
2491     */
2492    private void dataBufferNormalize ()
2493    {
2494    int i = 0;
2495    int j = 0;
2496    int end = dataBufferPos;
2497
2498    // Skip spaces at the start.
2499    while (j < end && dataBuffer [j] == ' ') {
2500        j++;
2501    }
2502
2503    // Skip whitespace at the end.
2504    while (end > j && dataBuffer [end - 1] == ' ') {
2505        end --;
2506    }
2507
2508    // Start copying to the left.
2509    while (j < end) {
2510
2511        char c = dataBuffer [j++];
2512
2513        // Normalise all other whitespace to
2514        // a single space.
2515        if (c == ' ') {
2516        while (j < end && dataBuffer [j++] == ' ') {}
2517
2518        dataBuffer [i++] = ' ';
2519        dataBuffer [i++] = dataBuffer [j - 1];
2520        } else {
2521        dataBuffer [i++] = c;
2522        }
2523    }
2524
2525    // The new length is <= the old one.
2526    dataBufferPos = i;
2527    }
2528
2529
2530    /**
2531     * Convert the data buffer to a string.
2532     */
2533    private String dataBufferToString ()
2534    {
2535    String s = new String (dataBuffer, 0, dataBufferPos);
2536    dataBufferPos = 0;
2537    return s;
2538    }
2539
2540
2541    /**
2542     * Flush the contents of the data buffer to the handler, as
2543     * appropriate, and reset the buffer for new input.
2544     */
2545    private void dataBufferFlush ()
2546    throws SAXException
2547    {
2548    if (currentElementContent == CONTENT_ELEMENTS
2549        && dataBufferPos > 0
2550        && !inCDATA
2551        ) {
2552        // We can't just trust the buffer to be whitespace, there
2553        // are cases when it isn't
2554        for (int i = 0; i < dataBufferPos; i++) {
2555            if (!isWhitespace (dataBuffer [i])) {
2556                handler.charData (dataBuffer, 0, dataBufferPos);
2557                dataBufferPos = 0;
2558            }
2559        }
2560        if (dataBufferPos > 0) {
2561            handler.ignorableWhitespace (dataBuffer, 0, dataBufferPos);
2562            dataBufferPos = 0;
2563        }
2564    } else if (dataBufferPos > 0) {
2565        handler.charData (dataBuffer, 0, dataBufferPos);
2566        dataBufferPos = 0;
2567    }
2568    }
2569
2570
2571    /**
2572     * Require a string to appear, or throw an exception.
2573     * <p><em>Precondition:</em> Entity expansion is not required.
2574     * <p><em>Precondition:</em> data buffer has no characters that
2575     * will get sent to the application.
2576     */
2577    private void require (String delim, String context)
2578    throws SAXException, IOException
2579    {
2580    int length = delim.length ();
2581    char    ch [];
2582        
2583    if (length < dataBuffer.length) {
2584        ch = dataBuffer;
2585        delim.getChars (0, length, ch, 0);
2586    } else
2587        ch = delim.toCharArray ();
2588
2589    if (USE_CHEATS
2590        && length <= (readBufferLength - readBufferPos)) {
2591        int offset = readBufferPos;
2592
2593        for (int i = 0; i < length; i++, offset++)
2594        if (ch [i] != readBuffer [offset])
2595            error ("unexpected characters in " + context, null, delim);
2596        readBufferPos = offset;
2597        
2598    } else {
2599        for (int i = 0; i < length; i++)
2600        require (ch [i], delim);
2601    }
2602    }
2603
2604
2605    /**
2606     * Require a character to appear, or throw an exception.
2607     */
2608    private void require (char delim, String after)
2609    throws SAXException, IOException
2610    {
2611    char c = readCh ();
2612
2613    if (c != delim) {
2614        error ("unexpected character after " + after, c, delim+"");
2615    }
2616    }
2617
2618
2619    /**
2620     * Create an interned string from a character array.
2621     * &AElig;lfred uses this method to create an interned version
2622     * of all names and name tokens, so that it can test equality
2623     * with <code>==</code> instead of <code>String.equals ()</code>.
2624     *
2625     * <p>This is much more efficient than constructing a non-interned
2626     * string first, and then interning it.
2627     *
2628     * @param ch an array of characters for building the string.
2629     * @param start the starting position in the array.
2630     * @param length the number of characters to place in the string.
2631     * @return an interned string.
2632     * @see #intern (String)
2633     * @see java.lang.String#intern
2634     */
2635    public String intern (char ch[], int start, int length)
2636    {
2637    int index = 0;
2638    int hash = 0;
2639    Object  bucket [];
2640
2641    // Generate a hash code.
2642    for (int i = start; i < start + length; i++)
2643        hash = 31 * hash + ch [i];
2644    hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH;
2645
2646    // Get the bucket -- consists of {array,String} pairs
2647    if ((bucket = symbolTable [hash]) == null) {
2648        // first string in this bucket
2649        bucket = new Object [8];
2650
2651    // Search for a matching tuple, and
2652    // return the string if we find one.
2653    } else {
2654        while (index < bucket.length) {
2655        char chFound [] = (char []) bucket [index];
2656
2657        // Stop when we hit a null index.
2658        if (chFound == null)
2659            break;
2660
2661        // If they're the same length, check for a match.
2662        if (chFound.length == length) {
2663            for (int i = 0; i < chFound.length; i++) {
2664            // continue search on failure
2665            if (ch [start + i] != chFound [i]) {
2666                break;
2667            } else if (i == length - 1) {
2668                // That's it, we have a match!
2669                return (String) bucket [index + 1];
2670            }
2671            }
2672        }
2673        index += 2;
2674        }
2675        // Not found -- we'll have to add it.
2676
2677        // Do we have to grow the bucket?
2678        bucket = (Object []) extendArray (bucket, bucket.length, index);
2679    }
2680    symbolTable [hash] = bucket;
2681
2682    // OK, add it to the end of the bucket -- "local" interning.
2683    // Intern "globally" to let applications share interning benefits.
2684    String s = new String (ch, start, length).intern ();
2685    bucket [index] = s.toCharArray ();
2686    bucket [index + 1] = s;
2687    return s;
2688    }
2689
2690
2691    /**
2692     * Ensure the capacity of an array, allocating a new one if
2693     * necessary.  Usually called only a handful of times.
2694     */
2695    private Object extendArray (Object array, int currentSize, int requiredSize)
2696    {
2697    if (requiredSize < currentSize) {
2698        return array;
2699    } else {
2700        Object newArray = null;
2701        int newSize = currentSize * 2;
2702
2703        if (newSize <= requiredSize)
2704        newSize = requiredSize + 1;
2705
2706        if (array instanceof char[])
2707        newArray = new char [newSize];
2708        else if (array instanceof Object[])
2709        newArray = new Object [newSize];
2710        else
2711        throw new RuntimeException ();
2712
2713        System.arraycopy (array, 0, newArray, 0, currentSize);
2714        return newArray;
2715    }
2716    }
2717
2718
2719    //////////////////////////////////////////////////////////////////////
2720    // XML query routines.
2721    //////////////////////////////////////////////////////////////////////
2722
2723
2724    //
2725    // Elements
2726    //
2727
2728    /**
2729     * Get the declared elements for an XML document.
2730     * <p>The results will be valid only after the DTD (if any) has been
2731     * parsed.
2732     * @return An enumeration of all element types declared for this
2733     *   document (as Strings).
2734     * @see #getElementContentType
2735     * @see #getElementContentModel
2736     */
2737    public Enumeration declaredElements ()
2738    {
2739    return elementInfo.keys ();
2740    }
2741
2742
2743    /**
2744     * Look up the content type of an element.
2745     * @param element element info vector
2746     * @param defaultType value for null vector
2747     * @return An integer constant representing the content type.
2748     * @see #CONTENT_UNDECLARED
2749     * @see #CONTENT_ANY
2750     * @see #CONTENT_EMPTY
2751     * @see #CONTENT_MIXED
2752     * @see #CONTENT_ELEMENTS
2753     */
2754    private int getContentType (Object element [], int defaultType)
2755    {
2756    int retval;
2757
2758    if (element == null)
2759        return defaultType;
2760    retval = ((Integer) element [0]).intValue ();
2761    if (retval == CONTENT_UNDECLARED)
2762        retval = defaultType;
2763    return retval;
2764    }
2765
2766
2767    /**
2768     * Look up the content type of an element.
2769     * @param name The element type name.
2770     * @return An integer constant representing the content type.
2771     * @see #getElementContentModel
2772     * @see #CONTENT_UNDECLARED
2773     * @see #CONTENT_ANY
2774     * @see #CONTENT_EMPTY
2775     * @see #CONTENT_MIXED
2776     * @see #CONTENT_ELEMENTS
2777     */
2778    public int getElementContentType (String name)
2779    {
2780    Object element [] = (Object []) elementInfo.get (name);
2781    return getContentType (element, CONTENT_UNDECLARED);
2782    }
2783
2784
2785    /**
2786     * Look up the content model of an element.
2787     * <p>The result will always be null unless the content type is
2788     * CONTENT_ELEMENTS or CONTENT_MIXED.
2789     * @param name The element type name.
2790     * @return The normalised content model, as a string.
2791     * @see #getElementContentType
2792     */
2793    public String getElementContentModel (String name)
2794    {
2795    Object element[] = (Object[]) elementInfo.get (name);
2796    if (element == null) {
2797        return null;
2798    } else {
2799        return (String) element [1];
2800    }
2801    }
2802
2803
2804    /**
2805     * Register an element.
2806     * Array format:
2807     *  [0] element type name
2808     *  [1] content model (mixed, elements only)
2809     *  [2] attribute hash table
2810     */
2811    private void setElement (String name, int contentType,
2812              String contentModel, Hashtable attributes)
2813    throws Exception
2814    {
2815    Object element[] = (Object []) elementInfo.get (name);
2816
2817    // first <!ELEMENT ...> or <!ATTLIST ...> for this type?
2818    if (element == null) {
2819        element = new Object [3];
2820        element [0] = new Integer (contentType);
2821        element [1] = contentModel;
2822        element [2] = attributes;
2823        elementInfo.put (name, element);
2824        return;
2825    } 
2826
2827    // <!ELEMENT ...> declaration?  
2828    if (contentType != CONTENT_UNDECLARED) {
2829        // ... following an associated <!ATTLIST ...>
2830        if (((Integer) element [0]).intValue () == CONTENT_UNDECLARED) {
2831        element [0] = new Integer (contentType);
2832        element [1] = contentModel;
2833        } else {
2834        // VC: Unique Element Type Declaration
2835        //verror ("multiple declarations for element type: " + name);
2836        }
2837    }
2838
2839    // first <!ATTLIST ...>, before <!ELEMENT ...> ?
2840    else if (attributes != null) {
2841        element [2] = attributes;
2842    }
2843
2844    }
2845
2846
2847    /**
2848     * Look up the attribute hash table for an element.
2849     * The hash table is the second item in the element array.
2850     */
2851    private Hashtable getElementAttributes (String name)
2852    {
2853    Object element[] = (Object[]) elementInfo.get (name);
2854    if (element == null) {
2855        return null;
2856    } else {
2857        return (Hashtable) element [2];
2858    }
2859    }
2860
2861
2862
2863    //
2864    // Attributes
2865    //
2866
2867    /**
2868     * Get the declared attributes for an element type.
2869     * @param elname The name of the element type.
2870     * @return An Enumeration of all the attributes declared for
2871     *   a specific element type.  The results will be valid only
2872     *   after the DTD (if any) has been parsed.
2873     * @see #getAttributeType
2874     * @see #getAttributeEnumeration
2875     * @see #getAttributeDefaultValueType
2876     * @see #getAttributeDefaultValue
2877     * @see #getAttributeExpandedValue
2878     */
2879    private Enumeration declaredAttributes (Object element [])
2880    {
2881    Hashtable attlist;
2882
2883    if (element == null)
2884        return null;
2885    if ((attlist = (Hashtable) element [2]) == null)
2886        return null;
2887    return attlist.keys ();
2888    }
2889
2890    /**
2891     * Get the declared attributes for an element type.
2892     * @param elname The name of the element type.
2893     * @return An Enumeration of all the attributes declared for
2894     *   a specific element type.  The results will be valid only
2895     *   after the DTD (if any) has been parsed.
2896     * @see #getAttributeType
2897     * @see #getAttributeEnumeration
2898     * @see #getAttributeDefaultValueType
2899     * @see #getAttributeDefaultValue
2900     * @see #getAttributeExpandedValue
2901     */
2902    public Enumeration declaredAttributes (String elname)
2903    {
2904    return declaredAttributes ((Object []) elementInfo.get (elname));
2905    }
2906
2907
2908    /**
2909     * Retrieve the declared type of an attribute.
2910     * @param name The name of the associated element.
2911     * @param aname The name of the attribute.
2912     * @return An integer constant representing the attribute type.
2913     * @see #ATTRIBUTE_UNDECLARED
2914     * @see #ATTRIBUTE_CDATA
2915     * @see #ATTRIBUTE_ID
2916     * @see #ATTRIBUTE_IDREF
2917     * @see #ATTRIBUTE_IDREFS
2918     * @see #ATTRIBUTE_ENTITY
2919     * @see #ATTRIBUTE_ENTITIES
2920     * @see #ATTRIBUTE_NMTOKEN
2921     * @see #ATTRIBUTE_NMTOKENS
2922     * @see #ATTRIBUTE_ENUMERATED
2923     * @see #ATTRIBUTE_NOTATION
2924     */
2925    public int getAttributeType (String name, String aname)
2926    {
2927    Object attribute[] = getAttribute (name, aname);
2928    if (attribute == null) {
2929        return ATTRIBUTE_UNDECLARED;
2930    } else {
2931        return ((Integer) attribute [0]).intValue ();
2932    }
2933    }
2934
2935
2936    /**
2937     * Retrieve the allowed values for an enumerated attribute type.
2938     * @param name The name of the associated element.
2939     * @param aname The name of the attribute.
2940     * @return A string containing the token list.
2941     * @see #ATTRIBUTE_ENUMERATED
2942     * @see #ATTRIBUTE_NOTATION
2943     */
2944    public String getAttributeEnumeration (String name, String aname)
2945    {
2946    Object attribute[] = getAttribute (name, aname);
2947    if (attribute == null) {
2948        return null;
2949    } else {
2950        return (String) attribute [3];
2951    }
2952    }
2953
2954
2955    /**
2956     * Retrieve the default value of a declared attribute.
2957     * @param name The name of the associated element.
2958     * @param aname The name of the attribute.
2959     * @return The default value, or null if the attribute was
2960     *   #IMPLIED or simply undeclared and unspecified.
2961     * @see #getAttributeExpandedValue
2962     */
2963    public String getAttributeDefaultValue (String name, String aname)
2964    {
2965    Object attribute[] = getAttribute (name, aname);
2966    if (attribute == null) {
2967        return null;
2968    } else {
2969        return (String) attribute [1];
2970    }
2971    }
2972
2973
2974    /**
2975     * Retrieve the expanded value of a declared attribute.
2976     * <p>General entities (and char refs) will be expanded (once).
2977     * @param name The name of the associated element.
2978     * @param aname The name of the attribute.
2979     * @return The expanded default value, or null if the attribute was
2980     *   #IMPLIED or simply undeclared
2981     * @see #getAttributeDefaultValue
2982     */
2983    public String getAttributeExpandedValue (String name, String aname)
2984    throws Exception
2985    {
2986    Object attribute[] = getAttribute (name, aname);
2987
2988    if (attribute == null) {
2989        return null;
2990    } else if (attribute [4] == null && attribute [1] != null) {
2991        // we MUST use the same buf for both quotes else the literal
2992        // can't be properly terminated
2993        char buf [] = new char [1];
2994        int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE;
2995        int type = getAttributeType (name, aname);
2996
2997        if (type != ATTRIBUTE_CDATA && type != ATTRIBUTE_UNDECLARED)
2998        flags |= LIT_NORMALIZE;
2999        buf [0] = '"';
3000        pushCharArray (null, buf, 0, 1);
3001        pushString (null, (String) attribute [1]);
3002        pushCharArray (null, buf, 0, 1);
3003        attribute [4] = readLiteral (flags);
3004    }
3005    return (String) attribute [4];
3006    }
3007
3008
3009    /**
3010     * Retrieve the default value type of a declared attribute.
3011     * @see #ATTRIBUTE_DEFAULT_SPECIFIED
3012     * @see #ATTRIBUTE_DEFAULT_IMPLIED
3013     * @see #ATTRIBUTE_DEFAULT_REQUIRED
3014     * @see #ATTRIBUTE_DEFAULT_FIXED
3015     */
3016    public int getAttributeDefaultValueType (String name, String aname)
3017    {
3018    Object attribute[] = getAttribute (name, aname);
3019    if (attribute == null) {
3020        return ATTRIBUTE_DEFAULT_UNDECLARED;
3021    } else {
3022        return ((Integer) attribute [2]).intValue ();
3023    }
3024    }
3025
3026
3027    /**
3028     * Register an attribute declaration for later retrieval.
3029     * Format:
3030     * - String type
3031     * - String default value
3032     * - int value type
3033     */
3034    private void setAttribute (String elName, String name, int type,
3035            String enumeration,
3036            String value, int valueType)
3037    throws Exception
3038    {
3039    Hashtable attlist;
3040
3041    // Create a new hashtable if necessary.
3042    attlist = getElementAttributes (elName);
3043    if (attlist == null) {
3044        attlist = new Hashtable ();
3045    }
3046
3047    // ignore multiple attribute declarations!
3048    if (attlist.get (name) != null) {
3049        // warn ...
3050        return;
3051    } else {
3052        Object[] attribute = new Object [5];
3053        attribute [0] = new Integer (type);
3054        attribute [1] = value;
3055        attribute [2] = new Integer (valueType);
3056        attribute [3] = enumeration;
3057        attribute [4] = null;
3058        attlist.put (name, attribute);
3059
3060        // save; but don't overwrite any existing <!ELEMENT ...>
3061        setElement (elName, CONTENT_UNDECLARED, null, attlist);
3062    }
3063    }
3064
3065
3066    /**
3067     * Retrieve the five-member array representing an
3068     * attribute declaration.
3069     */
3070    private Object[] getAttribute (String elName, String name)
3071    {
3072        Hashtable attlist = getElementAttributes (elName);
3073        if (attlist == null) {
3074            return null;
3075        }
3076
3077        return (Object[]) attlist.get (name);
3078    }
3079
3080
3081    //
3082    // Entities
3083    //
3084
3085    /**
3086     * Get declared entities.
3087     * @return An Enumeration of all the entities declared for
3088     *   this XML document.  The results will be valid only
3089     *   after the DTD (if any) has been parsed.
3090     * @see #getEntityType
3091     * @see #getEntityPublicId
3092     * @see #getEntitySystemId
3093     * @see #getEntityValue
3094     * @see #getEntityNotationName
3095     */
3096    public Enumeration declaredEntities ()
3097    {
3098    return entityInfo.keys ();
3099    }
3100
3101
3102    /**
3103     * Find the type of an entity.
3104     * @returns An integer constant representing the entity type.
3105     * @see #ENTITY_UNDECLARED
3106     * @see #ENTITY_INTERNAL
3107     * @see #ENTITY_NDATA
3108     * @see #ENTITY_TEXT
3109     */
3110    public int getEntityType (String ename)
3111    {
3112    Object entity[] = (Object[]) entityInfo.get (ename);
3113    if (entity == null) {
3114        return ENTITY_UNDECLARED;
3115    } else {
3116        return ((Integer) entity [0]).intValue ();
3117    }
3118    }
3119
3120
3121    /**
3122     * Return an external entity's public identifier, if any.
3123     * @param ename The name of the external entity.
3124     * @return The entity's system identifier, or null if the
3125     *   entity was not declared, if it is not an
3126     *   external entity, or if no public identifier was
3127     *   provided.
3128     * @see #getEntityType
3129     */
3130    public String getEntityPublicId (String ename)
3131    {
3132    Object entity[] = (Object[]) entityInfo.get (ename);
3133    if (entity == null) {
3134        return null;
3135    } else {
3136        return (String) entity [1];
3137    }
3138    }
3139
3140
3141    /**
3142     * Return an external entity's system identifier.
3143     * @param ename The name of the external entity.
3144     * @return The entity's system identifier, or null if the
3145     *   entity was not declared, or if it is not an
3146     *   external entity. Change made by MHK: The system identifier
3147     *   is returned as an absolute URL, resolved relative to the entity
3148     *   it was contained in.
3149     * @see #getEntityType
3150     */
3151    public String getEntitySystemId (String ename) 
3152    {
3153        Object entity[] = (Object[]) entityInfo.get (ename);
3154        if (entity == null) {
3155            return null;
3156        } else {
3157            try {
3158                String relativeURI = (String)entity [2];
3159                URL baseURI = (URL)entity [5];
3160                if (baseURI==null) return relativeURI;
3161                URL absoluteURI = new URL( baseURI, relativeURI );
3162                return absoluteURI.toString();
3163            } catch (IOException err) {
3164                // ignore the exception, a user entity resolver may be able
3165                // to do something; if not, the error will be caught later
3166                return (String)entity [2];
3167            }
3168        }
3169    }
3170
3171
3172    /**
3173     * Return the value of an internal entity.
3174     * @param ename The name of the internal entity.
3175     * @return The entity's value, or null if the entity was
3176     *   not declared, or if it is not an internal entity.
3177     * @see #getEntityType
3178     */
3179    public String getEntityValue (String ename)
3180    {
3181    Object entity[] = (Object[]) entityInfo.get (ename);
3182    if (entity == null) {
3183        return null;
3184    } else {
3185        return (String) entity [3];
3186    }
3187    }
3188
3189
3190    /**
3191     * Get the notation name associated with an NDATA entity.
3192     * @param ename The NDATA entity name.
3193     * @return The associated notation name, or null if the
3194     *   entity was not declared, or if it is not an
3195     *   NDATA entity.
3196     * @see #getEntityType
3197     */
3198    public String getEntityNotationName (String eName)
3199    {
3200    Object entity[] = (Object[]) entityInfo.get (eName);
3201    if (entity == null) {
3202        return null;
3203    } else {
3204        return (String) entity [4];
3205    }
3206    }
3207
3208
3209    /**
3210     * Register an entity declaration for later retrieval.
3211     */
3212    private void setInternalEntity (String eName, String value)
3213    {
3214    setEntity (eName, ENTITY_INTERNAL, null, null, value, null);
3215    }
3216
3217
3218    /**
3219     * Register an external data entity.
3220     */
3221    private void setExternalDataEntity (String eName, String pubid,
3222                 String sysid, String nName)
3223    {
3224    setEntity (eName, ENTITY_NDATA, pubid, sysid, null, nName);
3225    }
3226
3227
3228    /**
3229     * Register an external text entity.
3230     */
3231    private void setExternalTextEntity (String eName,
3232            String pubid, String sysid)
3233    {
3234    setEntity (eName, ENTITY_TEXT, pubid, sysid, null, null);
3235    }
3236
3237
3238    /**
3239     * Register an entity declaration for later retrieval.
3240     */
3241    private void setEntity (String eName, int eClass,
3242             String pubid, String sysid,
3243             String value, String nName)
3244    {
3245    Object entity[];
3246
3247    if (entityInfo.get (eName) == null) {
3248        entity = new Object [6];
3249        entity [0] = new Integer (eClass);
3250        entity [1] = pubid;
3251        entity [2] = sysid;
3252        entity [3] = value;
3253        entity [4] = nName;
3254        entity [5] = (externalEntity == null ? null : externalEntity.getURL());    
3255                        // added MHK: provides base URI for resolution
3256
3257        entityInfo.put (eName, entity);
3258    }
3259    }
3260
3261
3262    //
3263    // Notations.
3264    //
3265
3266    /**
3267     * Get declared notations.
3268     * @return An Enumeration of all the notations declared for
3269     *   this XML document.  The results will be valid only
3270     *   after the DTD (if any) has been parsed.
3271     * @see #getNotationPublicId
3272     * @see #getNotationSystemId
3273     */
3274    public Enumeration declaredNotations ()
3275    {
3276    return notationInfo.keys ();
3277    }
3278
3279
3280    /**
3281     * Look up the public identifier for a notation.
3282     * You will normally use this method to look up a notation
3283     * that was provided as an attribute value or for an NDATA entity.
3284     * @param nname The name of the notation.
3285     * @return A string containing the public identifier, or null
3286     *   if none was provided or if no such notation was
3287     *   declared.
3288     * @see #getNotationSystemId
3289     */
3290    public String getNotationPublicId (String nname)
3291    {
3292    Object notation[] = (Object[]) notationInfo.get (nname);
3293    if (notation == null) {
3294        return null;
3295    } else {
3296        return (String) notation [0];
3297    }
3298    }
3299
3300
3301    /**
3302     * Look up the system identifier for a notation.
3303     * You will normally use this method to look up a notation
3304     * that was provided as an attribute value or for an NDATA entity.
3305     * @param nname The name of the notation.
3306     * @return A string containing the system identifier, or null
3307     *   if no such notation was declared.
3308     * @see #getNotationPublicId
3309     */
3310    public String getNotationSystemId (String nname)
3311    {
3312    Object notation[] = (Object[]) notationInfo.get (nname);
3313    if (notation == null) {
3314        return null;
3315    } else {
3316        return (String) notation [1];
3317    }
3318    }
3319
3320
3321    /**
3322     * Register a notation declaration for later retrieval.
3323     * Format:
3324     * - public id
3325     * - system id
3326     */
3327    private void setNotation (String nname, String pubid, String sysid)
3328    throws Exception
3329    {
3330    Object notation[];
3331
3332    if (notationInfo.get (nname) == null) {
3333        notation = new Object [2];
3334        notation [0] = pubid;
3335        notation [1] = sysid;
3336        notationInfo.put (nname, notation);
3337    } else {
3338        // VC: Unique Notation Name
3339        // (it's not fatal)
3340    }
3341    }
3342
3343
3344    //
3345    // Location.
3346    //
3347
3348
3349    /**
3350     * Return the current line number.
3351     */
3352    public int getLineNumber ()
3353    {
3354    return line;
3355    }
3356
3357
3358    /**
3359     * Return the current column number.
3360     */
3361    public int getColumnNumber ()
3362    {
3363    return column;
3364    }
3365
3366
3367    //////////////////////////////////////////////////////////////////////
3368    // High-level I/O.
3369    //////////////////////////////////////////////////////////////////////
3370
3371
3372    /**
3373     * Read a single character from the readBuffer.
3374     * <p>The readDataChunk () method maintains the buffer.
3375     * <p>If we hit the end of an entity, try to pop the stack and
3376     * keep going.
3377     * <p> (This approach doesn't really enforce XML's rules about
3378     * entity boundaries, but this is not currently a validating
3379     * parser).
3380     * <p>This routine also attempts to keep track of the current
3381     * position in external entities, but it's not entirely accurate.
3382     * @return The next available input character.
3383     * @see #unread (char)
3384     * @see #unread (String)
3385     * @see #readDataChunk
3386     * @see #readBuffer
3387     * @see #line
3388     * @return The next character from the current input source.
3389     */
3390    private char readCh ()
3391    throws SAXException, IOException
3392    {
3393
3394    // As long as there's nothing in the
3395    // read buffer, try reading more data
3396    // (for an external entity) or popping
3397    // the entity stack (for either).
3398    while (readBufferPos >= readBufferLength) {
3399        switch (sourceType) {
3400        case INPUT_READER:
3401        case INPUT_STREAM:
3402        readDataChunk ();
3403        while (readBufferLength < 1) {
3404            popInput ();
3405            if (readBufferLength < 1) {
3406            readDataChunk ();
3407            }
3408        }
3409        break;
3410
3411        default:
3412
3413        popInput ();
3414        break;
3415        }
3416    }
3417
3418    char c = readBuffer [readBufferPos++];
3419
3420    if (c == '\n') {
3421        line++;
3422        column = 0;
3423    } else {
3424        if (c == '<') {
3425        /* the most common  return to parseContent () .. NOP */ ;
3426        } else if ((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD)
3427        error ("illegal XML character U+"
3428            + Integer.toHexString (c));
3429
3430        // If we're in the DTD and in a context where PEs get expanded,
3431        // do so ... 1/14/2000 errata identify those contexts.  There
3432        // are also spots in the internal subset where PE refs are fatal
3433        // errors, hence yet another flag.
3434        else if (c == '%' && expandPE) {
3435        if (peIsError && entityStack.size()==1)
3436            // not an error if PE reference is in an external PE called from internal subset
3437            error ("PE reference within declaration in internal subset.");
3438        parsePEReference ();
3439        return readCh ();
3440        }
3441        column++;
3442    }
3443
3444    return c;
3445    }
3446
3447
3448    /**
3449     * Push a single character back onto the current input stream.
3450     * <p>This method usually pushes the character back onto
3451     * the readBuffer, while the unread (String) method treats the
3452     * string as a new internal entity.
3453     * <p>I don't think that this would ever be called with 
3454     * readBufferPos = 0, because the methods always reads a character
3455     * before unreading it, but just in case, I've added a boundary
3456     * condition.
3457     * @param c The character to push back.
3458     * @see #readCh
3459     * @see #unread (String)
3460     * @see #unread (char[])
3461     * @see #readBuffer
3462     */
3463    private void unread (char c)
3464    throws SAXException
3465    {
3466    // Normal condition.
3467    if (c == '\n') {
3468        line--;
3469        column = -1;
3470    }
3471    if (readBufferPos > 0) {
3472        readBuffer [--readBufferPos] = c;
3473    } else {
3474        pushString (null, new Character (c).toString ());
3475    }
3476    }
3477
3478
3479    /**
3480     * Push a char array back onto the current input stream.
3481     * <p>NOTE: you must <em>never</em> push back characters that you
3482     * haven't actually read: use pushString () instead.
3483     * @see #readCh
3484     * @see #unread (char)
3485     * @see #unread (String)
3486     * @see #readBuffer
3487     * @see #pushString
3488     */
3489    private void unread (char ch[], int length)
3490    throws SAXException
3491    {
3492    for (int i = 0; i < length; i++) {
3493        if (ch [i] == '\n') {
3494        line--;
3495        column = -1;
3496        }
3497    }
3498    if (length < readBufferPos) {
3499        readBufferPos -= length;
3500    } else {
3501        pushCharArray (null, ch, 0, length);
3502        sourceType = INPUT_BUFFER;
3503    }
3504    }
3505
3506
3507    /**
3508     * Push a new external input source.
3509     * The source will be some kind of parsed entity, such as a PE
3510     * (including the external DTD subset) or content for the body.
3511     * <p>TODO: Right now, this method always attempts to autodetect
3512     * the encoding; in the future, it should allow the caller to 
3513     * request an encoding explicitly, and it should also look at the
3514     * headers with an HTTP connection.
3515     * @param url The java.net.URL object for the entity.
3516     * @see SAXDriver#resolveEntity
3517     * @see #pushString
3518     * @see #sourceType
3519     * @see #pushInput
3520     * @see #detectEncoding
3521     * @see #sourceType
3522     * @see #readBuffer
3523     */
3524    private void pushURL (
3525    String      ename,
3526    String      publicId,
3527    String      systemId,
3528    Reader      reader,
3529    InputStream stream,
3530    String      encoding,
3531    boolean     isAbsolute
3532    ) throws SAXException, IOException
3533    {
3534    boolean ignoreEncoding = false;
3535
3536    // Push the existing status.
3537    pushInput (ename);
3538
3539    // Create a new read buffer.
3540    // (Note the four-character margin)
3541    readBuffer = new char [READ_BUFFER_MAX + 4];
3542    readBufferPos = 0;
3543    readBufferLength = 0;
3544    readBufferOverflow = -1;
3545    is = null;
3546    line = 1;
3547    column = 0;
3548    currentByteCount = 0;
3549
3550    if (!isAbsolute) {
3551
3552        // Make any system ID (URI/URL) absolute.  There's one case
3553        // where it may be null:  parser was invoked without providing
3554        // one, e.g. since the XML data came from a memory buffer.
3555        try {
3556            if (systemId != null && externalEntity != null) {
3557                systemId = new URL (externalEntity.getURL (), systemId).toString ();
3558            } else if (baseURI != null) {
3559                systemId = new URL (new URL (baseURI), systemId).toString ();
3560                // throws IOException if couldn't create new URL
3561            }
3562        } catch(java.io.IOException err) {
3563            popInput();
3564            error("Invalid URL " + systemId + " (" + err.getMessage() + ")");
3565        }
3566    }
3567
3568    // See if the application wants to
3569    // redirect the system ID and/or
3570    // supply its own character stream.
3571    if (reader == null && stream == null && systemId != null) {
3572        Object input = null;
3573        try {
3574            input = handler.resolveEntity (publicId, systemId);
3575        } catch (java.io.IOException err) {
3576            popInput();
3577            error("Failure resolving entity " + systemId + " (" + err.getMessage() + ")");
3578        }
3579        if (input != null) {
3580            if (input instanceof String) {
3581                systemId = (String) input;
3582                isAbsolute = true;
3583            } else if (input instanceof InputStream) {
3584                stream = (InputStream) input;
3585            } else if (input instanceof Reader) {
3586                reader = (Reader) input;
3587            }
3588        } 
3589    }
3590    
3591    // Start the entity.
3592    if (systemId != null) {
3593        handler.startExternalEntity (systemId);
3594    } else {
3595        handler.startExternalEntity ("[unidentified data stream]");
3596    }
3597
3598    // If there's an explicit character stream, just
3599    // ignore encoding declarations.
3600    if (reader != null) {
3601        sourceType = INPUT_READER;
3602        this.reader = reader;
3603        tryEncodingDecl (true);
3604        return;
3605    }
3606    
3607    // Else we handle the conversion, and need to ensure
3608    // it's done right.
3609    sourceType = INPUT_STREAM;
3610    if (stream != null) {       
3611        is = stream;
3612    } else {
3613        // We have to open our own stream to the URL.
3614        URL url = new URL (systemId);
3615        try {
3616            externalEntity = url.openConnection ();
3617            externalEntity.connect ();
3618            is = externalEntity.getInputStream ();
3619        } catch (java.io.IOException err) {
3620            popInput();
3621            error("Cannot read from " + systemId + 
3622                (systemId.equals(err.getMessage()) ? "" : " (" + err.getMessage() + ")"));
3623        }
3624    }
3625
3626    // If we get to here, there must be
3627    // an InputStream available.
3628    if (!is.markSupported ()) {
3629        is = new BufferedInputStream (is);
3630    }
3631
3632    // Get any external encoding label.
3633    if (encoding == null && externalEntity != null) {
3634        // External labels can be untrustworthy; filesystems in
3635        // particular often have the wrong default for content
3636        // that wasn't locally originated.  Those we autodetect.
3637        if (!"file".equals (externalEntity.getURL ().getProtocol ())) {
3638        int temp;
3639
3640        // application/xml;charset=something;otherAttr=...
3641        // ... with many variants on 'something'
3642        encoding = externalEntity.getContentType ();
3643
3644        // MHK code (fix for Saxon 5.5.1/007): protect against encoding==null
3645        if (encoding==null) {
3646            temp = -1;
3647        } else {
3648            temp = encoding.indexOf ("charset");
3649        }
3650
3651        // RFC 2376 sez MIME text defaults to ASCII, but since the
3652        // JDK will create a MIME type out of thin air, we always
3653        // autodetect when there's no explicit charset attribute.
3654        if (temp < 0)
3655            encoding = null;    // autodetect
3656        else {
3657            temp = encoding.indexOf ('=', temp + 7); 
3658            encoding = encoding.substring (temp+1);   // +1 added by MHK 2 April 2001
3659            if ((temp = encoding.indexOf (';')) > 0)
3660            encoding = encoding.substring (0, temp);
3661
3662            // attributes can have comment fields (RFC 822)
3663            if ((temp = encoding.indexOf ('(')) > 0)
3664            encoding = encoding.substring (0, temp);
3665            // ... and values may be quoted
3666            if ((temp = encoding.indexOf ('"')) > 0)
3667            encoding = encoding.substring (temp + 1,
3668                encoding.indexOf ('"', temp + 2));
3669            encoding.trim ();
3670        }
3671        }
3672    }
3673
3674    // if we got an external encoding label, use it ...
3675    if (encoding != null) {
3676        this.encoding = ENCODING_EXTERNAL;
3677        setupDecoding (encoding);
3678        ignoreEncoding = true;
3679    
3680    // ... else autodetect
3681    } else {
3682        detectEncoding ();
3683        ignoreEncoding = false;
3684    }
3685    is.mark(100);
3686
3687    // Read any XML or text declaration.
3688    try {
3689        tryEncodingDecl (ignoreEncoding);
3690    } catch (EncodingException x) {
3691        encoding = x.getMessage ();
3692
3693        // if we don't handle the declared encoding,
3694        // try letting a JVM InputStreamReader do it
3695        try {
3696        if (sourceType != INPUT_STREAM)
3697            throw x;
3698
3699        is.reset ();
3700        readBufferPos = 0;
3701        readBufferLength = 0;
3702        readBufferOverflow = -1;
3703        line = 1;
3704        currentByteCount = column = 0;
3705
3706        sourceType = INPUT_READER;
3707        this.reader = new InputStreamReader (is, encoding);
3708        is = null;
3709
3710        tryEncodingDecl (true);
3711
3712        } catch (IOException e) {
3713        error ("unsupported text encoding",
3714               encoding,
3715               null);
3716        }
3717    }
3718    }
3719
3720
3721    /**
3722     * Check for an encoding declaration.  This is the second part of the
3723     * XML encoding autodetection algorithm, relying on detectEncoding to
3724     * get to the point that this part can read any encoding declaration
3725     * in the document (using only US-ASCII characters).
3726     *
3727     * <p> Because this part starts to fill parser buffers with this data,
3728     * it's tricky to to a reader so that Java's built-in decoders can be
3729     * used for the character encodings that aren't built in to this parser
3730     * (such as EUC-JP, KOI8-R, Big5, etc).
3731     *
3732     * @return any encoding in the declaration, uppercased; or null
3733     * @see detectEncoding
3734     */
3735    private String tryEncodingDecl (boolean ignoreEncoding)
3736    throws SAXException, IOException
3737    {
3738    // Read the XML/text declaration.
3739    if (tryRead ("<?xml")) {
3740        dataBufferFlush ();
3741        if (tryWhitespace ()) {
3742        if (inputStack.size () > 0) {
3743            return parseTextDecl (ignoreEncoding);
3744        } else {
3745            return parseXMLDecl (ignoreEncoding);
3746        }
3747        } else {
3748        unread ("xml".toCharArray (), 3);
3749        parsePI ();
3750        }
3751    }
3752    return null;
3753    }
3754
3755
3756    /**
3757     * Attempt to detect the encoding of an entity.
3758     * <p>The trick here (as suggested in the XML standard) is that
3759     * any entity not in UTF-8, or in UCS-2 with a byte-order mark, 
3760     * <b>must</b> begin with an XML declaration or an encoding
3761     * declaration; we simply have to look for "&lt;?xml" in various
3762     * encodings.
3763     * <p>This method has no way to distinguish among 8-bit encodings.
3764     * Instead, it sets up for UTF-8, then (possibly) revises its assumption
3765     * later in setupDecoding ().  Any ASCII-derived 8-bit encoding
3766     * should work, but most will be rejected later by setupDecoding ().
3767     * <p>I don't currently detect EBCDIC, since I'm concerned that it
3768     * could also be a valid UTF-8 sequence; I'll have to do more checking
3769     * later.
3770     * <p>MHK Nov 2001: modified to handle a BOM on UTF-8 files, which is
3771     * allowed by XML 2nd edition, and generated when Windows Notepad does
3772     * "save as UTF-8".
3773     * @see #tryEncoding (byte[], byte, byte, byte, byte)
3774     * @see #tryEncoding (byte[], byte, byte)
3775     * @see #setupDecoding
3776     * @see #read8bitEncodingDeclaration
3777     */
3778    private void detectEncoding ()
3779    throws SAXException, IOException
3780    {
3781    byte signature[] = new byte [4];
3782
3783    // Read the first four bytes for
3784    // autodetection.
3785    is.mark (4);
3786    is.read (signature);
3787    is.reset ();
3788
3789    //
3790    // FIRST:  four byte encodings (who uses these?)
3791    //
3792    if (tryEncoding (signature, (byte) 0x00, (byte) 0x00,
3793              (byte) 0x00, (byte) 0x3c)) {
3794        // UCS-4 must begin with "<?xml"
3795        // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234)
3796        encoding = ENCODING_UCS_4_1234;
3797
3798    } else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x00,
3799                 (byte) 0x00, (byte) 0x00)) {
3800        // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321)
3801        encoding = ENCODING_UCS_4_4321;
3802
3803    } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x00,
3804                 (byte) 0x3c, (byte) 0x00)) {
3805        // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143)
3806        encoding = ENCODING_UCS_4_2143;
3807
3808    } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x3c,
3809                 (byte) 0x00, (byte) 0x00)) {
3810        // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421)
3811        encoding = ENCODING_UCS_4_3412;
3812
3813        // 00 00 fe ff UCS_4_1234 (with BOM)
3814        // ff fe 00 00 UCS_4_4321 (with BOM)
3815    }
3816    
3817    // SECOND: three byte signature:
3818    // look for UTF-8 byte order mark 3C 3F 78, allowed by XML 1.0 2nd edition
3819    
3820    else if (tryEncoding (signature, (byte)0xef, (byte)0xbb, (byte)0xbf)) {
3821        encoding = ENCODING_UTF_8;
3822        is.read(); is.read(); is.read();
3823    }
3824
3825    //
3826    // THIRD:  two byte encodings
3827    // note ... with 1/14/2000 errata the XML spec identifies some
3828    // more "broken UTF-16" autodetection cases, with no XML decl,
3829    // which we don't handle here (that's legal too).
3830    //
3831    else if (tryEncoding (signature, (byte) 0xfe, (byte) 0xff)) {
3832        // UCS-2 with a byte-order marker. (UTF-16)
3833        // 0xfe 0xff: UCS-2, big-endian (12)
3834        encoding = ENCODING_UCS_2_12;
3835        is.read (); is.read ();
3836
3837    } else if (tryEncoding (signature, (byte) 0xff, (byte) 0xfe)) {
3838        // UCS-2 with a byte-order marker. (UTF-16)
3839        // 0xff 0xfe: UCS-2, little-endian (21)
3840        encoding = ENCODING_UCS_2_21;
3841        is.read (); is.read ();
3842
3843    } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x3c,
3844                 (byte) 0x00, (byte) 0x3f)) {
3845        // UTF-16-BE (otherwise, malformed UTF-16)
3846        // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark
3847        encoding = ENCODING_UCS_2_12;
3848        error ("no byte-order mark for UCS-2 entity");
3849
3850    } else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x00,
3851                 (byte) 0x3f, (byte) 0x00)) {
3852        // UTF-16-LE (otherwise, malformed UTF-16)
3853        // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark
3854        encoding = ENCODING_UCS_2_21;
3855        error ("no byte-order mark for UCS-2 entity");
3856    }
3857
3858    //
3859    // THIRD:  ASCII-derived encodings, fixed and variable lengths
3860    //
3861    else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x3f,
3862                   (byte) 0x78, (byte) 0x6d)) {
3863        // ASCII derived
3864        // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING)
3865        encoding = ENCODING_UTF_8;
3866        read8bitEncodingDeclaration ();
3867
3868    } else {
3869        // 4c 6f a7 94 ... we don't understand EBCDIC flavors
3870        // ... but we COULD at least kick in some fixed code page
3871
3872        // (default) UTF-8 without encoding/XML declaration
3873        encoding = ENCODING_UTF_8;
3874    }
3875    }
3876
3877
3878    /**
3879     * Check for a four-byte signature.
3880     * <p>Utility routine for detectEncoding ().
3881     * <p>Always looks for some part of "<?XML" in a specific encoding.
3882     * @param sig The first four bytes read.
3883     * @param b1 The first byte of the signature
3884     * @param b2 The second byte of the signature
3885     * @param b3 The third byte of the signature
3886     * @param b4 The fourth byte of the signature
3887     * @see #detectEncoding
3888     */
3889    private static boolean tryEncoding (
3890    byte sig[], byte b1, byte b2, byte b3, byte b4)
3891    {
3892    return (sig [0] == b1 && sig [1] == b2
3893        && sig [2] == b3 && sig [3] == b4);
3894    }
3895
3896
3897    /**
3898     * Check for a two-byte signature.
3899     * <p>Looks for a UCS-2 byte-order mark.
3900     * <p>Utility routine for detectEncoding ().
3901     * @param sig The first four bytes read.
3902     * @param b1 The first byte of the signature
3903     * @param b2 The second byte of the signature
3904     * @see #detectEncoding
3905     */
3906    private static boolean tryEncoding (byte sig[], byte b1, byte b2)
3907    {
3908    return ((sig [0] == b1) && (sig [1] == b2));
3909    }
3910
3911    /**
3912     * Check for a three-byte signature.
3913     * <p>Looks for a UTF-8 byte-order mark.
3914     * <p>Utility routine for detectEncoding ().
3915     * @param sig The first four bytes read.
3916     * @param b1 The first byte of the signature
3917     * @param b2 The second byte of the signature
3918     * @param b3 The second byte of the signature
3919     * @see #detectEncoding
3920     */
3921    private static boolean tryEncoding (byte sig[], byte b1, byte b2, byte b3)
3922    {
3923    return ((sig [0] == b1) && (sig [1] == b2) && (sig [2] == b3));
3924    }
3925
3926    /**
3927     * This method pushes a string back onto input.
3928     * <p>It is useful either as the expansion of an internal entity, 
3929     * or for backtracking during the parse.
3930     * <p>Call pushCharArray () to do the actual work.
3931     * @param s The string to push back onto input.
3932     * @see #pushCharArray
3933     */
3934    private void pushString (String ename, String s)
3935    throws SAXException
3936    {
3937    char ch[] = s.toCharArray ();
3938    pushCharArray (ename, ch, 0, ch.length);
3939    }
3940
3941
3942    /**
3943     * Push a new internal input source.
3944     * <p>This method is useful for expanding an internal entity,
3945     * or for unreading a string of characters.  It creates a new
3946     * readBuffer containing the characters in the array, instead
3947     * of characters converted from an input byte stream.
3948     * @param ch The char array to push.
3949     * @see #pushString
3950     * @see #pushURL
3951     * @see #readBuffer
3952     * @see #sourceType
3953     * @see #pushInput
3954     */
3955    private void pushCharArray (String ename, char ch[], int start, int length)
3956    throws SAXException
3957    {
3958    // Push the existing status
3959    pushInput (ename);
3960    sourceType = INPUT_INTERNAL;
3961    readBuffer = ch;
3962    readBufferPos = start;
3963    readBufferLength = length;
3964    readBufferOverflow = -1;
3965    }
3966
3967
3968    /**
3969     * Save the current input source onto the stack.
3970     * <p>This method saves all of the global variables associated with
3971     * the current input source, so that they can be restored when a new
3972     * input source has finished.  It also tests for entity recursion.
3973     * <p>The method saves the following global variables onto a stack
3974     * using a fixed-length array:
3975     * <ol>
3976     * <li>sourceType
3977     * <li>externalEntity
3978     * <li>readBuffer
3979     * <li>readBufferPos
3980     * <li>readBufferLength
3981     * <li>line
3982     * <li>encoding
3983     * </ol>
3984     * @param ename The name of the entity (if any) causing the new input.
3985     * @see #popInput
3986     * @see #sourceType
3987     * @see #externalEntity
3988     * @see #readBuffer
3989     * @see #readBufferPos
3990     * @see #readBufferLength
3991     * @see #line
3992     * @see #encoding
3993     */
3994    private void pushInput (String ename)
3995    throws SAXException
3996    {
3997    Object input[] = new Object [12];
3998
3999    // Check for entity recursion.
4000    if (ename != null) {
4001        Enumeration entities = entityStack.elements ();
4002        while (entities.hasMoreElements ()) {
4003        String e = (String) entities.nextElement ();
4004        if (e == ename) {
4005            error ("recursive reference to entity", ename, null);
4006        }
4007        }
4008    }
4009    entityStack.push (ename);
4010
4011    // Don't bother if there is no current input.
4012    if (sourceType == INPUT_NONE) {
4013        return;
4014    }
4015
4016    // Set up a snapshot of the current
4017    // input source.
4018    input [0] = new Integer (sourceType);
4019    input [1] = externalEntity;
4020    input [2] = readBuffer;
4021    input [3] = new Integer (readBufferPos);
4022    input [4] = new Integer (readBufferLength);
4023    input [5] = new Integer (line);
4024    input [6] = new Integer (encoding);
4025    input [7] = new Integer (readBufferOverflow);
4026    input [8] = is;
4027    input [9] = new Integer (currentByteCount);
4028    input [10] = new Integer (column);
4029    input [11] = reader;
4030
4031    // Push it onto the stack.
4032    inputStack.push (input);
4033    }
4034
4035
4036    /**
4037     * Restore a previous input source.
4038     * <p>This method restores all of the global variables associated with
4039     * the current input source.
4040     * @exception java.io.EOFException
4041     *    If there are no more entries on the input stack.
4042     * @see #pushInput
4043     * @see #sourceType
4044     * @see #externalEntity
4045     * @see #readBuffer
4046     * @see #readBufferPos
4047     * @see #readBufferLength
4048     * @see #line
4049     * @see #encoding
4050     */
4051    private void popInput ()
4052    throws SAXException, IOException
4053    {
4054    String uri;
4055
4056    if (externalEntity != null)
4057        uri = externalEntity.getURL ().toString ();
4058    else
4059        uri = baseURI;
4060
4061    switch (sourceType) {
4062    case INPUT_STREAM:
4063        if (is!=null) {
4064            if (uri != null) {
4065                handler.endExternalEntity (baseURI);
4066            }
4067            is.close ();
4068        }
4069        break;
4070    case INPUT_READER:
4071        if (reader != null) {
4072            if (uri != null) {
4073                handler.endExternalEntity (baseURI);
4074            }
4075            reader.close ();
4076        }
4077        break;
4078    }
4079
4080    // Throw an EOFException if there
4081    // is nothing else to pop.
4082    if (inputStack.isEmpty ()) {
4083        throw new EOFException ("no more input");
4084    } 
4085    
4086    Object[] input = (Object[]) inputStack.pop ();
4087    entityStack.pop ();
4088
4089    sourceType = ((Integer) input [0]).intValue ();
4090    externalEntity = (URLConnection) input [1];
4091    readBuffer = (char[]) input [2];
4092    readBufferPos = ((Integer) input [3]).intValue ();
4093    readBufferLength = ((Integer) input [4]).intValue ();
4094    line = ((Integer) input [5]).intValue ();
4095    encoding = ((Integer) input [6]).intValue ();
4096    readBufferOverflow = ((Integer) input [7]).intValue ();
4097    is = (InputStream) input [8];
4098    currentByteCount = ((Integer) input [9]).intValue ();
4099    column = ((Integer) input [10]).intValue ();
4100    reader = (Reader) input [11];
4101    }
4102
4103
4104    /**
4105     * Return true if we can read the expected character.
4106     * <p>Note that the character will be removed from the input stream
4107     * on success, but will be put back on failure.  Do not attempt to
4108     * read the character again if the method succeeds.
4109     * @param delim The character that should appear next.  For a
4110     *        insensitive match, you must supply this in upper-case.
4111     * @return true if the character was successfully read, or false if
4112     *   it was not.
4113     * @see #tryRead (String)
4114     */
4115    private boolean tryRead (char delim)
4116    throws SAXException, IOException
4117    {
4118    char c;
4119
4120    // Read the character
4121    c = readCh ();
4122
4123    // Test for a match, and push the character
4124    // back if the match fails.
4125    if (c == delim) {
4126        return true;
4127    } else {
4128        unread (c);
4129        return false;
4130    }
4131    }
4132
4133
4134    /**
4135     * Return true if we can read the expected string.
4136     * <p>This is simply a convenience method.
4137     * <p>Note that the string will be removed from the input stream
4138     * on success, but will be put back on failure.  Do not attempt to
4139     * read the string again if the method succeeds.
4140     * <p>This method will push back a character rather than an
4141     * array whenever possible (probably the majority of cases).
4142     * <p><b>NOTE:</b> This method currently has a hard-coded limit
4143     * of 100 characters for the delimiter.
4144     * @param delim The string that should appear next.
4145     * @return true if the string was successfully read, or false if
4146     *   it was not.
4147     * @see #tryRead (char)
4148     */
4149    private boolean tryRead (String delim)
4150    throws SAXException, IOException
4151    {
4152    char ch[] = delim.toCharArray ();
4153    char c;
4154
4155    // Compare the input, character-
4156    // by character.
4157
4158    for (int i = 0; i < ch.length; i++) {
4159        c = readCh ();
4160        if (c != ch [i]) {
4161        unread (c);
4162        if (i != 0) {
4163            unread (ch, i);
4164        }
4165        return false;
4166        }
4167    }
4168    return true;
4169    }
4170
4171
4172
4173    /**
4174     * Return true if we can read some whitespace.
4175     * <p>This is simply a convenience method.
4176     * <p>This method will push back a character rather than an
4177     * array whenever possible (probably the majority of cases).
4178     * @return true if whitespace was found.
4179     */
4180    private boolean tryWhitespace ()
4181    throws SAXException, IOException
4182    {
4183    char c;
4184    c = readCh ();
4185    if (isWhitespace (c)) {
4186        skipWhitespace ();
4187        return true;
4188    } else {
4189        unread (c);
4190        return false;
4191    }
4192    }
4193
4194
4195    /**
4196     * Read all data until we find the specified string.
4197     * This is useful for scanning CDATA sections and PIs.
4198     * <p>This is inefficient right now, since it calls tryRead ()
4199     * for every character.
4200     * @param delim The string delimiter
4201     * @see #tryRead (String, boolean)
4202     * @see #readCh
4203     */
4204    private void parseUntil (String delim)
4205    throws SAXException, IOException
4206    {
4207    char c;
4208    int startLine = line;
4209
4210    try {
4211        while (!tryRead (delim)) {
4212        c = readCh ();
4213        dataBufferAppend (c);
4214        }
4215    } catch (EOFException e) {
4216        error ("end of input while looking for delimiter "
4217        + "(started on line " + startLine
4218        + ')', null, delim);
4219    }
4220    }
4221
4222
4223    /**
4224     * Read just the encoding declaration (or XML declaration) at the 
4225     * start of an external entity.
4226     * When this method is called, we know that the declaration is
4227     * present (or appears to be).  We also know that the entity is
4228     * in some sort of ASCII-derived 8-bit encoding.
4229     * The idea of this is to let us read what the 8-bit encoding is
4230     * before we've committed to converting any more of the file; the
4231     * XML or encoding declaration must be in 7-bit ASCII, so we're
4232     * safe as long as we don't go past it.
4233     */
4234    private void read8bitEncodingDeclaration ()
4235    throws SAXException, IOException
4236    {
4237    int ch;
4238    readBufferPos = readBufferLength = 0;
4239
4240    while (true) {
4241        ch = is.read ();
4242        readBuffer [readBufferLength++] = (char) ch;
4243        switch (ch) {
4244          case (int) '>':
4245        return;
4246          case - 1:
4247        error ("end of file before end of XML or encoding declaration.",
4248               null, "?>");
4249        }
4250        if (readBuffer.length == readBufferLength)
4251        error ("unfinished XML or encoding declaration");
4252    }
4253    }
4254
4255
4256    //////////////////////////////////////////////////////////////////////
4257    // Low-level I/O.
4258    //////////////////////////////////////////////////////////////////////
4259
4260
4261    /**
4262     * Read a chunk of data from an external input source.
4263     * <p>This is simply a front-end that fills the rawReadBuffer
4264     * with bytes, then calls the appropriate encoding handler.
4265     * @see #encoding
4266     * @see #rawReadBuffer
4267     * @see #readBuffer
4268     * @see #filterCR
4269     * @see #copyUtf8ReadBuffer
4270     * @see #copyIso8859_1ReadBuffer
4271     * @see #copyUcs_2ReadBuffer
4272     * @see #copyUcs_4ReadBuffer
4273     */
4274    private void readDataChunk ()
4275    throws SAXException, IOException
4276    {
4277    int count;
4278
4279    // See if we have any overflow (filterCR sets for CR at end)
4280    if (readBufferOverflow > -1) {
4281        readBuffer [0] = (char) readBufferOverflow;
4282        readBufferOverflow = -1;
4283        readBufferPos = 1;
4284        sawCR = true;
4285    } else {
4286        readBufferPos = 0;
4287        sawCR = false;
4288    }
4289
4290    // input from a character stream.
4291    if (sourceType == INPUT_READER) {
4292        count = reader.read (readBuffer,
4293                readBufferPos, READ_BUFFER_MAX - readBufferPos);
4294        if (count < 0)
4295        readBufferLength = readBufferPos;
4296        else
4297        readBufferLength = readBufferPos + count;
4298        if (readBufferLength > 0)
4299        filterCR (count >= 0);
4300        sawCR = false;
4301        return;
4302    }
4303
4304    // Read as many bytes as possible into the raw buffer.
4305    count = is.read (rawReadBuffer, 0, READ_BUFFER_MAX);
4306
4307    // Dispatch to an encoding-specific reader method to populate
4308    // the readBuffer.  In most parser speed profiles, these routines
4309    // show up at the top of the CPU usage chart.
4310    if (count > 0) {
4311        switch (encoding) {
4312          // one byte builtins
4313          case ENCODING_ASCII:
4314        copyIso8859_1ReadBuffer (count, (char) 0x0080);
4315        break;
4316          case ENCODING_UTF_8:
4317        copyUtf8ReadBuffer (count);
4318        break;
4319          case ENCODING_ISO_8859_1:
4320        copyIso8859_1ReadBuffer (count, (char) 0);
4321        break;
4322
4323          // two byte builtins
4324          case ENCODING_UCS_2_12:
4325        copyUcs2ReadBuffer (count, 8, 0);
4326        break;
4327          case ENCODING_UCS_2_21:
4328        copyUcs2ReadBuffer (count, 0, 8);
4329        break;
4330
4331          // four byte builtins
4332          case ENCODING_UCS_4_1234:
4333        copyUcs4ReadBuffer (count, 24, 16, 8, 0);
4334        break;
4335          case ENCODING_UCS_4_4321:
4336        copyUcs4ReadBuffer (count, 0, 8, 16, 24);
4337        break;
4338          case ENCODING_UCS_4_2143:
4339        copyUcs4ReadBuffer (count, 16, 24, 0, 8);
4340        break;
4341          case ENCODING_UCS_4_3412:
4342        copyUcs4ReadBuffer (count, 8, 0, 24, 16);
4343        break;
4344        }
4345    } else
4346        readBufferLength = readBufferPos;
4347
4348    readBufferPos = 0;
4349
4350    // Filter out all carriage returns if we've seen any
4351    // (including any saved from a previous read)
4352    if (sawCR) {
4353        filterCR (count >= 0);
4354        sawCR = false;
4355
4356        // must actively report EOF, lest some CRs get lost.
4357        if (readBufferLength == 0 && count >= 0)
4358        readDataChunk ();
4359    }
4360
4361    if (count > 0)
4362        currentByteCount += count;
4363    }
4364
4365
4366    /**
4367     * Filter carriage returns in the read buffer.
4368     * CRLF becomes LF; CR becomes LF.
4369     * @param moreData true iff more data might come from the same source
4370     * @see #readDataChunk
4371     * @see #readBuffer
4372     * @see #readBufferOverflow
4373     */
4374    private void filterCR (boolean moreData)
4375    {
4376    int i, j;
4377
4378    readBufferOverflow = -1;
4379
4380loop:
4381    for (i = j = readBufferPos; j < readBufferLength; i++, j++) {
4382        switch (readBuffer [j]) {
4383        case '\r':
4384        if (j == readBufferLength - 1) {
4385            if (moreData) {
4386            readBufferOverflow = '\r';
4387            readBufferLength--;
4388            } else  // CR at end of buffer
4389            readBuffer [i++] = '\n';
4390            break loop;
4391        } else if (readBuffer [j + 1] == '\n') {
4392            j++;
4393        }
4394        readBuffer [i] = '\n';
4395        break;
4396
4397        case '\n':
4398        default:
4399        readBuffer [i] = readBuffer [j];
4400        break;
4401        }
4402    }
4403    readBufferLength = i;
4404    }
4405
4406    /**
4407     * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
4408     * <p>When readDataChunk () calls this method, the raw bytes are in 
4409     * rawReadBuffer, and the final characters will appear in 
4410     * readBuffer.
4411     * @param count The number of bytes to convert.
4412     * @see #readDataChunk
4413     * @see #rawReadBuffer
4414     * @see #readBuffer
4415     * @see #getNextUtf8Byte
4416     */
4417    private void copyUtf8ReadBuffer (int count)
4418    throws SAXException, IOException
4419    {
4420    int i = 0;
4421    int j = readBufferPos;
4422    int b1;
4423    char    c = 0;
4424
4425    /*
4426    // check once, so the runtime won't (if it's smart enough)
4427    if (count < 0 || count > rawReadBuffer.length)
4428        throw new ArrayIndexOutOfBoundsException (Integer.toString (count));
4429    */
4430
4431    while (i < count) {
4432        b1 = rawReadBuffer [i++];
4433
4434        // Determine whether we are dealing
4435        // with a one-, two-, three-, or four-
4436        // byte sequence.
4437        if (b1 < 0) {
4438        if ((b1 & 0xe0) == 0xc0) {
4439            // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
4440            c = (char) (((b1 & 0x1f) << 6)
4441                | getNextUtf8Byte (i++, count));
4442        } else if ((b1 & 0xf0) == 0xe0) {
4443            // 3-byte sequence:
4444            // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
4445            // most CJKV characters
4446            c = (char) (((b1 & 0x0f) << 12) |
4447                   (getNextUtf8Byte (i++, count) << 6) |
4448                   getNextUtf8Byte (i++, count));
4449        } else if ((b1 & 0xf8) == 0xf0) {
4450            // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
4451            //     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
4452            // (uuuuu = wwww + 1)
4453            // "Surrogate Pairs" ... from the "Astral Planes"
4454            int iso646 = b1 & 07;
4455            iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
4456            iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
4457            iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
4458
4459            if (iso646 <= 0xffff) {
4460            c = (char) iso646;
4461            } else {
4462            if (iso646 > 0x0010ffff)
4463                encodingError (
4464                "UTF-8 value out of range for Unicode",
4465                iso646, 0);
4466            iso646 -= 0x010000;
4467            readBuffer [j++] = (char) (0xd800 | (iso646 >> 10));
4468            readBuffer [j++] = (char) (0xdc00 | (iso646 & 0x03ff));
4469            continue;
4470            }
4471        } else {
4472            // The five and six byte encodings aren't supported;
4473            // they exceed the Unicode (and XML) range.
4474            encodingError (
4475                "invalid UTF-8 byte (check the XML declaration)",
4476                0xff & b1, i);
4477            // NOTREACHED
4478            c = 0;
4479        }
4480        } else {
4481        // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
4482        // (US-ASCII character, "common" case, one branch to here)
4483        c = (char) b1;
4484        }
4485        readBuffer [j++] = c;
4486        if (c == '\r')
4487        sawCR = true;
4488    }
4489    // How many characters have we read?
4490    readBufferLength = j;
4491    }
4492
4493
4494    /**
4495     * Return the next byte value in a UTF-8 sequence.
4496     * If it is not possible to get a byte from the current
4497     * entity, throw an exception.
4498     * @param pos The current position in the rawReadBuffer.
4499     * @param count The number of bytes in the rawReadBuffer
4500     * @return The significant six bits of a non-initial byte in
4501     *   a UTF-8 sequence.
4502     * @exception EOFException If the sequence is incomplete.
4503     */
4504    private int getNextUtf8Byte (int pos, int count)
4505    throws SAXException, IOException
4506    {
4507    int val;
4508
4509    // Take a character from the buffer
4510    // or from the actual input stream.
4511    if (pos < count) {
4512        val = rawReadBuffer [pos];
4513    } else {
4514        val = is.read ();
4515        if (val == -1) {
4516        encodingError ("unfinished multi-byte UTF-8 sequence at EOF",
4517            -1, pos);
4518        }
4519    }
4520
4521    // Check for the correct bits at the start.
4522    if ((val & 0xc0) != 0x80) {
4523        encodingError ("bad continuation of multi-byte UTF-8 sequence",
4524            val, pos + 1);
4525    }
4526
4527    // Return the significant bits.
4528    return (val & 0x3f);
4529    }
4530
4531
4532    /**
4533     * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into
4534     * UTF-16 characters.
4535     *
4536     * <p>When readDataChunk () calls this method, the raw bytes are in 
4537     * rawReadBuffer, and the final characters will appear in 
4538     * readBuffer.
4539     *
4540     * @param count The number of bytes to convert.
4541     * @param mask For ASCII conversion, 0x7f; else, 0xff.
4542     * @see #readDataChunk
4543     * @see #rawReadBuffer
4544     * @see #readBuffer
4545     */
4546    private void copyIso8859_1ReadBuffer (int count, char mask)
4547    throws IOException
4548    {
4549    int i, j;
4550    for (i = 0, j = readBufferPos; i < count; i++, j++) {
4551        char c = (char) (rawReadBuffer [i] & 0xff);
4552        if ((c & mask) != 0)
4553        throw new CharConversionException ("non-ASCII character U+"
4554                            + Integer.toHexString (c));
4555        readBuffer [j] = c;
4556        if (c == '\r') {
4557        sawCR = true;
4558        }
4559    }
4560    readBufferLength = j;
4561    }
4562
4563
4564    /**
4565     * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters
4566     * (as used in Java string manipulation).
4567     *
4568     * <p>When readDataChunk () calls this method, the raw bytes are in 
4569     * rawReadBuffer, and the final characters will appear in 
4570     * readBuffer.
4571     * @param count The number of bytes to convert.
4572     * @param shift1 The number of bits to shift byte 1.
4573     * @param shift2 The number of bits to shift byte 2
4574     * @see #readDataChunk
4575     * @see #rawReadBuffer
4576     * @see #readBuffer
4577     */
4578    private void copyUcs2ReadBuffer (int count, int shift1, int shift2)
4579    throws SAXException
4580    {
4581    int j = readBufferPos;
4582
4583    if (count > 0 && (count % 2) != 0) {
4584        encodingError ("odd number of bytes in UCS-2 encoding", -1, count);
4585    }
4586    // The loops are faster with less internal brancing; hence two
4587    if (shift1 == 0) {  // "UTF-16-LE"
4588        for (int i = 0; i < count; i += 2) {
4589        char c = (char) (rawReadBuffer [i + 1] << 8);
4590        c |= 0xff & rawReadBuffer [i];
4591        readBuffer [j++] = c;
4592        if (c == '\r')
4593            sawCR = true;
4594        }
4595    } else {    // "UTF-16-BE"
4596        for (int i = 0; i < count; i += 2) {
4597        char c = (char) (rawReadBuffer [i] << 8);
4598        c |= 0xff & rawReadBuffer [i + 1];
4599        readBuffer [j++] = c;
4600        if (c == '\r')
4601            sawCR = true;
4602        }
4603    }
4604    readBufferLength = j;
4605    }
4606
4607
4608    /**
4609     * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters.
4610     *
4611     * <p>When readDataChunk () calls this method, the raw bytes are in 
4612     * rawReadBuffer, and the final characters will appear in 
4613     * readBuffer.
4614     * <p>Java has Unicode chars, and this routine uses surrogate pairs
4615     * for ISO-10646 values between 0x00010000 and 0x000fffff.  An
4616     * exception is thrown if the ISO-10646 character has no Unicode
4617     * representation.
4618     *
4619     * @param count The number of bytes to convert.
4620     * @param shift1 The number of bits to shift byte 1.
4621     * @param shift2 The number of bits to shift byte 2
4622     * @param shift3 The number of bits to shift byte 2
4623     * @param shift4 The number of bits to shift byte 2
4624     * @see #readDataChunk
4625     * @see #rawReadBuffer
4626     * @see #readBuffer
4627     */
4628    private void copyUcs4ReadBuffer (int count, int shift1, int shift2,
4629                  int shift3, int shift4)
4630    throws SAXException
4631    {
4632    int j = readBufferPos;
4633
4634    if (count > 0 && (count % 4) != 0) {
4635        encodingError (
4636            "number of bytes in UCS-4 encoding not divisible by 4",
4637            -1, count);
4638    }
4639    for (int i = 0; i < count; i += 4) {
4640        int value = (((rawReadBuffer [i] & 0xff) << shift1) |
4641              ((rawReadBuffer [i + 1] & 0xff) << shift2) |
4642              ((rawReadBuffer [i + 2] & 0xff) << shift3) |
4643              ((rawReadBuffer [i + 3] & 0xff) << shift4));
4644        if (value < 0x0000ffff) {
4645        readBuffer [j++] = (char) value;
4646        if (value == (int) '\r') {
4647            sawCR = true;
4648        }
4649        } else if (value < 0x0010ffff) {
4650        value -= 0x010000;
4651        readBuffer [j++] = (char) (0xd8 | ((value >> 10) & 0x03ff));
4652        readBuffer [j++] = (char) (0xdc | (value & 0x03ff));
4653        } else {
4654        encodingError ("UCS-4 value out of range for Unicode",
4655                   value, i);
4656        }
4657    }
4658    readBufferLength = j;
4659    }
4660
4661
4662    /**
4663     * Report a character encoding error.
4664     */
4665    private void encodingError (String message, int value, int offset)
4666    throws SAXException
4667    {
4668    String uri;
4669
4670    if (value != -1) {
4671        message = message + " (code: 0x" +
4672              Integer.toHexString (value) + ')';
4673    }
4674    if (externalEntity != null) {
4675        uri = externalEntity.getURL ().toString ();
4676    } else {
4677        uri = baseURI;
4678    }
4679    handler.error (message, uri, -1, offset + currentByteCount);
4680    }
4681
4682
4683    //////////////////////////////////////////////////////////////////////
4684    // Local Variables.
4685    //////////////////////////////////////////////////////////////////////
4686
4687    /**
4688     * Re-initialize the variables for each parse.
4689     */
4690    private void initializeVariables ()
4691    {
4692    // First line
4693    line = 1;
4694    column = 0;
4695
4696    // Set up the buffers for data and names
4697    dataBufferPos = 0;
4698    dataBuffer = new char [DATA_BUFFER_INITIAL];
4699    nameBufferPos = 0;
4700    nameBuffer = new char [NAME_BUFFER_INITIAL];
4701
4702    // Set up the DTD hash tables
4703    elementInfo = new Hashtable ();
4704    entityInfo = new Hashtable ();
4705    notationInfo = new Hashtable ();
4706
4707    // Set up the variables for the current
4708    // element context.
4709    currentElement = null;
4710    currentElementContent = CONTENT_UNDECLARED;
4711
4712    // Set up the input variables
4713    sourceType = INPUT_NONE;
4714    inputStack = new Stack ();
4715    entityStack = new Stack ();
4716    externalEntity = null;
4717    tagAttributePos = 0;
4718    tagAttributes = new String [100];
4719    rawReadBuffer = new byte [READ_BUFFER_MAX];
4720    readBufferOverflow = -1;
4721
4722    inLiteral = false;
4723    expandPE = false;
4724    peIsError = false;
4725
4726    inCDATA = false;
4727
4728    symbolTable = new Object [SYMBOL_TABLE_LENGTH][];
4729    }
4730
4731
4732    /**
4733     * Clean up after the parse to allow some garbage collection.
4734     */
4735    private void cleanupVariables ()
4736    {
4737    dataBuffer = null;
4738    nameBuffer = null;
4739
4740    elementInfo = null;
4741    entityInfo = null;
4742    notationInfo = null;
4743
4744    currentElement = null;
4745
4746    inputStack = null;
4747    entityStack = null;
4748    externalEntity = null;
4749
4750    tagAttributes = null;
4751    rawReadBuffer = null;
4752
4753    symbolTable = null;
4754    }
4755
4756    /* used to restart reading with some InputStreamReader */
4757    static class EncodingException extends IOException
4758    {
4759    EncodingException (String encoding) { super (encoding); }
4760    }
4761
4762    //
4763    // The current XML handler interface.
4764    //
4765    private SAXDriver   handler;
4766
4767    //
4768    // I/O information.
4769    //
4770    private Reader  reader;     // current reader
4771    private InputStream is;         // current input stream
4772    private int     line;       // current line number
4773    private int     column;     // current column number
4774    private int     sourceType;     // type of input source
4775    private Stack   inputStack;     // stack of input soruces
4776    private URLConnection externalEntity; // current external entity
4777    private int     encoding;   // current character encoding
4778    private int     currentByteCount; // bytes read from current source
4779
4780    //
4781    // Buffers for decoded but unparsed character input.
4782    //
4783    private char    readBuffer [];
4784    private int     readBufferPos;
4785    private int     readBufferLength;
4786    private int     readBufferOverflow;  // overflow from last data chunk.
4787
4788
4789    //
4790    // Buffer for undecoded raw byte input.
4791    //
4792    private final static int READ_BUFFER_MAX = 16384;
4793    private byte    rawReadBuffer [];
4794
4795
4796    //
4797    // Buffer for parsed character data.
4798    //
4799    private static int DATA_BUFFER_INITIAL = 4096;
4800    private char    dataBuffer [];
4801    private int     dataBufferPos;
4802
4803    //
4804    // Buffer for parsed names.
4805    //
4806    private static int NAME_BUFFER_INITIAL = 1024;
4807    private char    nameBuffer [];
4808    private int     nameBufferPos;
4809
4810
4811    //
4812    // Hashtables for DTD information on elements, entities, and notations.
4813    //
4814    private Hashtable   elementInfo;
4815    private Hashtable   entityInfo;
4816    private Hashtable   notationInfo;
4817
4818
4819    //
4820    // Element type currently in force.
4821    //
4822    private String  currentElement;
4823    private int     currentElementContent;
4824
4825    //
4826    // Base external identifiers for resolution.
4827    //
4828    private String  basePublicId;
4829    private String  baseURI;
4830    private int     baseEncoding;
4831    private Reader  baseReader;
4832    private InputStream baseInputStream;
4833    private char    baseInputBuffer [];
4834    private int     baseInputBufferStart;
4835    private int     baseInputBufferLength;
4836
4837    //
4838    // Stack of entity names, to detect recursion.
4839    //
4840    private Stack   entityStack;
4841
4842    //
4843    // PE expansion is enabled in most chunks of the DTD, not all.
4844    // When it's enabled, literals are treated differently.
4845    //
4846    private boolean inLiteral;
4847    private boolean expandPE;
4848    private boolean peIsError;
4849
4850    //
4851    // Symbol table, for caching interned names.
4852    //
4853    private final static int SYMBOL_TABLE_LENGTH = 1087;
4854    private Object  symbolTable [][];
4855
4856    //
4857    // Hash table of attributes found in current start tag.
4858    //
4859    private String  tagAttributes [];
4860    private int     tagAttributePos;
4861
4862    //
4863    // Utility flag: have we noticed a CR while reading the last
4864    // data chunk?  If so, we will have to go back and normalise
4865    // CR or CR/LF line ends.
4866    //
4867    private boolean sawCR;
4868
4869    //
4870    // Utility flag: are we in CDATA?  If so, whitespace isn't ignorable.
4871    // 
4872    private boolean inCDATA;
4873}
4874
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Java Books Remove Frame
Popular Tags