KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > microstar > xml > XmlParser


1 // XmlParser.java: the main parser class.
2
// NO WARRANTY! See README, and copyright below.
3
// $Id: XmlParser.java 4972 2004-02-22 20:00:54Z spestov $
4

5 package com.microstar.xml;
6
7 import java.io.BufferedInputStream JavaDoc;
8 import java.io.EOFException JavaDoc;
9 import java.io.InputStream JavaDoc;
10 import java.io.Reader JavaDoc;
11 import java.net.URL JavaDoc;
12 import java.net.URLConnection JavaDoc;
13 import java.util.Enumeration JavaDoc;
14 import java.util.Hashtable JavaDoc;
15 import java.util.Stack JavaDoc;
16
17
/**
 * Parse XML documents and return parse events through call-backs.
 * <p>You need to define a class implementing the <code>XmlHandler</code>
 * interface: an object belonging to this class will receive the
 * callbacks for the events. (As an alternative to implementing
 * the full XmlHandler interface, you can simply extend the
 * <code>HandlerBase</code> convenience class.)
 * <p>Usage (assuming that <code>MyHandler</code> is your implementation
 * of the <code>XmlHandler</code> interface):
 * <pre>
 * XmlHandler handler = new MyHandler();
 * XmlParser parser = new XmlParser();
 * parser.setHandler(handler);
 * try {
 *   // The cast selects the (systemId, publicId, encoding) overload;
 *   // a bare null would be ambiguous with the Reader overload.
 *   parser.parse("http://www.host.com/doc.xml", null, (String) null);
 * } catch (Exception e) {
 *   [do something interesting]
 * }
 * </pre>
 * <p>Alternatively, you can use the standard SAX interfaces
 * with the <code>SAXDriver</code> class as your entry point.
 * @author Copyright (c) 1997, 1998 by Microstar Software Ltd.
 * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
 * @version 1.1
 * @see XmlHandler
 * @see HandlerBase
 * @see SAXDriver
 */

46 public class XmlParser {
47
48   //
49
// Use special cheats that speed up the code (currently about 50%),
50
// but may cause problems with future maintenance and add to the
51
// class file size (about 500 bytes).
52
//
53
private final static boolean USE_CHEATS = true;
54
55
56
57   //////////////////////////////////////////////////////////////////////
58
// Constructors.
59
////////////////////////////////////////////////////////////////////////
60

61
62   /**
63     * Construct a new parser with no associated handler.
64     * @see #setHandler
65     * @see #parse
66     */

67   public XmlParser ()
68   {
69   }
70
71
72   /**
73     * Set the handler that will receive parsing events.
74     * @param handler The handler to receive callback events.
75     * @see #parse
76     * @see XmlHandler
77     */

78   public void setHandler (XmlHandler handler)
79   {
80     this.handler = handler;
81   }
82
83
84   /**
85     * Parse an XML document from a URI.
86     * <p>You may parse a document more than once, but only one thread
87     * may call this method for an object at one time.
88     * @param systemId The URI of the document.
89     * @param publicId The public identifier of the document, or null.
90     * @param encoding The suggested encoding, or null if unknown.
91     * @exception java.lang.Exception Any exception thrown by your
92     * own handlers, or any derivation of java.io.IOException
93     * thrown by the parser itself.
94     */

95   public void parse (String JavaDoc systemId, String JavaDoc publicId, String JavaDoc encoding)
96     throws java.lang.Exception JavaDoc
97   {
98     doParse(systemId, publicId, null, null, encoding);
99   }
100
101
102   /**
103     * Parse an XML document from a byte stream.
104     * <p>The URI that you supply will become the base URI for
105     * resolving relative links, but &AElig;lfred will actually read
106     * the document from the supplied input stream.
107     * <p>You may parse a document more than once, but only one thread
108     * may call this method for an object at one time.
109     * @param systemId The base URI of the document, or null if not
110     * known.
111     * @param publicId The public identifier of the document, or null
112     * if not known.
113     * @param stream A byte input stream.
114     * @param encoding The suggested encoding, or null if unknown.
115     * @exception java.lang.Exception Any exception thrown by your
116     * own handlers, or any derivation of java.io.IOException
117     * thrown by the parser itself.
118     */

119   public void parse (String JavaDoc systemId, String JavaDoc publicId,
120              InputStream JavaDoc stream, String JavaDoc encoding)
121     throws java.lang.Exception JavaDoc
122   {
123     doParse(systemId, publicId, null, stream, encoding);
124   }
125
126
127   /**
128     * Parse an XML document from a character stream.
129     * <p>The URI that you supply will become the base URI for
130     * resolving relative links, but &AElig;lfred will actually read
131     * the document from the supplied input stream.
132     * <p>You may parse a document more than once, but only one thread
133     * may call this method for an object at one time.
134     * @param systemId The base URI of the document, or null if not
135     * known.
136     * @param publicId The public identifier of the document, or null
137     * if not known.
138     * @param reader A character stream.
139     * @exception java.lang.Exception Any exception thrown by your
140     * own handlers, or any derivation of java.io.IOException
141     * thrown by the parser itself.
142     */

143   public void parse (String JavaDoc systemId, String JavaDoc publicId, Reader JavaDoc reader)
144     throws java.lang.Exception JavaDoc
145   {
146     doParse(systemId, publicId, reader, null, null);
147   }
148
149
150   private synchronized void doParse (String JavaDoc systemId, String JavaDoc publicId,
151                      Reader JavaDoc reader, InputStream JavaDoc stream,
152                      String JavaDoc encoding)
153     throws java.lang.Exception JavaDoc
154   {
155     basePublicId = publicId;
156     baseURI = systemId;
157     baseReader = reader;
158     baseInputStream = stream;
159
160     initializeVariables();
161
162                 // Set the default entities here.
163
setInternalEntity(intern("amp"), "&#38;");
164     setInternalEntity(intern("lt"), "&#60;");
165     setInternalEntity(intern("gt"), "&#62;");
166     setInternalEntity(intern("apos"), "&#39;");
167     setInternalEntity(intern("quot"), "&#34;");
168
169     if (handler != null) {
170       handler.startDocument();
171     }
172
173     pushURL("[document]", basePublicId, baseURI, baseReader, baseInputStream,
174         encoding);
175
176     parseDocument();
177
178     if (handler != null) {
179       handler.endDocument();
180     }
181     cleanupVariables();
182   }
183
184
185
186   ////////////////////////////////////////////////////////////////////////
187
// Constants.
188
////////////////////////////////////////////////////////////////////////
189

190   //
191
// Constants for element content type.
192
//
193

194   /**
195     * Constant: an element has not been declared.
196     * @see #getElementContentType
197     */

198   public final static int CONTENT_UNDECLARED = 0;
199
200   /**
201     * Constant: the element has a content model of ANY.
202     * @see #getElementContentType
203     */

204   public final static int CONTENT_ANY = 1;
205
206   /**
207     * Constant: the element has declared content of EMPTY.
208     * @see #getElementContentType
209     */

210   public final static int CONTENT_EMPTY = 2;
211
212   /**
213     * Constant: the element has mixed content.
214     * @see #getElementContentType
215     */

216   public final static int CONTENT_MIXED = 3;
217
218   /**
219     * Constant: the element has element content.
220     * @see #getElementContentType
221     */

222   public final static int CONTENT_ELEMENTS = 4;
223
224
225   //
226
// Constants for the entity type.
227
//
228

229   /**
230     * Constant: the entity has not been declared.
231     * @see #getEntityType
232     */

233   public final static int ENTITY_UNDECLARED = 0;
234
235   /**
236     * Constant: the entity is internal.
237     * @see #getEntityType
238     */

239   public final static int ENTITY_INTERNAL = 1;
240
241   /**
242     * Constant: the entity is external, non-XML data.
243     * @see #getEntityType
244     */

245   public final static int ENTITY_NDATA = 2;
246
247   /**
248     * Constant: the entity is external XML data.
249     * @see #getEntityType
250     */

251   public final static int ENTITY_TEXT = 3;
252
253
254   //
255
// Constants for attribute type.
256
//
257

258   /**
259     * Constant: the attribute has not been declared for this element type.
260     * @see #getAttributeType
261     */

262   public final static int ATTRIBUTE_UNDECLARED = 0;
263
264   /**
265     * Constant: the attribute value is a string value.
266     * @see #getAttributeType
267     */

268   public final static int ATTRIBUTE_CDATA = 1;
269
270   /**
271     * Constant: the attribute value is a unique identifier.
272     * @see #getAttributeType
273     */

274   public final static int ATTRIBUTE_ID = 2;
275
276   /**
277     * Constant: the attribute value is a reference to a unique identifier.
278     * @see #getAttributeType
279     */

280   public final static int ATTRIBUTE_IDREF = 3;
281
282   /**
283     * Constant: the attribute value is a list of ID references.
284     * @see #getAttributeType
285     */

286   public final static int ATTRIBUTE_IDREFS = 4;
287
288   /**
289     * Constant: the attribute value is the name of an entity.
290     * @see #getAttributeType
291     */

292   public final static int ATTRIBUTE_ENTITY = 5;
293
294   /**
295     * Constant: the attribute value is a list of entity names.
296     * @see #getAttributeType
297     */

298   public final static int ATTRIBUTE_ENTITIES = 6;
299
300   /**
301     * Constant: the attribute value is a name token.
302     * @see #getAttributeType
303     */

304   public final static int ATTRIBUTE_NMTOKEN = 7;
305
306   /**
307     * Constant: the attribute value is a list of name tokens.
308     * @see #getAttributeType
309     */

310   public final static int ATTRIBUTE_NMTOKENS = 8;
311
312   /**
313     * Constant: the attribute value is a token from an enumeration.
314     * @see #getAttributeType
315     */

316   public final static int ATTRIBUTE_ENUMERATED = 9;
317
318   /**
319     * Constant: the attribute is the name of a notation.
320     * @see #getAttributeType
321     */

322   public final static int ATTRIBUTE_NOTATION = 10;
323
324
325   //
326
// When the class is loaded, populate the hash table of
327
// attribute types.
328
//
329

330   /**
331     * Hash table of attribute types.
332     */

333   private static Hashtable JavaDoc attributeTypeHash;
334   static {
335     attributeTypeHash = new Hashtable JavaDoc();
336     attributeTypeHash.put("CDATA", new Integer JavaDoc(ATTRIBUTE_CDATA));
337     attributeTypeHash.put("ID", new Integer JavaDoc(ATTRIBUTE_ID));
338     attributeTypeHash.put("IDREF", new Integer JavaDoc(ATTRIBUTE_IDREF));
339     attributeTypeHash.put("IDREFS", new Integer JavaDoc(ATTRIBUTE_IDREFS));
340     attributeTypeHash.put("ENTITY", new Integer JavaDoc(ATTRIBUTE_ENTITY));
341     attributeTypeHash.put("ENTITIES", new Integer JavaDoc(ATTRIBUTE_ENTITIES));
342     attributeTypeHash.put("NMTOKEN", new Integer JavaDoc(ATTRIBUTE_NMTOKEN));
343     attributeTypeHash.put("NMTOKENS", new Integer JavaDoc(ATTRIBUTE_NMTOKENS));
344     attributeTypeHash.put("NOTATION", new Integer JavaDoc(ATTRIBUTE_NOTATION));
345   }
346
347
348   //
349
// Constants for supported encodings.
350
//
351
private final static int ENCODING_UTF_8 = 1;
352   private final static int ENCODING_ISO_8859_1 = 2;
353   private final static int ENCODING_UCS_2_12 = 3;
354   private final static int ENCODING_UCS_2_21 = 4;
355   private final static int ENCODING_UCS_4_1234 = 5;
356   private final static int ENCODING_UCS_4_4321 = 6;
357   private final static int ENCODING_UCS_4_2143 = 7;
358   private final static int ENCODING_UCS_4_3412 = 8;
359
360
361   //
362
// Constants for attribute default value.
363
//
364

365   /**
366     * Constant: the attribute is not declared.
367     * @see #getAttributeDefaultValueType
368     */

369   public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 0;
370
371   /**
372     * Constant: the attribute has a literal default value specified.
373     * @see #getAttributeDefaultValueType
374     * @see #getAttributeDefaultValue
375     */

376   public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 1;
377
378   /**
379     * Constant: the attribute was declared #IMPLIED.
380     * @see #getAttributeDefaultValueType
381     */

382   public final static int ATTRIBUTE_DEFAULT_IMPLIED = 2;
383
384   /**
385     * Constant: the attribute was declared #REQUIRED.
386     * @see #getAttributeDefaultValueType
387     */

388   public final static int ATTRIBUTE_DEFAULT_REQUIRED = 3;
389
390   /**
391     * Constant: the attribute was declared #FIXED.
392     * @see #getAttributeDefaultValueType
393     * @see #getAttributeDefaultValue
394     */

395   public final static int ATTRIBUTE_DEFAULT_FIXED = 4;
396
397
398   //
399
// Constants for input.
400
//
401
private final static int INPUT_NONE = 0;
402   private final static int INPUT_INTERNAL = 1;
403   private final static int INPUT_EXTERNAL = 2;
404   private final static int INPUT_STREAM = 3;
405   private final static int INPUT_BUFFER = 4;
406   private final static int INPUT_READER = 5;
407
408
409   //
410
// Flags for reading literals.
411
//
412
private final static int LIT_CHAR_REF = 1;
413   private final static int LIT_ENTITY_REF = 2;
414   private final static int LIT_PE_REF = 4;
415   private final static int LIT_NORMALIZE = 8;
416
417
418   //
419
// Flags for parsing context.
420
//
421
private final static int CONTEXT_NONE = 0;
422   private final static int CONTEXT_DTD = 1;
423   private final static int CONTEXT_ENTITYVALUE = 2;
424   private final static int CONTEXT_ATTRIBUTEVALUE = 3;
425
426
427
428   //////////////////////////////////////////////////////////////////////
429
// Error reporting.
430
//////////////////////////////////////////////////////////////////////
431

432
433   /**
434     * Report an error.
435     * @param message The error message.
436     * @param textFound The text that caused the error (or null).
437     * @see XmlHandler#error
438     * @see #line
439     */

440   void error (String JavaDoc message, String JavaDoc textFound, String JavaDoc textExpected)
441     throws java.lang.Exception JavaDoc
442   {
443     errorCount++;
444     if (textFound != null) {
445       message = message + " (found \"" + textFound + "\")";
446     }
447     if (textExpected != null) {
448       message = message + " (expected \"" + textExpected + "\")";
449     }
450     if (handler != null) {
451       String JavaDoc uri = null;
452
453       if (externalEntity != null) {
454     uri = externalEntity.getURL().toString();
455       }
456       handler.error(message, uri, line, column);
457     }
458   }
459
460
461   /**
462     * Report a serious error.
463     * @param message The error message.
464     * @param textFound The text that caused the error (or null).
465     */

466   void error (String JavaDoc message, char textFound, String JavaDoc textExpected)
467     throws java.lang.Exception JavaDoc
468   {
469     error(message, new Character JavaDoc(textFound).toString(), textExpected);
470   }
471
472
473
474   //////////////////////////////////////////////////////////////////////
475
// Major syntactic productions.
476
//////////////////////////////////////////////////////////////////////
477

478
479   /**
480     * Parse an XML document.
481     * <pre>
482     * [1] document ::= prolog element Misc*
483     * </pre>
484     * <p>This is the top-level parsing function for a single XML
485     * document. As a minimum, a well-formed document must have
486     * a document element, and a valid document must have a prolog
487     * as well.
488     */

489   void parseDocument ()
490     throws java.lang.Exception JavaDoc
491     {
492     char c;
493
494     parseProlog();
495     require('<');
496     parseElement();
497     try
498       {
499       parseMisc(); //skip all white, PIs, and comments
500
c=readCh(); //if this doesn't throw an exception...
501
error("unexpected characters after document end",c,null);
502       }
503     catch (EOFException JavaDoc e)
504       {return;}
505     }
506
507
508   /**
509     * Skip a comment.
510     * <pre>
511     * [18] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
512     * </pre>
513     * <p>(The <code>&lt;!--</code> has already been read.)
514     */

515   void parseComment ()
516     throws java.lang.Exception JavaDoc
517   {
518     skipUntil("-->");
519   }
520
521
522   /**
523     * Parse a processing instruction and do a call-back.
524     * <pre>
525     * [19] PI ::= '&lt;?' Name (S (Char* - (Char* '?&gt;' Char*)))? '?&gt;'
526     * </pre>
527     * <p>(The <code>&lt;?</code> has already been read.)
528     * <p>An XML processing instruction <em>must</em> begin with
529     * a Name, which is the instruction's target.
530     */

531   void parsePI ()
532     throws java.lang.Exception JavaDoc
533   {
534     String JavaDoc name;
535
536     name = readNmtoken(true);
537     if (!tryRead("?>")) {
538       requireWhitespace();
539       parseUntil("?>");
540     }
541     if (handler != null) {
542       handler.processingInstruction(name, dataBufferToString());
543     }
544   }
545
546
547   /**
548     * Parse a CDATA marked section.
549     * <pre>
550     * [20] CDSect ::= CDStart CData CDEnd
551     * [21] CDStart ::= '&lt;![CDATA['
552     * [22] CData ::= (Char* - (Char* ']]&gt;' Char*))
553     * [23] CDEnd ::= ']]&gt;'
554     * </pre>
555     * <p>(The '&lt;![CDATA[' has already been read.)
556     * <p>Note that this just appends characters to the dataBuffer,
557     * without actually generating an event.
558     */

559   void parseCDSect ()
560     throws java.lang.Exception JavaDoc
561   {
562     parseUntil("]]>");
563   }
564
565
566   /**
567     * Parse the prolog of an XML document.
568     * <pre>
569     * [24] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
570     * </pre>
571     * <p>There are a couple of tricks here. First, it is necessary to
572     * declare the XML default attributes after the DTD (if present)
573     * has been read. Second, it is not possible to expand general
574     * references in attribute value literals until after the entire
575     * DTD (if present) has been parsed.
576     * <p>We do not look for the XML declaration here, because it is
577     * handled by pushURL().
578     * @see pushURL
579     */

580   void parseProlog ()
581     throws java.lang.Exception JavaDoc
582   {
583     parseMisc();
584
585     if (tryRead("<!DOCTYPE")) {
586       parseDoctypedecl();
587       parseMisc();
588     }
589   }
590
591
592   /**
593     * Parse the XML declaration.
594     * <pre>
595     * [25] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
596     * [26] VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'")
597     * [33] SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'"
598     * | S 'standalone' Eq '"' ("yes" | "no") '"'
599     * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
600     * </pre>
601     * <p>([80] to [82] are also significant.)
602     * <p>(The <code>&lt;?xml</code> and whitespace have already been read.)
603     * <p>TODO: validate value of standalone.
604     * @see #parseTextDecl
605     * @see #checkEncoding
606     */

607   void parseXMLDecl (boolean ignoreEncoding)
608     throws java.lang.Exception JavaDoc
609   {
610     String JavaDoc version;
611     String JavaDoc encodingName = null;
612     String JavaDoc standalone = null;
613
614                 // Read the version.
615
require("version");
616     parseEq();
617     version = readLiteral(0);
618     if (!version.equals("1.0")) {
619       error("unsupported XML version", version, "1.0");
620     }
621
622                 // Try reading an encoding declaration.
623
skipWhitespace();
624     if (tryRead("encoding")) {
625       parseEq();
626       encodingName = readLiteral(0);
627       checkEncoding(encodingName, ignoreEncoding);
628     }
629
630                 // Try reading a standalone declaration
631
skipWhitespace();
632     if (tryRead("standalone")) {
633       parseEq();
634       standalone = readLiteral(0);
635     }
636
637     skipWhitespace();
638     require("?>");
639   }
640
641
642   /**
643     * Parse the Encoding PI.
644     * <pre>
645     * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
646     * [79] EncodingPI ::= '&lt;?xml' S 'encoding' Eq QEncoding S? '?&gt;'
647     * [80] QEncoding ::= '"' Encoding '"' | "'" Encoding "'"
648     * [81] Encoding ::= LatinName
649     * [82] LatinName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
650     * </pre>
651     * <p>(The <code>&lt;?xml</code>' and whitespace have already been read.)
652     * @see #parseXMLDecl
653     * @see #checkEncoding
654     */

655   void parseTextDecl (boolean ignoreEncoding)
656     throws java.lang.Exception JavaDoc
657   {
658     String JavaDoc encodingName = null;
659     
660                 // Read an optional version.
661
if (tryRead("version")) {
662       String JavaDoc version;
663       parseEq();
664       version = readLiteral(0);
665       if (!version.equals("1.0")) {
666     error("unsupported XML version", version, "1.0");
667       }
668       requireWhitespace();
669     }
670       
671
672                 // Read the encoding.
673
require("encoding");
674     parseEq();
675     encodingName = readLiteral(0);
676     checkEncoding(encodingName, ignoreEncoding);
677
678     skipWhitespace();
679     require("?>");
680   }
681
682
683   /**
684     * Check that the encoding specified makes sense.
685     * <p>Compare what the author has specified in the XML declaration
686     * or encoding PI with what we have detected.
687     * <p>This is also important for distinguishing among the various
688     * 7- and 8-bit encodings, such as ISO-LATIN-1 (I cannot autodetect
689     * those).
690     * @param encodingName The name of the encoding specified by the user.
691     * @see #parseXMLDecl
692     * @see #parseTextDecl
693     */

694   void checkEncoding (String JavaDoc encodingName, boolean ignoreEncoding)
695     throws java.lang.Exception JavaDoc
696   {
697     encodingName = encodingName.toUpperCase();
698
699     if (ignoreEncoding) {
700       return;
701     }
702
703     switch (encoding) {
704                 // 8-bit encodings
705
case ENCODING_UTF_8:
706       if (encodingName.equals("ISO-8859-1")) {
707     encoding = ENCODING_ISO_8859_1;
708       } else if (!encodingName.equals("UTF-8")) {
709     error("unsupported 8-bit encoding",
710           encodingName,
711           "UTF-8 or ISO-8859-1");
712       }
713       break;
714                 // 16-bit encodings
715
case ENCODING_UCS_2_12:
716     case ENCODING_UCS_2_21:
717       if (!encodingName.equals("ISO-10646-UCS-2") &&
718       !encodingName.equals("UTF-16")) {
719     error("unsupported 16-bit encoding",
720           encodingName,
721           "ISO-10646-UCS-2");
722       }
723       break;
724                 // 32-bit encodings
725
case ENCODING_UCS_4_1234:
726     case ENCODING_UCS_4_4321:
727     case ENCODING_UCS_4_2143:
728     case ENCODING_UCS_4_3412:
729       if (!encodingName.equals("ISO-10646-UCS-4")) {
730     error("unsupported 32-bit encoding",
731           encodingName,
732           "ISO-10646-UCS-4");
733       }
734     }
735   }
736
737
738   /**
739     * Parse miscellaneous markup outside the document element and DOCTYPE
740     * declaration.
741     * <pre>
742     * [27] Misc ::= Comment | PI | S
743     * </pre>
744     */

745   void parseMisc ()
746     throws java.lang.Exception JavaDoc
747     {
748     while (true)
749       {
750       skipWhitespace();
751       if (tryRead("<?"))
752         {parsePI();}
753       else if (tryRead("<!--"))
754         {parseComment();}
755       else
756         {return;}
757       }
758     }
759
760
761   /**
762     * Parse a document type declaration.
763     * <pre>
764     * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
765     * ('[' %markupdecl* ']' S?)? '&gt;'
766     * </pre>
767     * <p>(The <code>&lt;!DOCTYPE</code> has already been read.)
768     */

769   void parseDoctypedecl ()
770     throws java.lang.Exception JavaDoc
771   {
772     char c;
773     String JavaDoc doctypeName, ids[];
774
775                 // Read the document type name.
776
requireWhitespace();
777     doctypeName = readNmtoken(true);
778
779                 // Read the ExternalIDs.
780
skipWhitespace();
781     ids = readExternalIds(false);
782
783                 // Look for a declaration subset.
784
skipWhitespace();
785     if (tryRead('[')) {
786
787                 // loop until the subset ends
788
while (true) {
789     context = CONTEXT_DTD;
790     skipWhitespace();
791     context = CONTEXT_NONE;
792     if (tryRead(']')) {
793       break; // end of subset
794
} else {
795       context = CONTEXT_DTD;
796       parseMarkupdecl();
797       context = CONTEXT_NONE;
798     }
799       }
800     }
801
802                 // Read the external subset, if any
803
if (ids[1] != null) {
804       pushURL("[external subset]", ids[0], ids[1], null, null, null);
805
806                 // Loop until we end up back at '>'
807
while (true) {
808     context = CONTEXT_DTD;
809     skipWhitespace();
810     context = CONTEXT_NONE;
811     if (tryRead('>')) {
812       break;
813     } else {
814       context = CONTEXT_DTD;
815       parseMarkupdecl();
816       context = CONTEXT_NONE;
817     }
818       }
819     } else {
820                 // No external subset.
821
skipWhitespace();
822       require('>');
823     }
824
825     if (handler != null) {
826       handler.doctypeDecl(doctypeName, ids[0], ids[1]);
827     }
828
829                 // Expand general entities in
830
// default values of attributes.
831
// (Do this after the doctypeDecl
832
// event!).
833
// expandAttributeDefaultValues();
834
}
835
836
837   /**
838     * Parse a markup declaration in the internal or external DTD subset.
839     * <pre>
840     * [29] markupdecl ::= ( %elementdecl | %AttlistDecl | %EntityDecl |
841     * %NotationDecl | %PI | %S | %Comment |
842     * InternalPERef )
843     * [30] InternalPERef ::= PEReference
844     * [31] extSubset ::= (%markupdecl | %conditionalSect)*
845     * </pre>
846     */

847   void parseMarkupdecl ()
848     throws java.lang.Exception JavaDoc
849   {
850     if (tryRead("<!ELEMENT")) {
851       parseElementdecl();
852     } else if (tryRead("<!ATTLIST")) {
853       parseAttlistDecl();
854     } else if (tryRead("<!ENTITY")) {
855       parseEntityDecl();
856     } else if (tryRead("<!NOTATION")) {
857       parseNotationDecl();
858     } else if (tryRead("<?")) {
859       parsePI();
860     } else if (tryRead("<!--")) {
861       parseComment();
862     } else if (tryRead("<![")) {
863       parseConditionalSect();
864     } else {
865       error("expected markup declaration", null, null);
866     }
867   }
868
869
870   /**
871     * Parse an element, with its tags.
872     * <pre>
873     * [33] STag ::= '&lt;' Name (S Attribute)* S? '&gt;' [WFC: unique Att spec]
874     * [38] element ::= EmptyElement | STag content ETag
875     * [39] EmptyElement ::= '&lt;' Name (S Attribute)* S? '/&gt;'
876     * [WFC: unique Att spec]
877     * </pre>
878     * <p>(The '&lt;' has already been read.)
879     * <p>NOTE: this method actually chains onto parseContent(), if necessary,
880     * and parseContent() will take care of calling parseETag().
881     */

882   void parseElement ()
883     throws java.lang.Exception JavaDoc
884   {
885     String JavaDoc gi;
886     char c;
887     int oldElementContent = currentElementContent;
888     String JavaDoc oldElement = currentElement;
889
890                 // This is the (global) counter for the
891
// array of specified attributes.
892
tagAttributePos = 0;
893
894                 // Read the element type name.
895
gi = readNmtoken(true);
896
897                 // Determine the current content type.
898
currentElement = gi;
899     currentElementContent = getElementContentType(gi);
900     if (currentElementContent == CONTENT_UNDECLARED) {
901       currentElementContent = CONTENT_ANY;
902     }
903
904                 // Read the attributes, if any.
905
// After this loop, we should be just
906
// in front of the closing delimiter.
907
skipWhitespace();
908     c = readCh();
909     while (c != '/' && c != '>') {
910       unread(c);
911       parseAttribute(gi);
912       skipWhitespace();
913       c = readCh();
914     }
915     unread(c);
916
917                 // Supply any defaulted attributes.
918
Enumeration JavaDoc atts = declaredAttributes(gi);
919     if (atts != null) {
920       String JavaDoc aname;
921     loop: while (atts.hasMoreElements()) {
922       aname = (String JavaDoc)atts.nextElement();
923                 // See if it was specified.
924
for (int i = 0; i < tagAttributePos; i++) {
925     if (tagAttributes[i] == aname) {
926       continue loop;
927     }
928       }
929                 // I guess not...
930
if (handler != null) {
931     handler.attribute(aname,
932               getAttributeExpandedValue(gi, aname),
933               false);
934       }
935     }
936     }
937
938                 // Figure out if this is a start tag
939
// or an empty element, and dispatch an
940
// event accordingly.
941
c = readCh();
942     switch (c) {
943     case '>':
944       if (handler != null) {
945     handler.startElement(gi);
946       }
947       parseContent();
948       break;
949     case '/':
950       require('>');
951       if (handler != null) {
952     handler.startElement(gi);
953     handler.endElement(gi);
954       }
955       break;
956     }
957
958                 // Restore the previous state.
959
currentElement = oldElement;
960     currentElementContent = oldElementContent;
961   }
962
963
964   /**
965     * Parse an attribute assignment.
966     * <pre>
967     * [34] Attribute ::= Name Eq AttValue
968     * </pre>
969     * @param name The name of the attribute's element.
970     * @see XmlHandler#attribute
971     */

972   void parseAttribute (String JavaDoc name)
973     throws java.lang.Exception JavaDoc
974   {
975     String JavaDoc aname;
976     int type;
977     String JavaDoc value;
978
979                 // Read the attribute name.
980
aname = readNmtoken(true).intern();
981     type = getAttributeDefaultValueType(name, aname);
982
983                 // Parse '='
984
parseEq();
985
986                 // Read the value, normalizing whitespace
987
// if it is not CDATA.
988
if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) {
989       value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF);
990     } else {
991       value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF | LIT_NORMALIZE);
992     }
993
994                 // Inform the handler about the
995
// attribute.
996
if (handler != null) {
997       handler.attribute(aname, value, true);
998     }
999     dataBufferPos = 0;
1000
1001                // Note that the attribute has been
1002
// specified.
1003
if (tagAttributePos == tagAttributes.length) {
1004      String JavaDoc newAttrib[] = new String JavaDoc[tagAttributes.length * 2];
1005      System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos);
1006      tagAttributes = newAttrib;
1007    }
1008    tagAttributes[tagAttributePos++] = aname;
1009  }
1010
1011
1012  /**
1013    * Parse an equals sign surrounded by optional whitespace.
1014    * [35] Eq ::= S? '=' S?
1015    */

1016  void parseEq ()
1017    throws java.lang.Exception JavaDoc
1018  {
1019    skipWhitespace();
1020    require('=');
1021    skipWhitespace();
1022  }
1023
1024
1025  /**
1026    * Parse an end tag.
1027    * [36] ETag ::= '</' Name S? '>'
1028    * *NOTE: parseContent() chains to here.
1029    */

1030  void parseETag ()
1031    throws java.lang.Exception JavaDoc
1032  {
1033    String JavaDoc name;
1034    name = readNmtoken(true);
1035    if (name != currentElement) {
1036      error("mismatched end tag", name, currentElement);
1037    }
1038    skipWhitespace();
1039    require('>');
1040    if (handler != null) {
1041      handler.endElement(name);
1042    }
1043  }
1044
1045
1046  /**
1047    * Parse the content of an element.
1048    * [37] content ::= (element | PCData | Reference | CDSect | PI | Comment)*
1049    * [68] Reference ::= EntityRef | CharRef
1050    */

1051  void parseContent ()
1052    throws java.lang.Exception JavaDoc
1053  {
1054    String JavaDoc data;
1055    char c;
1056
1057    while (true) {
1058
1059      switch (currentElementContent) {
1060      case CONTENT_ANY:
1061      case CONTENT_MIXED:
1062    parsePCData();
1063    break;
1064      case CONTENT_ELEMENTS:
1065    parseWhitespace();
1066    break;
1067      }
1068
1069                // Handle delimiters
1070
c = readCh();
1071      switch (c) {
1072
1073      case '&': // Found "&"
1074
c = readCh();
1075    if (c == '#') {
1076      parseCharRef();
1077    } else {
1078      unread(c);
1079      parseEntityRef(true);
1080    }
1081    break;
1082
1083      case '<': // Found "<"
1084

1085    c = readCh();
1086    switch (c) {
1087
1088    case '!': // Found "<!"
1089
c = readCh();
1090      switch (c) {
1091      case '-': // Found "<!-"
1092
require('-');
1093        parseComment();
1094        break;
1095      case '[': // Found "<!["
1096
require("CDATA[");
1097        parseCDSect();
1098        break;
1099      default:
1100        error("expected comment or CDATA section", c, null);
1101        break;
1102      }
1103      break;
1104
1105    case '?': // Found "<?"
1106
dataBufferFlush();
1107      parsePI();
1108      break;
1109
1110    case '/': // Found "</"
1111
dataBufferFlush();
1112      parseETag();
1113      return;
1114
1115    default: // Found "<" followed by something else
1116
dataBufferFlush();
1117      unread(c);
1118      parseElement();
1119      break;
1120    }
1121      }
1122    }
1123  }
1124
1125
1126  /**
1127    * Parse an element type declaration.
1128    * [40] elementdecl ::= '<!ELEMENT' S %Name S (%S S)? %contentspec S? '>'
1129    * [VC: Unique Element Declaration]
1130    * *NOTE: the '<!ELEMENT' has already been read.
1131    */

1132  void parseElementdecl ()
1133    throws java.lang.Exception JavaDoc
1134  {
1135    String JavaDoc name;
1136
1137    requireWhitespace();
1138                // Read the element type name.
1139
name = readNmtoken(true);
1140
1141    requireWhitespace();
1142                // Read the content model.
1143
parseContentspec(name);
1144
1145    skipWhitespace();
1146    require('>');
1147  }
1148
1149
1150  /**
1151    * Content specification.
1152    * [41] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1153    */

1154  void parseContentspec (String JavaDoc name)
1155    throws java.lang.Exception JavaDoc
1156  {
1157    if (tryRead("EMPTY")) {
1158      setElement(name, CONTENT_EMPTY, null, null);
1159      return;
1160    } else if (tryRead("ANY")) {
1161      setElement(name, CONTENT_ANY, null, null);
1162      return;
1163    } else {
1164      require('(');
1165      dataBufferAppend('(');
1166      skipWhitespace();
1167      if (tryRead("#PCDATA")) {
1168    dataBufferAppend("#PCDATA");
1169    parseMixed();
1170    setElement(name, CONTENT_MIXED, dataBufferToString(), null);
1171      } else {
1172    parseElements();
1173    setElement(name, CONTENT_ELEMENTS, dataBufferToString(), null);
1174      }
1175    }
1176  }
1177
1178
1179  /**
1180    * Parse an element-content model.
1181    * [42] elements ::= (choice | seq) ('?' | '*' | '+')?
1182    * [44] cps ::= S? %cp S?
1183    * [45] choice ::= '(' S? %ctokplus (S? '|' S? %ctoks)* S? ')'
1184    * [46] ctokplus ::= cps ('|' cps)+
1185    * [47] ctoks ::= cps ('|' cps)*
1186    * [48] seq ::= '(' S? %stoks (S? ',' S? %stoks)* S? ')'
1187    * [49] stoks ::= cps (',' cps)*
1188    * *NOTE: the opening '(' and S have already been read.
1189    * *TODO: go over parameter entity boundaries more carefully.
1190    */

1191  void parseElements ()
1192    throws java.lang.Exception JavaDoc
1193  {
1194    char c;
1195    char sep;
1196
1197                // Parse the first content particle
1198
skipWhitespace();
1199    parseCp();
1200
1201                // Check for end or for a separator.
1202
skipWhitespace();
1203    c = readCh();
1204    switch (c) {
1205    case ')':
1206      dataBufferAppend(')');
1207      c = readCh();
1208      switch (c) {
1209      case '*':
1210      case '+':
1211      case '?':
1212    dataBufferAppend(c);
1213    break;
1214      default:
1215    unread(c);
1216      }
1217      return;
1218    case ',': // Register the separator.
1219
case '|':
1220      sep = c;
1221      dataBufferAppend(c);
1222      break;
1223    default:
1224      error("bad separator in content model", c, null);
1225      return;
1226    }
1227
1228                // Parse the rest of the content model.
1229
while (true) {
1230      skipWhitespace();
1231      parseCp();
1232      skipWhitespace();
1233      c = readCh();
1234      if (c == ')') {
1235    dataBufferAppend(')');
1236    break;
1237      } else if (c != sep) {
1238    error("bad separator in content model", c, null);
1239    return;
1240      } else {
1241    dataBufferAppend(c);
1242      }
1243    }
1244
1245                // Check for the occurrence indicator.
1246
c = readCh();
1247    switch (c) {
1248    case '?':
1249    case '*':
1250    case '+':
1251      dataBufferAppend(c);
1252      return;
1253    default:
1254      unread(c);
1255      return;
1256    }
1257  }
1258
1259
1260  /**
1261    * Parse a content particle.
1262    * [43] cp ::= (Name | choice | seq) ('?' | '*' | '+')
1263    * *NOTE: I actually use a slightly different production here:
1264    * cp ::= (elements | (Name ('?' | '*' | '+')?))
1265    */

1266  void parseCp ()
1267    throws java.lang.Exception JavaDoc
1268  {
1269    char c;
1270
1271    if (tryRead('(')) {
1272      dataBufferAppend('(');
1273      parseElements();
1274    } else {
1275      dataBufferAppend(readNmtoken(true));
1276      c = readCh();
1277      switch (c) {
1278      case '?':
1279      case '*':
1280      case '+':
1281    dataBufferAppend(c);
1282    break;
1283      default:
1284    unread(c);
1285    break;
1286      }
1287    }
1288  }
1289
1290
1291  /**
1292    * Parse mixed content.
1293    * [50] Mixed ::= '(' S? %( %'#PCDATA' (S? '|' S? %Mtoks)* ) S? ')*'
1294    * | '(' S? %('#PCDATA') S? ')'
1295    * [51] Mtoks ::= %Name (S? '|' S? %Name)*
1296    * *NOTE: the S and '#PCDATA' have already been read.
1297    */

1298  void parseMixed ()
1299    throws java.lang.Exception JavaDoc
1300  {
1301    char c;
1302
1303                // Check for PCDATA alone.
1304
skipWhitespace();
1305    if (tryRead(')')) {
1306      dataBufferAppend(")*");
1307      tryRead('*');
1308      return;
1309    }
1310
1311                // Parse mixed content.
1312
skipWhitespace();
1313    while (!tryRead(")*")) {
1314      require('|');
1315      dataBufferAppend('|');
1316      skipWhitespace();
1317      dataBufferAppend(readNmtoken(true));
1318      skipWhitespace();
1319    }
1320    dataBufferAppend(")*");
1321  }
1322
1323
1324  /**
1325    * Parse an attribute list declaration.
1326    * [52] AttlistDecl ::= '<!ATTLIST' S %Name S? %AttDef+ S? '>'
1327    * *NOTE: the '<!ATTLIST' has already been read.
1328    */

1329  void parseAttlistDecl ()
1330    throws java.lang.Exception JavaDoc
1331  {
1332    String JavaDoc elementName;
1333
1334    requireWhitespace();
1335    elementName = readNmtoken(true);
1336    requireWhitespace();
1337    while (!tryRead('>')) {
1338      parseAttDef(elementName);
1339      skipWhitespace();
1340    }
1341  }
1342
1343
1344  /**
1345    * Parse a single attribute definition.
1346    * [53] AttDef ::= S %Name S %AttType S %Default
1347    */

1348  void parseAttDef (String JavaDoc elementName)
1349    throws java.lang.Exception JavaDoc
1350  {
1351    String JavaDoc name;
1352    int type;
1353    String JavaDoc enumeration = null;
1354
1355                // Read the attribute name.
1356
name = readNmtoken(true);
1357
1358                // Read the attribute type.
1359
requireWhitespace();
1360    type = readAttType();
1361
1362                // Get the string of enumerated values
1363
// if necessary.
1364
if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) {
1365      enumeration = dataBufferToString();
1366    }
1367
1368                // Read the default value.
1369
requireWhitespace();
1370    parseDefault(elementName, name, type, enumeration);
1371  }
1372
1373
1374  /**
1375    * Parse the attribute type.
1376    * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1377    * [55] StringType ::= 'CDATA'
1378    * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' |
1379    * 'NMTOKEN' | 'NMTOKENS'
1380    * [57] EnumeratedType ::= NotationType | Enumeration
1381    * *TODO: validate the type!!
1382    */

1383  int readAttType ()
1384    throws java.lang.Exception JavaDoc
1385  {
1386    String JavaDoc typeString;
1387    Integer JavaDoc type;
1388
1389    if (tryRead('(')) {
1390      parseEnumeration();
1391      return ATTRIBUTE_ENUMERATED;
1392    } else {
1393      typeString = readNmtoken(true);
1394      if (typeString.equals("NOTATION")) {
1395    parseNotationType();
1396      }
1397      type = (Integer JavaDoc)attributeTypeHash.get(typeString);
1398      if (type == null) {
1399    error("illegal attribute type", typeString, null);
1400    return ATTRIBUTE_UNDECLARED;
1401      } else {
1402    return type.intValue();
1403      }
1404    }
1405  }
1406
1407
1408  /**
1409    * Parse an enumeration.
1410    * [60] Enumeration ::= '(' S? %Etoks (S? '|' S? %Etoks)* S? ')'
1411    * [61] Etoks ::= %Nmtoken (S? '|' S? %Nmtoken)*
1412    * *NOTE: the '(' has already been read.
1413    */

1414  void parseEnumeration ()
1415    throws java.lang.Exception JavaDoc
1416  {
1417    char c;
1418
1419    dataBufferAppend('(');
1420
1421                // Read the first token.
1422
skipWhitespace();
1423    dataBufferAppend(readNmtoken(true));
1424                // Read the remaining tokens.
1425
skipWhitespace();
1426    while (!tryRead(')')) {
1427      require('|');
1428      dataBufferAppend('|');
1429      skipWhitespace();
1430      dataBufferAppend(readNmtoken(true));
1431      skipWhitespace();
1432    }
1433    dataBufferAppend(')');
1434  }
1435
1436
1437  /**
1438    * Parse a notation type for an attribute.
1439    * [58] NotationType ::= %'NOTATION' S '(' S? %Ntoks (S? '|' S? %Ntoks)*
1440    * S? ')'
1441    * [59] Ntoks ::= %Name (S? '|' S? %Name)
1442    * *NOTE: the 'NOTATION' has already been read
1443    */

1444  void parseNotationType ()
1445    throws java.lang.Exception JavaDoc
1446  {
1447    requireWhitespace();
1448    require('(');
1449
1450    parseEnumeration();
1451  }
1452
1453
1454  /**
1455    * Parse the default value for an attribute.
1456    * [62] Default ::= '#REQUIRED' | '#IMPLIED' | ((%'#FIXED' S)? %AttValue
1457    */

1458  void parseDefault (String JavaDoc elementName, String JavaDoc name, int type, String JavaDoc enumeration)
1459    throws java.lang.Exception JavaDoc
1460  {
1461    int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
1462    String JavaDoc value = null;
1463    boolean normalizeWSFlag;
1464
1465    if (tryRead('#')) {
1466      if (tryRead("FIXED")) {
1467    valueType = ATTRIBUTE_DEFAULT_FIXED;
1468    requireWhitespace();
1469    context = CONTEXT_ATTRIBUTEVALUE;
1470    value = readLiteral(LIT_CHAR_REF);
1471    context = CONTEXT_DTD;
1472      } else if (tryRead("REQUIRED")) {
1473    valueType = ATTRIBUTE_DEFAULT_REQUIRED;
1474      } else if (tryRead("IMPLIED")) {
1475    valueType = ATTRIBUTE_DEFAULT_IMPLIED;
1476      } else {
1477    error("illegal keyword for attribute default value", null, null);
1478      }
1479    } else {
1480      context = CONTEXT_ATTRIBUTEVALUE;
1481      value = readLiteral(LIT_CHAR_REF);
1482      context = CONTEXT_DTD;
1483    }
1484    setAttribute(elementName, name, type, enumeration, value, valueType);
1485  }
1486
1487
1488  /**
1489    * Parse a conditional section.
1490    * [63] conditionalSect ::= includeSect || ignoreSect
1491    * [64] includeSect ::= '<![' %'INCLUDE' '[' (%markupdecl*)* ']]>'
1492    * [65] ignoreSect ::= '<![' %'IGNORE' '[' ignoreSectContents* ']]>'
1493    * [66] ignoreSectContents ::= ((SkipLit | Comment | PI) -(Char* ']]>'))
1494    * | ('<![' ignoreSectContents* ']]>')
1495    * | (Char - (']' | [<'"]))
1496    * | ('<!' (Char - ('-' | '[')))
1497    * *NOTE: the '<![' has already been read.
1498    * *TODO: verify that I am handling ignoreSectContents right.
1499    */

1500  void parseConditionalSect ()
1501    throws java.lang.Exception JavaDoc
1502  {
1503    skipWhitespace();
1504    if (tryRead("INCLUDE")) {
1505      skipWhitespace();
1506      require('[');
1507      skipWhitespace();
1508      while (!tryRead("]]>")) {
1509    parseMarkupdecl();
1510    skipWhitespace();
1511      }
1512    } else if (tryRead("IGNORE")) {
1513      skipWhitespace();
1514      require('[');
1515      int nesting = 1;
1516      char c;
1517      for (int nest = 1; nest > 0; ) {
1518    c = readCh();
1519    switch (c) {
1520    case '<':
1521      if (tryRead("![")) {
1522        nest++;
1523      }
1524    case ']':
1525      if (tryRead("]>")) {
1526        nest--;
1527      }
1528    }
1529      }
1530    } else {
1531      error("conditional section must begin with INCLUDE or IGNORE",
1532        null, null);
1533    }
1534  }
1535
1536
1537  /**
1538    * Read a character reference.
1539    * [67] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
1540    * *NOTE: the '&#' has already been read.
1541    */

1542  void parseCharRef ()
1543    throws java.lang.Exception JavaDoc
1544  {
1545    int value = 0;
1546    char c;
1547
1548    if (tryRead('x')) {
1549      loop1: while (true) {
1550    c = readCh();
1551    switch (c) {
1552    case '0':
1553    case '1':
1554    case '2':
1555    case '3':
1556    case '4':
1557    case '5':
1558    case '6':
1559    case '7':
1560    case '8':
1561    case '9':
1562    case 'a':
1563    case 'A':
1564    case 'b':
1565    case 'B':
1566    case 'c':
1567    case 'C':
1568    case 'd':
1569    case 'D':
1570    case 'e':
1571    case 'E':
1572    case 'f':
1573    case 'F':
1574      value *= 16;
1575      value += Integer.parseInt(new Character JavaDoc(c).toString(), 16);
1576      break;
1577    case ';':
1578      break loop1;
1579    default:
1580      error("illegal character in character reference", c, null);
1581      break loop1;
1582    }
1583      }
1584    } else {
1585      loop2: while (true) {
1586    c = readCh();
1587    switch (c) {
1588    case '0':
1589    case '1':
1590    case '2':
1591    case '3':
1592    case '4':
1593    case '5':
1594    case '6':
1595    case '7':
1596    case '8':
1597    case '9':
1598      value *= 10;
1599      value += Integer.parseInt(new Character JavaDoc(c).toString(), 10);
1600      break;
1601    case ';':
1602      break loop2;
1603    default:
1604      error("illegal character in character reference", c, null);
1605      break loop2;
1606    }
1607      }
1608    }
1609
1610    // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
1611
// (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
1612
if (value <= 0x0000ffff) {
1613                // no surrogates needed
1614
dataBufferAppend((char)value);
1615    } else if (value <= 0x000fffff) {
1616                // > 16 bits, surrogate needed
1617
dataBufferAppend((char)(0xd8 | ((value & 0x000ffc00) >> 10)));
1618      dataBufferAppend((char)(0xdc | (value & 0x0003ff)));
1619    } else {
1620                // too big for surrogate
1621
error("character reference " + value + " is too large for UTF-16",
1622        new Integer JavaDoc(value).toString(), null);
1623    }
1624  }
1625
1626
1627  /**
1628    * Parse a reference.
1629    * [69] EntityRef ::= '&' Name ';'
1630    * *NOTE: the '&' has already been read.
1631    * @param externalAllowed External entities are allowed here.
1632    */

1633  void parseEntityRef (boolean externalAllowed)
1634    throws java.lang.Exception JavaDoc
1635  {
1636    String JavaDoc name;
1637
1638    name = readNmtoken(true);
1639    require(';');
1640    switch (getEntityType(name)) {
1641    case ENTITY_UNDECLARED:
1642      error("reference to undeclared entity", name, null);
1643      break;
1644    case ENTITY_INTERNAL:
1645      pushString(name, getEntityValue(name));
1646      break;
1647    case ENTITY_TEXT:
1648      if (externalAllowed) {
1649    pushURL(name, getEntityPublicId(name),
1650        getEntitySystemId(name),
1651        null, null, null);
1652      } else {
1653    error("reference to external entity in attribute value.", name, null);
1654      }
1655      break;
1656    case ENTITY_NDATA:
1657      if (externalAllowed) {
1658    error("data entity reference in content", name, null);
1659      } else {
1660    error("reference to external entity in attribute value.", name, null);
1661      }
1662      break;
1663    }
1664  }
1665
1666
1667  /**
1668    * Parse a parameter entity reference.
1669    * [70] PEReference ::= '%' Name ';'
1670    * *NOTE: the '%' has already been read.
1671    */

1672  void parsePEReference (boolean isEntityValue)
1673    throws java.lang.Exception JavaDoc
1674  {
1675    String JavaDoc name;
1676
1677    name = "%" + readNmtoken(true);
1678    require(';');
1679    switch (getEntityType(name)) {
1680    case ENTITY_UNDECLARED:
1681      error("reference to undeclared parameter entity", name, null);
1682      break;
1683    case ENTITY_INTERNAL:
1684      if (isEntityValue) {
1685    pushString(name, getEntityValue(name));
1686      } else {
1687    pushString(name, " " + getEntityValue(name) + ' ');
1688      }
1689      break;
1690    case ENTITY_TEXT:
1691      if (isEntityValue) {
1692    pushString(null, " ");
1693      }
1694      pushURL(name, getEntityPublicId(name),
1695          getEntitySystemId(name),
1696          null, null, null);
1697      if (isEntityValue) {
1698    pushString(null, " ");
1699      }
1700      break;
1701    }
1702  }
1703
1704
1705  /**
1706    * Parse an entity declaration.
1707    * [71] EntityDecl ::= '<!ENTITY' S %Name S %EntityDef S? '>'
1708    * | '<!ENTITY' S '%' S %Name S %EntityDef S? '>'
1709    * [72] EntityDef ::= EntityValue | ExternalDef
1710    * [73] ExternalDef ::= ExternalID %NDataDecl?
1711    * [74] ExternalID ::= 'SYSTEM' S SystemLiteral
1712    * | 'PUBLIC' S PubidLiteral S SystemLiteral
1713    * [75] NDataDecl ::= S %'NDATA' S %Name
1714    * *NOTE: the '<!ENTITY' has already been read.
1715    */

1716  void parseEntityDecl ()
1717    throws java.lang.Exception JavaDoc
1718  {
1719    char c;
1720    boolean peFlag = false;
1721    String JavaDoc name, value, notationName, ids[];
1722
1723                // Check for a parameter entity.
1724
requireWhitespace();
1725    if (tryRead('%')) {
1726      peFlag = true;
1727      requireWhitespace();
1728    }
1729
1730                // Read the entity name, and prepend
1731
// '%' if necessary.
1732
name = readNmtoken(true);
1733    if (peFlag) {
1734      name = "%" + name;
1735    }
1736
1737                // Read the entity value.
1738
requireWhitespace();
1739    c = readCh();
1740    unread(c);
1741    if (c == '"' || c == '\'') {
1742                // Internal entity.
1743
context = CONTEXT_ENTITYVALUE;
1744      value = readLiteral(LIT_CHAR_REF|LIT_PE_REF);
1745      context = CONTEXT_DTD;
1746      setInternalEntity(name,value);
1747    } else {
1748                // Read the external IDs
1749
ids = readExternalIds(false);
1750      if (ids[1] == null) {
1751    error("system identifer missing", name, null);
1752      }
1753
1754                // Check for NDATA declaration.
1755
skipWhitespace();
1756      if (tryRead("NDATA")) {
1757    requireWhitespace();
1758    notationName = readNmtoken(true);
1759    setExternalDataEntity(name, ids[0], ids[1], notationName);
1760      } else {
1761    setExternalTextEntity(name, ids[0], ids[1]);
1762      }
1763    }
1764
1765                // Finish the declaration.
1766
skipWhitespace();
1767    require('>');
1768  }
1769
1770
1771  /**
1772    * Parse a notation declaration.
1773    * [81] NotationDecl ::= '<!NOTATION' S %Name S %ExternalID S? '>'
1774    * *NOTE: the '<!NOTATION' has already been read.
1775    */

1776  void parseNotationDecl ()
1777    throws java.lang.Exception JavaDoc
1778  {
1779    String JavaDoc nname, ids[];
1780    
1781
1782    requireWhitespace();
1783    nname = readNmtoken(true);
1784
1785    requireWhitespace();
1786
1787                // Read the external identifiers.
1788
ids = readExternalIds(true);
1789    if (ids[0] == null && ids[1] == null) {
1790      error("external identifer missing", nname, null);
1791    }
1792
1793                // Register the notation.
1794
setNotation(nname, ids[0], ids[1]);
1795
1796    skipWhitespace();
1797    require('>');
1798  }
1799
1800
1801  /**
1802    * Parse PCDATA.
1803    * <pre>
1804    * [16] PCData ::= [^&lt;&amp;]*
1805    * </pre>
1806    * <p>The trick here is that the data stays in the dataBuffer without
1807    * necessarily being converted to a string right away.
1808    */

1809  void parsePCData ()
1810    throws java.lang.Exception JavaDoc
1811  {
1812    char c;
1813
1814                // Start with a little cheat -- in most
1815
// cases, the entire sequence of
1816
// character data will already be in
1817
// the readBuffer; if not, fall through to
1818
// the normal approach.
1819
if (USE_CHEATS) {
1820      int lineAugment = 0;
1821      int columnAugment = 0;
1822
1823      loop: for (int i = readBufferPos; i < readBufferLength; i++) {
1824    switch (readBuffer[i]) {
1825    case '\n':
1826      lineAugment++;
1827      columnAugment = 0;
1828      break;
1829    case '&':
1830    case '<':
1831      int start = readBufferPos;
1832      columnAugment++;
1833      readBufferPos = i;
1834      if (lineAugment > 0) {
1835        line += lineAugment;
1836        column = columnAugment;
1837      } else {
1838        column += columnAugment;
1839      }
1840      dataBufferAppend(readBuffer, start, i-start);
1841      return;
1842    default:
1843      columnAugment++;
1844    }
1845      }
1846    }
1847
1848                // OK, the cheat didn't work; start over
1849
// and do it by the book.
1850
while (true) {
1851      c = readCh();
1852      switch (c) {
1853      case '<':
1854      case '&':
1855    unread(c);
1856    return;
1857      default:
1858    dataBufferAppend(c);
1859    break;
1860      }
1861    }
1862  }
1863
1864
1865
//////////////////////////////////////////////////////////////////////
// High-level reading and scanning methods.
//////////////////////////////////////////////////////////////////////
1869

1870  /**
1871    * Require whitespace characters.
1872    * [1] S ::= (#x20 | #x9 | #xd | #xa)+
1873    */

1874  void requireWhitespace ()
1875    throws java.lang.Exception JavaDoc
1876  {
1877    char c = readCh();
1878    if (isWhitespace(c)) {
1879      skipWhitespace();
1880    } else {
1881      error("whitespace expected", c, null);
1882    }
1883  }
1884
1885
1886  /**
1887    * Parse whitespace characters, and leave them in the data buffer.
1888    */

1889  void parseWhitespace ()
1890    throws java.lang.Exception JavaDoc
1891  {
1892    char c = readCh();
1893    while (isWhitespace(c)) {
1894      dataBufferAppend(c);
1895      c = readCh();
1896    }
1897    unread(c);
1898  }
1899
1900
1901  /**
1902    * Skip whitespace characters.
1903    * [1] S ::= (#x20 | #x9 | #xd | #xa)+
1904    */

1905  void skipWhitespace ()
1906    throws java.lang.Exception JavaDoc
1907  {
1908                // Start with a little cheat. Most of
1909
// the time, the white space will fall
1910
// within the current read buffer; if
1911
// not, then fall through.
1912
if (USE_CHEATS) {
1913      int lineAugment = 0;
1914      int columnAugment = 0;
1915
1916      loop: for (int i = readBufferPos; i < readBufferLength; i++) {
1917    switch (readBuffer[i]) {
1918    case ' ':
1919    case '\t':
1920    case '\r':
1921      columnAugment++;
1922      break;
1923    case '\n':
1924      lineAugment++;
1925      columnAugment = 0;
1926      break;
1927    case '%':
1928      if (context == CONTEXT_DTD || context == CONTEXT_ENTITYVALUE) {
1929        break loop;
1930      } // else fall through...
1931
default:
1932      readBufferPos = i;
1933      if (lineAugment > 0) {
1934        line += lineAugment;
1935        column = columnAugment;
1936      } else {
1937        column += columnAugment;
1938      }
1939      return;
1940    }
1941      }
1942    }
1943
1944                // OK, do it by the book.
1945
char c = readCh();
1946    while (isWhitespace(c)) {
1947      c = readCh();
1948    }
1949    unread(c);
1950  }
1951
1952
1953  /**
1954    * Read a name or name token.
1955    * [5] Name ::= (Letter | '_' | ':') (NameChar)*
1956    * [7] Nmtoken ::= (NameChar)+
1957    * *NOTE: [6] is implemented implicitly where required.
1958    */

1959  String JavaDoc readNmtoken (boolean isName)
1960    throws java.lang.Exception JavaDoc
1961  {
1962    char c;
1963
1964    if (USE_CHEATS) {
1965      loop: for (int i = readBufferPos; i < readBufferLength; i++) {
1966    switch (readBuffer[i]) {
1967    case '%':
1968      if (context == CONTEXT_DTD || context == CONTEXT_ENTITYVALUE) {
1969        break loop;
1970      } // else fall through...
1971
case '<':
1972    case '>':
1973    case '&':
1974    case ',':
1975    case '|':
1976    case '*':
1977    case '+':
1978    case '?':
1979    case ')':
1980    case '=':
1981    case '\'':
1982    case '"':
1983    case '[':
1984    case ' ':
1985    case '\t':
1986    case '\r':
1987    case '\n':
1988    case ';':
1989    case '/':
1990    case '#':
1991      int start = readBufferPos;
1992      if (i == start) {
1993        error("name expected", readBuffer[i], null);
1994      }
1995      readBufferPos = i;
1996      return intern(readBuffer, start, i - start);
1997    }
1998      }
1999    }
2000
2001    nameBufferPos = 0;
2002
2003                // Read the first character.
2004
loop: while (true) {
2005      c = readCh();
2006      switch (c) {
2007      case '%':
2008      case '<':
2009      case '>':
2010      case '&':
2011      case ',':
2012      case '|':
2013      case '*':
2014      case '+':
2015      case '?':
2016      case ')':
2017      case '=':
2018      case '\'':
2019      case '"':
2020      case '[':
2021      case ' ':
2022      case '\t':
2023      case '\n':
2024      case '\r':
2025      case ';':
2026      case '/':
2027    unread(c);
2028    if (nameBufferPos == 0) {
2029      error("name expected", null, null);
2030    }
2031    String JavaDoc s = intern(nameBuffer,0,nameBufferPos);
2032    nameBufferPos = 0;
2033    return s;
2034      default:
2035    nameBuffer =
2036      (char[])extendArray(nameBuffer, nameBuffer.length, nameBufferPos);
2037    nameBuffer[nameBufferPos++] = c;
2038      }
2039    }
2040  }
2041
2042
2043  /**
2044    * Read a literal.
2045    * [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
2046    * | "'" ([^<&'] | Reference)* "'"
2047    * [11] SystemLiteral ::= '"' URLchar* '"' | "'" (URLchar - "'")* "'"
2048    * [13] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2049    * [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
2050    * | "'" ([^%&'] | PEReference | Reference)* "'"
2051    */

2052  String JavaDoc readLiteral (int flags)
2053    throws java.lang.Exception JavaDoc
2054  {
2055    char delim, c;
2056    int startLine = line;
2057
2058                // Find the delimiter.
2059
delim = readCh();
2060    if (delim != '"' && delim != '\'' && delim != (char)0) {
2061      error("expected '\"' or \"'\"", delim, null);
2062      return null;
2063    }
2064
2065                // Read the literal.
2066
try {
2067      c = readCh();
2068
2069    loop: while (c != delim) {
2070      switch (c) {
2071                // Literals never have line ends
2072
case '\n':
2073      case '\r':
2074    c = ' ';
2075    break;
2076                // References may be allowed
2077
case '&':
2078    if ((flags & LIT_CHAR_REF) > 0) {
2079      c = readCh();
2080      if (c == '#') {
2081        parseCharRef();
2082        c = readCh();
2083        continue loop; // check the next character
2084
} else if ((flags & LIT_ENTITY_REF) > 0) {
2085        unread(c);
2086        parseEntityRef(false);
2087        c = readCh();
2088        continue loop;
2089      } else {
2090        dataBufferAppend('&');
2091      }
2092    }
2093    break;
2094
2095      default:
2096    break;
2097      }
2098      dataBufferAppend(c);
2099      c = readCh();
2100    }
2101    } catch (EOFException JavaDoc e) {
2102      error("end of input while looking for delimiter (started on line "
2103        + startLine + ')', null, new Character JavaDoc(delim).toString());
2104    }
2105
2106                // Normalise whitespace if necessary.
2107
if ((flags & LIT_NORMALIZE) > 0) {
2108      dataBufferNormalize();
2109    }
2110
2111                // Return the value.
2112
return dataBufferToString();
2113  }
2114
2115
2116  /**
2117    * Try reading external identifiers.
2118    * <p>The system identifier is not required for notations.
2119    * @param inNotation Are we in a notation?
2120    * @return A two-member String array containing the identifiers.
2121    */

2122  String JavaDoc[] readExternalIds (boolean inNotation)
2123    throws java.lang.Exception JavaDoc
2124  {
2125    char c;
2126    String JavaDoc ids[] = new String JavaDoc[2];
2127
2128    if (tryRead("PUBLIC")) {
2129      requireWhitespace();
2130      ids[0] = readLiteral(LIT_NORMALIZE); // public id
2131
if (inNotation) {
2132    skipWhitespace();
2133    if (tryRead('"') || tryRead('\'')) {
2134      ids[1] = readLiteral(0);
2135    }
2136      } else {
2137    requireWhitespace();
2138    ids[1] = readLiteral(0); // system id
2139
}
2140    } else if (tryRead("SYSTEM")) {
2141      requireWhitespace();
2142      ids[1] = readLiteral(0); // system id
2143
}
2144
2145    return ids;
2146  }
2147
2148
2149  /**
2150    * Test if a character is whitespace.
2151    * <pre>
2152    * [1] S ::= (#x20 | #x9 | #xd | #xa)+
2153    * </pre>
2154    * @param c The character to test.
2155    * @return true if the character is whitespace.
2156    */

2157  final boolean isWhitespace (char c)
2158  {
2159    switch ((int)c) {
2160    case 0x20:
2161    case 0x09:
2162    case 0x0d:
2163    case 0x0a:
2164      return true;
2165    default:
2166      return false;
2167    }
2168  }
2169
2170
2171
//////////////////////////////////////////////////////////////////////
// Utility routines.
//////////////////////////////////////////////////////////////////////
2175

2176
2177  /**
2178    * Add a character to the data buffer.
2179    */

2180  void dataBufferAppend (char c)
2181  {
2182                // Expand buffer if necessary.
2183
dataBuffer =
2184      (char[])extendArray(dataBuffer, dataBuffer.length, dataBufferPos);
2185    dataBuffer[dataBufferPos++] = c;
2186  }
2187
2188
2189  /**
2190    * Add a string to the data buffer.
2191    */

2192  void dataBufferAppend (String JavaDoc s)
2193  {
2194    dataBufferAppend(s.toCharArray(), 0, s.length());
2195  }
2196
2197
2198  /**
2199    * Append (part of) a character array to the data buffer.
2200    */

2201  void dataBufferAppend (char ch[], int start, int length)
2202  {
2203    dataBuffer =
2204      (char[])extendArray(dataBuffer, dataBuffer.length,
2205              dataBufferPos + length);
2206    System.arraycopy((Object JavaDoc)ch, start,
2207             (Object JavaDoc)dataBuffer, dataBufferPos,
2208             length);
2209    dataBufferPos += length;
2210  }
2211
2212
2213  /**
2214    * Normalise whitespace in the data buffer.
2215    */

2216  void dataBufferNormalize ()
2217  {
2218    int i = 0;
2219    int j = 0;
2220    int end = dataBufferPos;
2221
2222                // Skip whitespace at the start.
2223
while (j < end && isWhitespace(dataBuffer[j])) {
2224      j++;
2225    }
2226
2227                // Skip whitespace at the end.
2228
while (end > j && isWhitespace(dataBuffer[end - 1])) {
2229      end --;
2230    }
2231
2232                // Start copying to the left.
2233
while (j < end) {
2234
2235      char c = dataBuffer[j++];
2236
2237                // Normalise all other whitespace to
2238
// a single space.
2239
if (isWhitespace(c)) {
2240    while (j < end && isWhitespace(dataBuffer[j++])) {
2241    }
2242    dataBuffer[i++] = ' ';
2243    dataBuffer[i++] = dataBuffer[j-1];
2244      } else {
2245    dataBuffer[i++] = c;
2246      }
2247    }
2248
2249                // The new length is <= the old one.
2250
dataBufferPos = i;
2251  }
2252
2253
2254  /**
2255    * Convert the data buffer to a string.
2256    * @param internFlag true if the contents should be interned.
2257    * @see #intern(char[],int,int)
2258    */

2259  String JavaDoc dataBufferToString ()
2260  {
2261    String JavaDoc s = new String JavaDoc(dataBuffer, 0, dataBufferPos);
2262    dataBufferPos = 0;
2263    return s;
2264  }
2265
2266
2267  /**
2268    * Flush the contents of the data buffer to the handler, if
2269    * appropriate, and reset the buffer for new input.
2270    */

2271  void dataBufferFlush ()
2272    throws java.lang.Exception JavaDoc
2273  {
2274    if (dataBufferPos > 0) {
2275      switch (currentElementContent) {
2276      case CONTENT_UNDECLARED:
2277      case CONTENT_EMPTY:
2278    // do nothing
2279
break;
2280      case CONTENT_MIXED:
2281      case CONTENT_ANY:
2282    if (handler != null) {
2283      handler.charData(dataBuffer, 0, dataBufferPos);
2284    }
2285    break;
2286      case CONTENT_ELEMENTS:
2287    if (handler != null) {
2288      handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos);
2289    }
2290    break;
2291      }
2292      dataBufferPos = 0;
2293    }
2294  }
2295
2296
2297  /**
2298    * Require a string to appear, or throw an exception.
2299    */

2300  void require (String JavaDoc delim)
2301    throws java.lang.Exception JavaDoc
2302  {
2303    char ch[] = delim.toCharArray();
2304    for (int i = 0; i < ch.length; i++) {
2305      require(ch[i]);
2306    }
2307  }
2308
2309
2310  /**
2311    * Require a character to appear, or throw an exception.
2312    */

2313  void require (char delim)
2314       throws java.lang.Exception JavaDoc
2315  {
2316    char c = readCh();
2317
2318    if (c != delim) {
2319      error("expected character", c, new Character JavaDoc(delim).toString());
2320    }
2321  }
2322
2323
2324  /**
2325    * Return an internalised version of a string.
2326    * <p>&AElig;lfred uses this method to create an internalised version
2327    * of all names and attribute values, so that it can test equality
2328    * with <code>==</code> instead of <code>String.equals()</code>.
2329    * <p>If you want to be able to test for equality in the same way,
2330    * you can use this method to internalise your own strings first:
2331    * <pre>
2332    * String PARA = handler.intern("PARA");
2333    * </pre>
2334    * <p>Note that this will not return the same results as String.intern().
2335    * @param s The string to internalise.
2336    * @return An internalised version of the string.
2337    * @see #intern(char[],int,int)
2338    * @see java.lang.String#intern
2339    */

2340  public String JavaDoc intern (String JavaDoc s)
2341  {
2342    char ch[] = s.toCharArray();
2343    return intern(ch, 0, ch.length);
2344  }
2345
2346
2347  /**
2348    * Create an internalised string from a character array.
2349    * <p>This is much more efficient than constructing a non-internalised
2350    * string first, and then internalising it.
2351    * <p>Note that this will not return the same results as String.intern().
2352    * @param ch an array of characters for building the string.
2353    * @param start the starting position in the array.
2354    * @param length the number of characters to place in the string.
2355    * @return an internalised string.
2356    * @see #intern(String)
2357    * @see java.lang.String#intern
2358    */

2359  public String JavaDoc intern (char ch[], int start, int length)
2360  {
2361    int index;
2362    int hash = 0;
2363
2364                // Generate a hash code.
2365
for (int i = start; i < start + length; i++) {
2366      hash = ((hash << 1) & 0xffffff) + (int)ch[i];
2367    }
2368
2369    hash = hash % SYMBOL_TABLE_LENGTH;
2370
2371                // Get the bucket.
2372
Object JavaDoc bucket[] = (Object JavaDoc[])symbolTable[hash];
2373    if (bucket == null) {
2374      symbolTable[hash] = bucket = new Object JavaDoc[8];
2375    }
2376
2377                // Search for a matching tuple, and
2378
// return the string if we find one.
2379
for (index = 0; index < bucket.length; index += 2) {
2380      char chFound[] = (char[])bucket[index];
2381
2382                // Stop when we hit a null index.
2383
if (chFound == null) {
2384    break;
2385      }
2386
2387                // If they're the same length,
2388
// check for a match.
2389
// If the loop finishes, 'index' will
2390
// contain the current bucket
2391
// position.
2392
if (chFound.length == length) {
2393    for (int i = 0; i < chFound.length; i++) {
2394                // Stop if there are no more tuples.
2395
if (ch[start+i] != chFound[i]) {
2396        break;
2397      } else if (i == length-1) {
2398                // That's it, we have a match!
2399
return (String JavaDoc)bucket[index+1];
2400      }
2401    }
2402      }
2403    }
2404
2405                // Not found -- we'll have to add it.
2406

2407                // Do we have to grow the bucket?
2408
bucket =
2409      (Object JavaDoc[])extendArray(bucket, bucket.length, index);
2410
2411                // OK, add it to the end of the
2412
// bucket.
2413
String JavaDoc s = new String JavaDoc(ch, start, length);
2414    bucket[index] = s.toCharArray();
2415    bucket[index+1] = s;
2416    symbolTable[hash] = bucket;
2417    return s;
2418  }
2419
2420
2421  /**
2422    * Ensure the capacity of an array, allocating a new one if
2423    * necessary.
2424    */

2425  Object JavaDoc extendArray (Object JavaDoc array, int currentSize, int requiredSize)
2426  {
2427    if (requiredSize < currentSize) {
2428      return array;
2429    } else {
2430      Object JavaDoc newArray = null;
2431      int newSize = currentSize * 2;
2432
2433      if (newSize <= requiredSize) {
2434    newSize = requiredSize + 1;
2435      }
2436
2437      if (array instanceof char[]) {
2438    newArray = new char[newSize];
2439      } else if (array instanceof Object JavaDoc[]) {
2440    newArray = new Object JavaDoc[newSize];
2441      }
2442
2443      System.arraycopy(array, 0, newArray, 0, currentSize);
2444      return newArray;
2445    }
2446  }
2447
2448
2449
2450  //////////////////////////////////////////////////////////////////////
2451
// XML query routines.
2452
//////////////////////////////////////////////////////////////////////
2453

2454
2455  //
2456
// Elements
2457
//
2458

2459  /**
2460    * Get the declared elements for an XML document.
2461    * <p>The results will be valid only after the DTD (if any) has been
2462    * parsed.
2463    * @return An enumeration of all element types declared for this
2464    * document (as Strings).
2465    * @see #getElementContentType
2466    * @see #getElementContentModel
2467    */

2468  public Enumeration JavaDoc declaredElements ()
2469  {
2470    return elementInfo.keys();
2471  }
2472
2473
2474  /**
2475    * Look up the content type of an element.
2476    * @param name The element type name.
2477    * @return An integer constant representing the content type.
2478    * @see #getElementContentModel
2479    * @see #CONTENT_UNDECLARED
2480    * @see #CONTENT_ANY
2481    * @see #CONTENT_EMPTY
2482    * @see #CONTENT_MIXED
2483    * @see #CONTENT_ELEMENTS
2484    */

2485  public int getElementContentType (String JavaDoc name)
2486  {
2487    Object JavaDoc element[] = (Object JavaDoc[])elementInfo.get(name);
2488    if (element == null) {
2489      return CONTENT_UNDECLARED;
2490    } else {
2491      return ((Integer JavaDoc)element[0]).intValue();
2492    }
2493  }
2494
2495
2496  /**
2497    * Look up the content model of an element.
2498    * <p>The result will always be null unless the content type is
2499    * CONTENT_ELEMENTS or CONTENT_MIXED.
2500    * @param name The element type name.
2501    * @return The normalised content model, as a string.
2502    * @see #getElementContentType
2503    */

2504  public String JavaDoc getElementContentModel (String JavaDoc name)
2505  {
2506    Object JavaDoc element[] = (Object JavaDoc[])elementInfo.get(name);
2507    if (element == null) {
2508      return null;
2509    } else {
2510      return (String JavaDoc)element[1];
2511    }
2512  }
2513
2514
2515  /**
2516    * Register an element.
2517    * Array format:
2518    * element type
2519    * attribute hash table
2520    */

2521  void setElement (String JavaDoc name, int contentType,
2522           String JavaDoc contentModel, Hashtable JavaDoc attributes)
2523    throws java.lang.Exception JavaDoc
2524  {
2525    Object JavaDoc element[];
2526
2527                // Try looking up the element
2528
element = (Object JavaDoc[])elementInfo.get(name);
2529
2530                // Make a new one if necessary.
2531
if (element == null) {
2532      element = new Object JavaDoc[3];
2533      element[0] = new Integer JavaDoc(CONTENT_UNDECLARED);
2534      element[1] = null;
2535      element[2] = null;
2536    } else if (contentType != CONTENT_UNDECLARED &&
2537           ((Integer JavaDoc)element[0]).intValue() != CONTENT_UNDECLARED) {
2538      error("multiple declarations for element type", name, null);
2539      return;
2540    }
2541
2542                // Insert the content type, if any.
2543
if (contentType != CONTENT_UNDECLARED) {
2544      element[0] = new Integer JavaDoc(contentType);
2545    }
2546
2547                // Insert the content model, if any.
2548
if (contentModel != null) {
2549      element[1] = contentModel;
2550    }
2551
2552                // Insert the attributes, if any.
2553
if (attributes != null) {
2554      element[2] =attributes;
2555    }
2556
2557                // Save the element info.
2558
elementInfo.put(name,element);
2559  }
2560
2561
2562  /**
2563    * Look up the attribute hash table for an element.
2564    * The hash table is the second item in the element array.
2565    */

2566  Hashtable JavaDoc getElementAttributes (String JavaDoc name)
2567  {
2568    Object JavaDoc element[] = (Object JavaDoc[])elementInfo.get(name);
2569    if (element == null) {
2570      return null;
2571    } else {
2572      return (Hashtable JavaDoc)element[2];
2573    }
2574  }
2575
2576
2577
2578  //
2579
// Attributes
2580
//
2581

2582  /**
2583    * Get the declared attributes for an element type.
2584    * @param elname The name of the element type.
2585    * @return An Enumeration of all the attributes declared for
2586    * a specific element type. The results will be valid only
2587    * after the DTD (if any) has been parsed.
2588    * @see #getAttributeType
2589    * @see #getAttributeEnumeration
2590    * @see #getAttributeDefaultValueType
2591    * @see #getAttributeDefaultValue
2592    * @see #getAttributeExpandedValue
2593    */

2594  public Enumeration JavaDoc declaredAttributes (String JavaDoc elname)
2595  {
2596    Hashtable JavaDoc attlist = getElementAttributes(elname);
2597
2598    if (attlist == null) {
2599      return null;
2600    } else {
2601      return attlist.keys();
2602    }
2603  }
2604
2605
2606  /**
2607    * Retrieve the declared type of an attribute.
2608    * @param name The name of the associated element.
2609    * @param aname The name of the attribute.
2610    * @return An integer constant representing the attribute type.
2611    * @see #ATTRIBUTE_UNDECLARED
2612    * @see #ATTRIBUTE_CDATA
2613    * @see #ATTRIBUTE_ID
2614    * @see #ATTRIBUTE_IDREF
2615    * @see #ATTRIBUTE_IDREFS
2616    * @see #ATTRIBUTE_ENTITY
2617    * @see #ATTRIBUTE_ENTITIES
2618    * @see #ATTRIBUTE_NMTOKEN
2619    * @see #ATTRIBUTE_NMTOKENS
2620    * @see #ATTRIBUTE_ENUMERATED
2621    * @see #ATTRIBUTE_NOTATION
2622    */

2623  public int getAttributeType (String JavaDoc name, String JavaDoc aname)
2624  {
2625    Object JavaDoc attribute[] = getAttribute(name, aname);
2626    if (attribute == null) {
2627      return ATTRIBUTE_UNDECLARED;
2628    } else {
2629      return ((Integer JavaDoc)attribute[0]).intValue();
2630    }
2631  }
2632
2633
2634  /**
2635    * Retrieve the allowed values for an enumerated attribute type.
2636    * @param name The name of the associated element.
2637    * @param aname The name of the attribute.
2638    * @return A string containing the token list.
2639    * @see #ATTRIBUTE_ENUMERATED
2640    * @see #ATTRIBUTE_NOTATION
2641    */

2642  public String JavaDoc getAttributeEnumeration (String JavaDoc name, String JavaDoc aname)
2643  {
2644    Object JavaDoc attribute[] = getAttribute(name, aname);
2645    if (attribute == null) {
2646      return null;
2647    } else {
2648      return (String JavaDoc)attribute[3];
2649    }
2650  }
2651
2652
2653  /**
2654    * Retrieve the default value of a declared attribute.
2655    * @param name The name of the associated element.
2656    * @param aname The name of the attribute.
2657    * @return The default value, or null if the attribute was
2658    * #IMPLIED or simply undeclared and unspecified.
2659    * @see #getAttributeExpandedValue
2660    */

2661  public String JavaDoc getAttributeDefaultValue (String JavaDoc name, String JavaDoc aname)
2662  {
2663    Object JavaDoc attribute[] = getAttribute(name, aname);
2664    if (attribute == null) {
2665      return null;
2666    } else {
2667      return (String JavaDoc)attribute[1];
2668    }
2669  }
2670
2671
2672  /**
2673    * Retrieve the expanded value of a declared attribute.
2674    * <p>All general entities will be expanded.
2675    * @param name The name of the associated element.
2676    * @param aname The name of the attribute.
2677    * @return The expanded default value, or null if the attribute was
2678    * #IMPLIED or simply undeclared
2679    * @see #getAttributeDefaultValue
2680    */

2681  public String JavaDoc getAttributeExpandedValue (String JavaDoc name, String JavaDoc aname)
2682  {
2683    Object JavaDoc attribute[] = getAttribute(name, aname);
2684    if (attribute == null) {
2685      return null;
2686    } else if (attribute[4] == null && attribute[1] != null) {
2687      try {
2688    pushString(null, (char)0 + (String JavaDoc)attribute[1] + (char)0);
2689    attribute[4] = readLiteral(LIT_NORMALIZE |
2690                   LIT_CHAR_REF |
2691                   LIT_ENTITY_REF);
2692      } catch (Exception JavaDoc e) {}
2693    }
2694    return (String JavaDoc)attribute[4];
2695  }
2696
2697
2698  /**
2699    * Retrieve the default value type of a declared attribute.
2700    * @see #ATTRIBUTE_DEFAULT_SPECIFIED
2701    * @see #ATTRIBUTE_DEFAULT_IMPLIED
2702    * @see #ATTRIBUTE_DEFAULT_REQUIRED
2703    * @see #ATTRIBUTE_DEFAULT_FIXED
2704    */

2705  public int getAttributeDefaultValueType (String JavaDoc name, String JavaDoc aname)
2706  {
2707    Object JavaDoc attribute[] = getAttribute(name, aname);
2708    if (attribute == null) {
2709      return ATTRIBUTE_DEFAULT_UNDECLARED;
2710    } else {
2711      return ((Integer JavaDoc)attribute[2]).intValue();
2712    }
2713  }
2714
2715
2716  /**
2717    * Register an attribute declaration for later retrieval.
2718    * Format:
2719    * - String type
2720    * - String default value
2721    * - int value type
2722    * *TODO: do something with attribute types.
2723    */

2724  void setAttribute (String JavaDoc elName, String JavaDoc name, int type, String JavaDoc enumeration,
2725             String JavaDoc value, int valueType)
2726    throws java.lang.Exception JavaDoc
2727  {
2728    Hashtable JavaDoc attlist;
2729    Object JavaDoc attribute[];
2730
2731                // Create a new hashtable if necessary.
2732
attlist = getElementAttributes(elName);
2733    if (attlist == null) {
2734      attlist = new Hashtable JavaDoc();
2735    }
2736
2737                // Check that the attribute doesn't
2738
// already exist!
2739
if (attlist.get(name) != null) {
2740      return;
2741    } else {
2742      attribute = new Object JavaDoc[5];
2743      attribute[0] = new Integer JavaDoc(type);
2744      attribute[1] = value;
2745      attribute[2] = new Integer JavaDoc(valueType);
2746      attribute[3] = enumeration;
2747      attribute[4] = null;
2748      attlist.put(name.intern(), attribute);
2749
2750                // Use CONTENT_UNDECLARED to avoid overwriting
2751
// existing element declaration.
2752
setElement(elName,CONTENT_UNDECLARED, null, attlist);
2753    }
2754  }
2755
2756
2757  /**
2758    * Retrieve the three-member array representing an
2759    * attribute declaration.
2760    */

2761  Object JavaDoc[] getAttribute (String JavaDoc elName, String JavaDoc name)
2762  {
2763    Hashtable JavaDoc attlist;
2764    Object JavaDoc attribute[];
2765
2766    attlist = getElementAttributes(elName);
2767    if (attlist == null) {
2768      return null;
2769    }
2770
2771    attribute = (Object JavaDoc[])attlist.get(name);
2772    return attribute;
2773  }
2774
2775
2776  //
2777
// Entities
2778
//
2779

2780  /**
2781    * Get declared entities.
2782    * @return An Enumeration of all the entities declared for
2783    * this XML document. The results will be valid only
2784    * after the DTD (if any) has been parsed.
2785    * @see #getEntityType
2786    * @see #getEntityPublicId
2787    * @see #getEntitySystemId
2788    * @see #getEntityValue
2789    * @see #getEntityNotationName
2790    */

2791  public Enumeration JavaDoc declaredEntities ()
2792  {
2793    return entityInfo.keys();
2794  }
2795
2796
2797  /**
2798    * Find the type of an entity.
2799    * @returns An integer constant representing the entity type.
2800    * @see #ENTITY_UNDECLARED
2801    * @see #ENTITY_INTERNAL
2802    * @see #ENTITY_NDATA
2803    * @see #ENTITY_TEXT
2804    */

2805  public int getEntityType (String JavaDoc ename)
2806  {
2807    Object JavaDoc entity[] = (Object JavaDoc[])entityInfo.get(ename);
2808    if (entity == null) {
2809      return ENTITY_UNDECLARED;
2810    } else {
2811      return ((Integer JavaDoc)entity[0]).intValue();
2812    }
2813  }
2814
2815
2816  /**
2817    * Return an external entity's public identifier, if any.
2818    * @param ename The name of the external entity.
2819    * @return The entity's system identifier, or null if the
2820    * entity was not declared, if it is not an
2821    * external entity, or if no public identifier was
2822    * provided.
2823    * @see #getEntityType
2824    */

2825  public String JavaDoc getEntityPublicId (String JavaDoc ename)
2826  {
2827    Object JavaDoc entity[] = (Object JavaDoc[])entityInfo.get(ename);
2828    if (entity == null) {
2829      return null;
2830    } else {
2831      return (String JavaDoc)entity[1];
2832    }
2833  }
2834
2835
2836  /**
2837    * Return an external entity's system identifier.
2838    * @param ename The name of the external entity.
2839    * @return The entity's system identifier, or null if the
2840    * entity was not declared, or if it is not an
2841    * external entity.
2842    * @see #getEntityType
2843    */

2844  public String JavaDoc getEntitySystemId (String JavaDoc ename)
2845  {
2846    Object JavaDoc entity[] = (Object JavaDoc[])entityInfo.get(ename);
2847    if (entity == null) {
2848      return null;
2849    } else {
2850      return (String JavaDoc)entity[2];
2851    }
2852  }
2853
2854
2855  /**
2856    * Return the value of an internal entity.
2857    * @param ename The name of the internal entity.
2858    * @return The entity's value, or null if the entity was
2859    * not declared, or if it is not an internal entity.
2860    * @see #getEntityType
2861    */

2862  public String JavaDoc getEntityValue (String JavaDoc ename)
2863  {
2864    Object JavaDoc entity[] = (Object JavaDoc[])entityInfo.get(ename);
2865    if (entity == null) {
2866      return null;
2867    } else {
2868      return (String JavaDoc)entity[3];
2869    }
2870  }
2871
2872
2873  /**
2874    * Get the notation name associated with an NDATA entity.
2875    * @param ename The NDATA entity name.
2876    * @return The associated notation name, or null if the
2877    * entity was not declared, or if it is not an
2878    * NDATA entity.
2879    * @see #getEntityType
2880    */

2881  public String JavaDoc getEntityNotationName (String JavaDoc eName)
2882  {
2883    Object JavaDoc entity[] = (Object JavaDoc[])entityInfo.get(eName);
2884    if (entity == null) {
2885      return null;
2886    } else {
2887      return (String JavaDoc)entity[4];
2888    }
2889  }
2890
2891
2892  /**
2893    * Register an entity declaration for later retrieval.
2894    */

2895  void setInternalEntity (String JavaDoc eName, String JavaDoc value)
2896  {
2897    setEntity(eName, ENTITY_INTERNAL, null, null, value, null);
2898  }
2899
2900
2901  /**
2902    * Register an external data entity.
2903    */

2904  void setExternalDataEntity (String JavaDoc eName, String JavaDoc pubid,
2905                  String JavaDoc sysid, String JavaDoc nName)
2906  {
2907    setEntity(eName, ENTITY_NDATA, pubid, sysid, null, nName);
2908  }
2909
2910
2911  /**
2912    * Register an external text entity.
2913    */

2914  void setExternalTextEntity (String JavaDoc eName, String JavaDoc pubid, String JavaDoc sysid)
2915  {
2916    setEntity(eName, ENTITY_TEXT, pubid, sysid, null, null);
2917  }
2918
2919
2920  /**
2921    * Register an entity declaration for later retrieval.
2922    */

2923  void setEntity (String JavaDoc eName, int eClass,
2924          String JavaDoc pubid, String JavaDoc sysid,
2925          String JavaDoc value, String JavaDoc nName)
2926  {
2927    Object JavaDoc entity[];
2928
2929    if (entityInfo.get(eName) == null) {
2930      entity = new Object JavaDoc[5];
2931      entity[0] = new Integer JavaDoc(eClass);
2932      entity[1] = pubid;
2933      entity[2] = sysid;
2934      entity[3] = value;
2935      entity[4] = nName;
2936
2937      entityInfo.put(eName,entity);
2938    }
2939  }
2940
2941
2942  //
2943
// Notations.
2944
//
2945

2946  /**
2947    * Get declared notations.
2948    * @return An Enumeration of all the notations declared for
2949    * this XML document. The results will be valid only
2950    * after the DTD (if any) has been parsed.
2951    * @see #getNotationPublicId
2952    * @see #getNotationSystemId
2953    */

2954  public Enumeration JavaDoc declaredNotations ()
2955  {
2956    return notationInfo.keys();
2957  }
2958
2959
2960  /**
2961    * Look up the public identifier for a notation.
2962    * You will normally use this method to look up a notation
2963    * that was provided as an attribute value or for an NDATA entity.
2964    * @param nname The name of the notation.
2965    * @return A string containing the public identifier, or null
2966    * if none was provided or if no such notation was
2967    * declared.
2968    * @see #getNotationSystemId
2969    */

2970  public String JavaDoc getNotationPublicId (String JavaDoc nname)
2971  {
2972    Object JavaDoc notation[] = (Object JavaDoc[])notationInfo.get(nname);
2973    if (notation == null) {
2974      return null;
2975    } else {
2976      return (String JavaDoc)notation[0];
2977    }
2978  }
2979
2980
2981  /**
2982    * Look up the system identifier for a notation.
2983    * You will normally use this method to look up a notation
2984    * that was provided as an attribute value or for an NDATA entity.
2985    * @param nname The name of the notation.
2986    * @return A string containing the system identifier, or null
2987    * if no such notation was declared.
2988    * @see #getNotationPublicId
2989    */

2990  public String JavaDoc getNotationSystemId (String JavaDoc nname)
2991  {
2992    Object JavaDoc notation[] = (Object JavaDoc[])notationInfo.get(nname);
2993    if (notation == null) {
2994      return null;
2995    } else {
2996      return (String JavaDoc)notation[1];
2997    }
2998  }
2999
3000
3001  /**
3002    * Register a notation declaration for later retrieval.
3003    * Format:
3004    * - public id
3005    * - system id
3006    */

3007  void setNotation (String JavaDoc nname, String JavaDoc pubid, String JavaDoc sysid)
3008    throws java.lang.Exception JavaDoc
3009  {
3010    Object JavaDoc notation[];
3011
3012    if (notationInfo.get(nname) == null) {
3013      notation = new Object JavaDoc[2];
3014      notation[0] = pubid;
3015      notation[1] = sysid;
3016      notationInfo.put(nname,notation);
3017    } else {
3018      error("multiple declarations of notation", nname, null);
3019    }
3020  }
3021
3022
3023  //
3024
// Location.
3025
//
3026

3027
3028  /**
3029    * Return the current line number.
3030    */

3031  public int getLineNumber ()
3032  {
3033    return line;
3034  }
3035
3036
3037  /**
3038    * Return the current column number.
3039    */

3040  public int getColumnNumber ()
3041  {
3042    return column;
3043  }
3044
3045
3046
3047  //////////////////////////////////////////////////////////////////////
3048
// High-level I/O.
3049
//////////////////////////////////////////////////////////////////////
3050

3051
3052  /**
3053    * Read a single character from the readBuffer.
3054    * <p>The readDataChunk() method maintains the buffer.
3055    * <p>If we hit the end of an entity, try to pop the stack and
3056    * keep going.
3057    * <p>(This approach doesn't really enforce XML's rules about
3058    * entity boundaries, but this is not currently a validating
3059    * parser).
3060    * <p>This routine also attempts to keep track of the current
3061    * position in external entities, but it's not entirely accurate.
3062    * @return The next available input character.
3063    * @see #unread(char)
3064    * @see #unread(String)
3065    * @see #readDataChunk
3066    * @see #readBuffer
3067    * @see #line
3068    * @return The next character from the current input source.
3069    */

3070  char readCh ()
3071    throws java.lang.Exception JavaDoc
3072    {
3073    char c;
3074
3075    // As long as there's nothing in the
3076
// read buffer, try reading more data
3077
// (for an external entity) or popping
3078
// the entity stack (for either).
3079
while (readBufferPos >= readBufferLength)
3080      {
3081      switch (sourceType)
3082        {
3083        case INPUT_READER:
3084        case INPUT_EXTERNAL:
3085        case INPUT_STREAM:
3086        readDataChunk();
3087        while (readBufferLength < 1)
3088          {
3089          popInput();
3090          if (readBufferLength <1)
3091            {
3092            readDataChunk();
3093            }
3094          }
3095        break;
3096
3097        default:
3098        popInput();
3099        break;
3100        }
3101      }
3102
3103    c = readBuffer[readBufferPos++];
3104
3105    // This is a particularly nasty bit
3106
// of code, that checks for a parameter
3107
// entity reference but peeks ahead to
3108
// catch the '%' in parameter entity
3109
// declarations.
3110
if
3111      (
3112      c == '%' &&
3113      (context == CONTEXT_DTD || context == CONTEXT_ENTITYVALUE)
3114      )
3115      {
3116      char c2 = readCh();
3117      unread(c2);
3118      if (!isWhitespace(c2))
3119        {
3120        parsePEReference(context == CONTEXT_ENTITYVALUE);
3121        return readCh();
3122        }
3123      }
3124
3125    if (c == '\n')
3126      {
3127      line++;
3128      column = 0;
3129      }
3130    else
3131      {
3132      column++;
3133      }
3134
3135    return c;
3136    }
3137
3138
3139  /**
3140    * Push a single character back onto the current input stream.
3141    * <p>This method usually pushes the character back onto
3142    * the readBuffer, while the unread(String) method treats the
3143    * string as a new internal entity.
3144    * <p>I don't think that this would ever be called with
3145    * readBufferPos = 0, because the methods always reads a character
3146    * before unreading it, but just in case, I've added a boundary
3147    * condition.
3148    * @param c The character to push back.
3149    * @see #readCh
3150    * @see #unread(String)
3151    * @see #unread(char[])
3152    * @see #readBuffer
3153    */

3154  void unread (char c)
3155    throws java.lang.Exception JavaDoc
3156    {
3157    // Normal condition.
3158
if (c == '\n')
3159      {
3160      line--;
3161      column = -1;
3162      }
3163    if (readBufferPos > 0)
3164      {
3165      readBuffer[--readBufferPos] = c;
3166      }
3167    else
3168      {
3169      pushString(null, new Character JavaDoc(c).toString());
3170      }
3171    }
3172
3173
3174  /**
3175    * Push a char array back onto the current input stream.
3176    * <p>NOTE: you must <em>never</em> push back characters that you
3177    * haven't actually read: use pushString() instead.
3178    * @see #readCh
3179    * @see #unread(char)
3180    * @see #unread(String)
3181    * @see #readBuffer
3182    * @see #pushString
3183    */

3184  void unread (char ch[], int length)
3185    throws java.lang.Exception JavaDoc
3186    {
3187    for (int i = 0; i < length; i++)
3188        {
3189        if (ch[i] == '\n')
3190          {line--;column = -1;}
3191        }
3192    if (length < readBufferPos)
3193      {readBufferPos -= length;}
3194    else
3195      {
3196      pushCharArray(null, ch, 0, length);
3197      sourceType = INPUT_BUFFER;
3198      }
3199    }
3200
3201
3202  /**
3203    * Push a new external input source.
3204    * <p>The source will be either an external text entity, or the DTD
3205    * external subset.
3206    * <p>TO DO: Right now, this method always attempts to autodetect
3207    * the encoding; in the future, it should allow the caller to
3208    * request an encoding explicitly, and it should also look at the
3209    * headers with an HTTP connection.
3210    * @param url The java.net.URL object for the entity.
3211    * @see XmlHandler#resolveEntity
3212    * @see #pushString
3213    * @see #sourceType
3214    * @see #pushInput
3215    * @see #detectEncoding
3216    * @see #sourceType
3217    * @see #readBuffer
3218    */

3219  void pushURL (String JavaDoc ename, String JavaDoc publicId, String JavaDoc systemId,
3220        Reader JavaDoc reader, InputStream JavaDoc stream, String JavaDoc encoding)
3221    throws java.lang.Exception JavaDoc
3222  {
3223    URL JavaDoc url;
3224    boolean ignoreEncoding = false;
3225
3226                // Push the existing status.
3227
pushInput(ename);
3228
3229                // Create a new read buffer.
3230
// (Note the four-character margin)
3231
readBuffer = new char[READ_BUFFER_MAX+4];
3232    readBufferPos = 0;
3233    readBufferLength = 0;
3234    readBufferOverflow = -1;
3235    is = null;
3236    line = 1;
3237
3238    currentByteCount = 0;
3239
3240                // Flush any remaining data.
3241
dataBufferFlush();
3242
3243                // Make the URL absolute.
3244
if (systemId != null && externalEntity != null) {
3245      systemId = new URL JavaDoc(externalEntity.getURL(), systemId).toString();
3246    } else if (baseURI != null) {
3247      try {
3248    systemId = new URL JavaDoc(new URL JavaDoc(baseURI), systemId).toString();
3249      } catch (Exception JavaDoc e) {}
3250    }
3251
3252                // See if the application wants to
3253
// redirect the system ID and/or
3254
// supply its own character stream.
3255
if (systemId != null && handler != null) {
3256      Object JavaDoc input = handler.resolveEntity(publicId, systemId);
3257      if (input != null) {
3258    if (input instanceof String JavaDoc) {
3259      systemId = (String JavaDoc)input;
3260    } else if (input instanceof InputStream JavaDoc) {
3261      stream = (InputStream JavaDoc)input;
3262    } else if (input instanceof Reader JavaDoc) {
3263      reader = (Reader JavaDoc)input;
3264    }
3265      }
3266    }
3267
3268                // Start the entity.
3269
if (handler != null) {
3270      if (systemId != null) {
3271    handler.startExternalEntity(systemId);
3272      } else {
3273    handler.startExternalEntity("[external stream]");
3274      }
3275    }
3276
3277                // Figure out what we're reading from.
3278
if (reader != null) {
3279                // There's an explicit character stream.
3280
sourceType = INPUT_READER;
3281      this.reader = reader;
3282      tryEncodingDecl(true);
3283      return;
3284    } else if (stream != null) {
3285      sourceType = INPUT_STREAM;
3286      is = stream;
3287    } else {
3288                // We have to open our own stream
3289
// to the URL.
3290

3291                // Set the new status
3292
sourceType = INPUT_EXTERNAL;
3293      url = new URL JavaDoc(systemId);
3294
3295      externalEntity = url.openConnection();
3296      externalEntity.connect();
3297      is = externalEntity.getInputStream();
3298    }
3299
3300                // If we get to here, there must be
3301
// an InputStream available.
3302
if (!is.markSupported()) {
3303      is = new BufferedInputStream JavaDoc(is);
3304    }
3305
3306                // Attempt to detect the encoding.
3307
if (encoding == null && externalEntity != null) {
3308      encoding = externalEntity.getContentEncoding();
3309    }
3310
3311    if (encoding != null) {
3312      checkEncoding(encoding, false);
3313      ignoreEncoding = true;
3314    } else {
3315      detectEncoding();
3316      ignoreEncoding = false;
3317    }
3318
3319                // Read an XML or text declaration.
3320
tryEncodingDecl(ignoreEncoding);
3321  }
3322
3323
3324  /**
3325    * Check for an encoding declaration.
3326    */

3327  void tryEncodingDecl (boolean ignoreEncoding)
3328    throws java.lang.Exception JavaDoc
3329  {
3330                // Read the XML/Encoding declaration.
3331
if (tryRead("<?xml")) {
3332      if (tryWhitespace()) {
3333    if (inputStack.size() > 0) {
3334      parseTextDecl(ignoreEncoding);
3335    } else {
3336      parseXMLDecl(ignoreEncoding);
3337    }
3338      } else {
3339    unread("xml".toCharArray(), 3);
3340    parsePI();
3341      }
3342    }
3343  }
3344
3345
3346  /**
3347    * Attempt to detect the encoding of an entity.
3348    * <p>The trick here (as suggested in the XML standard) is that
3349    * any entity not in UTF-8, or in UCS-2 with a byte-order mark,
3350    * <b>must</b> begin with an XML declaration or an encoding
3351    * declaration; we simply have to look for "&lt;?XML" in various
3352    * encodings.
3353    * <p>This method has no way to distinguish among 8-bit encodings.
3354    * Instead, it assumes UTF-8, then (possibly) revises its assumption
3355    * later in checkEncoding(). Any ASCII-derived 8-bit encoding
3356    * should work, but most will be rejected later by checkEncoding().
3357    * <p>I don't currently detect EBCDIC, since I'm concerned that it
3358    * could also be a valid UTF-8 sequence; I'll have to do more checking
3359    * later.
3360    * @see #tryEncoding(byte[], byte, byte, byte, byte)
3361    * @see #tryEncoding(byte[], byte, byte)
3362    * @see #checkEncoding
3363    * @see #read8bitEncodingDeclaration
3364    */

3365  void detectEncoding ()
3366    throws java.lang.Exception JavaDoc
3367  {
3368    byte signature[] = new byte[4];
3369
3370                // Read the first four bytes for
3371
// autodetection.
3372
is.mark(4);
3373    is.read(signature);
3374    is.reset();
3375
3376                // Look for a known signature.
3377
if (tryEncoding(signature, (byte)0x00, (byte)0x00,
3378            (byte)0x00, (byte)0x3c)) {
3379      // UCS-4 must begin with "<!XML"
3380
// 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234)
3381
encoding = ENCODING_UCS_4_1234;
3382    } else if (tryEncoding(signature, (byte)0x3c, (byte)0x00,
3383               (byte)0x00, (byte)0x00)) {
3384      // UCS-4 must begin with "<!XML"
3385
// 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321)
3386
encoding = ENCODING_UCS_4_4321;
3387    } else if (tryEncoding(signature, (byte)0x00, (byte)0x00,
3388               (byte)0x3c, (byte)0x00)) {
3389      // UCS-4 must begin with "<!XML"
3390
// 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143)
3391
encoding = ENCODING_UCS_4_2143;
3392    } else if (tryEncoding(signature, (byte)0x00, (byte)0x3c,
3393               (byte)0x00, (byte)0x00)) {
3394      // UCS-4 must begin with "<!XML"
3395
// 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421)
3396
encoding = ENCODING_UCS_4_3412;
3397    } else if (tryEncoding(signature, (byte)0xfe, (byte)0xff)) {
3398      // UCS-2 with a byte-order marker.
3399
// 0xfe 0xff: UCS-2, big-endian (12)
3400
encoding = ENCODING_UCS_2_12;
3401      is.read(); is.read();
3402    } else if (tryEncoding(signature, (byte)0xff, (byte)0xfe)) {
3403      // UCS-2 with a byte-order marker.
3404
// 0xff 0xfe: UCS-2, little-endian (21)
3405
encoding = ENCODING_UCS_2_21;
3406      is.read(); is.read();
3407    } else if (tryEncoding(signature, (byte)0x00, (byte)0x3c,
3408               (byte)0x00, (byte)0x3f)) {
3409      // UCS-2 without a BOM must begin with "<?XML"
3410
// 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark
3411
encoding = ENCODING_UCS_2_12;
3412      error("no byte-order mark for UCS-2 entity", null, null);
3413    } else if (tryEncoding(signature, (byte)0x3c, (byte)0x00,
3414               (byte)0x3f, (byte)0x00)) {
3415      // UCS-2 without a BOM must begin with "<?XML"
3416
// 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark
3417
encoding = ENCODING_UCS_2_21;
3418      error("no byte-order mark for UCS-2 entity", null, null);
3419    } else if (tryEncoding(signature, (byte)0x3c, (byte)0x3f,
3420               (byte)0x78, (byte)0x6d)) {
3421      // Some kind of 8-bit encoding with "<?XML"
3422
// 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING)
3423
encoding = ENCODING_UTF_8;
3424      read8bitEncodingDeclaration();
3425    } else {
3426      // Some kind of 8-bit encoding without "<?XML"
3427
// (otherwise) UTF-8 without encoding/XML declaration
3428
encoding = ENCODING_UTF_8;
3429    }
3430  }
3431
3432
3433  /**
3434    * Check for a four-byte signature.
3435    * <p>Utility routine for detectEncoding().
3436    * <p>Always looks for some part of "<?XML" in a specific encoding.
3437    * @param sig The first four bytes read.
3438    * @param b1 The first byte of the signature
3439    * @param b2 The second byte of the signature
3440    * @param b3 The third byte of the signature
3441    * @param b4 The fourth byte of the signature
3442    * @see #detectEncoding
3443    */

3444  boolean tryEncoding (byte sig[], byte b1, byte b2, byte b3, byte b4)
3445  {
3446    return (sig[0] == b1 && sig[1] == b2 && sig[2] == b3 && sig[3] == b4);
3447  }
3448
3449
3450  /**
3451    * Check for a two-byte signature.
3452    * <p>Looks for a UCS-2 byte-order mark.
3453    * <p>Utility routine for detectEncoding().
3454    * @param sig The first four bytes read.
3455    * @param b1 The first byte of the signature
3456    * @param b2 The second byte of the signature
3457    * @see #detectEncoding
3458    */

3459  boolean tryEncoding (byte sig[], byte b1, byte b2)
3460  {
3461    return ((sig[0] == b1) && (sig[1] == b2));
3462  }
3463
3464
3465  /**
3466    * This method pushes a string back onto input.
3467    * <p>It is useful either as the expansion of an internal entity,
3468    * or for backtracking during the parse.
3469    * <p>Call pushCharArray() to do the actual work.
3470    * @param s The string to push back onto input.
3471    * @see #pushCharArray
3472    */

3473  void pushString (String JavaDoc ename, String JavaDoc s)
3474    throws java.lang.Exception JavaDoc
3475  {
3476    char ch[] = s.toCharArray();
3477    pushCharArray(ename, ch, 0, ch.length);
3478  }
3479
3480
3481  /**
3482    * Push a new internal input source.
3483    * <p>This method is useful for expanding an internal entity,
3484    * or for unreading a string of characters. It creates a new
3485    * readBuffer containing the characters in the array, instead
3486    * of characters converted from an input byte stream.
3487    * <p>I've added a couple of optimisations: don't push zero-
3488    * length strings, and just push back a single character
3489    * for 1-character strings; this should save some time and memory.
3490    * @param ch The char array to push.
3491    * @see #pushString
3492    * @see #pushURL
3493    * @see #readBuffer
3494    * @see #sourceType
3495    * @see #pushInput
3496    */

3497  void pushCharArray (String JavaDoc ename, char ch[], int start, int length)
3498    throws java.lang.Exception JavaDoc
3499  {
3500                // Push the existing status
3501
pushInput(ename);
3502    sourceType = INPUT_INTERNAL;
3503    readBuffer = ch;
3504    readBufferPos = start;
3505    readBufferLength = length;
3506    readBufferOverflow = -1;
3507  }
3508
3509
3510  /**
3511    * Save the current input source onto the stack.
3512    * <p>This method saves all of the global variables associated with
3513    * the current input source, so that they can be restored when a new
3514    * input source has finished. It also tests for entity recursion.
3515    * <p>The method saves the following global variables onto a stack
3516    * using a fixed-length array:
3517    * <ol>
3518    * <li>sourceType
3519    * <li>externalEntity
3520    * <li>readBuffer
3521    * <li>readBufferPos
3522    * <li>readBufferLength
3523    * <li>line
3524    * <li>encoding
3525    * </ol>
3526    * @param ename The name of the entity (if any) causing the new input.
3527    * @see #popInput
3528    * @see #sourceType
3529    * @see #externalEntity
3530    * @see #readBuffer
3531    * @see #readBufferPos
3532    * @see #readBufferLength
3533    * @see #line
3534    * @see #encoding
3535    */

3536  void pushInput (String JavaDoc ename)
3537    throws java.lang.Exception JavaDoc
3538  {
3539    Object JavaDoc input[] = new Object JavaDoc[12];
3540
3541                // Check for entity recursion.
3542
if (ename != null) {
3543      Enumeration JavaDoc entities = entityStack.elements();
3544      while (entities.hasMoreElements()) {
3545    String JavaDoc e = (String JavaDoc)entities.nextElement();
3546    if (e == ename) {
3547      error("recursive reference to entity", ename, null);
3548    }
3549      }
3550    }
3551    entityStack.push(ename);
3552
3553                // Don't bother if there is no input.
3554
if (sourceType == INPUT_NONE) {
3555      return;
3556    }
3557
3558                // Set up a snapshot of the current
3559
// input source.
3560
input[0] = new Integer JavaDoc(sourceType);
3561    input[1] = externalEntity;
3562    input[2] = readBuffer;
3563    input[3] = new Integer JavaDoc(readBufferPos);
3564    input[4] = new Integer JavaDoc(readBufferLength);
3565    input[5] = new Integer JavaDoc(line);
3566    input[6] = new Integer JavaDoc(encoding);
3567    input[7] = new Integer JavaDoc(readBufferOverflow);
3568    input[8] = is;
3569    input[9] = new Integer JavaDoc(currentByteCount);
3570    input[10] = new Integer JavaDoc(column);
3571    input[11] = reader;
3572
3573                // Push it onto the stack.
3574
inputStack.push(input);
3575  }
3576
3577
3578  /**
3579    * Restore a previous input source.
3580    * <p>This method restores all of the global variables associated with
3581    * the current input source.
3582    * @exception java.io.EOFException
3583    * If there are no more entries on the input stack.
3584    * @see #pushInput
3585    * @see #sourceType
3586    * @see #externalEntity
3587    * @see #readBuffer
3588    * @see #readBufferPos
3589    * @see #readBufferLength
3590    * @see #line
3591    * @see #encoding
3592    */

3593  void popInput ()
3594    throws java.lang.Exception JavaDoc
3595  {
3596    Object JavaDoc input[];
3597
3598
3599    switch (sourceType) {
3600
3601    case INPUT_EXTERNAL:
3602      dataBufferFlush();
3603      if (handler != null && externalEntity != null) {
3604    handler.endExternalEntity(externalEntity.getURL().toString());
3605      }
3606      break;
3607    case INPUT_STREAM:
3608      dataBufferFlush();
3609      if (baseURI != null) {
3610    if (handler != null) {
3611      handler.endExternalEntity(baseURI);
3612    }
3613      }
3614      break;
3615    case INPUT_READER:
3616      dataBufferFlush();
3617      if (baseURI != null) {
3618    if (handler != null) {
3619      handler.endExternalEntity(baseURI);
3620    }
3621      }
3622      break;
3623    }
3624
3625                // Throw an EOFException if there
3626
// is nothing else to pop.
3627
if (inputStack.isEmpty()) {
3628      throw new EOFException JavaDoc();
3629    } else {
3630      String JavaDoc s;
3631      input = (Object JavaDoc[])inputStack.pop();
3632      s = (String JavaDoc)entityStack.pop();
3633    }
3634
3635    sourceType = ((Integer JavaDoc)input[0]).intValue();
3636    externalEntity = (URLConnection JavaDoc)input[1];
3637    readBuffer = (char[])input[2];
3638    readBufferPos = ((Integer JavaDoc)input[3]).intValue();
3639    readBufferLength = ((Integer JavaDoc)input[4]).intValue();
3640    line = ((Integer JavaDoc)input[5]).intValue();
3641    encoding = ((Integer JavaDoc)input[6]).intValue();
3642    readBufferOverflow = ((Integer JavaDoc)input[7]).intValue();
3643    is = (InputStream JavaDoc)input[8];
3644    currentByteCount = ((Integer JavaDoc)input[9]).intValue();
3645    column = ((Integer JavaDoc)input[10]).intValue();
3646    reader = (Reader JavaDoc)input[11];
3647  }
3648
3649
3650  /**
3651    * Return true if we can read the expected character.
3652    * <p>Note that the character will be removed from the input stream
3653    * on success, but will be put back on failure. Do not attempt to
3654    * read the character again if the method succeeds.
3655    * @param delim The character that should appear next. For a
3656    * insensitive match, you must supply this in upper-case.
3657    * @return true if the character was successfully read, or false if
3658    * it was not.
3659    * @see #tryRead(String)
3660    */

3661  boolean tryRead (char delim)
3662    throws java.lang.Exception JavaDoc
3663  {
3664    char c;
3665
3666                // Read the character
3667
c = readCh();
3668
3669                // Test for a match, and push the character
3670
// back if the match fails.
3671
if (c == delim) {
3672      return true;
3673    } else {
3674      unread(c);
3675      return false;
3676    }
3677  }
3678
3679
3680  /**
3681    * Return true if we can read the expected string.
3682    * <p>This is simply a convenience method.
3683    * <p>Note that the string will be removed from the input stream
3684    * on success, but will be put back on failure. Do not attempt to
3685    * read the string again if the method succeeds.
3686    * <p>This method will push back a character rather than an
3687    * array whenever possible (probably the majority of cases).
3688    * <p><b>NOTE:</b> This method currently has a hard-coded limit
3689    * of 100 characters for the delimiter.
3690    * @param delim The string that should appear next.
3691    * @return true if the string was successfully read, or false if
3692    * it was not.
3693    * @see #tryRead(char)
3694    */

3695  boolean tryRead (String JavaDoc delim)
3696    throws java.lang.Exception JavaDoc
3697    {
3698    char ch[] = delim.toCharArray();
3699    char c;
3700
3701    // Compare the input, character-
3702
// by character.
3703

3704    for (int i = 0; i < ch.length; i++)
3705      {
3706      c=readCh();
3707      if (c!=ch[i])
3708        {
3709        unread(c);
3710        if (i!=0)
3711          {unread(ch,i);}
3712        return false;
3713        }
3714      }
3715    return true;
3716    }
3717
3718
3719
3720  /**
3721    * Return true if we can read some whitespace.
3722    * <p>This is simply a convenience method.
3723    * <p>This method will push back a character rather than an
3724    * array whenever possible (probably the majority of cases).
3725    * @return true if whitespace was found.
3726    */

3727  boolean tryWhitespace ()
3728    throws java.lang.Exception JavaDoc
3729  {
3730    char c;
3731    c = readCh();
3732    if (isWhitespace(c)) {
3733      skipWhitespace();
3734      return true;
3735    } else {
3736      unread(c);
3737      return false;
3738    }
3739  }
3740
3741
3742  /**
3743    * Read all data until we find the specified string.
3744    * <p>This is especially useful for scanning marked sections.
3745    * <p>This is a a little inefficient right now, since it calls tryRead()
3746    * for every character.
3747    * @param delim The string delimiter
3748    * @see #tryRead(String, boolean)
3749    * @see #readCh
3750    */

3751  void parseUntil (String JavaDoc delim)
3752    throws java.lang.Exception JavaDoc
3753  {
3754    char c;
3755    int startLine = line;
3756
3757    try {
3758      while (!tryRead(delim)) {
3759    c = readCh();
3760    dataBufferAppend(c);
3761      }
3762    } catch (EOFException JavaDoc e) {
3763      error("end of input while looking for delimiter (started on line " +
3764        startLine + ')', null, delim);
3765    }
3766  }
3767
3768
3769  /**
3770    * Skip all data until we find the specified string.
3771    * <p>This is especially useful for scanning comments.
3772    * <p>This is a a little inefficient right now, since it calls tryRead()
3773    * for every character.
3774    * @param delim The string delimiter
3775    * @see #tryRead(String, boolean)
3776    * @see #readCh
3777    */

3778  void skipUntil (String JavaDoc delim)
3779    throws java.lang.Exception JavaDoc
3780  {
3781    while (!tryRead(delim)) {
3782      readCh();
3783    }
3784  }
3785
3786
3787  /**
3788    * Read just the encoding declaration (or XML declaration) at the
3789    * start of an external entity.
3790    * When this method is called, we know that the declaration is
3791    * present (or appears to be). We also know that the entity is
3792    * in some sort of ASCII-derived 8-bit encoding.
3793    * The idea of this is to let us read what the 8-bit encoding is
3794    * before we've committed to converting any more of the file; the
3795    * XML or encoding declaration must be in 7-bit ASCII, so we're
3796    * safe as long as we don't go past it.
3797    */

3798  void read8bitEncodingDeclaration ()
3799    throws java.lang.Exception JavaDoc
3800  {
3801    int ch;
3802    readBufferPos = readBufferLength = 0;
3803
3804    while (true) {
3805      ch = is.read();
3806      readBuffer[readBufferLength++] = (char)ch;
3807      switch (ch) {
3808      case (int)'>':
3809    return;
3810      case -1:
3811    error("end of file before end of XML or encoding declaration.",
3812          null, "?>");
3813    return;
3814      }
3815      if (readBuffer.length == readBufferLength) {
3816    error("unfinished XML or encoding declaration", null, null);
3817      }
3818    }
3819  }
3820
3821
3822
  //////////////////////////////////////////////////////////////////////
  // Low-level I/O.
  //////////////////////////////////////////////////////////////////////

3828  /**
3829    * Read a chunk of data from an external input source.
3830    * <p>This is simply a front-end that fills the rawReadBuffer
3831    * with bytes, then calls the appropriate encoding handler.
3832    * @see #encoding
3833    * @see #rawReadBuffer
3834    * @see #readBuffer
3835    * @see #filterCR
3836    * @see #copyUtf8ReadBuffer
3837    * @see #copyIso8859_1ReadBuffer
3838    * @see #copyUcs_2ReadBuffer
3839    * @see #copyUcs_4ReadBuffer
3840    */

3841  void readDataChunk ()
3842    throws java.lang.Exception JavaDoc
3843    {
3844    int count, i, j;
3845
3846    // See if we have any overflow.
3847
if (readBufferOverflow > -1)
3848      {
3849      readBuffer[0] = (char)readBufferOverflow;
3850      readBufferOverflow = -1;
3851      readBufferPos = 1;
3852      sawCR = true;
3853      }
3854    else
3855      {
3856      readBufferPos = 0;
3857      sawCR = false;
3858      }
3859
3860    // Special situation -- we're taking
3861
// input from a character stream.
3862
if (sourceType == INPUT_READER)
3863      {
3864      count = reader.read(readBuffer, readBufferPos, READ_BUFFER_MAX-1);
3865      if (count < 0)
3866        {readBufferLength = -1;}
3867      else
3868        {
3869        readBufferLength = readBufferPos+count;
3870        filterCR();
3871        sawCR = false;
3872        }
3873      return;
3874      }
3875
3876    // Read as many bytes as possible
3877
// into the read buffer.
3878
count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX);
3879
3880    // Dispatch to an encoding-specific
3881
// reader method to populate the
3882
// readBuffer.
3883
switch (encoding)
3884      {
3885      case ENCODING_UTF_8:
3886      copyUtf8ReadBuffer(count);
3887      break;
3888      
3889      case ENCODING_ISO_8859_1:
3890      copyIso8859_1ReadBuffer(count);
3891      break;
3892      
3893      case ENCODING_UCS_2_12:
3894      copyUcs2ReadBuffer(count, 8, 0);
3895      break;
3896      
3897      case ENCODING_UCS_2_21:
3898      copyUcs2ReadBuffer(count, 0, 8);
3899      break;
3900      
3901      case ENCODING_UCS_4_1234:
3902      copyUcs4ReadBuffer(count, 24, 16, 8, 0);
3903      break;
3904      
3905      case ENCODING_UCS_4_4321:
3906      copyUcs4ReadBuffer(count, 0, 8, 16, 24);
3907      break;
3908      
3909      case ENCODING_UCS_4_2143:
3910      copyUcs4ReadBuffer(count, 16, 24, 0, 8);
3911      break;
3912      
3913      case ENCODING_UCS_4_3412:
3914      copyUcs4ReadBuffer(count, 8, 0, 24, 16);
3915      break;
3916      }
3917
3918    // Filter out all carriage returns
3919
// if we've seen any.
3920
if (sawCR)
3921      {
3922      filterCR();
3923      sawCR = false;
3924      }
3925    
3926    // Reset the position.
3927
readBufferPos = 0;
3928    currentByteCount += count;
3929    }
3930
3931
3932  /**
3933    * Filter carriage returns in the read buffer.
3934    * <p>CRLF becomes LF; CR becomes LF.
3935    * @see #readDataChunk
3936    * @see #readBuffer
3937    * @see #readBufferOverflow
3938    */

3939  void filterCR ()
3940    {
3941    int i, j;
3942    
3943    readBufferOverflow = -1;
3944    
3945    loop: for (i = 0, j = 0; j < readBufferLength; i++, j++)
3946      {
3947      switch (readBuffer[j])
3948        {
3949        case '\r':
3950        if (j == readBufferLength - 1)
3951          {
3952          readBufferOverflow = '\r';
3953          readBufferLength--;
3954          break loop;
3955          }
3956        else if (readBuffer[j+1] == '\n')
3957          {j++;}
3958        readBuffer[i] = '\n';
3959        break;
3960        
3961        case '\n':
3962        default:
3963        readBuffer[i] = readBuffer[j];
3964        break;
3965        }
3966      }
3967    readBufferLength = i;
3968    }
3969
3970
3971  /**
3972    * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
3973    * <p>When readDataChunk() calls this method, the raw bytes are in
3974    * rawReadBuffer, and the final characters will appear in
3975    * readBuffer.
3976    * <p>The tricky part of this is dealing with UTF-8 multi-byte
3977    * sequences, but it doesn't seem to slow things down too much.
3978    * @param count The number of bytes to convert.
3979    * @see #readDataChunk
3980    * @see #rawReadBuffer
3981    * @see #readBuffer
3982    * @see #getNextUtf8Byte
3983    */

3984  void copyUtf8ReadBuffer (int count)
3985    throws java.lang.Exception JavaDoc
3986  {
3987    int i = 0;
3988    int j = readBufferPos;
3989    int b1;
3990    boolean isSurrogate = false;
3991    while (i < count) {
3992      b1 = rawReadBuffer[i++];
3993      isSurrogate = false;
3994
3995                // Determine whether we are dealing
3996
// with a one-, two-, three-, or four-
3997
// byte sequence.
3998
if ((b1 & 0x80) == 0) {
3999    // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
4000
readBuffer[j++] = (char)b1;
4001      } else if ((b1 & 0xe0) == 0xc0) {
4002    // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
4003
readBuffer[j++] =
4004      (char)(((b1 & 0x1f) << 6) |
4005         getNextUtf8Byte(i++, count));
4006      } else if ((b1 & 0xf0) == 0xe0) {
4007    // 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
4008
readBuffer[j++] =
4009      (char)(((b1 & 0x0f) << 12) |
4010         (getNextUtf8Byte(i++, count) << 6) |
4011         getNextUtf8Byte(i++, count));
4012      } else if ((b1 & 0xf8) == 0xf0) {
4013    // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
4014
// = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
4015
// (uuuuu = wwww + 1)
4016
isSurrogate = true;
4017    int b2 = getNextUtf8Byte(i++, count);
4018    int b3 = getNextUtf8Byte(i++, count);
4019    int b4 = getNextUtf8Byte(i++, count);
4020    readBuffer[j++] =
4021      (char)(0xd800 |
4022         ((((b1 & 0x07) << 2) | ((b2 & 0x30) >> 4) - 1) << 6) |
4023         ((b2 & 0x0f) << 2) |
4024         ((b3 & 0x30) >> 4));
4025    readBuffer[j++] =
4026      (char)(0xdc |
4027         ((b3 & 0x0f) << 6) |
4028         b4);
4029                // TODO: test that surrogate value is legal.
4030
} else {
4031    // Otherwise, the 8th bit may not be set in UTF-8
4032
encodingError("bad start for UTF-8 multi-byte sequence", b1, i);
4033      }
4034      if (readBuffer[j-1] == '\r') {
4035    sawCR = true;
4036      }
4037    }
4038                // How many characters have we read?
4039
readBufferLength = j;
4040  }
4041
4042
4043  /**
4044    * Return the next byte value in a UTF-8 sequence.
4045    * If it is not possible to get a byte from the current
4046    * entity, throw an exception.
4047    * @param pos The current position in the rawReadBuffer.
4048    * @param count The number of bytes in the rawReadBuffer
4049    * @return The significant six bits of a non-initial byte in
4050    * a UTF-8 sequence.
4051    * @exception EOFException If the sequence is incomplete.
4052    */

4053  int getNextUtf8Byte (int pos, int count)
4054    throws java.lang.Exception JavaDoc
4055  {
4056    int val;
4057
4058                // Take a character from the buffer
4059
// or from the actual input stream.
4060
if (pos < count) {
4061      val = rawReadBuffer[pos];
4062    } else {
4063      val = is.read();
4064      if (val == -1) {
4065    encodingError("unfinished multi-byte UTF-8 sequence at EOF", -1, pos);
4066      }
4067    }
4068
4069                // Check for the correct bits at the
4070
// start.
4071
if ((val & 0xc0) != 0x80) {
4072      encodingError("bad continuation of multi-byte UTF-8 sequence", val,
4073            pos + 1);
4074    }
4075
4076                // Return the significant bits.
4077
return (val & 0x3f);
4078  }
4079
4080
4081  /**
4082    * Convert a buffer of ISO-8859-1-encoded bytes into UTF-16 characters.
4083    * <p>When readDataChunk() calls this method, the raw bytes are in
4084    * rawReadBuffer, and the final characters will appear in
4085    * readBuffer.
4086    * <p>This is a direct conversion, with no tricks.
4087    * @param count The number of bytes to convert.
4088    * @see #readDataChunk
4089    * @see #rawReadBuffer
4090    * @see #readBuffer
4091    */

4092  void copyIso8859_1ReadBuffer (int count)
4093  {
4094    int i, j;
4095    for (i = 0, j = readBufferPos; i < count; i++, j++) {
4096      readBuffer[j] = (char)(rawReadBuffer[i] & 0xff);
4097      if (readBuffer[j] == '\r') {
4098    sawCR = true;
4099      }
4100    }
4101    readBufferLength = j;
4102  }
4103
4104
4105  /**
4106    * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters.
4107    * <p>When readDataChunk() calls this method, the raw bytes are in
4108    * rawReadBuffer, and the final characters will appear in
4109    * readBuffer.
4110    * @param count The number of bytes to convert.
4111    * @param shift1 The number of bits to shift byte 1.
4112    * @param shift2 The number of bits to shift byte 2
4113    * @see #readDataChunk
4114    * @see #rawReadBuffer
4115    * @see #readBuffer
4116    */

4117  void copyUcs2ReadBuffer (int count, int shift1, int shift2)
4118    throws java.lang.Exception JavaDoc
4119  {
4120    int j = readBufferPos;
4121
4122    if (count > 0 && (count % 2) != 0) {
4123      encodingError("odd number of bytes in UCS-2 encoding", -1, count);
4124    }
4125    for (int i = 0; i < count; i+=2) {
4126      readBuffer[j++] =
4127    (char)(((rawReadBuffer[i] & 0xff) << shift1) |
4128           ((rawReadBuffer[i+1] & 0xff) << shift2));
4129      if (readBuffer[j-1] == '\r') {
4130    sawCR = true;
4131      }
4132    }
4133    readBufferLength = j;
4134  }
4135
4136
4137  /**
4138    * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters.
4139    * <p>When readDataChunk() calls this method, the raw bytes are in
4140    * rawReadBuffer, and the final characters will appear in
4141    * readBuffer.
4142    * <p>Java has 16-bit chars, but this routine will attempt to use
4143    * surrogates to encoding values between 0x00010000 and 0x000fffff.
4144    * @param count The number of bytes to convert.
4145    * @param shift1 The number of bits to shift byte 1.
4146    * @param shift2 The number of bits to shift byte 2
4147    * @param shift3 The number of bits to shift byte 2
4148    * @param shift4 The number of bits to shift byte 2
4149    * @see #readDataChunk
4150    * @see #rawReadBuffer
4151    * @see #readBuffer
4152    */

4153  void copyUcs4ReadBuffer (int count, int shift1, int shift2,
4154               int shift3, int shift4)
4155    throws java.lang.Exception JavaDoc
4156  {
4157    int j = readBufferPos;
4158    int value;
4159
4160    if (count > 0 && (count % 4) != 0) {
4161      encodingError("number of bytes in UCS-4 encoding not divisible by 4",
4162            -1, count);
4163    }
4164    for (int i = 0; i < count; i+=4) {
4165      value = (((rawReadBuffer[i] & 0xff) << shift1) |
4166           ((rawReadBuffer[i+1] & 0xff) << shift2) |
4167           ((rawReadBuffer[i+2] & 0xff) << shift3) |
4168           ((rawReadBuffer[i+3] & 0xff) << shift4));
4169      if (value < 0x0000ffff) {
4170    readBuffer[j++] = (char)value;
4171    if (value == (int)'\r') {
4172      sawCR = true;
4173    }
4174      } else if (value < 0x000fffff) {
4175    readBuffer[j++] = (char)(0xd8 | ((value & 0x000ffc00) >> 10));
4176    readBuffer[j++] = (char)(0xdc | (value & 0x0003ff));
4177      } else {
4178    encodingError("value cannot be represented in UTF-16",
4179              value, i);
4180      }
4181    }
4182    readBufferLength = j;
4183  }
4184
4185
4186  /**
4187    * Report a character encoding error.
4188    */

4189  void encodingError (String JavaDoc message, int value, int offset)
4190    throws java.lang.Exception JavaDoc
4191  {
4192    String JavaDoc uri;
4193
4194    if (value >= 0) {
4195      message = message + " (byte value: 0x" +
4196    Integer.toHexString(value) + ')';
4197    }
4198    if (externalEntity != null) {
4199      uri = externalEntity.getURL().toString();
4200    } else {
4201      uri = baseURI;
4202    }
4203    handler.error(message, uri, -1, offset + currentByteCount);
4204  }
4205
4206
4207
  //////////////////////////////////////////////////////////////////////
  // Local Variables.
  //////////////////////////////////////////////////////////////////////

4212  /**
4213    * Re-initialize the variables for each parse.
4214    */

4215  void initializeVariables ()
4216  {
4217                // No errors; first line
4218
errorCount = 0;
4219    line = 1;
4220    column = 0;
4221
4222                // Set up the buffers for data and names
4223
dataBufferPos = 0;
4224    dataBuffer = new char[DATA_BUFFER_INITIAL];
4225    nameBufferPos = 0;
4226    nameBuffer = new char[NAME_BUFFER_INITIAL];
4227
4228                // Set up the DTD hash tables
4229
elementInfo = new Hashtable JavaDoc();
4230    entityInfo = new Hashtable JavaDoc();
4231    notationInfo = new Hashtable JavaDoc();
4232
4233                // Set up the variables for the current
4234
// element context.
4235
currentElement = null;
4236    currentElementContent = CONTENT_UNDECLARED;
4237
4238                // Set up the input variables
4239
sourceType = INPUT_NONE;
4240    inputStack = new Stack JavaDoc();
4241    entityStack = new Stack JavaDoc();
4242    externalEntity = null;
4243    tagAttributePos = 0;
4244    tagAttributes = new String JavaDoc[100];
4245    rawReadBuffer = new byte[READ_BUFFER_MAX];
4246    readBufferOverflow = -1;
4247
4248    context = CONTEXT_NONE;
4249
4250    symbolTable = new Object JavaDoc[SYMBOL_TABLE_LENGTH];
4251  }
4252
4253
4254  /**
4255    * Clean up after the parse to allow some garbage collection.
4256    * Leave around anything that might be useful for queries.
4257    */

4258  void cleanupVariables ()
4259  {
4260    errorCount = -1;
4261    line = -1;
4262    column = -1;
4263    dataBuffer = null;
4264    nameBuffer = null;
4265    currentElement = null;
4266    currentElementContent = CONTENT_UNDECLARED;
4267    sourceType = INPUT_NONE;
4268    inputStack = null;
4269    externalEntity = null;
4270    entityStack = null;
4271  }
4272
4273  //
4274
// The current XML handler interface.
4275
//
4276
XmlHandler handler;
4277
4278  //
4279
// I/O information.
4280
//
4281
private Reader JavaDoc reader; // current reader
4282
private InputStream JavaDoc is; // current input stream
4283
private int line; // current line number
4284
private int column; // current column number
4285
private int sourceType; // type of input source
4286
private Stack JavaDoc inputStack; // stack of input soruces
4287
private URLConnection JavaDoc externalEntity; // current external entity
4288
private int encoding; // current character encoding.
4289
private int currentByteCount; // how many bytes read from current source.
4290

4291  //
4292
// Maintain a count of errors.
4293
//
4294
private int errorCount;
4295
4296  //
4297
// Buffers for decoded but unparsed character input.
4298
//
4299
private final static int READ_BUFFER_MAX = 16384;
4300  private char readBuffer[];
4301  private int readBufferPos;
4302  private int readBufferLength;
4303  private int readBufferOverflow; // overflow character from last data chunk.
4304

4305
4306  //
4307
// Buffer for undecoded raw byte input.
4308
//
4309
private byte rawReadBuffer[];
4310
4311
4312  //
4313
// Buffer for parsed character data.
4314
//
4315
private static int DATA_BUFFER_INITIAL = 4096;
4316  private char dataBuffer[];
4317  private int dataBufferPos;
4318
4319  //
4320
// Buffer for parsed names.
4321
//
4322
private static int NAME_BUFFER_INITIAL = 1024;
4323  private char nameBuffer[];
4324  private int nameBufferPos;
4325
4326
4327  //
4328
// Hashtables for DTD information on elements, entities, and notations.
4329
//
4330
private Hashtable JavaDoc elementInfo;
4331  private Hashtable JavaDoc entityInfo;
4332  private Hashtable JavaDoc notationInfo;
4333
4334
4335  //
4336
// Element type currently in force.
4337
//
4338
private String JavaDoc currentElement;
4339  private int currentElementContent;
4340
4341  //
4342
// Base external identifiers for resolution.
4343
//
4344
private String JavaDoc basePublicId;
4345  private String JavaDoc baseURI;
4346  private int baseEncoding;
4347  private Reader JavaDoc baseReader;
4348  private InputStream JavaDoc baseInputStream;
4349  private char baseInputBuffer[];
4350  private int baseInputBufferStart;
4351  private int baseInputBufferLength;
4352
4353  //
4354
// Stack of entity names, to help detect recursion.
4355
//
4356
private Stack JavaDoc entityStack;
4357
4358  //
4359
// Are we in a context where PEs are allowed?
4360
//
4361
private int context;
4362
4363  //
4364
// Symbol table, for internalising names.
4365
//
4366
private Object JavaDoc symbolTable[];
4367  private final static int SYMBOL_TABLE_LENGTH = 1087;
4368
4369  //
4370
// Hash table of attributes found in current start tag.
4371
//
4372
private String JavaDoc tagAttributes[];
4373  private int tagAttributePos;
4374
4375  //
4376
// Utility flag: have we noticed a CR while reading the last
4377
// data chunk? If so, we will have to go back and normalise
4378
// CR/LF.
4379
//
4380
private boolean sawCR;
4381}
4382
Popular Tags