HTMLScanner


1   /* 
2    * (C) Copyright 2002-2005, Andy Clark.  All rights reserved.
3    *
4    * This file is distributed under an Apache style license. Please
5    * refer to the LICENSE file for specific details.
6    *
7    * NOTE: The URI fixing code in this source was taken from the Apache
8    *       Xerces parser which is distributed under the Apache license.
9    *       Refer to the LICENSE_apache file for details.
10   */
11  
12  package org.cyberneko.html;
13  
14  import java.io.EOFException  ;
15  import java.io.FileInputStream  ;
16  import java.io.FilterInputStream  ;
17  import java.io.InputStream  ;
18  import java.io.InputStreamReader  ;
19  import java.io.IOException  ;
20  import java.io.PushbackReader  ;
21  import java.io.Reader  ;
22  import java.io.UnsupportedEncodingException  ;
23  import java.lang.reflect.InvocationTargetException  ;
24  import java.lang.reflect.Method  ;
25  import java.net.URL  ;
26  import java.util.Stack  ;
27  
28  import org.apache.xerces.util.EncodingMap;
29  import org.apache.xerces.util.NamespaceSupport;
30  import org.apache.xerces.util.URI;
31  import org.apache.xerces.util.XMLAttributesImpl;
32  import org.apache.xerces.util.XMLResourceIdentifierImpl;
33  import org.apache.xerces.util.XMLStringBuffer;
34  import org.apache.xerces.xni.Augmentations;
35  import org.apache.xerces.xni.NamespaceContext;
36  import org.apache.xerces.xni.QName;
37  import org.apache.xerces.xni.XMLAttributes;
38  import org.apache.xerces.xni.XMLDocumentHandler;
39  import org.apache.xerces.xni.XMLLocator;
40  import org.apache.xerces.xni.XMLResourceIdentifier;
41  import org.apache.xerces.xni.XMLString;
42  import org.apache.xerces.xni.XNIException;
43  import org.apache.xerces.xni.parser.XMLComponentManager;
44  import org.apache.xerces.xni.parser.XMLConfigurationException;
45  import org.apache.xerces.xni.parser.XMLDocumentScanner;
46  import org.apache.xerces.xni.parser.XMLInputSource;
47  
48  /**
49   * A simple HTML scanner. This scanner makes no attempt to balance tags
50   * or fix other problems in the source document &mdash; it just scans what 
51   * it can and generates XNI document "events", ignoring errors of all 
52   * kinds.
53   * <p>
54   * This component recognizes the following features:
55   * <ul>
56   * <li>http://cyberneko.org/html/features/augmentations
57   * <li>http://cyberneko.org/html/features/report-errors
58   * <li>http://apache.org/xml/features/scanner/notify-char-refs
59   * <li>http://apache.org/xml/features/scanner/notify-builtin-refs
60   * <li>http://cyberneko.org/html/features/scanner/notify-builtin-refs
61   * <li>http://cyberneko.org/html/features/scanner/fix-mswindows-refs
62   * <li>http://cyberneko.org/html/features/scanner/script/strip-cdata-delims
63   * <li>http://cyberneko.org/html/features/scanner/script/strip-comment-delims
64   * <li>http://cyberneko.org/html/features/scanner/style/strip-cdata-delims
65   * <li>http://cyberneko.org/html/features/scanner/style/strip-comment-delims
66   * <li>http://cyberneko.org/html/features/scanner/ignore-specified-charset
67   * <li>http://cyberneko.org/html/features/scanner/cdata-sections
68   * <li>http://cyberneko.org/html/features/override-doctype
69   * <li>http://cyberneko.org/html/features/insert-doctype
70   * </ul>
71   * <p>
72   * This component recognizes the following properties:
73   * <ul>
74   * <li>http://cyberneko.org/html/properties/names/elems
75   * <li>http://cyberneko.org/html/properties/names/attrs
76   * <li>http://cyberneko.org/html/properties/default-encoding
77   * <li>http://cyberneko.org/html/properties/error-reporter
78   * <li>http://cyberneko.org/html/properties/doctype/pubid
79   * <li>http://cyberneko.org/html/properties/doctype/sysid
80   * </ul>
81   *
82   * @see HTMLElements
83   * @see HTMLEntities
84   *
85   * @author Andy Clark
86   *
87   * @version $Id: HTMLScanner.java,v 1.19 2005/06/14 05:52:37 andyc Exp $
88   */
89  public class HTMLScanner 
90      implements XMLDocumentScanner, XMLLocator, HTMLComponent {
91  
92      //
93      // Constants
94      //
95  
96      // doctype info: HTML 4.01 strict
97  
98      /** HTML 4.01 strict public identifier ("-//W3C//DTD HTML 4.01//EN"). */
99      public static final String   HTML_4_01_STRICT_PUBID = "-//W3C//DTD HTML 4.01//EN";
100 
101     /** HTML 4.01 strict system identifier ("http://www.w3.org/TR/html4/strict.dtd"). */
102     public static final String   HTML_4_01_STRICT_SYSID = "http://www.w3.org/TR/html4/strict.dtd";
103 
104     // doctype info: HTML 4.01 loose
105 
106     /** HTML 4.01 transitional public identifier ("-//W3C//DTD HTML 4.01 Transitional//EN"). */
107     public static final String   HTML_4_01_TRANSITIONAL_PUBID = "-//W3C//DTD HTML 4.01 Transitional//EN";
108 
109     /** HTML 4.01 transitional system identifier ("http://www.w3.org/TR/html4/loose.dtd"). */
110     public static final String   HTML_4_01_TRANSITIONAL_SYSID = "http://www.w3.org/TR/html4/loose.dtd";
111 
112     // doctype info: HTML 4.01 frameset
113 
114     /** HTML 4.01 frameset public identifier ("-//W3C//DTD HTML 4.01 Frameset//EN"). */
115     public static final String   HTML_4_01_FRAMESET_PUBID = "-//W3C//DTD HTML 4.01 Frameset//EN";
116 
117     /** HTML 4.01 frameset system identifier ("http://www.w3.org/TR/html4/frameset.dtd"). */
118     public static final String   HTML_4_01_FRAMESET_SYSID = "http://www.w3.org/TR/html4/frameset.dtd";
119 
120     // features
121 
122     /** Include infoset augmentations. */
123     protected static final String   AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
124 
125     /** Report errors. */
126     protected static final String   REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";
127 
128     /** Notify character entity references (e.g. &amp;#32;, &amp;#x20;, etc). */
129     public static final String   NOTIFY_CHAR_REFS = "http://apache.org/xml/features/scanner/notify-char-refs";
130 
131     /** 
132      * Notify handler of built-in entity references (e.g. &amp;amp;, 
133      * &amp;lt;, etc).
134      * <p>
135      * <strong>Note:</strong>
136      * This only applies to the five pre-defined XML general entities.
137      * Specifically, "amp", "lt", "gt", "quot", and "apos". This is done 
138      * for compatibility with the Xerces feature.
139      * <p>
140      * To be notified of the built-in entity references in HTML, set the 
141      * <code>http://cyberneko.org/html/features/scanner/notify-builtin-refs</code> 
142      * feature to <code>true</code>.
143      */
144     public static final String   NOTIFY_XML_BUILTIN_REFS = "http://apache.org/xml/features/scanner/notify-builtin-refs";
145 
146     /** 
147      * Notify handler of built-in entity references (e.g. &amp;nbsp;, 
148      * &amp;copy;, etc).
149      * <p>
150      * <strong>Note:</strong>
151      * This <em>includes</em> the five pre-defined XML general entities.
152      */
153     public static final String   NOTIFY_HTML_BUILTIN_REFS = "http://cyberneko.org/html/features/scanner/notify-builtin-refs";
154 
155     /** Fix Microsoft Windows&reg; character entity references. */
156     public static final String   FIX_MSWINDOWS_REFS = "http://cyberneko.org/html/features/scanner/fix-mswindows-refs";
157 
158     /** 
159      * Strip HTML comment delimiters ("&lt;!&minus;&minus;" and 
160      * "&minus;&minus;&gt;") from SCRIPT tag contents.
161      */
162     public static final String   SCRIPT_STRIP_COMMENT_DELIMS = "http://cyberneko.org/html/features/scanner/script/strip-comment-delims";
163 
164     /** 
165      * Strip XHTML CDATA delimiters ("&lt;![CDATA[" and "]]&gt;") from 
166      * SCRIPT tag contents.
167      */
168     public static final String   SCRIPT_STRIP_CDATA_DELIMS = "http://cyberneko.org/html/features/scanner/script/strip-cdata-delims";
169 
170     /** 
171      * Strip HTML comment delimiters ("&lt;!&minus;&minus;" and 
172      * "&minus;&minus;&gt;") from STYLE tag contents.
173      */
174     public static final String   STYLE_STRIP_COMMENT_DELIMS = "http://cyberneko.org/html/features/scanner/style/strip-comment-delims";
175 
176     /** 
177      * Strip XHTML CDATA delimiters ("&lt;![CDATA[" and "]]&gt;") from 
178      * STYLE tag contents.
179      */
180     public static final String   STYLE_STRIP_CDATA_DELIMS = "http://cyberneko.org/html/features/scanner/style/strip-cdata-delims";
181 
182     /**
183      * Ignore specified charset found in the &lt;meta http-equiv='Content-Type'
184      * content='text/html;charset=&hellip;'&gt; tag.
185      */
186     public static final String   IGNORE_SPECIFIED_CHARSET = "http://cyberneko.org/html/features/scanner/ignore-specified-charset";
187 
188     /** Scan CDATA sections. */
189     public static final String   CDATA_SECTIONS = "http://cyberneko.org/html/features/scanner/cdata-sections";
190 
191     /** Override doctype declaration public and system identifiers. */
192     public static final String   OVERRIDE_DOCTYPE = "http://cyberneko.org/html/features/override-doctype";
193 
194     /** Insert document type declaration. */
195     public static final String   INSERT_DOCTYPE = "http://cyberneko.org/html/features/insert-doctype";
196 
197     /** Recognized features. */
198     private static final String  [] RECOGNIZED_FEATURES = {
199         AUGMENTATIONS,
200         REPORT_ERRORS,
201         NOTIFY_CHAR_REFS,
202         NOTIFY_XML_BUILTIN_REFS,
203         NOTIFY_HTML_BUILTIN_REFS,
204         FIX_MSWINDOWS_REFS,
205         SCRIPT_STRIP_CDATA_DELIMS,
206         SCRIPT_STRIP_COMMENT_DELIMS,
207         STYLE_STRIP_CDATA_DELIMS,
208         STYLE_STRIP_COMMENT_DELIMS,
209         IGNORE_SPECIFIED_CHARSET,
210         CDATA_SECTIONS,
211         OVERRIDE_DOCTYPE,
212         INSERT_DOCTYPE,
213     };
214 
215     /** Recognized features defaults; slot i is what getFeatureDefault returns for RECOGNIZED_FEATURES[i]. */
216     private static final Boolean  [] RECOGNIZED_FEATURES_DEFAULTS = {
217         null,          // AUGMENTATIONS (getFeatureDefault returns null for it)
218         null,          // REPORT_ERRORS (getFeatureDefault returns null for it)
219         Boolean.FALSE, // NOTIFY_CHAR_REFS
220         Boolean.FALSE, // NOTIFY_XML_BUILTIN_REFS
221         Boolean.FALSE, // NOTIFY_HTML_BUILTIN_REFS
222         Boolean.FALSE, // FIX_MSWINDOWS_REFS
223         Boolean.FALSE, // SCRIPT_STRIP_CDATA_DELIMS
224         Boolean.FALSE, // SCRIPT_STRIP_COMMENT_DELIMS
225         Boolean.FALSE, // STYLE_STRIP_CDATA_DELIMS
226         Boolean.FALSE, // STYLE_STRIP_COMMENT_DELIMS
227         Boolean.FALSE, // IGNORE_SPECIFIED_CHARSET
228         Boolean.FALSE, // CDATA_SECTIONS
229         Boolean.FALSE, // OVERRIDE_DOCTYPE
230         Boolean.FALSE, // INSERT_DOCTYPE
231     };
232 
233     // properties
234 
235     /** Modify HTML element names: { "upper", "lower", "default" }. */
236     protected static final String   NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";
237 
238     /** Modify HTML attribute names: { "upper", "lower", "default" }. */
239     protected static final String   NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";
240     
241     /** Default encoding. */
242     protected static final String   DEFAULT_ENCODING = "http://cyberneko.org/html/properties/default-encoding";
243     
244     /** Error reporter. */
245     protected static final String   ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";
246 
247     /** Doctype declaration public identifier. */
248     protected static final String   DOCTYPE_PUBID = "http://cyberneko.org/html/properties/doctype/pubid";
249 
250     /** Doctype declaration system identifier. */
251     protected static final String   DOCTYPE_SYSID = "http://cyberneko.org/html/properties/doctype/sysid";
252 
253     /** Recognized properties. */
254     private static final String  [] RECOGNIZED_PROPERTIES = {
255         NAMES_ELEMS,
256         NAMES_ATTRS,
257         DEFAULT_ENCODING,
258         ERROR_REPORTER,
259         DOCTYPE_PUBID,
260         DOCTYPE_SYSID,
261     };
262 
263     /** Recognized properties defaults; slot i is what getPropertyDefault returns for RECOGNIZED_PROPERTIES[i]. */
264     private static final Object  [] RECOGNIZED_PROPERTIES_DEFAULTS = {
265         null,                         // NAMES_ELEMS (getPropertyDefault returns null for it)
266         null,                         // NAMES_ATTRS (getPropertyDefault returns null for it)
267         "Windows-1252",               // DEFAULT_ENCODING
268         null,                         // ERROR_REPORTER (getPropertyDefault returns null for it)
269         HTML_4_01_TRANSITIONAL_PUBID, // DOCTYPE_PUBID
270         HTML_4_01_TRANSITIONAL_SYSID, // DOCTYPE_SYSID
271     };
272 
273     // states
274 
275     /** State: content. */
276     protected static final short STATE_CONTENT = 0;
277 
278     /** State: markup bracket. */
279     protected static final short STATE_MARKUP_BRACKET = 1;
280 
281     /** State: start document. */
282     protected static final short STATE_START_DOCUMENT = 10;
283 
284     /** State: end document. */
285     protected static final short STATE_END_DOCUMENT = 11;
286 
287     // modify HTML names
288 
289     /** Don't modify HTML names. */
290     protected static final short NAMES_NO_CHANGE = 0;
291 
292     /** Uppercase HTML names. */
293     protected static final short NAMES_UPPERCASE = 1;
294 
295     /** Lowercase HTML names. */
296     protected static final short NAMES_LOWERCASE = 2;
297 
298     // defaults
299 
300     /** Default buffer size. */
301     protected static final int DEFAULT_BUFFER_SIZE = 2048;
302 
303     // debugging
304 
305     /** Set to true to debug changes in the scanner. */
306     private static final boolean DEBUG_SCANNER = false;
307 
308     /** Set to true to debug changes in the scanner state. */
309     private static final boolean DEBUG_SCANNER_STATE = false;
310 
311     /** Set to true to debug the buffer. */
312     private static final boolean DEBUG_BUFFER = false;
313 
314     /** Set to true to debug character encoding handling. */
315     private static final boolean DEBUG_CHARSET = false;
316 
317     /** Set to true to debug callbacks. */
318     protected static final boolean DEBUG_CALLBACKS = false;
319 
320     // static vars
321 
322     /** Synthesized event info item. */
323     protected static final HTMLEventInfo SYNTHESIZED_ITEM = 
324         new HTMLEventInfo.SynthesizedItem();
325 
326     //
327     // Data
328     //
329 
330     // features
331 
332     /** Augmentations. */
333     protected boolean fAugmentations;
334 
335     /** Report errors. */
336     protected boolean fReportErrors;
337 
338     /** Notify character entity references. */
339     protected boolean fNotifyCharRefs;
340 
341     /** Notify XML built-in general entity references. */
342     protected boolean fNotifyXmlBuiltinRefs;
343 
344     /** Notify HTML built-in general entity references. */
345     protected boolean fNotifyHtmlBuiltinRefs;
346 
347     /** Fix Microsoft Windows&reg; character entity references. */
348     protected boolean fFixWindowsCharRefs;
349 
350     /** Strip CDATA delimiters from SCRIPT tags. */
351     protected boolean fScriptStripCDATADelims;
352 
353     /** Strip comment delimiters from SCRIPT tags. */
354     protected boolean fScriptStripCommentDelims;
355 
356     /** Strip CDATA delimiters from STYLE tags. */
357     protected boolean fStyleStripCDATADelims;
358 
359     /** Strip comment delimiters from STYLE tags. */
360     protected boolean fStyleStripCommentDelims;
361 
362     /** Ignore specified character set. */
363     protected boolean fIgnoreSpecifiedCharset;
364 
365     /** CDATA sections. */
366     protected boolean fCDATASections;
367 
368     /** Override doctype declaration public and system identifiers. */
369     protected boolean fOverrideDoctype;
370 
371     /** Insert document type declaration. */
372     protected boolean fInsertDoctype;
373 
374     // properties
375 
376     /** Modify HTML element names. */
377     protected short fNamesElems;
378 
379     /** Modify HTML attribute names. */
380     protected short fNamesAttrs;
381 
382     /** Default encoding. */
383     protected String   fDefaultIANAEncoding;
384 
385     /** Error reporter. */
386     protected HTMLErrorReporter fErrorReporter;
387 
388     /** Doctype declaration public identifier. */
389     protected String   fDoctypePubid;
390 
391     /** Doctype declaration system identifier. */
392     protected String   fDoctypeSysid;
393 
394     // boundary locator information
395 
396     /** Beginning line number. */
397     protected int fBeginLineNumber;
398 
399     /** Beginning column number. */
400     protected int fBeginColumnNumber;
401 
402     /** Ending line number. */
403     protected int fEndLineNumber;
404 
405     /** Ending column number. */
406     protected int fEndColumnNumber;
407 
408     // state
409 
410     /** The playback byte stream. */
411     protected PlaybackInputStream fByteStream;
412 
413     /** Current entity. */
414     protected CurrentEntity fCurrentEntity;
415     
416     /** The current entity stack. */
417     protected final Stack   fCurrentEntityStack = new Stack  ();
418 
419     /** The current scanner. */
420     protected Scanner fScanner;
421 
422     /** The current scanner state. */
423     protected short fScannerState;
424 
425     /** The document handler. */
426     protected XMLDocumentHandler fDocumentHandler;
427 
428     /** Auto-detected IANA encoding. */
429     protected String   fIANAEncoding;
430 
431     /** Auto-detected Java encoding. */
432     protected String   fJavaEncoding;
433 
434     /** True if the encoding matches "ISO-8859-*". */
435     protected boolean fIso8859Encoding;
436 
437     /** Element count. */
438     protected int fElementCount;
439 
440     /** Element depth. */
441     protected int fElementDepth;
442 
443     // scanners
444 
445     /** Content scanner. */
446     protected Scanner fContentScanner = new ContentScanner();
447 
448     /** 
449      * Special scanner used for elements whose content needs to be scanned 
450      * as plain text, ignoring markup such as elements and entity references.
451      * For example: &lt;SCRIPT&gt; and &lt;COMMENT&gt;.
452      */
453     protected SpecialScanner fSpecialScanner = new SpecialScanner();
454 
455     // temp vars
456 
457     /** String. */
458     protected final XMLString fString = new XMLString();
459 
460     /** String buffer. */
461     protected final XMLStringBuffer fStringBuffer = new XMLStringBuffer(1024);
462 
463     /** String buffer. */
464     private final XMLStringBuffer fStringBuffer2 = new XMLStringBuffer(1024);
465 
466     /** Non-normalized attribute string buffer. */
467     private final XMLStringBuffer fNonNormAttr = new XMLStringBuffer(128);
468 
469     /** Augmentations. */
470     private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations();
471 
472     /** Location infoset item. */
473     private final LocationItem fLocationItem = new LocationItem();
474 
475     /** Single boolean array. */
476     private final boolean[] fSingleBoolean = { false };
477 
478     /** Resource identifier. */
479     private final XMLResourceIdentifierImpl fResourceId = new XMLResourceIdentifierImpl();
480 
481     //
482     // Public methods
483     //
484 
485     /** 
486      * Pushes an input source onto the current entity stack. This 
487      * enables the scanner to transparently scan new content (e.g. 
488      * the output written by an embedded script). At the end of the
489      * current entity, the scanner returns where it left off at the
490      * time this entity source was pushed.
491      * <p>
492      * <strong>Note:</strong>
493      * This functionality is experimental at this time and is
494      * subject to change in future releases of NekoHTML.
495      *
496      * @param inputSource The new input source to start scanning.
497      */
498     public void pushInputSource(XMLInputSource inputSource) {
499         Reader   reader = inputSource.getCharacterStream();
500         if (reader == null) {
501             throw new IllegalArgumentException  ("pushed input source has no reader");
502         }
503         fCurrentEntityStack.push(fCurrentEntity);
504         String   encoding = inputSource.getEncoding();
505         String   publicId = inputSource.getPublicId();
506         String   baseSystemId = inputSource.getBaseSystemId();
507         String   literalSystemId = inputSource.getSystemId();
508         String   expandedSystemId = expandSystemId(literalSystemId, baseSystemId);
509         fCurrentEntity = new CurrentEntity(reader, encoding, 
510                                            publicId, baseSystemId,
511                                            literalSystemId, expandedSystemId);
512     } // pushInputSource(XMLInputSource)
513 
514     /**
515      * Cleans up used resources. For example, if scanning is terminated
516      * early, then this method ensures all remaining open streams are
517      * closed.
518      *
519      * @param closeall Close all streams, including the original.
520      *                 This is used in cases when the application has
521      *                 opened the original document stream and should
522      *                 be responsible for closing it.
523      */
524     public void cleanup(boolean closeall) {
525         int size = fCurrentEntityStack.size();
526         if (size > 0) {
527             // current entity is not the original, so close it
528             if (fCurrentEntity != null) {
529                 try {
530                     fCurrentEntity.stream.close();
531                 }
532                 catch (IOException   e) {
533                     // ignore
534                 }
535             }
536             // close remaining streams
537             for (int i = closeall ? 0 : 1; i < size; i++) {
538                 fCurrentEntity = (CurrentEntity)fCurrentEntityStack.pop();
539                 try {
540                     fCurrentEntity.stream.close();
541                 }
542                 catch (IOException   e) {
543                     // ignore
544                 }
545             }
546         }
547         else if (closeall && fCurrentEntity != null) {
548             try {
549                 fCurrentEntity.stream.close();
550             }
551             catch (IOException   e) {
552                 // ignore
553             }
554         }
555     } // cleanup(boolean)
556 
557     //
558     // XMLLocator methods
559     //
560 
561     /** Returns the encoding. */
562     public String   getEncoding() {
563         return fCurrentEntity != null ? fCurrentEntity.encoding : null;
564     } // getEncoding():String
565 
566     /** Returns the public identifier. */
567     public String   getPublicId() { 
568         return fCurrentEntity != null ? fCurrentEntity.publicId : null; 
569     } // getPublicId():String
570 
571     /** Returns the base system identifier. */
572     public String   getBaseSystemId() { 
573         return fCurrentEntity != null ? fCurrentEntity.baseSystemId : null; 
574     } // getBaseSystemId():String
575 
576     /** Returns the literal system identifier. */
577     public String   getLiteralSystemId() { 
578         return fCurrentEntity != null ? fCurrentEntity.literalSystemId : null; 
579     } // getLiteralSystemId():String
580 
581     /** Returns the expanded system identifier. */
582     public String   getExpandedSystemId() { 
583         return fCurrentEntity != null ? fCurrentEntity.expandedSystemId : null; 
584     } // getExpandedSystemId():String
585 
586     /** Returns the current line number. */
587     public int getLineNumber() { 
588         return fCurrentEntity != null ? fCurrentEntity.lineNumber : -1; 
589     } // getLineNumber():int
590 
591     /** Returns the current column number. */
592     public int getColumnNumber() { 
593         return fCurrentEntity != null ? fCurrentEntity.columnNumber : -1; 
594     } // getColumnNumber():int
595 
596     //
597     // HTMLComponent methods
598     //
599 
600     /** Returns the default state for a feature. */
601     public Boolean   getFeatureDefault(String   featureId) {
602         int length = RECOGNIZED_FEATURES != null ? RECOGNIZED_FEATURES.length : 0;
603         for (int i = 0; i < length; i++) {
604             if (RECOGNIZED_FEATURES[i].equals(featureId)) {
605                 return RECOGNIZED_FEATURES_DEFAULTS[i];
606             }
607         }
608         return null;
609     } // getFeatureDefault(String):Boolean
610 
611     /** Returns the default state for a property. */
612     public Object   getPropertyDefault(String   propertyId) {
613         int length = RECOGNIZED_PROPERTIES != null ? RECOGNIZED_PROPERTIES.length : 0;
614         for (int i = 0; i < length; i++) {
615             if (RECOGNIZED_PROPERTIES[i].equals(propertyId)) {
616                 return RECOGNIZED_PROPERTIES_DEFAULTS[i];
617             }
618         }
619         return null;
620     } // getPropertyDefault(String):Object
621 
622     //
623     // XMLComponent methods
624     //
625 
626     /** Returns recognized features. */
627     public String  [] getRecognizedFeatures() {
628         return RECOGNIZED_FEATURES;
629     } // getRecognizedFeatures():String[]
630 
631     /** Returns recognized properties. */
632     public String  [] getRecognizedProperties() {
633         return RECOGNIZED_PROPERTIES;
634     } // getRecognizedProperties():String[]
635 
636     /** Resets the component. */
637     public void reset(XMLComponentManager manager)
638         throws XMLConfigurationException {
639 
640         // get features
641         fAugmentations = manager.getFeature(AUGMENTATIONS);
642         fReportErrors = manager.getFeature(REPORT_ERRORS);
643         fNotifyCharRefs = manager.getFeature(NOTIFY_CHAR_REFS);
644         fNotifyXmlBuiltinRefs = manager.getFeature(NOTIFY_XML_BUILTIN_REFS);
645         fNotifyHtmlBuiltinRefs = manager.getFeature(NOTIFY_HTML_BUILTIN_REFS);
646         fFixWindowsCharRefs = manager.getFeature(FIX_MSWINDOWS_REFS);
647         fScriptStripCDATADelims = manager.getFeature(SCRIPT_STRIP_CDATA_DELIMS);
648         fScriptStripCommentDelims = manager.getFeature(SCRIPT_STRIP_COMMENT_DELIMS);
649         fStyleStripCDATADelims = manager.getFeature(STYLE_STRIP_CDATA_DELIMS);
650         fStyleStripCommentDelims = manager.getFeature(STYLE_STRIP_COMMENT_DELIMS);
651         fIgnoreSpecifiedCharset = manager.getFeature(IGNORE_SPECIFIED_CHARSET);
652         fCDATASections = manager.getFeature(CDATA_SECTIONS);
653         fOverrideDoctype = manager.getFeature(OVERRIDE_DOCTYPE);
654         fInsertDoctype = manager.getFeature(INSERT_DOCTYPE);
655 
656         // get properties
657         fNamesElems = getNamesValue(String.valueOf(manager.getProperty(NAMES_ELEMS)));
658         fNamesAttrs = getNamesValue(String.valueOf(manager.getProperty(NAMES_ATTRS)));
659         fDefaultIANAEncoding = String.valueOf(manager.getProperty(DEFAULT_ENCODING));
660         fErrorReporter = (HTMLErrorReporter)manager.getProperty(ERROR_REPORTER);
661         fDoctypePubid = String.valueOf(manager.getProperty(DOCTYPE_PUBID));
662         fDoctypeSysid = String.valueOf(manager.getProperty(DOCTYPE_SYSID));
663     
664     } // reset(XMLComponentManager)
665 
666     /** Sets a feature. */
667     public void setFeature(String   featureId, boolean state)
668         throws XMLConfigurationException {
669 
670         if (featureId.equals(AUGMENTATIONS)) { 
671             fAugmentations = state; 
672         }
673         else if (featureId.equals(IGNORE_SPECIFIED_CHARSET)) { 
674             fIgnoreSpecifiedCharset = state; 
675         }
676         else if (featureId.equals(NOTIFY_CHAR_REFS)) { 
677             fNotifyCharRefs = state; 
678         }
679         else if (featureId.equals(NOTIFY_XML_BUILTIN_REFS)) { 
680             fNotifyXmlBuiltinRefs = state; 
681         }
682         else if (featureId.equals(NOTIFY_HTML_BUILTIN_REFS)) { 
683             fNotifyHtmlBuiltinRefs = state; 
684         }
685         else if (featureId.equals(FIX_MSWINDOWS_REFS)) { 
686             fFixWindowsCharRefs = state; 
687         }
688         else if (featureId.equals(SCRIPT_STRIP_CDATA_DELIMS)) { 
689             fScriptStripCDATADelims = state; 
690         }
691         else if (featureId.equals(SCRIPT_STRIP_COMMENT_DELIMS)) { 
692             fScriptStripCommentDelims = state; 
693         }
694         else if (featureId.equals(STYLE_STRIP_CDATA_DELIMS)) { 
695             fStyleStripCDATADelims = state; 
696         }
697         else if (featureId.equals(STYLE_STRIP_COMMENT_DELIMS)) { 
698             fStyleStripCommentDelims = state; 
699         }
700         else if (featureId.equals(IGNORE_SPECIFIED_CHARSET)) { 
701             fIgnoreSpecifiedCharset = state; 
702         }
703 
704     } // setFeature(String,boolean)
705 
706     /** Sets a property. */
707     public void setProperty(String   propertyId, Object   value)
708         throws XMLConfigurationException {
709     
710         if (propertyId.equals(NAMES_ELEMS)) {
711             fNamesElems = getNamesValue(String.valueOf(value));
712             return;
713         }
714 
715         if (propertyId.equals(NAMES_ATTRS)) {
716             fNamesAttrs = getNamesValue(String.valueOf(value));
717             return;
718         }
719 
720         if (propertyId.equals(DEFAULT_ENCODING)) {
721             fDefaultIANAEncoding = String.valueOf(value);
722             return;
723         }
724 
725     } // setProperty(String,Object)
726 
727     //
728     // XMLDocumentScanner methods
729     //
730 
731     /** Sets the input source. */
732     public void setInputSource(XMLInputSource source) throws IOException   {
733 
734         // reset state
735         fElementCount = 0;
736         fElementDepth = -1;
737         fByteStream = null;
738         fCurrentEntityStack.removeAllElements();
739 
740         fBeginLineNumber = 1;
741         fBeginColumnNumber = 1;
742         fEndLineNumber = fBeginLineNumber;
743         fEndColumnNumber = fBeginColumnNumber;
744 
745         // reset encoding information
746         fIANAEncoding = fDefaultIANAEncoding;
747         fJavaEncoding = fIANAEncoding;
748 
749         // get location information
750         String   encoding = source.getEncoding();
751         String   publicId = source.getPublicId();
752         String   baseSystemId = source.getBaseSystemId();
753         String   literalSystemId = source.getSystemId();
754         String   expandedSystemId = expandSystemId(literalSystemId, baseSystemId);
755 
756         // open stream
757         Reader   reader = source.getCharacterStream();
758         if (reader == null) {
759             InputStream   inputStream = source.getByteStream();
760             if (inputStream == null) {
761                 URL   url = new URL  (expandedSystemId);
762                 inputStream = url.openStream();
763             }
764             fByteStream = new PlaybackInputStream(inputStream);
765             String  [] encodings = new String  [2];
766             if (encoding == null) {
767                 fByteStream.detectEncoding(encodings);
768             }
769             else {
770                 encodings[0] = encoding;
771             }
772             if (encodings[0] == null) {
773                 encodings[0] = fDefaultIANAEncoding;
774                 if (fReportErrors) {
775                     fErrorReporter.reportWarning("HTML1000", null);
776                 }
777             }
778             if (encodings[1] == null) {
779                 encodings[1] = EncodingMap.getIANA2JavaMapping(encodings[0].toUpperCase());
780                 if (encodings[1] == null) {
781                     encodings[1] = encodings[0];
782                     if (fReportErrors) {
783                         fErrorReporter.reportWarning("HTML1001", new Object  []{encodings[0]});
784                     }
785                 }
786             }
787             fIANAEncoding = encodings[0];
788             fJavaEncoding = encodings[1];
789             /* PATCH: Asgeir Asgeirsson */
790             fIso8859Encoding = fIANAEncoding == null 
791                             || fIANAEncoding.toUpperCase().startsWith("ISO-8859")
792                             || fIANAEncoding.equalsIgnoreCase(fDefaultIANAEncoding);
793             encoding = fIANAEncoding;
794             reader = new InputStreamReader  (fByteStream, fJavaEncoding);
795         }
796         fCurrentEntity = new CurrentEntity(reader, encoding,
797                                            publicId, baseSystemId,
798                                            literalSystemId, expandedSystemId);
799 
800         // set scanner and state
801         setScanner(fContentScanner);
802         setScannerState(STATE_START_DOCUMENT);
803 
804     } // setInputSource(XMLInputSource)
805 
806     /** Scans the document. */
807     public boolean scanDocument(boolean complete) throws XNIException, IOException   {
808         do {
809             if (!fScanner.scan(complete)) {
810                 return false;
811             }
812         } while (complete);
813         return true;
814     } // scanDocument(boolean):boolean
815 
816     /** Sets the document handler. */
817     public void setDocumentHandler(XMLDocumentHandler handler) {
818         fDocumentHandler = handler;
819     } // setDocumentHandler(XMLDocumentHandler)
820 
821     // @since Xerces 2.1.0
822 
823     /** Returns the document handler. */
824     public XMLDocumentHandler getDocumentHandler() {
825         return fDocumentHandler;
826     } // getDocumentHandler():XMLDocumentHandler
827 
828     //
829     // Protected static methods
830     //
831 
832     /** Returns the value of the specified attribute, ignoring case. */
833     protected static String   getValue(XMLAttributes attrs, String   aname) {
834         int length = attrs != null ? attrs.getLength() : 0;
835         for (int i = 0; i < length; i++) {
836             if (attrs.getQName(i).equalsIgnoreCase(aname)) {
837                 return attrs.getValue(i);
838             }
839         }
840         return null;
841     } // getValue(XMLAttributes,String):String
842 
843     /**
844      * Expands a system id and returns the system id as a URI, if
845      * it can be expanded. A return value of null means that the
846      * identifier is already expanded. An exception thrown
847      * indicates a failure to expand the id.
848      *
849      * @param systemId The systemId to be expanded.
850      *
851      * @return Returns the URI string representing the expanded system
852      *         identifier. A null value indicates that the given
853      *         system identifier is already expanded.
854      *
855      */
856     public static String   expandSystemId(String   systemId, String   baseSystemId) {
857 
858         // check for bad parameters id
859         if (systemId == null || systemId.length() == 0) {
860             return systemId;
861         }
862         // if id already expanded, return
863         try {
864             URI uri = new URI(systemId);
865             if (uri != null) {
866                 return systemId;
867             }
868         }
869         catch (URI.MalformedURIException e) {
870             // continue on...
871         }
872         // normalize id
873         String   id = fixURI(systemId);
874 
875         // normalize base
876         URI base = null;
877         URI uri = null;
878         try {
879             if (baseSystemId == null || baseSystemId.length() == 0 ||
880                 baseSystemId.equals(systemId)) {
881                 String   dir;
882                 try {
883                     dir = fixURI(System.getProperty("user.dir"));
884                 }
885                 catch (SecurityException   se) {
886                     dir = "";
887                 }
888                 if (!dir.endsWith("/")) {
889                     dir = dir + "/";
890                 }
891                 base = new URI("file", "", dir, null, null);
892             }
893             else {
894                 try {
895                     base = new URI(fixURI(baseSystemId));
896                 }
897                 catch (URI.MalformedURIException e) {
898                     String   dir;
899                     try {
900                         dir = fixURI(System.getProperty("user.dir"));
901                     }
902                     catch (SecurityException   se) {
903                         dir = "";
904                     }
905                     if (baseSystemId.indexOf(':') != -1) {
906                         // for xml schemas we might have baseURI with
907                         // a specified drive
908                         base = new URI("file", "", fixURI(baseSystemId), null, null);
909                     }
910                     else {
911                         if (!dir.endsWith("/")) {
912                             dir = dir + "/";
913                         }
914                         dir = dir + fixURI(baseSystemId);
915                         base = new URI("file", "", dir, null, null);
916                     }
917                 }
918              }
919              // expand id
920              uri = new URI(base, id);
921         }
922         catch (URI.MalformedURIException e) {
923             // let it go through
924         }
925 
926         if (uri == null) {
927             return systemId;
928         }
929         return uri.toString();
930 
931     } // expandSystemId(String,String):String
932 
933     /**
934      * Fixes a platform dependent filename to standard URI form.
935      *
936      * @param str The string to fix.
937      *
938      * @return Returns the fixed URI string.
939      */
940     protected static String   fixURI(String   str) {
941 
942         // handle platform dependent strings
943         str = str.replace(java.io.File.separatorChar, '/');
944 
945         // Windows fix
946         if (str.length() >= 2) {
947             char ch1 = str.charAt(1);
948             // change "C:blah" to "/C:blah"
949             if (ch1 == ':') {
950                 char ch0 = Character.toUpperCase(str.charAt(0));
951                 if (ch0 >= 'A' && ch0 <= 'Z') {
952                     str = "/" + str;
953                 }
954             }
955             // change "//blah" to "file://blah"
956             else if (ch1 == '/' && str.charAt(0) == '/') {
957                 str = "file:" + str;
958             }
959         }
960 
961         // done
962         return str;
963 
964     } // fixURI(String):String
965 
966     /** Modifies the given name based on the specified mode. */
967     protected static final String   modifyName(String   name, short mode) {
968         switch (mode) {
969             case NAMES_UPPERCASE: return name.toUpperCase();
970             case NAMES_LOWERCASE: return name.toLowerCase();
971         }
972         return name;
973     } // modifyName(String,short):String
974 
975     /**
976      * Converts HTML names string value to constant value. 
977      *
978      * @see #NAMES_NO_CHANGE
979      * @see #NAMES_LOWERCASE
980      * @see #NAMES_UPPERCASE
981      */
982     protected static final short getNamesValue(String   value) {
983         if (value.equals("lower")) {
984             return NAMES_LOWERCASE;
985         }
986         if (value.equals("upper")) {
987             return NAMES_UPPERCASE;
988         }
989         return NAMES_NO_CHANGE;
990     } // getNamesValue(String):short
991 
992     /**
993      * Fixes Microsoft Windows&reg; specific characters.
994      * <p>
995      * Details about this common problem can be found at 
996      * <a HREF='http://www.cs.tut.fi/~jkorpela/www/windows-chars.html'>http://www.cs.tut.fi/~jkorpela/www/windows-chars.html</a>
997      */
998     protected int fixWindowsCharacter(int origChar) {
999         /* PATCH: Asgeir Asgeirsson */
1000        switch(origChar) {
1001            case 130: return 8218;
1002            case 131: return 402;
1003            case 132: return 8222;
1004            case 133: return 8230;
1005            case 134: return 8224;
1006            case 135: return 8225;
1007            case 136: return 710;
1008            case 137: return 8240;
1009            case 138: return 352;
1010            case 139: return 8249;
1011            case 140: return 338;
1012            case 145: return 8216;
1013            case 146: return 8217;
1014            case 147: return 8220;
1015            case 148: return 8221;
1016            case 149: return 8226;
1017            case 150: return 8211;
1018            case 151: return 8212;
1019            case 152: return 732;
1020            case 153: return 8482;
1021            case 154: return 353;
1022            case 155: return 8250;
1023            case 156: return 339;
1024            case 159: return 376;
1025        }
1026        return origChar;
1027    } // fixWindowsCharacter(int):int
1028
1029    //
1030    // Protected methods
1031    //
1032
1033    // i/o
1034
1035    /** Reads a single character. */
1036    protected int read() throws IOException   {
1037        if (DEBUG_BUFFER) { 
1038            System.out.print("(read: ");
1039            printBuffer();
1040            System.out.println();
1041        }
1042        if (fCurrentEntity.offset == fCurrentEntity.length) {
1043            if (load(0) == -1) {
1044                if (DEBUG_BUFFER) { 
1045                    System.out.println(")read: -> -1");
1046                }
1047                return -1;
1048            }
1049        }
1050        int c = fCurrentEntity.buffer[fCurrentEntity.offset++];
1051        fCurrentEntity.columnNumber++;
1052        if (DEBUG_BUFFER) { 
1053            System.out.print(")read: ");
1054            printBuffer();
1055            System.out.print(" -> ");
1056            System.out.print(c);
1057            System.out.println();
1058        }
1059        return c;
1060    } // read():int
1061
1062    /** 
1063     * Loads a new chunk of data into the buffer and returns the number of
1064     * characters loaded or -1 if no additional characters were loaded.
1065     *
1066     * @param offset The offset at which new characters should be loaded.
1067     */
1068    protected int load(int offset) throws IOException   {
1069        if (DEBUG_BUFFER) { 
1070            System.out.print("(load: ");
1071            printBuffer();
1072            System.out.println();
1073        }
1074        // resize buffer, if needed
1075        if (offset == fCurrentEntity.buffer.length) {
1076            int adjust = fCurrentEntity.buffer.length / 4;
1077            char[] array = new char[fCurrentEntity.buffer.length + adjust];
1078            System.arraycopy(fCurrentEntity.buffer, 0, array, 0, fCurrentEntity.length);
1079            fCurrentEntity.buffer = array;
1080        }
1081        // read a block of characters
1082        int count = fCurrentEntity.stream.read(fCurrentEntity.buffer, offset, fCurrentEntity.buffer.length - offset);
1083        fCurrentEntity.length = count != -1 ? count + offset : offset;
1084        fCurrentEntity.offset = offset;
1085        if (DEBUG_BUFFER) { 
1086            System.out.print(")load: ");
1087            printBuffer();
1088            System.out.print(" -> ");
1089            System.out.print(count);
1090            System.out.println();
1091        }
1092        return count;
1093    } // load():int
1094
1095    // debugging
1096
1097    /** Sets the scanner. */
1098    protected void setScanner(Scanner scanner) {
1099        fScanner = scanner;
1100        if (DEBUG_SCANNER) {
1101            System.out.print("$$$ setScanner(");
1102            System.out.print(scanner!=null?scanner.getClass().getName():"null");
1103            System.out.println(");");
1104        }
1105    } // setScanner(Scanner)
1106    
1107    /** Sets the scanner state. */
1108    protected void setScannerState(short state) {
1109        fScannerState = state;
1110        if (DEBUG_SCANNER_STATE) {
1111            System.out.print("$$$ setScannerState(");
1112            switch (fScannerState) {
1113                case STATE_CONTENT: { System.out.print("STATE_CONTENT"); break; }
1114                case STATE_MARKUP_BRACKET: { System.out.print("STATE_MARKUP_BRACKET"); break; }
1115                case STATE_START_DOCUMENT: { System.out.print("STATE_START_DOCUMENT"); break; }
1116                case STATE_END_DOCUMENT: { System.out.print("STATE_END_DOCUMENT"); break; }
1117            }
1118            System.out.println(");");
1119        }
1120    } // setScannerState(short)
1121
1122    // scanning
1123
1124    /** Scans a DOCTYPE line. */
1125    protected void scanDoctype() throws IOException   {
1126        String   root = null;
1127        String   pubid = null;
1128        String   sysid = null;
1129
1130        if (skipSpaces()) {
1131            root = scanName();
1132            if (root == null) {
1133                if (fReportErrors) {
1134                    fErrorReporter.reportError("HTML1014", null);
1135                }
1136            }
1137            else {
1138                root = modifyName(root, fNamesElems);
1139            }
1140            if (skipSpaces()) {
1141                if (skip("PUBLIC", false)) {
1142                    skipSpaces();
1143                    pubid = scanLiteral();
1144                    if (skipSpaces()) {
1145                        sysid = scanLiteral();
1146                    }
1147                }
1148                else if (skip("SYSTEM", false)) {
1149                    skipSpaces();
1150                    sysid = scanLiteral();
1151                }
1152            }
1153        }
1154        int c;
1155        while ((c = read()) != -1) {
1156            if (c == '<') {
1157                fCurrentEntity.offset--;
1158                fCurrentEntity.columnNumber--;
1159                break;
1160            }
1161            if (c == '>') {
1162                break;
1163            }
1164            if (c == '[') {
1165                skipMarkup(true);
1166                break;
1167            }
1168        }
1169
1170        if (fDocumentHandler != null) {
1171            if (fOverrideDoctype) {
1172                pubid = fDoctypePubid;
1173                sysid = fDoctypeSysid;
1174            }
1175            fEndLineNumber = fCurrentEntity.lineNumber;
1176            fEndColumnNumber = fCurrentEntity.columnNumber;
1177            fDocumentHandler.doctypeDecl(root, pubid, sysid, locationAugs());
1178        }
1179
1180    } // scanDoctype()
1181
1182    /** Scans a quoted literal. */
1183    protected String   scanLiteral() throws IOException   {
1184        int quote = read();
1185        if (quote == '\'' || quote == '"') {
1186            StringBuffer   str = new StringBuffer  ();
1187            int c;
1188            while ((c = read()) != -1) {
1189                if (c == quote) {
1190                    break;
1191                }
1192                if (c == '\r' || c == '\n') {
1193                    fCurrentEntity.offset--;
1194                    fCurrentEntity.columnNumber--;
1195                    // NOTE: This collapses newlines to a single space.
1196                    //       [Q] Is this the right thing to do here? -Ac
1197                    skipNewlines();
1198                    str.append(' ');
1199                }
1200                else if (c == '<') {
1201                    fCurrentEntity.offset--;
1202                    fCurrentEntity.columnNumber--;
1203                    break;
1204                }
1205                else {
1206                    str.append((char)c);
1207                }
1208            }
1209            if (c == -1) {
1210                if (fReportErrors) {
1211                    fErrorReporter.reportError("HTML1007", null);
1212                }
1213                throw new EOFException  ();
1214            }
1215            return str.toString();
1216        }
1217        else {
1218            fCurrentEntity.offset--;
1219            fCurrentEntity.columnNumber--;
1220        }
1221        return null;
1222    } // scanLiteral():String
1223
1224    /** Scans a name. */
1225    protected String   scanName() throws IOException   {
1226        if (DEBUG_BUFFER) {
1227            System.out.print("(scanName: ");
1228            printBuffer();
1229            System.out.println();
1230        }
1231        if (fCurrentEntity.offset == fCurrentEntity.length) {
1232            if (load(0) == -1) {
1233                if (DEBUG_BUFFER) {
1234                    System.out.print(")scanName: ");
1235                    printBuffer();
1236                    System.out.println(" -> null");
1237                }
1238                return null;
1239            }
1240        }
1241        int offset = fCurrentEntity.offset;
1242        while (true) {
1243            while (fCurrentEntity.offset < fCurrentEntity.length) {
1244                char c = fCurrentEntity.buffer[fCurrentEntity.offset];
1245                if (!Character.isLetterOrDigit(c) &&
1246                    !(c == '-' || c == '.' || c == ':' || c == '_')) {
1247                    break;
1248                }
1249                fCurrentEntity.offset++;
1250                fCurrentEntity.columnNumber++;
1251            }
1252            if (fCurrentEntity.offset == fCurrentEntity.length) {
1253                int length = fCurrentEntity.length - offset;
1254                System.arraycopy(fCurrentEntity.buffer, offset, fCurrentEntity.buffer, 0, length);
1255                int count = load(length);
1256                offset = 0;
1257                if (count == -1) {
1258                    break;
1259                }
1260            }
1261            else {
1262                break;
1263            }
1264        }
1265        int length = fCurrentEntity.offset - offset;
1266        String   name = length > 0 ? new String  (fCurrentEntity.buffer, offset, length) : null;
1267        if (DEBUG_BUFFER) {
1268            System.out.print(")scanName: ");
1269            printBuffer();
1270            System.out.print(" -> \"");
1271            System.out.print(name);
1272            System.out.println('"');
1273        }
1274        return name;
1275    } // scanName():String
1276
1277    /** Scans an entity reference. */
1278    protected int scanEntityRef(XMLStringBuffer str, boolean content) 
1279        throws IOException   {
1280        str.clear();
1281        str.append('&');
1282        while (true) {
1283            int c = read();
1284            if (c == ';') {
1285                str.append(';');
1286                break;
1287            }
1288            if (c == -1) {
1289                if (fReportErrors) {
1290                    fErrorReporter.reportWarning("HTML1004", null);
1291                }
1292                if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
1293                    fEndLineNumber = fCurrentEntity.lineNumber;
1294                    fEndColumnNumber = fCurrentEntity.columnNumber;
1295                    fDocumentHandler.characters(str, locationAugs());
1296                }
1297                return -1;
1298            }
1299            if (!Character.isLetterOrDigit((char)c) && c != '#') {
1300                if (fReportErrors) {
1301                    fErrorReporter.reportWarning("HTML1004", null);
1302                }
1303                fCurrentEntity.offset--;
1304                fCurrentEntity.columnNumber--;
1305                if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
1306                    fEndLineNumber = fCurrentEntity.lineNumber;
1307                    fEndColumnNumber = fCurrentEntity.columnNumber;
1308                    fDocumentHandler.characters(str, locationAugs());
1309                }
1310                return -1;
1311            }
1312            str.append((char)c);
1313        }
1314        if (str.length == 1) {
1315            if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
1316                fEndLineNumber = fCurrentEntity.lineNumber;
1317                fEndColumnNumber = fCurrentEntity.columnNumber;
1318                fDocumentHandler.characters(str, locationAugs());
1319            }
1320            return -1;
1321        }
1322
1323        String   name = str.toString().substring(1, str.length-1);
1324        if (name.startsWith("#")) {
1325            int value = -1;
1326            try {
1327                if (name.startsWith("#x")) {
1328                    value = Integer.parseInt(name.substring(2), 16);
1329                }
1330                else {
1331                    value = Integer.parseInt(name.substring(1));
1332                }
1333                /* PATCH: Asgeir Asgeirsson */
1334                if (fFixWindowsCharRefs && fIso8859Encoding) {
1335                    value = fixWindowsCharacter(value);
1336                }
1337                if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
1338                    fEndLineNumber = fCurrentEntity.lineNumber;
1339                    fEndColumnNumber = fCurrentEntity.columnNumber;
1340                    if (fNotifyCharRefs) {
1341                        XMLResourceIdentifier id = resourceId();
1342                        String   encoding = null;
1343                        fDocumentHandler.startGeneralEntity(name, id, encoding, locationAugs());
1344                    }
1345                    str.clear();
1346                    str.append((char)value);
1347                    fDocumentHandler.characters(str, locationAugs());
1348                    if (fNotifyCharRefs) {
1349                        fDocumentHandler.endGeneralEntity(name, locationAugs());
1350                    }
1351                }
1352            }
1353            catch (NumberFormatException   e) {
1354                if (fReportErrors) {
1355                    fErrorReporter.reportError("HTML1005", new Object  []{name});
1356                }
1357                if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
1358                    fEndLineNumber = fCurrentEntity.lineNumber;
1359                    fEndColumnNumber = fCurrentEntity.columnNumber;
1360                    fDocumentHandler.characters(str, locationAugs());
1361                }
1362            }
1363            return value;
1364        }
1365
1366        int c = HTMLEntities.get(name);
1367        if (c == -1) {
1368            if (fReportErrors) {
1369                fErrorReporter.reportWarning("HTML1006", new Object  []{name});
1370            }
1371            if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
1372                fEndLineNumber = fCurrentEntity.lineNumber;
1373                fEndColumnNumber = fCurrentEntity.columnNumber;
1374                fDocumentHandler.characters(str, locationAugs());
1375            }
1376            return -1;
1377        }
1378        if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
1379            fEndLineNumber = fCurrentEntity.lineNumber;
1380            fEndColumnNumber = fCurrentEntity.columnNumber;
1381            boolean notify = fNotifyHtmlBuiltinRefs || (fNotifyXmlBuiltinRefs && builtinXmlRef(name));
1382            if (notify) {
1383                XMLResourceIdentifier id = resourceId();
1384                String   encoding = null;
1385                fDocumentHandler.startGeneralEntity(name, id, encoding, locationAugs());
1386            }
1387            str.clear();
1388            str.append((char)c);
1389            fDocumentHandler.characters(str, locationAugs());
1390            if (notify) {
1391                fDocumentHandler.endGeneralEntity(name, locationAugs());
1392            }
1393        }
1394        return c;
1395
1396    } // scanEntityRef(XMLStringBuffer,boolean):int
1397
1398    /** Returns true if the specified text is present and is skipped. */
1399    protected boolean skip(String   s, boolean caseSensitive) throws IOException   {
1400        int length = s != null ? s.length() : 0;
1401        for (int i = 0; i < length; i++) {
1402            if (fCurrentEntity.offset == fCurrentEntity.length) {
1403                System.arraycopy(fCurrentEntity.buffer, fCurrentEntity.offset - i, fCurrentEntity.buffer, 0, i);
1404                if (load(i) == -1) {
1405                    fCurrentEntity.offset = 0;
1406                    return false;
1407                }
1408            }
1409            char c0 = s.charAt(i);
1410            char c1 = fCurrentEntity.buffer[fCurrentEntity.offset++];
1411            fCurrentEntity.columnNumber++;
1412            if (!caseSensitive) {
1413                c0 = Character.toUpperCase(c0);
1414                c1 = Character.toUpperCase(c1);
1415            }
1416            if (c0 != c1) {
1417                fCurrentEntity.offset -= i + 1;
1418                return false;
1419            }
1420        }
1421        return true;
1422    } // skip(String):boolean
1423
1424    /** Skips markup. */
1425    protected boolean skipMarkup(boolean balance) throws IOException   {
1426        if (DEBUG_BUFFER) {
1427            System.out.print("(skipMarkup: ");
1428            printBuffer();
1429            System.out.println();
1430        }
1431        int depth = 1;
1432        boolean slashgt = false;
1433        OUTER: while (true) {
1434            if (fCurrentEntity.offset == fCurrentEntity.length) {
1435                if (load(0) == -1) {
1436                    break OUTER;
1437                }
1438            }
1439            while (fCurrentEntity.offset < fCurrentEntity.length) {
1440                char c = fCurrentEntity.buffer[fCurrentEntity.offset++];
1441                fCurrentEntity.columnNumber++;
1442                if (balance && c == '<') {
1443                    depth++;
1444                }
1445                else if (c == '>') {
1446                    depth--;
1447                    if (depth == 0) {
1448                        break OUTER;
1449                    }
1450                }
1451                else if (c == '/') {
1452                    if (fCurrentEntity.offset == fCurrentEntity.length) {
1453                        if (load(0) == -1) {
1454                            break OUTER;
1455                        }
1456                    }
1457                    c = fCurrentEntity.buffer[fCurrentEntity.offset++];
1458                    fCurrentEntity.columnNumber++;
1459                    if (c == '>') {
1460                        slashgt = true;
1461                        depth--;
1462                        if (depth == 0) {
1463                            break OUTER;
1464                        }
1465                    }
1466                    else {
1467                        fCurrentEntity.offset--;
1468                        fCurrentEntity.columnNumber--;
1469                    }
1470                }
1471                else if (c == '\r' || c == '\n') {
1472                    skipNewlines();
1473                }
1474            }
1475        }
1476        if (DEBUG_BUFFER) {
1477            System.out.print(")skipMarkup: ");
1478            printBuffer();
1479            System.out.print(" -> "+slashgt);
1480            System.out.println();
1481        }
1482        return slashgt;
1483    } // skipMarkup():boolean
1484
1485    /** Skips whitespace. */
1486    protected boolean skipSpaces() throws IOException   {
1487        if (DEBUG_BUFFER) {
1488            System.out.print("(skipSpaces: ");
1489            printBuffer();
1490            System.out.println();
1491        }
1492        boolean spaces = false;
1493        while (true) {
1494            if (fCurrentEntity.offset == fCurrentEntity.length) {
1495                if (load(0) == -1) {
1496                    break;
1497                }
1498            }
1499            char c = fCurrentEntity.buffer[fCurrentEntity.offset];
1500            if (!Character.isSpace(c)) {
1501                break;
1502            }
1503            spaces = true;
1504            if (c == '\r' || c == '\n') {
1505                skipNewlines();
1506                continue;
1507            }
1508            fCurrentEntity.offset++;
1509            fCurrentEntity.columnNumber++;
1510        }
1511        if (DEBUG_BUFFER) {
1512            System.out.print(")skipSpaces: ");
1513            printBuffer();
1514            System.out.print(" -> ");
1515            System.out.print(spaces);
1516            System.out.println();
1517        }
1518        return spaces;
1519    } // skipSpaces()
1520
1521    /** Skips newlines and returns the number of newlines skipped. */
1522    protected int skipNewlines() throws IOException   {
1523        return skipNewlines(Integer.MAX_VALUE);
1524    } // skipNewlines():int
1525
1526    /** Skips newlines and returns the number of newlines skipped. */
1527    protected int skipNewlines(int maxlines) throws IOException   {
1528        if (DEBUG_BUFFER) {
1529            System.out.print("(skipNewlines: ");
1530            printBuffer();
1531            System.out.println();
1532        }
1533        if (fCurrentEntity.offset == fCurrentEntity.length) {
1534            if (load(0) == -1) {
1535                if (DEBUG_BUFFER) {
1536                    System.out.print(")skipNewlines: ");
1537                    printBuffer();
1538                    System.out.println();
1539                }
1540                return 0;
1541            }
1542        }
1543        char c = fCurrentEntity.buffer[fCurrentEntity.offset];
1544        int newlines = 0;
1545        int offset = fCurrentEntity.offset;
1546        if (c == '\n' || c == '\r') {
1547            do {
1548                c = fCurrentEntity.buffer[fCurrentEntity.offset++];
1549                if (c == '\r') {
1550                    newlines++;
1551                    if (fCurrentEntity.offset == fCurrentEntity.length) {
1552                        offset = 0;
1553                        fCurrentEntity.offset = newlines;
1554                        if (load(newlines) == -1) {
1555                            break;
1556                        }
1557                    }
1558                    if (fCurrentEntity.buffer[fCurrentEntity.offset] == '\n') {
1559                        fCurrentEntity.offset++;
1560                        offset++;
1561                    }
1562                }
1563                else if (c == '\n') {
1564                    newlines++;
1565                    if (fCurrentEntity.offset == fCurrentEntity.length) {
1566                        offset = 0;
1567                        fCurrentEntity.offset = newlines;
1568                        if (load(newlines) == -1) {
1569                            break;
1570                        }
1571                    }
1572                }
1573                else {
1574                    fCurrentEntity.offset--;
1575                    break;
1576                }
1577            } while (newlines < maxlines &&
1578                     fCurrentEntity.offset < fCurrentEntity.length - 1);
1579            fCurrentEntity.lineNumber += newlines;
1580            fCurrentEntity.columnNumber = 1;
1581        }
1582        if (DEBUG_BUFFER) {
1583            System.out.print(")skipNewlines: ");
1584            printBuffer();
1585            System.out.print(" -> ");
1586            System.out.print(newlines);
1587            System.out.println();
1588        }
1589        return newlines;
1590    } // skipNewlines(int):int
1591
1592    // infoset utility methods
1593
1594    /** Returns an augmentations object with a location item added. */
1595    protected final Augmentations locationAugs() {
1596        HTMLAugmentations augs = null;
1597        if (fAugmentations) {
1598            fLocationItem.setValues(fBeginLineNumber, fBeginColumnNumber, 
1599                                    fEndLineNumber, fEndColumnNumber);
1600            augs = fInfosetAugs;
1601            augs.removeAllItems();
1602            augs.putItem(AUGMENTATIONS, fLocationItem);
1603        }
1604        return augs;
1605    } // locationAugs():Augmentations
1606
1607    /** Returns an augmentations object with a synthesized item added. */
1608    protected final Augmentations synthesizedAugs() {
1609        HTMLAugmentations augs = null;
1610        if (fAugmentations) {
1611            augs = fInfosetAugs;
1612            augs.removeAllItems();
1613            augs.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM);
1614        }
1615        return augs;
1616    } // synthesizedAugs():Augmentations
1617
    /**
     * Returns an empty resource identifier.
     * <p>
     * The same resource identifier object is cleared and handed out on
     * every call, so callers must not hold on to it across calls.
     *
     * @return The cleared, non-null resource identifier.
     */
    protected final XMLResourceIdentifier resourceId() {
        // NOTE: Unfortunately, the Xerces DOM parser classes expect a
        //       non-null resource identifier object to be passed to
        //       startGeneralEntity, so null cannot be returned here. -Ac
        fResourceId.clear();
        return fResourceId;
    } // resourceId():XMLResourceIdentifier
1630
1631    //
1632    // Protected static methods
1633    //
1634
1635    /** Returns true if the name is a built-in XML general entity reference. */
1636    protected static boolean builtinXmlRef(String   name) {
1637        return name.equals("amp") || name.equals("lt") || name.equals("gt") ||
1638               name.equals("quot") || name.equals("apos");
1639    } // builtinXmlRef(String):boolean
1640
1641    //
1642    // Private methods
1643    //
1644
1645    /** Prints the contents of the character buffer to standard out. */
1646    private void printBuffer() {
1647        if (DEBUG_BUFFER) {
1648            System.out.print('[');
1649            System.out.print(fCurrentEntity.length);
1650            System.out.print(' ');
1651            System.out.print(fCurrentEntity.offset);
1652            if (fCurrentEntity.length > 0) {
1653                System.out.print(" \"");
1654                for (int i = 0; i < fCurrentEntity.length; i++) {
1655                    if (i == fCurrentEntity.offset) {
1656                        System.out.print('^');
1657                    }
1658                    char c = fCurrentEntity.buffer[i];
1659                    switch (c) {
1660                        case '\r': {
1661                            System.out.print("\\r");
1662                            break;
1663                        }
1664                        case '\n': {
1665                            System.out.print("\\n");
1666                            break;
1667                        }
1668                        case '\t': {
1669                            System.out.print("\\t");
1670                            break;
1671                        }
1672                        case '"': {
1673                            System.out.print("\\\"");
1674                            break;
1675                        }
1676                        default: {
1677                            System.out.print(c);
1678                        }
1679                    }
1680                }
1681                if (fCurrentEntity.offset == fCurrentEntity.length) {
1682                    System.out.print('^');
1683                }
1684                System.out.print('"');
1685            }
1686            System.out.print(']');
1687        }
1688    } // printBuffer()
1689
1690    //
1691    // Interfaces
1692    //
1693
1694    /**
1695     * Basic scanner interface.
1696     *
1697     * @author Andy Clark
1698     */
1699    public interface Scanner {
1700
1701        //
1702        // Scanner methods
1703        //
1704
1705        /** 
1706         * Scans part of the document. This interface allows scanning to
1707         * be performed in a pulling manner.
1708         *
1709         * @param complete True if the scanner should not return until
1710         *                 scanning is complete.
1711         *
1712         * @return True if additional scanning is required.
1713         *
1714         * @throws IOException Thrown if I/O error occurs.
1715         */
1716        public boolean scan(boolean complete) throws IOException  ;
1717
1718    } // interface Scanner
1719
1720    //
1721    // Classes
1722    //
1723
1724    /**
1725     * Current entity.
1726     *
1727     * @author Andy Clark
1728     */
1729    public static class CurrentEntity {
1730
1731        //
1732        // Data
1733        //
1734
1735        /** Character stream. */
1736        public Reader   stream;
1737
1738        /** Encoding. */
1739        public String   encoding;
1740
1741        /** Public identifier. */
1742        public String   publicId;
1743
1744        /** Base system identifier. */
1745        public String   baseSystemId;
1746
1747        /** Literal system identifier. */
1748        public String   literalSystemId;
1749
1750        /** Expanded system identifier. */
1751        public String   expandedSystemId;
1752
1753        /** Line number. */
1754        public int lineNumber = 1;
1755
1756        /** Column number. */
1757        public int columnNumber = 1;
1758
1759        // buffer
1760
1761        /** Character buffer. */
1762        public char[] buffer = new char[DEFAULT_BUFFER_SIZE];
1763
1764        /** Offset into character buffer. */
1765        public int offset = 0;
1766
1767        /** Length of characters read into character buffer. */
1768        public int length = 0;
1769
1770        //
1771        // Constructors
1772        //
1773
1774        /** Constructs an entity from the specified stream. */
1775        public CurrentEntity(Reader   stream, String   encoding, 
1776                             String   publicId, String   baseSystemId,
1777                             String   literalSystemId, String   expandedSystemId) {
1778            this.stream = stream;
1779            this.encoding = encoding;
1780            this.publicId = publicId;
1781            this.baseSystemId = baseSystemId;
1782            this.literalSystemId = literalSystemId;
1783            this.expandedSystemId = expandedSystemId;
1784        } // <init>(Reader,String,String,String,String)
1785
1786    } // class CurrentEntity
1787
1788    /**
1789     * The primary HTML document scanner.
1790     *
1791     * @author Andy Clark
1792     */
1793    public class ContentScanner 
1794        implements Scanner {
1795
1796        //
1797        // Data
1798        //
1799
1800        // temp vars
1801
        /**
         * A qualified name. Scratch object reused by scanPI() while
         * lower-casing the names of the scanned pseudo-attributes.
         */
        private final QName fQName = new QName();

        /**
         * Attributes. Reused for every start element and xml declaration;
         * it is emptied before each of them is scanned.
         */
        private final XMLAttributesImpl fAttributes = new XMLAttributesImpl();
1807
1808        //
1809        // Scanner methods
1810        //
1811
1812        /** Scan. */
1813        public boolean scan(boolean complete) throws IOException   {
1814            boolean next;
1815            do {
1816                try {
1817                    next = false;
1818                    switch (fScannerState) {
1819                        case STATE_CONTENT: {
1820                            fBeginLineNumber = fCurrentEntity.lineNumber;
1821                            fBeginColumnNumber = fCurrentEntity.columnNumber;
1822                            int c = read();
1823                            if (c == '<') {
1824                                setScannerState(STATE_MARKUP_BRACKET);
1825                                next = true;
1826                            }
1827                            else if (c == '&') {
1828                                scanEntityRef(fStringBuffer, true);
1829                            }
1830                            else if (c == -1) {
1831                                throw new EOFException  ();
1832                            }
1833                            else {
1834                                fCurrentEntity.offset--;
1835                                fCurrentEntity.columnNumber--;
1836                                scanCharacters();
1837                            }
1838                            break;
1839                        }
1840                        case STATE_MARKUP_BRACKET: {
1841                            int c = read();
1842                            if (c == '!') {
1843                                if (skip("--", false)) {
1844                                    scanComment();
1845                                }
1846                                else if (skip("[CDATA[", false)) {
1847                                    scanCDATA();
1848                                }
1849                                else if (skip("DOCTYPE", false)) {
1850                                    scanDoctype();
1851                                }
1852                                else {
1853                                    if (fReportErrors) {
1854                                        fErrorReporter.reportError("HTML1002", null);
1855                                    }
1856                                    skipMarkup(true);
1857                                }
1858                            }
1859                            else if (c == '?') {
1860                                scanPI();
1861                            }
1862                            else if (c == '/') {
1863                                scanEndElement();
1864                            }
1865                            else if (c == -1) {
1866                                if (fReportErrors) {
1867                                    fErrorReporter.reportError("HTML1003", null);
1868                                }
1869                                if (fDocumentHandler != null && fElementCount >= fElementDepth) {
1870                                    fStringBuffer.clear();
1871                                    fStringBuffer.append('<');
1872                                    fDocumentHandler.characters(fStringBuffer, null);
1873                                }
1874                                throw new EOFException  ();
1875                            }
1876                            else {
1877                                fCurrentEntity.offset--;
1878                                fCurrentEntity.columnNumber--;
1879                                fElementCount++;
1880                                fSingleBoolean[0] = false;
1881                                String   ename = scanStartElement(fSingleBoolean);
1882                                if (ename != null && !fSingleBoolean[0] &&
1883                                    HTMLElements.getElement(ename).isSpecial()) {
1884                                    setScanner(fSpecialScanner.setElementName(ename));
1885                                    setScannerState(STATE_CONTENT);
1886                                    return true;
1887                                }
1888                            }
1889                            setScannerState(STATE_CONTENT);
1890                            break;
1891                        }
1892                        case STATE_START_DOCUMENT: {
1893                            if (fDocumentHandler != null && fElementCount >= fElementDepth) {
1894                                if (DEBUG_CALLBACKS) {
1895                                    System.out.println("startDocument()");
1896                                }
1897                                XMLLocator locator = HTMLScanner.this;
1898                                String   encoding = fIANAEncoding;
1899                                Augmentations augs = locationAugs();
1900                                try {
1901                                    // NOTE: Hack to allow the default filter to work with
1902                                    //       old and new versions of the XNI document handler
1903                                    //       interface. -Ac
1904                                    Class   cls = fDocumentHandler.getClass();
1905                                    Class  [] types = {
1906                                        XMLLocator.class, String  .class,
1907                                        NamespaceContext.class, Augmentations.class
1908                                    };
1909                                    Method   method = cls.getMethod("startDocument", types);
1910                                    NamespaceContext nscontext = new NamespaceSupport();
1911                                    Object  [] params = {
1912                                        locator, encoding, 
1913                                        nscontext, augs
1914                                    };
1915                                    method.invoke(fDocumentHandler, params);
1916                                }
1917                                catch (IllegalAccessException   e) {
1918                                    throw new XNIException(e);
1919                                } 
1920                                catch (InvocationTargetException   e) {
1921                                    throw new XNIException(e);
1922                                } 
1923                                catch (NoSuchMethodException   e) {
1924                                    try {
1925                                        // NOTE: Hack to allow the default filter to work with
1926                                        //       old and new versions of the XNI document handler
1927                                        //       interface. -Ac
1928                                        Class   cls = fDocumentHandler.getClass();
1929                                        Class  [] types = {
1930                                            XMLLocator.class, String  .class, Augmentations.class
1931                                        };
1932                                        Method   method = cls.getMethod("startDocument", types);
1933                                        Object  [] params = {
1934                                            locator, encoding, augs
1935                                        };
1936                                        method.invoke(fDocumentHandler, params);
1937                                    }
1938                                    catch (IllegalAccessException   ex) {
1939                                        // NOTE: Should never reach here!
1940                                        throw new XNIException(ex);
1941                                    } 
1942                                    catch (InvocationTargetException   ex) {
1943                                        // NOTE: Should never reach here!
1944                                        throw new XNIException(ex);
1945                                    } 
1946                                    catch (NoSuchMethodException   ex) {
1947                                        // NOTE: Should never reach here!
1948                                        throw new XNIException(ex);
1949                                    }
1950                                }
1951                            }
1952                            if (fInsertDoctype && fDocumentHandler != null) {
1953                                String   root = HTMLElements.getElement(HTMLElements.HTML).name;
1954                                root = modifyName(root, fNamesElems);
1955                                String   pubid = fDoctypePubid;
1956                                String   sysid = fDoctypeSysid;
1957                                fDocumentHandler.doctypeDecl(root, pubid, sysid,
1958                                                             synthesizedAugs());
1959                            }
1960                            setScannerState(STATE_CONTENT);
1961                            break;
1962                        }
1963                        case STATE_END_DOCUMENT: {
1964                            if (fDocumentHandler != null && fElementCount >= fElementDepth) {
1965                                if (DEBUG_CALLBACKS) {
1966                                    System.out.println("endDocument()");
1967                                }
1968                                fEndLineNumber = fCurrentEntity.lineNumber;
1969                                fEndColumnNumber = fCurrentEntity.columnNumber;
1970                                fDocumentHandler.endDocument(locationAugs());
1971                            }
1972                            return false;
1973                        }
1974                        default: {
1975                            throw new RuntimeException  ("unknown scanner state: "+fScannerState);
1976                        }
1977                    }
1978                }
1979                catch (EOFException   e) {
1980                    if (fCurrentEntityStack.empty()) {
1981                        setScannerState(STATE_END_DOCUMENT);
1982                    }
1983                    else {
1984                        fCurrentEntity = (CurrentEntity)fCurrentEntityStack.pop();
1985                    }
1986                    next = true;
1987                }
1988            } while (next || complete);
1989            return true;
1990        } // scan(boolean):boolean
1991
1992        //
1993        // Protected methods
1994        //
1995
1996        /** Scans characters. */
1997        protected void scanCharacters() throws IOException   {
1998            if (DEBUG_BUFFER) {
1999                System.out.print("(scanCharacters: ");
2000                printBuffer();
2001                System.out.println();
2002            }
2003            int newlines = skipNewlines();
2004            if (newlines == 0 && fCurrentEntity.offset == fCurrentEntity.length) {
2005                if (DEBUG_BUFFER) {
2006                    System.out.print(")scanCharacters: ");
2007                    printBuffer();
2008                    System.out.println();
2009                }
2010                return;
2011            }
2012            char c;
2013            int offset = fCurrentEntity.offset - newlines;
2014            for (int i = offset; i < fCurrentEntity.offset; i++) {
2015                fCurrentEntity.buffer[i] = '\n';
2016            }
2017            while (fCurrentEntity.offset < fCurrentEntity.length) {
2018                c = fCurrentEntity.buffer[fCurrentEntity.offset];
2019                if (c == '<' || c == '&' || c == '\n' || c == '\r') {
2020                    break;
2021                }
2022                fCurrentEntity.offset++;
2023                fCurrentEntity.columnNumber++;
2024            }
2025            if (fCurrentEntity.offset > offset && 
2026                fDocumentHandler != null && fElementCount >= fElementDepth) {
2027                fString.setValues(fCurrentEntity.buffer, offset, fCurrentEntity.offset - offset);
2028                if (DEBUG_CALLBACKS) {
2029                    System.out.println("characters("+fString+")");
2030                }
2031                fEndLineNumber = fCurrentEntity.lineNumber;
2032                fEndColumnNumber = fCurrentEntity.columnNumber;
2033                fDocumentHandler.characters(fString, locationAugs());
2034            }
2035            if (DEBUG_BUFFER) {
2036                System.out.print(")scanCharacters: ");
2037                printBuffer();
2038                System.out.println();
2039            }
2040        } // scanCharacters()
2041
2042        /** Scans a CDATA section. */
2043        protected void scanCDATA() throws IOException   {
2044            if (DEBUG_BUFFER) {
2045                System.out.print("(scanCDATA: ");
2046                printBuffer();
2047                System.out.println();
2048            }
2049            fStringBuffer.clear();
2050            if (fCDATASections) {
2051                if (fDocumentHandler != null && fElementCount >= fElementDepth) {
2052                    fEndLineNumber = fCurrentEntity.lineNumber;
2053                    fEndColumnNumber = fCurrentEntity.columnNumber;
2054                    if (DEBUG_CALLBACKS) {
2055                        System.out.println("startCDATA()");
2056                    }
2057                    fDocumentHandler.startCDATA(locationAugs());
2058                }
2059            }
2060            else {
2061                fStringBuffer.append("[CDATA[");
2062            }
2063            boolean eof = scanMarkupContent(fStringBuffer, ']');
2064            if (!fCDATASections) {
2065                fStringBuffer.append("]]");
2066            }
2067            if (fDocumentHandler != null && fElementCount >= fElementDepth) {
2068                fEndLineNumber = fCurrentEntity.lineNumber;
2069                fEndColumnNumber = fCurrentEntity.columnNumber;
2070                if (fCDATASections) {
2071                    if (DEBUG_CALLBACKS) {
2072                        System.out.println("characters("+fStringBuffer+")");
2073                    }
2074                    fDocumentHandler.characters(fStringBuffer, locationAugs());
2075                    if (DEBUG_CALLBACKS) {
2076                        System.out.println("endCDATA()");
2077                    }
2078                    fDocumentHandler.endCDATA(locationAugs());
2079                }
2080                else {
2081                    if (DEBUG_CALLBACKS) {
2082                        System.out.println("comment("+fStringBuffer+")");
2083                    }
2084                    fDocumentHandler.comment(fStringBuffer, locationAugs());
2085                }
2086            }
2087            if (DEBUG_BUFFER) {
2088                System.out.print(")scanCDATA: ");
2089                printBuffer();
2090                System.out.println();
2091            }
2092            if (eof) {
2093                throw new EOFException  ();
2094            }
2095        } // scanCDATA()
2096        
2097        /** Scans a comment. */
2098        protected void scanComment() throws IOException   {
2099            if (DEBUG_BUFFER) {
2100                System.out.print("(scanComment: ");
2101                printBuffer();
2102                System.out.println();
2103            }
2104            fStringBuffer.clear();
2105            boolean eof = scanMarkupContent(fStringBuffer, '-');
2106            if (fDocumentHandler != null && fElementCount >= fElementDepth) {
2107                if (DEBUG_CALLBACKS) {
2108                    System.out.println("comment("+fStringBuffer+")");
2109                }
2110                fEndLineNumber = fCurrentEntity.lineNumber;
2111                fEndColumnNumber = fCurrentEntity.columnNumber;
2112                fDocumentHandler.comment(fStringBuffer, locationAugs());
2113            }
2114            if (DEBUG_BUFFER) {
2115                System.out.print(")scanComment: ");
2116                printBuffer();
2117                System.out.println();
2118            }
2119            if (eof) {
2120                throw new EOFException  ();
2121            }
2122        } // scanComment()
2123
2124        /** Scans markup content. */
2125        protected boolean scanMarkupContent(XMLStringBuffer buffer, 
2126                                            char cend) throws IOException   {
2127            int c = -1;
2128            OUTER: while (true) {
2129                c = read();
2130                if (c == cend) {
2131                    int count = 1;
2132                    while (true) {
2133                        c = read();
2134                        if (c == cend) {
2135                            count++;
2136                            continue;
2137                        }
2138                        break;
2139                    }
2140                    if (c == -1) {
2141                        if (fReportErrors) {
2142                            fErrorReporter.reportError("HTML1007", null);
2143                        }
2144                        break OUTER;
2145                    }
2146                    if (count < 2) {
2147                        buffer.append(cend);
2148                        //if (c != -1) {
2149                        fCurrentEntity.offset--;
2150                        fCurrentEntity.columnNumber--;
2151                        //}
2152                        continue;
2153                    }
2154                    if (c != '>') {
2155                        for (int i = 0; i < count; i++) {
2156                            buffer.append(cend);
2157                        }
2158                        fCurrentEntity.offset--;
2159                        fCurrentEntity.columnNumber--;
2160                        continue;
2161                    }
2162                    for (int i = 0; i < count - 2; i++) {
2163                        buffer.append(cend);
2164                    }
2165                    break;
2166                }
2167                else if (c == '\n' || c == '\r') {
2168                    fCurrentEntity.offset--;
2169                    fCurrentEntity.columnNumber--;
2170                    int newlines = skipNewlines();
2171                    for (int i = 0; i < newlines; i++) {
2172                        buffer.append('\n');
2173                    }
2174                    continue;
2175                }
2176                else if (c == -1) {
2177                    if (fReportErrors) {
2178                        fErrorReporter.reportError("HTML1007", null);
2179                    }
2180                    break;
2181                }
2182                buffer.append((char)c);
2183            }
2184            return c == -1;
2185        } // scanMarkupContent(XMLStringBuffer,char):boolean
2186
2187        /** Scans a processing instruction. */
2188        protected void scanPI() throws IOException   {
2189            if (DEBUG_BUFFER) {
2190                System.out.print("(scanPI: ");
2191                printBuffer();
2192                System.out.println();
2193            }
2194            if (fReportErrors) {
2195                fErrorReporter.reportWarning("HTML1008", null);
2196            }
2197
2198            // scan processing instruction
2199            String   target = scanName();
2200            if (target != null && !target.equalsIgnoreCase("xml")) {
2201                while (true) {
2202                    int c = read();
2203                    if (c == '\r' || c == '\n') {
2204                        fCurrentEntity.lineNumber++;
2205                        fCurrentEntity.columnNumber = 1;
2206                        if (c == '\r') {
2207                            c = read();
2208                            if (c != '\n') {
2209                                fCurrentEntity.offset--;
2210                            }
2211                        }
2212                        continue;
2213                    }
2214                    if (c == -1) {
2215                        break;
2216                    }
2217                    if (c != ' ' && c != '\t') {
2218                        fCurrentEntity.offset--;
2219                        fCurrentEntity.columnNumber--;
2220                        break;
2221                    }
2222                }
2223                fStringBuffer.clear();
2224                while (true) {
2225                    int c = read();
2226                    if (c == '?' || c == '/') {
2227                        char c0 = (char)c;
2228                        c = read();
2229                        if (c == '>') {
2230                            break;
2231                        }
2232                        else {
2233                            fStringBuffer.append(c0);
2234                            fCurrentEntity.offset--;
2235                            fCurrentEntity.columnNumber--;
2236                            continue;
2237                        }
2238                    }
2239                    else if (c == '\r' || c == '\n') {
2240                        fStringBuffer.append('\n');
2241                        fCurrentEntity.lineNumber++;
2242                        fCurrentEntity.columnNumber = 1;
2243                        if (c == '\r') {
2244                            c = read();
2245                            if (c != '\n') {
2246                                fCurrentEntity.offset--;
2247                            }
2248                        }
2249                        continue;
2250                    }
2251                    else if (c == -1) {
2252                        break;
2253                    }
2254                    else {
2255                        fStringBuffer.append((char)c);
2256                    }
2257                }
2258                XMLString data = fStringBuffer;
2259                if (fDocumentHandler != null) {
2260                    fEndLineNumber = fCurrentEntity.lineNumber;
2261                    fEndColumnNumber = fCurrentEntity.columnNumber;
2262                    fDocumentHandler.processingInstruction(target, data, locationAugs());
2263                }
2264            }
2265
2266            // scan xml/text declaration
2267            else {
2268                int beginLineNumber = fBeginLineNumber;
2269                int beginColumnNumber = fBeginColumnNumber;
2270                fAttributes.removeAllAttributes();
2271                int aindex = 0;
2272                while (scanPseudoAttribute(fAttributes)) {
2273                    fAttributes.getName(aindex,fQName);
2274                    fQName.rawname = fQName.rawname.toLowerCase();
2275                    fAttributes.setName(aindex,fQName);
2276                    aindex++;
2277                }
2278                if (fDocumentHandler != null) {
2279                    String   version = fAttributes.getValue("version");
2280                    String   encoding = fAttributes.getValue("encoding");
2281                    String   standalone = fAttributes.getValue("standalone");
2282
2283                    fBeginLineNumber = beginLineNumber;
2284                    fBeginColumnNumber = beginColumnNumber;
2285                    fEndLineNumber = fCurrentEntity.lineNumber;
2286                    fEndColumnNumber = fCurrentEntity.columnNumber;
2287                    fDocumentHandler.xmlDecl(version, encoding, standalone,
2288                                             locationAugs());
2289                }
2290            }
2291
2292            if (DEBUG_BUFFER) {
2293                System.out.print(")scanPI: ");
2294                printBuffer();
2295                System.out.println();
2296            }
2297        } // scanPI()
2298
2299        /**
2300         * Scans a start element: its name, its attributes and, while the byte
2301         * stream is still buffered, a charset declared by a &lt;meta&gt; element.
2302         *
2303         * @param empty Is used for a second return value to indicate whether
2304         *              the start element tag is empty (e.g. "/&gt;").
2305         * @return The modified element name, or null if no valid name was found.
2306         */
2307        protected String scanStartElement(boolean[] empty) throws IOException {
2308            String ename = scanName();
2309            int length = ename != null ? ename.length() : 0;
2310            int c = length > 0 ? ename.charAt(0) : -1;
2311            if (length == 0 || !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
2312                if (fReportErrors) {
2313                    fErrorReporter.reportError("HTML1009", null);
2314                }
2315                if (fDocumentHandler != null && fElementCount >= fElementDepth) {
2316                    fStringBuffer.clear(); // report the consumed text as characters
2317                    fStringBuffer.append('<');
2318                    if (length > 0) {
2319                        fStringBuffer.append(ename);
2320                    }
2321                    fDocumentHandler.characters(fStringBuffer, null);
2322                }
2323                return null;
2324            }
2325            ename = modifyName(ename, fNamesElems);
2326            fAttributes.removeAllAttributes();
2327            int beginLineNumber = fBeginLineNumber; // scanAttribute overwrites these
2328            int beginColumnNumber = fBeginColumnNumber;
2329            while (scanAttribute(fAttributes, empty)) {
2330                // do nothing
2331            }
2332            fBeginLineNumber = beginLineNumber;
2333            fBeginColumnNumber = beginColumnNumber;
2334            if (fByteStream != null && fElementDepth == -1) {
2335                if (ename.equalsIgnoreCase("META")) {
2336                    if (DEBUG_CHARSET) {
2337                        System.out.println("+++ <META>");
2338                    }
2339                    String httpEquiv = getValue(fAttributes, "http-equiv");
2340                    if (httpEquiv != null && httpEquiv.equalsIgnoreCase("content-type")) {
2341                        if (DEBUG_CHARSET) {
2342                            System.out.println("+++ @content-type: \""+httpEquiv+'"');
2343                        }
2344                        String content = getValue(fAttributes, "content");
2345                        // NOTE: fixed locale, so "iso-8859-1" survives e.g. a Turkish default
2346                        int index1 = content != null ? content.toLowerCase(java.util.Locale.ENGLISH).indexOf("charset=") : -1;
2347                        if (index1 != -1 && !fIgnoreSpecifiedCharset) {
2348                            int index2 = content.indexOf(';', index1);
2349                            // trim: tolerate blanks around the name, e.g. "charset= utf-8 "
2350                            String charset = (index2 != -1 ? content.substring(index1+8, index2) : content.substring(index1+8)).trim();
2351                            try {
2352                                String upperCharset = charset.toUpperCase(java.util.Locale.ENGLISH);
2353                                String javaEncoding = EncodingMap.getIANA2JavaMapping(upperCharset);
2354                                if (DEBUG_CHARSET) {
2355                                    System.out.println("+++ ianaEncoding: "+charset);
2356                                    System.out.println("+++ javaEncoding: "+javaEncoding);
2357                                }
2358                                if (javaEncoding == null) {
2359                                    javaEncoding = charset;
2360                                    if (fReportErrors) {
2361                                        fErrorReporter.reportError("HTML1001", new Object[]{charset});
2362                                    }
2363                                }
2364                                fIso8859Encoding = upperCharset.startsWith("ISO-8859")
2365                                                || charset.equalsIgnoreCase(fDefaultIANAEncoding);
2366                                fCurrentEntity.stream = new InputStreamReader(fByteStream, javaEncoding);
2367                                fByteStream.playback();
2368                                fElementDepth = fElementCount;
2369                                fElementCount = 0;
2370                                fCurrentEntity.offset = fCurrentEntity.length = 0;
2371                                fCurrentEntity.lineNumber = 1;
2372                                fCurrentEntity.columnNumber = 1;
2373                            }
2374                            catch (UnsupportedEncodingException e) {
2375                                if (fReportErrors) {
2376                                    fErrorReporter.reportError("HTML1010", new Object[]{charset});
2377                                }
2378                                // NOTE: If the encoding change doesn't work, then there's
2379                                //       no point in continuing to buffer the input stream.
2380                                fByteStream.clear();
2381                            }
2382                        }
2383                    }
2384                }
2385                else if (ename.equalsIgnoreCase("BODY")) {
2386                    fByteStream.clear();
2387                }
2388                else {
2389                    HTMLElements.Element element = HTMLElements.getElement(ename);
2390                    if (element.parent != null && element.parent.length > 0
2391                            && element.parent[0].code == HTMLElements.BODY) {
2392                        fByteStream.clear();
2393                    }
2394                }
2395            }
2396            if (fDocumentHandler != null && fElementCount >= fElementDepth) {
2397                fQName.setValues(null, ename, ename, null);
2398                if (DEBUG_CALLBACKS) {
2399                    System.out.println("startElement("+fQName+','+fAttributes+")");
2400                }
2401                fEndLineNumber = fCurrentEntity.lineNumber;
2402                fEndColumnNumber = fCurrentEntity.columnNumber;
2403                if (empty[0]) {
2404                    fDocumentHandler.emptyElement(fQName, fAttributes, locationAugs());
2405                }
2406                else {
2407                    fDocumentHandler.startElement(fQName, fAttributes, locationAugs());
2408                }
2409            }
2410            return ename;
2411        } // scanStartElement():ename
2412
2413        /** 
2414         * Scans a real attribute. 
2415         *
2416         * @param attributes The list of attributes.
2417         * @param empty      Is used for a second return value to indicate 
2418         *                   whether the start element tag is empty 
2419         *                   (e.g. "/&gt;").
2420         */
2421        protected boolean scanAttribute(XMLAttributesImpl attributes,
2422                                        boolean[] empty)
2423            throws IOException   {
2424            return scanAttribute(attributes,empty,'/');
2425        } // scanAttribute(XMLAttributesImpl,boolean[]):boolean
2426
2427        /** 
2428         * Scans a pseudo attribute. 
2429         *
2430         * @param attributes The list of attributes.
2431         */
2432        protected boolean scanPseudoAttribute(XMLAttributesImpl attributes)
2433            throws IOException   {
2434            return scanAttribute(attributes,fSingleBoolean,'?');
2435        } // scanPseudoAttribute(XMLAttributesImpl):boolean
2436
2437        /** 
2438         * Scans an attribute, pseudo or real. 
2439         *
2440         * @param attributes The list of attributes.
2441         * @param empty      Is used for a second return value to indicate 
2442         *                   whether the start element tag is empty 
2443         *                   (e.g. "/&gt;").
2444         * @param endc       The end character that appears before the
2445         *                   closing angle bracket ('>').
2446         */
2447        protected boolean scanAttribute(XMLAttributesImpl attributes,
2448                                        boolean[] empty, char endc)
2449            throws IOException   {
2450            boolean skippedSpaces = skipSpaces();
2451            fBeginLineNumber = fCurrentEntity.lineNumber;
2452            fBeginColumnNumber = fCurrentEntity.columnNumber;
2453            int c = read();
2454            if (c == -1) {
2455                if (fReportErrors) {
2456                    fErrorReporter.reportError("HTML1007", null);
2457                }
2458                throw new EOFException  ();
2459            }
2460            if (c == '>') {
2461                return false;
2462            }
2463            fCurrentEntity.offset--;
2464            fCurrentEntity.columnNumber--;
2465            String   aname = scanName();
2466            if (aname == null) {
2467                if (fReportErrors) {
2468                    fErrorReporter.reportError("HTML1011", null);
2469                }
2470                empty[0] = skipMarkup(false);
2471                return false;
2472            }
2473            if (!skippedSpaces && fReportErrors) {
2474                fErrorReporter.reportError("HTML1013", new Object  [] { aname });
2475            }
2476            aname = modifyName(aname, fNamesAttrs);
2477            skipSpaces();
2478            c = read();
2479            if (c == -1) {
2480                if (fReportErrors) {
2481                    fErrorReporter.reportError("HTML1007", null);
2482                }
2483                throw new EOFException  ();
2484            }
2485            if (c == '/' || c == '>') {
2486                fQName.setValues(null, aname, aname, null);
2487                attributes.addAttribute(fQName, "CDATA", "");
2488                attributes.setSpecified(attributes.getLength()-1, true);
2489                if (fAugmentations) {
2490                    addLocationItem(attributes, attributes.getLength() - 1);
2491                }
2492                if (c == '/') {
2493                    fCurrentEntity.offset--;
2494                    fCurrentEntity.columnNumber--;
2495                    empty[0] = skipMarkup(false);
2496                }
2497                return false;
2498            }
2499            /***
2500            // REVISIT: [Q] Why is this still here? -Ac
2501            if (c == '/' || c == '>') {
2502                if (c == '/') {
2503                    fCurrentEntity.offset--;
2504                    fCurrentEntity.columnNumber--;
2505                    empty[0] = skipMarkup(false);
2506                }
2507                fQName.setValues(null, aname, aname, null);
2508                attributes.addAttribute(fQName, "CDATA", "");
2509                attributes.setSpecified(attributes.getLength()-1, true);
2510                if (fAugmentations) {
2511                    addLocationItem(attributes, attributes.getLength() - 1);
2512                }
2513                return false;
2514            }
2515            /***/
2516            if (c == '=') {
2517                skipSpaces();
2518                c = read();
2519                if (c == -1) {
2520                    if (fReportErrors) {
2521                        fErrorReporter.reportError("HTML1007", null);
2522                    }
2523                    throw new EOFException  ();
2524                }
2525                // Xiaowei/Ac: Fix for <a HREF=/cgi-bin/myscript>...</a>
2526                if (c == '>') {
2527                    fQName.setValues(null, aname, aname, null);
2528                    attributes.addAttribute(fQName, "CDATA", "");
2529                    attributes.setSpecified(attributes.getLength()-1, true);
2530                    if (fAugmentations) {
2531                        addLocationItem(attributes, attributes.getLength() - 1);
2532                    }
2533                    return false;
2534                }
2535                fStringBuffer.clear();
2536                fNonNormAttr.clear();
2537                if (c != '\'' && c != '"') {
2538                    fCurrentEntity.offset--;
2539                    fCurrentEntity.columnNumber--;
2540                    while (true) {
2541                        c = read();
2542                        // Xiaowei/Ac: Fix for <a HREF=/broken/>...</a>
2543                        if (Character.isSpace((char)c) || c == '>') {
2544                            //fCharOffset--;
2545                            fCurrentEntity.offset--;
2546                            fCurrentEntity.columnNumber--;
2547                            break;
2548                        }
2549                        if (c == -1) {
2550                            if (fReportErrors) {
2551                                fErrorReporter.reportError("HTML1007", null);
2552                            }
2553                            throw new EOFException  ();
2554                        }
2555                        if (c == '&') {
2556                            int ce = scanEntityRef(fStringBuffer2, false);
2557                            if (ce != -1) {
2558                                fStringBuffer.append((char)ce);
2559                            }
2560                            else {
2561                                fStringBuffer.append(fStringBuffer2);
2562                            }
2563                            fNonNormAttr.append(fStringBuffer2);
2564                        }
2565                        else {
2566                            fStringBuffer.append((char)c);
2567                            fNonNormAttr.append((char)c);
2568                        }
2569                    }
2570                    fQName.setValues(null, aname, aname, null);
2571                    String   avalue = fStringBuffer.toString();
2572                    attributes.addAttribute(fQName, "CDATA", avalue);
2573
2574                    int lastattr = attributes.getLength()-1;
2575                    attributes.setSpecified(lastattr, true);
2576                    attributes.setNonNormalizedValue(lastattr, fNonNormAttr.toString());
2577                    if (fAugmentations) {
2578                        addLocationItem(attributes, attributes.getLength() - 1);
2579                    }
2580                    return true;
2581                }
2582                char quote = (char)c;
2583                do {
2584                    c = read();
2585                    if (c == -1) {
2586                        if (fReportErrors) {
2587                            fErrorReporter.reportError("HTML1007", null);
2588                        }
2589                        throw new EOFException  ();
2590                    }
2591                    if (c == '&') {
2592                        int ce = scanEntityRef(fStringBuffer2, false);
2593                        if (ce != -1) {
2594                            fStringBuffer.append((char)ce);
2595                        }
2596                        else {
2597                            fStringBuffer.append(fStringBuffer2);
2598                        }
2599                        fNonNormAttr.append(fStringBuffer2);
2600                    }
2601                    else if (c == '\t') {
2602                        fStringBuffer.append(' ');
2603                        fNonNormAttr.append('\t');
2604                    }
2605                    else if (c == '\r' || c == '\n') {
2606                        fCurrentEntity.lineNumber++;
2607                        fCurrentEntity.columnNumber = 0;
2608                        if (c == '\r') {
2609                            int c2 = read();
2610                            if (c2 != '\n') {
2611                                fCurrentEntity.offset--;
2612                                fCurrentEntity.columnNumber--;
2613                            }
2614                            else {
2615                                fNonNormAttr.append('\r');
2616                                c = c2;
2617                            }
2618                        }
2619                        fStringBuffer.append(' ');
2620                        fNonNormAttr.append((char)c);
2621                    }
2622                    else if (c != quote) {
2623                        fStringBuffer.append((char)c);
2624                        fNonNormAttr.append((char)c);
2625                    }
2626                } while (c != quote);
2627                fQName.setValues(null, aname, aname, null);
2628                String   avalue = fStringBuffer.toString();
2629                attributes.addAttribute(fQName, "CDATA", avalue);
2630
2631                int lastattr = attributes.getLength()-1;
2632                attributes.setSpecified(lastattr, true);
2633                attributes.setNonNormalizedValue(lastattr, fNonNormAttr.toString());
2634                if (fAugmentations) {
2635                    addLocationItem(attributes, attributes.getLength() - 1);
2636                }
2637            }
2638            else {
2639                fQName.setValues(null, aname, aname, null);
2640                attributes.addAttribute(fQName, "CDATA", "");
2641                attributes.setSpecified(attributes.getLength()-1, true);
2642                fCurrentEntity.offset--;
2643                fCurrentEntity.columnNumber--;
2644                if (fAugmentations) {
2645                    addLocationItem(attributes, attributes.getLength() - 1);
2646                }
2647            }
2648            return true;
2649        } // scanAttribute(XMLAttributesImpl):boolean
2650
2651        /** Adds location augmentations to the specified attribute. */
2652        protected void addLocationItem(XMLAttributes attributes, int index) {
2653            fEndLineNumber = fCurrentEntity.lineNumber;
2654            fEndColumnNumber = fCurrentEntity.columnNumber;
2655            LocationItem locationItem = new LocationItem();
2656            locationItem.setValues(fBeginLineNumber, fBeginColumnNumber,
2657                                   fEndLineNumber, fEndColumnNumber);
2658            Augmentations augs = attributes.getAugmentations(index);
2659            augs.putItem(AUGMENTATIONS, locationItem);
2660        } // addLocationItem(XMLAttributes,int)
2661
2662        /** Scans an end element. */
2663        protected void scanEndElement() throws IOException   {
2664            String   ename = scanName();
2665            if (fReportErrors && ename == null) {
2666                fErrorReporter.reportError("HTML1012", null);
2667            }
2668            skipMarkup(false);
2669            if (ename != null) {
2670                ename = modifyName(ename, fNamesElems);
2671                if (fDocumentHandler != null && fElementCount >= fElementDepth) {
2672                    fQName.setValues(null, ename, ename, null);
2673                    if (DEBUG_CALLBACKS) {
2674                        System.out.println("endElement("+fQName+")");
2675                    }
2676                    fEndLineNumber = fCurrentEntity.lineNumber;
2677                    fEndColumnNumber = fCurrentEntity.columnNumber;
2678                    fDocumentHandler.endElement(fQName, locationAugs());
2679                }
2680            }
2681        } // scanEndElement()
2682
2683    } // class ContentScanner
2684
2685    /**
2686     * Special scanner used for elements whose content needs to be scanned 
2687     * as plain text, ignoring markup such as elements and entity references.
2688     * For example: &lt;SCRIPT&gt; and &lt;COMMENT&gt;.
2689     *
2690     * @author Andy Clark
2691     */
2692    public class SpecialScanner
2693        implements Scanner {
2694
2695        //
2696        // Data
2697        //
2698
2699        /** Name of element whose content needs to be scanned as text. */
2700        protected String   fElementName;
2701
2702        /** True if &lt;script&gt; element. */
2703        protected boolean fScript;
2704
2705        /** True if &lt;style&gt; element. */
2706        protected boolean fStyle;
2707
2708        /** True if &lt;textarea&gt; element. */
2709        protected boolean fTextarea;
2710
2711        // temp vars
2712
2713        /** A qualified name. */
2714        private final QName fQName = new QName();
2715
2716        /** A string buffer. */
2717        private final XMLStringBuffer fStringBuffer = new XMLStringBuffer();
2718
2719        //
2720        // Public methods
2721        //
2722
2723        /** Sets the element name. */
2724        public Scanner setElementName(String   ename) {
2725            fElementName = ename;
2726            fScript = fElementName.equalsIgnoreCase("SCRIPT");
2727            fStyle = fElementName.equalsIgnoreCase("STYLE");
2728            fTextarea = fElementName.equalsIgnoreCase("TEXTAREA");
2729            return this;
2730        } // setElementName(String):Scanner
2731
2732        //
2733        // Scanner methods
2734        //
2735
2736        /** Scan. */
2737        public boolean scan(boolean complete) throws IOException   {
2738            boolean next;
2739            do {
2740                try {
2741                    next = false;
2742                    int delimiter = -1;
2743                    switch (fScannerState) {
2744                        case STATE_CONTENT: {
2745                            fBeginLineNumber = fCurrentEntity.lineNumber;
2746                            fBeginColumnNumber = fCurrentEntity.columnNumber;
2747                            int c = read();
2748                            if (c == '<') {
2749                                setScannerState(STATE_MARKUP_BRACKET);
2750                                continue;
2751                            }
2752                            if (c == '&') {
2753                                if (fTextarea) {
2754                                    scanEntityRef(fStringBuffer, true);
2755                                    continue;
2756                                }
2757                                fStringBuffer.clear();
2758                                fStringBuffer.append('&');
2759                            }
2760                            else if (c == -1) {
2761                                if (fReportErrors) {
2762                                    fErrorReporter.reportError("HTML1007", null);
2763                                }
2764                                throw new EOFException  ();
2765                            }
2766                            else {
2767                                fCurrentEntity.offset--;
2768                                fCurrentEntity.columnNumber--;
2769                                fStringBuffer.clear();
2770                            }
2771                            scanCharacters(fStringBuffer, -1);
2772                            break;
2773                        } // case STATE_CONTENT
2774                        case STATE_MARKUP_BRACKET: {
2775                            int c = read();
2776                            if (c == '!') {
2777                                if (skip("--", false)) {
2778                                    fStringBuffer.clear();
2779                                    boolean strip = (fScript && fScriptStripCommentDelims) ||
2780                                                    (fStyle && fStyleStripCommentDelims);
2781                                    if (strip) {
2782                                        do {
2783                                            c = read();
2784                                            if (c == '\r' || c == '\n') {
2785                                                fCurrentEntity.columnNumber--;
2786                                                fCurrentEntity.offset--;
2787                                                break;
2788                                            }
2789                                        } while (c != -1);
2790                                        skipNewlines(1);
2791                                        delimiter = '-';
2792                                    }
2793                                    else {
2794                                        fStringBuffer.append("<!--");
2795                                    }
2796                                }
2797                                else if (skip("[CDATA[", false)) {
2798                                    fStringBuffer.clear();
2799                                    boolean strip = (fScript && fScriptStripCDATADelims) ||
2800                                                    (fStyle  && fStyleStripCDATADelims);
2801                                    if (strip) {
2802                                        do {
2803                                            c = read();
2804                                            if (c == '\r' || c == '\n') {
2805                                                fCurrentEntity.columnNumber--;
2806                                                fCurrentEntity.offset--;
2807                                                break;
2808                                            }
2809                                        } while (c != -1);
2810                                        skipNewlines(1);
2811                                        delimiter = ']';
2812                                    }
2813                                    else {
2814                                        fStringBuffer.append("<![CDATA[");
2815                                    }
2816                                }
2817                            }
2818                            else if (c == '/') {
2819                                String   ename = scanName();
2820                                if (ename != null) {
2821                                    if (ename.equalsIgnoreCase(fElementName)) {
2822                                        if (read() == '>') {
2823                                            ename = modifyName(ename, fNamesElems);
2824                                            if (fDocumentHandler != null && fElementCount >= fElementDepth) {
2825                                                fQName.setValues(null, ename, ename, null);
2826                                                if (DEBUG_CALLBACKS) {
2827                                                    System.out.println("endElement("+fQName+")");
2828                                                }
2829                                                fEndLineNumber = fCurrentEntity.lineNumber;
2830                                                fEndColumnNumber = fCurrentEntity.columnNumber;
2831                                                fDocumentHandler.endElement(fQName, locationAugs());
2832                                            }
2833                                            setScanner(fContentScanner);
2834                                            setScannerState(STATE_CONTENT);
2835                                            return true;
2836                                        }
2837                                        else {
2838                                            fCurrentEntity.offset--;
2839                                            fCurrentEntity.columnNumber--;
2840                                        }
2841                                    }
2842                                    fStringBuffer.clear();
2843                                    fStringBuffer.append("</");
2844                                    fStringBuffer.append(ename);
2845                                }
2846                                else {
2847                                    fStringBuffer.clear();
2848                                    fStringBuffer.append("</");
2849                                }
2850                            }
2851                            else {
2852                                fStringBuffer.clear();
2853                                fStringBuffer.append('<');
2854                                fStringBuffer.append((char)c);
2855                            }
2856                            scanCharacters(fStringBuffer, delimiter);
2857                            setScannerState(STATE_CONTENT);
2858                            break;
2859                        } // case STATE_MARKUP_BRACKET
2860                    } // switch
2861                } // try
2862                catch (EOFException   e) {
2863                    setScanner(fContentScanner);
2864                    if (fCurrentEntityStack.empty()) {
2865                        setScannerState(STATE_END_DOCUMENT);
2866                    }
2867                    else {
2868                        fCurrentEntity = (CurrentEntity)fCurrentEntityStack.pop();
2869                        setScannerState(STATE_CONTENT);
2870                    }
2871                    return true;
2872                }
2873            } // do
2874            while (next || complete);
2875            return true;
2876        } // scan(boolean):boolean
2877
2878        //
2879        // Protected methods
2880        //
2881
2882        /** Scan characters. */
2883        protected void scanCharacters(XMLStringBuffer buffer,
2884                                      int delimiter) throws IOException   {
2885            if (DEBUG_BUFFER) {
2886                System.out.print("(scanCharacters, delimiter="+delimiter+": ");
2887                printBuffer();
2888                System.out.println();
2889            }
2890            boolean strip = (fScript && fScriptStripCommentDelims) ||
2891                            (fScript && fScriptStripCDATADelims) ||
2892                            (fStyle  && fStyleStripCommentDelims) ||
2893                            (fStyle  && fStyleStripCDATADelims);
2894            while (true) {
2895                int c = read();
2896                if (c == -1 || (delimiter == -1 && (c == '<' || c == '&'))) {
2897                    if (c != -1) {
2898                        fCurrentEntity.offset--;
2899                        fCurrentEntity.columnNumber--;
2900                    }
2901                    break;
2902                }
2903                // Patch supplied by Jonathan Baxter
2904                else if (c == '\r' || c == '\n') {
2905                    fCurrentEntity.offset--;
2906                    fCurrentEntity.columnNumber--;
2907                    int newlines = skipNewlines();
2908                    for (int i = 0; i < newlines; i++) {
2909                        buffer.append('\n');
2910                    }
2911                }
2912                else if (delimiter != -1 && c == (char)delimiter) {
2913                    int count = 0;
2914                    do {
2915                        count++;
2916                        c = read();
2917                    } while (c == (char)delimiter);
2918                    for (int i = strip && c == '>' ? 2 : 0; i < count; i++) {
2919                        buffer.append((char)delimiter);
2920                    }
2921                    if (c == -1 || (count >= 2 && c == '>')) {
2922                        if (!strip) {
2923                            buffer.append((char)c);
2924                        }
2925                        break;
2926                    }
2927                    fCurrentEntity.offset--;
2928                    fCurrentEntity.columnNumber--;
2929                }
2930                else {
2931                    buffer.append((char)c);
2932                    if (c == '\n') {
2933                        fCurrentEntity.columnNumber = 1;
2934                        fCurrentEntity.lineNumber++;
2935                    }
2936                }
2937            }
2938            if (buffer.length > 0 && fDocumentHandler != null && fElementCount >= fElementDepth) {
2939                if (DEBUG_CALLBACKS) {
2940                    System.out.println("characters("+buffer+")");
2941                }
2942                fEndLineNumber = fCurrentEntity.lineNumber;
2943                fEndColumnNumber = fCurrentEntity.columnNumber;
2944                fDocumentHandler.characters(buffer, locationAugs());
2945            }
2946            if (DEBUG_BUFFER) {
2947                System.out.print(")scanCharacters: ");
2948                printBuffer();
2949                System.out.println();
2950            }
2951        } // scanCharacters(StringBuffer)
2952
2953    } // class SpecialScanner
2954
2955    /**
2956     * A playback input stream. This class has the ability to save the bytes
2957     * read from the underlying input stream and play the bytes back later.
2958     * This class is used by the HTML scanner to switch encodings when a 
2959     * &lt;meta&gt; tag is detected that specifies a different encoding. 
2960     * <p>
2961     * If the encoding is changed, then the scanner calls the 
2962     * <code>playback</code> method and re-scans the beginning of the HTML
2963     * document again. This should not be too much of a performance problem
2964     * because the &lt;meta&gt; tag appears at the beginning of the document.
2965     * <p>
2966     * If the &lt;body&gt; tag is reached without playing back the bytes,
2967     * then the buffer can be cleared by calling the <code>clear</code>
2968     * method. This stops the buffering of bytes and allows the memory used
2969     * by the buffer to be reclaimed. 
2970     * <p>
2971     * <strong>Note:</strong> 
2972     * If the buffer is never played back or cleared, this input stream
2973     * will continue to buffer the entire stream. Therefore, it is very
2974     * important to use this stream correctly.
2975     *
2976     * @author Andy Clark
2977     */
2978    public static class PlaybackInputStream
2979        extends FilterInputStream   {
2980
2981        //
2982        // Constants
2983        //
2984
2985        /** Set to true to debug playback. */
2986        private static final boolean DEBUG_PLAYBACK = false;
2987
2988        //
2989        // Data
2990        //
2991
2992        // state
2993
2994        /** Playback mode. */
2995        protected boolean fPlayback = false;
2996
2997        /** Buffer cleared. */
2998        protected boolean fCleared = false;
2999
3000        /** Encoding detected. */
3001        protected boolean fDetected = false;
3002
3003        // buffer info
3004
3005        /** Byte buffer. */
3006        protected byte[] fByteBuffer = new byte[1024];
3007
3008        /** Offset into byte buffer during playback. */
3009        protected int fByteOffset = 0;
3010
3011        /** Length of bytes read into byte buffer. */
3012        protected int fByteLength = 0;
3013
3014        /** Pushback offset. */
3015        public int fPushbackOffset = 0;
3016
3017        /** Pushback length. */
3018        public int fPushbackLength = 0;
3019
3020        //
3021        // Constructors
3022        //
3023
3024        /** Constructor. */
3025        public PlaybackInputStream(InputStream   in) {
3026            super(in);
3027        } // <init>(InputStream)
3028
3029        //
3030        // Public methods
3031        //
3032
3033        /** Detect encoding. */
3034        public void detectEncoding(String  [] encodings) throws IOException   {
3035            if (fDetected) {
3036                throw new IOException  ("Should not detect encoding twice.");
3037            }
3038            fDetected = true;
3039            int b1 = read();
3040            if (b1 == -1) {
3041                return;
3042            }
3043            int b2 = read();
3044            if (b2 == -1) {
3045                fPushbackLength = 1;
3046                return;
3047            }
3048            // UTF-8 BOM: 0xEFBBBF
3049            if (b1 == 0xEF && b2 == 0xBB) {
3050                int b3 = read();
3051                if (b3 == 0xBF) {
3052                    fPushbackOffset = 3;
3053                    encodings[0] = "UTF-8";
3054                    encodings[1] = "UTF8";
3055                    return;
3056                }
3057                fPushbackLength = 3;
3058            }
3059            // UTF-16 LE BOM: 0xFFFE
3060            if (b1 == 0xFF && b2 == 0xFE) {
3061                encodings[0] = "UTF-16";
3062                encodings[1] = "UnicodeLittleUnmarked";
3063                return;
3064            }
3065            // UTF-16 BE BOM: 0xFEFF
3066            else if (b1 == 0xFE && b2 == 0xFF) {
3067                encodings[0] = "UTF-16";
3068                encodings[1] = "UnicodeBigUnmarked";
3069                return;
3070            }
3071            // unknown
3072            fPushbackLength = 2;
3073        } // detectEncoding()
3074
3075        /** Playback buffer contents. */
3076        public void playback() {
3077            fPlayback = true;
3078        } // playback()
3079
3080        /** 
3081         * Clears the buffer.
3082         * <p>
3083         * <strong>Note:</strong>
3084         * The buffer cannot be cleared during playback. Therefore, calling
3085         * this method during playback will not do anything. However, the
3086         * buffer will be cleared automatically at the end of playback.
3087         */
3088        public void clear() {
3089            if (!fPlayback) {
3090                fCleared = true;
3091                fByteBuffer = null;
3092            }
3093        } // clear()
3094
3095        //
3096        // InputStream methods
3097        //
3098
3099        /** Read a byte. */
3100        public int read() throws IOException   {
3101            if (DEBUG_PLAYBACK) {
3102                System.out.println("(read");
3103            }
3104            if (fPushbackOffset < fPushbackLength) {
3105                return fByteBuffer[fPushbackOffset++];
3106            }
3107            if (fCleared) {
3108                return in.read();
3109            }
3110            if (fPlayback) {
3111                int c = fByteBuffer[fByteOffset++];
3112                if (fByteOffset == fByteLength) {
3113                    fCleared = true;
3114                    fByteBuffer = null;
3115                }
3116                if (DEBUG_PLAYBACK) {
3117                    System.out.println(")read -> "+(char)c);
3118                }
3119                return c;
3120            }
3121            int c = in.read();
3122            if (c != -1) {
3123                if (fByteLength == fByteBuffer.length) {
3124                    byte[] newarray = new byte[fByteLength + 1024];
3125                    System.arraycopy(fByteBuffer, 0, newarray, 0, fByteLength);
3126                    fByteBuffer = newarray;
3127                }
3128                fByteBuffer[fByteLength++] = (byte)c;
3129            }
3130            if (DEBUG_PLAYBACK) {
3131                System.out.println(")read -> "+(char)c);
3132            }
3133            return c;
3134        } // read():int
3135
3136        /** Read an array of bytes. */
3137        public int read(byte[] array) throws IOException   {
3138            return read(array, 0, array.length);
3139        } // read(byte[]):int
3140
3141        /** Read an array of bytes. */
3142        public int read(byte[] array, int offset, int length) throws IOException   {
3143            if (DEBUG_PLAYBACK) {
3144                System.out.println(")read("+offset+','+length+')');
3145            }
3146            if (fPushbackOffset < fPushbackLength) {
3147                int count = fPushbackLength - fPushbackOffset;
3148                if (count > length) {
3149                    count = length;
3150                }
3151                System.arraycopy(fByteBuffer, fPushbackOffset, array, offset, count);
3152                fPushbackOffset += count;
3153                return count;
3154            }
3155            if (fCleared) {
3156                return in.read(array, offset, length);
3157            }
3158            if (fPlayback) {
3159                if (fByteOffset + length > fByteLength) {
3160                    length = fByteLength - fByteOffset;
3161                }
3162                System.arraycopy(fByteBuffer, fByteOffset, array, offset, length);
3163                fByteOffset += length;
3164                if (fByteOffset == fByteLength) {
3165                    fCleared = true;
3166                    fByteBuffer = null;
3167                }
3168                return length;
3169            }
3170            int count = in.read(array, offset, length);
3171            if (count != -1) {
3172                if (fByteLength + count > fByteBuffer.length) {
3173                    byte[] newarray = new byte[fByteLength + count + 512];
3174                    System.arraycopy(fByteBuffer, 0, newarray, 0, fByteLength);
3175                    fByteBuffer = newarray;
3176                }
3177                System.arraycopy(array, offset, fByteBuffer, fByteLength, count);
3178                fByteLength += count;
3179            }
3180            if (DEBUG_PLAYBACK) {
3181                System.out.println(")read("+offset+','+length+") -> "+count);
3182            }
3183            return count;
3184        } // read(byte[]):int
3185
3186    } // class PlaybackInputStream
3187
3188    /**
3189     * Location infoset item. 
3190     *
3191     * @author Andy Clark
3192     */
3193    protected static class LocationItem 
3194        implements HTMLEventInfo {
3195
3196        //
3197        // Data
3198        //
3199
3200        /** Beginning line number. */
3201        protected int fBeginLineNumber;
3202
3203        /** Beginning column number. */
3204        protected int fBeginColumnNumber;
3205
3206        /** Ending line number. */
3207        protected int fEndLineNumber;
3208
3209        /** Ending column number. */
3210        protected int fEndColumnNumber;
3211
3212        //
3213        // Public methods
3214        //
3215
3216        /** Sets the values of this item. */
3217        public void setValues(int beginLine, int beginColumn,
3218                              int endLine, int endColumn) {
3219            fBeginLineNumber = beginLine;
3220            fBeginColumnNumber = beginColumn;
3221            fEndLineNumber = endLine;
3222            fEndColumnNumber = endColumn;
3223        } // setValues(int,int,int,int)
3224
3225        //
3226        // HTMLEventInfo methods
3227        //
3228
3229        // location information
3230
3231        /** Returns the line number of the beginning of this event.*/
3232        public int getBeginLineNumber() {
3233            return fBeginLineNumber;
3234        } // getBeginLineNumber():int
3235
3236        /** Returns the column number of the beginning of this event.*/
3237        public int getBeginColumnNumber() { 
3238            return fBeginColumnNumber;
3239        } // getBeginColumnNumber():int
3240
3241        /** Returns the line number of the end of this event.*/
3242        public int getEndLineNumber() {
3243            return fEndLineNumber;
3244        } // getEndLineNumber():int
3245
3246        /** Returns the column number of the end of this event.*/
3247        public int getEndColumnNumber() {
3248            return fEndColumnNumber;
3249        } // getEndColumnNumber():int
3250
3251        // other information
3252
3253        /** Returns true if this corresponding event was synthesized. */
3254        public boolean isSynthesized() {
3255            return false;
3256        } // isSynthesize():boolean
3257
3258        //
3259        // Object methods
3260        //
3261
3262        /** Returns a string representation of this object. */
3263        public String   toString() {
3264            StringBuffer   str = new StringBuffer  ();
3265            str.append(fBeginLineNumber);
3266            str.append(':');
3267            str.append(fBeginColumnNumber);
3268            str.append(':');
3269            str.append(fEndLineNumber);
3270            str.append(':');
3271            str.append(fEndColumnNumber);
3272            return str.toString();
3273        } // toString():String
3274
3275    } // class LocationItem
3276
3277} // class HTMLScanner
3278
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags