KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > cyberneko > html > HTMLConfiguration


/*
 * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
 *
 * This file is distributed under an Apache style license. Please
 * refer to the LICENSE file for specific details.
 */


package org.cyberneko.html;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.text.MessageFormat;
import java.util.Locale;
import java.util.MissingResourceException;
import java.util.Properties;
import java.util.ResourceBundle;
import java.util.Vector;

import org.apache.xerces.util.DefaultErrorHandler;
import org.apache.xerces.util.ParserConfigurationSettings;
import org.apache.xerces.xni.XMLDTDContentModelHandler;
import org.apache.xerces.xni.XMLDTDHandler;
import org.apache.xerces.xni.XMLDocumentHandler;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLConfigurationException;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.apache.xerces.xni.parser.XMLDocumentSource;
import org.apache.xerces.xni.parser.XMLEntityResolver;
import org.apache.xerces.xni.parser.XMLErrorHandler;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.parser.XMLParseException;
import org.apache.xerces.xni.parser.XMLPullParserConfiguration;

import org.cyberneko.html.filters.NamespaceBinder;

42 /**
43  * An XNI-based parser configuration that can be used to parse HTML
44  * documents. This configuration can be used directly in order to
45  * parse HTML documents or can be used in conjunction with any XNI
46  * based tools, such as the Xerces2 implementation.
47  * <p>
48  * This configuration recognizes the following features:
49  * <ul>
50  * <li>http://cyberneko.org/html/features/augmentations
51  * <li>http://cyberneko.org/html/features/report-errors
52  * <li>http://cyberneko.org/html/features/report-errors/simple
53  * <li>http://cyberneko.org/html/features/balance-tags
54  * <li><i>and</i>
55  * <li>the features supported by the scanner and tag balancer components.
56  * </ul>
57  * <p>
58  * This configuration recognizes the following properties:
59  * <ul>
60  * <li>http://cyberneko.org/html/properties/names/elems
61  * <li>http://cyberneko.org/html/properties/names/attrs
62  * <li>http://cyberneko.org/html/properties/filters
63  * <li>http://cyberneko.org/html/properties/error-reporter
64  * <li><i>and</i>
65  * <li>the properties supported by the scanner and tag balancer.
66  * </ul>
67  * <p>
68  * For complete usage information, refer to the documentation.
69  *
70  * @see HTMLScanner
71  * @see HTMLTagBalancer
72  * @see HTMLErrorReporter
73  *
74  * @author Andy Clark
75  *
76  * @version $Id: HTMLConfiguration.java,v 1.9 2005/02/14 03:56:54 andyc Exp $
77  */

78 public class HTMLConfiguration
79     extends ParserConfigurationSettings
80     implements XMLPullParserConfiguration {
81
82     //
83
// Constants
84
//
85

86     // features
87

88     /** Namespaces. */
89     protected static final String JavaDoc NAMESPACES = "http://xml.org/sax/features/namespaces";
90
91     /** Include infoset augmentations. */
92     protected static final String JavaDoc AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
93
94     /** Report errors. */
95     protected static final String JavaDoc REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";
96
97     /** Simple report format. */
98     protected static final String JavaDoc SIMPLE_ERROR_FORMAT = "http://cyberneko.org/html/features/report-errors/simple";
99
100     /** Balance tags. */
101     protected static final String JavaDoc BALANCE_TAGS = "http://cyberneko.org/html/features/balance-tags";
102
103     // properties
104

105     /** Modify HTML element names: { "upper", "lower", "default" }. */
106     protected static final String JavaDoc NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";
107
108     /** Modify HTML attribute names: { "upper", "lower", "default" }. */
109     protected static final String JavaDoc NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";
110     
111     /** Pipeline filters. */
112     protected static final String JavaDoc FILTERS = "http://cyberneko.org/html/properties/filters";
113
114     /** Error reporter. */
115     protected static final String JavaDoc ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";
116
117     // other
118

119     /** Error domain. */
120     protected static final String JavaDoc ERROR_DOMAIN = "http://cyberneko.org/html";
121
122     // private
123

124     /** Document source class array. */
125     private static final Class JavaDoc[] DOCSOURCE = { XMLDocumentSource.class };
126
127     //
128
// Data
129
//
130

131     // handlers
132

133     /** Document handler. */
134     protected XMLDocumentHandler fDocumentHandler;
135
136     /** DTD handler. */
137     protected XMLDTDHandler fDTDHandler;
138
139     /** DTD content model handler. */
140     protected XMLDTDContentModelHandler fDTDContentModelHandler;
141
142     /** Error handler. */
143     protected XMLErrorHandler fErrorHandler = new DefaultErrorHandler();
144
145     // other settings
146

147     /** Entity resolver. */
148     protected XMLEntityResolver fEntityResolver;
149
150     /** Locale. */
151     protected Locale JavaDoc fLocale = Locale.getDefault();
152
153     // state
154

155     /**
156      * Stream opened by parser. Therefore, must close stream manually upon
157      * termination of parsing.
158      */

159     protected boolean fCloseStream;
160
161     // components
162

163     /** Components. */
164     protected Vector JavaDoc fHTMLComponents = new Vector JavaDoc(2);
165
166     // pipeline
167

168     /** Document scanner. */
169     protected HTMLScanner fDocumentScanner = new HTMLScanner();
170
171     /** HTML tag balancer. */
172     protected HTMLTagBalancer fTagBalancer = new HTMLTagBalancer();
173
174     /** Namespace binder. */
175     protected NamespaceBinder fNamespaceBinder = new NamespaceBinder();
176
177     // other components
178

179     /** Error reporter. */
180     protected HTMLErrorReporter fErrorReporter = new ErrorReporter();
181
182     // HACK: workarounds Xerces 2.0.x problems
183

184     /** Parser version is Xerces 2.0.0. */
185     protected static boolean XERCES_2_0_0 = false;
186
187     /** Parser version is Xerces 2.0.1. */
188     protected static boolean XERCES_2_0_1 = false;
189
190     /** Parser version is XML4J 4.0.x. */
191     protected static boolean XML4J_4_0_x = false;
192
193     //
194
// Static initializer
195
//
196

197     static {
198         try {
199             String JavaDoc VERSION = "org.apache.xerces.impl.Version";
200             Object JavaDoc version = ObjectFactory.createObject(VERSION, VERSION);
201             java.lang.reflect.Field JavaDoc field = version.getClass().getField("fVersion");
202             String JavaDoc versionStr = String.valueOf(field.get(version));
203             XERCES_2_0_0 = versionStr.equals("Xerces-J 2.0.0");
204             XERCES_2_0_1 = versionStr.equals("Xerces-J 2.0.1");
205             XML4J_4_0_x = versionStr.startsWith("XML4J 4.0.");
206         }
207         catch (Throwable JavaDoc e) {
208             // ignore
209
}
210     } // <clinit>()
211

212     //
213
// Constructors
214
//
215

216     /** Default constructor. */
217     public HTMLConfiguration() {
218
219         // add components
220
addComponent(fDocumentScanner);
221         addComponent(fTagBalancer);
222         addComponent(fNamespaceBinder);
223
224         //
225
// features
226
//
227

228         // recognized features
229
String JavaDoc VALIDATION = "http://xml.org/sax/features/validation";
230         String JavaDoc[] recognizedFeatures = {
231             AUGMENTATIONS,
232             NAMESPACES,
233             VALIDATION,
234             REPORT_ERRORS,
235             SIMPLE_ERROR_FORMAT,
236             BALANCE_TAGS,
237         };
238         addRecognizedFeatures(recognizedFeatures);
239         setFeature(AUGMENTATIONS, false);
240         setFeature(NAMESPACES, true);
241         setFeature(VALIDATION, false);
242         setFeature(REPORT_ERRORS, false);
243         setFeature(SIMPLE_ERROR_FORMAT, false);
244         setFeature(BALANCE_TAGS, true);
245
246         // HACK: Xerces 2.0.0
247
if (XERCES_2_0_0) {
248             // NOTE: These features should not be required but it causes a
249
// problem if they're not there. This will be fixed in
250
// subsequent releases of Xerces.
251
recognizedFeatures = new String JavaDoc[] {
252                 "http://apache.org/xml/features/scanner/notify-builtin-refs",
253             };
254             addRecognizedFeatures(recognizedFeatures);
255         }
256         
257         // HACK: Xerces 2.0.1
258
if (XERCES_2_0_0 || XERCES_2_0_1 || XML4J_4_0_x) {
259             // NOTE: These features should not be required but it causes a
260
// problem if they're not there. This should be fixed in
261
// subsequent releases of Xerces.
262
recognizedFeatures = new String JavaDoc[] {
263                 "http://apache.org/xml/features/validation/schema/normalized-value",
264                 "http://apache.org/xml/features/scanner/notify-char-refs",
265             };
266             addRecognizedFeatures(recognizedFeatures);
267         }
268         
269         //
270
// properties
271
//
272

273         // recognized properties
274
String JavaDoc[] recognizedProperties = {
275             NAMES_ELEMS,
276             NAMES_ATTRS,
277             FILTERS,
278             ERROR_REPORTER,
279         };
280         addRecognizedProperties(recognizedProperties);
281         setProperty(NAMES_ELEMS, "upper");
282         setProperty(NAMES_ATTRS, "lower");
283         setProperty(ERROR_REPORTER, fErrorReporter);
284         
285         // HACK: Xerces 2.0.0
286
if (XERCES_2_0_0) {
287             // NOTE: This is a hack to get around a problem in the Xerces 2.0.0
288
// AbstractSAXParser. If it uses a parser configuration that
289
// does not have a SymbolTable, then it will remove *all*
290
// attributes. This will be fixed in subsequent releases of
291
// Xerces.
292
String JavaDoc SYMBOL_TABLE = "http://apache.org/xml/properties/internal/symbol-table";
293             recognizedProperties = new String JavaDoc[] {
294                 SYMBOL_TABLE,
295             };
296             addRecognizedProperties(recognizedProperties);
297             Object JavaDoc symbolTable = ObjectFactory.createObject("org.apache.xerces.util.SymbolTable",
298                                                             "org.apache.xerces.util.SymbolTable");
299             setProperty(SYMBOL_TABLE, symbolTable);
300         }
301
302     } // <init>()
303

304     //
305
// Public methods
306
//
307

308     /**
309      * Pushes an input source onto the current entity stack. This
310      * enables the scanner to transparently scan new content (e.g.
311      * the output written by an embedded script). At the end of the
312      * current entity, the scanner returns where it left off at the
313      * time this entity source was pushed.
314      * <p>
315      * <strong>Hint:</strong>
316      * To use this feature to insert the output of &lt;SCRIPT&gt;
317      * tags, remember to buffer the <em>entire</em> output of the
318      * processed instructions before pushing a new input source.
319      * Otherwise, events may appear out of sequence.
320      *
321      * @param inputSource The new input source to start scanning.
322      */

323     public void pushInputSource(XMLInputSource inputSource) {
324         fDocumentScanner.pushInputSource(inputSource);
325     } // pushInputSource(XMLInputSource)
326

327     // XMLParserConfiguration methods
328
//
329

330     /** Sets a feature. */
331     public void setFeature(String JavaDoc featureId, boolean state)
332         throws XMLConfigurationException {
333         super.setFeature(featureId, state);
334         int size = fHTMLComponents.size();
335         for (int i = 0; i < size; i++) {
336             HTMLComponent component = (HTMLComponent)fHTMLComponents.elementAt(i);
337             component.setFeature(featureId, state);
338         }
339     } // setFeature(String,boolean)
340

341     /** Sets a property. */
342     public void setProperty(String JavaDoc propertyId, Object JavaDoc value)
343         throws XMLConfigurationException {
344         super.setProperty(propertyId, value);
345
346         if (propertyId.equals(FILTERS)) {
347             XMLDocumentFilter[] filters = (XMLDocumentFilter[])getProperty(FILTERS);
348             if (filters != null) {
349                 for (int i = 0; i < filters.length; i++) {
350                     XMLDocumentFilter filter = filters[i];
351                     if (filter instanceof HTMLComponent) {
352                         addComponent((HTMLComponent)filter);
353                     }
354                 }
355             }
356         }
357
358         int size = fHTMLComponents.size();
359         for (int i = 0; i < size; i++) {
360             HTMLComponent component = (HTMLComponent)fHTMLComponents.elementAt(i);
361             component.setProperty(propertyId, value);
362         }
363     } // setProperty(String,Object)
364

365     /** Sets the document handler. */
366     public void setDocumentHandler(XMLDocumentHandler handler) {
367         fDocumentHandler = handler;
368     } // setDocumentHandler(XMLDocumentHandler)
369

370     /** Returns the document handler. */
371     public XMLDocumentHandler getDocumentHandler() {
372         return fDocumentHandler;
373     } // getDocumentHandler():XMLDocumentHandler
374

375     /** Sets the DTD handler. */
376     public void setDTDHandler(XMLDTDHandler handler) {
377         fDTDHandler = handler;
378     } // setDTDHandler(XMLDTDHandler)
379

380     /** Returns the DTD handler. */
381     public XMLDTDHandler getDTDHandler() {
382         return fDTDHandler;
383     } // getDTDHandler():XMLDTDHandler
384

385     /** Sets the DTD content model handler. */
386     public void setDTDContentModelHandler(XMLDTDContentModelHandler handler) {
387         fDTDContentModelHandler = handler;
388     } // setDTDContentModelHandler(XMLDTDContentModelHandler)
389

390     /** Returns the DTD content model handler. */
391     public XMLDTDContentModelHandler getDTDContentModelHandler() {
392         return fDTDContentModelHandler;
393     } // getDTDContentModelHandler():XMLDTDContentModelHandler
394

395     /** Sets the error handler. */
396     public void setErrorHandler(XMLErrorHandler handler) {
397         fErrorHandler = handler;
398     } // setErrorHandler(XMLErrorHandler)
399

400     /** Returns the error handler. */
401     public XMLErrorHandler getErrorHandler() {
402         return fErrorHandler;
403     } // getErrorHandler():XMLErrorHandler
404

405     /** Sets the entity resolver. */
406     public void setEntityResolver(XMLEntityResolver resolver) {
407         fEntityResolver = resolver;
408     } // setEntityResolver(XMLEntityResolver)
409

410     /** Returns the entity resolver. */
411     public XMLEntityResolver getEntityResolver() {
412         return fEntityResolver;
413     } // getEntityResolver():XMLEntityResolver
414

415     /** Sets the locale. */
416     public void setLocale(Locale JavaDoc locale) {
417         if (locale == null) {
418             locale = Locale.getDefault();
419         }
420         fLocale = locale;
421     } // setLocale(Locale)
422

423     /** Returns the locale. */
424     public Locale JavaDoc getLocale() {
425         return fLocale;
426     } // getLocale():Locale
427

428     /** Parses a document. */
429     public void parse(XMLInputSource source) throws XNIException, IOException JavaDoc {
430         setInputSource(source);
431         parse(true);
432     } // parse(XMLInputSource)
433

434     //
435
// XMLPullParserConfiguration methods
436
//
437

438     // parsing
439

440     /**
441      * Sets the input source for the document to parse.
442      *
443      * @param inputSource The document's input source.
444      *
445      * @exception XMLConfigurationException Thrown if there is a
446      * configuration error when initializing the
447      * parser.
448      * @exception IOException Thrown on I/O error.
449      *
450      * @see #parse(boolean)
451      */

452     public void setInputSource(XMLInputSource inputSource)
453         throws XMLConfigurationException, IOException JavaDoc {
454         reset();
455         fCloseStream = inputSource.getByteStream() == null &&
456                        inputSource.getCharacterStream() == null;
457         fDocumentScanner.setInputSource(inputSource);
458     } // setInputSource(XMLInputSource)
459

460     /**
461      * Parses the document in a pull parsing fashion.
462      *
463      * @param complete True if the pull parser should parse the
464      * remaining document completely.
465      *
466      * @return True if there is more document to parse.
467      *
468      * @exception XNIException Any XNI exception, possibly wrapping
469      * another exception.
470      * @exception IOException An IO exception from the parser, possibly
471      * from a byte stream or character stream
472      * supplied by the parser.
473      *
474      * @see #setInputSource
475      */

476     public boolean parse(boolean complete) throws XNIException, IOException JavaDoc {
477         try {
478             boolean more = fDocumentScanner.scanDocument(complete);
479             if (!more) {
480                 cleanup();
481             }
482             return more;
483         }
484         catch (XNIException e) {
485             cleanup();
486             throw e;
487         }
488         catch (IOException JavaDoc e) {
489             cleanup();
490             throw e;
491         }
492     } // parse(boolean):boolean
493

494     /**
495      * If the application decides to terminate parsing before the xml document
496      * is fully parsed, the application should call this method to free any
497      * resource allocated during parsing. For example, close all opened streams.
498      */

499     public void cleanup() {
500         fDocumentScanner.cleanup(fCloseStream);
501     } // cleanup()
502

503     //
504
// Protected methods
505
//
506

507     /** Adds a component. */
508     protected void addComponent(HTMLComponent component) {
509
510         // add component to list
511
fHTMLComponents.addElement(component);
512
513         // add recognized features and set default states
514
String JavaDoc[] features = component.getRecognizedFeatures();
515         addRecognizedFeatures(features);
516         int featureCount = features != null ? features.length : 0;
517         for (int i = 0; i < featureCount; i++) {
518             Boolean JavaDoc state = component.getFeatureDefault(features[i]);
519             if (state != null) {
520                 setFeature(features[i], state.booleanValue());
521             }
522         }
523
524         // add recognized properties and set default values
525
String JavaDoc[] properties = component.getRecognizedProperties();
526         addRecognizedProperties(properties);
527         int propertyCount = properties != null ? properties.length : 0;
528         for (int i = 0; i < propertyCount; i++) {
529             Object JavaDoc value = component.getPropertyDefault(properties[i]);
530             if (value != null) {
531                 setProperty(properties[i], value);
532             }
533         }
534
535     } // addComponent(HTMLComponent)
536

537     /** Resets the parser configuration. */
538     protected void reset() throws XMLConfigurationException {
539
540         // reset components
541
int size = fHTMLComponents.size();
542         for (int i = 0; i < size; i++) {
543             HTMLComponent component = (HTMLComponent)fHTMLComponents.elementAt(i);
544             component.reset(this);
545         }
546
547         // configure pipeline
548
XMLDocumentSource lastSource = fDocumentScanner;
549         if (getFeature(BALANCE_TAGS)) {
550             lastSource.setDocumentHandler(fTagBalancer);
551             fTagBalancer.setDocumentSource(fDocumentScanner);
552             lastSource = fTagBalancer;
553         }
554         if (getFeature(NAMESPACES)) {
555             lastSource.setDocumentHandler(fNamespaceBinder);
556             fNamespaceBinder.setDocumentSource(fTagBalancer);
557             lastSource = fNamespaceBinder;
558         }
559         XMLDocumentFilter[] filters = (XMLDocumentFilter[])getProperty(FILTERS);
560         if (filters != null) {
561             for (int i = 0; i < filters.length; i++) {
562                 XMLDocumentFilter filter = filters[i];
563                 Class JavaDoc filterClass = filter.getClass();
564                 try {
565                     Method JavaDoc filterMethod = filterClass.getMethod("setDocumentSource", DOCSOURCE);
566                     if (filterMethod != null) {
567                         filterMethod.invoke(filter, new Object JavaDoc[] { lastSource });
568                     }
569                 }
570                 catch (IllegalAccessException JavaDoc e) {
571                     // ignore
572
}
573                 catch (InvocationTargetException JavaDoc e) {
574                     // ignore
575
}
576                 catch (NoSuchMethodException JavaDoc e) {
577                     // ignore
578
}
579                 lastSource.setDocumentHandler(filter);
580                 lastSource = filter;
581             }
582         }
583         lastSource.setDocumentHandler(fDocumentHandler);
584
585     } // reset()
586

587     //
588
// Interfaces
589
//
590

591     /**
592      * Defines an error reporter for reporting HTML errors. There is no such
593      * thing as a fatal error in parsing HTML. I/O errors are fatal but should
594      * throw an <code>IOException</code> directly instead of reporting an error.
595      * <p>
596      * When used in a configuration, the error reporter instance should be
597      * set as a property with the following property identifier:
598      * <pre>
599      * "http://cyberneko.org/html/internal/error-reporter" in the
600      * </pre>
601      * Components in the configuration can query the error reporter using this
602      * property identifier.
603      * <p>
604      * <strong>Note:</strong>
605      * All reported errors are within the domain "http://cyberneko.org/html".
606      *
607      * @author Andy Clark
608      */

609     protected class ErrorReporter
610         implements HTMLErrorReporter {
611
612         //
613
// Data
614
//
615

616         /** Last locale. */
617         protected Locale JavaDoc fLastLocale;
618
619         /** Error messages. */
620         protected ResourceBundle JavaDoc fErrorMessages;
621
622         //
623
// HTMLErrorReporter methods
624
//
625

626         /** Format message without reporting error. */
627         public String JavaDoc formatMessage(String JavaDoc key, Object JavaDoc[] args) {
628             if (!getFeature(SIMPLE_ERROR_FORMAT)) {
629                 if (!fLocale.equals(fLastLocale)) {
630                     fErrorMessages = null;
631                     fLastLocale = fLocale;
632                 }
633                 if (fErrorMessages == null) {
634                     fErrorMessages =
635                         ResourceBundle.getBundle("org/cyberneko/html/res/ErrorMessages",
636                                                  fLocale);
637                 }
638                 try {
639                     String JavaDoc value = fErrorMessages.getString(key);
640                     String JavaDoc message = MessageFormat.format(value, args);
641                     return message;
642                 }
643                 catch (MissingResourceException JavaDoc e) {
644                     // ignore and return a simple format
645
}
646             }
647             return formatSimpleMessage(key, args);
648         } // formatMessage(String,Object[]):String
649

650         /** Reports a warning. */
651         public void reportWarning(String JavaDoc key, Object JavaDoc[] args)
652             throws XMLParseException {
653             if (fErrorHandler != null) {
654                 fErrorHandler.warning(ERROR_DOMAIN, key, createException(key, args));
655             }
656         } // reportWarning(String,Object[])
657

658         /** Reports an error. */
659         public void reportError(String JavaDoc key, Object JavaDoc[] args)
660             throws XMLParseException {
661             if (fErrorHandler != null) {
662                 fErrorHandler.error(ERROR_DOMAIN, key, createException(key, args));
663             }
664         } // reportError(String,Object[])
665

666         //
667
// Protected methods
668
//
669

670         /** Creates parse exception. */
671         protected XMLParseException createException(String JavaDoc key, Object JavaDoc[] args) {
672             String JavaDoc message = formatMessage(key, args);
673             return new XMLParseException(fDocumentScanner, message);
674         } // createException(String,Object[]):XMLParseException
675

676         /** Format simple message. */
677         protected String JavaDoc formatSimpleMessage(String JavaDoc key, Object JavaDoc[] args) {
678             StringBuffer JavaDoc str = new StringBuffer JavaDoc();
679             str.append(ERROR_DOMAIN);
680             str.append('#');
681             str.append(key);
682             if (args != null && args.length > 0) {
683                 str.append('\t');
684                 for (int i = 0; i < args.length; i++) {
685                     if (i > 0) {
686                         str.append('\t');
687                     }
688                     str.append(String.valueOf(args[i]));
689                 }
690             }
691             return str.toString();
692         } // formatSimpleMessage(String,
693

694     } // class ErrorReporter
695

696 } // class HTMLConfiguration
697
Popular Tags