KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > sax > XMLReader


// HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/sax/XMLReader.java,v $
// $Author: derrickoswald $
// $Date: 2004/07/14 01:58:02 $
// $Revision: 1.1 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

27 package org.htmlparser.sax;
28
29 import java.io.IOException JavaDoc;
30
31 import org.xml.sax.ContentHandler JavaDoc;
32 import org.xml.sax.DTDHandler JavaDoc;
33 import org.xml.sax.EntityResolver JavaDoc;
34 import org.xml.sax.ErrorHandler JavaDoc;
35 import org.xml.sax.InputSource JavaDoc;
36 import org.xml.sax.SAXException JavaDoc;
37 import org.xml.sax.SAXNotRecognizedException JavaDoc;
38 import org.xml.sax.SAXNotSupportedException JavaDoc;
39 import org.xml.sax.SAXParseException JavaDoc;
40 import org.xml.sax.helpers.NamespaceSupport JavaDoc;
41
42 import org.htmlparser.Node;
43 import org.htmlparser.Parser;
44 import org.htmlparser.Remark;
45 import org.htmlparser.Tag;
46 import org.htmlparser.Text;
47 import org.htmlparser.util.DefaultParserFeedback;
48 import org.htmlparser.util.NodeIterator;
49 import org.htmlparser.util.NodeList;
50 import org.htmlparser.util.ParserException;
51 import org.htmlparser.util.ParserFeedback;
52
53 /**
54  * SAX parser.
55  * Generates callbacks on the {@link ContentHandler} based on encountered nodes.
56  * <br><em>Preliminary</em>.
57  * <pre>
58  * org.xml.sax.XMLReader reader = org.xml.sax.helpers.XMLReaderFactory.createXMLReader ("org.htmlparser.sax.XMLReader");
59  * org.xml.sax.ContentHandler content = new MyContentHandler ();
60  * reader.setContentHandler (content);
61  * org.xml.sax.ErrorHandler errors = new MyErrorHandler ();
62  * reader.setErrorHandler (errors);
63  * reader.parse ("http://cbc.ca");
64  * </pre>
65  */

66 public class XMLReader
67     implements
68         org.xml.sax.XMLReader JavaDoc
69 {
70     /**
71      * Determines if namespace handling is on.
72      * All XMLReaders are required to recognize the feature names:
73      * <ul>
74      * <li><code>http://xml.org/sax/features/namespaces</code> -
75      * a value of "true" indicates namespace URIs and unprefixed
76      * local names for element and attribute names will be available</li>
77      * <li><code>http://xml.org/sax/features/namespace-prefixes</code> -
78      * a value of "true" indicates that XML qualified names (with
79      * prefixes) and attributes (including xmlns* attributes) will
80      * be available.
81      * </ul>
82      */

83     protected boolean mNameSpaces; // namespaces
84

85     /**
86      * Determines if namespace prefix handling is on.
87      * @see #mNameSpaces
88      */

89     protected boolean mNameSpacePrefixes; // namespace-prefixes
90

91     /**
92      * <em> not implemented</em>
93      */

94     protected EntityResolver JavaDoc mEntityResolver;
95
96     /**
97      * <em> not implemented</em>
98      */

99     protected DTDHandler JavaDoc mDTDHandler;
100
101     /**
102      * The content callback object.
103      */

104     protected ContentHandler JavaDoc mContentHandler;
105
106     /**
107      * The error handler object.
108      */

109     protected ErrorHandler JavaDoc mErrorHandler;
110
111     /**
112      * The underlying DOM parser.
113      */

114     protected Parser mParser;
115
116     /**
117      * Namspace utility object.
118      */

119     protected NamespaceSupport JavaDoc mSupport;
120
121     /**
122      * Qualified name parts.
123      */

124     protected String JavaDoc mParts[];
125
126     /**
127      * Create an SAX parser.
128      */

129     public XMLReader ()
130     {
131         mNameSpaces = true;
132         mNameSpacePrefixes = false;
133         
134         mEntityResolver = null;
135         mDTDHandler = null;
136         mContentHandler = null;
137         mErrorHandler = null;
138
139         mSupport = new NamespaceSupport JavaDoc ();
140         mSupport.pushContext ();
141         mSupport.declarePrefix ("", "http://www.w3.org/TR/REC-html40");
142         // todo:
143
// xmlns:html='http://www.w3.org/TR/REC-html40'
144
// or xmlns:html='http://www.w3.org/1999/xhtml'
145
mParts = new String JavaDoc[3];
146     }
147     
148     ////////////////////////////////////////////////////////////////////
149
// Configuration.
150
////////////////////////////////////////////////////////////////////
151

152
153     /**
154      * Look up the value of a feature flag.
155      *
156      * <p>The feature name is any fully-qualified URI. It is
157      * possible for an XMLReader to recognize a feature name but
158      * temporarily be unable to return its value.
159      * Some feature values may be available only in specific
160      * contexts, such as before, during, or after a parse.
161      * Also, some feature values may not be programmatically accessible.
162      * (In the case of an adapter for SAX1 {@link Parser}, there is no
163      * implementation-independent way to expose whether the underlying
164      * parser is performing validation, expanding external entities,
165      * and so forth.) </p>
166      *
167      * <p>All XMLReaders are required to recognize the
168      * http://xml.org/sax/features/namespaces and the
169      * http://xml.org/sax/features/namespace-prefixes feature names.</p>
170      *
171      * <p>Typical usage is something like this:</p>
172      *
173      * <pre>
174      * XMLReader r = new MySAXDriver();
175      *
176      * // try to activate validation
177      * try {
178      * r.setFeature("http://xml.org/sax/features/validation", true);
179      * } catch (SAXException e) {
180      * System.err.println("Cannot activate validation.");
181      * }
182      *
183      * // register event handlers
184      * r.setContentHandler(new MyContentHandler());
185      * r.setErrorHandler(new MyErrorHandler());
186      *
187      * // parse the first document
188      * try {
189      * r.parse("http://www.foo.com/mydoc.xml");
190      * } catch (IOException e) {
191      * System.err.println("I/O exception reading XML document");
192      * } catch (SAXException e) {
193      * System.err.println("XML exception reading document.");
194      * }
195      * </pre>
196      *
197      * <p>Implementors are free (and encouraged) to invent their own features,
198      * using names built on their own URIs.</p>
199      *
200      * @param name The feature name, which is a fully-qualified URI.
201      * @return The current value of the feature (true or false).
202      * @exception org.xml.sax.SAXNotRecognizedException If the feature
203      * value can't be assigned or retrieved.
204      * @exception org.xml.sax.SAXNotSupportedException When the
205      * XMLReader recognizes the feature name but
206      * cannot determine its value at this time.
207      * @see #setFeature
208      */

209     public boolean getFeature (String JavaDoc name)
210         throws SAXNotRecognizedException JavaDoc, SAXNotSupportedException JavaDoc
211     {
212         boolean ret;
213
214         if (name.equals ("http://xml.org/sax/features/namespaces"))
215             ret = mNameSpaces;
216         else if (name.equals ("http://xml.org/sax/features/namespace-prefixes"))
217             ret = mNameSpacePrefixes;
218         else
219             throw new SAXNotSupportedException JavaDoc (name + " not yet understood");
220
221         return (ret);
222     }
223
224
225     /**
226      * Set the value of a feature flag.
227      *
228      * <p>The feature name is any fully-qualified URI. It is
229      * possible for an XMLReader to expose a feature value but
230      * to be unable to change the current value.
231      * Some feature values may be immutable or mutable only
232      * in specific contexts, such as before, during, or after
233      * a parse.</p>
234      *
235      * <p>All XMLReaders are required to support setting
236      * http://xml.org/sax/features/namespaces to true and
237      * http://xml.org/sax/features/namespace-prefixes to false.</p>
238      *
239      * @param name The feature name, which is a fully-qualified URI.
240      * @param value The requested value of the feature (true or false).
241      * @exception org.xml.sax.SAXNotRecognizedException If the feature
242      * value can't be assigned or retrieved.
243      * @exception org.xml.sax.SAXNotSupportedException When the
244      * XMLReader recognizes the feature name but
245      * cannot set the requested value.
246      * @see #getFeature
247      */

248     public void setFeature (String JavaDoc name, boolean value)
249     throws SAXNotRecognizedException JavaDoc, SAXNotSupportedException JavaDoc
250     {
251         if (name.equals ("http://xml.org/sax/features/namespaces"))
252             mNameSpaces = value;
253         else if (name.equals ("http://xml.org/sax/features/namespace-prefixes"))
254             mNameSpacePrefixes = value;
255         else
256             throw new SAXNotSupportedException JavaDoc (name + " not yet understood");
257     }
258
259
260     /**
261      * Look up the value of a property.
262      *
263      * <p>The property name is any fully-qualified URI. It is
264      * possible for an XMLReader to recognize a property name but
265      * temporarily be unable to return its value.
266      * Some property values may be available only in specific
267      * contexts, such as before, during, or after a parse.</p>
268      *
269      * <p>XMLReaders are not required to recognize any specific
270      * property names, though an initial core set is documented for
271      * SAX2.</p>
272      *
273      * <p>Implementors are free (and encouraged) to invent their own properties,
274      * using names built on their own URIs.</p>
275      *
276      * @param name The property name, which is a fully-qualified URI.
277      * @return The current value of the property.
278      * @exception org.xml.sax.SAXNotRecognizedException If the property
279      * value can't be assigned or retrieved.
280      * @exception org.xml.sax.SAXNotSupportedException When the
281      * XMLReader recognizes the property name but
282      * cannot determine its value at this time.
283      * @see #setProperty
284      */

285     public Object JavaDoc getProperty (String JavaDoc name)
286     throws SAXNotRecognizedException JavaDoc, SAXNotSupportedException JavaDoc
287     {
288         throw new SAXNotSupportedException JavaDoc (name + " not yet understood");
289     }
290
291
292     /**
293      * Set the value of a property.
294      *
295      * <p>The property name is any fully-qualified URI. It is
296      * possible for an XMLReader to recognize a property name but
297      * to be unable to change the current value.
298      * Some property values may be immutable or mutable only
299      * in specific contexts, such as before, during, or after
300      * a parse.</p>
301      *
302      * <p>XMLReaders are not required to recognize setting
303      * any specific property names, though a core set is defined by
304      * SAX2.</p>
305      *
306      * <p>This method is also the standard mechanism for setting
307      * extended handlers.</p>
308      *
309      * @param name The property name, which is a fully-qualified URI.
310      * @param value The requested value for the property.
311      * @exception org.xml.sax.SAXNotRecognizedException If the property
312      * value can't be assigned or retrieved.
313      * @exception org.xml.sax.SAXNotSupportedException When the
314      * XMLReader recognizes the property name but
315      * cannot set the requested value.
316      */

317     public void setProperty (String JavaDoc name, Object JavaDoc value)
318     throws SAXNotRecognizedException JavaDoc, SAXNotSupportedException JavaDoc
319     {
320         throw new SAXNotSupportedException JavaDoc (name + " not yet understood");
321     }
322
323     ////////////////////////////////////////////////////////////////////
324
// Event handlers.
325
////////////////////////////////////////////////////////////////////
326

327
328     /**
329      * Allow an application to register an entity resolver.
330      *
331      * <p>If the application does not register an entity resolver,
332      * the XMLReader will perform its own default resolution.</p>
333      *
334      * <p>Applications may register a new or different resolver in the
335      * middle of a parse, and the SAX parser must begin using the new
336      * resolver immediately.</p>
337      *
338      * @param resolver The entity resolver.
339      * @see #getEntityResolver
340      */

341     public void setEntityResolver (EntityResolver JavaDoc resolver)
342     {
343         mEntityResolver = resolver;
344     }
345
346
347     /**
348      * Return the current entity resolver.
349      *
350      * @return The current entity resolver, or null if none
351      * has been registered.
352      * @see #setEntityResolver
353      */

354     public EntityResolver JavaDoc getEntityResolver ()
355     {
356         return (mEntityResolver);
357     }
358
359
360     /**
361      * Allow an application to register a DTD event handler.
362      *
363      * <p>If the application does not register a DTD handler, all DTD
364      * events reported by the SAX parser will be silently ignored.</p>
365      *
366      * <p>Applications may register a new or different handler in the
367      * middle of a parse, and the SAX parser must begin using the new
368      * handler immediately.</p>
369      *
370      * @param handler The DTD handler.
371      * @see #getDTDHandler
372      */

373     public void setDTDHandler (DTDHandler JavaDoc handler)
374     {
375         mDTDHandler = handler;
376     }
377
378
379     /**
380      * Return the current DTD handler.
381      *
382      * @return The current DTD handler, or null if none
383      * has been registered.
384      * @see #setDTDHandler
385      */

386     public DTDHandler JavaDoc getDTDHandler ()
387     {
388         return (mDTDHandler);
389     }
390
391
392     /**
393      * Allow an application to register a content event handler.
394      *
395      * <p>If the application does not register a content handler, all
396      * content events reported by the SAX parser will be silently
397      * ignored.</p>
398      *
399      * <p>Applications may register a new or different handler in the
400      * middle of a parse, and the SAX parser must begin using the new
401      * handler immediately.</p>
402      *
403      * @param handler The content handler.
404      * @see #getContentHandler
405      */

406     public void setContentHandler (ContentHandler JavaDoc handler)
407     {
408         mContentHandler = handler;
409     }
410
411
412     /**
413      * Return the current content handler.
414      *
415      * @return The current content handler, or null if none
416      * has been registered.
417      * @see #setContentHandler
418      */

419     public ContentHandler JavaDoc getContentHandler ()
420     {
421         return (mContentHandler);
422     }
423
424
425     /**
426      * Allow an application to register an error event handler.
427      *
428      * <p>If the application does not register an error handler, all
429      * error events reported by the SAX parser will be silently
430      * ignored; however, normal processing may not continue. It is
431      * highly recommended that all SAX applications implement an
432      * error handler to avoid unexpected bugs.</p>
433      *
434      * <p>Applications may register a new or different handler in the
435      * middle of a parse, and the SAX parser must begin using the new
436      * handler immediately.</p>
437      *
438      * @param handler The error handler.
439      * @see #getErrorHandler
440      */

441     public void setErrorHandler (ErrorHandler JavaDoc handler)
442     {
443         mErrorHandler = handler;
444     }
445
446
447     /**
448      * Return the current error handler.
449      *
450      * @return The current error handler, or null if none
451      * has been registered.
452      * @see #setErrorHandler
453      */

454     public ErrorHandler JavaDoc getErrorHandler ()
455     {
456         return (mErrorHandler);
457     }
458
459
460     ////////////////////////////////////////////////////////////////////
461
// Parsing.
462
////////////////////////////////////////////////////////////////////
463

464     /**
465      * Parse an XML document.
466      *
467      * <p>The application can use this method to instruct the XML
468      * reader to begin parsing an XML document from any valid input
469      * source (a character stream, a byte stream, or a URI).</p>
470      *
471      * <p>Applications may not invoke this method while a parse is in
472      * progress (they should create a new XMLReader instead for each
473      * nested XML document). Once a parse is complete, an
474      * application may reuse the same XMLReader object, possibly with a
475      * different input source.
476      * Configuration of the XMLReader object (such as handler bindings and
477      * values established for feature flags and properties) is unchanged
478      * by completion of a parse, unless the definition of that aspect of
479      * the configuration explicitly specifies other behavior.
480      * (For example, feature flags or properties exposing
481      * characteristics of the document being parsed.)
482      * </p>
483      *
484      * <p>During the parse, the XMLReader will provide information
485      * about the XML document through the registered event
486      * handlers.</p>
487      *
488      * <p>This method is synchronous: it will not return until parsing
489      * has ended. If a client application wants to terminate
490      * parsing early, it should throw an exception.</p>
491      *
492      * @param input The input source for the top-level of the
493      * XML document.
494      * @exception org.xml.sax.SAXException Any SAX exception, possibly
495      * wrapping another exception.
496      * @exception java.io.IOException An IO exception from the parser,
497      * possibly from a byte stream or character stream
498      * supplied by the application.
499      * @see org.xml.sax.InputSource
500      * @see #parse(java.lang.String)
501      * @see #setEntityResolver
502      * @see #setDTDHandler
503      * @see #setContentHandler
504      * @see #setErrorHandler
505      */

506     public void parse (InputSource JavaDoc input)
507     throws IOException JavaDoc, SAXException JavaDoc
508     {
509         throw new SAXException JavaDoc ("parse (InputSource input) is not yet supported");
510     }
511
512
513     /**
514      * Parse an XML document from a system identifier (URI).
515      *
516      * <p>This method is a shortcut for the common case of reading a
517      * document from a system identifier. It is the exact
518      * equivalent of the following:</p>
519      *
520      * <pre>
521      * parse(new InputSource(systemId));
522      * </pre>
523      *
524      * <p>If the system identifier is a URL, it must be fully resolved
525      * by the application before it is passed to the parser.</p>
526      *
527      * @param systemId The system identifier (URI).
528      * @exception org.xml.sax.SAXException Any SAX exception, possibly
529      * wrapping another exception.
530      * @exception java.io.IOException An IO exception from the parser,
531      * possibly from a byte stream or character stream
532      * supplied by the application.
533      * @see #parse(org.xml.sax.InputSource)
534      */

535     public void parse (String JavaDoc systemId)
536     throws IOException JavaDoc, SAXException JavaDoc
537     {
538         Locator locator;
539         ParserFeedback feedback;
540
541         if (null != mContentHandler)
542             try
543             {
544                 mParser = new Parser (systemId);
545                 locator = new Locator (mParser);
546                 if (null != mErrorHandler)
547                     feedback = new Feedback (mErrorHandler, locator);
548                 else
549                     feedback = new DefaultParserFeedback (DefaultParserFeedback.QUIET);
550                 mParser.setFeedback (feedback);
551
552                 // OK, try a simplistic parse
553
mContentHandler.setDocumentLocator (locator);
554                 try
555                 {
556                     mContentHandler.startDocument ();
557                     for (NodeIterator iterator = mParser.elements (); iterator.hasMoreNodes (); )
558                         doSAX (iterator.nextNode ());
559                     mContentHandler.endDocument ();
560                 }
561                 catch (SAXException JavaDoc se)
562                 {
563                     if (null != mErrorHandler)
564                         mErrorHandler.fatalError (
565                             new SAXParseException JavaDoc ("contentHandler threw me", locator, se));
566                 }
567             }
568             catch (ParserException pe)
569             {
570                 if (null != mErrorHandler)
571                     mErrorHandler.fatalError (
572                         new SAXParseException JavaDoc (pe.getMessage (), "", systemId, 0, 0));
573
574             }
575     }
576
577     /**
578      * Process nodes recursively on the DocumentHandler.
579      * Calls methods on the handler based on the type and whether it's an end tag.
580      * Processes composite tags recursively.
581      * Does rudimentary namespace processing according to the state of {@link #mNameSpaces}
582      * and {@link #mNameSpacePrefixes}.
583      * @param node The htmlparser node to traverse.
584      */

585     protected void doSAX (Node node)
586         throws
587             ParserException,
588             SAXException JavaDoc
589     {
590         Tag tag;
591         Tag end;
592
593         if (node instanceof Remark)
594         {
595             String JavaDoc text = mParser.getLexer ().getPage ().getText (node.getStartPosition (), node.getEndPosition ());
596             mContentHandler.ignorableWhitespace (text.toCharArray (), 0, text.length ());
597         }
598         else if (node instanceof Text)
599         {
600             String JavaDoc text = mParser.getLexer ().getPage ().getText (node.getStartPosition (), node.getEndPosition ());
601             mContentHandler.characters (text.toCharArray (), 0, text.length ());
602         }
603         else if (node instanceof Tag)
604         {
605             tag = (Tag)node;
606             if (mNameSpaces)
607                 mSupport.processName (tag.getTagName (), mParts, false);
608             else
609             {
610                 mParts[0] = "";
611                 mParts[1] = "";
612             }
613             if (mNameSpacePrefixes)
614                 mParts[2] = tag.getTagName ();
615             else if (mNameSpaces)
616                 mParts[2] = "";
617             else
618                 mParts[2] = tag.getTagName ();
619
620             mContentHandler.startElement (
621                 mParts[0], // uri
622
mParts[1], // local
623
mParts[2], // raw
624
new Attributes (tag, mSupport, mParts));
625             NodeList children = tag.getChildren ();
626             if (null != children)
627                 for (int i = 0; i < children.size (); i++)
628                     doSAX (children.elementAt (i));
629             end = tag.getEndTag ();
630             if (null != end)
631             {
632                 if (mNameSpaces)
633                     mSupport.processName (end.getTagName (), mParts, false);
634                 else
635                 {
636                     mParts[0] = "";
637                     mParts[1] = "";
638                 }
639                 if (mNameSpacePrefixes)
640                     mParts[2] = end.getTagName ();
641                 else if (mNameSpaces)
642                     mParts[2] = "";
643                 else
644                     mParts[2] = end.getTagName ();
645                 mContentHandler.endElement (
646                     mParts[0], // uri
647
mParts[1], // local
648
mParts[2]); // raw
649
}
650         }
651     }
652 }
653
654
Popular Tags