KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > gargoylesoftware > htmlunit > html > HTMLParser


1 /*
2  * Copyright (c) 2002, 2005 Gargoyle Software Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  * 1. Redistributions of source code must retain the above copyright notice,
8  * this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright notice,
10  * this list of conditions and the following disclaimer in the documentation
11  * and/or other materials provided with the distribution.
12  * 3. The end-user documentation included with the redistribution, if any, must
13  * include the following acknowledgment:
14  *
15  * "This product includes software developed by Gargoyle Software Inc.
16  * (http://www.GargoyleSoftware.com/)."
17  *
18  * Alternately, this acknowledgment may appear in the software itself, if
19  * and wherever such third-party acknowledgments normally appear.
20  * 4. The name "Gargoyle Software" must not be used to endorse or promote
21  * products derived from this software without prior written permission.
22  * For written permission, please contact info@GargoyleSoftware.com.
23  * 5. Products derived from this software may not be called "HtmlUnit", nor may
24  * "HtmlUnit" appear in their name, without prior written permission of
25  * Gargoyle Software Inc.
26  *
27  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
28  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
29  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GARGOYLE
30  * SOFTWARE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
31  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
33  * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
36  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37  */

38 package com.gargoylesoftware.htmlunit.html;
39
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Stack;

import org.apache.xerces.parsers.AbstractSAXParser;
import org.apache.xerces.util.DefaultErrorHandler;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.parser.XMLParseException;
import org.cyberneko.html.HTMLConfiguration;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;

import com.gargoylesoftware.htmlunit.Assert;
import com.gargoylesoftware.htmlunit.ObjectInstantiationException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebResponse;
import com.gargoylesoftware.htmlunit.WebWindow;
65
66 /**
67  * SAX parser implementation that uses the neko {@link org.cyberneko.html.HTMLConfiguration}
68  * to parse HTML into a HtmlUnit-specific DOM (HU-DOM) tree.
69  * <p>
70  * <em>Note that the parser currently does not handle CDATA or comment sections, i.e. these
71  * do not appear in the resulting DOM tree</em>
72  *
73  * @version $Revision: 100 $
74  * @author <a HREF="mailto:cse@dynabean.de">Christian Sell</a>
75  * @author David K. Taylor
76  * @author Chris Erskine
77  */

78 public class HTMLParser {
79
80     private static final Map JavaDoc ELEMENT_FACTORIES = new HashMap JavaDoc();
81     private static boolean IgnoreOutsideContent_ = false;
82     
83     static {
84         ELEMENT_FACTORIES.put("input", InputElementFactory.instance);
85
86         putFactory( HtmlAnchor.TAG_NAME, HtmlAnchor.class);
87         putFactory( HtmlApplet.TAG_NAME, HtmlApplet.class);
88         putFactory( HtmlAddress.TAG_NAME, HtmlAddress.class);
89         putFactory( HtmlArea.TAG_NAME, HtmlArea.class);
90         putFactory( HtmlBase.TAG_NAME, HtmlBase.class);
91         putFactory( HtmlBaseFont.TAG_NAME, HtmlBaseFont.class);
92         putFactory( HtmlBidirectionalOverride.TAG_NAME, HtmlBidirectionalOverride.class);
93         putFactory( HtmlBlockQuote.TAG_NAME, HtmlBlockQuote.class);
94         putFactory( HtmlBody.TAG_NAME, HtmlBody.class);
95         putFactory( HtmlBreak.TAG_NAME, HtmlBreak.class);
96         putFactory( HtmlButton.TAG_NAME, HtmlButton.class);
97         putFactory( HtmlCaption.TAG_NAME, HtmlCaption.class);
98         putFactory( HtmlCenter.TAG_NAME, HtmlCenter.class);
99         putFactory( HtmlTableColumn.TAG_NAME, HtmlTableColumn.class);
100         putFactory( HtmlTableColumnGroup.TAG_NAME, HtmlTableColumnGroup.class);
101         putFactory( HtmlDefinitionDescription.TAG_NAME, HtmlDefinitionDescription.class);
102         putFactory( HtmlDeletedText.TAG_NAME, HtmlDeletedText.class);
103         putFactory( HtmlTextDirection.TAG_NAME, HtmlTextDirection.class);
104         putFactory( HtmlDivision.TAG_NAME, HtmlDivision.class);
105         putFactory( HtmlDefinitionList.TAG_NAME, HtmlDefinitionList.class);
106         putFactory( HtmlDefinitionTerm.TAG_NAME, HtmlDefinitionTerm.class);
107         putFactory( HtmlFieldSet.TAG_NAME, HtmlFieldSet.class);
108         putFactory( HtmlFont.TAG_NAME, HtmlFont.class);
109         putFactory( HtmlForm.TAG_NAME, HtmlForm.class);
110         putFactory( HtmlFrame.TAG_NAME, HtmlFrame.class);
111         putFactory( HtmlFrameSet.TAG_NAME, HtmlFrameSet.class);
112         putFactory( HtmlHeader1.TAG_NAME, HtmlHeader1.class);
113         putFactory( HtmlHeader2.TAG_NAME, HtmlHeader2.class);
114         putFactory( HtmlHeader3.TAG_NAME, HtmlHeader3.class);
115         putFactory( HtmlHeader4.TAG_NAME, HtmlHeader4.class);
116         putFactory( HtmlHeader5.TAG_NAME, HtmlHeader5.class);
117         putFactory( HtmlHeader6.TAG_NAME, HtmlHeader6.class);
118         putFactory( HtmlHead.TAG_NAME, HtmlHead.class);
119         putFactory( HtmlHorizontalRule.TAG_NAME, HtmlHorizontalRule.class);
120         putFactory( HtmlHtml.TAG_NAME, HtmlHtml.class);
121         putFactory( HtmlInlineFrame.TAG_NAME, HtmlInlineFrame.class);
122         putFactory( HtmlImage.TAG_NAME, HtmlImage.class);
123         putFactory( HtmlInsertedText.TAG_NAME, HtmlInsertedText.class);
124         putFactory( HtmlIsIndex.TAG_NAME, HtmlIsIndex.class);
125         putFactory( HtmlLabel.TAG_NAME, HtmlLabel.class);
126         putFactory( HtmlLegend.TAG_NAME, HtmlLegend.class);
127         putFactory( HtmlListItem.TAG_NAME, HtmlListItem.class);
128         putFactory( HtmlLink.TAG_NAME, HtmlLink.class);
129         putFactory( HtmlMap.TAG_NAME, HtmlMap.class);
130         putFactory( HtmlMenu.TAG_NAME, HtmlMenu.class);
131         putFactory( HtmlMeta.TAG_NAME, HtmlMeta.class);
132         putFactory( HtmlNoFrames.TAG_NAME, HtmlNoFrames.class);
133         putFactory( HtmlNoScript.TAG_NAME, HtmlNoScript.class);
134         putFactory( HtmlObject.TAG_NAME, HtmlObject.class);
135         putFactory( HtmlOrderedList.TAG_NAME, HtmlOrderedList.class);
136         putFactory( HtmlOptionGroup.TAG_NAME, HtmlOptionGroup.class);
137         putFactory( HtmlOption.TAG_NAME, HtmlOption.class);
138         putFactory( HtmlParagraph.TAG_NAME, HtmlParagraph.class);
139         putFactory( HtmlParameter.TAG_NAME, HtmlParameter.class);
140         putFactory( HtmlPreformattedText.TAG_NAME, HtmlPreformattedText.class);
141         putFactory( HtmlInlineQuotation.TAG_NAME, HtmlInlineQuotation.class);
142         putFactory( HtmlScript.TAG_NAME, HtmlScript.class);
143         putFactory( HtmlSelect.TAG_NAME, HtmlSelect.class);
144         putFactory( HtmlSpan.TAG_NAME, HtmlSpan.class);
145         putFactory( HtmlStyle.TAG_NAME, HtmlStyle.class);
146         putFactory( HtmlTitle.TAG_NAME, HtmlTitle.class);
147
148         putFactory( HtmlTable.TAG_NAME, HtmlTable.class);
149         putFactory( HtmlTableBody.TAG_NAME, HtmlTableBody.class);
150         putFactory( HtmlTableDataCell.TAG_NAME, HtmlTableDataCell.class);
151         putFactory( HtmlTableHeaderCell.TAG_NAME, HtmlTableHeaderCell.class);
152         putFactory( HtmlTableRow.TAG_NAME, HtmlTableRow.class);
153
154         putFactory( HtmlTextArea.TAG_NAME, HtmlTextArea.class);
155         putFactory( HtmlTableFooter.TAG_NAME, HtmlTableFooter.class);
156         putFactory( HtmlTableHeader.TAG_NAME, HtmlTableHeader.class);
157         putFactory( HtmlUnorderedList.TAG_NAME, HtmlUnorderedList.class);
158     }
159
160     private static void putFactory(final String JavaDoc tagName, final Class JavaDoc elementClass) {
161         ELEMENT_FACTORIES.put(tagName, new DefaultElementFactory(elementClass));
162     }
163
164     /**
165      * Set the flag to control validation of the HTML content that is outside of the
166      * BODY and HTML tags. This flag is false by default to maintain compatability with
167      * current NekoHTML defaults.
168      * @param ignoreOutsideContent - boolean flag to set
169      */

170     public static void setIgnoreOutsideContent(final boolean ignoreOutsideContent) {
171         IgnoreOutsideContent_ = ignoreOutsideContent;
172     }
173
174     /**
175      * Get the state of the flag to ignore contant outside the BODY and HTML tags
176      * @return - The current state
177      */

178     public static boolean getIgnoreOutsideContent() {
179         return IgnoreOutsideContent_;
180     }
181
182     /**
183      * @param tagName an HTML element tag name
184      * @return a factory for creating HtmlElements representing the given tag
185      */

186     public static IElementFactory getFactory(final String JavaDoc tagName) {
187         final IElementFactory result = (IElementFactory)ELEMENT_FACTORIES.get(tagName);
188
189         //return result != null ? result : UnknownElementFactory.instance;
190
if(result != null) {
191             return result;
192         }
193         else {
194             return UnknownElementFactory.instance;
195         }
196     }
197
198     /**
199      * You should never need to create one of these!
200      * @deprecated
201      */

202     public HTMLParser() {
203     }
204
205     /**
206      * This method should no longer be used
207      *
208      * @param webClient NOT USED
209      * @param webResponse the response data
210      * @param webWindow the web window into which the page is to be loaded
211      * @return the page object which forms the root of the DOM tree, or <code>null</code> if the &lt;HTML&gt;
212      * tag is missing
213      * @throws java.io.IOException io error
214      * @deprecated
215      */

216     public HtmlPage parse(
217             final WebClient webClient,
218             final WebResponse webResponse,
219             final WebWindow webWindow) throws IOException JavaDoc {
220         return parse(webResponse, webWindow);
221     }
222     /**
223      * parse the HTML content from the given WebResponse into an object tree representation
224      *
225      * @param webResponse the response data
226      * @param webWindow the web window into which the page is to be loaded
227      * @return the page object which forms the root of the DOM tree, or <code>null</code> if the &lt;HTML&gt;
228      * tag is missing
229      * @throws java.io.IOException io error
230      */

231     public static HtmlPage parse(final WebResponse webResponse, final WebWindow webWindow)
232         throws IOException JavaDoc {
233         final HtmlUnitDOMBuilder domBuilder = new HtmlUnitDOMBuilder(webResponse, webWindow);
234         String JavaDoc charSet = webResponse.getContentCharSet();
235         if( isSupportedCharacterSet(charSet) == false ) {
236             charSet = "ISO-8859-1";
237         }
238         final XMLInputSource in = new XMLInputSource(
239                 null,
240                 webResponse.getUrl().toString(),
241                 null,
242                 webResponse.getContentAsStream(),
243                 charSet);
244
245         domBuilder.parse(in);
246         return domBuilder.page_;
247     }
248     
249     /**
250      * <p>Return true if the specified charset is supported on this platform.</p>
251      * @param charset The charset to check.
252      * @return True if this charset is supported.
253      */

254     private static boolean isSupportedCharacterSet( final String JavaDoc charset ) {
255         //TODO: There's got to be a cleaner way to figure out if a given encoding is
256
// supported but I couldn't find it.
257
try {
258             new InputStreamReader JavaDoc( new ByteArrayInputStream JavaDoc(new byte[0]), charset );
259             return true;
260         }
261         catch( final UnsupportedEncodingException JavaDoc e ) {
262             return false;
263         }
264     }
265
266     /**
267      * The parser and DOM builder. This class subclasses Xerces's AbstractSAXParser and implements
268      * the ContentHandler interface. Thus all parser APIs are kept private. The ContentHandler methods
269      * consume SAX events to build the page DOM
270      */

271     private static class HtmlUnitDOMBuilder extends AbstractSAXParser implements ContentHandler JavaDoc /*, LexicalHandler */ {
272
273         private final WebResponse webResponse_;
274         private final WebWindow webWindow_;
275
276         // private final ScriptFilter scriptFilter_;
277

278         private HtmlPage page_;
279
280         private Locator JavaDoc locator_;
281         private final Stack JavaDoc stack_ = new Stack JavaDoc();
282
283         private DomNode currentNode_;
284         private StringBuffer JavaDoc characters_;
285
286         /**
287          * create a new builder for parsing the given response contents
288          * @param webResponse the response data
289          * @param webWindow the web window into which the page is to be loaded
290          */

291         public HtmlUnitDOMBuilder(final WebResponse webResponse, final WebWindow webWindow) {
292             super(new HTMLConfiguration());
293
294             webResponse_ = webResponse;
295             webWindow_ = webWindow;
296
297             final HTMLParserListener listener = webWindow.getWebClient().getHTMLParserListener();
298             final boolean reportErrors;
299             if (listener != null) {
300                 reportErrors = true;
301                 fConfiguration.setErrorHandler(new HTMLErrorHandler(listener, webResponse.getUrl()));
302             }
303             else {
304                 reportErrors = false;
305             }
306
307             try {
308                 setFeature( "http://cyberneko.org/html/features/augmentations", true );
309                 setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
310                 setFeature("http://cyberneko.org/html/features/report-errors", reportErrors);
311                 setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
312                     IgnoreOutsideContent_);
313             }
314             catch (final SAXException JavaDoc e) {
315                 throw new ObjectInstantiationException("unable to create HTML parser", e);
316             }
317         }
318
319         /**
320          * parse the input source. This is the only parse() method that should be called.
321          *
322          * @param inputSource an XMLInputSource
323          * @throws java.io.IOException
324          */

325         public void parse(final XMLInputSource inputSource) throws IOException JavaDoc {
326
327             setContentHandler(this);
328             //setLexicalHandler(this); comments and CDATA
329

330             super.parse(inputSource);
331         }
332
333         /**
334          * @return the document locator
335          */

336         public Locator JavaDoc getLocator() {
337             return locator_;
338         }
339
340         /**
341          * set the document locator
342          * @param locator
343          */

344         public void setDocumentLocator(final Locator JavaDoc locator) {
345             locator_ = locator;
346         }
347
348         /** @inheritDoc ContentHandler#startDocument() */
349         public void startDocument() throws SAXException JavaDoc {
350             page_ = new HtmlPage(webResponse_.getUrl(), webResponse_, webWindow_);
351             webWindow_.setEnclosedPage(page_);
352
353             currentNode_ = page_;
354             stack_.push(currentNode_);
355         }
356
357         /** @inheritDoc ContentHandler#startElement(String,String,String,Attributes) */
358         public void startElement(
359                 final String JavaDoc namespaceURI, final String JavaDoc localName,
360                 final String JavaDoc qName, final Attributes JavaDoc atts)
361             throws SAXException JavaDoc {
362
363             handleCharacters();
364
365             final String JavaDoc tagLower = localName.toLowerCase();
366             final IElementFactory factory = getElementFactory(tagLower);
367             HtmlElement newElement = factory.createElement(page_, tagLower, atts);
368             currentNode_.appendChild(newElement);
369             currentNode_ = newElement;
370             stack_.push(currentNode_);
371         }
372
373         /** @inheritDoc ContentHandler@endElement(String,String,String) */
374         public void endElement(final String JavaDoc namespaceURI, final String JavaDoc localName, final String JavaDoc qName)
375             throws SAXException JavaDoc {
376
377             handleCharacters();
378             stack_.pop(); //remove currentElement from stack
379

380             if(!stack_.isEmpty()) {
381                 currentNode_ = (DomNode)stack_.peek();
382             }
383         }
384
385         /** @inheritDoc ContentHandler#characters(char,int,int) */
386         public void characters(final char ch[], final int start, final int length) throws SAXException JavaDoc {
387
388             if(characters_ == null) {
389                 characters_ = new StringBuffer JavaDoc();
390             }
391             characters_.append(ch, start, length);
392         }
393
394         /** @inheritDoc ContentHandler#ignorableWhitespace(char,int,int) */
395         public void ignorableWhitespace(final char ch[], final int start, final int length) throws SAXException JavaDoc {
396
397             if(characters_ == null) {
398                 characters_ = new StringBuffer JavaDoc();
399             }
400             characters_.append(ch, start, length);
401         }
402
403         /**
404          * pick up the chacracter data accumulated so far and add it to the
405          * current element as a text node
406          */

407         private void handleCharacters() {
408
409             if(characters_ != null && characters_.length() > 0) {
410                 final DomText text = new DomText(page_, characters_.toString());
411                 currentNode_.appendChild(text);
412                 characters_.setLength(0);
413             }
414         }
415
416         /**
417          * @param tagName an HTML tag name, in lowercase
418          * @return the pre-registered element factory for the tag, or an UnknownElementFactory
419          */

420         private IElementFactory getElementFactory(final String JavaDoc tagName) {
421
422             final IElementFactory factory = (IElementFactory)ELEMENT_FACTORIES.get(tagName);
423
424             //return factory != null ? factory : UnknownElementFactory.instance;
425
if(factory != null) {
426                 return factory;
427             }
428             else {
429                 return UnknownElementFactory.instance;
430             }
431         }
432
433         /** @inheritDoc ContentHandler#endDocument() */
434         public void endDocument() throws SAXException JavaDoc {
435         }
436
437         /** @inheritDoc ContentHandler#startPrefixMapping(String,String) */
438         public void startPrefixMapping(final String JavaDoc prefix, final String JavaDoc uri) throws SAXException JavaDoc {
439         }
440
441         /** @inheritDoc ContentHandler#endPrefixMapping(String) */
442         public void endPrefixMapping(final String JavaDoc prefix) throws SAXException JavaDoc {
443         }
444
445         /** @inheritDoc ContentHandler#processingInstrucction(String,String) */
446         public void processingInstruction(final String JavaDoc target, final String JavaDoc data) throws SAXException JavaDoc {
447         }
448
449         /** @inheritDoc ContentHandler#skippedEntity(String) */
450         public void skippedEntity(final String JavaDoc name) throws SAXException JavaDoc {
451         }
452     }
453 }
454
455 /**
456  * Utility to transmit parsing errors to a {@link HTMLParserListener}.
457  */

458 class HTMLErrorHandler extends DefaultErrorHandler {
459     private final HTMLParserListener listener_;
460     private final URL JavaDoc url_;
461
462     HTMLErrorHandler(final HTMLParserListener listener, final URL JavaDoc url) {
463         Assert.notNull("listener", listener);
464         Assert.notNull("url", url);
465         listener_ = listener;
466         url_ = url;
467     }
468
469     /** @see DefaultErrorHandler#error(String,String,XMLParseException) */
470     public void error(final String JavaDoc domain, final String JavaDoc key,
471             final XMLParseException exception) throws XNIException {
472         listener_.error(exception.getMessage(),
473                 url_,
474                 exception.getLineNumber(),
475                 exception.getColumnNumber(),
476                 key);
477     }
478
479     /** @see DefaultErrorHandler#warning(String,String,XMLParseException) */
480     public void warning(final String JavaDoc domain, final String JavaDoc key,
481             final XMLParseException exception) throws XNIException {
482         listener_.warning(exception.getMessage(),
483                 url_,
484                 exception.getLineNumber(),
485                 exception.getColumnNumber(),
486                 key);
487     }
488 }
489
Popular Tags