1 38 package com.gargoylesoftware.htmlunit.html; 39 40 import java.io.ByteArrayInputStream ; 41 import java.io.IOException ; 42 import java.io.InputStreamReader ; 43 import java.io.UnsupportedEncodingException ; 44 import java.net.URL ; 45 import java.util.HashMap ; 46 import java.util.Map ; 47 import java.util.Stack ; 48 49 import org.apache.xerces.parsers.AbstractSAXParser; 50 import org.apache.xerces.util.DefaultErrorHandler; 51 import org.apache.xerces.xni.XNIException; 52 import org.apache.xerces.xni.parser.XMLInputSource; 53 import org.apache.xerces.xni.parser.XMLParseException; 54 import org.cyberneko.html.HTMLConfiguration; 55 import org.xml.sax.Attributes ; 56 import org.xml.sax.ContentHandler ; 57 import org.xml.sax.Locator ; 58 import org.xml.sax.SAXException ; 59 60 import com.gargoylesoftware.htmlunit.Assert; 61 import com.gargoylesoftware.htmlunit.ObjectInstantiationException; 62 import com.gargoylesoftware.htmlunit.WebClient; 63 import com.gargoylesoftware.htmlunit.WebResponse; 64 import com.gargoylesoftware.htmlunit.WebWindow; 65 66 78 public class HTMLParser { 79 80 private static final Map ELEMENT_FACTORIES = new HashMap (); 81 private static boolean IgnoreOutsideContent_ = false; 82 83 static { 84 ELEMENT_FACTORIES.put("input", InputElementFactory.instance); 85 86 putFactory( HtmlAnchor.TAG_NAME, HtmlAnchor.class); 87 putFactory( HtmlApplet.TAG_NAME, HtmlApplet.class); 88 putFactory( HtmlAddress.TAG_NAME, HtmlAddress.class); 89 putFactory( HtmlArea.TAG_NAME, HtmlArea.class); 90 putFactory( HtmlBase.TAG_NAME, HtmlBase.class); 91 putFactory( HtmlBaseFont.TAG_NAME, HtmlBaseFont.class); 92 putFactory( HtmlBidirectionalOverride.TAG_NAME, HtmlBidirectionalOverride.class); 93 putFactory( HtmlBlockQuote.TAG_NAME, HtmlBlockQuote.class); 94 putFactory( HtmlBody.TAG_NAME, HtmlBody.class); 95 putFactory( HtmlBreak.TAG_NAME, HtmlBreak.class); 96 putFactory( HtmlButton.TAG_NAME, HtmlButton.class); 97 putFactory( HtmlCaption.TAG_NAME, HtmlCaption.class); 98 putFactory( HtmlCenter.TAG_NAME, HtmlCenter.class); 99 putFactory( HtmlTableColumn.TAG_NAME, HtmlTableColumn.class); 100 putFactory( HtmlTableColumnGroup.TAG_NAME, HtmlTableColumnGroup.class); 101 putFactory( HtmlDefinitionDescription.TAG_NAME, HtmlDefinitionDescription.class); 102 putFactory( HtmlDeletedText.TAG_NAME, HtmlDeletedText.class); 103 putFactory( HtmlTextDirection.TAG_NAME, HtmlTextDirection.class); 104 putFactory( HtmlDivision.TAG_NAME, HtmlDivision.class); 105 putFactory( HtmlDefinitionList.TAG_NAME, HtmlDefinitionList.class); 106 putFactory( HtmlDefinitionTerm.TAG_NAME, HtmlDefinitionTerm.class); 107 putFactory( HtmlFieldSet.TAG_NAME, HtmlFieldSet.class); 108 putFactory( HtmlFont.TAG_NAME, HtmlFont.class); 109 putFactory( HtmlForm.TAG_NAME, HtmlForm.class); 110 putFactory( HtmlFrame.TAG_NAME, HtmlFrame.class); 111 putFactory( HtmlFrameSet.TAG_NAME, HtmlFrameSet.class); 112 putFactory( HtmlHeader1.TAG_NAME, HtmlHeader1.class); 113 putFactory( HtmlHeader2.TAG_NAME, HtmlHeader2.class); 114 putFactory( HtmlHeader3.TAG_NAME, HtmlHeader3.class); 115 putFactory( HtmlHeader4.TAG_NAME, HtmlHeader4.class); 116 putFactory( HtmlHeader5.TAG_NAME, HtmlHeader5.class); 117 putFactory( HtmlHeader6.TAG_NAME, HtmlHeader6.class); 118 putFactory( HtmlHead.TAG_NAME, HtmlHead.class); 119 putFactory( HtmlHorizontalRule.TAG_NAME, HtmlHorizontalRule.class); 120 putFactory( HtmlHtml.TAG_NAME, HtmlHtml.class); 121 putFactory( HtmlInlineFrame.TAG_NAME, HtmlInlineFrame.class); 122 putFactory( HtmlImage.TAG_NAME, HtmlImage.class); 123 putFactory( HtmlInsertedText.TAG_NAME, HtmlInsertedText.class); 124 putFactory( HtmlIsIndex.TAG_NAME, HtmlIsIndex.class); 125 putFactory( HtmlLabel.TAG_NAME, HtmlLabel.class); 126 putFactory( HtmlLegend.TAG_NAME, HtmlLegend.class); 127 putFactory( HtmlListItem.TAG_NAME, HtmlListItem.class); 128 putFactory( HtmlLink.TAG_NAME, HtmlLink.class); 129 putFactory( HtmlMap.TAG_NAME, HtmlMap.class); 130 putFactory( HtmlMenu.TAG_NAME, HtmlMenu.class); 131 putFactory( HtmlMeta.TAG_NAME, HtmlMeta.class); 132 putFactory( HtmlNoFrames.TAG_NAME, HtmlNoFrames.class); 133 putFactory( HtmlNoScript.TAG_NAME, HtmlNoScript.class); 134 putFactory( HtmlObject.TAG_NAME, HtmlObject.class); 135 putFactory( HtmlOrderedList.TAG_NAME, HtmlOrderedList.class); 136 putFactory( HtmlOptionGroup.TAG_NAME, HtmlOptionGroup.class); 137 putFactory( HtmlOption.TAG_NAME, HtmlOption.class); 138 putFactory( HtmlParagraph.TAG_NAME, HtmlParagraph.class); 139 putFactory( HtmlParameter.TAG_NAME, HtmlParameter.class); 140 putFactory( HtmlPreformattedText.TAG_NAME, HtmlPreformattedText.class); 141 putFactory( HtmlInlineQuotation.TAG_NAME, HtmlInlineQuotation.class); 142 putFactory( HtmlScript.TAG_NAME, HtmlScript.class); 143 putFactory( HtmlSelect.TAG_NAME, HtmlSelect.class); 144 putFactory( HtmlSpan.TAG_NAME, HtmlSpan.class); 145 putFactory( HtmlStyle.TAG_NAME, HtmlStyle.class); 146 putFactory( HtmlTitle.TAG_NAME, HtmlTitle.class); 147 148 putFactory( HtmlTable.TAG_NAME, HtmlTable.class); 149 putFactory( HtmlTableBody.TAG_NAME, HtmlTableBody.class); 150 putFactory( HtmlTableDataCell.TAG_NAME, HtmlTableDataCell.class); 151 putFactory( HtmlTableHeaderCell.TAG_NAME, HtmlTableHeaderCell.class); 152 putFactory( HtmlTableRow.TAG_NAME, HtmlTableRow.class); 153 154 putFactory( HtmlTextArea.TAG_NAME, HtmlTextArea.class); 155 putFactory( HtmlTableFooter.TAG_NAME, HtmlTableFooter.class); 156 putFactory( HtmlTableHeader.TAG_NAME, HtmlTableHeader.class); 157 putFactory( HtmlUnorderedList.TAG_NAME, HtmlUnorderedList.class); 158 } 159 160 private static void putFactory(final String tagName, final Class elementClass) { 161 ELEMENT_FACTORIES.put(tagName, new DefaultElementFactory(elementClass)); 162 } 163 164 170 public static void setIgnoreOutsideContent(final boolean ignoreOutsideContent) { 171 IgnoreOutsideContent_ = ignoreOutsideContent; 172 } 173 174 178 public static boolean getIgnoreOutsideContent() { 179 return IgnoreOutsideContent_; 180 } 181 182 186 public static IElementFactory getFactory(final String tagName) { 187 final IElementFactory result = (IElementFactory)ELEMENT_FACTORIES.get(tagName); 188 189 if(result != null) { 191 return result; 192 } 193 else { 194 return UnknownElementFactory.instance; 195 } 196 } 197 198 202 public HTMLParser() { 203 } 204 205 216 public HtmlPage parse( 217 final WebClient webClient, 218 final WebResponse webResponse, 219 final WebWindow webWindow) throws IOException { 220 return parse(webResponse, webWindow); 221 } 222 231 public static HtmlPage parse(final WebResponse webResponse, final WebWindow webWindow) 232 throws IOException { 233 final HtmlUnitDOMBuilder domBuilder = new HtmlUnitDOMBuilder(webResponse, webWindow); 234 String charSet = webResponse.getContentCharSet(); 235 if( isSupportedCharacterSet(charSet) == false ) { 236 charSet = "ISO-8859-1"; 237 } 238 final XMLInputSource in = new XMLInputSource( 239 null, 240 webResponse.getUrl().toString(), 241 null, 242 webResponse.getContentAsStream(), 243 charSet); 244 245 domBuilder.parse(in); 246 return domBuilder.page_; 247 } 248 249 254 private static boolean isSupportedCharacterSet( final String charset ) { 255 try { 258 new InputStreamReader ( new ByteArrayInputStream (new byte[0]), charset ); 259 return true; 260 } 261 catch( final UnsupportedEncodingException e ) { 262 return false; 263 } 264 } 265 266 271 private static class HtmlUnitDOMBuilder extends AbstractSAXParser implements ContentHandler { 272 273 private final WebResponse webResponse_; 274 private final WebWindow webWindow_; 275 276 278 private HtmlPage page_; 279 280 private Locator locator_; 281 private final Stack stack_ = new Stack (); 282 283 private DomNode currentNode_; 284 private StringBuffer characters_; 285 286 291 public HtmlUnitDOMBuilder(final WebResponse webResponse, final WebWindow webWindow) { 292 super(new HTMLConfiguration()); 293 294 webResponse_ = webResponse; 295 webWindow_ = webWindow; 296 297 final HTMLParserListener listener = webWindow.getWebClient().getHTMLParserListener(); 298 final boolean reportErrors; 299 if (listener != null) { 300 reportErrors = true; 301 fConfiguration.setErrorHandler(new HTMLErrorHandler(listener, webResponse.getUrl())); 302 } 303 else { 304 reportErrors = false; 305 } 306 307 try { 308 setFeature( "http://cyberneko.org/html/features/augmentations", true ); 309 setProperty("http://cyberneko.org/html/properties/names/elems", "lower"); 310 setFeature("http://cyberneko.org/html/features/report-errors", reportErrors); 311 setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", 312 IgnoreOutsideContent_); 313 } 314 catch (final SAXException e) { 315 throw new ObjectInstantiationException("unable to create HTML parser", e); 316 } 317 } 318 319 325 public void parse(final XMLInputSource inputSource) throws IOException { 326 327 setContentHandler(this); 328 330 super.parse(inputSource); 331 } 332 333 336 public Locator getLocator() { 337 return locator_; 338 } 339 340 344 public void setDocumentLocator(final Locator locator) { 345 locator_ = locator; 346 } 347 348 349 public void startDocument() throws SAXException { 350 page_ = new HtmlPage(webResponse_.getUrl(), webResponse_, webWindow_); 351 webWindow_.setEnclosedPage(page_); 352 353 currentNode_ = page_; 354 stack_.push(currentNode_); 355 } 356 357 358 public void startElement( 359 final String namespaceURI, final String localName, 360 final String qName, final Attributes atts) 361 throws SAXException { 362 363 handleCharacters(); 364 365 final String tagLower = localName.toLowerCase(); 366 final IElementFactory factory = getElementFactory(tagLower); 367 HtmlElement newElement = factory.createElement(page_, tagLower, atts); 368 currentNode_.appendChild(newElement); 369 currentNode_ = newElement; 370 stack_.push(currentNode_); 371 } 372 373 374 public void endElement(final String namespaceURI, final String localName, final String qName) 375 throws SAXException { 376 377 handleCharacters(); 378 stack_.pop(); 380 if(!stack_.isEmpty()) { 381 currentNode_ = (DomNode)stack_.peek(); 382 } 383 } 384 385 386 public void characters(final char ch[], final int start, final int length) throws SAXException { 387 388 if(characters_ == null) { 389 characters_ = new StringBuffer (); 390 } 391 characters_.append(ch, start, length); 392 } 393 394 395 public void ignorableWhitespace(final char ch[], final int start, final int length) throws SAXException { 396 397 if(characters_ == null) { 398 characters_ = new StringBuffer (); 399 } 400 characters_.append(ch, start, length); 401 } 402 403 407 private void handleCharacters() { 408 409 if(characters_ != null && characters_.length() > 0) { 410 final DomText text = new DomText(page_, characters_.toString()); 411 currentNode_.appendChild(text); 412 characters_.setLength(0); 413 } 414 } 415 416 420 private IElementFactory getElementFactory(final String tagName) { 421 422 final IElementFactory factory = (IElementFactory)ELEMENT_FACTORIES.get(tagName); 423 424 if(factory != null) { 426 return factory; 427 } 428 else { 429 return UnknownElementFactory.instance; 430 } 431 } 432 433 434 public void endDocument() throws SAXException { 435 } 436 437 438 public void startPrefixMapping(final String prefix, final String uri) throws SAXException { 439 } 440 441 442 public void endPrefixMapping(final String prefix) throws SAXException { 443 } 444 445 446 public void processingInstruction(final String target, final String data) throws SAXException { 447 } 448 449 450 public void skippedEntity(final String name) throws SAXException { 451 } 452 } 453 } 454 455 458 class HTMLErrorHandler extends DefaultErrorHandler { 459 private final HTMLParserListener listener_; 460 private final URL url_; 461 462 HTMLErrorHandler(final HTMLParserListener listener, final URL url) { 463 Assert.notNull("listener", listener); 464 Assert.notNull("url", url); 465 listener_ = listener; 466 url_ = url; 467 } 468 469 470 public void error(final String domain, final String key, 471 final XMLParseException exception) throws XNIException { 472 listener_.error(exception.getMessage(), 473 url_, 474 exception.getLineNumber(), 475 exception.getColumnNumber(), 476 key); 477 } 478 479 480 public void warning(final String domain, final String key, 481 final XMLParseException exception) throws XNIException { 482 listener_.warning(exception.getMessage(), 483 url_, 484 exception.getLineNumber(), 485 exception.getColumnNumber(), 486 key); 487 } 488 } 489 | Popular Tags |