1 27 package org.htmlparser; 28 29 import java.io.Serializable ; 30 import java.net.HttpURLConnection ; 31 import java.net.URLConnection ; 32 33 import org.htmlparser.filters.TagNameFilter; 34 import org.htmlparser.filters.NodeClassFilter; 35 import org.htmlparser.http.ConnectionManager; 36 import org.htmlparser.http.ConnectionMonitor; 37 import org.htmlparser.lexer.Lexer; 38 import org.htmlparser.lexer.Page; 39 import org.htmlparser.util.DefaultParserFeedback; 40 import org.htmlparser.util.IteratorImpl; 41 import org.htmlparser.util.NodeIterator; 42 import org.htmlparser.util.NodeList; 43 import org.htmlparser.util.ParserException; 44 import org.htmlparser.util.ParserFeedback; 45 import org.htmlparser.visitors.NodeVisitor; 46 47 59 public class Parser 60 implements 61 Serializable , 62 ConnectionMonitor 63 { 64 67 70 public final static double 71 VERSION_NUMBER = 1.5 72 ; 73 74 77 public final static String 78 VERSION_TYPE = "Integration Build" 79 ; 80 81 84 public final static String 85 VERSION_DATE = "Mar 13, 2005" 86 ; 87 88 91 public final static String 92 VERSION_STRING = "" + VERSION_NUMBER + " (" + VERSION_TYPE + " " + VERSION_DATE + ")" 93 ; 94 95 97 100 protected ParserFeedback mFeedback; 101 102 105 protected Lexer mLexer; 106 107 111 public static ParserFeedback noFeedback = new DefaultParserFeedback (DefaultParserFeedback.QUIET); 112 113 117 public static ParserFeedback stdout = new DefaultParserFeedback (); 118 119 123 130 public static String getVersion () 131 { 132 return (VERSION_STRING); 133 } 134 135 140 public static double getVersionNumber () 141 { 142 return (VERSION_NUMBER); 143 } 144 145 149 public static ConnectionManager getConnectionManager () 150 { 151 return (Page.getConnectionManager ()); 152 } 153 154 158 public static void setConnectionManager (ConnectionManager manager) 159 { 160 Page.setConnectionManager (manager); 161 } 162 163 171 public static Parser createParser (String html, String charset) 172 { 173 Parser ret; 174 175 if (null == html) 176 throw new IllegalArgumentException ("html cannot be null"); 177 ret = new Parser (new Lexer (new Page (html, charset))); 178 179 return (ret); 180 } 181 182 186 193 public Parser () 194 { 195 this (new Lexer (new Page ("")), noFeedback); 196 } 197 198 217 public Parser (Lexer lexer, ParserFeedback fb) 218 { 219 setFeedback (fb); 220 if (null == lexer) 221 throw new IllegalArgumentException ("lexer cannot be null"); 222 setLexer (lexer); 223 setNodeFactory (new PrototypicalNodeFactory ()); 224 } 225 226 232 public Parser (URLConnection connection, ParserFeedback fb) 233 throws 234 ParserException 235 { 236 this (new Lexer (connection), fb); 237 } 238 239 249 public Parser (String resourceLocn, ParserFeedback feedback) throws ParserException 250 { 251 this (getConnectionManager ().openConnection (resourceLocn), feedback); 252 } 253 254 259 public Parser (String resourceLocn) throws ParserException 260 { 261 this (resourceLocn, stdout); 262 } 263 264 279 public Parser (Lexer lexer) 280 { 281 this (lexer, stdout); 282 } 283 284 291 public Parser (URLConnection connection) throws ParserException 292 { 293 this (connection, stdout); 294 } 295 296 300 311 public void setConnection (URLConnection connection) 312 throws 313 ParserException 314 { 315 if (null != connection) 316 setLexer (new Lexer (connection)); 317 } 318 319 325 public URLConnection getConnection () 326 { 327 return (getLexer ().getPage ().getConnection ()); 328 } 329 330 336 public void setURL (String url) 337 throws 338 ParserException 339 { 340 if ((null != url) && !"".equals (url)) 341 setConnection (Page.getConnectionManager ().openConnection (url)); 342 } 343 344 349 public String getURL () 350 { 351 return (getLexer ().getPage ().getUrl ()); 352 } 353 354 358 public void setEncoding (String encoding) 359 throws 360 ParserException 361 { 362 getLexer ().getPage ().setEncoding (encoding); 363 } 364 365 370 public String getEncoding () 371 { 372 return (getLexer ().getPage ().getEncoding ()); 373 } 374 375 383 public void setLexer (Lexer lexer) 384 { 385 NodeFactory factory; 386 String type; 387 388 if (null != lexer) 389 { factory = null; 391 if (null != getLexer ()) 392 factory = getLexer ().getNodeFactory (); 393 if (null != factory) 394 lexer.setNodeFactory (factory); 395 mLexer = lexer; 396 type = mLexer.getPage ().getContentType (); 398 if (type != null && !type.startsWith ("text")) 399 getFeedback ().warning ( 400 "URL " 401 + mLexer.getPage ().getUrl () 402 + " does not contain text"); 403 } 404 } 405 406 410 public Lexer getLexer () 411 { 412 return (mLexer); 413 } 414 415 419 public NodeFactory getNodeFactory () 420 { 421 return (getLexer ().getNodeFactory ()); 422 } 423 424 428 public void setNodeFactory (NodeFactory factory) 429 { 430 if (null == factory) 431 throw new IllegalArgumentException ("node factory cannot be null"); 432 getLexer ().setNodeFactory (factory); 433 } 434 435 439 public void setFeedback (ParserFeedback fb) 440 { 441 mFeedback = (null == fb) ? noFeedback : fb; 442 } 443 444 448 public ParserFeedback getFeedback() 449 { 450 return (mFeedback); 451 } 452 453 457 460 public void reset () 461 { 462 getLexer ().reset (); 463 } 464 465 488 public NodeIterator elements () throws ParserException 489 { 490 return (new IteratorImpl (getLexer (), getFeedback ())); 491 } 492 493 499 public NodeList parse (NodeFilter filter) throws ParserException 500 { 501 NodeIterator e; 502 Node node; 503 NodeList ret; 504 505 ret = new NodeList (); 506 for (e = elements (); e.hasMoreNodes (); ) 507 { 508 node = e.nextNode (); 509 if (null != filter) 510 node.collectInto (ret, filter); 511 else 512 ret.add (node); 513 } 514 515 return (ret); 516 } 517 518 public void visitAllNodesWith(NodeVisitor visitor) throws ParserException { 519 Node node; 520 visitor.beginParsing(); 521 for (NodeIterator e = elements();e.hasMoreNodes();) { 522 node = e.nextNode(); 523 node.accept(visitor); 524 } 525 visitor.finishedParsing(); 526 } 527 528 532 public void setInputHTML (String inputHTML) 533 throws 534 ParserException 535 { 536 if (null == inputHTML) 537 throw new IllegalArgumentException ("html cannot be null"); 538 if (!"".equals (inputHTML)) 539 setLexer (new Lexer (new Page (inputHTML))); 540 } 541 542 546 public NodeList extractAllNodesThatMatch (NodeFilter filter) throws ParserException 547 { 548 NodeIterator e; 549 NodeList ret; 550 551 ret = new NodeList (); 552 for (e = elements (); e.hasMoreNodes (); ) 553 e.nextNode ().collectInto (ret, filter); 554 555 return (ret); 556 } 557 558 562 public Node [] extractAllNodesThatAre (Class nodeType) throws ParserException 563 { 564 NodeList ret; 565 566 ret = extractAllNodesThatMatch (new NodeClassFilter (nodeType)); 567 568 return (ret.toNodeArray ()); 569 } 570 571 575 584 public void preConnect (HttpURLConnection connection) 585 throws 586 ParserException 587 { 588 if (null != getFeedback ()) 589 getFeedback ().info (ConnectionManager.getRequestHeader (connection)); 590 } 591 592 598 public void postConnect (HttpURLConnection connection) 599 throws 600 ParserException 601 { 602 if (null != getFeedback ()) 603 getFeedback ().info (ConnectionManager.getResponseHeader (connection)); 604 } 605 606 609 public static void main (String [] args) 610 { 611 Parser parser; 612 NodeFilter filter; 613 614 if (args.length < 1 || args[0].equals ("-help")) 615 { 616 System.out.println ("HTML Parser v" + VERSION_STRING + "\n"); 617 System.out.println (); 618 System.out.println ("Syntax : java -jar htmlparser.jar <resourceLocn/website> [node_type]"); 619 System.out.println (" <resourceLocn/website> the URL or file to be parsed"); 620 System.out.println (" node_type an optional node name, for example:"); 621 System.out.println (" A - Show only the link tags extracted from the document"); 622 System.out.println (" IMG - Show only the image tags extracted from the document"); 623 System.out.println (" TITLE - Extract the title from the document"); 624 System.out.println (); 625 System.out.println ("Example : java -jar htmlparser.jar http://www.yahoo.com"); 626 System.out.println (); 627 System.out.println ("For support, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page..."); 628 System.out.println ("HTML Parser home page : http://htmlparser.org"); 629 System.out.println (); 630 } 631 else 632 try 633 { 634 parser = new Parser (); 635 if (1 < args.length) 636 filter = new TagNameFilter (args[1]); 637 else 638 { filter = null; 640 parser.setFeedback (Parser.stdout); 641 getConnectionManager ().setMonitor (parser); 642 } 643 parser.setURL (args[0]); 644 System.out.println (parser.parse (filter)); 645 } 646 catch (ParserException e) 647 { 648 e.printStackTrace (); 649 } 650 } 651 } 652 | Popular Tags |