Parser


// HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v $
// $Author: derrickoswald $
// $Date: 2005/03/13 15:36:11 $
// $Revision: 1.103 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser;

import java.io.Serializable;
import java.net.HttpURLConnection;
import java.net.URLConnection;

import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.http.ConnectionManager;
import org.htmlparser.http.ConnectionMonitor;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.IteratorImpl;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.ParserFeedback;
import org.htmlparser.visitors.NodeVisitor;

47  /**
48   * This is the class that the user will use, either to get an iterator into
49   * the html page or to directly parse the page and print the results
50   * <BR>
51   * Typical usage of the parser is as follows : <BR>
52   * [1] Create a parser object - passing the URL and a feedback object to the parser<BR>
53   * [2] Enumerate through the elements from the parser object <BR>
54   * It is important to note that the parsing occurs when you enumerate, ON DEMAND.
55   * This is a thread-safe way, and you only get the control back after a
56   * particular element is parsed and returned, which could be the entire body.
57   * @see Parser#elements()
58   */
59  public class Parser
60      implements
61          Serializable  ,
62          ConnectionMonitor
63  {
64      // Please don't change the formatting of the version variables below.
65      // This is done so as to facilitate ant script processing.
66  
67      /**
68       * The floating point version number.
69       */
70      public final static double
71      VERSION_NUMBER = 1.5
72      ;
73  
74      /**
75       * The type of version.
76       */
77      public final static String  
78      VERSION_TYPE = "Integration Build"
79      ;
80  
81      /**
82       * The date of the version.
83       */
84      public final static String  
85      VERSION_DATE = "Mar 13, 2005"
86      ;
87  
88      /**
89       * The display version.
90       */
91      public final static String  
92      VERSION_STRING = "" + VERSION_NUMBER + " (" + VERSION_TYPE + " " + VERSION_DATE + ")"
93      ;
94  
95      // End of formatting
96  
97      /**
98       * Feedback object.
99       */
100     protected ParserFeedback mFeedback;
101 
102     /**
103      * The html lexer associated with this parser.
104      */
105     protected Lexer mLexer;
106 
107     /**
108      * A quiet message sink.
109      * Use this for no feedback.
110      */
111     public static ParserFeedback noFeedback = new DefaultParserFeedback (DefaultParserFeedback.QUIET);
112 
113     /**
114      * A verbose message sink.
115      * Use this for output on <code>System.out</code>.
116      */
117     public static ParserFeedback stdout = new DefaultParserFeedback ();
118 
119     //
120     // Static methods
121     //
122 
123     /**
124      * Return the version string of this parser.
125      * @return A string of the form:
126      * <pre>
127      * "[floating point number] ([build-type] [build-date])"
128      * </pre>
129      */
130     public static String   getVersion ()
131     {
132         return (VERSION_STRING);
133     }
134 
135     /**
136      * Return the version number of this parser.
137      * @return A floating point number, the whole number part is the major
138      * version, and the fractional part is the minor version.
139      */
140     public static double getVersionNumber ()
141     {
142         return (VERSION_NUMBER);
143     }
144 
145     /**
146      * Get the connection manager all Parsers use.
147      * @return The connection manager.
148      */
149     public static ConnectionManager getConnectionManager ()
150     {
151         return (Page.getConnectionManager ());
152     }
153 
154     /**
155      * Set the connection manager all Parsers use.
156      * @param manager The new connection manager.
157      */
158     public static void setConnectionManager (ConnectionManager manager)
159     {
160         Page.setConnectionManager (manager);
161     }
162 
163     /**
164      * Creates the parser on an input string.
165      * @param html The string containing HTML.
166      * @param charset <em>Optional</em>. The character set encoding that will
167      * be reported by {@link #getEncoding}. If charset is <code>null</code>
168      * the default character set is used.
169      * @return A parser with the <code>html</code> string as input.
170      */
171     public static Parser createParser (String   html, String   charset)
172     {
173         Parser ret;
174 
175         if (null == html)
176             throw new IllegalArgumentException   ("html cannot be null");
177         ret = new Parser (new Lexer (new Page (html, charset)));
178 
179         return (ret);
180     }
181 
182     //
183     // Constructors
184     //
185 
186     /**
187      * Zero argument constructor.
188      * The parser is in a safe but useless state.
189      * Set the lexer or connection using setLexer() or setConnection().
190      * @see #setLexer(Lexer)
191      * @see #setConnection(URLConnection)
192      */
193     public Parser ()
194     {
195         this (new Lexer (new Page ("")), noFeedback);
196     }
197 
198     /**
199      * This constructor enables the construction of test cases, with readers
200      * associated with test string buffers. It can also be used with readers of the user's choice
201      * streaming data into the parser.<p/>
202      * <B>Important:</B> If you are using this constructor, and you would like to use the parser
203      * to parse multiple times (multiple calls to parser.elements()), you must ensure the following:<br>
204      * <ul>
205      * <li>Before the first parse, you must mark the reader for a length that you anticipate (the size of the stream).</li>
206      * <li>After the first parse, calls to elements() must be preceded by calls to :
207      * <pre>
208      * parser.getReader().reset();
209      * </pre>
210      * </li>
211      * </ul>
212      * @param lexer The lexer to draw characters from.
213      * @param fb The object to use when information,
214      * warning and error messages are produced. If <em>null</em> no feedback
215      * is provided.
216      */
217     public Parser (Lexer lexer, ParserFeedback fb)
218     {
219         setFeedback (fb);
220         if (null == lexer)
221             throw new IllegalArgumentException   ("lexer cannot be null");
222         setLexer (lexer);
223         setNodeFactory (new PrototypicalNodeFactory ());
224     }
225 
226     /**
227      * Constructor for custom HTTP access.
228      * @param connection A fully conditioned connection. The connect()
229      * method will be called so it need not be connected yet.
230      * @param fb The object to use for message communication.
231      */
232     public Parser (URLConnection   connection, ParserFeedback fb)
233         throws
234             ParserException
235     {
236         this (new Lexer (connection), fb);
237     }
238 
239     /**
240      * Creates a Parser object with the location of the resource (URL or file)
241      * You would typically create a DefaultHTMLParserFeedback object and pass it in.
242      * @param resourceLocn Either the URL or the filename (autodetects).
243      * A standard HTTP GET is performed to read the content of the URL.
244      * @param feedback The HTMLParserFeedback object to use when information,
245      * warning and error messages are produced. If <em>null</em> no feedback
246      * is provided.
247      * @see #Parser(URLConnection,ParserFeedback)
248      */
249     public Parser (String   resourceLocn, ParserFeedback feedback) throws ParserException
250     {
251         this (getConnectionManager ().openConnection (resourceLocn), feedback);
252     }
253 
254     /**
255      * Creates a Parser object with the location of the resource (URL or file).
256      * A DefaultHTMLParserFeedback object is used for feedback.
257      * @param resourceLocn Either the URL or the filename (autodetects).
258      */
259     public Parser (String   resourceLocn) throws ParserException
260     {
261         this (resourceLocn, stdout);
262     }
263 
264     /**
265      * This constructor is present to enable users to plugin their own lexers.
266      * A DefaultHTMLParserFeedback object is used for feedback. It can also be used with readers of the user's choice
267      * streaming data into the parser.<p/>
268      * <B>Important:</B> If you are using this constructor, and you would like to use the parser
269      * to parse multiple times (multiple calls to parser.elements()), you must ensure the following:<br>
270      * <ul>
271      * <li>Before the first parse, you must mark the reader for a length that you anticipate (the size of the stream).</li>
272      * <li>After the first parse, calls to elements() must be preceded by calls to :
273      * <pre>
274      * parser.getReader().reset();
275      * </pre>
276      * </li>
277      * @param lexer The source for HTML to be parsed.
278      */
279     public Parser (Lexer lexer)
280     {
281         this (lexer, stdout);
282     }
283 
284     /**
285      * Constructor for non-standard access.
286      * A DefaultHTMLParserFeedback object is used for feedback.
287      * @param connection A fully conditioned connection. The connect()
288      * method will be called so it need not be connected yet.
289      * @see #Parser(URLConnection,ParserFeedback)
290      */
291     public Parser (URLConnection   connection) throws ParserException
292     {
293         this (connection, stdout);
294     }
295 
296     //
297     // Bean patterns
298     //
299 
300     /**
301      * Set the connection for this parser.
302      * This method creates a new <code>Lexer</code> reading from the connection.
303      * Trying to set the connection to null is a noop.
304      * @param connection A fully conditioned connection. The connect()
305      * method will be called so it need not be connected yet.
306      * @exception ParserException if the character set specified in the
307      * HTTP header is not supported, or an i/o exception occurs creating the
308      * lexer.
309      * @see #setLexer
310      */
311     public void setConnection (URLConnection   connection)
312         throws
313             ParserException
314     {
315         if (null != connection)
316             setLexer (new Lexer (connection));
317     }
318 
319     /**
320      * Return the current connection.
321      * @return The connection either created by the parser or passed into this
322      * parser via <code>setConnection</code>.
323      * @see #setConnection(URLConnection)
324      */
325     public URLConnection   getConnection ()
326     {
327         return (getLexer ().getPage ().getConnection ());
328     }
329 
330     /**
331      * Set the URL for this parser.
332      * This method creates a new Lexer reading from the given URL.
333      * Trying to set the url to null or an empty string is a noop.
334      * @see #setConnection(URLConnection)
335      */
336     public void setURL (String   url)
337         throws
338             ParserException
339     {
340         if ((null != url) && !"".equals (url))
341             setConnection (Page.getConnectionManager ().openConnection (url));
342     }
343 
344     /**
345      * Return the current URL being parsed.
346      * @return The url passed into the constructor or the file name
347      * passed to the constructor modified to be a URL.
348      */
349     public String   getURL ()
350     {
351         return (getLexer ().getPage ().getUrl ());
352     }
353 
354     /**
355      * Set the encoding for the page this parser is reading from.
356      * @param encoding The new character set to use.
357      */
358     public void setEncoding (String   encoding)
359         throws
360             ParserException
361     {
362         getLexer ().getPage ().setEncoding (encoding);
363     }
364         
365     /**
366      * Get the encoding for the page this parser is reading from.
367      * This item is set from the HTTP header but may be overridden by meta
368      * tags in the head, so this may change after the head has been parsed.
369      */
370     public String   getEncoding ()
371     {
372         return (getLexer ().getPage ().getEncoding ());
373     }
374 
375     /**
376      * Set the lexer for this parser.
377      * The current NodeFactory is set on the given lexer, since the lexer
378      * contains the node factory object.
379      * It does not adjust the <code>feedback</code> object.
380      * Trying to set the lexer to <code>null</code> is a noop.
381      * @param lexer The lexer object to use.
382      */
383     public void setLexer (Lexer lexer)
384     {
385         NodeFactory factory;
386         String   type;
387 
388         if (null != lexer)
389         {   // move a node factory that's been set to the new lexer
390             factory = null;
391             if (null != getLexer ())
392                 factory = getLexer ().getNodeFactory ();
393             if (null != factory)
394                 lexer.setNodeFactory (factory);
395             mLexer = lexer;
396             // warn about content that's not likely text
397             type = mLexer.getPage ().getContentType ();
398             if (type != null && !type.startsWith ("text"))
399                 getFeedback ().warning (
400                     "URL "
401                     + mLexer.getPage ().getUrl ()
402                     + " does not contain text");
403         }
404     }
405 
406     /**
407      * Returns the reader associated with the parser
408      * @return The current lexer.
409      */
410     public Lexer getLexer ()
411     {
412         return (mLexer);
413     }
414 
415     /**
416      * Get the current node factory.
417      * @return The parser's node factory.
418      */
419     public NodeFactory getNodeFactory ()
420     {
421         return (getLexer ().getNodeFactory ());
422     }
423 
424     /**
425      * Set the current node factory.
426      * @param factory The new node factory for the parser.
427      */
428     public void setNodeFactory (NodeFactory factory)
429     {
430         if (null == factory)
431             throw new IllegalArgumentException   ("node factory cannot be null");
432         getLexer ().setNodeFactory (factory);
433     }
434 
435     /**
436      * Sets the feedback object used in scanning.
437      * @param fb The new feedback object to use.
438      */
439     public void setFeedback (ParserFeedback fb)
440     {
441         mFeedback = (null == fb) ? noFeedback : fb;
442     }
443 
444     /**
445      * Returns the feedback.
446      * @return HTMLParserFeedback
447      */
448     public ParserFeedback getFeedback()
449     {
450         return (mFeedback);
451     }
452 
453     //
454     // Public methods
455     //
456 
457     /**
458      * Reset the parser to start from the beginning again.
459      */
460     public void reset ()
461     {
462         getLexer ().reset ();
463     }
464 
465     /**
466      * Returns an iterator (enumeration) to the html nodes. Each node can be a tag/endtag/
467      * string/link/image<br>
468      * This is perhaps the most important method of this class. In typical situations, you will need to use
469      * the parser like this :
470      * <pre>
471      * Parser parser = new Parser("http://www.yahoo.com");
472      * for (NodeIterator i = parser.elements();i.hasMoreElements();) {
473      *    Node node = i.nextHTMLNode();
474      *    if (node instanceof StringNode) {
475      *      // Downcasting to StringNode
476      *      StringNode stringNode = (StringNode)node;
477      *      // Do whatever processing you want with the string node
478      *      System.out.println(stringNode.getText());
479      *    }
480      *    // Check for the node or tag that you want
481      *    if (node instanceof ...) {
482      *      // Downcast, and process
483      *      // recursively (nodes within nodes)
484      *    }
485      * }
486      * </pre>
487      */
488     public NodeIterator elements () throws ParserException
489     {
490         return (new IteratorImpl (getLexer (), getFeedback ()));
491     }
492 
493     /**
494      * Parse the given resource, using the filter provided.
495      * @param filter The filter to apply to the parsed nodes.
496      * @return The list of matching nodes (for a <code>null</code>
497      * filter this is all the top level nodes).
498      */
499     public NodeList parse (NodeFilter filter) throws ParserException
500     {
501         NodeIterator e;
502         Node node;
503         NodeList ret;
504 
505         ret = new NodeList ();
506         for (e = elements (); e.hasMoreNodes (); )
507         {
508             node = e.nextNode ();
509             if (null != filter)
510                 node.collectInto (ret, filter);
511             else
512                 ret.add (node);
513         }
514         
515         return (ret);
516     }
517 
518     public void visitAllNodesWith(NodeVisitor visitor) throws ParserException {
519         Node node;
520         visitor.beginParsing();
521         for (NodeIterator e = elements();e.hasMoreNodes();) {
522             node = e.nextNode();
523             node.accept(visitor);
524         }
525         visitor.finishedParsing();
526     }
527 
528     /**
529      * Initializes the parser with the given input HTML String.
530      * @param inputHTML the input HTML that is to be parsed.
531      */
532     public void setInputHTML (String   inputHTML)
533         throws
534             ParserException
535     {
536         if (null == inputHTML)
537             throw new IllegalArgumentException   ("html cannot be null");
538         if (!"".equals (inputHTML))
539             setLexer (new Lexer (new Page (inputHTML)));
540     }
541 
542     /**
543      * Extract all nodes matching the given filter.
544      * @see Node#collectInto(NodeList, NodeFilter)
545      */
546     public NodeList extractAllNodesThatMatch (NodeFilter filter) throws ParserException
547     {
548         NodeIterator e;
549         NodeList ret;
550         
551         ret = new NodeList ();
552         for (e = elements (); e.hasMoreNodes (); )
553             e.nextNode ().collectInto (ret, filter);
554 
555         return (ret);
556     }
557 
558     /**
559      * Convenience method to extract all nodes of a given class type.
560      * @see Node#collectInto(NodeList, NodeFilter)
561      */
562     public Node [] extractAllNodesThatAre (Class   nodeType) throws ParserException
563     {
564         NodeList ret;
565 
566         ret = extractAllNodesThatMatch (new NodeClassFilter (nodeType)); 
567 
568         return (ret.toNodeArray ());
569     }
570 
571     //
572     // ConnectionMonitor interface
573     //
574 
575     /**
576      * Called just prior to calling connect.
577      * The connection has been conditioned with proxy, URL user/password,
578      * and cookie information. It is still possible to adjust the
579      * connection to alter the request method for example. 
580      * @param connection The connection which is about to be connected.
581      * @exception This exception is thrown if the connection monitor
582      * wants the ConnectionManager to bail out.
583      */
584     public void preConnect (HttpURLConnection   connection)
585         throws
586             ParserException
587     {
588         if (null != getFeedback ())
589             getFeedback ().info (ConnectionManager.getRequestHeader (connection));
590     }
591 
592     /** Called just after calling connect.
593      * The response code and header fields can be examined.
594      * @param connection The connection that was just connected.
595      * @exception This exception is thrown if the connection monitor
596      * wants the ConnectionManager to bail out.
597      */
598     public void postConnect (HttpURLConnection   connection)
599         throws
600             ParserException
601     {
602         if (null != getFeedback ())
603             getFeedback ().info (ConnectionManager.getResponseHeader (connection));
604     }
605 
606     /**
607      * The main program, which can be executed from the command line
608      */
609     public static void main (String   [] args)
610     {
611         Parser parser;
612         NodeFilter filter;
613 
614         if (args.length < 1 || args[0].equals ("-help"))
615         {
616             System.out.println ("HTML Parser v" + VERSION_STRING + "\n");
617             System.out.println ();
618             System.out.println ("Syntax : java -jar htmlparser.jar <resourceLocn/website> [node_type]");
619             System.out.println ("   <resourceLocn/website> the URL or file to be parsed");
620             System.out.println ("   node_type an optional node name, for example:");
621             System.out.println ("     A - Show only the link tags extracted from the document");
622             System.out.println ("     IMG - Show only the image tags extracted from the document");
623             System.out.println ("     TITLE - Extract the title from the document");
624             System.out.println ();
625             System.out.println ("Example : java -jar htmlparser.jar http://www.yahoo.com");
626             System.out.println ();
627             System.out.println ("For support, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page...");
628             System.out.println ("HTML Parser home page : http://htmlparser.org");
629             System.out.println ();
630         }
631         else
632             try
633             {
634                 parser = new Parser ();
635                 if (1 < args.length)
636                     filter = new TagNameFilter (args[1]);
637                 else
638                 {   // for a simple dump, use more verbose settings
639                     filter = null;
640                     parser.setFeedback (Parser.stdout);
641                     getConnectionManager ().setMonitor (parser);
642                 }
643                 parser.setURL (args[0]);
644                 System.out.println (parser.parse (filter));
645             }
646             catch (ParserException e)
647             {
648                 e.printStackTrace ();
649             }
650     }
651 }
652
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags