// KickJava   Java API By Example, From Geeks To Geeks.
//
// Java > Open Source Codes > org > htmlparser > Parser


// HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v $
// $Author: derrickoswald $
// $Date: 2005/03/13 15:36:11 $
// $Revision: 1.103 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser;

import java.io.Serializable;
import java.net.HttpURLConnection;
import java.net.URLConnection;

import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.http.ConnectionManager;
import org.htmlparser.http.ConnectionMonitor;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.IteratorImpl;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.ParserFeedback;
import org.htmlparser.visitors.NodeVisitor;

47 /**
48  * This is the class that the user will use, either to get an iterator into
49  * the html page or to directly parse the page and print the results
50  * <BR>
51  * Typical usage of the parser is as follows : <BR>
52  * [1] Create a parser object - passing the URL and a feedback object to the parser<BR>
53  * [2] Enumerate through the elements from the parser object <BR>
54  * It is important to note that the parsing occurs when you enumerate, ON DEMAND.
55  * This is a thread-safe way, and you only get the control back after a
56  * particular element is parsed and returned, which could be the entire body.
57  * @see Parser#elements()
58  */

59 public class Parser
60     implements
61         Serializable JavaDoc,
62         ConnectionMonitor
63 {
64     // Please don't change the formatting of the version variables below.
65
// This is done so as to facilitate ant script processing.
66

67     /**
68      * The floating point version number.
69      */

70     public final static double
71     VERSION_NUMBER = 1.5
72     ;
73
74     /**
75      * The type of version.
76      */

77     public final static String JavaDoc
78     VERSION_TYPE = "Integration Build"
79     ;
80
81     /**
82      * The date of the version.
83      */

84     public final static String JavaDoc
85     VERSION_DATE = "Mar 13, 2005"
86     ;
87
88     /**
89      * The display version.
90      */

91     public final static String JavaDoc
92     VERSION_STRING = "" + VERSION_NUMBER + " (" + VERSION_TYPE + " " + VERSION_DATE + ")"
93     ;
94
95     // End of formatting
96

97     /**
98      * Feedback object.
99      */

100     protected ParserFeedback mFeedback;
101
102     /**
103      * The html lexer associated with this parser.
104      */

105     protected Lexer mLexer;
106
107     /**
108      * A quiet message sink.
109      * Use this for no feedback.
110      */

111     public static ParserFeedback noFeedback = new DefaultParserFeedback (DefaultParserFeedback.QUIET);
112
113     /**
114      * A verbose message sink.
115      * Use this for output on <code>System.out</code>.
116      */

117     public static ParserFeedback stdout = new DefaultParserFeedback ();
118
119     //
120
// Static methods
121
//
122

123     /**
124      * Return the version string of this parser.
125      * @return A string of the form:
126      * <pre>
127      * "[floating point number] ([build-type] [build-date])"
128      * </pre>
129      */

130     public static String JavaDoc getVersion ()
131     {
132         return (VERSION_STRING);
133     }
134
135     /**
136      * Return the version number of this parser.
137      * @return A floating point number, the whole number part is the major
138      * version, and the fractional part is the minor version.
139      */

140     public static double getVersionNumber ()
141     {
142         return (VERSION_NUMBER);
143     }
144
145     /**
146      * Get the connection manager all Parsers use.
147      * @return The connection manager.
148      */

149     public static ConnectionManager getConnectionManager ()
150     {
151         return (Page.getConnectionManager ());
152     }
153
154     /**
155      * Set the connection manager all Parsers use.
156      * @param manager The new connection manager.
157      */

158     public static void setConnectionManager (ConnectionManager manager)
159     {
160         Page.setConnectionManager (manager);
161     }
162
163     /**
164      * Creates the parser on an input string.
165      * @param html The string containing HTML.
166      * @param charset <em>Optional</em>. The character set encoding that will
167      * be reported by {@link #getEncoding}. If charset is <code>null</code>
168      * the default character set is used.
169      * @return A parser with the <code>html</code> string as input.
170      */

171     public static Parser createParser (String JavaDoc html, String JavaDoc charset)
172     {
173         Parser ret;
174
175         if (null == html)
176             throw new IllegalArgumentException JavaDoc ("html cannot be null");
177         ret = new Parser (new Lexer (new Page (html, charset)));
178
179         return (ret);
180     }
181
182     //
183
// Constructors
184
//
185

186     /**
187      * Zero argument constructor.
188      * The parser is in a safe but useless state.
189      * Set the lexer or connection using setLexer() or setConnection().
190      * @see #setLexer(Lexer)
191      * @see #setConnection(URLConnection)
192      */

193     public Parser ()
194     {
195         this (new Lexer (new Page ("")), noFeedback);
196     }
197
198     /**
199      * This constructor enables the construction of test cases, with readers
200      * associated with test string buffers. It can also be used with readers of the user's choice
201      * streaming data into the parser.<p/>
202      * <B>Important:</B> If you are using this constructor, and you would like to use the parser
203      * to parse multiple times (multiple calls to parser.elements()), you must ensure the following:<br>
204      * <ul>
205      * <li>Before the first parse, you must mark the reader for a length that you anticipate (the size of the stream).</li>
206      * <li>After the first parse, calls to elements() must be preceded by calls to :
207      * <pre>
208      * parser.getReader().reset();
209      * </pre>
210      * </li>
211      * </ul>
212      * @param lexer The lexer to draw characters from.
213      * @param fb The object to use when information,
214      * warning and error messages are produced. If <em>null</em> no feedback
215      * is provided.
216      */

217     public Parser (Lexer lexer, ParserFeedback fb)
218     {
219         setFeedback (fb);
220         if (null == lexer)
221             throw new IllegalArgumentException JavaDoc ("lexer cannot be null");
222         setLexer (lexer);
223         setNodeFactory (new PrototypicalNodeFactory ());
224     }
225
226     /**
227      * Constructor for custom HTTP access.
228      * @param connection A fully conditioned connection. The connect()
229      * method will be called so it need not be connected yet.
230      * @param fb The object to use for message communication.
231      */

232     public Parser (URLConnection JavaDoc connection, ParserFeedback fb)
233         throws
234             ParserException
235     {
236         this (new Lexer (connection), fb);
237     }
238
239     /**
240      * Creates a Parser object with the location of the resource (URL or file)
241      * You would typically create a DefaultHTMLParserFeedback object and pass it in.
242      * @param resourceLocn Either the URL or the filename (autodetects).
243      * A standard HTTP GET is performed to read the content of the URL.
244      * @param feedback The HTMLParserFeedback object to use when information,
245      * warning and error messages are produced. If <em>null</em> no feedback
246      * is provided.
247      * @see #Parser(URLConnection,ParserFeedback)
248      */

249     public Parser (String JavaDoc resourceLocn, ParserFeedback feedback) throws ParserException
250     {
251         this (getConnectionManager ().openConnection (resourceLocn), feedback);
252     }
253
254     /**
255      * Creates a Parser object with the location of the resource (URL or file).
256      * A DefaultHTMLParserFeedback object is used for feedback.
257      * @param resourceLocn Either the URL or the filename (autodetects).
258      */

259     public Parser (String JavaDoc resourceLocn) throws ParserException
260     {
261         this (resourceLocn, stdout);
262     }
263
264     /**
265      * This constructor is present to enable users to plugin their own lexers.
266      * A DefaultHTMLParserFeedback object is used for feedback. It can also be used with readers of the user's choice
267      * streaming data into the parser.<p/>
268      * <B>Important:</B> If you are using this constructor, and you would like to use the parser
269      * to parse multiple times (multiple calls to parser.elements()), you must ensure the following:<br>
270      * <ul>
271      * <li>Before the first parse, you must mark the reader for a length that you anticipate (the size of the stream).</li>
272      * <li>After the first parse, calls to elements() must be preceded by calls to :
273      * <pre>
274      * parser.getReader().reset();
275      * </pre>
276      * </li>
277      * @param lexer The source for HTML to be parsed.
278      */

279     public Parser (Lexer lexer)
280     {
281         this (lexer, stdout);
282     }
283
284     /**
285      * Constructor for non-standard access.
286      * A DefaultHTMLParserFeedback object is used for feedback.
287      * @param connection A fully conditioned connection. The connect()
288      * method will be called so it need not be connected yet.
289      * @see #Parser(URLConnection,ParserFeedback)
290      */

291     public Parser (URLConnection JavaDoc connection) throws ParserException
292     {
293         this (connection, stdout);
294     }
295
296     //
297
// Bean patterns
298
//
299

300     /**
301      * Set the connection for this parser.
302      * This method creates a new <code>Lexer</code> reading from the connection.
303      * Trying to set the connection to null is a noop.
304      * @param connection A fully conditioned connection. The connect()
305      * method will be called so it need not be connected yet.
306      * @exception ParserException if the character set specified in the
307      * HTTP header is not supported, or an i/o exception occurs creating the
308      * lexer.
309      * @see #setLexer
310      */

311     public void setConnection (URLConnection JavaDoc connection)
312         throws
313             ParserException
314     {
315         if (null != connection)
316             setLexer (new Lexer (connection));
317     }
318
319     /**
320      * Return the current connection.
321      * @return The connection either created by the parser or passed into this
322      * parser via <code>setConnection</code>.
323      * @see #setConnection(URLConnection)
324      */

325     public URLConnection JavaDoc getConnection ()
326     {
327         return (getLexer ().getPage ().getConnection ());
328     }
329
330     /**
331      * Set the URL for this parser.
332      * This method creates a new Lexer reading from the given URL.
333      * Trying to set the url to null or an empty string is a noop.
334      * @see #setConnection(URLConnection)
335      */

336     public void setURL (String JavaDoc url)
337         throws
338             ParserException
339     {
340         if ((null != url) && !"".equals (url))
341             setConnection (Page.getConnectionManager ().openConnection (url));
342     }
343
344     /**
345      * Return the current URL being parsed.
346      * @return The url passed into the constructor or the file name
347      * passed to the constructor modified to be a URL.
348      */

349     public String JavaDoc getURL ()
350     {
351         return (getLexer ().getPage ().getUrl ());
352     }
353
354     /**
355      * Set the encoding for the page this parser is reading from.
356      * @param encoding The new character set to use.
357      */

358     public void setEncoding (String JavaDoc encoding)
359         throws
360             ParserException
361     {
362         getLexer ().getPage ().setEncoding (encoding);
363     }
364         
365     /**
366      * Get the encoding for the page this parser is reading from.
367      * This item is set from the HTTP header but may be overridden by meta
368      * tags in the head, so this may change after the head has been parsed.
369      */

370     public String JavaDoc getEncoding ()
371     {
372         return (getLexer ().getPage ().getEncoding ());
373     }
374
375     /**
376      * Set the lexer for this parser.
377      * The current NodeFactory is set on the given lexer, since the lexer
378      * contains the node factory object.
379      * It does not adjust the <code>feedback</code> object.
380      * Trying to set the lexer to <code>null</code> is a noop.
381      * @param lexer The lexer object to use.
382      */

383     public void setLexer (Lexer lexer)
384     {
385         NodeFactory factory;
386         String JavaDoc type;
387
388         if (null != lexer)
389         { // move a node factory that's been set to the new lexer
390
factory = null;
391             if (null != getLexer ())
392                 factory = getLexer ().getNodeFactory ();
393             if (null != factory)
394                 lexer.setNodeFactory (factory);
395             mLexer = lexer;
396             // warn about content that's not likely text
397
type = mLexer.getPage ().getContentType ();
398             if (type != null && !type.startsWith ("text"))
399                 getFeedback ().warning (
400                     "URL "
401                     + mLexer.getPage ().getUrl ()
402                     + " does not contain text");
403         }
404     }
405
406     /**
407      * Returns the reader associated with the parser
408      * @return The current lexer.
409      */

410     public Lexer getLexer ()
411     {
412         return (mLexer);
413     }
414
415     /**
416      * Get the current node factory.
417      * @return The parser's node factory.
418      */

419     public NodeFactory getNodeFactory ()
420     {
421         return (getLexer ().getNodeFactory ());
422     }
423
424     /**
425      * Set the current node factory.
426      * @param factory The new node factory for the parser.
427      */

428     public void setNodeFactory (NodeFactory factory)
429     {
430         if (null == factory)
431             throw new IllegalArgumentException JavaDoc ("node factory cannot be null");
432         getLexer ().setNodeFactory (factory);
433     }
434
435     /**
436      * Sets the feedback object used in scanning.
437      * @param fb The new feedback object to use.
438      */

439     public void setFeedback (ParserFeedback fb)
440     {
441         mFeedback = (null == fb) ? noFeedback : fb;
442     }
443
444     /**
445      * Returns the feedback.
446      * @return HTMLParserFeedback
447      */

448     public ParserFeedback getFeedback()
449     {
450         return (mFeedback);
451     }
452
453     //
454
// Public methods
455
//
456

457     /**
458      * Reset the parser to start from the beginning again.
459      */

460     public void reset ()
461     {
462         getLexer ().reset ();
463     }
464
465     /**
466      * Returns an iterator (enumeration) to the html nodes. Each node can be a tag/endtag/
467      * string/link/image<br>
468      * This is perhaps the most important method of this class. In typical situations, you will need to use
469      * the parser like this :
470      * <pre>
471      * Parser parser = new Parser("http://www.yahoo.com");
472      * for (NodeIterator i = parser.elements();i.hasMoreElements();) {
473      * Node node = i.nextHTMLNode();
474      * if (node instanceof StringNode) {
475      * // Downcasting to StringNode
476      * StringNode stringNode = (StringNode)node;
477      * // Do whatever processing you want with the string node
478      * System.out.println(stringNode.getText());
479      * }
480      * // Check for the node or tag that you want
481      * if (node instanceof ...) {
482      * // Downcast, and process
483      * // recursively (nodes within nodes)
484      * }
485      * }
486      * </pre>
487      */

488     public NodeIterator elements () throws ParserException
489     {
490         return (new IteratorImpl (getLexer (), getFeedback ()));
491     }
492
493     /**
494      * Parse the given resource, using the filter provided.
495      * @param filter The filter to apply to the parsed nodes.
496      * @return The list of matching nodes (for a <code>null</code>
497      * filter this is all the top level nodes).
498      */

499     public NodeList parse (NodeFilter filter) throws ParserException
500     {
501         NodeIterator e;
502         Node node;
503         NodeList ret;
504
505         ret = new NodeList ();
506         for (e = elements (); e.hasMoreNodes (); )
507         {
508             node = e.nextNode ();
509             if (null != filter)
510                 node.collectInto (ret, filter);
511             else
512                 ret.add (node);
513         }
514         
515         return (ret);
516     }
517
518     public void visitAllNodesWith(NodeVisitor visitor) throws ParserException {
519         Node node;
520         visitor.beginParsing();
521         for (NodeIterator e = elements();e.hasMoreNodes();) {
522             node = e.nextNode();
523             node.accept(visitor);
524         }
525         visitor.finishedParsing();
526     }
527
528     /**
529      * Initializes the parser with the given input HTML String.
530      * @param inputHTML the input HTML that is to be parsed.
531      */

532     public void setInputHTML (String JavaDoc inputHTML)
533         throws
534             ParserException
535     {
536         if (null == inputHTML)
537             throw new IllegalArgumentException JavaDoc ("html cannot be null");
538         if (!"".equals (inputHTML))
539             setLexer (new Lexer (new Page (inputHTML)));
540     }
541
542     /**
543      * Extract all nodes matching the given filter.
544      * @see Node#collectInto(NodeList, NodeFilter)
545      */

546     public NodeList extractAllNodesThatMatch (NodeFilter filter) throws ParserException
547     {
548         NodeIterator e;
549         NodeList ret;
550         
551         ret = new NodeList ();
552         for (e = elements (); e.hasMoreNodes (); )
553             e.nextNode ().collectInto (ret, filter);
554
555         return (ret);
556     }
557
558     /**
559      * Convenience method to extract all nodes of a given class type.
560      * @see Node#collectInto(NodeList, NodeFilter)
561      */

562     public Node [] extractAllNodesThatAre (Class JavaDoc nodeType) throws ParserException
563     {
564         NodeList ret;
565
566         ret = extractAllNodesThatMatch (new NodeClassFilter (nodeType));
567
568         return (ret.toNodeArray ());
569     }
570
571     //
572
// ConnectionMonitor interface
573
//
574

575     /**
576      * Called just prior to calling connect.
577      * The connection has been conditioned with proxy, URL user/password,
578      * and cookie information. It is still possible to adjust the
579      * connection to alter the request method for example.
580      * @param connection The connection which is about to be connected.
581      * @exception This exception is thrown if the connection monitor
582      * wants the ConnectionManager to bail out.
583      */

584     public void preConnect (HttpURLConnection JavaDoc connection)
585         throws
586             ParserException
587     {
588         if (null != getFeedback ())
589             getFeedback ().info (ConnectionManager.getRequestHeader (connection));
590     }
591
592     /** Called just after calling connect.
593      * The response code and header fields can be examined.
594      * @param connection The connection that was just connected.
595      * @exception This exception is thrown if the connection monitor
596      * wants the ConnectionManager to bail out.
597      */

598     public void postConnect (HttpURLConnection JavaDoc connection)
599         throws
600             ParserException
601     {
602         if (null != getFeedback ())
603             getFeedback ().info (ConnectionManager.getResponseHeader (connection));
604     }
605
606     /**
607      * The main program, which can be executed from the command line
608      */

609     public static void main (String JavaDoc [] args)
610     {
611         Parser parser;
612         NodeFilter filter;
613
614         if (args.length < 1 || args[0].equals ("-help"))
615         {
616             System.out.println ("HTML Parser v" + VERSION_STRING + "\n");
617             System.out.println ();
618             System.out.println ("Syntax : java -jar htmlparser.jar <resourceLocn/website> [node_type]");
619             System.out.println (" <resourceLocn/website> the URL or file to be parsed");
620             System.out.println (" node_type an optional node name, for example:");
621             System.out.println (" A - Show only the link tags extracted from the document");
622             System.out.println (" IMG - Show only the image tags extracted from the document");
623             System.out.println (" TITLE - Extract the title from the document");
624             System.out.println ();
625             System.out.println ("Example : java -jar htmlparser.jar http://www.yahoo.com");
626             System.out.println ();
627             System.out.println ("For support, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page...");
628             System.out.println ("HTML Parser home page : http://htmlparser.org");
629             System.out.println ();
630         }
631         else
632             try
633             {
634                 parser = new Parser ();
635                 if (1 < args.length)
636                     filter = new TagNameFilter (args[1]);
637                 else
638                 { // for a simple dump, use more verbose settings
639
filter = null;
640                     parser.setFeedback (Parser.stdout);
641                     getConnectionManager ().setMonitor (parser);
642                 }
643                 parser.setURL (args[0]);
644                 System.out.println (parser.parse (filter));
645             }
646             catch (ParserException e)
647             {
648                 e.printStackTrace ();
649             }
650     }
651 }
// Popular Tags