KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > parserapplications > StringExtractor


// HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/StringExtractor.java,v $
// $Author: derrickoswald $
// $Date: 2004/02/29 15:09:56 $
// $Revision: 1.47 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

27 package org.htmlparser.parserapplications;
28
29 import org.htmlparser.beans.StringBean;
30 import org.htmlparser.util.ParserException;
31
32 /**
33  * Extract plaintext strings from a web page.
34  * Illustrative program to gather the textual contents of a web page.
35  * Uses a {@link org.htmlparser.beans.StringBean StringBean} to accumulate
36  * the user visible text (what a browser would display) into a single string.
37  */

38 public class StringExtractor
39 {
40     private String JavaDoc resource;
41
42     /**
43      * Construct a StringExtractor to read from the given resource.
44      * @param resource Either a URL or a file name.
45      */

46     public StringExtractor (String JavaDoc resource)
47     {
48         this.resource = resource;
49     }
50
51     /**
52      * Extract the text from a page.
53      * @param links if <code>true</code> include hyperlinks in output.
54      * @return The textual contents of the page.
55      */

56     public String JavaDoc extractStrings (boolean links)
57         throws
58             ParserException
59     {
60         StringBean sb;
61
62         sb = new StringBean ();
63         sb.setLinks (links);
64         sb.setURL (resource);
65
66         return (sb.getStrings ());
67     }
68
69     /**
70      * Mainline.
71      * @param args The command line arguments.
72      */

73     public static void main (String JavaDoc[] args)
74     {
75         boolean links;
76         String JavaDoc url;
77         StringExtractor se;
78
79         links = false;
80         url = null;
81         for (int i = 0; i < args.length; i++)
82             if (args[i].equalsIgnoreCase ("-links"))
83                 links = true;
84             else
85                 url = args[i];
86         if (null != url)
87         {
88             se = new StringExtractor (url);
89             try
90             {
91                 System.out.println (se.extractStrings (links));
92             }
93             catch (ParserException e)
94             {
95                 e.printStackTrace ();
96             }
97         }
98         else
99             System.out.println ("Usage: java -classpath htmlparser.jar org.htmlparser.parserapplications.StringExtractor [-links] url");
100     }
101 }
102
Popular Tags