StringExtractor


1   // HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
2   // http://sourceforge.org/projects/htmlparser
3   // Copyright (C) 2004 Somik Raha
4   //
5   // Revision Control Information
6   //
7   // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/StringExtractor.java,v $
8   // $Author: derrickoswald $
9   // $Date: 2004/02/29 15:09:56 $
10  // $Revision: 1.47 $
11  //
12  // This library is free software; you can redistribute it and/or
13  // modify it under the terms of the GNU Lesser General Public
14  // License as published by the Free Software Foundation; either
15  // version 2.1 of the License, or (at your option) any later version.
16  //
17  // This library is distributed in the hope that it will be useful,
18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  // Lesser General Public License for more details.
21  //
22  // You should have received a copy of the GNU Lesser General Public
23  // License along with this library; if not, write to the Free Software
24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  //
26  
27  package org.htmlparser.parserapplications;
28  
29  import org.htmlparser.beans.StringBean;
30  import org.htmlparser.util.ParserException;
31  
32  /**
33   * Extract plaintext strings from a web page.
34   * Illustrative program to gather the textual contents of a web page.
35   * Uses a {@link org.htmlparser.beans.StringBean StringBean} to accumulate
36   * the user visible text (what a browser would display) into a single string.
37   */
38  public class StringExtractor
39  {
40      private String   resource;
41  
42      /**
43       * Construct a StringExtractor to read from the given resource.
44       * @param resource Either a URL or a file name.
45       */
46      public StringExtractor (String   resource)
47      {
48          this.resource = resource;
49      }
50  
51      /**
52       * Extract the text from a page.
53       * @param links if <code>true</code> include hyperlinks in output.
54       * @return The textual contents of the page.
55       */
56      public String   extractStrings (boolean links)
57          throws
58              ParserException
59      {
60          StringBean sb;
61  
62          sb = new StringBean ();
63          sb.setLinks (links);
64          sb.setURL (resource);
65  
66          return (sb.getStrings ());
67      }
68  
69      /**
70       * Mainline.
71       * @param args The command line arguments.
72       */
73      public static void main (String  [] args)
74      {
75          boolean links;
76          String   url;
77          StringExtractor se;
78  
79          links = false;
80          url = null;
81          for (int i = 0; i < args.length; i++)
82              if (args[i].equalsIgnoreCase ("-links"))
83                  links = true;
84              else
85                  url = args[i];
86          if (null != url)
87          {
88              se = new StringExtractor (url);
89              try
90              {
91                  System.out.println (se.extractStrings (links));
92              }
93              catch (ParserException e)
94              {
95                  e.printStackTrace ();
96              }
97          }
98          else
99              System.out.println ("Usage: java -classpath htmlparser.jar org.htmlparser.parserapplications.StringExtractor [-links] url");
100     }
101 }
102
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags