WebPageXtractor


1   package bplatt.spider;
2   
3   /**
4    * WebPageXtractor - extracts information from a WebPage
5    * passed as an input stream.  Makes use of SimpleHTMLParser
6    * object.  Used to use HTMLEditorKit and HTMLEditorKit.Parser. 
7    * This turned out to be too buggy for this application.
8    * Cannot use XML parser as HTML does not follow stricter XML
9    * syntax rules.  In fact many Web pages are a "tag salad" that
10   * don't even follow proper HTML syntax.  WebPageXtractor parses
11   * a page and extracts links, images, and title(s).
12   * 
13   * Copyright 2002, Robert L. Platt, All rights reserved
14   * @author Robert L. Platt 
15   * 
16   * This program is free software; you can redistribute it and/or modify
17   * it under the terms of the GNU General Public License as published by
18   * the Free Software Foundation; either version 2 of the License, or
19   * (at your option) any later version.
20   *
21   * This program is distributed in the hope that it will be useful,
22   * but WITHOUT ANY WARRANTY; without even the implied warranty of
23   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24   * GNU General Public License for more details.
25   *
26   * You should have received a copy of the GNU General Public License
27   * along with this program; if not, write to the Free Software
28   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
29   */
30   
31   import java.io.*;
32   import java.util.*;
33   
34   public class WebPageXtractor extends SimpleHTMLParser {
35      private ArrayList links;
36      private ArrayList images;
37      private ArrayList title;
38      private boolean inTitle;
39      
40      /** Constructor */
41      public WebPageXtractor() {
42          super();
43          links = new ArrayList();
44          images = new ArrayList();
45          title = new ArrayList();
46      }
47      
48      /**
49       * If we're within TITLE tags - save the title
50       * @see SimpleHTMLParser#processContent(SimpleHTMLToken)
51       */
52      public void processContent(SimpleHTMLToken token) {
53          String   s = token.getContent().trim();
54          if (s != null && s.length() != 0) {
55              if (inTitle) title.add(s);
56          }
57      }
58  
59      /**
60       * Look for </title> tags
61       * @see SimpleHTMLParser#processEndTag(SimpleHTMLToken)
62       */
63      public void processEndTag(SimpleHTMLToken token) throws IOException
64      {
65          String   tag = SimpleHTMLParser.getTagType(token,true);
66          if (tag == null) throw new IOException("HTML parsing error");
67          else if (tag.equals("title")) inTitle = false;
68      }
69  
70      /**
71       * Handle Anchor, Image, Frame, and Title tags
72       * @see SimpleHTMLParser#processTag(SimpleHTMLToken)
73       */
74      public void processTag(SimpleHTMLToken token) throws IOException
75      {
76          String   tag = SimpleHTMLParser.getTagType(token,true);
77          if (tag == null) throw new IOException("HTML parsing error");
78          else if (tag.equals("a")) {
79              String   link = extractHref(token.getContent());
80              if (link != null) links.add(link);
81          }
82          else if (tag.equals("img")) {
83              String   image = extractSrc(token.getContent());
84              if (image != null) images.add(image);
85          }
86          else if (tag.equals("frame")) {
87              String   link = extractSrc(token.getContent());
88              if (link != null) links.add(link);
89          }
90          else if (tag.equals("title")) inTitle = true;
91      }
92      
93      
94      // Utility method for extracting href attribute
95      private String   extractHref(String   tag)
96      {
97          String   delims="\t\r\f\n \'\"=";
98          StringTokenizer tt = new StringTokenizer(tag,delims);
99          while(tt.hasMoreElements()) {
100             String   s = tt.nextToken();
101             if (s.equalsIgnoreCase("href")) {
102                 if (!tt.hasMoreElements()) return(null);
103                 else return(tt.nextToken());
104             }
105         }
106         return(null);
107     }
108     
109     // Utility method for extracting src attribute
110     private String   extractSrc(String   tag)
111     {
112         String   delims="\t\r\f\n \'\"=";
113         StringTokenizer tt = new StringTokenizer(tag,delims);
114         while(tt.hasMoreElements()) {
115             String   s = tt.nextToken();
116             if (s.equalsIgnoreCase("src")) {
117                 if (!tt.hasMoreElements()) return(null);
118                 else return(tt.nextToken());
119             }
120         }
121         return(null);
122     }
123     /**
124      * Returns the images.
125      * @return ArrayList
126      */
127     public ArrayList getImages() {
128         return images;
129     }
130 
131     /**
132      * Returns the links.
133      * @return ArrayList
134      */
135     public ArrayList getLinks() {
136         return links;
137     }
138 
139     /**
140      * Returns the title.
141      * @return ArrayList
142      */
143     public ArrayList getTitle() {
144         return title;
145     }
146 }
147
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags