KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > bplatt > spider > WebPageXtractor


1 package bplatt.spider;
2
3 /**
4  * WebPageXtractor - extracts information from a WebPage
5  * passed as an input stream. Makes use of SimpleHTMLParser
6  * object. Used to use HTMLEditorKit and HTMLEditorKit.Parser.
7  * This turned out to be too buggy for this application.
8  * Cannot use XML parser as HTML does not follow stricter XML
9  * syntax rules. In fact many Web pages are a "tag salad" that
10  * don't even follow proper HTML syntax. WebPageXtractor parses
11  * a page and extracts links, images, and title(s).
12  *
13  * Copyright 2002, Robert L. Platt, All rights reserved
14  * @author Robert L. Platt
15  *
16  * This program is free software; you can redistribute it and/or modify
17  * it under the terms of the GNU General Public License as published by
18  * the Free Software Foundation; either version 2 of the License, or
19  * (at your option) any later version.
20  *
21  * This program is distributed in the hope that it will be useful,
22  * but WITHOUT ANY WARRANTY; without even the implied warranty of
23  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24  * GNU General Public License for more details.
25  *
26  * You should have received a copy of the GNU General Public License
27  * along with this program; if not, write to the Free Software
28  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29  */

30  
31  import java.io.*;
32  import java.util.*;
33  
34  public class WebPageXtractor extends SimpleHTMLParser {
35     private ArrayList links;
36     private ArrayList images;
37     private ArrayList title;
38     private boolean inTitle;
39     
40     /** Constructor */
41     public WebPageXtractor() {
42         super();
43         links = new ArrayList();
44         images = new ArrayList();
45         title = new ArrayList();
46     }
47     
48     /**
49      * If we're within TITLE tags - save the title
50      * @see SimpleHTMLParser#processContent(SimpleHTMLToken)
51      */

52     public void processContent(SimpleHTMLToken token) {
53         String JavaDoc s = token.getContent().trim();
54         if (s != null && s.length() != 0) {
55             if (inTitle) title.add(s);
56         }
57     }
58
59     /**
60      * Look for </title> tags
61      * @see SimpleHTMLParser#processEndTag(SimpleHTMLToken)
62      */

63     public void processEndTag(SimpleHTMLToken token) throws IOException
64     {
65         String JavaDoc tag = SimpleHTMLParser.getTagType(token,true);
66         if (tag == null) throw new IOException("HTML parsing error");
67         else if (tag.equals("title")) inTitle = false;
68     }
69
70     /**
71      * Handle Anchor, Image, Frame, and Title tags
72      * @see SimpleHTMLParser#processTag(SimpleHTMLToken)
73      */

74     public void processTag(SimpleHTMLToken token) throws IOException
75     {
76         String JavaDoc tag = SimpleHTMLParser.getTagType(token,true);
77         if (tag == null) throw new IOException("HTML parsing error");
78         else if (tag.equals("a")) {
79             String JavaDoc link = extractHref(token.getContent());
80             if (link != null) links.add(link);
81         }
82         else if (tag.equals("img")) {
83             String JavaDoc image = extractSrc(token.getContent());
84             if (image != null) images.add(image);
85         }
86         else if (tag.equals("frame")) {
87             String JavaDoc link = extractSrc(token.getContent());
88             if (link != null) links.add(link);
89         }
90         else if (tag.equals("title")) inTitle = true;
91     }
92     
93     
94     // Utility method for extracting href attribute
95
private String JavaDoc extractHref(String JavaDoc tag)
96     {
97         String JavaDoc delims="\t\r\f\n \'\"=";
98         StringTokenizer tt = new StringTokenizer(tag,delims);
99         while(tt.hasMoreElements()) {
100             String JavaDoc s = tt.nextToken();
101             if (s.equalsIgnoreCase("href")) {
102                 if (!tt.hasMoreElements()) return(null);
103                 else return(tt.nextToken());
104             }
105         }
106         return(null);
107     }
108     
109     // Utility method for extracting src attribute
110
private String JavaDoc extractSrc(String JavaDoc tag)
111     {
112         String JavaDoc delims="\t\r\f\n \'\"=";
113         StringTokenizer tt = new StringTokenizer(tag,delims);
114         while(tt.hasMoreElements()) {
115             String JavaDoc s = tt.nextToken();
116             if (s.equalsIgnoreCase("src")) {
117                 if (!tt.hasMoreElements()) return(null);
118                 else return(tt.nextToken());
119             }
120         }
121         return(null);
122     }
123     /**
124      * Returns the images.
125      * @return ArrayList
126      */

127     public ArrayList getImages() {
128         return images;
129     }
130
131     /**
132      * Returns the links.
133      * @return ArrayList
134      */

135     public ArrayList getLinks() {
136         return links;
137     }
138
139     /**
140      * Returns the title.
141      * @return ArrayList
142      */

143     public ArrayList getTitle() {
144         return title;
145     }
146 }
147
Popular Tags