KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > bplatt > spider > PageInfo


1 package bplatt.spider;
2
3 /** PageInfo - Web Page Information object
4  * Copyright 2002, Robert L. Platt, All rights reserved
5  * @author Robert L. Platt
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20  */

21
22 import java.io.*;
23 import java.net.*;
24 import java.util.*;
25 import javax.swing.text.*;
26 import javax.swing.text.html.*;
27
28 public class PageInfo {
29     private URL url;
30     private URL parentUrl;
31     private String JavaDoc title;
32     private URL[] links;
33     private URL[] images;
34     private boolean valid;
35     private int responseCode;
36     private String JavaDoc contentType;
37     private int contentLength;
38     private final static URL[] dummy = new URL[1];
39     private final static String JavaDoc HTML = "text/html";
40     
41     /** Constructor */
42     public PageInfo(URL url, URL parentUrl, String JavaDoc contentType, int contentLength, int responseCode) {
43         this.url = url;
44         this.parentUrl = parentUrl;
45         this.contentType = contentType;
46         this.contentLength = contentLength;
47         this.responseCode = responseCode;
48         valid = false;
49     }
50     
51     // Accessors
52
public URL getUrl() { return(url); }
53     public URL getParentUrl() { return(parentUrl); }
54     public String JavaDoc getTitle() { return(title); }
55     public URL[] getLinks() { return(links); }
56     public URL[] getImages() { return(images); }
57     public String JavaDoc getContentType() { return(contentType); }
58     public boolean isValid() { return(valid); }
59     public int getResponseCode() { return responseCode; }
60     
61     /** Call WebPageXtractor and process WebPage */
62     public void extract(Reader reader) throws IOException
63     {
64         // Note: contentLength of -1 means UNKNOWN
65
if (reader == null || url == null ||
66             responseCode != HttpURLConnection.HTTP_OK ||
67             contentLength == 0 || contentType.equalsIgnoreCase(HTML) == false) {
68             valid = false;
69             return;
70         }
71         WebPageXtractor x = new WebPageXtractor();
72         try { x.parse(reader); }
73         catch(EOFException e) {
74             valid = false;
75             return;
76         }
77         catch(SocketTimeoutException e) {
78             valid = false;
79             throw(e);
80         }
81         catch(IOException e) {
82             valid = false;
83             return;
84         }
85         ArrayList rawlinks = x.getLinks();
86         ArrayList rawimages = x.getImages();
87         
88         // Get web page title (1st title if more than one!)
89
ArrayList rawtitle = x.getTitle();
90         if (rawtitle.isEmpty()) title = null;
91         else title = new String JavaDoc((String JavaDoc)rawtitle.get(0));
92         
93         // Get links
94
int numelem = rawlinks.size();
95         if (numelem == 0) links = null;
96         else {
97             ArrayList t = new ArrayList();
98             for (int i=0; i<numelem; ++i) {
99                 String JavaDoc slink = (String JavaDoc)rawlinks.get(i);
100                 try {
101                     URL link = new URL(url,slink);
102                     t.add(link);
103                 }
104                 catch(MalformedURLException e) { /* Ignore */ }
105             }
106             if (t.isEmpty()) links = null;
107             else links = (URL[])t.toArray(dummy);
108         }
109         
110         // Get images
111
numelem = rawimages.size();
112         if (numelem == 0) images = null;
113         else {
114             ArrayList t = new ArrayList();
115             for (int i=0; i<numelem; ++i) {
116                 String JavaDoc simage = (String JavaDoc)rawimages.get(i);
117                 try {
118                     URL image = new URL(url,simage);
119                     t.add(image);
120                 }
121                 catch(MalformedURLException e) { }
122             }
123             if (t.isEmpty()) images = null;
124             else images = (URL[])t.toArray(dummy);
125         }
126
127         // Set valid flag
128
valid = true;
129     }
130     
131     /** For debugging - dump page information */
132     public void dump() {
133         System.out.println("URL: "+url);
134         System.out.println("Parent URL: "+parentUrl);
135         System.out.println("Title: "+title);
136         if (links != null) {
137             System.out.print("Links: [");
138             for (int i=0; i<links.length; ++i) {
139                 System.out.print(links[i]);
140                 if (i<(links.length-1)) System.out.print(", ");
141             }
142             System.out.println("]");
143         }
144         if (images != null) {
145             System.out.print("Images: [");
146             for (int i=0; i<images.length; ++i) {
147                 System.out.print(images[i]);
148                 if (i<(images.length-1)) System.out.print(", ");
149             }
150             System.out.println("]");
151         }
152         System.out.println("Valid: "+valid);
153         System.out.println("Response Code: "+responseCode);
154         System.out.println("Content Type: "+contentType);
155         System.out.println("Content Length: "+contentLength);
156     }
157 }
158
Popular Tags