KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > bplatt > spider > Arachnid


1 package bplatt.spider;
2
/** Arachnid - Abstract Web spider class.
 * To use, derive a class from Arachnid,
 * implement the handleLink(), handleBadLink(), handleNonHTMLlink(),
 * handleExternalLink(), and handleBadIO() methods,
 * then instantiate it and call traverse().
 *
 * Copyright 2002, Robert L. Platt, All rights reserved
 * @author Robert L. Platt
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

26  
27 import java.io.*;
28 import java.net.*;
29 import java.util.*;
30
31 public abstract class Arachnid {
32     private String JavaDoc base;
33     private URL baseUrl;
34     private HashSet visited;
35     private int delay;
36     private static final String JavaDoc HTML = "text/html";
37     
38     /** Constructor */
39     public Arachnid(String JavaDoc base) throws MalformedURLException {
40         this.base = base;
41         baseUrl = new URL(base);
42         visited = new HashSet();
43         delay = 2;
44     }
45     
46     /** Traverse Web site */
47     public void traverse() { traverse(baseUrl,null); }
48     
49     private void traverse(URL url, URL parent)
50     {
51         boolean isHTMLfile = true;
52         PageInfo p = null;
53         try { p = getWebPage(url,parent); }
54         catch(IOException e) {
55             handleBadIO(url,parent);
56             sleep(delay);
57             return;
58         }
59         if (p == null) {
60             handleBadLink(url,parent,null);
61             sleep(delay);
62             return;
63         }
64         if (p.isValid() == false) {
65             if (p.getContentType().equalsIgnoreCase(HTML) == false)
66                 handleNonHTMLlink(url,parent,p);
67             else handleBadLink(url,parent,p);
68             sleep(delay);
69             return;
70         }
71         else handleLink(p);
72         
73         // Navigate through links on page
74
URL[] links = p.getLinks();
75         if (links == null) {
76             sleep(delay);
77             return;
78         }
79         int n = links.length;
80         for (int i=0; i<n; ++i) {
81             if (isOKtoVisit(links[i])) {
82                 visited.add(links[i]);
83                 traverse(links[i],url);
84             }
85             else if (isExternalSite(links[i])) handleExternalLink(links[i],url);
86         }
87         sleep(delay);
88         return;
89     }
90     
91     /** (Abstract) Handle bad URL */
92     protected abstract void handleBadLink(URL url,URL parent,PageInfo p);
93     
94     /** (Abstract) Handle a link; a Web page in the site */
95     protected abstract void handleLink(PageInfo p);
96     
97     /** (Abstract) Handle a non-HTML link */
98     protected abstract void handleNonHTMLlink(URL url, URL parent, PageInfo p);
99     
100     /** (Abstract) Handle an external (outside of Web site) link */
101     protected abstract void handleExternalLink(URL url, URL parent);
102     
103     /** (Abstract) Handle an I/O Exception (server problem) */
104     protected abstract void handleBadIO(URL url, URL parent);
105     
106     /** Return true if it's OK to visit the link,
107         false if it's not */

108     private boolean isOKtoVisit(URL link) {
109         // Return false if it's not HTTP protocol
110
if (!link.getProtocol().equals("http")) return(false);
111         // Return false if it's an external site
112
else if (isExternalSite(link)) return(false);
113         else if (visited.contains(link)) return(false);
114         else return(true);
115     }
116     
117     private boolean isExternalSite(URL link) {
118         // Return true if link host is different from base or
119
// if path of link is not a superset of base URL
120
if (link.getAuthority() != baseUrl.getAuthority() ||
121             (!UrlPathDir(link).startsWith(UrlPathDir(baseUrl)))) return(true);
122         else return(false);
123     }
124     
125     private String JavaDoc UrlPathDir(URL u) {
126         String JavaDoc p = u.getPath();
127         if (p == null || p.equals("")) return("/");
128         int i = p.lastIndexOf("/");
129         if (i == -1) return("/");
130         else p = p.substring(0,i+1);
131         return(p);
132     }
133     
134     // Populate a PageInfo object from a URL
135
private PageInfo getWebPage(URL url, URL parentUrl) throws IOException
136     {
137         HttpURLConnection connection = (HttpURLConnection)url.openConnection();
138         int responseCode = connection.getResponseCode();
139         String JavaDoc contentType = connection.getContentType();
140         // Note: contentLength == -1 if NOT KNOWN (i.e. not returned from server)
141
int contentLength = connection.getContentLength();
142         PageInfo p = new PageInfo(url,parentUrl,contentType,contentLength,responseCode);
143         InputStreamReader rdr =
144             new InputStreamReader(connection.getInputStream());
145         p.extract(rdr);
146         rdr.close();
147         connection.disconnect();
148         return(p);
149     }
150     
151     /** Get contents of a URL */
152     public byte[] getContent(URL url) {
153         byte[] buf = null;
154         try {
155             HttpURLConnection connection = (HttpURLConnection)url.openConnection();
156             int responseCode = connection.getResponseCode();
157             int contentLength = connection.getContentLength();
158             // System.out.println("Content length: "+contentLength);
159
if (responseCode != HttpURLConnection.HTTP_OK || contentLength <= 0) return(null);
160             InputStream in = connection.getInputStream();
161             BufferedInputStream bufIn = new BufferedInputStream(in);
162             buf = new byte[contentLength];
163             // Added code to handle blocked reads
164
int bytesToRead = contentLength;
165             int flag = 10;
166             while(bytesToRead != 0 && flag != 0) {
167                 int bytesRead = bufIn.read(buf,(contentLength-bytesToRead),bytesToRead);
168                 bytesToRead = bytesToRead - bytesRead;
169                 flag--;
170                 if (flag <= 5) sleep(1);
171             }
172             in.close();
173             connection.disconnect();
174             if (flag == 0) return(null);
175         }
176         catch(Exception JavaDoc e) {
177             // System.out.println(e);
178
// e.printStackTrace();
179
return(null);
180         }
181         
182         return(buf);
183     }
184         
185     /** Return base URL (starting point for Web traversal) */
186     public URL getBaseUrl() { return(baseUrl); }
187     
188     // Sleep N seconds
189
private void sleep(int n) {
190         if (n <= 0) return;
191         Thread JavaDoc mythread = Thread.currentThread();
192         try { mythread.sleep(n*1000); }
193         catch(InterruptedException JavaDoc e) { // Ignore
194
}
195     }
196     /**
197      * Returns delay (N second pause after processing EACH web page)
198      * @return int
199      */

200     public int getDelay() {
201         return delay;
202     }
203
204     /**
205      * Sets delay (N second pause after processing EACH web page)
206      * @param delay The delay to set
207      */

208     public void setDelay(int delay) {
209         this.delay = delay;
210     }
211
212 }
Popular Tags