Arachnid


1   package bplatt.spider;
2   
3   /** Arachnid - Abstract Web spider class
4    * To use, derive a class from Arachnid,
5    * implement the handleLink(), handleBadLink(), handleNonHTMLlink(),
6    * handleExternalLink(), and handleBadIO() methods,
7    * then instantiate it and call traverse().
8    * 
9    * Copyright 2002, Robert L. Platt, All rights reserved
10   * @author Robert L. Platt 
11   * 
12   * This program is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU General Public License as published by
14   * the Free Software Foundation; either version 2 of the License, or
15   * (at your option) any later version.
16   *
17   * This program is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU General Public License for more details.
21   *
22   * You should have received a copy of the GNU General Public License
23   * along with this program; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26   
27  import java.io.*;
28  import java.net.*;
29  import java.util.*;
30  
31  public abstract class Arachnid {
32      private String   base;
33      private URL baseUrl;
34      private HashSet visited;
35      private int delay;
36      private static final String   HTML = "text/html";
37      
38      /** Constructor */
39      public Arachnid(String   base) throws MalformedURLException {
40          this.base = base;
41          baseUrl = new URL(base);
42          visited = new HashSet();
43          delay = 2;
44      }
45      
46      /** Traverse Web site */
47      public void traverse() { traverse(baseUrl,null); }
48      
49      private void traverse(URL url, URL parent)
50      {
51          boolean isHTMLfile = true;
52          PageInfo p = null;
53          try { p = getWebPage(url,parent); }
54          catch(IOException e) {
55              handleBadIO(url,parent);
56              sleep(delay);
57              return;
58          }
59          if (p == null) {
60              handleBadLink(url,parent,null);
61              sleep(delay);
62              return;
63          }
64          if (p.isValid() == false) {
65              if (p.getContentType().equalsIgnoreCase(HTML) == false) 
66                  handleNonHTMLlink(url,parent,p);
67              else handleBadLink(url,parent,p);
68              sleep(delay);
69              return;
70          }
71          else handleLink(p);
72          
73          // Navigate through links on page
74          URL[] links = p.getLinks();
75          if (links == null) {
76              sleep(delay);
77              return;
78          }
79          int n = links.length;
80          for (int i=0; i<n; ++i) {
81              if (isOKtoVisit(links[i])) {
82                  visited.add(links[i]);
83                  traverse(links[i],url);
84              }
85              else if (isExternalSite(links[i])) handleExternalLink(links[i],url);
86          }
87          sleep(delay);
88          return;
89      }
90      
91      /** (Abstract) Handle bad URL */
92      protected abstract void handleBadLink(URL url,URL parent,PageInfo p);
93      
94      /** (Abstract) Handle a link; a Web page in the site */
95      protected abstract void handleLink(PageInfo p);
96      
97      /** (Abstract) Handle a non-HTML link */
98      protected abstract void handleNonHTMLlink(URL url, URL parent, PageInfo p);
99      
100     /** (Abstract) Handle an external (outside of Web site) link */
101     protected abstract void handleExternalLink(URL url, URL parent);
102     
103     /** (Abstract) Handle an I/O Exception (server problem) */
104     protected abstract void handleBadIO(URL url, URL parent);
105     
106     /** Return true if it's OK to visit the link,
107         false if it's not */
108     private boolean isOKtoVisit(URL link) {
109         // Return false if it's not HTTP protocol
110         if (!link.getProtocol().equals("http")) return(false);
111         // Return false if it's an external site
112         else if (isExternalSite(link)) return(false);
113         else if (visited.contains(link)) return(false);
114         else return(true);
115     }
116     
117     private boolean isExternalSite(URL link) {
118         // Return true if link host is different from base or
119         // if path of link is not a superset of base URL
120         if (link.getAuthority() != baseUrl.getAuthority() ||
121             (!UrlPathDir(link).startsWith(UrlPathDir(baseUrl)))) return(true);
122         else return(false);
123     }
124     
125     private String   UrlPathDir(URL u) {
126         String   p = u.getPath();
127         if (p == null || p.equals("")) return("/");
128         int i = p.lastIndexOf("/");
129         if (i == -1) return("/");
130         else p = p.substring(0,i+1);
131         return(p);
132     }
133     
134     // Populate a PageInfo object from a URL
135     private PageInfo getWebPage(URL url, URL parentUrl) throws IOException
136     {
137         HttpURLConnection connection = (HttpURLConnection)url.openConnection();
138         int responseCode = connection.getResponseCode();
139         String   contentType = connection.getContentType();
140         // Note: contentLength == -1 if NOT KNOWN (i.e. not returned from server)
141         int contentLength = connection.getContentLength();  
142         PageInfo p = new PageInfo(url,parentUrl,contentType,contentLength,responseCode);
143         InputStreamReader rdr =
144             new InputStreamReader(connection.getInputStream());
145         p.extract(rdr);
146         rdr.close();
147         connection.disconnect();
148         return(p);
149     }
150     
151     /** Get contents of a URL */
152     public byte[] getContent(URL url) {
153         byte[] buf = null;
154         try { 
155             HttpURLConnection connection = (HttpURLConnection)url.openConnection();
156             int responseCode = connection.getResponseCode();
157             int contentLength = connection.getContentLength();
158             // System.out.println("Content length: "+contentLength);
159             if (responseCode != HttpURLConnection.HTTP_OK || contentLength <= 0) return(null);
160             InputStream in = connection.getInputStream();
161             BufferedInputStream bufIn = new BufferedInputStream(in);
162             buf = new byte[contentLength];
163             // Added code to handle blocked reads
164             int bytesToRead = contentLength;
165             int flag = 10;
166             while(bytesToRead != 0 && flag != 0) {
167                 int bytesRead = bufIn.read(buf,(contentLength-bytesToRead),bytesToRead);
168                 bytesToRead = bytesToRead - bytesRead;
169                 flag--;
170                 if (flag <= 5) sleep(1);
171             }
172             in.close();
173             connection.disconnect();
174             if (flag == 0) return(null);
175         }
176         catch(Exception   e) {
177             // System.out.println(e);
178             // e.printStackTrace();
179             return(null);
180         }
181         
182         return(buf);
183     }
184         
185     /** Return base URL (starting point for Web traversal) */
186     public URL getBaseUrl() { return(baseUrl); }    
187     
188     // Sleep N seconds
189     private void sleep(int n) {
190         if (n <= 0) return;
191         Thread   mythread = Thread.currentThread();
192         try { mythread.sleep(n*1000); }
193         catch(InterruptedException   e) { // Ignore
194         }
195     }
196     /**
197      * Returns delay (N second pause after processing EACH web page)
198      * @return int
199      */
200     public int getDelay() {
201         return delay;
202     }
203 
204     /**
205      * Sets delay (N second pause after processing EACH web page)
206      * @param delay The delay to set
207      */
208     public void setDelay(int delay) {
209         this.delay = delay;
210     }
211 
212 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags