1 package bplatt.spider; 2 3 26 27 import java.io.*; 28 import java.net.*; 29 import java.util.*; 30 31 public abstract class Arachnid { 32 private String base; 33 private URL baseUrl; 34 private HashSet visited; 35 private int delay; 36 private static final String HTML = "text/html"; 37 38 39 public Arachnid(String base) throws MalformedURLException { 40 this.base = base; 41 baseUrl = new URL(base); 42 visited = new HashSet(); 43 delay = 2; 44 } 45 46 47 public void traverse() { traverse(baseUrl,null); } 48 49 private void traverse(URL url, URL parent) 50 { 51 boolean isHTMLfile = true; 52 PageInfo p = null; 53 try { p = getWebPage(url,parent); } 54 catch(IOException e) { 55 handleBadIO(url,parent); 56 sleep(delay); 57 return; 58 } 59 if (p == null) { 60 handleBadLink(url,parent,null); 61 sleep(delay); 62 return; 63 } 64 if (p.isValid() == false) { 65 if (p.getContentType().equalsIgnoreCase(HTML) == false) 66 handleNonHTMLlink(url,parent,p); 67 else handleBadLink(url,parent,p); 68 sleep(delay); 69 return; 70 } 71 else handleLink(p); 72 73 URL[] links = p.getLinks(); 75 if (links == null) { 76 sleep(delay); 77 return; 78 } 79 int n = links.length; 80 for (int i=0; i<n; ++i) { 81 if (isOKtoVisit(links[i])) { 82 visited.add(links[i]); 83 traverse(links[i],url); 84 } 85 else if (isExternalSite(links[i])) handleExternalLink(links[i],url); 86 } 87 sleep(delay); 88 return; 89 } 90 91 92 protected abstract void handleBadLink(URL url,URL parent,PageInfo p); 93 94 95 protected abstract void handleLink(PageInfo p); 96 97 98 protected abstract void handleNonHTMLlink(URL url, URL parent, PageInfo p); 99 100 101 protected abstract void handleExternalLink(URL url, URL parent); 102 103 104 protected abstract void handleBadIO(URL url, URL parent); 105 106 108 private boolean isOKtoVisit(URL link) { 109 if (!link.getProtocol().equals("http")) return(false); 111 else if (isExternalSite(link)) return(false); 113 else if (visited.contains(link)) return(false); 114 else return(true); 115 } 116 117 private boolean isExternalSite(URL link) { 118 if (link.getAuthority() != baseUrl.getAuthority() || 121 (!UrlPathDir(link).startsWith(UrlPathDir(baseUrl)))) return(true); 122 else return(false); 123 } 124 125 private String UrlPathDir(URL u) { 126 String p = u.getPath(); 127 if (p == null || p.equals("")) return("/"); 128 int i = p.lastIndexOf("/"); 129 if (i == -1) return("/"); 130 else p = p.substring(0,i+1); 131 return(p); 132 } 133 134 private PageInfo getWebPage(URL url, URL parentUrl) throws IOException 136 { 137 HttpURLConnection connection = (HttpURLConnection)url.openConnection(); 138 int responseCode = connection.getResponseCode(); 139 String contentType = connection.getContentType(); 140 int contentLength = connection.getContentLength(); 142 PageInfo p = new PageInfo(url,parentUrl,contentType,contentLength,responseCode); 143 InputStreamReader rdr = 144 new InputStreamReader(connection.getInputStream()); 145 p.extract(rdr); 146 rdr.close(); 147 connection.disconnect(); 148 return(p); 149 } 150 151 152 public byte[] getContent(URL url) { 153 byte[] buf = null; 154 try { 155 HttpURLConnection connection = (HttpURLConnection)url.openConnection(); 156 int responseCode = connection.getResponseCode(); 157 int contentLength = connection.getContentLength(); 158 if (responseCode != HttpURLConnection.HTTP_OK || contentLength <= 0) return(null); 160 InputStream in = connection.getInputStream(); 161 BufferedInputStream bufIn = new BufferedInputStream(in); 162 buf = new byte[contentLength]; 163 int bytesToRead = contentLength; 165 int flag = 10; 166 while(bytesToRead != 0 && flag != 0) { 167 int bytesRead = bufIn.read(buf,(contentLength-bytesToRead),bytesToRead); 168 bytesToRead = bytesToRead - bytesRead; 169 flag--; 170 if (flag <= 5) sleep(1); 171 } 172 in.close(); 173 connection.disconnect(); 174 if (flag == 0) return(null); 175 } 176 catch(Exception e) { 177 return(null); 180 } 181 182 return(buf); 183 } 184 185 186 public URL getBaseUrl() { return(baseUrl); } 187 188 private void sleep(int n) { 190 if (n <= 0) return; 191 Thread mythread = Thread.currentThread(); 192 try { mythread.sleep(n*1000); } 193 catch(InterruptedException e) { } 195 } 196 200 public int getDelay() { 201 return delay; 202 } 203 204 208 public void setDelay(int delay) { 209 this.delay = delay; 210 } 211 212 } | Popular Tags |