package weblech.spider;

import weblech.util.Logger;
import weblech.util.Log4j;

import java.util.*;
import java.io.*;
import java.net.URL;

import org.apache.log4j.Category;

/**
 * Multi-threaded web spider. Worker threads (launched by start()) pull
 * URLToDownload entries off a shared DownloadQueue, fetch them with a
 * URLGetter, parse HTML/XML responses for further links, and queue any
 * new in-scope links back onto the queue. Progress can periodically be
 * checkpointed to "spider.checkpoint" on disk and resumed later via
 * readCheckpoint().
 */
public class Spider extends Logger implements Runnable, Constants
{
    /** Spider configuration: thread count, max depth, URL match string, checkpoint interval, etc. */
    private SpiderConfig config;

    /** Queue of URLs awaiting download; shared by all spider threads and used as its own lock. */
    private DownloadQueue queue;

    /** Every URL ever queued or fetched, used to suppress duplicates. Guarded by itself. */
    private Set urlsDownloadedOrScheduled;

    /**
     * URLs currently being fetched. Saved in checkpoints so that, on resume,
     * in-flight downloads are re-queued rather than lost. Guarded by itself.
     */
    private Set urlsDownloading;

    /**
     * Number of downloads in flight; lets an idle thread distinguish
     * "queue empty for now" from "all work finished".
     */
    private int downloadsInProgress;

    /** Stop flag set by stop(); volatile so worker threads reliably observe it. */
    private volatile boolean quit;

    /** Count of live spider threads. NOTE(review): updated without synchronization — see run(). */
    private int running;

    /** Time (millis) of the last checkpoint; 0 if none has been written yet. */
    private long lastCheckpoint;

    /**
     * Create a spider seeded with the configured start location at depth 0.
     *
     * @param config spider configuration; also handed to the download queue
     */
    public Spider(SpiderConfig config)
    {
        this.config = config;
        queue = new DownloadQueue(config);
        queue.queueURL(new URLToDownload(config.getStartLocation(), 0));
        urlsDownloadedOrScheduled = new HashSet();
        urlsDownloading = new HashSet();
        downloadsInProgress = 0;
        lastCheckpoint = 0;
    }

    /**
     * Launch the configured number of spider threads, each executing this
     * object's run() method.
     */
    public void start()
    {
        quit = false;
        running = 0;

        for(int i = 0; i < config.getSpiderThreads(); i++)
        {
            _logClass.info("Starting Spider thread");
            Thread t = new Thread(this, "Spider-Thread-" + (i + 1));
            t.start();
            running++;
        }
    }

    /** Ask all spider threads to stop after their current iteration. */
    public void stop()
    {
        quit = true;
    }

    /**
     * @return true once no spider threads remain.
     *         NOTE(review): despite the name this reports "has finished"
     *         (running == 0); kept as-is because existing callers poll it
     *         with these semantics.
     */
    public boolean isRunning()
    {
        return running == 0;
    }

    /**
     * Write a checkpoint if the configured interval has elapsed. The elapsed
     * check is repeated under the queue lock so that only one thread per
     * interval performs the (relatively slow) disk write.
     */
    private void checkpointIfNeeded()
    {
        if(config.getCheckpointInterval() == 0)
        {
            return; // checkpointing disabled
        }

        if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval())
        {
            synchronized(queue)
            {
                if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval())
                {
                    writeCheckpoint();
                    lastCheckpoint = System.currentTimeMillis();
                }
            }
        }
    }

    /**
     * Serialize the download queue and the in-flight URL set to
     * "spider.checkpoint", overwriting any previous checkpoint. IO failures
     * are logged but not propagated (checkpointing is best-effort).
     */
    private void writeCheckpoint()
    {
        _logClass.debug("writeCheckpoint()");
        ObjectOutputStream oos = null;
        try
        {
            FileOutputStream fos = new FileOutputStream("spider.checkpoint", false);
            oos = new ObjectOutputStream(fos);
            oos.writeObject(queue);
            oos.writeObject(urlsDownloading);
        }
        catch(IOException ioe)
        {
            _logClass.warn("IO Exception attempting checkpoint: " + ioe.getMessage(), ioe);
        }
        finally
        {
            // Always release the file handle, even if a write failed part-way.
            if(oos != null)
            {
                try
                {
                    oos.close();
                }
                catch(IOException ignored)
                {
                    // best-effort close; nothing useful to do here
                }
            }
        }
    }

    /**
     * Restore state from "spider.checkpoint": the saved queue is reloaded,
     * and any URLs that were mid-download at checkpoint time are re-queued.
     * Failures are logged and swallowed, leaving current state as intact as
     * possible.
     */
    public void readCheckpoint()
    {
        ObjectInputStream ois = null;
        try
        {
            FileInputStream fis = new FileInputStream("spider.checkpoint");
            ois = new ObjectInputStream(fis);
            queue = (DownloadQueue) ois.readObject();
            urlsDownloading = (Set) ois.readObject();
            queue.queueURLs(urlsDownloading);
            urlsDownloading.clear();
        }
        catch(Exception e)
        {
            _logClass.error("Caught exception reading checkpoint: " + e.getMessage(), e);
        }
        finally
        {
            // The original leaked this stream; close it once reading is done.
            if(ois != null)
            {
                try
                {
                    ois.close();
                }
                catch(IOException ignored)
                {
                    // best-effort close
                }
            }
        }
    }

    /**
     * Worker loop run by each spider thread. Repeats until the queue is
     * drained with nothing in flight, or until stop() is called: take the
     * next URL, download it, parse out links, filter them, and queue any
     * unseen links that are within the configured depth limit.
     */
    public void run()
    {
        HTMLParser htmlParser = new HTMLParser(config);
        URLGetter urlGetter = new URLGetter(config);

        while((queueSize() > 0 || downloadsInProgress > 0) && quit == false)
        {
            checkpointIfNeeded();
            if(queueSize() == 0 && downloadsInProgress > 0)
            {
                // Queue is momentarily empty but other threads may still
                // produce links; back off briefly and re-check.
                try
                {
                    Thread.sleep(QUEUE_CHECK_INTERVAL);
                }
                catch(InterruptedException ignored)
                {
                }
                continue;
            }
            else if(queueSize() == 0)
            {
                break;
            }
            URLToDownload nextURL;
            synchronized(queue)
            {
                nextURL = queue.getNextInQueue();
                downloadsInProgress++;
            }
            synchronized(urlsDownloading)
            {
                urlsDownloading.add(nextURL);
            }
            int newDepth = nextURL.getDepth() + 1;
            int maxDepth = config.getMaxDepth();
            // NOTE(review): the URL is removed from urlsDownloading before the
            // download below actually runs, so a checkpoint taken mid-download
            // will not re-queue it on resume. Kept as-is to preserve behavior.
            synchronized(urlsDownloading)
            {
                urlsDownloading.remove(nextURL);
            }
            List newURLs = downloadURL(nextURL, urlGetter, htmlParser);

            newURLs = filterURLs(newURLs);

            ArrayList u2dsToQueue = new ArrayList();
            for(Iterator i = newURLs.iterator(); i.hasNext(); )
            {
                URL u = (URL) i.next();
                synchronized(urlsDownloadedOrScheduled)
                {
                    // Queue only unseen URLs within the depth limit (0 = unlimited).
                    if(!urlsDownloadedOrScheduled.contains(u)
                        && (maxDepth == 0 || newDepth <= maxDepth))
                    {
                        u2dsToQueue.add(new URLToDownload(u, nextURL.getURL(), newDepth));
                        urlsDownloadedOrScheduled.add(u);
                    }
                }
            }
            synchronized(queue)
            {
                queue.queueURLs(u2dsToQueue);
                downloadsInProgress--;
            }
        }
        _logClass.info("Spider thread stopping");
        running--; // NOTE(review): unsynchronized decrement; racy if several threads exit at once
    }

    /** @return the current queue size, read under the queue lock */
    private int queueSize()
    {
        synchronized(queue)
        {
            return queue.size();
        }
    }

    /**
     * Fetch one URL, unless a copy already exists on disk and the config does
     * not ask for HTML/image refreshes. Newly fetched content is written to
     * disk; HTML/XML content is parsed for links.
     *
     * @param url        the URL to fetch, with its depth and referer
     * @param urlGetter  fetcher to use for the network request
     * @param htmlParser parser used to extract links from HTML/XML content
     * @return the links found in the document, or an empty list for images,
     *         failed downloads, and unsupported content types
     */
    private List downloadURL(URLToDownload url, URLGetter urlGetter, HTMLParser htmlParser)
    {
        _logClass.debug("downloadURL(" + url + ")");

        URLObject obj = new URLObject(url.getURL(), config);
        if(obj.existsOnDisk())
        {
            if(config.refreshHTMLs() && (obj.isHTML() || obj.isXML()))
            {
                _logClass.info("Q: [" + queue + "] " + url);
                obj = urlGetter.getURL(url);
            }
            else if(config.refreshImages() && obj.isImage())
            {
                _logClass.info("Q: [" + queue + "] " + url);
                obj = urlGetter.getURL(url);
            }
        }
        else
        {
            _logClass.info("Q: [" + queue + "] " + url);
            obj = urlGetter.getURL(url);
        }

        if(obj == null)
        {
            return new ArrayList(); // download failed; nothing to parse
        }

        if(!obj.existsOnDisk())
        {
            obj.writeToFile();
        }

        if(obj.isHTML() || obj.isXML())
        {
            return htmlParser.parseLinksInDocument(url.getURL(), obj.getStringContent());
        }
        else if(obj.isImage())
        {
            return new ArrayList(); // images contain no links to follow
        }
        else
        {
            _logClass.warn("Unsupported content type received: " + obj.getContentType());
            _logClass.info("URL was " + url);
            return new ArrayList();
        }
    }

    /**
     * Filter candidate links down to those not yet seen whose external form
     * contains the configured URL match string.
     *
     * @param URLs candidate java.net.URL objects
     * @return the subset worth scheduling for download
     */
    private List filterURLs(List URLs)
    {
        String match = config.getURLMatch();
        ArrayList retVal = new ArrayList();

        synchronized(urlsDownloadedOrScheduled)
        {
            for(Iterator i = URLs.iterator(); i.hasNext(); )
            {
                URL u = (URL) i.next();
                if(urlsDownloadedOrScheduled.contains(u))
                {
                    continue; // already fetched or queued
                }

                String s = u.toExternalForm();
                if(s.indexOf(match) != -1)
                {
                    retVal.add(u);
                }
            }
        }
        return retVal;
    }

}