package net.javacoding.jspider.core.task.work;

import net.javacoding.jspider.api.model.HTTPHeader;
import net.javacoding.jspider.api.model.Site;
import net.javacoding.jspider.core.SpiderContext;
import net.javacoding.jspider.core.logging.LogFactory;
import net.javacoding.jspider.core.event.CoreEvent;
import net.javacoding.jspider.core.event.impl.*;
import net.javacoding.jspider.core.task.WorkerTask;
import net.javacoding.jspider.core.util.http.HTTPHeaderUtil;

import java.io.*;
import java.net.*;

/**
 * Worker task that fetches the robots.txt URL it was constructed with and
 * reports the outcome through {@code notifyEvent} as one of
 * {@link RobotsTXTSpideredOkEvent}, {@link RobotsTXTUnexistingEvent} or
 * {@link RobotsTXTSpideredErrorEvent}.
 */
public class FetchRobotsTXTTaskImpl extends BaseWorkerTaskImpl {

    /** Size, in bytes, of the chunks used when draining the response stream. */
    private static final int READ_BUFFER_SIZE = 4096;

    /** Location of the resource to fetch. */
    protected URL url;

    /** Site the resource belongs to; supplies the user agent and is passed to the context. */
    protected Site site;

    /**
     * Creates the task.
     *
     * @param context spider context, handed to the base class and used for
     *                throttling, connection pre-handling and event construction
     * @param url     URL to fetch
     * @param site    site the URL belongs to
     */
    public FetchRobotsTXTTaskImpl(SpiderContext context, URL url, Site site) {
        super(context, WorkerTask.WORKERTASK_SPIDERTASK);
        this.url = url;
        this.site = site;
    }

    /**
     * Delegates to {@code context.throttle(site)}.
     */
    public void prepare() {
        context.throttle(site);
    }

    /**
     * Opens a connection to {@link #url}, reads the complete response body and
     * notifies exactly once (from the {@code finally} block) with the event
     * describing the result:
     * <ul>
     *   <li>status 200..302: {@link RobotsTXTSpideredOkEvent} carrying the body;</li>
     *   <li>status 400..499, or a {@link FileNotFoundException} (reported as
     *       404): {@link RobotsTXTUnexistingEvent};</li>
     *   <li>anything else, including any other exception:
     *       {@link RobotsTXTSpideredErrorEvent}.</li>
     * </ul>
     * A 301/302 status that is still visible after redirect following returns
     * early without creating an event.
     */
    public void execute() {

        CoreEvent event = null;
        URLConnection connection = null;
        InputStream inputStream = null;

        int httpStatus = 0;
        HTTPHeader[] headers = null;

        try {
            connection = url.openConnection();

            // Redirect handling only exists on HTTP connections. Guard the cast
            // the same way the status check below does, so a non-HTTP URL does
            // not die with a ClassCastException before anything is fetched.
            if (connection instanceof HttpURLConnection) {
                ((HttpURLConnection) connection).setInstanceFollowRedirects(true);
            }
            connection.setRequestProperty("User-agent", site.getUserAgent());
            context.preHandle(connection, site);

            long start = System.currentTimeMillis();
            connection.connect();

            if (connection instanceof HttpURLConnection) {
                httpStatus = ((HttpURLConnection) connection).getResponseCode();
                switch (httpStatus) {
                    case HttpURLConnection.HTTP_MOVED_PERM:
                    case HttpURLConnection.HTTP_MOVED_TEMP:
                        // NOTE(review): 'event' is still null here, so the finally
                        // block calls notifyEvent(url, null) -- confirm that
                        // notifyEvent is meant to receive null in this case.
                        return;
                    default:
                        break;
                }
            }

            inputStream = connection.getInputStream();
            byte[] content = readBestEffort(inputStream);

            String contentType = connection.getContentType();
            int size = connection.getContentLength();
            // Elapsed time covers connect, status retrieval and the body download.
            int timeMs = (int) (System.currentTimeMillis() - start);

            headers = HTTPHeaderUtil.getHeaders(connection);

            // NOTE(review): for HTTP connections getInputStream() appears to throw
            // on 4xx/5xx responses, which would send those to the catch blocks
            // below rather than to the two non-OK branches here -- confirm.
            if (httpStatus >= 200 && httpStatus < 303) {
                event = new RobotsTXTSpideredOkEvent(url, context, url, httpStatus, connection, contentType, timeMs, size, content, headers);
            } else if (httpStatus >= 400 && httpStatus < 500) {
                event = new RobotsTXTUnexistingEvent(url, context, url, httpStatus, connection, headers, null);
            } else {
                event = new RobotsTXTSpideredErrorEvent(url, context, url, httpStatus, connection, headers, null);
            }
        } catch (FileNotFoundException e) {
            headers = HTTPHeaderUtil.getHeaders(connection);
            event = new RobotsTXTUnexistingEvent(url, context, url, 404, connection, headers, e);
        } catch (Exception e) {
            event = new RobotsTXTSpideredErrorEvent(url, context, url, httpStatus, connection, headers, e);
        } finally {
            // Notify before closing: the event holds a reference to the connection.
            notifyEvent(url, event);
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException e) {
                    LogFactory.getLog(FetchRobotsTXTTaskImpl.class).error("i/o exception closing inputstream", e);
                }
            }
        }
    }

    /**
     * Drains the given stream into a byte array, reading in chunks.
     * An {@link IOException} raised while reading is logged and not rethrown;
     * the bytes collected up to that point are returned.
     *
     * @param in stream to read until end-of-stream; not closed by this method
     * @return the bytes read, possibly empty
     */
    private static byte[] readBestEffort(InputStream in) {
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] chunk = new byte[READ_BUFFER_SIZE];
        try {
            int count = in.read(chunk);
            while (count != -1) {
                collected.write(chunk, 0, count);
                count = in.read(chunk);
            }
        } catch (IOException e) {
            LogFactory.getLog(FetchRobotsTXTTaskImpl.class).error("i/o exception during fetch robots.txt", e);
        }
        return collected.toByteArray();
    }

}