package net.javacoding.jspider.core.task.work;


import net.javacoding.jspider.api.model.HTTPHeader;
import net.javacoding.jspider.api.model.Site;
import net.javacoding.jspider.core.SpiderContext;
import net.javacoding.jspider.core.logging.LogFactory;
import net.javacoding.jspider.core.event.CoreEvent;
import net.javacoding.jspider.core.event.impl.*;
import net.javacoding.jspider.core.task.WorkerTask;
import net.javacoding.jspider.core.util.http.HTTPHeaderUtil;
import net.javacoding.jspider.core.util.URLUtil;

import java.io.*;
import java.net.*;


/**
 * Worker task that fetches a single URL and reports the outcome as a
 * {@link CoreEvent} through {@code notifyEvent}.
 *
 * <p>The task opens a connection to the URL, reads the complete response body
 * into memory and then emits either a {@code URLSpideredOkEvent} or a
 * {@code URLSpideredErrorEvent}. HTTP redirects are not followed by the
 * connection itself; for 301/302 responses the redirect target is announced
 * with a {@code URLFoundEvent} instead.</p>
 */
public class SpiderHttpURLTask extends BaseWorkerTaskImpl {

    /** Size of the chunks used when copying the response body into memory. */
    private static final int READ_BUFFER_SIZE = 4096;

    /** The URL fetched by this task. */
    protected URL url;

    /** The site the URL belongs to; supplies the user agent and is passed to the context hooks. */
    protected Site site;


    /**
     * Creates a task that will fetch the given URL.
     *
     * @param context spider context used for throttling, pre/post handling and events
     * @param url     the URL to fetch
     * @param site    the site the URL belongs to
     */
    public SpiderHttpURLTask(SpiderContext context, URL url, Site site) {
        super(context, WorkerTask.WORKERTASK_SPIDERTASK);
        this.url = url;
        this.site = site;
    }

    /**
     * Asks the context to throttle access to the site before the fetch is executed.
     */
    public void prepare() {
        context.throttle(site);
    }

    /**
     * Fetches the URL and notifies exactly one "spidered" event describing the result.
     *
     * <ul>
     *   <li>For HTTP 301/302 responses a {@code URLFoundEvent} for the redirect
     *       target is notified first. The {@code Location} header is resolved
     *       against the requested URL so that relative redirects work; a missing
     *       or malformed header makes the fetch end in an error event.</li>
     *   <li>Status codes from 200 up to and including 302 yield a
     *       {@code URLSpideredOkEvent} carrying the body, its size, the content
     *       type and the elapsed time. Every other status (including the initial
     *       value 0, which is what non-HTTP connections keep) yields a
     *       {@code URLSpideredErrorEvent}.</li>
     *   <li>A {@link FileNotFoundException} is reported as status 404; any other
     *       exception is logged and reported with the status known at that point.</li>
     *   <li>An {@link IOException} while reading the body is logged and the bytes
     *       read so far are used (best effort).</li>
     * </ul>
     */
    public void execute() {

        CoreEvent event = null;
        URLConnection connection = null;

        InputStream inputStream = null;

        int httpStatus = 0;
        HTTPHeader[] headers = null;

        try {

            connection = url.openConnection();

            // Redirects are handled by the spider itself so that the target URL is
            // discovered (and filtered) like any other URL.
            if (connection instanceof HttpURLConnection) {
                ((HttpURLConnection) connection).setInstanceFollowRedirects(false);
            }

            connection.setRequestProperty("User-agent", site.getUserAgent());
            context.preHandle(connection, site);

            long start = System.currentTimeMillis();
            connection.connect();

            if (connection instanceof HttpURLConnection) {
                httpStatus = ((HttpURLConnection) connection).getResponseCode();
                switch (httpStatus) {
                    case HttpURLConnection.HTTP_MOVED_PERM:
                    case HttpURLConnection.HTTP_MOVED_TEMP:
                        String redirectURL = connection.getHeaderField("location");
                        if (redirectURL == null) {
                            // Reported through the generic error path below, but with a
                            // message that explains what was wrong with the response.
                            throw new MalformedURLException(
                                    "redirect (HTTP " + httpStatus + ") without Location header for " + url);
                        }
                        // Resolve against the request URL: servers commonly send relative
                        // Location values, which new URL(String) alone cannot parse.
                        URL redirectTarget = new URL(url, redirectURL);
                        notifyEvent(url, new URLFoundEvent(context, url, URLUtil.normalize(redirectTarget)));
                        break;
                    default:
                        break;
                }
            }

            inputStream = new BufferedInputStream(connection.getInputStream());

            ByteArrayOutputStream os = new ByteArrayOutputStream();
            int size = 0;
            try {
                byte[] buffer = new byte[READ_BUFFER_SIZE];
                int read = inputStream.read(buffer);
                while (read != -1) {
                    os.write(buffer, 0, read);
                    size += read;
                    read = inputStream.read(buffer);
                }
            } catch (IOException e) {
                // Best effort: keep whatever was read before the failure.
                LogFactory.getLog(SpiderHttpURLTask.class).error("i/o exception during fetch", e);
            }

            String contentType = connection.getContentType();
            int timeMs = (int) (System.currentTimeMillis() - start);

            headers = HTTPHeaderUtil.getHeaders(connection);

            if (httpStatus >= 200 && httpStatus < 303) {
                event = new URLSpideredOkEvent(context, url, httpStatus, connection, contentType, timeMs, size, os.toByteArray(), headers);
            } else {
                event = new URLSpideredErrorEvent(context, url, httpStatus, connection, headers, null);
            }

            context.postHandle(connection, site);

        } catch (FileNotFoundException e) {
            headers = HTTPHeaderUtil.getHeaders(connection);
            event = new URLSpideredErrorEvent(context, url, 404, connection, headers, e);
        } catch (Exception e) {
            LogFactory.getLog(this.getClass()).error("exception during spidering", e);
            event = new URLSpideredErrorEvent(context, url, httpStatus, connection, headers, e);
        } finally {
            try {
                notifyEvent(url, event);
            } finally {
                // Close the stream even when event notification fails.
                if (inputStream != null) {
                    try {
                        inputStream.close();
                    } catch (IOException e) {
                        LogFactory.getLog(SpiderHttpURLTask.class).error("i/o exception closing inputstream", e);
                    }
                }
            }
        }
    }

}