KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > javacoding > jspider > core > task > work > FetchRobotsTXTTaskImpl


1 package net.javacoding.jspider.core.task.work;
2
3 import net.javacoding.jspider.api.model.HTTPHeader;
4 import net.javacoding.jspider.api.model.Site;
5 import net.javacoding.jspider.core.SpiderContext;
6 import net.javacoding.jspider.core.logging.LogFactory;
7 import net.javacoding.jspider.core.event.CoreEvent;
8 import net.javacoding.jspider.core.event.impl.*;
9 import net.javacoding.jspider.core.task.WorkerTask;
10 import net.javacoding.jspider.core.util.http.HTTPHeaderUtil;
11
12 import java.io.*;
13 import java.net.*;
14
15 /**
16  * $Id: FetchRobotsTXTTaskImpl.java,v 1.19 2003/04/25 21:29:05 vanrogu Exp $
17  */

18 public class FetchRobotsTXTTaskImpl extends BaseWorkerTaskImpl {
19
20     protected URL url;
21     protected Site site;
22
23     public FetchRobotsTXTTaskImpl(SpiderContext context, URL url, Site site) {
24         super(context, WorkerTask.WORKERTASK_SPIDERTASK);
25         this.url = url;
26         this.site = site;
27     }
28
29     public void prepare() {
30         context.throttle(site);
31     }
32
33     public void execute() {
34
35         CoreEvent event = null;
36         URLConnection connection = null;
37
38         InputStream inputStream = null;
39
40         int httpStatus = 0;
41         HTTPHeader[] headers = null;
42
43         try {
44             connection = url.openConnection();
45
46             // RFC states that redirects should be followed.
47
// see: http://www.robotstxt.org/wc/norobots-rfc.txt
48
((HttpURLConnection) connection).setInstanceFollowRedirects(true);
49             connection.setRequestProperty("User-agent", site.getUserAgent() );
50             context.preHandle(connection, site);
51
52             long start = System.currentTimeMillis();
53             connection.connect();
54
55             if (connection instanceof HttpURLConnection) {
56                 httpStatus = ((HttpURLConnection) connection).getResponseCode();
57                 switch (httpStatus) {
58                     case HttpURLConnection.HTTP_MOVED_PERM:
59                     case HttpURLConnection.HTTP_MOVED_TEMP:
60                         return;
61                     default:
62                         break;
63                 }
64             }
65             inputStream = new BufferedInputStream(connection.getInputStream());
66
67             ByteArrayOutputStream os = new ByteArrayOutputStream();
68             InputStream is = new BufferedInputStream(inputStream);
69             try {
70                     int i = is.read();
71                     while (i != -1) {
72                         os.write(i);
73                         i = is.read();
74                     }
75             } catch (IOException e) {
76                 LogFactory.getLog(FetchRobotsTXTTaskImpl.class).error("i/o exception during fetch robots.txt",e);
77             }
78             String JavaDoc contentType = connection.getContentType();
79             int size = connection.getContentLength();
80             int timeMs = (int) (System.currentTimeMillis() - start);
81
82             headers = HTTPHeaderUtil.getHeaders(connection);
83
84             if (httpStatus >= 200 && httpStatus < 303) {
85                 event = new RobotsTXTSpideredOkEvent(url,context, url, httpStatus, connection, contentType, timeMs, size, os.toByteArray(), headers);
86             } else if (httpStatus >= 400 && httpStatus < 500) {
87                 event = new RobotsTXTUnexistingEvent(url,context, url, httpStatus, connection, headers, null);
88             } else {
89                 event = new RobotsTXTSpideredErrorEvent(url,context, url, httpStatus, connection, headers, null);
90             }
91         } catch (FileNotFoundException e) {
92             headers = HTTPHeaderUtil.getHeaders(connection);
93             event = new RobotsTXTUnexistingEvent(url,context, url, 404, connection, headers, e);
94         } catch (Exception JavaDoc e) {
95             event = new RobotsTXTSpideredErrorEvent(url,context, url, httpStatus, connection, headers, e);
96         } finally {
97             notifyEvent(url, event);
98             if (inputStream != null) {
99                 try {
100                     inputStream.close();
101                 } catch (IOException e) {
102                     LogFactory.getLog(FetchRobotsTXTTaskImpl.class).error("i/o exception closing inputstream",e);
103                 }
104             }
105         }
106     }
107
108 }
109
Popular Tags