// NoRobots - implements the Robot Exclusion Standard
//
// Copyright (C)1996,1998 by Jef Poskanzer <jef@acme.com>.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
// SUCH DAMAGE.

/**
 * Moved to the net.matuschek.spider package by Daniel Matuschek;
 * did some minimal modifications to use HttpTool for retrieval of robots.txt.
 */

package net.matuschek.spider;

import java.io.*;
import java.net.*;
import java.util.*;

import org.apache.log4j.Category;

import net.matuschek.http.*;

/**
 * Implements the Robot Exclusion Standard.
 * <p>
 * The basic idea of the Robot Exclusion Standard is that each web server
 * can set up a single file called "/robots.txt" which contains pathnames
 * that robots should not look at.
 * See <A HREF="http://www.robotstxt.org/wc/norobots.html">the full spec</A>
 * for details.
 * Using this class is very simple: you create the object using your robot's
 * name and the HttpTool used to retrieve the data, and then you call ok() on
 * each URL. For efficiency, the class caches entries for servers you've
 * visited recently.
 * <p>
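 * A minimal usage sketch (the robot name and URL are hypothetical; the
 * calls are ones declared in this class, and the HttpTool setup mirrors
 * the commented-out constructor code below):
 * <pre>
 *   HttpTool tool = new HttpTool();
 *   tool.setAgentName("MyRobot");
 *   NoRobots robots = new NoRobots("MyRobot", tool);
 *   if (robots.ok(new URL("http://example.com/private/index.html"))) {
 *       // allowed by /robots.txt - safe to fetch
 *   }
 *   robots.finish();
 * </pre>
 * <p>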
 * @author cn
 * @version 0.1
 */

public class NoRobots {

    Category log = Category.getInstance(getClass().getName());

    // The file with the robot rules in it.
    private static final String robotFile = "/robots.txt";

    // The name of this robot.
    private String robotName;

    // A table of all the servers we have visited recently.
    private Hashtable servers = new net.matuschek.util.LruHashtable(500);

    // Tool to get /robots.txt.
    private HttpTool httpTool;
    private boolean ignore = false;

    /**
     * Constructor.
     * @param robotName the name of the robot
     * @param inhttpTool the HttpTool instance for downloading the robotFile
     */
    public NoRobots(String robotName, HttpTool inhttpTool) {
        this.robotName = robotName;
        this.httpTool = inhttpTool;
        /*
        this.httpTool = new HttpTool();
        httpTool.setAgentName(inhttpTool.getAgentName());
        try {
            httpTool.setProxy(inhttpTool.getProxy());
        } catch (HttpException e) {
            // ignore
        }
        */
    }

    /**
     * Check whether it is ok for this robot to fetch this URL. Reads the
     * robots.txt file on the URL's host. If a robots.txt file is there and
     * it disallows the robot from retrieving the requested URL, then this
     * method returns false.
     * @param url the URL we want to retrieve
     * @return boolean true if allowed to retrieve the URL, false otherwise
     */

    public boolean ok(URL url) {
        // if ignore is set to true, then this check always returns true
        if (ignore) {
            return true;
        }

        String protocol = url.getProtocol();
        String host = url.getHost();
        int port = url.getPort();
        if (port == -1) {
            port = 80;
        }

        String file = url.getFile();

        Vector disallows = getDisallows(protocol, host, port);
        Enumeration en = disallows.elements();
        while (en.hasMoreElements()) {
            String pattern = (String) en.nextElement();
            if (file.startsWith(pattern))
                return false;
        }
        return true;
    }
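
    /*
     * Note on the check above: matching is by plain string prefix via
     * startsWith(), so a (hypothetical) disallow pattern "/private" blocks
     * both "/private/index.html" and "/private.html", while "/public/"
     * remains allowed.
     */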
    /**
     * Method getDisallows.
     * Get the disallows list for the given server. If it's not already in
     * the servers hash table, we fetch it, parse it, and save it.
     * @param protocol the URL protocol, e.g. "http"
     * @param host the server host name
     * @param port the server port
     * @return Vector the disallowed path prefixes for this robot
     */

    private Vector getDisallows(String protocol, String host, int port) {
        String key = protocol + "://" + host + ":" + port;
        Vector disallows = (Vector) servers.get(key);
        if (disallows != null)
            return disallows;

        disallows = new Vector();
        try {
            URL robotUrl = new URL(protocol, host, port, robotFile);
            try {
                // get document
                log.debug("Retrieving robot file '" + robotUrl + "'.");
                httpTool.setReferer("-");
                String robotsFile = "";
                try {
                    HttpDoc doc =
                        httpTool.retrieveDocument(
                            robotUrl,
                            HttpConstants.GET,
                            "");
                    // old source: if (doc.isOk()) {
                    if (doc != null && doc.isOk()) {
                        robotsFile = new String(doc.getContent());
                    }
                } catch (HttpException e) {
                    // ignore HTTP errors
                    log.info("Cannot read robots.txt: " + e.getMessage());
                }

                BufferedReader robotReader =
                    new BufferedReader(new StringReader(robotsFile));
                boolean userAgentIsMe = false;
                while (true) {
                    String line = robotReader.readLine();
                    if (line == null)
                        break;
                    line = line.trim();

                    // Completely ignore lines that are just a comment - they
                    // don't even separate records.
                    if (line.startsWith("#"))
                        continue;

                    // Trim off any other comments.
                    int cmt = line.indexOf('#');
                    if (cmt != -1)
                        line = line.substring(0, cmt).trim();

                    if (line.length() == 0)
                        userAgentIsMe = false;
                    else if (line.toLowerCase().startsWith("user-agent:")) {
                        if (!userAgentIsMe) {
                            String value = line.substring(11).trim();
                            if (match(value, robotName))
                                userAgentIsMe = true;
                        }
                    } else if (line.toLowerCase().startsWith("disallow:")) {
                        if (userAgentIsMe) {
                            String value = line.substring(9).trim();
                            disallows.addElement(value);
                        }
                    }
                }
            } catch (IOException ignore) {
            }
        } catch (MalformedURLException ignore) {
        }

        servers.put(key, disallows);
        return disallows;
    }
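
    /*
     * A worked example of the parse above, with hypothetical file contents:
     * for robotName "MyRobot", a /robots.txt of
     *
     *   User-agent: OtherBot
     *   Disallow: /stats/
     *
     *   User-agent: MyRobot
     *   Disallow: /tmp/
     *   Disallow: /cgi-bin/
     *
     * yields the vector ["/tmp/", "/cgi-bin/"]. The first record's
     * User-agent does not match, so its Disallow line is skipped; the blank
     * line then resets userAgentIsMe before the second record matches.
     */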

    /**
     * Method match.
     * Checks whether a string matches a given wildcard pattern.
     * Only does ? and *, and multiple patterns separated by |.
     * @param pattern the wildcard pattern
     * @param string the string to check against the pattern
     * @return boolean true if the string matches the pattern
     */

    protected static boolean match(String pattern, String string) {
        for (int p = 0;; ++p) {
            for (int s = 0;; ++p, ++s) {
                boolean sEnd = (s >= string.length());
                boolean pEnd =
                    (p >= pattern.length() || pattern.charAt(p) == '|');
                if (sEnd && pEnd)
                    return true;
                if (sEnd || pEnd)
                    break;
                if (pattern.charAt(p) == '?')
                    continue;
                if (pattern.charAt(p) == '*') {
                    int i;
                    ++p;
                    for (i = string.length(); i >= s; --i)
                        if (match(pattern.substring(p),
                            string.substring(i))) /* not quite right */
                            return true;
                    break;
                }
                if (pattern.charAt(p) != string.charAt(s))
                    break;
            }
            p = pattern.indexOf('|', p);
            if (p == -1)
                return false;
        }
    }
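
    /*
     * Illustrative calls with hypothetical patterns, following the
     * semantics implemented above:
     *
     *   match("web*", "webcrawler")         -> true  ('*' matches any tail)
     *   match("w?b*", "webcrawler")         -> true  ('?' matches one char)
     *   match("Mozilla|MyRobot", "MyRobot") -> true  ('|' separates alternatives)
     *   match("web*", "spider")             -> false
     */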

    /**
     * Method getIgnore.
     * Tells whether the robot exclusion standard is ignored.
     * @return boolean true if the check on robots.txt is not done
     */

    public boolean getIgnore() {
        return ignore;
    }

    /**
     * Method setIgnore.
     * Sets whether the robot exclusion standard is ignored.
     * @param ignore if true, the robot exclusion standard is ignored
     */

    public void setIgnore(boolean ignore) {
        this.ignore = ignore;
    }

    /**
     * This method finishes the HttpTool.
     */

    public void finish() {
        if (httpTool != null) {
            httpTool.finish();
        }
    }
}