KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > protocol > http > RobotRulesParser


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.protocol.http;
5
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.util.logging.Handler;
import java.util.logging.Level;
import java.util.logging.Logger;

import net.nutch.util.NutchConf;
import net.nutch.util.LogFormatter;
import net.nutch.protocol.ProtocolException;
26
27 /**
28  * This class handles the parsing of <code>robots.txt</code> files.
29  * It emits RobotRules objects, which describe the download permissions
30  * as described in RobotRulesParser.
31  *
32  * @author Tom Pierce
33  * @author Mike Cafarella
34  * @author Doug Cutting
35  */

36 public class RobotRulesParser {
37   public static final Logger JavaDoc LOG=
38     LogFormatter.getLogger("net.nutch.fetcher.RobotRulesParser");
39
40   private static final String JavaDoc[] AGENTS = getAgents();
41   private static final Hashtable JavaDoc CACHE = new Hashtable JavaDoc();
42   
43   private static final String JavaDoc CHARACTER_ENCODING= "UTF-8";
44   private static final int NO_PRECEDENCE= Integer.MAX_VALUE;
45     
46   private static final RobotRuleSet EMPTY_RULES= new RobotRuleSet();
47
48   private static RobotRuleSet FORBID_ALL_RULES =
49     new RobotRulesParser().getForbidAllRules();
50
51   private HashMap JavaDoc robotNames;
52
53   /**
54    * This class holds the rules which were parsed from a robots.txt
55    * file, and can test paths against those rules.
56    */

57   public static class RobotRuleSet {
58     ArrayList JavaDoc tmpEntries;
59     RobotsEntry[] entries;
60     long expireTime;
61
62     /**
63      */

64     private class RobotsEntry {
65       String JavaDoc prefix;
66       boolean allowed;
67
68       RobotsEntry(String JavaDoc prefix, boolean allowed) {
69         this.prefix= prefix;
70         this.allowed= allowed;
71       }
72     }
73
74     /**
75      * should not be instantiated from outside RobotRulesParser
76      */

77     private RobotRuleSet() {
78       tmpEntries= new ArrayList JavaDoc();
79       entries= null;
80     }
81
82     /**
83      */

84     private void addPrefix(String JavaDoc prefix, boolean allow) {
85       if (tmpEntries == null) {
86         tmpEntries= new ArrayList JavaDoc();
87         if (entries != null) {
88           for (int i= 0; i < entries.length; i++)
89             tmpEntries.add(entries[i]);
90         }
91         entries= null;
92       }
93
94       tmpEntries.add(new RobotsEntry(prefix, allow));
95     }
96
97     /**
98      */

99     private void clearPrefixes() {
100       if (tmpEntries == null) {
101         tmpEntries= new ArrayList JavaDoc();
102         entries= null;
103       } else {
104         tmpEntries.clear();
105       }
106     }
107
108     /**
109      * Change when the ruleset goes stale.
110      */

111     public void setExpireTime(long expireTime) {
112       this.expireTime = expireTime;
113     }
114
115     /**
116      * Get expire time
117      */

118     public long getExpireTime() {
119       return expireTime;
120     }
121
122     /**
123      * Returns <code>false</code> if the <code>robots.txt</code> file
124      * prohibits us from accessing the given <code>path</code>, or
125      * <code>true</code> otherwise.
126      */

127     public boolean isAllowed(String JavaDoc path) {
128       try {
129         path= URLDecoder.decode(path, CHARACTER_ENCODING);
130       } catch (Exception JavaDoc e) {
131         // just ignore it- we can still try to match
132
// path prefixes
133
}
134
135       if (entries == null) {
136         entries= new RobotsEntry[tmpEntries.size()];
137         entries= (RobotsEntry[])
138           tmpEntries.toArray(entries);
139         tmpEntries= null;
140       }
141
142       int pos= 0;
143       int end= entries.length;
144       while (pos < end) {
145         if (path.startsWith(entries[pos].prefix))
146           return entries[pos].allowed;
147         pos++;
148       }
149
150       return true;
151     }
152
153     /**
154      */

155     public String JavaDoc toString() {
156       isAllowed("x"); // force String[] representation
157
StringBuffer JavaDoc buf= new StringBuffer JavaDoc();
158       for (int i= 0; i < entries.length; i++)
159         if (entries[i].allowed)
160           buf.append("Allow: " + entries[i].prefix
161                      + System.getProperty("line.separator"));
162         else
163           buf.append("Disallow: " + entries[i].prefix
164                      + System.getProperty("line.separator"));
165       return buf.toString();
166     }
167   }
168
169
170   public RobotRulesParser() { this(AGENTS); }
171
172   private static String JavaDoc[] getAgents() {
173     //
174
// Grab the agent names we advertise to robots files.
175
//
176
String JavaDoc agentName = NutchConf.get("http.agent.name");
177     String JavaDoc agentNames = NutchConf.get("http.robots.agents");
178     StringTokenizer JavaDoc tok = new StringTokenizer JavaDoc(agentNames, ",");
179     ArrayList JavaDoc agents = new ArrayList JavaDoc();
180     while (tok.hasMoreTokens()) {
181       agents.add(tok.nextToken().trim());
182     }
183
184     //
185
// If there are no agents for robots-parsing, use our
186
// default agent-string. If both are present, our agent-string
187
// should be the first one we advertise to robots-parsing.
188
//
189
if (agents.size() == 0) {
190       agents.add(agentName);
191       LOG.severe("No agents listed in 'http.robots.agents' property!");
192     } else if (!((String JavaDoc)agents.get(0)).equalsIgnoreCase(agentName)) {
193       agents.add(0, agentName);
194       LOG.severe("Agent we advertise (" + agentName
195                  + ") not listed first in 'http.robots.agents' property!");
196     }
197
198     return (String JavaDoc[])agents.toArray(new String JavaDoc[agents.size()]);
199   }
200
201
202   /**
203    * Creates a new <code>RobotRulesParser</code> which will use the
204    * supplied <code>robotNames</code> when choosing which stanza to
205    * follow in <code>robots.txt</code> files. Any name in the array
206    * may be matched. The order of the <code>robotNames</code>
207    * determines the precedence- if many names are matched, only the
208    * rules associated with the robot name having the smallest index
209    * will be used.
210    */

211   public RobotRulesParser(String JavaDoc[] robotNames) {
212     this.robotNames= new HashMap JavaDoc();
213     for (int i= 0; i < robotNames.length; i++) {
214       this.robotNames.put(robotNames[i].toLowerCase(), new Integer JavaDoc(i));
215     }
216     // always make sure "*" is included
217
if (!this.robotNames.containsKey("*"))
218       this.robotNames.put("*", new Integer JavaDoc(robotNames.length));
219   }
220
221   /**
222    * Returns a {@link RobotRuleSet} object which encapsulates the
223    * rules parsed from the supplied <code>robotContent</code>.
224    */

225   RobotRuleSet parseRules(byte[] robotContent) {
226     if (robotContent == null)
227       return EMPTY_RULES;
228
229     String JavaDoc content= new String JavaDoc (robotContent);
230
231     StringTokenizer JavaDoc lineParser= new StringTokenizer JavaDoc(content, "\n\r");
232
233     RobotRuleSet bestRulesSoFar= null;
234     int bestPrecedenceSoFar= NO_PRECEDENCE;
235
236     RobotRuleSet currentRules= new RobotRuleSet();
237     int currentPrecedence= NO_PRECEDENCE;
238
239     boolean addRules= false; // in stanza for our robot
240
boolean doneAgents= false; // detect multiple agent lines
241

242     while (lineParser.hasMoreTokens()) {
243       String JavaDoc line= lineParser.nextToken();
244
245       // trim out comments and whitespace
246
int hashPos= line.indexOf("#");
247       if (hashPos >= 0)
248         line= line.substring(0, hashPos);
249       line= line.trim();
250
251       if ( (line.length() >= 11)
252            && (line.substring(0, 11).equalsIgnoreCase("User-agent:")) ) {
253
254         if (doneAgents) {
255           if (currentPrecedence < bestPrecedenceSoFar) {
256             bestPrecedenceSoFar= currentPrecedence;
257             bestRulesSoFar= currentRules;
258             currentPrecedence= NO_PRECEDENCE;
259             currentRules= new RobotRuleSet();
260           }
261           addRules= false;
262         }
263         doneAgents= false;
264
265         String JavaDoc agentNames= line.substring(line.indexOf(":") + 1);
266         agentNames= agentNames.trim();
267         StringTokenizer JavaDoc agentTokenizer= new StringTokenizer JavaDoc(agentNames);
268
269         while (agentTokenizer.hasMoreTokens()) {
270           // for each agent listed, see if it's us:
271
String JavaDoc agentName= agentTokenizer.nextToken().toLowerCase();
272
273           Integer JavaDoc precedenceInt= (Integer JavaDoc) robotNames.get(agentName);
274
275           if (precedenceInt != null) {
276             int precedence= precedenceInt.intValue();
277             if ( (precedence < currentPrecedence)
278                  && (precedence < bestPrecedenceSoFar) )
279               currentPrecedence= precedence;
280           }
281         }
282
283         if (currentPrecedence < bestPrecedenceSoFar)
284           addRules= true;
285
286       } else if ( (line.length() >= 9)
287                   && (line.substring(0, 9).equalsIgnoreCase("Disallow:")) ) {
288
289         doneAgents= true;
290         String JavaDoc path= line.substring(line.indexOf(":") + 1);
291         path= path.trim();
292         try {
293           path= URLDecoder.decode(path, CHARACTER_ENCODING);
294         } catch (Exception JavaDoc e) {
295           LOG.warning("error parsing robots rules- can't decode path: "
296                       + path);
297         }
298
299         if (path.length() == 0) { // "empty rule"
300
if (addRules)
301             currentRules.clearPrefixes();
302         } else { // rule with path
303
if (addRules)
304             currentRules.addPrefix(path, false);
305         }
306
307       } else if ( (line.length() >= 6)
308                   && (line.substring(0, 6).equalsIgnoreCase("Allow:")) ) {
309
310         doneAgents= true;
311         String JavaDoc path= line.substring(line.indexOf(":") + 1);
312         path= path.trim();
313
314         if (path.length() == 0) {
315           // "empty rule"- treat same as empty disallow
316
if (addRules)
317             currentRules.clearPrefixes();
318         } else { // rule with path
319
if (addRules)
320             currentRules.addPrefix(path, true);
321         }
322       }
323     }
324
325     if (currentPrecedence < bestPrecedenceSoFar) {
326       bestPrecedenceSoFar= currentPrecedence;
327       bestRulesSoFar= currentRules;
328     }
329
330     if (bestPrecedenceSoFar == NO_PRECEDENCE)
331       return EMPTY_RULES;
332     return bestRulesSoFar;
333   }
334
335   /**
336    * Returns a <code>RobotRuleSet</code> object appropriate for use
337    * when the <code>robots.txt</code> file is empty or missing; all
338    * requests are allowed.
339    */

340   static RobotRuleSet getEmptyRules() {
341     return EMPTY_RULES;
342   }
343
344   /**
345    * Returns a <code>RobotRuleSet</code> object appropriate for use
346    * when the <code>robots.txt</code> file is not fetched due to a
347    * <code>403/Forbidden</code> response; all requests are
348    * disallowed.
349    */

350   static RobotRuleSet getForbidAllRules() {
351     RobotRuleSet rules= new RobotRuleSet();
352     rules.addPrefix("", false);
353     return rules;
354   }
355   
356   public static boolean isAllowed(URL JavaDoc url)
357     throws ProtocolException, IOException JavaDoc {
358
359     String JavaDoc host = url.getHost();
360
361     RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host);
362
363     if (robotRules == null) { // cache miss
364
HttpResponse response = new HttpResponse(new URL JavaDoc(url, "/robots.txt"));
365
366       if (response.getCode() == 200) // found rules: parse them
367
robotRules = new RobotRulesParser().parseRules(response.getContent());
368       else if (response.getCode() == 403)
369         robotRules = FORBID_ALL_RULES; // use forbid all
370
else
371         robotRules = EMPTY_RULES; // use default rules
372

373       CACHE.put(host, robotRules); // cache rules for host
374
}
375
376     String JavaDoc path = url.getPath(); // check rules
377
if ((path == null) || "".equals(path)) {
378       path= "/";
379     }
380
381     return robotRules.isAllowed(path);
382   }
383
384   private final static int BUFSIZE= 2048;
385
386   /** command-line main for testing */
387   public static void main(String JavaDoc[] argv) {
388     if (argv.length != 3) {
389       System.out.println("Usage:");
390       System.out.println(" java <robots-file> <url-file> <agent-name>+");
391       System.out.println("");
392       System.out.println("The <robots-file> will be parsed as a robots.txt file,");
393       System.out.println("using the given <agent-name> to select rules. URLs ");
394       System.out.println("will be read (one per line) from <url-file>, and tested");
395       System.out.println("against the rules.");
396       System.exit(-1);
397     }
398     try {
399       FileInputStream JavaDoc robotsIn= new FileInputStream JavaDoc(argv[0]);
400       LineNumberReader JavaDoc testsIn= new LineNumberReader JavaDoc(new FileReader JavaDoc(argv[1]));
401       String JavaDoc[] robotNames= new String JavaDoc[argv.length - 1];
402
403       for (int i= 0; i < argv.length - 2; i++)
404         robotNames[i]= argv[i+2];
405
406       ArrayList JavaDoc bufs= new ArrayList JavaDoc();
407       byte[] buf= new byte[BUFSIZE];
408       int totBytes= 0;
409
410       int rsize= robotsIn.read(buf);
411       while (rsize >= 0) {
412         totBytes+= rsize;
413         if (rsize != BUFSIZE) {
414           byte[] tmp= new byte[rsize];
415           System.arraycopy(buf, 0, tmp, 0, rsize);
416           bufs.add(tmp);
417         } else {
418           bufs.add(buf);
419           buf= new byte[BUFSIZE];
420         }
421         rsize= robotsIn.read(buf);
422       }
423
424       byte[] robotsBytes= new byte[totBytes];
425       int pos= 0;
426
427       for (int i= 0; i < bufs.size(); i++) {
428         byte[] currBuf= (byte[]) bufs.get(i);
429         int currBufLen= currBuf.length;
430         System.arraycopy(currBuf, 0, robotsBytes, pos, currBufLen);
431         pos+= currBufLen;
432       }
433
434       RobotRulesParser parser=
435         new RobotRulesParser(robotNames);
436       RobotRuleSet rules= parser.parseRules(robotsBytes);
437       System.out.println("Rules:");
438       System.out.println(rules);
439       System.out.println();
440
441       String JavaDoc testPath= testsIn.readLine().trim();
442       while (testPath != null) {
443         System.out.println( (rules.isAllowed(testPath) ?
444                              "allowed" : "not allowed")
445                             + ":\t" + testPath);
446         testPath= testsIn.readLine();
447       }
448
449     } catch (Exception JavaDoc e) {
450       e.printStackTrace();
451     }
452   }
453
454 }
455
Popular Tags