package net.nutch.protocol.http;

import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.LineNumberReader;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.StringTokenizer;

import java.util.logging.Logger;
import java.util.logging.Level;
import java.util.logging.Handler;

import net.nutch.util.NutchConf;
import net.nutch.util.LogFormatter;
import net.nutch.protocol.ProtocolException;

/**
 * Parses the content of a robots.txt file and answers whether a given path
 * may be fetched.
 *
 * <p>A parser is configured with a list of robot names ordered by
 * precedence.  When a robots.txt file contains several
 * <code>User-agent:</code> sections, the rules of the section that names the
 * highest-precedence robot are the ones returned by
 * {@link #parseRules(byte[])}.  The wildcard name <code>"*"</code> is always
 * known to the parser; unless it is listed explicitly it gets the lowest
 * precedence.</p>
 *
 * <p>The static {@link #isAllowed(URL)} entry point fetches
 * <code>/robots.txt</code> for the URL's host, parses it with the agents
 * configured through <code>NutchConf</code>, and caches the resulting rule
 * set per host.</p>
 */
public class RobotRulesParser {
  public static final Logger LOG=
    LogFormatter.getLogger("net.nutch.fetcher.RobotRulesParser");

  /** Robot names from the configuration, highest precedence first. */
  private static final String[] AGENTS = getAgents();

  /**
   * Parsed rule sets keyed by host name.
   * NOTE(review): the key is the host only, so different ports or protocols
   * on the same host share one entry, and entries are never evicted
   * (expireTime is not consulted here) -- confirm this is intended.
   */
  private static final Hashtable CACHE = new Hashtable();

  private static final String CHARACTER_ENCODING= "UTF-8";

  /** Precedence value meaning "no configured robot name matched". */
  private static final int NO_PRECEDENCE= Integer.MAX_VALUE;

  /** Shared rule set without any entries; it allows every path. */
  private static final RobotRuleSet EMPTY_RULES= new RobotRuleSet();

  /** Shared rule set whose single empty prefix disallows every path. */
  private static final RobotRuleSet FORBID_ALL_RULES = getForbidAllRules();

  /** Maps a lower-cased robot name to its precedence (an Integer). */
  private HashMap robotNames;

  /**
   * An ordered list of allow/disallow path prefixes.  The first entry whose
   * prefix matches a path decides whether that path is allowed.
   *
   * <p>NOTE(review): entries are converted lazily from a list to an array on
   * the first read, so a read mutates the instance.  Instances are shared
   * through static fields and the host cache; if callers are multi-threaded
   * this needs external synchronization -- confirm against the callers.</p>
   */
  public static class RobotRuleSet {
    /** Entries while the set is being built; null once converted. */
    ArrayList tmpEntries;
    /** Entries in array form for matching; null while being built. */
    RobotsEntry[] entries;
    long expireTime;

    /** One path prefix together with its allow/disallow verdict. */
    private static class RobotsEntry {
      String prefix;
      boolean allowed;

      RobotsEntry(String prefix, boolean allowed) {
        this.prefix= prefix;
        this.allowed= allowed;
      }
    }

    /** Creates a rule set with no entries. */
    private RobotRuleSet() {
      tmpEntries= new ArrayList();
      entries= null;
    }

    /**
     * Appends a prefix rule.  If the entries were already converted to an
     * array they are moved back into the list form first.
     */
    private void addPrefix(String prefix, boolean allow) {
      if (tmpEntries == null) {
        tmpEntries= new ArrayList();
        if (entries != null) {
          for (int i= 0; i < entries.length; i++)
            tmpEntries.add(entries[i]);
        }
        entries= null;
      }

      tmpEntries.add(new RobotsEntry(prefix, allow));
    }

    /** Removes every prefix rule from this set. */
    private void clearPrefixes() {
      if (tmpEntries == null) {
        tmpEntries= new ArrayList();
        entries= null;
      } else {
        tmpEntries.clear();
      }
    }

    /** Converts the list form of the entries into the array form, once. */
    private void ensureEntries() {
      if (entries == null) {
        entries= (RobotsEntry[])
          tmpEntries.toArray(new RobotsEntry[tmpEntries.size()]);
        tmpEntries= null;
      }
    }

    /** Stores the expiry time; the value is not interpreted by this class. */
    public void setExpireTime(long expireTime) {
      this.expireTime = expireTime;
    }

    /** Returns the value last passed to {@link #setExpireTime(long)}. */
    public long getExpireTime() {
      return expireTime;
    }

    /**
     * Tells whether <code>path</code> may be fetched under these rules.
     *
     * @param path the URL path to test; it is percent-decoded before being
     *             compared against the rule prefixes
     * @return the verdict of the first rule whose prefix starts the decoded
     *         path, or <code>true</code> when no rule matches
     */
    public boolean isAllowed(String path) {
      try {
        path= URLDecoder.decode(path, CHARACTER_ENCODING);
      } catch (Exception ignored) {
        // Best effort: a path that cannot be decoded (for example one with a
        // malformed escape) is matched in its raw form instead.
      }

      ensureEntries();

      for (int i= 0; i < entries.length; i++) {
        if (path.startsWith(entries[i].prefix))
          return entries[i].allowed;
      }

      return true;
    }

    /**
     * Renders the rules one per line as <code>Allow: prefix</code> or
     * <code>Disallow: prefix</code>, in matching order.
     */
    public String toString() {
      ensureEntries();
      String eol= System.getProperty("line.separator");
      StringBuffer buf= new StringBuffer();
      for (int i= 0; i < entries.length; i++) {
        buf.append(entries[i].allowed ? "Allow: " : "Disallow: ");
        buf.append(entries[i].prefix);
        buf.append(eol);
      }
      return buf.toString();
    }
  }


  /** Creates a parser for the robot names found in the configuration. */
  public RobotRulesParser() { this(AGENTS); }

  /**
   * Builds the robot name list from the <code>http.agent.name</code> and
   * <code>http.robots.agents</code> properties.  The advertised agent name is
   * forced into the first position, with a severe log message, when the
   * comma-separated list does not already start with it.
   */
  private static String[] getAgents() {
    String agentName = NutchConf.get("http.agent.name");
    String agentNames = NutchConf.get("http.robots.agents");

    ArrayList agents = new ArrayList();
    // Guard against a null property value: StringTokenizer would throw.
    if (agentNames != null) {
      StringTokenizer tok = new StringTokenizer(agentNames, ",");
      while (tok.hasMoreTokens()) {
        agents.add(tok.nextToken().trim());
      }
    }

    if (agentName == null) {
      // A null name in the list would make every parser construction fail,
      // so it is reported and left out instead.
      LOG.severe("No agent name set in 'http.agent.name' property!");
    } else if (agents.size() == 0) {
      agents.add(agentName);
      LOG.severe("No agents listed in 'http.robots.agents' property!");
    } else if (!((String)agents.get(0)).equalsIgnoreCase(agentName)) {
      agents.add(0, agentName);
      LOG.severe("Agent we advertise (" + agentName
                 + ") not listed first in 'http.robots.agents' property!");
    }

    return (String[])agents.toArray(new String[agents.size()]);
  }


  /**
   * Creates a parser for the given robot names.
   *
   * @param robotNames names this robot answers to, highest precedence first;
   *                   matching is case-insensitive.  When a name occurs more
   *                   than once its first (highest-precedence) position
   *                   wins.  The wildcard <code>"*"</code> is added with the
   *                   lowest precedence if it is not listed.
   */
  public RobotRulesParser(String[] robotNames) {
    this.robotNames= new HashMap();
    for (int i= 0; i < robotNames.length; i++) {
      String name= robotNames[i].toLowerCase();
      // Keep the first occurrence: overwriting would demote a name that is
      // listed again further down the list.
      if (!this.robotNames.containsKey(name))
        this.robotNames.put(name, new Integer(i));
    }
    if (!this.robotNames.containsKey("*"))
      this.robotNames.put("*", new Integer(robotNames.length));
  }

  /**
   * Parses robots.txt content and returns the rules of the section that
   * applies to the highest-precedence robot name known to this parser.
   *
   * @param robotContent raw bytes of the robots.txt file, decoded as UTF-8;
   *                     may be <code>null</code>
   * @return the selected rules, or the shared empty rule set when the
   *         content is <code>null</code> or no section names a known robot
   */
  RobotRuleSet parseRules(byte[] robotContent) {
    if (robotContent == null)
      return EMPTY_RULES;

    String content;
    try {
      // Use a fixed charset so the result does not depend on the platform.
      content= new String(robotContent, CHARACTER_ENCODING);
    } catch (UnsupportedEncodingException e) {
      content= new String(robotContent);
    }

    StringTokenizer lineParser= new StringTokenizer(content, "\n\r");

    RobotRuleSet bestRulesSoFar= null;
    int bestPrecedenceSoFar= NO_PRECEDENCE;

    RobotRuleSet currentRules= new RobotRuleSet();
    int currentPrecedence= NO_PRECEDENCE;

    boolean addRules= false;    // true while the current section applies to us
    boolean doneAgents= false;  // true once a rule line followed the agents

    while (lineParser.hasMoreTokens()) {
      String line= lineParser.nextToken();

      // Strip trailing comments.
      int hashPos= line.indexOf("#");
      if (hashPos >= 0)
        line= line.substring(0, hashPos);
      line= line.trim();

      if (hasDirective(line, "User-agent:")) {

        if (doneAgents) {
          // A new section starts: keep the finished one if it is the best.
          if (currentPrecedence < bestPrecedenceSoFar) {
            bestPrecedenceSoFar= currentPrecedence;
            bestRulesSoFar= currentRules;
            currentPrecedence= NO_PRECEDENCE;
            currentRules= new RobotRuleSet();
          }
          addRules= false;
        }
        doneAgents= false;

        StringTokenizer agentTokenizer=
          new StringTokenizer(directiveValue(line));

        while (agentTokenizer.hasMoreTokens()) {
          String agentName= agentTokenizer.nextToken().toLowerCase();

          Integer precedenceInt= (Integer) robotNames.get(agentName);

          if (precedenceInt != null) {
            int precedence= precedenceInt.intValue();
            if ( (precedence < currentPrecedence)
                 && (precedence < bestPrecedenceSoFar) )
              currentPrecedence= precedence;
          }
        }

        if (currentPrecedence < bestPrecedenceSoFar)
          addRules= true;

      } else if (hasDirective(line, "Disallow:")) {

        doneAgents= true;
        String path= decodePath(directiveValue(line));
        if (addRules)
          applyRule(currentRules, path, false);

      } else if (hasDirective(line, "Allow:")) {

        doneAgents= true;
        // Decoded like Disallow paths, because the tested path is decoded
        // too; an escaped Allow prefix could otherwise never match.
        String path= decodePath(directiveValue(line));
        if (addRules)
          applyRule(currentRules, path, true);
      }
    }

    if (currentPrecedence < bestPrecedenceSoFar) {
      bestPrecedenceSoFar= currentPrecedence;
      bestRulesSoFar= currentRules;
    }

    if (bestPrecedenceSoFar == NO_PRECEDENCE)
      return EMPTY_RULES;
    return bestRulesSoFar;
  }

  /** Tells whether <code>line</code> starts with the directive, ignoring case. */
  private static boolean hasDirective(String line, String directive) {
    return line.regionMatches(true, 0, directive, 0, directive.length());
  }

  /** Returns the trimmed text after the first colon of a directive line. */
  private static String directiveValue(String line) {
    return line.substring(line.indexOf(":") + 1).trim();
  }

  /**
   * Percent-decodes a rule path; on failure a warning is logged and the raw
   * path is returned.
   */
  private static String decodePath(String path) {
    try {
      return URLDecoder.decode(path, CHARACTER_ENCODING);
    } catch (Exception e) {
      LOG.warning("error parsing robots rules- can't decode path: "
                  + path);
      return path;
    }
  }

  /** An empty path clears the rules collected so far; others add a prefix. */
  private static void applyRule(RobotRuleSet rules, String path,
                                boolean allow) {
    if (path.length() == 0)
      rules.clearPrefixes();
    else
      rules.addPrefix(path, allow);
  }

  /**
   * Returns the shared rule set that has no entries and therefore allows
   * every path.
   */
  static RobotRuleSet getEmptyRules() {
    return EMPTY_RULES;
  }

  /**
   * Returns a new rule set that disallows every path (its only entry is the
   * empty prefix, which starts every path).
   */
  static RobotRuleSet getForbidAllRules() {
    RobotRuleSet rules= new RobotRuleSet();
    rules.addPrefix("", false);
    return rules;
  }

  /**
   * Tells whether <code>url</code> may be fetched according to the
   * robots.txt of its host.  The rules are fetched on first use and cached
   * per host: response code 200 is parsed, 403 forbids everything, and any
   * other code allows everything.
   *
   * @param url the URL to test; an empty path is treated as "/"
   * @return <code>true</code> if the rules allow the URL's path
   * @throws ProtocolException declared for the robots.txt fetch
   * @throws IOException declared for the robots.txt fetch
   */
  public static boolean isAllowed(URL url)
    throws ProtocolException, IOException {

    String host = url.getHost();

    RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host);

    if (robotRules == null) {
      HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
      int code = response.getCode();

      if (code == 200)
        robotRules = new RobotRulesParser().parseRules(response.getContent());
      else if (code == 403)
        robotRules = FORBID_ALL_RULES;
      else
        robotRules = EMPTY_RULES;

      CACHE.put(host, robotRules);
    }

    String path = url.getPath();
    if ((path == null) || "".equals(path)) {
      path= "/";
    }

    return robotRules.isAllowed(path);
  }

  private final static int BUFSIZE= 2048;

  /**
   * Command-line test driver: parses a robots.txt file for the given agent
   * names, prints the selected rules, then prints the verdict for every
   * line of the URL file.
   */
  public static void main(String[] argv) {
    // The usage allows one or more agent names, so only too few is an error.
    if (argv.length < 3) {
      System.out.println("Usage:");
      System.out.println("   java <robots-file> <url-file> <agent-name>+");
      System.out.println("");
      System.out.println("The <robots-file> will be parsed as a robots.txt file,");
      System.out.println("using the given <agent-name> to select rules.  URLs ");
      System.out.println("will be read (one per line) from <url-file>, and tested");
      System.out.println("against the rules.");
      System.exit(-1);
    }

    FileInputStream robotsIn= null;
    LineNumberReader testsIn= null;
    try {
      robotsIn= new FileInputStream(argv[0]);
      testsIn= new LineNumberReader(new FileReader(argv[1]));

      // Exactly the arguments after the two file names; an oversized array
      // would leave a null slot that the constructor cannot handle.
      String[] robotNames= new String[argv.length - 2];
      System.arraycopy(argv, 2, robotNames, 0, robotNames.length);

      ByteArrayOutputStream collected= new ByteArrayOutputStream();
      byte[] buf= new byte[BUFSIZE];
      int rsize= robotsIn.read(buf);
      while (rsize >= 0) {
        collected.write(buf, 0, rsize);
        rsize= robotsIn.read(buf);
      }
      byte[] robotsBytes= collected.toByteArray();

      RobotRulesParser parser= new RobotRulesParser(robotNames);
      RobotRuleSet rules= parser.parseRules(robotsBytes);
      System.out.println("Rules:");
      System.out.println(rules);
      System.out.println();

      // readLine() returns null at end of input, including for an empty file.
      String testPath= testsIn.readLine();
      while (testPath != null) {
        testPath= testPath.trim();
        System.out.println( (rules.isAllowed(testPath) ?
                             "allowed" : "not allowed")
                            + ":\t" + testPath);
        testPath= testsIn.readLine();
      }

    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      if (robotsIn != null) {
        try {
          robotsIn.close();
        } catch (IOException ignored) {
          // nothing useful can be done about a failed close here
        }
      }
      if (testsIn != null) {
        try {
          testsIn.close();
        } catch (IOException ignored) {
          // nothing useful can be done about a failed close here
        }
      }
    }
  }

}