package org.archive.crawler.prefetch;

import java.util.Iterator;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.CredentialStore;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.credential.Credential;
import org.archive.crawler.datamodel.credential.CredentialAvatar;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.net.UURI;

/**
 * Ensures that a {@link CrawlURI}'s fetch preconditions are satisfied
 * before it proceeds down the processing chain.
 *
 * <p>Three kinds of preconditions are checked, in order:
 * <ol>
 * <li>DNS: the URI's host must have a current (non-expired) IP lookup;</li>
 * <li>robots.txt: valid robots information must exist for the server,
 *     and the URI must not be disallowed by it (unless running in
 *     calculate-only mode);</li>
 * <li>credentials: any credential whose prerequisite login URI has not
 *     yet been fetched causes that prerequisite to be scheduled first.</li>
 * </ol>
 * When a precondition is unmet, the required prerequisite URI is scheduled
 * via {@code markPrerequisite} and this URI is deferred.
 */
public class PreconditionEnforcer
        extends Processor
        implements CoreAttributeConstants, FetchStatusCodes {

    private static final long serialVersionUID = 4636474153589079615L;

    private static final Logger logger =
        Logger.getLogger(PreconditionEnforcer.class.getName());

    /** Default minimum validity of a DNS lookup: 6 hours, in seconds. */
    private final static Integer DEFAULT_IP_VALIDITY_DURATION =
        new Integer(60 * 60 * 6);

    /** Default validity of fetched robots.txt info: 24 hours, in seconds. */
    private final static Integer DEFAULT_ROBOTS_VALIDITY_DURATION =
        new Integer(60 * 60 * 24);

    /** Settings attribute: minimum seconds a dns record is considered valid. */
    public final static String ATTR_IP_VALIDITY_DURATION
        = "ip-validity-duration-seconds";

    /** Settings attribute: seconds fetched robots.txt info is considered valid. */
    public final static String ATTR_ROBOTS_VALIDITY_DURATION
        = "robot-validity-duration-seconds";

    /** Default for calculate-robots-only: apply exclusions normally. */
    public final static Boolean DEFAULT_CALCULATE_ROBOTS_ONLY = Boolean.FALSE;

    /** Settings attribute: annotate robots exclusions instead of enforcing them. */
    public final static String ATTR_CALCULATE_ROBOTS_ONLY
        = "calculate-robots-only";

    /**
     * Constructor.
     *
     * @param name name of this processor in the settings hierarchy
     */
    public PreconditionEnforcer(String name) {
        super(name, "Precondition enforcer");

        Type e;

        e = addElementToDefinition(new SimpleType(ATTR_IP_VALIDITY_DURATION,
            "The minimum interval for which a dns-record will be considered " +
            "valid (in seconds). " +
            "If the record's DNS TTL is larger, that will be used instead.",
            DEFAULT_IP_VALIDITY_DURATION));
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_ROBOTS_VALIDITY_DURATION,
            "The time in seconds that fetched robots.txt information is " +
            "considered to be valid. " +
            "If the value is set to '0', then the robots.txt information" +
            " will never expire.",
            DEFAULT_ROBOTS_VALIDITY_DURATION));
        e.setExpertSetting(true);

        // NOTE: fixed typo "exlcuded" -> "excluded" in this description.
        e = addElementToDefinition(new SimpleType(ATTR_CALCULATE_ROBOTS_ONLY,
            "Whether to only calculate the robots status of an URI, " +
            "without actually applying any exclusions found. If true, " +
            "excluded URIs will only be annotated in the crawl.log, but " +
            "still fetched. Default is false. ",
            DEFAULT_CALCULATE_ROBOTS_ONLY));
        e.setExpertSetting(true);
    }

    /**
     * Check each class of precondition in turn; the first unmet one
     * defers the URI and ends processing for this pass.
     *
     * @param curi the URI being processed
     */
    protected void innerProcess(CrawlURI curi) {

        if (considerDnsPreconditions(curi)) {
            return;
        }

        // Robots and credential preconditions apply only to http(s) URIs.
        String scheme = curi.getUURI().getScheme().toLowerCase();
        if (!(scheme.equals("http") || scheme.equals("https"))) {
            logger.fine("PolitenessEnforcer doesn't understand uri's of type " +
                scheme + " (ignoring)");
            return;
        }

        if (considerRobotsPreconditions(curi)) {
            return;
        }

        if (!curi.isPrerequisite() && credentialPrecondition(curi)) {
            return;
        }

        // All preconditions passed; fall through to normal processing.
        return;
    }

    /**
     * Consider the robots-precondition of the passed CrawlURI.
     *
     * @param curi the URI being processed
     * @return true if this curi should be deferred (robots fetch scheduled,
     * URI robots-precluded, or robots prerequisite failed); false if it may
     * proceed
     */
    private boolean considerRobotsPreconditions(CrawlURI curi) {
        // A robots.txt URI is itself the prerequisite; mark it and let it go.
        UURI uuri = curi.getUURI();
        try {
            if (uuri != null && uuri.getPath() != null &&
                    curi.getUURI().getPath().equals("/robots.txt")) {
                curi.setPrerequisite(true);
                return false;
            }
        }
        catch (URIException e) {
            logger.severe("Failed get of path for " + curi);
        }
        // No current robots info: schedule a robots.txt fetch first.
        if (isRobotsExpired(curi)) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("No valid robots for " +
                    getController().getServerCache().getServerFor(curi) +
                    "; deferring " + curi);
            }

            try {
                String prereq = curi.getUURI().resolve("/robots.txt").toString();
                curi.markPrerequisite(prereq,
                    getController().getPostprocessorChain());
            }
            catch (URIException e1) {
                // Resolving "/robots.txt" against a valid URI shouldn't fail.
                logger.severe("Failed resolve using " + curi);
                throw new RuntimeException(e1);
            }
            return true;
        }
        CrawlServer cs = getController().getServerCache().getServerFor(curi);
        if (cs.isValidRobots()) {
            String ua = getController().getOrder().getUserAgent(curi);
            if (cs.getRobots().disallows(curi, ua)) {
                if (((Boolean)getUncheckedAttribute(curi,
                        ATTR_CALCULATE_ROBOTS_ONLY)).booleanValue()) {
                    // Calculate-only mode: record the exclusion but fetch anyway.
                    curi.addAnnotation("robotExcluded");
                    return false;
                }
                curi.setFetchStatus(S_ROBOTS_PRECLUDED);
                curi.putString("error", "robots.txt exclusion");
                logger.fine("robots.txt precluded " + curi);
                return true;
            }
            return false;
        }
        // Robots info exists but is not valid: treat as prerequisite failure.
        curi.skipToProcessorChain(getController().getPostprocessorChain());
        curi.setFetchStatus(S_ROBOTS_PREREQUISITE_FAILURE);
        curi.putString("error", "robots.txt prerequisite failed");
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("robots.txt prerequisite failed " + curi);
        }
        return true;
    }

    /**
     * Consider the DNS-precondition of the passed CrawlURI.
     *
     * @param curi the URI being processed
     * @return true if this curi should be deferred (dns lookup scheduled or
     * host unresolvable); false if it may proceed
     */
    private boolean considerDnsPreconditions(CrawlURI curi) {
        if (curi.getUURI().getScheme().equals("dns")) {
            // A dns: URI is itself a prerequisite; let it proceed.
            curi.setPrerequisite(true);
            return false;
        }

        CrawlServer cs = getController().getServerCache().getServerFor(curi);
        if (cs == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            curi.skipToProcessorChain(getController().getPostprocessorChain());
            return true;
        }

        // A host that has been looked up but has no IP is unresolvable.
        CrawlHost ch = getController().getServerCache().getHostFor(curi);
        if (ch == null || ch.hasBeenLookedUp() && ch.getIP() == null) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("no dns for " + ch +
                    " cancelling processing for CrawlURI " + curi.toString());
            }
            curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
            curi.skipToProcessorChain(getController().getPostprocessorChain());
            return true;
        }

        // Lookup missing or stale: schedule a dns: prerequisite.
        // (The scheme re-check is redundant after the early return above,
        // but is kept to preserve original behavior exactly.)
        if (isIpExpired(curi) && !curi.getUURI().getScheme().equals("dns")) {
            logger.fine("Deferring processing of CrawlURI " + curi.toString()
                + " for dns lookup.");
            String preq = "dns:" + ch.getHostName();
            try {
                curi.markPrerequisite(preq,
                    getController().getPostprocessorChain());
            } catch (URIException e) {
                // "dns:" + hostname should always form a valid URI.
                throw new RuntimeException(e);
            }
            return true;
        }

        return false;
    }

    /**
     * Get the configured minimum IP validity duration for this URI.
     *
     * @param curi the URI whose settings context is consulted
     * @return the value of {@link #ATTR_IP_VALIDITY_DURATION} in seconds,
     * or the default if the attribute is not found
     */
    public long getIPValidityDuration(CrawlURI curi) {
        Integer d;
        try {
            d = (Integer)getAttribute(ATTR_IP_VALIDITY_DURATION, curi);
        } catch (AttributeNotFoundException e) {
            d = DEFAULT_IP_VALIDITY_DURATION;
        }

        return d.longValue();
    }

    /**
     * Test whether the host's IP lookup is missing or expired.
     *
     * @param curi the URI whose host is checked
     * @return true if a (re-)lookup is needed
     */
    public boolean isIpExpired(CrawlURI curi) {
        CrawlHost host = getController().getServerCache().getHostFor(curi);
        if (!host.hasBeenLookedUp()) {
            // Never looked up: always expired.
            return true;
        }

        if (host.getIpTTL() == CrawlHost.IP_NEVER_EXPIRES) {
            return false;
        }

        long duration = getIPValidityDuration(curi);
        if (duration == 0) {
            // Configured zero means "never expires".
            return false;
        }

        if (duration <= 0) {
            // Negative (invalid) setting: fall back to the default.
            duration = DEFAULT_IP_VALIDITY_DURATION.intValue();
        }

        // Honor a DNS TTL larger than the configured minimum.
        long ttl = host.getIpTTL();
        if (ttl > duration) {
            duration = ttl;
        }

        // Convert seconds to milliseconds for comparison with wall-clock time.
        if (duration > 0) {
            duration *= 1000;
        }

        return (duration + host.getIpFetched()) < System.currentTimeMillis();
    }

    /**
     * Get the configured robots.txt validity duration for this URI.
     *
     * @param curi the URI whose settings context is consulted
     * @return the value of {@link #ATTR_ROBOTS_VALIDITY_DURATION} converted
     * to milliseconds, or the default if the attribute is not found
     */
    public long getRobotsValidityDuration(CrawlURI curi) {
        Integer d;
        try {
            d = (Integer) getAttribute(ATTR_ROBOTS_VALIDITY_DURATION, curi);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getLocalizedMessage());
            d = DEFAULT_ROBOTS_VALIDITY_DURATION;
        }
        // Setting is in seconds; callers compare against millisecond clocks.
        return d.longValue() * 1000;
    }

    /**
     * Test whether the server's robots.txt info is missing or expired.
     *
     * @param curi the URI whose server is checked
     * @return true if robots info must be (re-)fetched
     */
    public boolean isRobotsExpired(CrawlURI curi) {
        CrawlServer server =
            getController().getServerCache().getServerFor(curi);
        long robotsFetched = server.getRobotsFetchedTime();
        if (robotsFetched == CrawlServer.ROBOTS_NOT_FETCHED) {
            // Never fetched: always expired.
            return true;
        }
        long duration = getRobotsValidityDuration(curi);
        if (duration == 0) {
            // Configured zero means "never expires".
            return false;
        }
        if (robotsFetched + duration < System.currentTimeMillis()) {
            return true;
        }
        return false;
    }

    /**
     * Consider credential preconditions for this URI.
     *
     * <p>If the URI is itself a credential prerequisite, the credential is
     * attached and it proceeds. Otherwise, for each credential whose root-URI
     * matches and that has a prerequisite not yet authenticated, the
     * prerequisite (login) URI is scheduled and this URI is deferred.
     *
     * @param curi the URI being processed
     * @return true if this curi should be deferred pending a credential
     * prerequisite fetch
     */
    private boolean credentialPrecondition(final CrawlURI curi) {

        boolean result = false;

        CredentialStore cs =
            CredentialStore.getCredentialStore(getSettingsHandler());
        if (cs == null) {
            logger.severe("No credential store for " + curi);
            return result;
        }

        Iterator i = cs.iterator(curi);
        if (i == null) {
            return result;
        }

        while (i.hasNext()) {
            Credential c = (Credential)i.next();

            if (c.isPrerequisite(curi)) {
                // This URI *is* the credential's prerequisite (e.g. a login
                // page): attach the credential and let the URI proceed.
                c.attach(curi);
                curi.setPost(c.isPost(curi));
                break;
            }

            if (!c.rootUriMatch(getController(), curi)) {
                continue;
            }

            if (!c.hasPrerequisite(curi)) {
                continue;
            }

            if (!authenticated(c, curi)) {
                // Not yet authenticated: schedule the prerequisite fetch.
                String prereq = c.getPrerequisite(curi);
                if (prereq == null || prereq.length() <= 0) {
                    CrawlServer server =
                        getController().getServerCache().getServerFor(curi);
                    logger.severe(server.getName() + " has "
                        + " credential(s) of type " + c + " but prereq"
                        + " is null.");
                } else {
                    try {
                        curi.markPrerequisite(prereq,
                            getController().getPostprocessorChain());
                    } catch (URIException e) {
                        logger.severe("unable to set credentials prerequisite "
                            + prereq);
                        getController().logUriError(e, curi.getUURI(), prereq);
                        return false;
                    }
                    result = true;
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Queueing prereq " + prereq + " of type " +
                            c + " for " + curi);
                    }
                    break;
                }
            }
        }
        return result;
    }

    /**
     * Test whether the server already carries an avatar matching the
     * given credential (i.e. authentication has already happened).
     *
     * @param credential the credential to test
     * @param curi the URI whose server is checked
     * @return true if a matching credential avatar is present
     */
    private boolean authenticated(final Credential credential,
            final CrawlURI curi) {
        boolean result = false;
        CrawlServer server =
            getController().getServerCache().getServerFor(curi);
        if (!server.hasCredentialAvatars()) {
            return result;
        }
        Set avatars = server.getCredentialAvatars();
        for (Iterator i = avatars.iterator(); i.hasNext();) {
            CredentialAvatar ca = (CredentialAvatar)i.next();
            String key = null;
            try {
                key = credential.getKey(curi);
            } catch (AttributeNotFoundException e) {
                logger.severe("Failed getting key for " + credential +
                    " for " + curi);
                continue;
            }
            if (ca.match(credential.getClass(), key)) {
                result = true;
            }
        }
        return result;
    }
}