// Source: org.archive.crawler.prefetch.PreconditionEnforcer (Heritrix web crawler).
/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * SimplePolitenessEnforcer.java
 * Created on May 22, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/prefetch/PreconditionEnforcer.java,v 1.25.2.1 2007/01/13 01:31:24 stack-sf Exp $
 */

package org.archive.crawler.prefetch;

import java.util.Iterator;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.CredentialStore;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.credential.Credential;
import org.archive.crawler.datamodel.credential.CredentialAvatar;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.net.UURI;
/**
 * Ensures the preconditions for a fetch -- such as DNS lookup
 * or acquiring and respecting a robots.txt policy -- are
 * satisfied before a URI is passed to subsequent stages.
 *
 * @author gojomo
 */

54 public class PreconditionEnforcer
55         extends Processor
56         implements CoreAttributeConstants, FetchStatusCodes {
57
58     private static final long serialVersionUID = 4636474153589079615L;
59
60     private static final Logger JavaDoc logger =
61         Logger.getLogger(PreconditionEnforcer.class.getName());
62
63     private final static Integer JavaDoc DEFAULT_IP_VALIDITY_DURATION =
64         new Integer JavaDoc(60*60*6); // six hours
65
private final static Integer JavaDoc DEFAULT_ROBOTS_VALIDITY_DURATION =
66         new Integer JavaDoc(60*60*24); // one day
67

68     /** seconds to keep IP information for */
69     public final static String JavaDoc ATTR_IP_VALIDITY_DURATION
70         = "ip-validity-duration-seconds";
71     /** seconds to cache robots info */
72     public final static String JavaDoc ATTR_ROBOTS_VALIDITY_DURATION
73         = "robot-validity-duration-seconds";
74
75     /** whether to calculate robots exclusion without applying */
76     public final static Boolean JavaDoc DEFAULT_CALCULATE_ROBOTS_ONLY = Boolean.FALSE;
77     public final static String JavaDoc ATTR_CALCULATE_ROBOTS_ONLY
78         = "calculate-robots-only";
79     
80     public PreconditionEnforcer(String JavaDoc name) {
81         super(name, "Precondition enforcer");
82
83         Type e;
84
85         e = addElementToDefinition(new SimpleType(ATTR_IP_VALIDITY_DURATION,
86                 "The minimum interval for which a dns-record will be considered " +
87                 "valid (in seconds). " +
88                 "If the record's DNS TTL is larger, that will be used instead.",
89                 DEFAULT_IP_VALIDITY_DURATION));
90         e.setExpertSetting(true);
91
92         e = addElementToDefinition(new SimpleType(ATTR_ROBOTS_VALIDITY_DURATION,
93                 "The time in seconds that fetched robots.txt information is " +
94                 "considered to be valid. " +
95                 "If the value is set to '0', then the robots.txt information" +
96                 " will never expire.",
97                 DEFAULT_ROBOTS_VALIDITY_DURATION));
98         e.setExpertSetting(true);
99         
100         e = addElementToDefinition(new SimpleType(ATTR_CALCULATE_ROBOTS_ONLY,
101                 "Whether to only calculate the robots status of an URI, " +
102                 "without actually applying any exclusions found. If true, " +
103                 "exlcuded URIs will only be annotated in the crawl.log, but " +
104                 "still fetched. Default is false. ",
105                 DEFAULT_CALCULATE_ROBOTS_ONLY));
106         e.setExpertSetting(true);
107     }
108
109     protected void innerProcess(CrawlURI curi) {
110
111         if (considerDnsPreconditions(curi)) {
112             return;
113         }
114
115         // make sure we only process schemes we understand (i.e. not dns)
116
String JavaDoc scheme = curi.getUURI().getScheme().toLowerCase();
117         if (! (scheme.equals("http") || scheme.equals("https"))) {
118             logger.fine("PolitenessEnforcer doesn't understand uri's of type " +
119                 scheme + " (ignoring)");
120             return;
121         }
122
123         if (considerRobotsPreconditions(curi)) {
124             return;
125         }
126
127         if (!curi.isPrerequisite() && credentialPrecondition(curi)) {
128             return;
129         }
130
131         // OK, it's allowed
132

133         // For all curis that will in fact be fetched, set appropriate delays.
134
// TODO: SOMEDAY: allow per-host, per-protocol, etc. factors
135
// curi.setDelayFactor(getDelayFactorFor(curi));
136
// curi.setMinimumDelay(getMinimumDelayFor(curi));
137

138         return;
139     }
140
141     /**
142      * Consider the robots precondition.
143      *
144      * @param curi CrawlURI we're checking for any required preconditions.
145      * @return True, if this <code>curi</code> has a precondition or processing
146      * should be terminated for some other reason. False if
147      * we can precede to process this url.
148      */

149     private boolean considerRobotsPreconditions(CrawlURI curi) {
150         // treat /robots.txt fetches specially
151
UURI uuri = curi.getUURI();
152         try {
153             if (uuri != null && uuri.getPath() != null &&
154                     curi.getUURI().getPath().equals("/robots.txt")) {
155                 // allow processing to continue
156
curi.setPrerequisite(true);
157                 return false;
158             }
159         }
160         catch (URIException e) {
161             logger.severe("Failed get of path for " + curi);
162         }
163         // require /robots.txt if not present
164
if (isRobotsExpired(curi)) {
165             // Need to get robots
166
if (logger.isLoggable(Level.FINE)) {
167                 logger.fine( "No valid robots for " +
168                     getController().getServerCache().getServerFor(curi) +
169                     "; deferring " + curi);
170             }
171
172             // Robots expired - should be refetched even though its already
173
// crawled.
174
try {
175                 String JavaDoc prereq = curi.getUURI().resolve("/robots.txt").toString();
176                 curi.markPrerequisite(prereq,
177                     getController().getPostprocessorChain());
178             }
179             catch (URIException e1) {
180                 logger.severe("Failed resolve using " + curi);
181                 throw new RuntimeException JavaDoc(e1); // shouldn't ever happen
182
}
183             return true;
184         }
185         // test against robots.txt if available
186
CrawlServer cs = getController().getServerCache().getServerFor(curi);
187         if(cs.isValidRobots()){
188             String JavaDoc ua = getController().getOrder().getUserAgent(curi);
189             if(cs.getRobots().disallows(curi, ua)) {
190                 if(((Boolean JavaDoc)getUncheckedAttribute(curi,ATTR_CALCULATE_ROBOTS_ONLY)).booleanValue() == true) {
191                     // annotate URI as excluded, but continue to process normally
192
curi.addAnnotation("robotExcluded");
193                     return false;
194                 }
195                 // mark as precluded; in FetchHTTP, this will
196
// prevent fetching and cause a skip to the end
197
// of processing (unless an intervening processor
198
// overrules)
199
curi.setFetchStatus(S_ROBOTS_PRECLUDED);
200                 curi.putString("error","robots.txt exclusion");
201                 logger.fine("robots.txt precluded " + curi);
202                 return true;
203             }
204             return false;
205         }
206         // No valid robots found => Attempt to get robots.txt failed
207
curi.skipToProcessorChain(getController().getPostprocessorChain());
208         curi.setFetchStatus(S_ROBOTS_PREREQUISITE_FAILURE);
209         curi.putString("error","robots.txt prerequisite failed");
210         if (logger.isLoggable(Level.FINE)) {
211             logger.fine("robots.txt prerequisite failed " + curi);
212         }
213         return true;
214     }
215
216     /**
217      * @param curi CrawlURI whose dns prerequisite we're to check.
218      * @return true if no further processing in this module should occur
219      */

220     private boolean considerDnsPreconditions(CrawlURI curi) {
221         if(curi.getUURI().getScheme().equals("dns")){
222             // DNS URIs never have a DNS precondition
223
curi.setPrerequisite(true);
224             return false;
225         }
226         
227         CrawlServer cs = getController().getServerCache().getServerFor(curi);
228         if(cs == null) {
229             curi.setFetchStatus(S_UNFETCHABLE_URI);
230             curi.skipToProcessorChain(getController().getPostprocessorChain());
231             return true;
232         }
233
234         // If we've done a dns lookup and it didn't resolve a host
235
// cancel further fetch-processing of this URI, because
236
// the domain is unresolvable
237
CrawlHost ch = getController().getServerCache().getHostFor(curi);
238         if (ch == null || ch.hasBeenLookedUp() && ch.getIP() == null) {
239             if (logger.isLoggable(Level.FINE)) {
240                 logger.fine( "no dns for " + ch +
241                     " cancelling processing for CrawlURI " + curi.toString());
242             }
243             curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
244             curi.skipToProcessorChain(getController().getPostprocessorChain());
245             return true;
246         }
247
248         // If we haven't done a dns lookup and this isn't a dns uri
249
// shoot that off and defer further processing
250
if (isIpExpired(curi) && !curi.getUURI().getScheme().equals("dns")) {
251             logger.fine("Deferring processing of CrawlURI " + curi.toString()
252                 + " for dns lookup.");
253             String JavaDoc preq = "dns:" + ch.getHostName();
254             try {
255                 curi.markPrerequisite(preq,
256                     getController().getPostprocessorChain());
257             } catch (URIException e) {
258                 throw new RuntimeException JavaDoc(e); // shouldn't ever happen
259
}
260             return true;
261         }
262         
263         // DNS preconditions OK
264
return false;
265     }
266
267     /**
268      * Get the maximum time a dns-record is valid.
269      *
270      * @param curi the uri this time is valid for.
271      * @return the maximum time a dns-record is valid -- in seconds -- or
272      * negative if record's ttl should be used.
273      */

274     public long getIPValidityDuration(CrawlURI curi) {
275         Integer JavaDoc d;
276         try {
277             d = (Integer JavaDoc)getAttribute(ATTR_IP_VALIDITY_DURATION, curi);
278         } catch (AttributeNotFoundException JavaDoc e) {
279             d = DEFAULT_IP_VALIDITY_DURATION;
280         }
281
282         return d.longValue();
283     }
284
285     /** Return true if ip should be looked up.
286      *
287      * @param curi the URI to check.
288      * @return true if ip should be looked up.
289      */

290     public boolean isIpExpired(CrawlURI curi) {
291         CrawlHost host = getController().getServerCache().getHostFor(curi);
292         if (!host.hasBeenLookedUp()) {
293             // IP has not been looked up yet.
294
return true;
295         }
296
297         if (host.getIpTTL() == CrawlHost.IP_NEVER_EXPIRES) {
298             // IP never expires (numeric IP)
299
return false;
300         }
301
302         long duration = getIPValidityDuration(curi);
303         if (duration == 0) {
304             // Never expire ip if duration is null (set by user or more likely,
305
// set to zero in case where we tried in FetchDNS but failed).
306
return false;
307         }
308
309         // catch old "default" -1 settings that are now problematic,
310
// convert to new minimum
311
if (duration <= 0) {
312             duration = DEFAULT_IP_VALIDITY_DURATION.intValue();
313         }
314         
315         long ttl = host.getIpTTL();
316         if (ttl > duration) {
317             // Use the larger of the operator-set minimum duration
318
// or the DNS record TTL
319
duration = ttl;
320         }
321
322         // Duration and ttl are in seconds. Convert to millis.
323
if (duration > 0) {
324             duration *= 1000;
325         }
326
327         return (duration + host.getIpFetched()) < System.currentTimeMillis();
328     }
329
330     /** Get the maximum time a robots.txt is valid.
331      *
332      * @param curi
333      * @return the time a robots.txt is valid in milliseconds.
334      */

335     public long getRobotsValidityDuration(CrawlURI curi) {
336         Integer JavaDoc d;
337         try {
338             d = (Integer JavaDoc) getAttribute(ATTR_ROBOTS_VALIDITY_DURATION, curi);
339         } catch (AttributeNotFoundException JavaDoc e) {
340             // This should never happen, but if it does, return default
341
logger.severe(e.getLocalizedMessage());
342             d = DEFAULT_ROBOTS_VALIDITY_DURATION;
343         }
344         // convert from seconds to milliseconds
345
return d.longValue() * 1000;
346     }
347
348     /**
349      * Is the robots policy expired.
350      *
351      * This method will also return true if we haven't tried to get the
352      * robots.txt for this server.
353      *
354      * @param curi
355      * @return true if the robots policy is expired.
356      */

357     public boolean isRobotsExpired(CrawlURI curi) {
358         CrawlServer server =
359             getController().getServerCache().getServerFor(curi);
360         long robotsFetched = server.getRobotsFetchedTime();
361         if (robotsFetched == CrawlServer.ROBOTS_NOT_FETCHED) {
362             // Have not attempted to fetch robots
363
return true;
364         }
365         long duration = getRobotsValidityDuration(curi);
366         if (duration == 0) {
367             // When zero, robots should be valid forever
368
return false;
369         }
370         if (robotsFetched + duration < System.currentTimeMillis()) {
371             // Robots is still valid
372
return true;
373         }
374         return false;
375     }
376
377    /**
378     * Consider credential preconditions.
379     *
380     * Looks to see if any credential preconditions (e.g. html form login
381     * credentials) for this <code>CrawlServer</code>. If there are, have they
382     * been run already? If not, make the running of these logins a precondition
383     * of accessing any other url on this <code>CrawlServer</code>.
384     *
385     * <p>
386     * One day, do optimization and avoid running the bulk of the code below.
387     * Argument for running the code everytime is that overrides and refinements
388     * may change what comes back from credential store.
389     *
390     * @param curi CrawlURI we're checking for any required preconditions.
391     * @return True, if this <code>curi</code> has a precondition that needs to
392     * be met before we can proceed. False if we can precede to process
393     * this url.
394     */

395     private boolean credentialPrecondition(final CrawlURI curi) {
396
397         boolean result = false;
398
399         CredentialStore cs =
400             CredentialStore.getCredentialStore(getSettingsHandler());
401         if (cs == null) {
402             logger.severe("No credential store for " + curi);
403             return result;
404         }
405
406         Iterator JavaDoc i = cs.iterator(curi);
407         if (i == null) {
408             return result;
409         }
410
411         while (i.hasNext()) {
412             Credential c = (Credential)i.next();
413
414             if (c.isPrerequisite(curi)) {
415                 // This credential has a prereq. and this curi is it. Let it
416
// through. Add its avatar to the curi as a mark. Also, does
417
// this curi need to be posted? Note, we do this test for
418
// is it a prereq BEFORE we do the check that curi is of the
419
// credential domain because such as yahoo have you go to
420
// another domain altogether to login.
421
c.attach(curi);
422                 curi.setPost(c.isPost(curi));
423                 break;
424             }
425
426             if (!c.rootUriMatch(getController(), curi)) {
427                 continue;
428             }
429
430             if (!c.hasPrerequisite(curi)) {
431                 continue;
432             }
433
434             if (!authenticated(c, curi)) {
435                 // Han't been authenticated. Queue it and move on (Assumption
436
// is that we can do one authentication at a time -- usually one
437
// html form).
438
String JavaDoc prereq = c.getPrerequisite(curi);
439                 if (prereq == null || prereq.length() <= 0) {
440                     CrawlServer server =
441                         getController().getServerCache().getServerFor(curi);
442                     logger.severe(server.getName() + " has "
443                         + " credential(s) of type " + c + " but prereq"
444                         + " is null.");
445                 } else {
446                     try {
447                         curi.markPrerequisite(prereq,
448                             getController().getPostprocessorChain());
449                     } catch (URIException e) {
450                         logger.severe("unable to set credentials prerequisite "+prereq);
451                         getController().logUriError(e,curi.getUURI(),prereq);
452                         return false;
453                     }
454                     result = true;
455                     if (logger.isLoggable(Level.FINE)) {
456                         logger.fine("Queueing prereq " + prereq + " of type " +
457                             c + " for " + curi);
458                     }
459                     break;
460                 }
461             }
462         }
463         return result;
464     }
465
466     /**
467      * Has passed credential already been authenticated.
468      *
469      * @param credential Credential to test.
470      * @param curi CrawlURI.
471      * @return True if already run.
472      */

473     private boolean authenticated(final Credential credential,
474             final CrawlURI curi) {
475         boolean result = false;
476         CrawlServer server =
477             getController().getServerCache().getServerFor(curi);
478         if (!server.hasCredentialAvatars()) {
479             return result;
480         }
481         Set JavaDoc avatars = server.getCredentialAvatars();
482         for (Iterator JavaDoc i = avatars.iterator(); i.hasNext();) {
483             CredentialAvatar ca = (CredentialAvatar)i.next();
484             String JavaDoc key = null;
485             try {
486                 key = credential.getKey(curi);
487             } catch (AttributeNotFoundException JavaDoc e) {
488                 logger.severe("Failed getting key for " + credential +
489                     " for " + curi);
490                 continue;
491             }
492             if (ca.match(credential.getClass(), key)) {
493                 result = true;
494             }
495         }
496         return result;
497     }
498 }
// End of PreconditionEnforcer.java.