

1 /* AdaptiveRevisitFrontier.java
2 *
3 * Created on Sep 13, 2004
4 *
5 * Copyright (C) 2004 Kristinn Sigurðsson.
6 *
7 * This file is part of the Heritrix web crawler (crawler.archive.org).
8 *
9 * Heritrix is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser Public License as published by
11 * the Free Software Foundation; either version 2.1 of the License, or
12 * any later version.
13 *
14 * Heritrix is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser Public License
20 * along with Heritrix; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */

23 package org.archive.crawler.frontier;
24
25 import java.io.File;
26 import java.io.IOException;
27 import java.io.PrintWriter;
28 import java.io.Serializable;
29 import java.io.StringWriter;
30 import java.io.Writer;
31 import java.util.ArrayList;
32 import java.util.Date;
33 import java.util.Iterator;
34 import java.util.List;
35 import java.util.logging.Level;
36 import java.util.logging.Logger;
37
38 import javax.management.AttributeNotFoundException;
39
40 import org.apache.commons.httpclient.HttpStatus;
41 import org.archive.crawler.datamodel.CandidateURI;
42 import org.archive.crawler.datamodel.CoreAttributeConstants;
43 import org.archive.crawler.datamodel.CrawlServer;
44 import org.archive.crawler.datamodel.CrawlURI;
45 import org.archive.crawler.datamodel.FetchStatusCodes;
46 import org.archive.crawler.datamodel.UriUniqFilter;
47 import org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver;
48 import org.archive.crawler.event.CrawlStatusListener;
49 import org.archive.crawler.framework.CrawlController;
50 import org.archive.crawler.framework.Frontier;
51 import org.archive.crawler.framework.FrontierMarker;
52 import org.archive.crawler.framework.exceptions.EndedException;
53 import org.archive.crawler.framework.exceptions.FatalConfigurationException;
54 import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
55 import org.archive.crawler.settings.ModuleType;
56 import org.archive.crawler.settings.RegularExpressionConstraint;
57 import org.archive.crawler.settings.SimpleType;
58 import org.archive.crawler.settings.Type;
59 import org.archive.crawler.url.Canonicalizer;
60 import org.archive.crawler.util.BdbUriUniqFilter;
61 import org.archive.net.UURI;
62 import org.archive.queue.MemQueue;
63 import org.archive.queue.Queue;
64 import org.archive.util.ArchiveUtils;
65
66
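/* Editorial note (a summary inferred from the code below, not part of the
 * original source): this Frontier does not itself compute how long to wait
 * before revisiting a URI. A separate WaitEvaluator-style processor is
 * expected to store the chosen interval on the CrawlURI under
 * A_WAIT_INTERVAL; successDisposition() then adds that interval to the
 * completion time and records it as A_TIME_OF_NEXT_PROCESSING, which next()
 * later uses to report how overdue a URI is when it is finally issued. */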
67 /**
68  * A Frontier that will repeatedly visit all encountered URIs.
69  * <p>
70  * Wait time between visits is configurable and varies based on observed
71  * changes of documents.
72  * <p>
73  * The Frontier borrows many things from HostQueuesFrontier, but implements
74  * an entirely different strategy in issuing URIs and consequently in keeping a
75  * record of discovered URIs.
76  *
77  * @author Kristinn Sigurdsson
78  */

79 public class AdaptiveRevisitFrontier extends ModuleType
80 implements Frontier, FetchStatusCodes, CoreAttributeConstants,
81         AdaptiveRevisitAttributeConstants, CrawlStatusListener, HasUriReceiver {
82
83     private static final long serialVersionUID = -8666872690438543671L;
84
85     private static final Logger JavaDoc logger =
86         Logger.getLogger(AdaptiveRevisitFrontier.class.getName());
87
88     /** How many multiples of last fetch elapsed time to wait before recontacting
89      * same server */

90     public final static String JavaDoc ATTR_DELAY_FACTOR = "delay-factor";
91     private final static Float JavaDoc DEFAULT_DELAY_FACTOR = new Float JavaDoc(5);
92     
93     /** Always wait this long after one completion before recontacting
94      * same server, regardless of multiple */

95     public final static String JavaDoc ATTR_MIN_DELAY = "min-delay-ms";
96
97     // 2 seconds
98     private final static Integer DEFAULT_MIN_DELAY = new Integer(2000);
99     
100     /** Never wait more than this long, regardless of multiple */
101     public final static String JavaDoc ATTR_MAX_DELAY = "max-delay-ms";
102     
103     // 30 seconds
104     private final static Integer DEFAULT_MAX_DELAY = new Integer(30000);
105     
106     /** Maximum times to emit a CrawlURI without final disposition */
107     public final static String JavaDoc ATTR_MAX_RETRIES = "max-retries";
108     private final static Integer JavaDoc DEFAULT_MAX_RETRIES = new Integer JavaDoc(30);
109
110     /** For retryable problems, seconds to wait before a retry */
111     public final static String JavaDoc ATTR_RETRY_DELAY = "retry-delay-seconds";
112     
113     // 15 minutes
114     private final static Long DEFAULT_RETRY_DELAY = new Long(900);
115     
116     /** Maximum simultaneous requests in process to a host (queue) */
117     public final static String JavaDoc ATTR_HOST_VALENCE = "host-valence";
118     private final static Integer JavaDoc DEFAULT_HOST_VALENCE = new Integer JavaDoc(1);
119
120     /** Number of hops of embeds (ERX) to bump to front of host queue */
121     public final static String JavaDoc ATTR_PREFERENCE_EMBED_HOPS =
122         "preference-embed-hops";
123     private final static Integer JavaDoc DEFAULT_PREFERENCE_EMBED_HOPS = new Integer JavaDoc(0);
124     
125     /** Queue assignment to force on CrawlURIs. Intended to be used
126      * via overrides*/

127     public final static String JavaDoc ATTR_FORCE_QUEUE = "force-queue-assignment";
128     protected final static String JavaDoc DEFAULT_FORCE_QUEUE = "";
129     /** Acceptable characters in forced queue names.
130      * Word chars, dash, period, comma, colon */

131     protected final static String JavaDoc ACCEPTABLE_FORCE_QUEUE = "[-\\w\\.,:]*";
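    // Illustrative note (added, not in the original source): the pattern above
    // accepts queue names made of word characters, dashes, periods, commas and
    // colons -- e.g. "example.com" or "blogspot-group:1" -- while anything
    // containing spaces, slashes or other punctuation fails the
    // RegularExpressionConstraint registered on ATTR_FORCE_QUEUE in the
    // constructor below.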
132
133     /** Should the queue assignment ignore www in hostnames, effectively
134      * stripping them away.
135      */

136     public final static String JavaDoc ATTR_QUEUE_IGNORE_WWW = "queue-ignore-www";
137     protected final static Boolean JavaDoc DEFAULT_QUEUE_IGNORE_WWW = new Boolean JavaDoc(false);
138     
139     /** Should the Frontier use a separate 'already included' datastructure
140      * or rely on the queues'.
141      */

142     public final static String JavaDoc ATTR_USE_URI_UNIQ_FILTER = "use-uri-uniq-filter";
143     protected final static Boolean JavaDoc DEFAULT_USE_URI_UNIQ_FILTER = new Boolean JavaDoc(false);
144     
145     private CrawlController controller;
146     
147     private AdaptiveRevisitQueueList hostQueues;
148     
149     private UriUniqFilter alreadyIncluded;
150
151     private ThreadLocalQueue threadWaiting = new ThreadLocalQueue();
152
153     /** Policy for assigning CrawlURIs to named queues */
154     private QueueAssignmentPolicy queueAssignmentPolicy = null;
155     
156     // top-level stats
157     private long succeededFetchCount = 0;
158     private long failedFetchCount = 0;
159     // URIs that are disregarded (for example because of robots.txt rules)
160     private long disregardedUriCount = 0;
161
162     private long totalProcessedBytes = 0;
163     
164     // Flags indicating operator-specified crawl pause/end
165     private boolean shouldPause = false;
166     private boolean shouldTerminate = false;
167     
168
169     public AdaptiveRevisitFrontier(String JavaDoc name) {
170         this(name, "AdaptiveRevisitFrontier. EXPERIMENTAL Frontier that " +
171                 "will repeatedly visit all " +
172                 "encountered URIs. Wait time between visits is configurable" +
173                 " and is determined by separate Processor(s). See " +
174                 "WaitEvaluators. " +
175                 "See documentation for ARFrontier limitations.");
176     }
177
178     public AdaptiveRevisitFrontier(String JavaDoc name, String JavaDoc description) {
179         super(Frontier.ATTR_NAME, description);
180         addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
181                 "How many multiples of last fetch elapsed time to wait before " +
182                 "recontacting same server", DEFAULT_DELAY_FACTOR));
183             addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,
184                 "Never wait more than this long, regardless of multiple",
185                 DEFAULT_MAX_DELAY));
186             addElementToDefinition(new SimpleType(ATTR_MIN_DELAY,
187                 "Always wait this long after one completion before recontacting " +
188                 "same server, regardless of multiple", DEFAULT_MIN_DELAY));
189              addElementToDefinition(new SimpleType(ATTR_MAX_RETRIES,
190                 "How often to retry fetching a URI that failed to be retrieved.\n" +
191                 "If zero, the crawler will get the robots.txt only.",
192                 DEFAULT_MAX_RETRIES));
193             addElementToDefinition(new SimpleType(ATTR_RETRY_DELAY,
194                     "How long to wait by default until we retry fetching a" +
195                     " URI that failed to be retrieved (seconds). ",
196                     DEFAULT_RETRY_DELAY));
197             addElementToDefinition(new SimpleType(ATTR_PREFERENCE_EMBED_HOPS,
198                     "Number of embedded (or redirected) hops up to which " +
199                     "a URI has higher priority scheduling. For example, if set " +
200                     "to 1, items such as inline images (1-hop " +
201                     "embedded resources) will be scheduled ahead of all regular " +
202                     "links (or many-hop resources, like nested frames). If set to " +
203                     "zero, no preferencing will occur, and embeds/redirects are " +
204                     "scheduled the same as regular links.",
205                     DEFAULT_PREFERENCE_EMBED_HOPS));
206             Type t;
207             t = addElementToDefinition(new SimpleType(ATTR_HOST_VALENCE,
208                     "Maximum number of simultaneous requests to a single" +
209                     " host.",
210                     DEFAULT_HOST_VALENCE));
211             t.setExpertSetting(true);
212             t = addElementToDefinition(new SimpleType(ATTR_QUEUE_IGNORE_WWW,
213                     "If true then documents from x.com, www.x.com and any " +
214                     "www[0-9]+.x.com will be assigned to the same queue.",
215                     DEFAULT_QUEUE_IGNORE_WWW));
216             t.setExpertSetting(true);
217             t = addElementToDefinition(new SimpleType(
218                     ATTR_FORCE_QUEUE,
219                     "The queue name into which to force URIs. Should "
220                     + "be left blank at global level. Specify a "
221                     + "per-domain/per-host override to force URIs into "
222                     + "a particular named queue, regardless of the assignment "
223                     + "policy in effect (domain or ip-based politeness). "
224                     + "This could be used on domains known to all be from "
225                     + "the same small set of IPs (eg blogspot, dailykos, etc.) "
226                     + "to simulate IP-based politeness, or it could be used if "
227                     + "you wanted to enforce politeness over a whole domain, even "
228                     + "though the subdomains are split across many IPs.",
229                     DEFAULT_FORCE_QUEUE));
230             t.setOverrideable(true);
231             t.setExpertSetting(true);
232             t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE,
233                     Level.WARNING, "This field must contain only alphanumeric "
234                     + "characters plus period, dash, comma, colon, or underscore."));
235             t = addElementToDefinition(new SimpleType(ATTR_USE_URI_UNIQ_FILTER,
236                     "If true then the Frontier will use a separate " +
237                     "datastructure to detect and eliminate duplicates.\n" +
238                     "This is required for Canonicalization rules to work.",
239                     DEFAULT_USE_URI_UNIQ_FILTER));
240             t.setExpertSetting(true);
241             t.setOverrideable(false);
242
243         // Register persistent CrawlURI items
244         CrawlURI.addAlistPersistentMember(A_CONTENT_STATE_KEY);
245         CrawlURI.addAlistPersistentMember(A_TIME_OF_NEXT_PROCESSING);
246     }
247
248     public synchronized void initialize(CrawlController c)
249             throws FatalConfigurationException, IOException JavaDoc {
250         controller = c;
251         controller.addCrawlStatusListener(this);
252
253         queueAssignmentPolicy = new HostnameQueueAssignmentPolicy();
254         
255         hostQueues = new AdaptiveRevisitQueueList(c.getBdbEnvironment(),
256             c.getClassCatalog());
257         
258         if(((Boolean JavaDoc)getUncheckedAttribute(
259                 null,ATTR_USE_URI_UNIQ_FILTER)).booleanValue()){
260             alreadyIncluded = createAlreadyIncluded();
261         } else {
262             alreadyIncluded = null;
263         }
264         
265         loadSeeds();
266     }
267
268     /**
269      * Create a UriUniqFilter that will serve as record
270      * of already seen URIs.
271      *
272      * @return A UriUniqFilter that will serve as a record of already seen URIs
273      * @throws IOException
274      */

275     protected UriUniqFilter createAlreadyIncluded() throws IOException JavaDoc {
276         UriUniqFilter uuf = new BdbUriUniqFilter(
277                 this.controller.getBdbEnvironment());
278         uuf.setDestination(this);
279         return uuf;
280     }
281     
282     /**
283      * Loads the seeds
284      * <p>
285      * This method is called by initialize() and kickUpdate()
286      */

287     public void loadSeeds() {
288         Writer JavaDoc ignoredWriter = new StringWriter JavaDoc();
289         // Get the seeds to refresh.
290         Iterator iter = this.controller.getScope().seedsIterator(ignoredWriter);
291         while (iter.hasNext()) {
292             CandidateURI caUri =
293                 CandidateURI.createSeedCandidateURI((UURI)iter.next());
294             caUri.setSchedulingDirective(CandidateURI.MEDIUM);
295             schedule(caUri);
296         }
297         batchFlush();
298         // save ignored items (if any) where they can be consulted later
299         AbstractFrontier.saveIgnoredItems(
300                 ignoredWriter.toString(),
301                 controller.getDisk());
302     }
303     
304     public String JavaDoc getClassKey(CandidateURI cauri) {
305         String JavaDoc queueKey = (String JavaDoc)getUncheckedAttribute(cauri,
306                 ATTR_FORCE_QUEUE);
307             if ("".equals(queueKey)) {
308                 // Typical case, barring overrides
309                 queueKey =
310                     queueAssignmentPolicy.getClassKey(controller, cauri);
311                 // The queueAssignmentPolicy is always based on Hostnames
312                 // We may need to remove any www[0-9]{0,}\. prefixes from the
313                 // hostnames
314                 if(((Boolean)getUncheckedAttribute(
315                         cauri,ATTR_QUEUE_IGNORE_WWW)).booleanValue()){
316                     queueKey = queueKey.replaceAll("^www[0-9]{0,}\\.","");
317                 }
318             }
319             return queueKey;
320     }
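    // Illustrative example (added, not in the original source): assuming the
    // hostname-based assignment policy returns the bare hostname, enabling
    // queue-ignore-www makes "www.example.com" and "www2.example.com" both
    // map to the class key "example.com", so their URIs share one host queue;
    // with the option off they would get separate queues.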
321
322     /**
323      * Canonicalize passed uuri. It would be sweeter if this canonicalize
324      * function were encapsulated by that which it canonicalizes, but because
325      * settings change with context -- i.e. there may be overrides in operation
326      * for a particular URI -- it's not so easy; each CandidateURI would need a
327      * reference to the settings system. That's awkward to pass in.
328      *
329      * @param uuri Candidate URI to canonicalize.
330      * @return Canonicalized version of passed <code>uuri</code>.
331      */

332     protected String JavaDoc canonicalize(UURI uuri) {
333         return Canonicalizer.canonicalize(uuri, this.controller.getOrder());
334     }
335
336     /**
337      * Canonicalize passed CandidateURI. This method differs from
338      * {@link #canonicalize(UURI)} in that it takes a look at
339      * the CandidateURI context possibly overriding any canonicalization effect if
340      * it could make us miss content. If canonicalization produces an URL that
341      * was 'alreadyseen', but the entry in the 'alreadyseen' database did
342      * nothing but redirect to the current URL, we won't get the current URL;
343      * we'll think we've already see it. Examples would be archive.org
344      * we'll think we've already seen it. Examples would be archive.org
345      * redirecting to netarkivet.net (assuming stripWWW rule enabled).
346      * <p>Note, this method under circumstance sets the forceFetch flag.
347      * <p>Note, this method under certain circumstances sets the forceFetch flag.
348      * @param cauri CandidateURI to examine.
349      * @return Canonicalized <code>cauri</code>.
350      */

351     protected String JavaDoc canonicalize(CandidateURI cauri) {
352         String JavaDoc canon = canonicalize(cauri.getUURI());
353         if (cauri.isLocation()) {
354             // If the via is not the same as where we're being redirected (i.e.
355             // we're not being redirected back to the same page, AND the
356             // canonicalization of the via is equal to the current cauri,
357             // THEN forcefetch (Forcefetch so no chance of our not crawling
358             // content because alreadyseen check thinks it's seen the url before.
359             // An example of an URL that redirects to itself is:
360             // http://bridalelegance.com/images/buttons3/tuxedos-off.gif.
361             // An example of an URL whose canonicalization equals its via's
362             // canonicalization, and we want to fetch content at the
363             // redirection (i.e. need to set forcefetch), is netarkivet.dk.
364             if (!cauri.toString().equals(cauri.getVia().toString()) &&
365                     canonicalize(cauri.getVia()).equals(canon)) {
366                 cauri.setForceFetch(true);
367             }
368         }
369         return canon;
370     }
371
372     /**
373      *
374      * @param caUri The URI to schedule.
375      */

376     protected void innerSchedule(CandidateURI caUri) {
377         CrawlURI curi;
378         if(caUri instanceof CrawlURI) {
379             curi = (CrawlURI) caUri;
380         } else {
381             curi = CrawlURI.from(caUri,System.currentTimeMillis());
382             curi.putLong(A_TIME_OF_NEXT_PROCESSING,
383                 System.currentTimeMillis());
384             // New CrawlURIs get 'current time' as the time of next processing.
385         }
386         
387         if(curi.getClassKey() == null){
388             curi.setClassKey(getClassKey(curi));
389         }
390
391         if(curi.isSeed() && curi.getVia() != null
392                 && curi.flattenVia().length() > 0) {
393             // The only way a seed can have a non-empty via is if it is the
394             // result of a seed redirect. Add it to the seeds list.
395             //
396             // This is a feature. This is handling for the case where a seed
397             // gets immediately redirected to another page. What we're doing
398             // is treating the immediate redirect target as a seed.
399             this.controller.getScope().addSeed(curi);
400             // And it needs rapid scheduling.
401             curi.setSchedulingDirective(CandidateURI.MEDIUM);
402         }
403         
404         // Optionally preferencing embeds up to MEDIUM
405         int prefHops = ((Integer) getUncheckedAttribute(curi,
406                 ATTR_PREFERENCE_EMBED_HOPS)).intValue();
407         boolean prefEmbed = false;
408         if (prefHops > 0) {
409             int embedHops = curi.getTransHops();
410             if (embedHops > 0 && embedHops <= prefHops
411                     && curi.getSchedulingDirective() == CandidateURI.NORMAL) {
412                 // number of embed hops falls within the preferenced range, and
413                 // uri is not already MEDIUM -- so promote it
414                 curi.setSchedulingDirective(CandidateURI.MEDIUM);
415                 prefEmbed = true;
416             }
417         }
418
419         // Finally, allow curi to be fetched right now
420         // (while not overriding overdue items)
421         curi.putLong(A_TIME_OF_NEXT_PROCESSING,
422                 System.currentTimeMillis());
423         
424         try {
425             logger.finest("scheduling " + curi.toString());
426             AdaptiveRevisitHostQueue hq = getHQ(curi);
427             hq.add(curi,prefEmbed);
428         } catch (IOException JavaDoc e) {
429             // TODO Handle IOExceptions
430             e.printStackTrace();
431         }
432         
433     }
434
435     /**
436      * Get the AdaptiveRevisitHostQueue for the given CrawlURI, creating
437      * it if necessary.
438      *
439      * @param curi CrawlURI for which to get a queue
440      * @return AdaptiveRevisitHostQueue for given CrawlURI
441      * @throws IOException
442      */

443     protected AdaptiveRevisitHostQueue getHQ(CrawlURI curi) throws IOException JavaDoc {
444         AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
445         if(hq == null){
446             // Need to create it.
447             int valence = DEFAULT_HOST_VALENCE.intValue();
448             try {
449                 valence = ((Integer JavaDoc)getAttribute(curi,ATTR_HOST_VALENCE)).intValue();
450             } catch (AttributeNotFoundException JavaDoc e2) {
451                 logger.severe("Unable to load valence.");
452             }
453             hq = hostQueues.createHQ(curi.getClassKey(),valence);
454         }
455         return hq;
456     }
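    // Editorial note (inferred from the method above, not in the original
    // source): host queues are created lazily, one per class key, the first
    // time a URI for that key is scheduled. The queue's valence -- the maximum
    // number of simultaneous requests allowed to that host -- comes from the
    // host-valence setting, falling back to the default of 1 if the attribute
    // cannot be read.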
457
458     protected void batchSchedule(CandidateURI caUri) {
459         threadWaiting.getQueue().enqueue(caUri);
460     }
461
462     protected void batchFlush() {
463         innerBatchFlush();
464     }
465
466     private void innerBatchFlush() {
467         Queue q = threadWaiting.getQueue();
468         while(!q.isEmpty()) {
469             CandidateURI caUri = (CandidateURI)q.dequeue();
470             if(alreadyIncluded != null){
471                 String canon = canonicalize(caUri);
472                 logger.finest("Canonicalized " + caUri + " to " + canon);
473                 if (caUri.forceFetch()) {
474                     alreadyIncluded.addForce(canon, caUri);
475                 } else {
476                     alreadyIncluded.add(canon, caUri);
477                 }
478             } else {
479                 innerSchedule(caUri);
480             }
481         }
482     }
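    // Editorial note (inferred from the code above, not in the original
    // source): URIs passed to schedule() are first parked in the calling
    // thread's ThreadLocalQueue and only reach a host queue when the batch is
    // flushed, from loadSeeds() or innerFinished(). When the optional
    // UriUniqFilter is enabled, the flush pushes each canonicalized URI
    // through the filter, which drops duplicates and delivers the remainder
    // back to this class via receive(), which in turn calls innerSchedule().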
483     
484     /**
485      * @param curi
486      * @return the CrawlServer to be associated with this CrawlURI
487      */

488     protected CrawlServer getServer(CrawlURI curi) {
489         return this.controller.getServerCache().getServerFor(curi);
490     }
491
492     /* (non-Javadoc)
493      * @see org.archive.crawler.framework.Frontier#next()
494      */

495     public synchronized CrawlURI next()
496             throws InterruptedException JavaDoc, EndedException {
497         controller.checkFinish();
498         
499         while(shouldPause){
500             controller.toePaused();
501             wait();
502         }
503         
504         if(shouldTerminate){
505             throw new EndedException("terminated");
506         }
507         
508         AdaptiveRevisitHostQueue hq = hostQueues.getTopHQ();
509         
510         while(hq.getState() != AdaptiveRevisitHostQueue.HQSTATE_READY){
511             // Ok, so we don't have a ready queue, wait until the top one
512             // will become available.
513             long waitTime = hq.getNextReadyTime() - System.currentTimeMillis();
514             if(waitTime > 0){
515                 wait(waitTime);
516             }
517             // The top HQ may have changed, so get it again
518             hq = hostQueues.getTopHQ();
519         }
520
521         if(shouldTerminate){
522             // May have been terminated while thread was waiting for IO
523             throw new EndedException("terminated");
524         }
525         
526         try {
527             CrawlURI curi = hq.next();
528             // Populate CURI with 'transient' variables such as server.
529             logger.fine("Issuing " + curi.toString());
530             long temp = curi.getLong(A_TIME_OF_NEXT_PROCESSING);
531             long currT = System.currentTimeMillis();
532             long overdue = (currT-temp);
533             if(logger.isLoggable(Level.FINER)){
534                 String JavaDoc waitI = "not set";
535                 if(curi.containsKey(A_WAIT_INTERVAL)){
536                     waitI = ArchiveUtils.formatMillisecondsToConventional(
537                             curi.getLong(A_WAIT_INTERVAL));
538                 }
539                 logger.finer("Wait interval: " + waitI +
540                         ", Time of next proc: " + temp +
541                         ", Current time: " + currT +
542                         ", Overdue by: " + overdue + "ms");
543             }
544             if(overdue < 0){
545                 // This should never happen.
546                 logger.severe("Time overdue for " + curi.toString() +
547                         " is negative (" + overdue + ")!");
548             }
549             curi.putLong(A_FETCH_OVERDUE,overdue);
550             return curi;
551         } catch (IOException JavaDoc e) {
552             // TODO: Need to handle this in an intelligent manner.
553             // Is probably fatal?
554             e.printStackTrace();
555         }
556
557         return null;
558     }
559
560     /* (non-Javadoc)
561      * @see org.archive.crawler.framework.Frontier#isEmpty()
562      */

563     public boolean isEmpty() {
564         // Technically, the Frontier should never become empty since URIs are
565         // only discarded under exceptional circumstances.
566         return hostQueues.getSize() == 0;
567     }
568
569     /* (non-Javadoc)
570      * @see org.archive.crawler.framework.Frontier#schedule(org.archive.crawler.datamodel.CandidateURI)
571      */

572     public void schedule(CandidateURI caURI) {
573         batchSchedule(caURI);
574     }
575
576     /* (non-Javadoc)
577      * @see org.archive.crawler.framework.Frontier#finished(org.archive.crawler.datamodel.CrawlURI)
578      */

579     public synchronized void finished(CrawlURI curi) {
580         logger.fine(curi.toString()+ " " +
581                 CrawlURI.fetchStatusCodesToString(curi.getFetchStatus()));
582         curi.incrementFetchAttempts();
583         logLocalizedErrors(curi);
584
585         innerFinished(curi);
586     }
587     
588     protected synchronized void innerFinished(CrawlURI curi) {
589         try {
590             innerBatchFlush();
591             
592             if (curi.isSuccess()) {
593                 successDisposition(curi);
594             } else if (needsPromptRetry(curi)) {
595                 // Consider statuses which allow nearly-immediate retry
596                 // (like deferred to allow precondition to be fetched)
597                 reschedule(curi,false);
598             } else if (needsRetrying(curi)) {
599                 // Consider errors which can be retried
600                 reschedule(curi,true);
601                 controller.fireCrawledURINeedRetryEvent(curi);
602             } else if(isDisregarded(curi)) {
603                 // Check for codes that mean that while the crawler did
604                 // manage to get it, it must be disregarded for some reason.
605                 disregardDisposition(curi);
606             } else {
607                 // In that case FAILURE, note & log
608                 failureDisposition(curi);
609             }
610
611             // New items might be available, let waiting threads know
612             // More than one queue might have become available due to
613             // scheduling of items outside the parent URI's host, so we
614             // wake all waiting threads.
615             notifyAll();
616         } catch (RuntimeException JavaDoc e) {
617             curi.setFetchStatus(S_RUNTIME_EXCEPTION);
618             // store exception temporarily for logging
619             logger.warning("RTE in innerFinished() " +
620                 e.getMessage());
621             e.printStackTrace();
622             curi.putObject(A_RUNTIME_EXCEPTION, e);
623             failureDisposition(curi);
624         } catch (AttributeNotFoundException JavaDoc e) {
625             logger.severe(e.getMessage());
626         }
627     }
628
629     /**
630      * Take note of any processor-local errors that have
631      * been entered into the CrawlURI.
632      * @param curi CrawlURI with errors.
633      */

634     private void logLocalizedErrors(CrawlURI curi) {
635         if(curi.containsKey(A_LOCALIZED_ERRORS)) {
636             List JavaDoc localErrors = (List JavaDoc)curi.getObject(A_LOCALIZED_ERRORS);
637             Iterator JavaDoc iter = localErrors.iterator();
638             while(iter.hasNext()) {
639                 Object JavaDoc array[] = {curi, iter.next()};
640                 controller.localErrors.log(Level.WARNING,
641                     curi.getUURI().toString(), array);
642             }
643             // once logged, discard
644             curi.remove(A_LOCALIZED_ERRORS);
645         }
646     }
647     
648     /**
649      * The CrawlURI has been successfully crawled.
650      *
651      * @param curi The CrawlURI
652      */

653     protected void successDisposition(CrawlURI curi) {
654         curi.aboutToLog();
655
656         long waitInterval = 0;
657         
658         if(curi.containsKey(A_WAIT_INTERVAL)){
659             waitInterval = curi.getLong(A_WAIT_INTERVAL);
660             curi.addAnnotation("wt:" +
661                     ArchiveUtils.formatMillisecondsToConventional(
662                             waitInterval));
663         } else {
664             logger.severe("Missing wait interval for " + curi.toString() +
665                     " WaitEvaluator may be missing.");
666         }
667         if(curi.containsKey(A_NUMBER_OF_VISITS)){
668             curi.addAnnotation(curi.getInt(A_NUMBER_OF_VISITS) + "vis");
669         }
670         if(curi.containsKey(A_NUMBER_OF_VERSIONS)){
671             curi.addAnnotation(curi.getInt(A_NUMBER_OF_VERSIONS) + "ver");
672         }
673         if(curi.containsKey(A_FETCH_OVERDUE)){
674             curi.addAnnotation("ov:" +
675                     ArchiveUtils.formatMillisecondsToConventional(
676                     (curi.getLong(A_FETCH_OVERDUE))));
677         }
678         
679         Object JavaDoc array[] = { curi };
680         controller.uriProcessing.log(
681             Level.INFO,
682             curi.getUURI().toString(),
683             array);
684
685         succeededFetchCount++;
686         totalProcessedBytes += curi.getContentSize();
687
688         // Let everyone know in case they want to do something before we strip
689         // the curi.
690         controller.fireCrawledURISuccessfulEvent(curi);
691         
692         curi.setSchedulingDirective(CandidateURI.NORMAL);
693
694         // Set time of next processing
695         curi.putLong(A_TIME_OF_NEXT_PROCESSING,
696                 System.currentTimeMillis()+waitInterval);
697         
698         
699         /* Update HQ */
700         AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
701         
702         // Wake up time is based on the time when a fetch was completed + the
703         // calculated snooze time for politeness. If the fetch completion time
704         // is missing, we'll use current time.
705         long wakeupTime = (curi.containsKey(A_FETCH_COMPLETED_TIME)?
706                 curi.getLong(A_FETCH_COMPLETED_TIME):
707                     (new Date()).getTime()) + calculateSnoozeTime(curi);
708
709         // Ready the URI for reserialization.
710         curi.processingCleanup();
711         curi.resetDeferrals();
712         curi.resetFetchAttempts();
713         try {
714             hq.update(curi, true, wakeupTime);
715         } catch (IOException JavaDoc e) {
716             logger.severe("An IOException occured when updating " +
717                     curi.toString() + "\n" + e.getMessage());
718             e.printStackTrace();
719         }
720     }
721
722     /**
723      * Put near top of relevant hostQueue (but behind anything recently
724      * scheduled 'high').
725      *
726      * @param curi CrawlURI to reschedule. Its time of next processing is not
727      * modified.
728      * @param errorWait signals if there should be a wait before retrying.
729      * @throws AttributeNotFoundException
730      */

731     protected void reschedule(CrawlURI curi, boolean errorWait)
732             throws AttributeNotFoundException JavaDoc {
733         long delay = 0;
734         if(errorWait){
735             if(curi.containsKey(A_RETRY_DELAY)) {
736                 delay = curi.getLong(A_RETRY_DELAY);
737             } else {
738                 // use ARFrontier default
739                 delay = ((Long)getAttribute(ATTR_RETRY_DELAY,curi)).longValue();
740             }
741         }
742         
743         long retryTime = (curi.containsKey(A_FETCH_COMPLETED_TIME)?
744                 curi.getLong(A_FETCH_COMPLETED_TIME):
745                     (new Date JavaDoc()).getTime()) + delay;
746         
747         AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
748         // Ready the URI for reserialization.
749         curi.processingCleanup();
750         if(errorWait){
751             curi.resetDeferrals(); // Deferrals only refer to immediate retries.
752         }
753         try {
754             hq.update(curi, errorWait, retryTime);
755         } catch (IOException JavaDoc e) {
756             // TODO Handle IOException
757             e.printStackTrace();
758         }
759     }
760
761     /**
762      * The CrawlURI has encountered a problem, and will not
763      * be retried.
764      *
765      * @param curi The CrawlURI
766      */

767     protected void failureDisposition(CrawlURI curi) {
768         //Let interested listeners know of failed disposition.
769         this.controller.fireCrawledURIFailureEvent(curi);
770
771         // send to basic log
772         curi.aboutToLog();
773         Object JavaDoc array[] = { curi };
774         this.controller.uriProcessing.log(
775             Level.INFO,
776             curi.getUURI().toString(),
777             array);
778
779         // if exception, also send to crawlErrors
780         if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) {
781             this.controller.runtimeErrors.log(
782                 Level.WARNING,
783                 curi.getUURI().toString(),
784                 array);
785         }
786         failedFetchCount++;
787         
788         // Put the failed URI at the very back of the queue.
789         curi.setSchedulingDirective(CandidateURI.NORMAL);
790         // TODO: reconsider this
791         curi.putLong(A_TIME_OF_NEXT_PROCESSING,Long.MAX_VALUE);
792
793         AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
794         // Ready the URI for serialization.
795         curi.processingCleanup();
796         curi.resetDeferrals();
797         curi.resetFetchAttempts();
798         try {
799             // No wait on failure. No contact was made with the server.
800             boolean shouldForget = shouldBeForgotten(curi);
801             if(shouldForget && alreadyIncluded != null){
802                 alreadyIncluded.forget(canonicalize(curi.getUURI()),curi);
803             }
804             hq.update(curi,false, 0, shouldForget);
805         } catch (IOException JavaDoc e) {
806             // TODO Handle IOException
807             e.printStackTrace();
808         }
809     }
810
811     protected void disregardDisposition(CrawlURI curi) {
812         //Let interested listeners know of disregard disposition.
813         controller.fireCrawledURIDisregardEvent(curi);
814
815         // send to basic log
816         curi.aboutToLog();
817         Object JavaDoc array[] = { curi };
818         controller.uriProcessing.log(
819             Level.INFO,
820             curi.getUURI().toString(),
821             array);
822
823         disregardedUriCount++;
824         
825         // TODO: consider a timeout before retrying disregarded elements.
826         // Possibly add a setting to the WaitEvaluators?
827         curi.putLong(A_TIME_OF_NEXT_PROCESSING,Long.MAX_VALUE);
828         curi.setSchedulingDirective(CandidateURI.NORMAL);
829
830         AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
831         // Ready the URI for reserialization.
832         curi.processingCleanup();
833         curi.resetDeferrals();
834         curi.resetFetchAttempts();
835         try {
836             // No politeness wait on disregard. No contact was made with the server.
837             hq.update(curi, false, 0, shouldBeForgotten(curi));
838         } catch (IOException JavaDoc e) {
839             // TODO Handle IOException
840             e.printStackTrace();
841         }
842     }
843
844     /**
845      * Some URIs, if they recur, deserve another
846      * chance at consideration: they might not be too
847      * many hops away via another path, or the scope
848      * may have been updated to allow them passage.
849      *
850      * @param curi
851      * @return True if curi should be forgotten.
852      */

853     protected boolean shouldBeForgotten(CrawlURI curi) {
854         switch(curi.getFetchStatus()) {
855             case S_OUT_OF_SCOPE:
856             case S_TOO_MANY_EMBED_HOPS:
857             case S_TOO_MANY_LINK_HOPS:
858                 return true;
859             default:
860                 return false;
861         }
862     }
863
864     /**
865      * Checks if a recently completed CrawlURI that did not finish successfully
866      * needs to be retried immediately (processed again as soon as politeness
867      * allows.)
868      *
869      * @param curi The CrawlURI to check
870      * @return True if we need to retry promptly.
871      * @throws AttributeNotFoundException If problems occur trying to read the
872      * maximum number of retries from the settings framework.
873      */

874     protected boolean needsPromptRetry(CrawlURI curi)
875             throws AttributeNotFoundException JavaDoc {
876         if (curi.getFetchAttempts() >=
877                 ((Integer JavaDoc)getAttribute(ATTR_MAX_RETRIES, curi)).intValue() ) {
878             return false;
879         }
880
881         switch (curi.getFetchStatus()) {
882             case S_DEFERRED:
883                 return true;
884
885             case HttpStatus.SC_UNAUTHORIZED:
886                 // We can get here though usually a positive status code is
887                 // a success. We get here if there is rfc2617 credential data
888                 // loaded and we're supposed to go around again. See if any
889                 // rfc2617 credential is present and, if so, assume it got
890                 // loaded in FetchHTTP on the expectation that we're to go around
891                 // again. If no rfc2617 credential is loaded, we should not be here.
892                 boolean loaded = curi.hasRfc2617CredentialAvatar();
893                 if (!loaded) {
894                     logger.severe("Have 401 but no creds loaded " + curi);
895                 }
896                 return loaded;
897
898             default:
899                 return false;
900         }
901     }
902
903     /**
904      * Checks if a recently completed CrawlURI that did not finish successfully
905      * needs to be retried (processed again after some time elapses)
906      *
907      * @param curi The CrawlURI to check
908      * @return True if we need to retry.
909      * @throws AttributeNotFoundException If problems occur trying to read the
910      * maximum number of retries from the settings framework.
911      */

912     protected boolean needsRetrying(CrawlURI curi)
913             throws AttributeNotFoundException JavaDoc {
914         // Check to see if maximum number of retries has been exceeded.
915         if (curi.getFetchAttempts() >=
916             ((Integer JavaDoc)getAttribute(ATTR_MAX_RETRIES,curi)).intValue() ) {
917             return false;
918         } else {
919             // Check if FetchStatus indicates that a delayed retry is needed.
920             switch (curi.getFetchStatus()) {
921                 case S_CONNECT_FAILED:
922                 case S_CONNECT_LOST:
923                 case S_DOMAIN_UNRESOLVABLE:
924                     // these are all worth a retry
925                     // TODO: consider if any others (S_TIMEOUT in some cases?)
926                     // deserve retry
927                     return true;
928                 default:
929                     return false;
930             }
931         }
932     }
933     
934     protected boolean isDisregarded(CrawlURI curi) {
935         switch (curi.getFetchStatus()) {
936             case S_ROBOTS_PRECLUDED : // they don't want us to have it
937             case S_OUT_OF_SCOPE : // filtered out by scope
938             case S_BLOCKED_BY_CUSTOM_PROCESSOR:
939             case S_BLOCKED_BY_USER : // filtered out by user
940             case S_TOO_MANY_EMBED_HOPS : // too far from last true link
941             case S_TOO_MANY_LINK_HOPS : // too far from seeds
942             case S_DELETED_BY_USER : // user deleted
943                 return true;
944             default:
945                 return false;
946         }
947     }
948     
949     /**
950      * Calculates how long a host queue needs to be snoozed following the
951      * crawling of a URI.
952      *
953      * @param curi The CrawlURI
954      * @return How long to snooze.
955      */

956     protected long calculateSnoozeTime(CrawlURI curi) {
957         long durationToWait = 0;
958         if (curi.containsKey(A_FETCH_BEGAN_TIME)
959             && curi.containsKey(A_FETCH_COMPLETED_TIME)) {
960             
961             try{
962             
963                 long completeTime = curi.getLong(A_FETCH_COMPLETED_TIME);
964                 long durationTaken =
965                     (completeTime - curi.getLong(A_FETCH_BEGAN_TIME));
966                 
967                 durationToWait = (long)(
968                         ((Float JavaDoc) getAttribute(ATTR_DELAY_FACTOR, curi))
969                             .floatValue() * durationTaken);
970     
971                 long minDelay =
972                     ((Integer JavaDoc) getAttribute(ATTR_MIN_DELAY, curi)).longValue();
973                 
974                 if (minDelay > durationToWait) {
975                     // wait at least the minimum
976                     durationToWait = minDelay;
977                 }
978     
979                 long maxDelay = ((Integer JavaDoc) getAttribute(ATTR_MAX_DELAY, curi)).longValue();
980                 if (durationToWait > maxDelay) {
981                     // wait no more than the maximum
982                     durationToWait = maxDelay;
983                 }
984             } catch (AttributeNotFoundException JavaDoc e) {
985                 logger.severe("Unable to find attribute. " +
986                         curi.toString());
987                 //Wait for max interval.
988                 durationToWait = DEFAULT_MAX_DELAY.longValue();
989             }
990
991         }
992         long ret = durationToWait > DEFAULT_MIN_DELAY.longValue() ?
993                 durationToWait : DEFAULT_MIN_DELAY.longValue();
994         logger.finest("Snooze time for " + curi.toString() + " = " + ret );
995         return ret;
996     }
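    // Worked example (illustrative, using the defaults declared at the top of
    // this class): a fetch that took 400 ms multiplied by the delay-factor of
    // 5 suggests a 2000 ms snooze; that equals the 2000 ms min-delay floor and
    // is far below the 30000 ms max-delay cap, so the host queue would be
    // snoozed for roughly two seconds before the next fetch from that host.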
997
998     /* (non-Javadoc)
999      * @see org.archive.crawler.framework.Frontier#discoveredUriCount()
1000     */

1001    public synchronized long discoveredUriCount() {
1002        return (this.alreadyIncluded != null) ?
1003                this.alreadyIncluded.count() : hostQueues.getSize();
1004    }
1005
1006    /* (non-Javadoc)
1007     * @see org.archive.crawler.framework.Frontier#queuedUriCount()
1008     */

1009    public synchronized long queuedUriCount() {
1010        return hostQueues.getSize();
1011    }
1012
1013    /* (non-Javadoc)
1014     * @see org.archive.crawler.framework.Frontier#finishedUriCount()
1015     */

1016    public long finishedUriCount() {
1017        return succeededFetchCount+failedFetchCount+disregardedUriCount;
1018    }
1019
1020    /* (non-Javadoc)
1021     * @see org.archive.crawler.framework.Frontier#succeededFetchCount()
1022     */

1023    public long succeededFetchCount() {
1024        return succeededFetchCount;
1025    }
1026
1027    /* (non-Javadoc)
1028     * @see org.archive.crawler.framework.Frontier#failedFetchCount()
1029     */

1030    public long failedFetchCount() {
1031        return failedFetchCount;
1032    }
1033
1034    /* (non-Javadoc)
1035     * @see org.archive.crawler.framework.Frontier#disregardedUriCount()
1036     */

1037    public long disregardedUriCount() {
1038        return disregardedUriCount;
1039    }
1040
1041    /* (non-Javadoc)
1042     * @see org.archive.crawler.framework.Frontier#totalBytesWritten()
1043     */

1044    public long totalBytesWritten() {
1045        return totalProcessedBytes;
1046    }
1047
1048    /**
1049     * Method is not supported by this Frontier implementation.
1050     * @param pathToLog
1051     * @throws IOException
1052     */

1053    public void importRecoverLog(String JavaDoc pathToLog) throws IOException JavaDoc {
1054        throw new IOException JavaDoc("Unsupported by this frontier.");
1055    }
1056
1057    public synchronized FrontierMarker getInitialMarker(String JavaDoc regexpr,
1058            boolean inCacheOnly) {
1059        return null;
1060    }
1061
1062    /* (non-Javadoc)
1063     * @see org.archive.crawler.framework.Frontier#getURIsList(org.archive.crawler.framework.FrontierMarker, int, boolean)
1064     */

1065    public synchronized ArrayList JavaDoc getURIsList(FrontierMarker marker,
1066            int numberOfMatches, boolean verbose)
1067        throws InvalidFrontierMarkerException {
1068        // TODO Auto-generated method stub
1069        return null;
1070    }
1071
1072    /* (non-Javadoc)
1073     * @see org.archive.crawler.framework.Frontier#deleteURIs(java.lang.String)
1074     */

1075    public synchronized long deleteURIs(String JavaDoc match) {
1076        // TODO Auto-generated method stub
1077        return 0;
1078    }
1079
1080    /* (non-Javadoc)
1081     * @see org.archive.crawler.framework.Frontier#deleted(org.archive.crawler.datamodel.CrawlURI)
1082     */

1083    public synchronized void deleted(CrawlURI curi) {
1084        // TODO Auto-generated method stub
1085    }
1086
1087    public void considerIncluded(UURI u) {
1088        // This will cause the URI to be crawled!!!
1089        CrawlURI curi = new CrawlURI(u);
1090        innerSchedule(curi);
1091
1092    }
1093
1094    public void kickUpdate() {
1095        loadSeeds();
1096    }
1097    
1098    public void start() {
1099        unpause();
1100    }
1101    
1102    synchronized public void pause() {
1103        shouldPause = true;
1104        notifyAll();
1105    }
1106    synchronized public void unpause() {
1107        shouldPause = false;
1108        notifyAll();
1109    }
1110    synchronized public void terminate() {
1111        shouldTerminate = true;
1112    }
1113
1114    /* (non-Javadoc)
1115     * @see org.archive.crawler.framework.Frontier#getFrontierJournal()
1116     */

1117    public FrontierJournal getFrontierJournal() {
1118        return null;
1119    }
1120
1121    private static class ThreadLocalQueue
1122    extends ThreadLocal JavaDoc<Queue<CandidateURI>> implements Serializable JavaDoc {
1123
1124        private static final long serialVersionUID = 8268977225156462059L;
1125
1126        protected Queue<CandidateURI> initialValue() {
1127            return new MemQueue<CandidateURI>();
1128        }
1129
1130        /**
1131         * @return Queue of 'batched' items
1132         */

1133        public Queue<CandidateURI> getQueue() {
1134            return get();
1135        }
1136    }
1137    
1138    /**
1139     * This method is not supported by this Frontier implementation
1140     * @param pathToLog
1141     * @param retainFailures
1142     * @throws IOException
1143     */

1144    public void importRecoverLog(String JavaDoc pathToLog, boolean retainFailures)
1145    throws IOException JavaDoc {
1146        throw new IOException JavaDoc("Unsupported");
1147    }
1148
1149    //
1150    // Reporter implementation
1151    //
1152

1153    public String JavaDoc[] getReports() {
1154        // none but default for now
1155        return new String[] {};
1156    }
1157    
1158    /* (non-Javadoc)
1159     * @see org.archive.util.Reporter#singleLineReport()
1160     */

1161    public String JavaDoc singleLineReport() {
1162        return ArchiveUtils.singleLineReport(this);
1163    }
1164
1165    /* (non-Javadoc)
1166     * @see org.archive.util.Reporter#reportTo(java.io.Writer)
1167     */

1168    public void reportTo(PrintWriter JavaDoc writer) throws IOException JavaDoc {
1169        reportTo(null,writer);
1170    }
1171    
1172    /* (non-Javadoc)
1173     * @see org.archive.crawler.framework.Frontier#oneLineReport()
1174     */

1175    public synchronized void singleLineReportTo(PrintWriter JavaDoc w) throws IOException JavaDoc {
1176        hostQueues.singleLineReportTo(w);
1177    }
1178
1179    /* (non-Javadoc)
1180     * @see org.archive.util.Reporter#singleLineLegend()
1181     */

1182    public String JavaDoc singleLineLegend() {
1183        return hostQueues.singleLineLegend();
1184    }
1185    
1186    /* (non-Javadoc)
1187     * @see org.archive.crawler.framework.Frontier#report()
1188     */

1189    public synchronized void reportTo(String JavaDoc name, PrintWriter JavaDoc writer) {
1190        // ignore name; only one report for now
1191        hostQueues.reportTo(name, writer);
1192    }
1193
1194    /* (non-Javadoc)
1195     * @see org.archive.crawler.event.CrawlStatusListener#crawlStarted(java.lang.String)
1196     */

1197    public void crawlStarted(String JavaDoc message) {
1198        // Not interested
1199    }
1200
1201    /* (non-Javadoc)
1202     * @see org.archive.crawler.event.CrawlStatusListener#crawlEnding(java.lang.String)
1203     */

1204    public void crawlEnding(String JavaDoc sExitMessage) {
1205        // Not interested
1206    }
1207
1208    /* (non-Javadoc)
1209     * @see org.archive.crawler.event.CrawlStatusListener#crawlEnded(java.lang.String)
1210     */

1211    public void crawlEnded(String JavaDoc sExitMessage) {
1212        // Cleanup!
1213        if (this.alreadyIncluded != null) {
1214            this.alreadyIncluded.close();
1215            this.alreadyIncluded = null;
1216        }
1217        hostQueues.close();
1218    }
1219
1220    /* (non-Javadoc)
1221     * @see org.archive.crawler.event.CrawlStatusListener#crawlPausing(java.lang.String)
1222     */

1223    public void crawlPausing(String JavaDoc statusMessage) {
1224        // Not interested
1225    }
1226
1227    /* (non-Javadoc)
1228     * @see org.archive.crawler.event.CrawlStatusListener#crawlPaused(java.lang.String)
1229     */

1230    public void crawlPaused(String JavaDoc statusMessage) {
1231        // Not interested
1232    }
1233
1234    /* (non-Javadoc)
1235     * @see org.archive.crawler.event.CrawlStatusListener#crawlResuming(java.lang.String)
1236     */

1237    public void crawlResuming(String JavaDoc statusMessage) {
1238        // Not interested
1239    }
1240
1241    /* (non-Javadoc)
1242     * @see org.archive.crawler.event.CrawlStatusListener#crawlCheckpoint(java.io.File)
1243     */

1244    public void crawlCheckpoint(File JavaDoc checkpointDir) throws Exception JavaDoc {
1245        // Not interested
1246    }
1247
1248    /* (non-Javadoc)
1249     * @see org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver#receive(org.archive.crawler.datamodel.CandidateURI)
1250     */

1251    public void receive(CandidateURI item) {
1252        logger.finest("Received " + item);
1253        innerSchedule(item);
1254    }
1255
1256    /* (non-Javadoc)
1257     * @see org.archive.crawler.framework.Frontier#getGroup(org.archive.crawler.datamodel.CrawlURI)
1258     */

1259    public FrontierGroup getGroup(CrawlURI curi) {
1260        try {
1261            return getHQ(curi);
1262        } catch (IOException JavaDoc ioe) {
1263            throw new RuntimeException JavaDoc(ioe);
1264        }
1265    }
1266    
1267    public long averageDepth() {
1268        return hostQueues.getAverageDepth();
1269    }
1270    
1271    public float congestionRatio() {
1272        return hostQueues.getCongestionRatio();
1273    }
1274    
1275    public long deepestUri() {
1276        return hostQueues.getDeepestQueueSize();
1277    }
1278}
1279