StatisticsTracking


1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   */
19  package org.archive.crawler.framework;
20  
21  import java.util.Iterator  ;
22  import java.util.Map  ;
23  
24  import org.archive.crawler.framework.exceptions.FatalConfigurationException;
25  
26  /**
27   * An interface for objects that want to collect statistics on
28   * running crawls. An implementation of this is referenced in the
29   * crawl order and loaded when the crawl begins.
30   *
31   * <p>It will be given a reference to the relevant CrawlController.
32   * The CrawlController will contain any additional configuration
33   * information needed.
34   *
35   * <p>Any class that implements this interface can be specified as a
36   * statistics tracker in a crawl order.  The CrawlController will
37   * then create and initialize a copy of it and call it's start()
38   * method.
39   *
40   * <p>This interface also specifies several methods to access data that
41   * the CrawlController or the URIFrontier may be interested in at
42   * run time but do not want to have keep track of for themselves.
43   * {@link org.archive.crawler.framework.AbstractTracker AbstractTracker}
44   * implements these. If there are more then one StatisticsTracking
45   * classes defined in the crawl order only the first one will be
46   * used to access this data.
47   *
48   * <p>It is recommended that it register for
49   * {@link org.archive.crawler.event.CrawlStatusListener CrawlStatus} events and
50   * {@link org.archive.crawler.event.CrawlURIDispositionListener CrawlURIDisposition}
51   * events to be able to properly monitor a crawl. Both are registered with the
52   * CrawlController.
53   *
54   * @author Kristinn Sigurdsson
55   *
56   * @see AbstractTracker
57   * @see org.archive.crawler.event.CrawlStatusListener
58   * @see org.archive.crawler.event.CrawlURIDispositionListener
59   * @see org.archive.crawler.framework.CrawlController
60   */
61  public interface StatisticsTracking extends Runnable   {
62      /** Seed successfully crawled */
63      public static final String   SEED_DISPOSITION_SUCCESS =
64          "Seed successfully crawled";
65      /** Failed to crawl seed */
66      public static final String   SEED_DISPOSITION_FAILURE =
67          "Failed to crawl seed";
68      /** Failed to crawl seed, will retry */
69      public static final String   SEED_DISPOSITION_RETRY =
70          "Failed to crawl seed, will retry";
71      /** Seed was disregarded */
72      public static final String   SEED_DISPOSITION_DISREGARD =
73          "Seed was disregarded";
74      /** Seed has not been processed */
75      public static final String   SEED_DISPOSITION_NOT_PROCESSED =
76          "Seed has not been processed";
77      
78      /**
79       * Do initialization.
80       *
81       * The CrawlController will call this method before calling the start()
82       * method.
83       *
84       * @param c The {@link CrawlController CrawlController} running the crawl
85       * that this class is to gather statistics on.
86       * @throws FatalConfigurationException
87       */
88      public void initialize(CrawlController c)
89      throws FatalConfigurationException;
90  
91      /**
92       * Returns how long the current crawl has been running (excluding any time
93       * spent paused/suspended/stopped) since it began.
94       *
95       * @return The length of time - in msec - that this crawl has been running.
96       */
97      public long crawlDuration();
98  
99      /**
100      * Start the tracker's crawl timing. 
101      */
102     public void noteStart();
103     
104 
105     /**
106      * Returns the total number of uncompressed bytes written to disk.  This may
107      * be different from the actual number if you are using compression.
108      *
109      * @return The total number of uncompressed bytes written to disk
110      */
111     public long totalBytesWritten();
112     
113     /**
114      * Total amount of time spent actively crawling so far.<p>
115      * Returns the total amount of time (in milliseconds) that has elapsed from
116      * the start of the crawl and until the current time or if the crawl has
117      * ended until the the end of the crawl <b>minus</b> any
118      * time spent paused.
119      * @return Total amount of time (in msec.) spent crawling so far.
120      */
121     public long getCrawlerTotalElapsedTime();
122     
123     /**
124      * Returns an estimate of recent document download rates
125      * based on a queue of recently seen CrawlURIs (as of last snapshot).
126      *
127      * @return The rate per second of documents gathered during the last
128      * snapshot
129      */
130     public double currentProcessedDocsPerSec();
131     
132     /**
133      * Returns the number of documents that have been processed
134      * per second over the life of the crawl (as of last snapshot)
135      *
136      * @return  The rate per second of documents gathered so far
137      */
138     public double processedDocsPerSec();
139     
140     /**
141      * Calculates the rate that data, in kb, has been processed
142      * over the life of the crawl (as of last snapshot.)
143      *
144      * @return The rate per second of KB gathered so far
145      */
146     public long processedKBPerSec();
147 
148     /**
149      * Calculates an estimate of the rate, in kb, at which documents
150      * are currently being processed by the crawler.  For more
151      * accurate estimates set a larger queue size, or get
152      * and average multiple values (as of last snapshot).
153      *
154      * @return The rate per second of KB gathered during the last snapshot
155      */
156     public int currentProcessedKBPerSec();
157     
158     /**
159      * Get the number of active (non-paused) threads.
160      * 
161      * @return The number of active (non-paused) threads
162      */
163     public int activeThreadCount();
164     
165     /**
166      * Number of <i>successfully</i> processed URIs.
167      *
168      * <p>If crawl not running (paused or stopped) this will return the value
169      * of the last snapshot.
170      *
171      * @return The number of successully fetched URIs
172      *
173      * @see org.archive.crawler.framework.Frontier#succeededFetchCount()
174      */
175     public long successfullyFetchedCount();
176     
177     /**
178      * @return Total number of URIs (processed + queued +
179      * currently being processed)
180      */
181     public long totalCount();
182     
183     public float congestionRatio();
184     public long deepestUri();
185     public long averageDepth();
186     
187     /**
188      * Get a SeedRecord iterator for the job being monitored. If job is no 
189      * longer running, stored values will be returned. If job is running, 
190      * current seed iterator will be fetched and stored values will be updated.
191      * <p>
192      * Sort order is:<br>
193      * No status code (not processed)<br>
194      * Status codes smaller then 0 (largest to smallest)<br>
195      * Status codes larger then 0 (largest to smallest)<br>
196      * <p>
197      * <b>Note:</b> This iterator will iterate over a list of 
198      * <i>SeedRecords</i>.
199      * @return the seed iterator
200      */
201     public Iterator   getSeedRecordsSortedByStatusCode();
202 
203     /**
204      * @return legend of progress-statistics
205      */
206     public String   progressStatisticsLegend();
207 
208     /**
209      * @return line of progress-statistics
210      */
211     public String   getProgressStatisticsLine();
212     
213     /**
214      * @return Map of progress-statistics.
215      */
216     public Map   getProgressStatistics();
217 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags