KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > framework > StatisticsTracking


1 /* Copyright (C) 2003 Internet Archive.
2  *
3  * This file is part of the Heritrix web crawler (crawler.archive.org).
4  *
5  * Heritrix is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser Public License as published by
7  * the Free Software Foundation; either version 2.1 of the License, or
8  * any later version.
9  *
10  * Heritrix is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser Public License
16  * along with Heritrix; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  */

19 package org.archive.crawler.framework;
20
21 import java.util.Iterator JavaDoc;
22 import java.util.Map JavaDoc;
23
24 import org.archive.crawler.framework.exceptions.FatalConfigurationException;
25
26 /**
27  * An interface for objects that want to collect statistics on
28  * running crawls. An implementation of this is referenced in the
29  * crawl order and loaded when the crawl begins.
30  *
31  * <p>It will be given a reference to the relevant CrawlController.
32  * The CrawlController will contain any additional configuration
33  * information needed.
34  *
35  * <p>Any class that implements this interface can be specified as a
36  * statistics tracker in a crawl order. The CrawlController will
37  * then create and initialize a copy of it and call it's start()
38  * method.
39  *
40  * <p>This interface also specifies several methods to access data that
41  * the CrawlController or the URIFrontier may be interested in at
42  * run time but do not want to have keep track of for themselves.
43  * {@link org.archive.crawler.framework.AbstractTracker AbstractTracker}
44  * implements these. If there are more then one StatisticsTracking
45  * classes defined in the crawl order only the first one will be
46  * used to access this data.
47  *
48  * <p>It is recommended that it register for
49  * {@link org.archive.crawler.event.CrawlStatusListener CrawlStatus} events and
50  * {@link org.archive.crawler.event.CrawlURIDispositionListener CrawlURIDisposition}
51  * events to be able to properly monitor a crawl. Both are registered with the
52  * CrawlController.
53  *
54  * @author Kristinn Sigurdsson
55  *
56  * @see AbstractTracker
57  * @see org.archive.crawler.event.CrawlStatusListener
58  * @see org.archive.crawler.event.CrawlURIDispositionListener
59  * @see org.archive.crawler.framework.CrawlController
60  */

61 public interface StatisticsTracking extends Runnable JavaDoc {
62     /** Seed successfully crawled */
63     public static final String JavaDoc SEED_DISPOSITION_SUCCESS =
64         "Seed successfully crawled";
65     /** Failed to crawl seed */
66     public static final String JavaDoc SEED_DISPOSITION_FAILURE =
67         "Failed to crawl seed";
68     /** Failed to crawl seed, will retry */
69     public static final String JavaDoc SEED_DISPOSITION_RETRY =
70         "Failed to crawl seed, will retry";
71     /** Seed was disregarded */
72     public static final String JavaDoc SEED_DISPOSITION_DISREGARD =
73         "Seed was disregarded";
74     /** Seed has not been processed */
75     public static final String JavaDoc SEED_DISPOSITION_NOT_PROCESSED =
76         "Seed has not been processed";
77     
78     /**
79      * Do initialization.
80      *
81      * The CrawlController will call this method before calling the start()
82      * method.
83      *
84      * @param c The {@link CrawlController CrawlController} running the crawl
85      * that this class is to gather statistics on.
86      * @throws FatalConfigurationException
87      */

88     public void initialize(CrawlController c)
89     throws FatalConfigurationException;
90
91     /**
92      * Returns how long the current crawl has been running (excluding any time
93      * spent paused/suspended/stopped) since it began.
94      *
95      * @return The length of time - in msec - that this crawl has been running.
96      */

97     public long crawlDuration();
98
99     /**
100      * Start the tracker's crawl timing.
101      */

102     public void noteStart();
103     
104
105     /**
106      * Returns the total number of uncompressed bytes written to disk. This may
107      * be different from the actual number if you are using compression.
108      *
109      * @return The total number of uncompressed bytes written to disk
110      */

111     public long totalBytesWritten();
112     
113     /**
114      * Total amount of time spent actively crawling so far.<p>
115      * Returns the total amount of time (in milliseconds) that has elapsed from
116      * the start of the crawl and until the current time or if the crawl has
117      * ended until the the end of the crawl <b>minus</b> any
118      * time spent paused.
119      * @return Total amount of time (in msec.) spent crawling so far.
120      */

121     public long getCrawlerTotalElapsedTime();
122     
123     /**
124      * Returns an estimate of recent document download rates
125      * based on a queue of recently seen CrawlURIs (as of last snapshot).
126      *
127      * @return The rate per second of documents gathered during the last
128      * snapshot
129      */

130     public double currentProcessedDocsPerSec();
131     
132     /**
133      * Returns the number of documents that have been processed
134      * per second over the life of the crawl (as of last snapshot)
135      *
136      * @return The rate per second of documents gathered so far
137      */

138     public double processedDocsPerSec();
139     
140     /**
141      * Calculates the rate that data, in kb, has been processed
142      * over the life of the crawl (as of last snapshot.)
143      *
144      * @return The rate per second of KB gathered so far
145      */

146     public long processedKBPerSec();
147
148     /**
149      * Calculates an estimate of the rate, in kb, at which documents
150      * are currently being processed by the crawler. For more
151      * accurate estimates set a larger queue size, or get
152      * and average multiple values (as of last snapshot).
153      *
154      * @return The rate per second of KB gathered during the last snapshot
155      */

156     public int currentProcessedKBPerSec();
157     
158     /**
159      * Get the number of active (non-paused) threads.
160      *
161      * @return The number of active (non-paused) threads
162      */

163     public int activeThreadCount();
164     
165     /**
166      * Number of <i>successfully</i> processed URIs.
167      *
168      * <p>If crawl not running (paused or stopped) this will return the value
169      * of the last snapshot.
170      *
171      * @return The number of successully fetched URIs
172      *
173      * @see org.archive.crawler.framework.Frontier#succeededFetchCount()
174      */

175     public long successfullyFetchedCount();
176     
177     /**
178      * @return Total number of URIs (processed + queued +
179      * currently being processed)
180      */

181     public long totalCount();
182     
183     public float congestionRatio();
184     public long deepestUri();
185     public long averageDepth();
186     
187     /**
188      * Get a SeedRecord iterator for the job being monitored. If job is no
189      * longer running, stored values will be returned. If job is running,
190      * current seed iterator will be fetched and stored values will be updated.
191      * <p>
192      * Sort order is:<br>
193      * No status code (not processed)<br>
194      * Status codes smaller then 0 (largest to smallest)<br>
195      * Status codes larger then 0 (largest to smallest)<br>
196      * <p>
197      * <b>Note:</b> This iterator will iterate over a list of
198      * <i>SeedRecords</i>.
199      * @return the seed iterator
200      */

201     public Iterator JavaDoc getSeedRecordsSortedByStatusCode();
202
203     /**
204      * @return legend of progress-statistics
205      */

206     public String JavaDoc progressStatisticsLegend();
207
208     /**
209      * @return line of progress-statistics
210      */

211     public String JavaDoc getProgressStatisticsLine();
212     
213     /**
214      * @return Map of progress-statistics.
215      */

216     public Map JavaDoc getProgressStatistics();
217 }
Popular Tags