KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > framework > AbstractTracker


1 /* Copyright (C) 2003 Internet Archive.
2  *
3  * This file is part of the Heritrix web crawler (crawler.archive.org).
4  *
5  * Heritrix is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser Public License as published by
7  * the Free Software Foundation; either version 2.1 of the License, or
8  * any later version.
9  *
10  * Heritrix is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser Public License
16  * along with Heritrix; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  */

19 package org.archive.crawler.framework;
20
21 import java.io.Serializable JavaDoc;
22 import java.util.Date JavaDoc;
23 import java.util.EventObject JavaDoc;
24 import java.util.logging.Level JavaDoc;
25
26 import javax.management.AttributeNotFoundException JavaDoc;
27
28 import org.archive.crawler.event.CrawlStatusListener;
29 import org.archive.crawler.framework.exceptions.FatalConfigurationException;
30 import org.archive.crawler.settings.ModuleType;
31 import org.archive.crawler.settings.SimpleType;
32 import org.archive.crawler.settings.Type;
33 import org.archive.util.ArchiveUtils;
34 import org.archive.util.PaddingStringBuffer;
35 import org.xbill.DNS.DClass;
36 import org.xbill.DNS.Lookup;
37
38 /**
39  * A partial implementation of the StatisticsTracking interface.
40  * <p>
41  * It covers the thread handling. (Launching, pausing etc.) Included in this is
42  * keeping track of the total time spent (actually) crawling. Several methods
43  * to access the time started, finished etc. are provided.
44  * <p>
45  * To handle the thread work the class implements the CrawlStatusListener and
46  * uses its events to pause, resume and stop logging of statistics. The run()
47  * method will call logActivity() at intervals specified in the crawl order.
48  * <p>
49  * Implementation of logActivity (the actual logging) as well as listening for
50  * CrawlURIDisposition events is not addressed.
51  *
52  * @author Kristinn Sigurdsson
53  *
54  * @see org.archive.crawler.framework.StatisticsTracking
55  * @see org.archive.crawler.admin.StatisticsTracker
56  */

57 public abstract class AbstractTracker extends ModuleType
58 implements StatisticsTracking, CrawlStatusListener, Serializable JavaDoc {
59     /** Default period between logging stat values */
60     public static final Integer JavaDoc DEFAULT_STATISTICS_REPORT_INTERVAL =
61         new Integer JavaDoc(20);
62     /** Attribute name for logging interval in seconds setting
63      */

64     public static final String JavaDoc ATTR_STATS_INTERVAL = "interval-seconds";
65
66     /** A reference to the CrawlContoller of the crawl that we are to track
67      * statistics for.
68      */

69     protected transient CrawlController controller;
70
71     // Keep track of time.
72
protected long crawlerStartTime;
73     protected long crawlerEndTime = -1; // Until crawl ends, this value is -1.
74
protected long crawlerPauseStarted = 0;
75     protected long crawlerTotalPausedTime = 0;
76
77     /** Timestamp of when this logger last wrote something to the log */
78     protected long lastLogPointTime;
79
80     protected boolean shouldrun = true;
81
82     /**
83      * @param name
84      * @param description
85      */

86     public AbstractTracker(String JavaDoc name, String JavaDoc description) {
87         super(name, description);
88         Type e = addElementToDefinition(new SimpleType(ATTR_STATS_INTERVAL,
89                 "The interval between writing progress information to log.",
90                 DEFAULT_STATISTICS_REPORT_INTERVAL));
91         e.setOverrideable(false);
92     }
93
94     /**
95      * Sets up the Logger (including logInterval) and registers with the
96      * CrawlController for CrawlStatus and CrawlURIDisposition events.
97      *
98      * @param c A crawl controller instance.
99      * @throws FatalConfigurationException Not thrown here. For overrides that
100      * go to settings system for configuration.
101      * @see CrawlStatusListener
102      * @see org.archive.crawler.event.CrawlURIDispositionListener
103      */

104     public void initialize(CrawlController c)
105     throws FatalConfigurationException {
106         this.controller = c;
107
108         // Add listeners
109
this.controller.addCrawlStatusListener(this);
110     }
111     
112     /**
113      * Start thread. Will call logActivity() at intervals specified by
114      * logInterval
115      *
116      */

117     public void run() {
118         // Don't start logging if we have no logger
119
if (this.controller == null) {
120             return;
121         }
122
123         shouldrun = true; //If we are starting, this should always be true.
124

125         // Log the legend
126
this.controller.logProgressStatistics(progressStatisticsLegend());
127         lastLogPointTime = System.currentTimeMillis(); // The first interval begins now.
128

129         // Keep logging until someone calls stop()
130
while (shouldrun) {
131             // Pause before writing the first entry (so we have real numbers)
132
// and then pause between entries
133
try {
134                 Thread.sleep(getLogWriteInterval() * 1000);
135             } catch (InterruptedException JavaDoc e) {
136                 e.printStackTrace();
137                 controller.runtimeErrors.log(Level.INFO,
138                     "Periodic stat logger interrupted while sleeping.");
139             }
140
141             // In case stop() was invoked while the thread was sleeping or we
142
// are paused.
143
if (shouldrun && getCrawlPauseStartedTime() == 0) {
144                 progressStatisticsEvent(new EventObject JavaDoc(this));
145             }
146         }
147     }
148
149     /**
150      * @return legend for progress-statistics lines/log
151      */

152     public String JavaDoc progressStatisticsLegend() {
153         return " timestamp" +
154             " discovered " +
155             " queued downloaded doc/s(avg) KB/s(avg) " +
156             " dl-failures busy-thread mem-use-KB heap-size-KB " +
157             " congestion max-depth avg-depth";
158     }
159
160     /**
161      * Notify tracker that crawl has begun. Must be called
162      * outside tracker's own thread, to ensure it is noted
163      * before other threads start interacting with tracker.
164      */

165     public void noteStart() {
166         if (this.crawlerStartTime == 0) {
167             // Note the time the crawl starts (only if not already set)
168
this.crawlerStartTime = System.currentTimeMillis();
169         }
170     }
171
172     /**
173      * A method for logging current crawler state.
174      *
175      * This method will be called by run() at intervals specified in
176      * the crawl order file. It is also invoked when pausing or
177      * stopping a crawl to capture the state at that point. Default behavior is
178      * call to {@link CrawlController#logProgressStatistics} so CrawlController
179      * can act on progress statistics event.
180      * <p>
181      * It is recommended that for implementations of this method it be
182      * carefully considered if it should be synchronized in whole or in
183      * part
184      * @param e Progress statistics event.
185      */

186     protected synchronized void progressStatisticsEvent(final EventObject JavaDoc e) {
187         this.controller.progressStatisticsEvent(e);
188         // temporary workaround for
189
// [ 996161 ] Fix DNSJava issues (memory) -- replace with JNDI-DNS?
190
// http://sourceforge.net/support/tracker.php?aid=996161
191
Lookup.getDefaultCache(DClass.IN).clearCache();
192     }
193
194     /**
195      * Get the starting time of the crawl (as given by
196      * <code>System.currentTimeMillis()</code> when the crawl started).
197      * @return time fo the crawl's start
198      */

199     public long getCrawlStartTime() {
200         return this.crawlerStartTime;
201     }
202
203     /**
204      * If crawl has ended it will return the time it ended (given by
205      * <code>System.currentTimeMillis()</code> at that time).
206      * <br>
207      * If crawl is still going on it will return the same as
208      * <code>System.currentTimeMillis()</code> at the time of the call.
209      * @return The time of the crawl ending or the current time if the crawl has
210      * not ended.
211      */

212     public long getCrawlEndTime() {
213         return (this.crawlerEndTime == -1)?
214             System.currentTimeMillis(): this.crawlerEndTime;
215     }
216
217     /**
218      * Returns the number of milliseconds that the crawl spent paused or
219      * otherwise in a nonactive state.
220      * @return the number of msec. that the crawl was paused or otherwise
221      * suspended.
222      */

223     public long getCrawlTotalPauseTime() {
224         return this.crawlerTotalPausedTime;
225     }
226
227     /**
228      * Get the time when the the crawl was last paused/suspended (as given by
229      * <code>System.currentTimeMillis()</code> at that time). Will be 0 if the
230      * crawl is not currently paused.
231      * @return time of the crawl's last pause/suspend or 0 if the crawl is not
232      * currently paused.
233      */

234     public long getCrawlPauseStartedTime() {
235         return this.crawlerPauseStarted;
236     }
237
238     public long getCrawlerTotalElapsedTime() {
239         if (getCrawlStartTime() == 0) {
240             // if no start time set yet, consider elapsed time zero
241
return 0;
242         }
243         
244         return (getCrawlPauseStartedTime() != 0)?
245             // Are currently paused, calculate time up to last pause
246
(getCrawlPauseStartedTime() - getCrawlTotalPauseTime() -
247                 getCrawlStartTime()):
248             // Not paused, calculate total time.
249
(getCrawlEndTime() - getCrawlTotalPauseTime() - getCrawlStartTime());
250     }
251
252     /**
253      * The number of seconds to wait between writing snapshot data to log file.
254      * @return the number of seconds to wait between writing snapshot data to
255      * log file.
256      */

257     protected int getLogWriteInterval() {
258         int logInterval;
259         try {
260             logInterval =
261                 ((Integer JavaDoc) getAttribute(null, ATTR_STATS_INTERVAL)).intValue();
262         } catch (AttributeNotFoundException JavaDoc e) {
263             logInterval = 10;
264         }
265         return logInterval;
266     }
267
268     /**
269      * @see org.archive.crawler.event.CrawlStatusListener#crawlPausing(java.lang.String)
270      */

271     public void crawlPausing(String JavaDoc statusMessage) {
272         logNote("CRAWL WAITING - " + statusMessage);
273     }
274
275     protected void logNote(final String JavaDoc note) {
276         this.controller.logProgressStatistics(new PaddingStringBuffer()
277                      .append(ArchiveUtils.TIMESTAMP14.format(new Date JavaDoc()))
278                      .append(" ")
279                      .append(note)
280                      .toString());
281     }
282
283     public void crawlPaused(String JavaDoc statusMessage) {
284         crawlerPauseStarted = System.currentTimeMillis();
285         progressStatisticsEvent(new EventObject JavaDoc(this));
286         logNote("CRAWL PAUSED - " + statusMessage);
287     }
288
289     public void crawlResuming(String JavaDoc statusMessage) {
290         tallyCurrentPause();
291         logNote("CRAWL RESUMED - " + statusMessage);
292         lastLogPointTime = System.currentTimeMillis();
293     }
294
295     /**
296      * For a current pause (if any), add paused time to total and reset
297      */

298     protected void tallyCurrentPause() {
299         if (this.crawlerPauseStarted > 0) {
300             // Ok, we managed to actually pause before resuming.
301
this.crawlerTotalPausedTime
302                 += (System.currentTimeMillis() - this.crawlerPauseStarted);
303         }
304         this.crawlerPauseStarted = 0;
305     }
306
307     public void crawlEnding(String JavaDoc sExitMessage) {
308         logNote("CRAWL ENDING - " + sExitMessage);
309     }
310
311     /**
312      * @see org.archive.crawler.event.CrawlStatusListener#crawlEnded(java.lang.String)
313      */

314     public void crawlEnded(String JavaDoc sExitMessage) {
315         // Note the time when the crawl stops.
316
crawlerEndTime = System.currentTimeMillis();
317         progressStatisticsEvent(new EventObject JavaDoc(this));
318         logNote("CRAWL ENDED - " + sExitMessage);
319         shouldrun = false;
320         dumpReports();
321         finalCleanup();
322     }
323
324     public void crawlStarted(String JavaDoc message) {
325         tallyCurrentPause();
326         noteStart();
327     }
328     
329     /**
330      * Dump reports, if any, on request or at crawl end.
331      */

332     protected void dumpReports() {
333         // by default do nothing; subclasses may override
334
}
335
336     /**
337      * Cleanup resources used, at crawl end.
338      */

339     protected void finalCleanup() {
340         controller = null; // Facilitate GC.
341
}
342
343     /**
344      * @see org.archive.crawler.framework.StatisticsTracking#crawlDuration()
345      */

346     public long crawlDuration() {
347         return getCrawlerTotalElapsedTime();
348     }
349 }
350
Popular Tags