

/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Created on Jul 16, 2003
 */

package org.archive.crawler.admin;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.Comparator;
import java.util.Date;
import java.util.EventObject;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.event.CrawlURIDispositionListener;
import org.archive.crawler.framework.AbstractTracker;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.net.UURI;
import org.archive.util.ArchiveUtils;
import org.archive.util.LongWrapper;
import org.archive.util.MimetypeUtils;
import org.archive.util.PaddingStringBuffer;
/**
 * This is an implementation of the AbstractTracker. It is designed to work
 * with the WUI as well as to perform various logging activities.
 * <p>
 * At the end of each snapshot a line is written to the
 * 'progress-statistics.log' file.
 * <p>
 * The header of that file is as follows:
 * <pre> [timestamp] [discovered] [queued] [downloaded] [doc/s(avg)] [KB/s(avg)] [dl-failures] [busy-thread] [mem-use-KB]</pre>
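 * For illustration, a snapshot line under that header might read (all
 * values hypothetical):
 * <pre> 2003-07-16T12:00:00Z 1200 800 400 0.40(0.38) 12(11) 3 25 65536</pre>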
 * First there is a <b>timestamp</b>, accurate down to 1 second.
 * <p>
 * <b>discovered</b>, <b>queued</b>, <b>downloaded</b> and <b>dl-failures</b>
 * are (respectively) the discovered URI count, pending URI count, successfully
 * fetched count and failed fetch count from the frontier at the time of the
 * snapshot.
 * <p>
 * <b>KB/s(avg)</b> is the bandwidth usage. We use the total bytes downloaded
 * to calculate average bandwidth usage (KB/sec). Since we also note the value
 * each time a snapshot is made, we can calculate the average bandwidth usage
 * during the last snapshot period to gain a "current" rate. The first number
 * is the current rate and the average is in parentheses.
 * <p>
 * <b>doc/s(avg)</b> works the same way as KB/s(avg), except that it shows the
 * number of documents (URIs) rather than KB downloaded.
 * <p>
 * <b>busy-threads</b> is the total number of ToeThreads that are not available
 * (and thus presumably busy processing a URI). This information is extracted
 * from the crawl controller.
 * <p>
 * Finally, <b>mem-use-KB</b> is extracted from the runtime environment
 * (<code>Runtime.getRuntime().totalMemory()</code>).
 * <p>
 * In addition to the data collected for the above logs, various other data
 * is gathered and stored by this tracker:
 * <ul>
 * <li> Successfully downloaded documents per fetch status code
 * <li> Successfully downloaded documents per document mime type
 * <li> Amount of data per mime type
 * <li> Successfully downloaded documents per host
 * <li> Amount of data per host
 * <li> Disposition of all seeds (this is written to 'reports.log' at end of
 * crawl)
 * <li> Successfully downloaded documents per host per source
 * </ul>
 *
 * @author Parker Thompson
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.framework.StatisticsTracking
 * @see org.archive.crawler.framework.AbstractTracker
 */
public class StatisticsTracker extends AbstractTracker
implements CrawlURIDispositionListener, Serializable {
    private static final long serialVersionUID = 8004878315916392305L;

    /**
     * Messages from the StatisticsTracker.
     */
    private final static Logger logger =
        Logger.getLogger(StatisticsTracker.class.getName());

    // TODO: Need to be able to specify file where the object will be
    // written once the CrawlEnded event occurs.

    protected long lastPagesFetchedCount = 0;
    protected long lastProcessedBytesCount = 0;

    /*
     * Snapshot data.
     */
    protected long discoveredUriCount = 0;
    protected long queuedUriCount = 0;
    protected long finishedUriCount = 0;
    protected long downloadedUriCount = 0;
    protected long downloadFailures = 0;
    protected long downloadDisregards = 0;
    protected double docsPerSecond = 0;
    protected double currentDocsPerSecond = 0;
    protected int currentKBPerSec = 0;
    protected long totalKBPerSec = 0;
    protected int busyThreads = 0;
    protected long totalProcessedBytes = 0;
    protected float congestionRatio = 0;
    protected long deepestUri;
    protected long averageDepth;

    /*
     * Cumulative data.
     */

    /** Keep track of the file types we see (mime type -> count). */
    protected Hashtable<String,LongWrapper> mimeTypeDistribution
     = new Hashtable<String,LongWrapper>();
    protected Hashtable<String,LongWrapper> mimeTypeBytes
     = new Hashtable<String,LongWrapper>();

    /** Keep track of fetch status codes. */
    protected Hashtable<String,LongWrapper> statusCodeDistribution
     = new Hashtable<String,LongWrapper>();

    /** Keep track of hosts.
     *
     * Each of these Maps is individually unsynchronized, and cannot
     * be trivially synchronized with the Collections wrapper. Thus
     * their synchronized access is enforced by this class.
     *
     * <p>They're transient because they are usually bigmaps that get
     * reconstituted on recovery from a checkpoint.
     */
    protected transient Map<String,LongWrapper> hostsDistribution = null;
    protected transient Map<String,LongWrapper> hostsBytes = null;
    protected transient Map<String,Long> hostsLastFinished = null;

    /** Keep track of URL counts per host per seed. */
    protected transient
    Map<String,HashMap<String,LongWrapper>> sourceHostDistribution = null;

    /**
     * Record of seeds' latest actions.
     */
    protected transient Map<String,SeedRecord> processedSeedsRecords;

    // Seeds tallies: ONLY UPDATED WHEN SEED REPORT WRITTEN.
    private int seedsCrawled;
    private int seedsNotCrawled;
    // sExitMessage: only set at crawl-end.
    private String sExitMessage = "Before crawl end";

    public StatisticsTracker(String name) {
        super(name, "A statistics tracker that's integrated into " +
            "the web UI and that creates the progress-statistics log.");
    }

    public void initialize(CrawlController c)
    throws FatalConfigurationException {
        super.initialize(c);
        try {
            this.sourceHostDistribution = c.getBigMap("sourceHostDistribution",
                    String.class, HashMap.class);
            this.hostsDistribution = c.getBigMap("hostsDistribution",
                String.class, LongWrapper.class);
            this.hostsBytes = c.getBigMap("hostsBytes", String.class,
                LongWrapper.class);
            this.hostsLastFinished = c.getBigMap("hostsLastFinished",
                String.class, Long.class);
            this.processedSeedsRecords = c.getBigMap("processedSeedsRecords",
                    String.class, SeedRecord.class);
        } catch (Exception e) {
            throw new FatalConfigurationException("Failed setup of" +
                " StatisticsTracker: " + e);
        }
        controller.addCrawlURIDispositionListener(this);
    }

    protected void finalCleanup() {
        super.finalCleanup();
        if (this.hostsBytes != null) {
            this.hostsBytes.clear();
            this.hostsBytes = null;
        }
        if (this.hostsDistribution != null) {
            this.hostsDistribution.clear();
            this.hostsDistribution = null;
        }
        if (this.hostsLastFinished != null) {
            this.hostsLastFinished.clear();
            this.hostsLastFinished = null;
        }
        if (this.processedSeedsRecords != null) {
            this.processedSeedsRecords.clear();
            this.processedSeedsRecords = null;
        }
        if (this.sourceHostDistribution != null) {
            this.sourceHostDistribution.clear();
            this.sourceHostDistribution = null;
        }
    }

    protected synchronized void progressStatisticsEvent(final EventObject e) {
        // This method loads "snapshot" data.
        discoveredUriCount = discoveredUriCount();
        downloadedUriCount = successfullyFetchedCount();
        finishedUriCount = finishedUriCount();
        queuedUriCount = queuedUriCount();
        downloadFailures = failedFetchAttempts();
        downloadDisregards = disregardedFetchAttempts();
        totalProcessedBytes = totalBytesWritten();
        congestionRatio = congestionRatio();
        deepestUri = deepestUri();
        averageDepth = averageDepth();

        if (finishedUriCount() == 0) {
            docsPerSecond = 0;
            totalKBPerSec = 0;
        } else if (getCrawlerTotalElapsedTime() < 1000) {
            return; // Not enough time has passed for a decent snapshot.
        } else {
            docsPerSecond = (double) downloadedUriCount /
                (double)(getCrawlerTotalElapsedTime() / 1000);
            // Round to nearest long.
            totalKBPerSec = (long)(((totalProcessedBytes / 1024) /
                 ((getCrawlerTotalElapsedTime()) / 1000)) + .5);
        }

        busyThreads = activeThreadCount();

        if (shouldrun ||
            (System.currentTimeMillis() - lastLogPointTime) >= 1000) {
            // If shouldrun is false there is a chance that the time interval
            // since last time is too small for a good sample. We only want
            // to update "current" data when the interval is long enough or
            // shouldrun is true.
            currentDocsPerSecond = 0;
            currentKBPerSec = 0;

            // Note time.
            long currentTime = System.currentTimeMillis();
            long sampleTime = currentTime - lastLogPointTime;

            // If we haven't done anything or there isn't a reasonable sample
            // size, give up.
            if (sampleTime >= 1000) {
                // Update docs/sec snapshot.
                long currentPageCount = successfullyFetchedCount();
                long samplePageCount = currentPageCount - lastPagesFetchedCount;

                currentDocsPerSecond =
                    (double) samplePageCount / (double)(sampleTime / 1000);

                lastPagesFetchedCount = currentPageCount;

                // Update kbytes/sec snapshot.
                long currentProcessedBytes = totalProcessedBytes;
                long sampleProcessedBytes =
                    currentProcessedBytes - lastProcessedBytesCount;

                currentKBPerSec =
                    (int)(((sampleProcessedBytes/1024)/(sampleTime/1000)) + .5);

                lastProcessedBytesCount = currentProcessedBytes;
            }
        }

        if (this.controller != null) {
            this.controller.logProgressStatistics(getProgressStatisticsLine());
        }
        lastLogPointTime = System.currentTimeMillis();
        super.progressStatisticsEvent(e);
    }
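
    // Worked example of the "current" rate arithmetic above, with
    // hypothetical numbers: if 3,072,000 bytes arrive during a 2,000 ms
    // sample, (3072000/1024) / (2000/1000) = 3000 / 2 = 1500, so
    // currentKBPerSec is 1500. Note the integer divisions: a sampleTime of
    // 2,999 ms still divides down to 2 whole seconds.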

    /**
     * Return one line of current progress-statistics.
     *
     * @param now Time to use for the line's timestamp.
     * @return String of stats
     */
    public String getProgressStatisticsLine(Date now) {
        return new PaddingStringBuffer()
            .append(ArchiveUtils.TIMESTAMP14ISO8601Z.format(now))
            .raAppend(32, discoveredUriCount)
            .raAppend(44, queuedUriCount)
            .raAppend(57, downloadedUriCount)
            .raAppend(74, ArchiveUtils.
                doubleToString(currentDocsPerSecond, 2) +
                "(" + ArchiveUtils.doubleToString(docsPerSecond, 2) + ")")
            .raAppend(85, currentKBPerSec + "(" + totalKBPerSec + ")")
            .raAppend(99, downloadFailures)
            .raAppend(113, busyThreads)
            .raAppend(126, (Runtime.getRuntime().totalMemory() -
                Runtime.getRuntime().freeMemory()) / 1024)
            .raAppend(140, Runtime.getRuntime().totalMemory() / 1024)
            .raAppend(153, ArchiveUtils.doubleToString(congestionRatio, 2))
            .raAppend(165, deepestUri)
            .raAppend(177, averageDepth)
            .toString();
    }

    public Map<String,Number> getProgressStatistics() {
        Map<String,Number> stats = new HashMap<String,Number>();
        stats.put("discoveredUriCount", new Long(discoveredUriCount));
        stats.put("queuedUriCount", new Long(queuedUriCount));
        stats.put("downloadedUriCount", new Long(downloadedUriCount));
        stats.put("currentDocsPerSecond", new Double(currentDocsPerSecond));
        stats.put("docsPerSecond", new Double(docsPerSecond));
        stats.put("totalKBPerSec", new Long(totalKBPerSec));
        stats.put("totalProcessedBytes", new Long(totalProcessedBytes));
        stats.put("currentKBPerSec", new Long(currentKBPerSec));
        stats.put("downloadFailures", new Long(downloadFailures));
        stats.put("busyThreads", new Integer(busyThreads));
        stats.put("congestionRatio", new Double(congestionRatio));
        stats.put("deepestUri", new Long(deepestUri));
        stats.put("averageDepth", new Long(averageDepth));
        stats.put("totalMemory", new Long(Runtime.getRuntime().totalMemory()));
        stats.put("freeMemory", new Long(Runtime.getRuntime().freeMemory()));
        return stats;
    }
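
    // Illustrative use only (hypothetical caller): values come back as the
    // boxed Number subtypes stored above, so unbox via the Number interface.
    //
    //   Map<String,Number> stats = tracker.getProgressStatistics();
    //   long queued = stats.get("queuedUriCount").longValue();
    //   double rate = stats.get("docsPerSecond").doubleValue();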

    /**
     * Return one line of current progress-statistics.
     *
     * @return String of stats
     */
    public String getProgressStatisticsLine() {
        return getProgressStatisticsLine(new Date());
    }

    public double processedDocsPerSec() {
        return docsPerSecond;
    }

    public double currentProcessedDocsPerSec() {
        return currentDocsPerSecond;
    }

    public long processedKBPerSec() {
        return totalKBPerSec;
    }

    public int currentProcessedKBPerSec() {
        return currentKBPerSec;
    }

    /** Returns a Hashtable that contains information about distributions of
     * encountered mime types. Key/value pairs represent
     * mime type -> count.
     * <p>
     * <b>Note:</b> All the values are wrapped with a
     * {@link LongWrapper LongWrapper}.
     * @return mimeTypeDistribution
     */
    public Hashtable<String,LongWrapper> getFileDistribution() {
        return mimeTypeDistribution;
    }

    /**
     * Increment a counter for a key in a given Map. Used for various
     * aggregate data.
     *
     * As this is used to change Maps which depend on StatisticsTracker
     * for their synchronization, this method should only be invoked
     * from a block synchronized on 'this'.
     *
     * @param map The Map
     * @param key The key for the counter to be incremented; if it does not
     * exist it will be added (set to 1). If null, the counter
     * "unknown" is incremented.
     */
    protected static void incrementMapCount(Map<String,LongWrapper> map,
            String key) {
        incrementMapCount(map, key, 1);
    }

    /**
     * Increment a counter for a key in a given Map by an arbitrary amount.
     * Used for various aggregate data. The increment amount can be negative.
     *
     * As this is used to change Maps which depend on StatisticsTracker
     * for their synchronization, this method should only be invoked
     * from a block synchronized on 'this'.
     *
     * @param map
     * The Map
     * @param key
     * The key for the counter to be incremented; if it does not exist
     * it will be added (set equal to <code>increment</code>).
     * If null, the counter "unknown" is incremented.
     * @param increment
     * The amount by which to increment the counter related to the
     * <code>key</code>.
     */
    protected static void incrementMapCount(Map<String,LongWrapper> map,
            String key, long increment) {
        if (key == null) {
            key = "unknown";
        }
        LongWrapper lw = (LongWrapper)map.get(key);
        if (lw == null) {
            map.put(key, new LongWrapper(increment));
        } else {
            lw.longValue += increment;
        }
    }
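
    // A minimal sketch of the counting pattern above (hypothetical map):
    //
    //   Map<String,LongWrapper> counts = new HashMap<String,LongWrapper>();
    //   incrementMapCount(counts, "text/html");     // "text/html" -> 1
    //   incrementMapCount(counts, "text/html", 4);  // "text/html" -> 5
    //   incrementMapCount(counts, null);            // "unknown"   -> 1
    //
    // LongWrapper is used instead of Long so a tally can be mutated in
    // place rather than re-boxed on every increment.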

    /**
     * Sort the entries of the given Map in descending order by their
     * values, which must be longs wrapped with <code>LongWrapper</code>.
     * <p>
     * Elements are sorted by value from largest to smallest. Equal values are
     * sorted in an arbitrary, but consistent manner by their keys. Only items
     * with identical value and key are considered equal.
     *
     * If the passed-in map requires access to be synchronized, the caller
     * should ensure this synchronization.
     *
     * @param mapOfLongWrapperValues
     * Assumes values are wrapped with LongWrapper.
     * @return a sorted map containing the same elements as the input map.
     */
    public TreeMap<String,LongWrapper> getReverseSortedCopy(
            final Map<String,LongWrapper> mapOfLongWrapperValues) {
        TreeMap<String,LongWrapper> sortedMap =
          new TreeMap<String,LongWrapper>(new Comparator<String>() {
            public int compare(String e1, String e2) {
                long firstVal = mapOfLongWrapperValues.get(e1).longValue;
                long secondVal = mapOfLongWrapperValues.get(e2).longValue;
                if (firstVal < secondVal) {
                    return 1;
                }
                if (secondVal < firstVal) {
                    return -1;
                }
                // If the values are the same, sort by keys.
                return e1.compareTo(e2);
            }
        });
        try {
            sortedMap.putAll(mapOfLongWrapperValues);
        } catch (UnsupportedOperationException e) {
            // OK. Try doing it the slow way then.
            for (Iterator<String> i =
                    mapOfLongWrapperValues.keySet().iterator(); i.hasNext();) {
                String key = i.next();
                sortedMap.put(key, mapOfLongWrapperValues.get(key));
            }
        }
        return sortedMap;
    }
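
    // Worked example (hypothetical input): a map {"a"=3, "b"=7, "c"=3}
    // comes back ordered b(7), a(3), c(3): largest value first, with ties
    // broken by key order.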

    /**
     * Return a Hashtable representing the distribution of status codes for
     * successfully fetched curis, as represented by a hashtable where key -&gt;
     * val represents (string)code -&gt; (integer)count.
     *
     * <b>Note: </b> All the values are wrapped with a
     * {@link LongWrapper LongWrapper}.
     *
     * @return statusCodeDistribution
     */
    public Hashtable<String,LongWrapper> getStatusCodeDistribution() {
        return statusCodeDistribution;
    }

    /**
     * Returns the time (in millisec) when a URI belonging to a given host was
     * last finished processing.
     *
     * @param host The host to look up time of last completed URI.
     * @return Returns the time (in millisec) when a URI belonging to a given
     * host was last finished processing. If no URI has been completed for host,
     * -1 will be returned.
     */
    public long getHostLastFinished(String host) {
        Long l = null;
        synchronized (hostsLastFinished) {
            l = (Long)hostsLastFinished.get(host);
        }
        return (l != null)? l.longValue(): -1;
    }

    /**
     * Returns the accumulated number of bytes downloaded from a given host.
     * @param host name of the host
     * @return the accumulated number of bytes downloaded from a given host
     */
    public long getBytesPerHost(String host) {
        synchronized (hostsBytes) {
            return ((LongWrapper)hostsBytes.get(host)).longValue;
        }
    }

    /**
     * Returns the accumulated number of bytes from files of a given file type.
     * @param filetype Filetype to check.
     * @return the accumulated number of bytes from files of a given mime type
     */
    public long getBytesPerFileType(String filetype) {
        return ((LongWrapper)mimeTypeBytes.get(filetype)).longValue;
    }

    /**
     * Get the total number of ToeThreads (sleeping and active).
     *
     * @return The total number of ToeThreads
     */
    public int threadCount() {
        return this.controller != null? controller.getToeCount(): 0;
    }

    /**
     * @return Current thread count (or zero if can't figure it out).
     */
    public int activeThreadCount() {
        return this.controller != null? controller.getActiveToeCount(): 0;
        // Note: reuse of the old busy value seemed misleading; anyone asking
        // for thread count when paused or stopped still wants an accurate
        // reading.
    }

    /**
     * This returns the number of completed URIs as a percentage of the total
     * number of URIs encountered (should be inverse to the discovery curve).
     *
     * @return The number of completed URIs as a percentage of the total
     * number of URIs encountered
     */
    public int percentOfDiscoveredUrisCompleted() {
        long completed = finishedUriCount();
        long total = discoveredUriCount();

        if (total == 0) {
            return 0;
        }

        return (int) (100 * completed / total);
    }

    /**
     * Number of <i>discovered</i> URIs.
     *
     * <p>If crawl not running (paused or stopped) this will return the value of
     * the last snapshot.
     *
     * @return A count of all URIs encountered
     *
     * @see org.archive.crawler.framework.Frontier#discoveredUriCount()
     */
    public long discoveredUriCount() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().discoveredUriCount() : discoveredUriCount;
    }

    /**
     * Number of URIs that have <i>finished</i> processing.
     *
     * @return Number of URIs that have finished processing
     *
     * @see org.archive.crawler.framework.Frontier#finishedUriCount()
     */
    public long finishedUriCount() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().finishedUriCount() : finishedUriCount;
    }

    /**
     * Get the total number of failed fetch attempts (connection failures ->
     * give up, etc.)
     *
     * @return The total number of failed fetch attempts
     */
    public long failedFetchAttempts() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().failedFetchCount() : downloadFailures;
    }

    /**
     * Get the total number of disregarded fetch attempts.
     *
     * @return The total number of disregarded fetch attempts
     */
    public long disregardedFetchAttempts() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().disregardedUriCount() : downloadDisregards;
    }

    public long successfullyFetchedCount() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().succeededFetchCount() : downloadedUriCount;
    }

    public long totalCount() {
        return queuedUriCount() + activeThreadCount() +
            successfullyFetchedCount();
    }

    /**
     * Ratio of number of threads that would theoretically allow
     * maximum crawl progress (if each was as productive as current
     * threads), to current number of threads.
     *
     * @return float congestion ratio
     */
    public float congestionRatio() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().congestionRatio() : congestionRatio;
    }

    /**
     * Ordinal position of the 'deepest' URI eligible
     * for crawling. Essentially, the length of the longest
     * frontier internal queue.
     *
     * @return long URI count to deepest URI
     */
    public long deepestUri() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().deepestUri() : deepestUri;
    }

    /**
     * Average depth of the last URI in all eligible queues.
     * That is, the average length of all eligible queues.
     *
     * @return long average depth of last URIs in queues
     */
    public long averageDepth() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().averageDepth() : averageDepth;
    }

    /**
     * Number of URIs <i>queued</i> up and waiting for processing.
     *
     * <p>If crawl not running (paused or stopped) this will return the value
     * of the last snapshot.
     *
     * @return Number of URIs queued up and waiting for processing.
     *
     * @see org.archive.crawler.framework.Frontier#queuedUriCount()
     */
    public long queuedUriCount() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().queuedUriCount() : queuedUriCount;
    }

    public long totalBytesWritten() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().totalBytesWritten() : totalProcessedBytes;
    }

    /**
     * If the curi is a seed, we update the processedSeeds table.
     *
     * @param curi The CrawlURI that may be a seed.
     * @param disposition The disposition of the CrawlURI.
     */
    private void handleSeed(CrawlURI curi, String disposition) {
        if (curi.isSeed()) {
            SeedRecord sr = new SeedRecord(curi, disposition);
            processedSeedsRecords.put(sr.getUri(), sr);
        }
    }

    public void crawledURISuccessful(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_SUCCESS);
        // Save status codes.
        incrementMapCount(statusCodeDistribution,
            Integer.toString(curi.getFetchStatus()));

        // Save mime types.
        String mime = MimetypeUtils.truncate(curi.getContentType());
        incrementMapCount(mimeTypeDistribution, mime);
        incrementMapCount(mimeTypeBytes, mime, curi.getContentSize());

        // Save hosts stats.
        saveHostStats((curi.getFetchStatus() == 1)? "dns:":
                this.controller.getServerCache().
                getHostFor(curi).getHostName(),
                curi.getContentSize());

        if (curi.containsKey(CrawlURI.A_SOURCE_TAG)) {
            saveSourceStats(curi.getString(CrawlURI.A_SOURCE_TAG),
                    this.controller.getServerCache().getHostFor(curi).
                    getHostName());
        }
    }

    protected void saveSourceStats(String source, String hostname) {
        synchronized (sourceHostDistribution) {
            HashMap<String,LongWrapper> hostUriCount =
                sourceHostDistribution.get(source);
            if (hostUriCount == null) {
                hostUriCount = new HashMap<String,LongWrapper>();
            }
            // TODO: Dan suggests we don't need a hashtable value. Might
            // be faster if we went without. Could just have keys of:
            // seed | host (concatenated as string)
            // and values of:
            // #urls
            incrementMapCount(hostUriCount, hostname);
            sourceHostDistribution.put(source, hostUriCount);
        }
    }

    protected void saveHostStats(String hostname, long size) {
        synchronized (hostsDistribution) {
            incrementMapCount(hostsDistribution, hostname);
        }
        synchronized (hostsBytes) {
            incrementMapCount(hostsBytes, hostname, size);
        }
        synchronized (hostsLastFinished) {
            hostsLastFinished.put(hostname,
                new Long(System.currentTimeMillis()));
        }
    }

    public void crawledURINeedRetry(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_RETRY);
    }

    public void crawledURIDisregard(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_DISREGARD);
    }

    public void crawledURIFailure(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_FAILURE);
    }

    /**
     * Get a seed iterator for the job being monitored.
     *
     * <b>Note:</b> This iterator will iterate over a list of <i>strings</i> not
     * UURIs like the Scope seed iterator. The strings are equal to the URIs'
     * getURIString() values.
     * @return the seed iterator
     * FIXME: Consider using TransformingIterator here
     */
    public Iterator<String> getSeeds() {
        List<String> seedsCopy = new Vector<String>();
        Iterator<UURI> i = controller.getScope().seedsIterator();
        while (i.hasNext()) {
            seedsCopy.add(i.next().toString());
        }
        return seedsCopy.iterator();
    }

    public Iterator getSeedRecordsSortedByStatusCode() {
        return getSeedRecordsSortedByStatusCode(getSeeds());
    }

    protected Iterator<SeedRecord> getSeedRecordsSortedByStatusCode(
            Iterator<String> i) {
        TreeSet<SeedRecord> sortedSet =
          new TreeSet<SeedRecord>(new Comparator<SeedRecord>() {
            public int compare(SeedRecord sr1, SeedRecord sr2) {
                int code1 = sr1.getStatusCode();
                int code2 = sr2.getStatusCode();
                if (code1 == code2) {
                    // If the values are equal, sort by URIs.
                    return sr1.getUri().compareTo(sr2.getUri());
                }
                // Mirror and shift the number line so as to
                // place zero at the beginning, then all negatives
                // in order of ascending absolute value, then all
                // positives descending.
                code1 = -code1 - Integer.MAX_VALUE;
                code2 = -code2 - Integer.MAX_VALUE;

                return new Integer(code1).compareTo(new Integer(code2));
            }
        });
        while (i.hasNext()) {
            String seed = i.next();
            SeedRecord sr = (SeedRecord) processedSeedsRecords.get(seed);
            if (sr == null) {
                sr = new SeedRecord(seed, SEED_DISPOSITION_NOT_PROCESSED);
                processedSeedsRecords.put(seed, sr);
            }
            sortedSet.add(sr);
        }
        return sortedSet.iterator();
    }
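
    // Worked example of the mirror-and-shift above, using hypothetical
    // status codes {200, 404, -61, 0}: they come back ordered
    // 0, -61, 404, 200. Zero sorts first, then negatives by ascending
    // absolute value, then positives from the largest code down (int
    // overflow in the subtraction wraps positive codes around to the
    // top of the range).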

    public void crawlEnded(String message) {
        logger.info("Entered crawlEnded");
        this.sExitMessage = message; // Held for reference by reports.
        super.crawlEnded(message);
        logger.info("Leaving crawlEnded");
    }

    /**
     * @param writer Where to write.
     */
    protected void writeSeedsReportTo(PrintWriter writer) {
        // Build header.
        writer.print("[code] [status] [seed] [redirect]\n");

        seedsCrawled = 0;
        seedsNotCrawled = 0;
        for (Iterator i = getSeedRecordsSortedByStatusCode(getSeeds());
                i.hasNext();) {
            SeedRecord sr = (SeedRecord)i.next();
            writer.print(sr.getStatusCode());
            writer.print(" ");
            if (sr.getStatusCode() > 0) {
                seedsCrawled++;
                writer.print("CRAWLED");
            } else {
                seedsNotCrawled++;
                writer.print("NOTCRAWLED");
            }
            writer.print(" ");
            writer.print(sr.getUri());
            if (sr.getRedirectUri() != null) {
                writer.print(" ");
                writer.print(sr.getRedirectUri());
            }
            writer.print("\n");
        }
    }
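
    // For illustration only, two hypothetical lines of the report written
    // above (code, CRAWLED/NOTCRAWLED, seed, optional redirect):
    //   200 CRAWLED http://www.example.com/
    //   -61 NOTCRAWLED http://unreachable.example.net/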

    protected void writeSourceReportTo(PrintWriter writer) {
        writer.print("[source] [host] [#urls]\n");
        // For each source...
        for (Iterator i = sourceHostDistribution.keySet().iterator();
                i.hasNext();) {
            Object sourceKey = i.next();
            Map<String,LongWrapper> hostCounts
             = (Map<String,LongWrapper>)sourceHostDistribution.get(sourceKey);
            // Sort hosts by #urls.
            SortedMap sortedHostCounts = getReverseSortedHostCounts(hostCounts);
            // For each host...
            for (Iterator j = sortedHostCounts.keySet().iterator();
                    j.hasNext();) {
                Object hostKey = j.next();
                LongWrapper hostCount = (LongWrapper) hostCounts.get(hostKey);
                writer.print(sourceKey.toString());
                writer.print(" ");
                writer.print(hostKey.toString());
                writer.print(" ");
                writer.print(hostCount.longValue);
                writer.print("\n");
            }
        }
    }

    /**
     * Return a copy of the given host counts in reverse-sorted
     * (largest first) order.
     *
     * @param hostCounts Map of host counts to sort.
     * @return SortedMap of host counts
     */
    public SortedMap getReverseSortedHostCounts(
            Map<String,LongWrapper> hostCounts) {
        synchronized (hostCounts) {
            return getReverseSortedCopy(hostCounts);
        }
    }

    protected void writeHostsReportTo(PrintWriter writer) {
        SortedMap hd = getReverseSortedHostsDistribution();
        // Header.
        writer.print("[#urls] [#bytes] [host]\n");
        for (Iterator i = hd.keySet().iterator(); i.hasNext();) {
            // Key is 'host'.
            Object key = i.next();
            if (hd.get(key) != null) {
                writer.print(((LongWrapper)hd.get(key)).longValue);
            } else {
                writer.print("-");
            }
            writer.print(" ");
            writer.print(getBytesPerHost((String)key));
            writer.print(" ");
            writer.print((String)key);
            writer.print("\n");
        }
    }

    /**
     * Return a copy of the hosts distribution in reverse-sorted
     * (largest first) order.
     * @return SortedMap of hosts distribution
     */
    public SortedMap getReverseSortedHostsDistribution() {
        synchronized (hostsDistribution) {
            return getReverseSortedCopy(hostsDistribution);
        }
    }

    protected void writeMimetypesReportTo(PrintWriter writer) {
        // Header.
        writer.print("[#urls] [#bytes] [mime-types]\n");
        TreeMap fd = getReverseSortedCopy(getFileDistribution());
        for (Iterator i = fd.keySet().iterator(); i.hasNext();) {
            Object key = i.next();
            // Key is mime type.
            writer.print(Long.toString(((LongWrapper)fd.get(key)).longValue));
            writer.print(" ");
            writer.print(Long.toString(getBytesPerFileType((String)key)));
            writer.print(" ");
            writer.print((String)key);
            writer.print("\n");
        }
    }

    protected void writeResponseCodeReportTo(PrintWriter writer) {
        // Build header.
        writer.print("[rescode] [#urls]\n");
        TreeMap scd = getReverseSortedCopy(getStatusCodeDistribution());
        for (Iterator i = scd.keySet().iterator(); i.hasNext();) {
            Object key = i.next();
            writer.print((String)key);
            writer.print(" ");
            writer.print(Long.toString(((LongWrapper)scd.get(key)).longValue));
            writer.print("\n");
        }
    }

    protected void writeCrawlReportTo(PrintWriter writer) {
        writer.print("Crawl Name: " + controller.getOrder().getCrawlOrderName());
        writer.print("\nCrawl Status: " + sExitMessage);
        writer.print("\nDuration Time: " +
                ArchiveUtils.formatMillisecondsToConventional(crawlDuration()));
        writer.print("\nTotal Seeds Crawled: " + seedsCrawled);
        writer.print("\nTotal Seeds not Crawled: " + seedsNotCrawled);
        // hostsDistribution contains all hosts crawled plus an entry for dns.
        writer.print("\nTotal Hosts Crawled: " + (hostsDistribution.size() - 1));
        writer.print("\nTotal Documents Crawled: " + finishedUriCount);
        writer.print("\nProcessed docs/sec: " +
                ArchiveUtils.doubleToString(docsPerSecond, 2));
        writer.print("\nBandwidth in Kbytes/sec: " + totalKBPerSec);
        writer.print("\nTotal Raw Data Size in Bytes: " + totalProcessedBytes +
                " (" + ArchiveUtils.formatBytesForDisplay(totalProcessedBytes) +
                ") \n");
    }

    protected void writeProcessorsReportTo(PrintWriter writer) {
        controller.reportTo(CrawlController.PROCESSORS_REPORT, writer);
    }

    protected void writeReportFile(String reportName, String filename) {
        File f = new File(controller.getDisk().getPath(), filename);
        try {
            PrintWriter bw = new PrintWriter(new FileWriter(f));
            writeReportTo(reportName, bw);
            bw.close();
            controller.addToManifest(f.getAbsolutePath(),
                CrawlController.MANIFEST_REPORT_FILE, true);
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Unable to write " + f.getAbsolutePath() +
                " at the end of crawl.", e);
        }
        logger.info("wrote report: " + f.getAbsolutePath());
    }

    /**
     * @param writer Where to write.
     */
    protected void writeManifestReportTo(PrintWriter writer) {
        controller.reportTo(CrawlController.MANIFEST_REPORT, writer);
    }

    /**
     * @param reportName Name of report.
     * @param w Where to write.
     */
    private void writeReportTo(String reportName, PrintWriter w) {
        if ("hosts".equals(reportName)) {
            writeHostsReportTo(w);
        } else if ("mime types".equals(reportName)) {
            writeMimetypesReportTo(w);
        } else if ("response codes".equals(reportName)) {
            writeResponseCodeReportTo(w);
        } else if ("seeds".equals(reportName)) {
            writeSeedsReportTo(w);
        } else if ("crawl".equals(reportName)) {
            writeCrawlReportTo(w);
        } else if ("processors".equals(reportName)) {
            writeProcessorsReportTo(w);
        } else if ("manifest".equals(reportName)) {
            writeManifestReportTo(w);
        } else if ("frontier".equals(reportName)) {
            writeFrontierReportTo(w);
        } else if ("source".equals(reportName)) {
            writeSourceReportTo(w);
        } // TODO: else default/error
    }

    /**
     * Write the Frontier's 'nonempty' report (if available).
     * @param writer to report to
     */
    protected void writeFrontierReportTo(PrintWriter writer) {
        if (controller.getFrontier().isEmpty()) {
            writer.println("frontier empty");
        } else {
            controller.getFrontier().reportTo("nonempty", writer);
        }
    }

    /**
     * Run the reports.
     */
    public void dumpReports() {
        // Add all files mentioned in the crawl order to the
        // manifest set.
        controller.addOrderToManifest();
        writeReportFile("hosts", "hosts-report.txt");
        writeReportFile("mime types", "mimetype-report.txt");
        writeReportFile("response codes", "responsecode-report.txt");
        writeReportFile("seeds", "seeds-report.txt");
        writeReportFile("crawl", "crawl-report.txt");
        writeReportFile("processors", "processors-report.txt");
        writeReportFile("manifest", "crawl-manifest.txt");
        writeReportFile("frontier", "frontier-report.txt");
        if (!sourceHostDistribution.isEmpty()) {
            writeReportFile("source", "source-report.txt");
        }
        // TODO: Save object to disk?
    }

    public void crawlCheckpoint(File cpDir) throws Exception {
        // CrawlController is managing the checkpointing of this object.
        logNote("CRAWL CHECKPOINTING TO " + cpDir.toString());
    }
}