

/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Created on Jul 16, 2003
 */

package org.archive.crawler.admin;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.Comparator;
import java.util.Date;
import java.util.EventObject;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.event.CrawlURIDispositionListener;
import org.archive.crawler.framework.AbstractTracker;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.net.UURI;
import org.archive.util.ArchiveUtils;
import org.archive.util.LongWrapper;
import org.archive.util.MimetypeUtils;
import org.archive.util.PaddingStringBuffer;
/**
 * This is an implementation of the AbstractTracker. It is designed to work
 * with the WUI as well as to perform various logging activities.
 * <p>
 * At the end of each snapshot a line is written to the
 * 'progress-statistics.log' file.
 * <p>
 * The header of that file is as follows:
 * <pre> [timestamp] [discovered] [queued] [downloaded] [doc/s(avg)] [KB/s(avg)] [dl-failures] [busy-thread] [mem-use-KB]</pre>
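 * For illustration, a snapshot line under that header might read (all
 * values hypothetical):
 * <pre> 2003-07-16T12:00:00Z 1200 800 400 0.40(0.38) 12(11) 3 25 65536</pre>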
 * First there is a <b>timestamp</b>, accurate down to 1 second.
 * <p>
 * <b>discovered</b>, <b>queued</b>, <b>downloaded</b> and <b>dl-failures</b>
 * are (respectively) the discovered URI count, pending URI count, successfully
 * fetched count and failed fetch count from the frontier at the time of the
 * snapshot.
 * <p>
 * <b>KB/s(avg)</b> is the bandwidth usage. We use the total bytes downloaded
 * to calculate average bandwidth usage (KB/sec). Since we also note the value
 * each time a snapshot is made, we can calculate the average bandwidth usage
 * during the last snapshot period to gain a "current" rate. The first number
 * is the current rate and the average is in parentheses.
 * <p>
 * <b>doc/s(avg)</b> works the same way as KB/s(avg), except that it shows the
 * number of documents (URIs) rather than KB downloaded.
 * <p>
 * <b>busy-threads</b> is the total number of ToeThreads that are not available
 * (and thus presumably busy processing a URI). This information is extracted
 * from the crawl controller.
 * <p>
 * Finally, <b>mem-use-KB</b> is extracted from the runtime environment
 * (<code>Runtime.getRuntime().totalMemory()</code>).
 * <p>
 * In addition to the data collected for the above logs, various other data
 * is gathered and stored by this tracker:
 * <ul>
 * <li> Successfully downloaded documents per fetch status code
 * <li> Successfully downloaded documents per document mime type
 * <li> Amount of data per mime type
 * <li> Successfully downloaded documents per host
 * <li> Amount of data per host
 * <li> Disposition of all seeds (this is written to 'reports.log' at end of
 * crawl)
 * <li> Successfully downloaded documents per host per source
 * </ul>
 *
 * @author Parker Thompson
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.framework.StatisticsTracking
 * @see org.archive.crawler.framework.AbstractTracker
 */
public class StatisticsTracker extends AbstractTracker
implements CrawlURIDispositionListener, Serializable {
    private static final long serialVersionUID = 8004878315916392305L;

    /**
     * Messages from the StatisticsTracker.
     */
    private final static Logger logger =
        Logger.getLogger(StatisticsTracker.class.getName());

    // TODO: Need to be able to specify file where the object will be
    // written once the CrawlEnded event occurs.

    protected long lastPagesFetchedCount = 0;
    protected long lastProcessedBytesCount = 0;

    /*
     * Snapshot data.
     */
    protected long discoveredUriCount = 0;
    protected long queuedUriCount = 0;
    protected long finishedUriCount = 0;
    protected long downloadedUriCount = 0;
    protected long downloadFailures = 0;
    protected long downloadDisregards = 0;
    protected double docsPerSecond = 0;
    protected double currentDocsPerSecond = 0;
    protected int currentKBPerSec = 0;
    protected long totalKBPerSec = 0;
    protected int busyThreads = 0;
    protected long totalProcessedBytes = 0;
    protected float congestionRatio = 0;
    protected long deepestUri;
    protected long averageDepth;

    /*
     * Cumulative data.
     */

    /** Keep track of the file types we see (mime type -> count). */
    protected Hashtable<String,LongWrapper> mimeTypeDistribution
     = new Hashtable<String,LongWrapper>();
    protected Hashtable<String,LongWrapper> mimeTypeBytes
     = new Hashtable<String,LongWrapper>();

    /** Keep track of fetch status codes. */
    protected Hashtable<String,LongWrapper> statusCodeDistribution
     = new Hashtable<String,LongWrapper>();

    /** Keep track of hosts.
     *
     * Each of these Maps is individually unsynchronized, and cannot
     * be trivially synchronized with the Collections wrapper. Thus
     * their synchronized access is enforced by this class.
     *
     * <p>They're transient because they are usually bigmaps that get
     * reconstituted on recovery from a checkpoint.
     */
    protected transient Map<String,LongWrapper> hostsDistribution = null;
    protected transient Map<String,LongWrapper> hostsBytes = null;
    protected transient Map<String,Long> hostsLastFinished = null;

    /** Keep track of URL counts per host per seed. */
    protected transient
    Map<String,HashMap<String,LongWrapper>> sourceHostDistribution = null;

    /**
     * Record of seeds' latest actions.
     */
    protected transient Map<String,SeedRecord> processedSeedsRecords;

    // Seeds tallies: ONLY UPDATED WHEN SEED REPORT WRITTEN.
    private int seedsCrawled;
    private int seedsNotCrawled;
    // sExitMessage: only set at crawl-end.
    private String sExitMessage = "Before crawl end";

    public StatisticsTracker(String name) {
        super(name, "A statistics tracker that's integrated into " +
            "the web UI and that creates the progress-statistics log.");
    }

    public void initialize(CrawlController c)
    throws FatalConfigurationException {
        super.initialize(c);
        try {
            this.sourceHostDistribution = c.getBigMap("sourceHostDistribution",
                    String.class, HashMap.class);
            this.hostsDistribution = c.getBigMap("hostsDistribution",
                String.class, LongWrapper.class);
            this.hostsBytes = c.getBigMap("hostsBytes", String.class,
                LongWrapper.class);
            this.hostsLastFinished = c.getBigMap("hostsLastFinished",
                String.class, Long.class);
            this.processedSeedsRecords = c.getBigMap("processedSeedsRecords",
                    String.class, SeedRecord.class);
        } catch (Exception e) {
            throw new FatalConfigurationException("Failed setup of" +
                " StatisticsTracker: " + e);
        }
        controller.addCrawlURIDispositionListener(this);
    }

    protected void finalCleanup() {
        super.finalCleanup();
        if (this.hostsBytes != null) {
            this.hostsBytes.clear();
            this.hostsBytes = null;
        }
        if (this.hostsDistribution != null) {
            this.hostsDistribution.clear();
            this.hostsDistribution = null;
        }
        if (this.hostsLastFinished != null) {
            this.hostsLastFinished.clear();
            this.hostsLastFinished = null;
        }
        if (this.processedSeedsRecords != null) {
            this.processedSeedsRecords.clear();
            this.processedSeedsRecords = null;
        }
        if (this.sourceHostDistribution != null) {
            this.sourceHostDistribution.clear();
            this.sourceHostDistribution = null;
        }
    }

    protected synchronized void progressStatisticsEvent(final EventObject e) {
        // This method loads "snapshot" data.
        discoveredUriCount = discoveredUriCount();
        downloadedUriCount = successfullyFetchedCount();
        finishedUriCount = finishedUriCount();
        queuedUriCount = queuedUriCount();
        downloadFailures = failedFetchAttempts();
        downloadDisregards = disregardedFetchAttempts();
        totalProcessedBytes = totalBytesWritten();
        congestionRatio = congestionRatio();
        deepestUri = deepestUri();
        averageDepth = averageDepth();

        if (finishedUriCount() == 0) {
            docsPerSecond = 0;
            totalKBPerSec = 0;
        } else if (getCrawlerTotalElapsedTime() < 1000) {
            return; // Not enough time has passed for a decent snapshot.
        } else {
            docsPerSecond = (double) downloadedUriCount /
                (double)(getCrawlerTotalElapsedTime() / 1000);
            // Round to nearest long.
            totalKBPerSec = (long)(((totalProcessedBytes / 1024) /
                 ((getCrawlerTotalElapsedTime()) / 1000)) + .5);
        }

        busyThreads = activeThreadCount();

        if (shouldrun ||
            (System.currentTimeMillis() - lastLogPointTime) >= 1000) {
            // If shouldrun is false there is a chance that the time interval
            // since last time is too small for a good sample. We only want
            // to update "current" data when the interval is long enough or
            // shouldrun is true.
            currentDocsPerSecond = 0;
            currentKBPerSec = 0;

            // Note time.
            long currentTime = System.currentTimeMillis();
            long sampleTime = currentTime - lastLogPointTime;

            // If we haven't done anything or there isn't a reasonable sample
            // size, give up.
            if (sampleTime >= 1000) {
                // Update docs/sec snapshot.
                long currentPageCount = successfullyFetchedCount();
                long samplePageCount = currentPageCount - lastPagesFetchedCount;

                currentDocsPerSecond =
                    (double) samplePageCount / (double)(sampleTime / 1000);

                lastPagesFetchedCount = currentPageCount;

                // Update kbytes/sec snapshot.
                long currentProcessedBytes = totalProcessedBytes;
                long sampleProcessedBytes =
                    currentProcessedBytes - lastProcessedBytesCount;

                currentKBPerSec =
                    (int)(((sampleProcessedBytes/1024)/(sampleTime/1000)) + .5);

                lastProcessedBytesCount = currentProcessedBytes;
            }
        }

        if (this.controller != null) {
            this.controller.logProgressStatistics(getProgressStatisticsLine());
        }
        lastLogPointTime = System.currentTimeMillis();
        super.progressStatisticsEvent(e);
    }
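
    // Worked example of the "current" rate arithmetic above, with
    // hypothetical numbers: if 3,072,000 bytes arrive during a 2,000 ms
    // sample, (3072000/1024) / (2000/1000) = 3000 / 2 = 1500, so
    // currentKBPerSec is 1500. Note the integer divisions: a sampleTime of
    // 2,999 ms still divides down to 2 whole seconds.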

    /**
     * Return one line of current progress-statistics.
     *
     * @param now Time to use for the line's timestamp.
     * @return String of stats
     */
    public String getProgressStatisticsLine(Date now) {
        return new PaddingStringBuffer()
            .append(ArchiveUtils.TIMESTAMP14ISO8601Z.format(now))
            .raAppend(32, discoveredUriCount)
            .raAppend(44, queuedUriCount)
            .raAppend(57, downloadedUriCount)
            .raAppend(74, ArchiveUtils.
                doubleToString(currentDocsPerSecond, 2) +
                "(" + ArchiveUtils.doubleToString(docsPerSecond, 2) + ")")
            .raAppend(85, currentKBPerSec + "(" + totalKBPerSec + ")")
            .raAppend(99, downloadFailures)
            .raAppend(113, busyThreads)
            .raAppend(126, (Runtime.getRuntime().totalMemory() -
                Runtime.getRuntime().freeMemory()) / 1024)
            .raAppend(140, Runtime.getRuntime().totalMemory() / 1024)
            .raAppend(153, ArchiveUtils.doubleToString(congestionRatio, 2))
            .raAppend(165, deepestUri)
            .raAppend(177, averageDepth)
            .toString();
    }

    public Map<String,Number> getProgressStatistics() {
        Map<String,Number> stats = new HashMap<String,Number>();
        stats.put("discoveredUriCount", new Long(discoveredUriCount));
        stats.put("queuedUriCount", new Long(queuedUriCount));
        stats.put("downloadedUriCount", new Long(downloadedUriCount));
        stats.put("currentDocsPerSecond", new Double(currentDocsPerSecond));
        stats.put("docsPerSecond", new Double(docsPerSecond));
        stats.put("totalKBPerSec", new Long(totalKBPerSec));
        stats.put("totalProcessedBytes", new Long(totalProcessedBytes));
        stats.put("currentKBPerSec", new Long(currentKBPerSec));
        stats.put("downloadFailures", new Long(downloadFailures));
        stats.put("busyThreads", new Integer(busyThreads));
        stats.put("congestionRatio", new Double(congestionRatio));
        stats.put("deepestUri", new Long(deepestUri));
        stats.put("averageDepth", new Long(averageDepth));
        stats.put("totalMemory", new Long(Runtime.getRuntime().totalMemory()));
        stats.put("freeMemory", new Long(Runtime.getRuntime().freeMemory()));
        return stats;
    }
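
    // Illustrative use only (hypothetical caller): values come back as the
    // boxed Number subtypes stored above, so unbox via the Number interface.
    //
    //   Map<String,Number> stats = tracker.getProgressStatistics();
    //   long queued = stats.get("queuedUriCount").longValue();
    //   double rate = stats.get("docsPerSecond").doubleValue();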

    /**
     * Return one line of current progress-statistics.
     *
     * @return String of stats
     */
    public String getProgressStatisticsLine() {
        return getProgressStatisticsLine(new Date());
    }

    public double processedDocsPerSec() {
        return docsPerSecond;
    }

    public double currentProcessedDocsPerSec() {
        return currentDocsPerSecond;
    }

    public long processedKBPerSec() {
        return totalKBPerSec;
    }

    public int currentProcessedKBPerSec() {
        return currentKBPerSec;
    }

    /** Returns a Hashtable that contains information about distributions of
     * encountered mime types. Key/value pairs represent
     * mime type -> count.
     * <p>
     * <b>Note:</b> All the values are wrapped with a
     * {@link LongWrapper LongWrapper}.
     * @return mimeTypeDistribution
     */
    public Hashtable<String,LongWrapper> getFileDistribution() {
        return mimeTypeDistribution;
    }

    /**
     * Increment a counter for a key in a given Map. Used for various
     * aggregate data.
     *
     * As this is used to change Maps which depend on StatisticsTracker
     * for their synchronization, this method should only be invoked
     * from a block synchronized on 'this'.
     *
     * @param map The Map
     * @param key The key for the counter to be incremented; if it does not
     * exist it will be added (set to 1). If null, the counter
     * "unknown" is incremented.
     */
    protected static void incrementMapCount(Map<String,LongWrapper> map,
            String key) {
        incrementMapCount(map, key, 1);
    }

    /**
     * Increment a counter for a key in a given Map by an arbitrary amount.
     * Used for various aggregate data. The increment amount can be negative.
     *
     * As this is used to change Maps which depend on StatisticsTracker
     * for their synchronization, this method should only be invoked
     * from a block synchronized on 'this'.
     *
     * @param map
     * The Map
     * @param key
     * The key for the counter to be incremented; if it does not exist
     * it will be added (set equal to <code>increment</code>).
     * If null, the counter "unknown" is incremented.
     * @param increment
     * The amount by which to increment the counter related to the
     * <code>key</code>.
     */
    protected static void incrementMapCount(Map<String,LongWrapper> map,
            String key, long increment) {
        if (key == null) {
            key = "unknown";
        }
        LongWrapper lw = (LongWrapper)map.get(key);
        if (lw == null) {
            map.put(key, new LongWrapper(increment));
        } else {
            lw.longValue += increment;
        }
    }
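
    // A minimal sketch of the counting pattern above (hypothetical map):
    //
    //   Map<String,LongWrapper> counts = new HashMap<String,LongWrapper>();
    //   incrementMapCount(counts, "text/html");     // "text/html" -> 1
    //   incrementMapCount(counts, "text/html", 4);  // "text/html" -> 5
    //   incrementMapCount(counts, null);            // "unknown"   -> 1
    //
    // LongWrapper is used instead of Long so a tally can be mutated in
    // place rather than re-boxed on every increment.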

    /**
     * Sort the entries of the given Map in descending order by their
     * values, which must be longs wrapped with <code>LongWrapper</code>.
     * <p>
     * Elements are sorted by value from largest to smallest. Equal values are
     * sorted in an arbitrary, but consistent manner by their keys. Only items
     * with identical value and key are considered equal.
     *
     * If the passed-in map requires access to be synchronized, the caller
     * should ensure this synchronization.
     *
     * @param mapOfLongWrapperValues
     * Assumes values are wrapped with LongWrapper.
     * @return a sorted map containing the same elements as the input map.
     */
    public TreeMap<String,LongWrapper> getReverseSortedCopy(
            final Map<String,LongWrapper> mapOfLongWrapperValues) {
        TreeMap<String,LongWrapper> sortedMap =
          new TreeMap<String,LongWrapper>(new Comparator<String>() {
            public int compare(String e1, String e2) {
                long firstVal = mapOfLongWrapperValues.get(e1).longValue;
                long secondVal = mapOfLongWrapperValues.get(e2).longValue;
                if (firstVal < secondVal) {
                    return 1;
                }
                if (secondVal < firstVal) {
                    return -1;
                }
                // If the values are the same, sort by keys.
                return e1.compareTo(e2);
            }
        });
        try {
            sortedMap.putAll(mapOfLongWrapperValues);
        } catch (UnsupportedOperationException e) {
            // OK. Try doing it the slow way then.
            for (Iterator<String> i =
                    mapOfLongWrapperValues.keySet().iterator(); i.hasNext();) {
                String key = i.next();
                sortedMap.put(key, mapOfLongWrapperValues.get(key));
            }
        }
        return sortedMap;
    }
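
    // Worked example (hypothetical input): a map {"a"=3, "b"=7, "c"=3}
    // comes back ordered b(7), a(3), c(3): largest value first, with ties
    // broken by key order.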

    /**
     * Return a Hashtable representing the distribution of status codes for
     * successfully fetched curis, as represented by a hashtable where key -&gt;
     * val represents (string)code -&gt; (integer)count.
     *
     * <b>Note: </b> All the values are wrapped with a
     * {@link LongWrapper LongWrapper}.
     *
     * @return statusCodeDistribution
     */
    public Hashtable<String,LongWrapper> getStatusCodeDistribution() {
        return statusCodeDistribution;
    }

    /**
     * Returns the time (in millisec) when a URI belonging to a given host was
     * last finished processing.
     *
     * @param host The host to look up time of last completed URI.
     * @return Returns the time (in millisec) when a URI belonging to a given
     * host was last finished processing. If no URI has been completed for host,
     * -1 will be returned.
     */
    public long getHostLastFinished(String host) {
        Long l = null;
        synchronized (hostsLastFinished) {
            l = (Long)hostsLastFinished.get(host);
        }
        return (l != null)? l.longValue(): -1;
    }

    /**
     * Returns the accumulated number of bytes downloaded from a given host.
     * @param host name of the host
     * @return the accumulated number of bytes downloaded from a given host
     */
    public long getBytesPerHost(String host) {
        synchronized (hostsBytes) {
            return ((LongWrapper)hostsBytes.get(host)).longValue;
        }
    }

    /**
     * Returns the accumulated number of bytes from files of a given file type.
     * @param filetype Filetype to check.
     * @return the accumulated number of bytes from files of a given mime type
     */
    public long getBytesPerFileType(String filetype) {
        return ((LongWrapper)mimeTypeBytes.get(filetype)).longValue;
    }

    /**
     * Get the total number of ToeThreads (sleeping and active).
     *
     * @return The total number of ToeThreads
     */
    public int threadCount() {
        return this.controller != null? controller.getToeCount(): 0;
    }

    /**
     * @return Current thread count (or zero if can't figure it out).
     */
    public int activeThreadCount() {
        return this.controller != null? controller.getActiveToeCount(): 0;
        // Note: reuse of the old busy value seemed misleading; anyone asking
        // for thread count when paused or stopped still wants an accurate
        // reading.
    }

    /**
     * This returns the number of completed URIs as a percentage of the total
     * number of URIs encountered (should be inverse to the discovery curve).
     *
     * @return The number of completed URIs as a percentage of the total
     * number of URIs encountered
     */
    public int percentOfDiscoveredUrisCompleted() {
        long completed = finishedUriCount();
        long total = discoveredUriCount();

        if (total == 0) {
            return 0;
        }

        return (int) (100 * completed / total);
    }

    /**
     * Number of <i>discovered</i> URIs.
     *
     * <p>If crawl not running (paused or stopped) this will return the value of
     * the last snapshot.
     *
     * @return A count of all URIs encountered
     *
     * @see org.archive.crawler.framework.Frontier#discoveredUriCount()
     */
    public long discoveredUriCount() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().discoveredUriCount() : discoveredUriCount;
    }

    /**
     * Number of URIs that have <i>finished</i> processing.
     *
     * @return Number of URIs that have finished processing
     *
     * @see org.archive.crawler.framework.Frontier#finishedUriCount()
     */
    public long finishedUriCount() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().finishedUriCount() : finishedUriCount;
    }

    /**
     * Get the total number of failed fetch attempts (connection failures ->
     * give up, etc.)
     *
     * @return The total number of failed fetch attempts
     */
    public long failedFetchAttempts() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().failedFetchCount() : downloadFailures;
    }

    /**
     * Get the total number of disregarded fetch attempts.
     *
     * @return The total number of disregarded fetch attempts
     */
    public long disregardedFetchAttempts() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().disregardedUriCount() : downloadDisregards;
    }

    public long successfullyFetchedCount() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().succeededFetchCount() : downloadedUriCount;
    }

    public long totalCount() {
        return queuedUriCount() + activeThreadCount() +
            successfullyFetchedCount();
    }

    /**
     * Ratio of number of threads that would theoretically allow
     * maximum crawl progress (if each was as productive as current
     * threads), to current number of threads.
     *
     * @return float congestion ratio
     */
    public float congestionRatio() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().congestionRatio() : congestionRatio;
    }

    /**
     * Ordinal position of the 'deepest' URI eligible
     * for crawling. Essentially, the length of the longest
     * frontier internal queue.
     *
     * @return long URI count to deepest URI
     */
    public long deepestUri() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().deepestUri() : deepestUri;
    }

    /**
     * Average depth of the last URI in all eligible queues.
     * That is, the average length of all eligible queues.
     *
     * @return long average depth of last URIs in queues
     */
    public long averageDepth() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().averageDepth() : averageDepth;
    }

    /**
     * Number of URIs <i>queued</i> up and waiting for processing.
     *
     * <p>If crawl not running (paused or stopped) this will return the value
     * of the last snapshot.
     *
     * @return Number of URIs queued up and waiting for processing.
     *
     * @see org.archive.crawler.framework.Frontier#queuedUriCount()
     */
    public long queuedUriCount() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().queuedUriCount() : queuedUriCount;
    }

    public long totalBytesWritten() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().totalBytesWritten() : totalProcessedBytes;
    }

    /**
     * If the curi is a seed, we update the processedSeeds table.
     *
     * @param curi The CrawlURI that may be a seed.
     * @param disposition The disposition of the CrawlURI.
     */
    private void handleSeed(CrawlURI curi, String disposition) {
        if (curi.isSeed()) {
            SeedRecord sr = new SeedRecord(curi, disposition);
            processedSeedsRecords.put(sr.getUri(), sr);
        }
    }

    public void crawledURISuccessful(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_SUCCESS);
        // Save status codes.
        incrementMapCount(statusCodeDistribution,
            Integer.toString(curi.getFetchStatus()));

        // Save mime types.
        String mime = MimetypeUtils.truncate(curi.getContentType());
        incrementMapCount(mimeTypeDistribution, mime);
        incrementMapCount(mimeTypeBytes, mime, curi.getContentSize());

        // Save hosts stats.
        saveHostStats((curi.getFetchStatus() == 1)? "dns:":
                this.controller.getServerCache().
                getHostFor(curi).getHostName(),
                curi.getContentSize());

        if (curi.containsKey(CrawlURI.A_SOURCE_TAG)) {
            saveSourceStats(curi.getString(CrawlURI.A_SOURCE_TAG),
                    this.controller.getServerCache().getHostFor(curi).
                    getHostName());
        }
    }

    protected void saveSourceStats(String source, String hostname) {
        synchronized (sourceHostDistribution) {
            HashMap<String,LongWrapper> hostUriCount =
                sourceHostDistribution.get(source);
            if (hostUriCount == null) {
                hostUriCount = new HashMap<String,LongWrapper>();
            }
            // TODO: Dan suggests we don't need a hashtable value. Might
            // be faster if we went without. Could just have keys of:
            // seed | host (concatenated as string)
            // and values of:
            // #urls
            incrementMapCount(hostUriCount, hostname);
            sourceHostDistribution.put(source, hostUriCount);
        }
    }

    protected void saveHostStats(String hostname, long size) {
        synchronized (hostsDistribution) {
            incrementMapCount(hostsDistribution, hostname);
        }
        synchronized (hostsBytes) {
            incrementMapCount(hostsBytes, hostname, size);
        }
        synchronized (hostsLastFinished) {
            hostsLastFinished.put(hostname,
                new Long(System.currentTimeMillis()));
        }
    }

    public void crawledURINeedRetry(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_RETRY);
    }

    public void crawledURIDisregard(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_DISREGARD);
    }

    public void crawledURIFailure(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_FAILURE);
    }

    /**
     * Get a seed iterator for the job being monitored.
     *
     * <b>Note:</b> This iterator will iterate over a list of <i>strings</i> not
     * UURIs like the Scope seed iterator. The strings are equal to the URIs'
     * getURIString() values.
     * @return the seed iterator
     * FIXME: Consider using TransformingIterator here
     */
    public Iterator<String> getSeeds() {
        List<String> seedsCopy = new Vector<String>();
        Iterator<UURI> i = controller.getScope().seedsIterator();
        while (i.hasNext()) {
            seedsCopy.add(i.next().toString());
        }
        return seedsCopy.iterator();
    }

    public Iterator getSeedRecordsSortedByStatusCode() {
        return getSeedRecordsSortedByStatusCode(getSeeds());
    }

    protected Iterator<SeedRecord> getSeedRecordsSortedByStatusCode(
            Iterator<String> i) {
        TreeSet<SeedRecord> sortedSet =
          new TreeSet<SeedRecord>(new Comparator<SeedRecord>() {
            public int compare(SeedRecord sr1, SeedRecord sr2) {
                int code1 = sr1.getStatusCode();
                int code2 = sr2.getStatusCode();
                if (code1 == code2) {
                    // If the values are equal, sort by URIs.
                    return sr1.getUri().compareTo(sr2.getUri());
                }
                // Mirror and shift the number line so as to
                // place zero at the beginning, then all negatives
                // in order of ascending absolute value, then all
                // positives descending.
                code1 = -code1 - Integer.MAX_VALUE;
                code2 = -code2 - Integer.MAX_VALUE;

                return new Integer(code1).compareTo(new Integer(code2));
            }
        });
        while (i.hasNext()) {
            String seed = i.next();
            SeedRecord sr = (SeedRecord) processedSeedsRecords.get(seed);
            if (sr == null) {
                sr = new SeedRecord(seed, SEED_DISPOSITION_NOT_PROCESSED);
                processedSeedsRecords.put(seed, sr);
            }
            sortedSet.add(sr);
        }
        return sortedSet.iterator();
    }
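
    // Worked example of the mirror-and-shift above, using hypothetical
    // status codes {200, 404, -61, 0}: they come back ordered
    // 0, -61, 404, 200. Zero sorts first, then negatives by ascending
    // absolute value, then positives from the largest code down (int
    // overflow in the subtraction wraps positive codes around to the
    // top of the range).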

    public void crawlEnded(String message) {
        logger.info("Entered crawlEnded");
        this.sExitMessage = message; // Held for reference by reports.
        super.crawlEnded(message);
        logger.info("Leaving crawlEnded");
    }

    /**
     * @param writer Where to write.
     */
    protected void writeSeedsReportTo(PrintWriter writer) {
        // Build header.
        writer.print("[code] [status] [seed] [redirect]\n");

        seedsCrawled = 0;
        seedsNotCrawled = 0;
        for (Iterator i = getSeedRecordsSortedByStatusCode(getSeeds());
                i.hasNext();) {
            SeedRecord sr = (SeedRecord)i.next();
            writer.print(sr.getStatusCode());
            writer.print(" ");
            if (sr.getStatusCode() > 0) {
                seedsCrawled++;
                writer.print("CRAWLED");
            } else {
                seedsNotCrawled++;
                writer.print("NOTCRAWLED");
            }
            writer.print(" ");
            writer.print(sr.getUri());
            if (sr.getRedirectUri() != null) {
                writer.print(" ");
                writer.print(sr.getRedirectUri());
            }
            writer.print("\n");
        }
    }
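
    // For illustration only, two hypothetical lines of the report written
    // above (code, CRAWLED/NOTCRAWLED, seed, optional redirect):
    //   200 CRAWLED http://www.example.com/
    //   -61 NOTCRAWLED http://unreachable.example.net/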

    protected void writeSourceReportTo(PrintWriter writer) {
        writer.print("[source] [host] [#urls]\n");
        // For each source...
        for (Iterator i = sourceHostDistribution.keySet().iterator();
                i.hasNext();) {
            Object sourceKey = i.next();
            Map<String,LongWrapper> hostCounts
             = (Map<String,LongWrapper>)sourceHostDistribution.get(sourceKey);
            // Sort hosts by #urls.
            SortedMap sortedHostCounts = getReverseSortedHostCounts(hostCounts);
            // For each host...
            for (Iterator j = sortedHostCounts.keySet().iterator();
                    j.hasNext();) {
                Object hostKey = j.next();
                LongWrapper hostCount = (LongWrapper) hostCounts.get(hostKey);
                writer.print(sourceKey.toString());
                writer.print(" ");
                writer.print(hostKey.toString());
                writer.print(" ");
                writer.print(hostCount.longValue);
                writer.print("\n");
            }
        }
    }

    /**
     * Return a copy of the given host counts in reverse-sorted
     * (largest first) order.
     *
     * @param hostCounts Map of host counts to sort.
     * @return SortedMap of host counts
     */
    public SortedMap getReverseSortedHostCounts(
            Map<String,LongWrapper> hostCounts) {
        synchronized (hostCounts) {
            return getReverseSortedCopy(hostCounts);
        }
    }

    protected void writeHostsReportTo(PrintWriter writer) {
        SortedMap hd = getReverseSortedHostsDistribution();
        // Header.
        writer.print("[#urls] [#bytes] [host]\n");
        for (Iterator i = hd.keySet().iterator(); i.hasNext();) {
            // Key is 'host'.
            Object key = i.next();
            if (hd.get(key) != null) {
                writer.print(((LongWrapper)hd.get(key)).longValue);
            } else {
                writer.print("-");
            }
            writer.print(" ");
            writer.print(getBytesPerHost((String)key));
            writer.print(" ");
            writer.print((String)key);
            writer.print("\n");
        }
    }

    /**
     * Return a copy of the hosts distribution in reverse-sorted
     * (largest first) order.
     * @return SortedMap of hosts distribution
     */
    public SortedMap getReverseSortedHostsDistribution() {
        synchronized (hostsDistribution) {
            return getReverseSortedCopy(hostsDistribution);
        }
    }

    protected void writeMimetypesReportTo(PrintWriter writer) {
        // Header.
        writer.print("[#urls] [#bytes] [mime-types]\n");
        TreeMap fd = getReverseSortedCopy(getFileDistribution());
        for (Iterator i = fd.keySet().iterator(); i.hasNext();) {
            Object key = i.next();
            // Key is mime type.
            writer.print(Long.toString(((LongWrapper)fd.get(key)).longValue));
            writer.print(" ");
            writer.print(Long.toString(getBytesPerFileType((String)key)));
            writer.print(" ");
            writer.print((String)key);
            writer.print("\n");
        }
    }

    protected void writeResponseCodeReportTo(PrintWriter writer) {
        // Build header.
        writer.print("[rescode] [#urls]\n");
        TreeMap scd = getReverseSortedCopy(getStatusCodeDistribution());
        for (Iterator i = scd.keySet().iterator(); i.hasNext();) {
            Object key = i.next();
            writer.print((String)key);
            writer.print(" ");
            writer.print(Long.toString(((LongWrapper)scd.get(key)).longValue));
            writer.print("\n");
        }
    }

    protected void writeCrawlReportTo(PrintWriter writer) {
        writer.print("Crawl Name: " + controller.getOrder().getCrawlOrderName());
        writer.print("\nCrawl Status: " + sExitMessage);
        writer.print("\nDuration Time: " +
                ArchiveUtils.formatMillisecondsToConventional(crawlDuration()));
        writer.print("\nTotal Seeds Crawled: " + seedsCrawled);
        writer.print("\nTotal Seeds not Crawled: " + seedsNotCrawled);
        // hostsDistribution contains all hosts crawled plus an entry for dns.
        writer.print("\nTotal Hosts Crawled: " + (hostsDistribution.size() - 1));
        writer.print("\nTotal Documents Crawled: " + finishedUriCount);
        writer.print("\nProcessed docs/sec: " +
                ArchiveUtils.doubleToString(docsPerSecond, 2));
        writer.print("\nBandwidth in Kbytes/sec: " + totalKBPerSec);
        writer.print("\nTotal Raw Data Size in Bytes: " + totalProcessedBytes +
                " (" + ArchiveUtils.formatBytesForDisplay(totalProcessedBytes) +
                ") \n");
    }

    protected void writeProcessorsReportTo(PrintWriter writer) {
        controller.reportTo(CrawlController.PROCESSORS_REPORT, writer);
    }

    protected void writeReportFile(String reportName, String filename) {
        File f = new File(controller.getDisk().getPath(), filename);
        try {
            PrintWriter bw = new PrintWriter(new FileWriter(f));
            writeReportTo(reportName, bw);
            bw.close();
            controller.addToManifest(f.getAbsolutePath(),
                CrawlController.MANIFEST_REPORT_FILE, true);
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Unable to write " + f.getAbsolutePath() +
                " at the end of crawl.", e);
        }
        logger.info("wrote report: " + f.getAbsolutePath());
    }

    /**
     * @param writer Where to write.
     */
    protected void writeManifestReportTo(PrintWriter writer) {
        controller.reportTo(CrawlController.MANIFEST_REPORT, writer);
    }

    /**
     * @param reportName Name of report.
     * @param w Where to write.
     */
    private void writeReportTo(String reportName, PrintWriter w) {
        if ("hosts".equals(reportName)) {
            writeHostsReportTo(w);
        } else if ("mime types".equals(reportName)) {
            writeMimetypesReportTo(w);
        } else if ("response codes".equals(reportName)) {
            writeResponseCodeReportTo(w);
        } else if ("seeds".equals(reportName)) {
            writeSeedsReportTo(w);
        } else if ("crawl".equals(reportName)) {
            writeCrawlReportTo(w);
        } else if ("processors".equals(reportName)) {
            writeProcessorsReportTo(w);
        } else if ("manifest".equals(reportName)) {
            writeManifestReportTo(w);
        } else if ("frontier".equals(reportName)) {
            writeFrontierReportTo(w);
        } else if ("source".equals(reportName)) {
            writeSourceReportTo(w);
        } // TODO: else default/error
    }

    /**
     * Write the Frontier's 'nonempty' report (if available).
     * @param writer to report to
     */
    protected void writeFrontierReportTo(PrintWriter writer) {
        if (controller.getFrontier().isEmpty()) {
            writer.println("frontier empty");
        } else {
            controller.getFrontier().reportTo("nonempty", writer);
        }
    }

    /**
     * Run the reports.
     */
    public void dumpReports() {
        // Add all files mentioned in the crawl order to the
        // manifest set.
        controller.addOrderToManifest();
        writeReportFile("hosts", "hosts-report.txt");
        writeReportFile("mime types", "mimetype-report.txt");
        writeReportFile("response codes", "responsecode-report.txt");
        writeReportFile("seeds", "seeds-report.txt");
        writeReportFile("crawl", "crawl-report.txt");
        writeReportFile("processors", "processors-report.txt");
        writeReportFile("manifest", "crawl-manifest.txt");
        writeReportFile("frontier", "frontier-report.txt");
        if (!sourceHostDistribution.isEmpty()) {
            writeReportFile("source", "source-report.txt");
        }
        // TODO: Save object to disk?
    }

    public void crawlCheckpoint(File cpDir) throws Exception {
        // CrawlController is managing the checkpointing of this object.
        logNote("CRAWL CHECKPOINTING TO " + cpDir.toString());
    }
}