KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > admin > StatisticsSummary


1 /* StatisticsSummary
2  *
3  * $Id: StatisticsSummary.java,v 1.2.4.1 2007/01/13 01:31:07 stack-sf Exp $$
4  *
5  * Created on July 27, 2006
6  *
7  * Copyright (C) 2006 Internet Archive.
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  */

25 package org.archive.crawler.admin;
26
27 import java.io.File JavaDoc;
28 import java.io.FileReader JavaDoc;
29 import java.io.BufferedReader JavaDoc;
30 import java.io.IOException JavaDoc;
31 import java.util.Comparator JavaDoc;
32 import java.util.Hashtable JavaDoc;
33 import java.util.Iterator JavaDoc;
34 import java.util.Map JavaDoc;
35 import java.util.SortedMap JavaDoc;
36 import java.util.TreeMap JavaDoc;
37 import java.util.TreeSet JavaDoc;
38 import java.util.logging.Level JavaDoc;
39 import java.util.logging.Logger JavaDoc;
40
41 import org.archive.util.LongWrapper;
42
43
44 /**
45  * This class provides descriptive statistics of a finished crawl job by
46  * using the crawl report files generated by StatisticsTracker. Any formatting
47  * changes to the way StatisticsTracker writes to the summary crawl reports will
48  * require changes to this class.
49  * <p>
50  * The following statistics are accessible from this class:
51  * <ul>
52  * <li> Successfully downloaded documents per fetch status code
53  * <li> Successfully downloaded documents per document mime type
54  * <li> Amount of data per mime type
55  * <li> Successfully downloaded documents per host
56  * <li> Amount of data per host
57  * <li> Successfully downloaded documents per top-level domain name (TLD)
58  * <li> Disposition of all seeds
59  * <li> Successfully downloaded documents per host per source
60  * </ul>
61  *
62  * <p>TODO: Make it so summarizing is not done all in RAM so we avoid
63  * OOME.
64  *
65  * @author Frank McCown
66  *
67  * @see org.archive.crawler.admin.StatisticsTracker
68  */

69 public class StatisticsSummary {
70     /**
71      * Messages from the StatisticsSummary.
72      */

73     private final static Logger JavaDoc logger =
74         Logger.getLogger(StatisticsSummary.class.getName());
75     
76     private boolean stats = true;
77     
78     /** Crawl job whose summary we want to view */
79     private CrawlJob cjob;
80         
81     protected long totalDnsStatusCodeDocuments = 0;
82     protected long totalStatusCodeDocuments = 0;
83     protected long totalFileTypeDocuments = 0;
84     protected long totalMimeTypeDocuments = 0;
85     protected long totalDnsMimeTypeDocuments = 0;
86     protected long totalDnsHostDocuments = 0;
87     protected long totalHostDocuments = 0;
88     protected long totalMimeSize = 0;
89     protected long totalDnsMimeSize = 0;
90     protected long totalHostSize = 0;
91     protected long totalDnsHostSize = 0;
92     protected long totalTldDocuments = 0;
93     protected long totalTldSize = 0;
94     protected long totalHosts = 0;
95     
96     protected String JavaDoc durationTime;
97     protected String JavaDoc processedDocsPerSec;
98     protected String JavaDoc bandwidthKbytesPerSec;
99     protected String JavaDoc totalDataWritten;
100     
101     /** Keep track of the file types we see (mime type -> count) */
102     protected Hashtable JavaDoc<String JavaDoc,LongWrapper> mimeTypeDistribution = new Hashtable JavaDoc<String JavaDoc,LongWrapper>();
103     protected Hashtable JavaDoc<String JavaDoc,LongWrapper> mimeTypeBytes = new Hashtable JavaDoc<String JavaDoc,LongWrapper>();
104     protected Hashtable JavaDoc<String JavaDoc,LongWrapper> mimeTypeDnsDistribution = new Hashtable JavaDoc<String JavaDoc,LongWrapper>();
105     protected Hashtable JavaDoc<String JavaDoc,LongWrapper> mimeTypeDnsBytes = new Hashtable JavaDoc<String JavaDoc,LongWrapper>();
106     
107     /** Keep track of status codes */
108     protected Hashtable JavaDoc<String JavaDoc,LongWrapper> statusCodeDistribution = new Hashtable JavaDoc<String JavaDoc,LongWrapper>();
109     protected Hashtable JavaDoc<String JavaDoc,LongWrapper> dnsStatusCodeDistribution
110      = new Hashtable JavaDoc<String JavaDoc,LongWrapper>();
111     
112     /** Keep track of hosts */
113     protected Hashtable JavaDoc<String JavaDoc,LongWrapper> hostsDistribution = new Hashtable JavaDoc<String JavaDoc,LongWrapper>();
114     protected Hashtable JavaDoc<String JavaDoc,LongWrapper> hostsBytes = new Hashtable JavaDoc<String JavaDoc,LongWrapper>();
115     protected Hashtable JavaDoc<String JavaDoc,LongWrapper> hostsDnsDistribution = new Hashtable JavaDoc<String JavaDoc,LongWrapper>();
116     protected Hashtable JavaDoc<String JavaDoc,LongWrapper> hostsDnsBytes = new Hashtable JavaDoc<String JavaDoc,LongWrapper>();
117
118     /** Keep track of TLDs */
119     protected Hashtable JavaDoc<String JavaDoc,LongWrapper> tldDistribution = new Hashtable JavaDoc<String JavaDoc,LongWrapper>();
120     protected Hashtable JavaDoc<String JavaDoc,LongWrapper> tldBytes = new Hashtable JavaDoc<String JavaDoc,LongWrapper>();
121     protected Hashtable JavaDoc<String JavaDoc,LongWrapper> tldHostDistribution = new Hashtable JavaDoc<String JavaDoc,LongWrapper>();
122
123     /** Keep track of processed seeds */
124     protected transient Map JavaDoc<String JavaDoc,SeedRecord> processedSeedsRecords
125      = new Hashtable JavaDoc<String JavaDoc,SeedRecord>();
126
127     /**
128      * Constructor
129      *
130      * @param cjob
131      * Completed crawl job
132      */

133     public StatisticsSummary(CrawlJob cjob) {
134         this.cjob = cjob;
135         
136         // Read all stats for this crawl job
137
this.stats = calculateStatusCodeDistribution();
138         if (calculateMimeTypeDistribution()) {
139             this.stats = true;
140         }
141         if (calculateHostsDistribution()) {
142             this.stats = true;
143         }
144         if (readCrawlReport()) {
145             this.stats = true;
146         }
147         if (readSeedReport()) {
148             this.stats = true;
149         }
150     }
151     
152     
153     /**
154      * Increment a counter for a key in a given HashMap. Used for various
155      * aggregate data.
156      *
157      * @param map The HashMap
158      * @param key The key for the counter to be incremented, if it does not
159      * exist it will be added (set to 1). If null it will
160      * increment the counter "unknown".
161      */

162     protected static void incrementMapCount(Map JavaDoc<String JavaDoc,LongWrapper> map,
163             String JavaDoc key) {
164         incrementMapCount(map,key,1);
165     }
166
167     /**
168      * Increment a counter for a key in a given HashMap by an arbitrary amount.
169      * Used for various aggregate data. The increment amount can be negative.
170      *
171      * @param map
172      * The HashMap
173      * @param key
174      * The key for the counter to be incremented, if it does not
175      * exist it will be added (set to equal to
176      * <code>increment</code>).
177      * If null it will increment the counter "unknown".
178      * @param increment
179      * The amount to increment counter related to the
180      * <code>key</code>.
181      */

182     protected static void incrementMapCount(Map JavaDoc<String JavaDoc,LongWrapper> map,
183             String JavaDoc key, long increment) {
184         if (key == null) {
185             key = "unknown";
186         }
187         LongWrapper lw = map.get(key);
188         if(lw == null) {
189             map.put(key, new LongWrapper(increment));
190         } else {
191             lw.longValue += increment;
192         }
193     }
194   
195     /** Returns a HashMap that contains information about distributions of
196      * encountered mime types. Key/value pairs represent
197      * mime type -> count.
198      * <p>
199      * <b>Note:</b> All the values are wrapped with a
200      * {@link LongWrapper LongWrapper}
201      * @return mimeTypeDistribution
202      */

203     public Hashtable JavaDoc getMimeDistribution() {
204         return mimeTypeDistribution;
205     }
206     
207     public long getTotalMimeTypeDocuments() {
208         return totalMimeTypeDocuments;
209     }
210     
211     public long getTotalDnsMimeTypeDocuments() {
212         return totalDnsMimeTypeDocuments;
213     }
214     
215     public long getTotalMimeSize() {
216         return totalMimeSize;
217     }
218     
219     public long getTotalDnsMimeSize() {
220         return totalDnsMimeSize;
221     }
222    
223     /**
224      * Return a HashMap representing the distribution of HTTP status codes for
225      * successfully fetched curis, as represented by a hashmap where key -&gt;
226      * val represents (string)code -&gt; (integer)count.
227      *
228      * <b>Note: </b> All the values are wrapped with a
229      * {@link LongWrapper LongWrapper}
230      *
231      * @return statusCodeDistribution
232      */

233     public Hashtable JavaDoc getStatusCodeDistribution() {
234         return statusCodeDistribution;
235     }
236    
237     /**
238      * Return a HashMap representing the distribution of DNS status codes for
239      * successfully fetched curis, as represented by a hashmap where key -&gt;
240      * val represents (string)code -&gt; (integer)count.
241      *
242      * <b>Note: </b> All the values are wrapped with a
243      * {@link LongWrapper LongWrapper}
244      *
245      * @return dnsStatusCodeDistribution
246      */

247     public Hashtable JavaDoc getDnsStatusCodeDistribution() {
248         return dnsStatusCodeDistribution;
249     }
250     
251     public Hashtable JavaDoc getDnsMimeDistribution() {
252         return mimeTypeDnsDistribution;
253     }
254
255     public long getTotalDnsStatusCodeDocuments() {
256         return totalDnsStatusCodeDocuments;
257     }
258     
259     public long getTotalStatusCodeDocuments() {
260         return totalStatusCodeDocuments;
261     }
262     
263     public long getTotalHostDocuments() {
264         return totalHostDocuments;
265     }
266     
267     public long getTotalDnsHostDocuments() {
268         return totalDnsHostDocuments;
269     }
270     
271     public Hashtable JavaDoc getHostsDnsDistribution() {
272         return hostsDnsDistribution;
273     }
274     
275     public long getTotalHostDnsDocuments() {
276         return totalDnsHostDocuments;
277     }
278     
279     public long getTotalHostSize() {
280         return totalHostSize;
281     }
282     
283     public long getTotalDnsHostSize() {
284         return totalDnsHostSize;
285     }
286     
287     public Hashtable JavaDoc getTldDistribution() {
288         return tldDistribution;
289     }
290     
291     public Hashtable JavaDoc getTldBytes() {
292         return tldBytes;
293     }
294     
295     public long getTotalTldDocuments() {
296         return totalTldDocuments;
297     }
298     
299     public long getTotalTldSize() {
300         return totalTldSize;
301     }
302     
303     public Hashtable JavaDoc getTldHostDistribution() {
304         return tldHostDistribution;
305     }
306     
307     public long getTotalHosts() {
308         return totalHosts;
309     }
310     
311     public String JavaDoc getDurationTime() {
312         return durationTime;
313     }
314     
315     public String JavaDoc getProcessedDocsPerSec() {
316         return processedDocsPerSec;
317     }
318     
319     public String JavaDoc getBandwidthKbytesPerSec() {
320         return bandwidthKbytesPerSec;
321     }
322     
323     public String JavaDoc getTotalDataWritten() {
324         return totalDataWritten;
325     }
326
327     /**
328      * Sort the entries of the given HashMap in descending order by their
329      * values, which must be longs wrapped with <code>LongWrapper</code>.
330      * <p>
331      * Elements are sorted by value from largest to smallest. Equal values are
332      * sorted in an arbitrary, but consistent manner by their keys. Only items
333      * with identical value and key are considered equal.
334      *
335      * If the passed-in map requires access to be synchronized, the caller
336      * should ensure this synchronization.
337      *
338      * @param mapOfLongWrapperValues
339      * Assumes values are wrapped with LongWrapper.
340      * @return a sorted set containing the same elements as the map.
341      */

342     public TreeMap JavaDoc<String JavaDoc,LongWrapper> getReverseSortedCopy(
343             final Map JavaDoc<String JavaDoc,LongWrapper> mapOfLongWrapperValues) {
344         TreeMap JavaDoc<String JavaDoc,LongWrapper> sortedMap = new TreeMap JavaDoc<String JavaDoc,LongWrapper>(
345           new Comparator JavaDoc<String JavaDoc>() {
346             public int compare(String JavaDoc e1, String JavaDoc e2) {
347                 long firstVal = mapOfLongWrapperValues.get(e1).longValue;
348                 long secondVal = mapOfLongWrapperValues.get(e2).longValue;
349                 if (firstVal < secondVal) {
350                     return 1;
351                 }
352                 if (secondVal < firstVal) {
353                     return -1;
354                 }
355                 // If the values are the same, sort by keys.
356
return e1.compareTo(e2);
357             }
358         });
359         try {
360             sortedMap.putAll(mapOfLongWrapperValues);
361         } catch (UnsupportedOperationException JavaDoc e) {
362             for (String JavaDoc key: mapOfLongWrapperValues.keySet()) {
363                 sortedMap.put(key, mapOfLongWrapperValues.get(key));
364             }
365         }
366         return sortedMap;
367     }
368      
369     /**
370      * Get the number of hosts with a particular TLD.
371      * @param tld
372      * top-level domain name
373      * @return Total crawled hosts
374      */

375     public long getHostsPerTld(String JavaDoc tld) {
376         LongWrapper lw = (LongWrapper)tldHostDistribution.get(tld);
377         return (lw == null ? 0 : lw.longValue);
378     }
379     
380     /**
381      * Read status code distribution from responsecode-report.txt.
382      * DNS and HTTP status codes are separated when read.
383      * @return True if we found some stats.
384      */

385     private boolean calculateStatusCodeDistribution() {
386         // Read from responsecode-report.txt
387
File JavaDoc f = new File JavaDoc(cjob.getDirectory(), "responsecode-report.txt");
388         if (!f.exists()) {
389             return false;
390         }
391         BufferedReader JavaDoc br = null;
392         try {
393             FileReader JavaDoc reader = new FileReader JavaDoc(f);
394             br = new BufferedReader JavaDoc(reader);
395             String JavaDoc line = br.readLine(); // Ignore heading
396
line = br.readLine();
397             while (line != null) {
398               // Get status code and # urls which are seperated by a space
399

400               String JavaDoc[] items = line.split(" ");
401               if (items.length < 2) {
402                   logger.log(Level.WARNING,
403                           "Unexpected formatting on line [" + line + "]");
404               }
405               else {
406                   // See if DNS or HTTP status code
407
if (items[0].length() < 3) {
408                       // DNS status code
409
long total = Long.parseLong(items[1]);
410                       dnsStatusCodeDistribution.put(items[0],
411                               new LongWrapper(total));
412                       totalDnsStatusCodeDocuments += total;
413                   }
414                   else {
415                       // HTTP status code
416
long total = Long.parseLong(items[1]);
417                       statusCodeDistribution.put(items[0],
418                               new LongWrapper(total));
419                       totalStatusCodeDocuments += total;
420                   }
421               }
422               line = br.readLine();
423             }
424         } catch (IOException JavaDoc e) {
425             logger.log(Level.SEVERE, "Unable to read " + f.getAbsolutePath(),
426                 e);
427         } finally {
428             if (br != null) {
429                 try {
430                     br.close();
431                 } catch (IOException JavaDoc e) {
432                     logger.log(Level.SEVERE,
433                         "Closing " + f.getAbsolutePath(), e);
434                 }
435             }
436         }
437         return true;
438     }
439     
440     /**
441      * Read MIME type data from mimetype-report.txt.
442      * MIME type of text/dns is separated from other MIME types.
443      * @return True if we found some stats.
444      */

445     private boolean calculateMimeTypeDistribution() {
446         File JavaDoc f = new File JavaDoc(cjob.getDirectory(), "mimetype-report.txt");
447         if (!f.exists()) {
448             return false;
449         }
450         BufferedReader JavaDoc br = null;
451         try {
452             FileReader JavaDoc reader = new FileReader JavaDoc(f);
453             br = new BufferedReader JavaDoc(reader);
454             String JavaDoc line = br.readLine(); // Ignore heading
455
line = br.readLine();
456             while (line != null) {
457                 // Get num urls, num bytes, and MIME type (seperated by a space)
458
// Example: 12 134279 text/html
459

460                 String JavaDoc[] items = line.split(" ");
461                 if (items.length < 3) {
462                     logger.log(Level.WARNING,
463                             "Unexpected formatting on line [" + line + "]");
464                 }
465                 else {
466                     long total = Long.parseLong(items[0]);
467                     long bytes = Long.parseLong(items[1]);
468                     String JavaDoc mime = items[2];
469
470                     // Seperate DNS reconrds from HTTP
471
if (mime.equalsIgnoreCase("text/dns")) {
472                         mimeTypeDnsDistribution.put(mime,
473                                 new LongWrapper(total));
474                         mimeTypeDnsBytes.put(mime, new LongWrapper(bytes));
475                         totalDnsMimeTypeDocuments += total;
476                         totalDnsMimeSize += bytes;
477                     }
478                     else {
479                         mimeTypeDistribution.put(mime, new LongWrapper(total));
480                         mimeTypeBytes.put(mime, new LongWrapper(bytes));
481                         totalMimeTypeDocuments += total;
482                         totalMimeSize += bytes;
483                     }
484                 }
485                 line = br.readLine();
486             }
487         } catch (IOException JavaDoc e) {
488             logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
489         } finally {
490             if (br != null) {
491                 try {
492                     br.close();
493                 } catch (IOException JavaDoc e) {
494                     logger.log(Level.SEVERE,
495                         "Closing " + f.getAbsolutePath(), e);
496                 }
497             }
498         }
499         return true;
500     }
501     
502     /**
503      * Read number of URLs and total bytes for each host name from
504      * hosts-report.txt.
505      * Host name of "dns:" is separated from others.
506      * @return true if stats found.
507      */

508     private boolean calculateHostsDistribution() {
509         File JavaDoc f = new File JavaDoc(cjob.getDirectory(), "hosts-report.txt");
510         if (!f.exists()) {
511             return false;
512         }
513         BufferedReader JavaDoc br = null;
514         try {
515             FileReader JavaDoc reader = new FileReader JavaDoc(f);
516             br = new BufferedReader JavaDoc(reader);
517             String JavaDoc line = br.readLine(); // Ignore heading
518
line = br.readLine();
519             while (line != null) {
520                 // Get num urls, num bytes, and host name (seperated by a space)
521
// Example: 9 7468 www.blogger.com
522

523                 String JavaDoc[] items = line.split(" ");
524                 if (items.length < 3) {
525                     logger.log(Level.WARNING,
526                             "Unexpected formatting on line [" + line + "]");
527                 }
528                 else {
529                     long total = Long.parseLong(items[0]);
530                     long bytes = Long.parseLong(items[1]);
531                     String JavaDoc host = items[2];
532
533                     // Seperate DNS reconrds from HTTP
534
if (host.startsWith("dns:", 0)) {
535                         hostsDnsDistribution.put(host, new LongWrapper(total));
536                         hostsDnsBytes.put(host, new LongWrapper(bytes));
537                         totalDnsHostDocuments += total;
538                         totalDnsHostSize += bytes;
539                     }
540                     else {
541                         hostsDistribution.put(host, new LongWrapper(total));
542                         hostsBytes.put(host, new LongWrapper(bytes));
543                         totalHostDocuments += total;
544                         totalHostSize += bytes;
545
546                         // Count top level domain (TLD)
547
String JavaDoc tld = host.substring(host.lastIndexOf('.')+1);
548                         incrementMapCount(tldDistribution, tld, total);
549                         incrementMapCount(tldBytes, tld, bytes);
550                         incrementMapCount(tldHostDistribution, tld);
551                         totalTldDocuments += total;
552                         totalTldSize += bytes;
553
554                         totalHosts++;
555                     }
556                 }
557                 line = br.readLine();
558             }
559         } catch (IOException JavaDoc e) {
560             logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
561         } finally {
562             if (br != null) {
563                 try {
564                     br.close();
565                 } catch (IOException JavaDoc e) {
566                     logger.log(Level.SEVERE,
567                         "Closing " + f.getAbsolutePath(), e);
568                 }
569             }
570         }
571         return true;
572     }
573
574     /**
575      * Returns the accumulated number of bytes downloaded from a given host.
576      * @param host name of the host
577      * @return the accumulated number of bytes downloaded from a given host
578      */

579     public long getBytesPerHost(String JavaDoc host) {
580         long bytes = -1;
581         
582         bytes = host != null && host.startsWith("dns:", 0) ?
583             ((LongWrapper)hostsDnsBytes.get(host)).longValue :
584             ((LongWrapper)hostsBytes.get(host)).longValue;
585         
586         return bytes;
587     }
588     
589     /**
590      * Returns the total number of bytes downloaded for a given TLD.
591      * @param tld TLD
592      * @return the total number of bytes downloaded for a given TLD
593      */

594     public long getBytesPerTld(String JavaDoc tld) {
595         LongWrapper lw = (LongWrapper)tldBytes.get(tld);
596         return (lw == null ? 0 : lw.longValue);
597     }
598
599     /**
600      * Returns the accumulated number of bytes from files of a given file type.
601      * @param filetype Filetype to check.
602      * @return the accumulated number of bytes from files of a given mime type
603      */

604     public long getBytesPerMimeType(String JavaDoc filetype) {
605         long bytes = -1;
606         
607         if (filetype != null) {
608             if (filetype.equals("text/dns")) {
609                 bytes = mimeTypeDnsBytes.get(filetype) == null ? 0 :
610                     ((LongWrapper)mimeTypeDnsBytes.get(filetype)).longValue;
611             }
612             else {
613                 bytes = mimeTypeBytes.get(filetype) == null ? 0 :
614                     ((LongWrapper)mimeTypeBytes.get(filetype)).longValue;
615             }
616         }
617         return bytes;
618     }
619     
620     /**
621      * Reads duration time, processed docs/sec, bandwidth, and total size
622      * of crawl from crawl-report.txt.
623      * @return true if stats found.
624      */

625     public boolean readCrawlReport() {
626         File JavaDoc f = new File JavaDoc(cjob.getDirectory(), "crawl-report.txt");
627         if (!f.exists()) {
628             return false;
629         }
630         BufferedReader JavaDoc br = null;
631         try {
632             FileReader JavaDoc reader = new FileReader JavaDoc(f);
633             br = new BufferedReader JavaDoc(reader);
634             String JavaDoc line = br.readLine();
635             while (line != null) {
636                 if (line.startsWith("Duration Time")) {
637                     durationTime = line.substring(line.indexOf(':')+1);
638                 }
639                 else if (line.startsWith("Processed docs/sec")) {
640                     processedDocsPerSec = line.substring(line.indexOf(':')+1);
641                 }
642                 else if (line.startsWith("Bandwidth in Kbytes/sec")) {
643                     bandwidthKbytesPerSec = line.substring(line.indexOf(':')+1);
644                 }
645                 else if (line.startsWith("Total Raw Data Size in Bytes")) {
646                     totalDataWritten = line.substring(line.indexOf(':')+1);
647                 }
648
649                 line = br.readLine();
650             }
651         }
652         catch (IOException JavaDoc e) {
653             logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
654         } finally {
655             if (br != null) {
656                 try {
657                     br.close();
658                 } catch (IOException JavaDoc e) {
659                     logger.log(Level.SEVERE,
660                         "Failed close of " + f.getAbsolutePath(), e);
661                 }
662             }
663         }
664         return true;
665     }
666   
667     /**
668      * Returns sorted Iterator of seeds records based on status code.
669      * @return sorted Iterator of seeds records
670      */

671     public Iterator JavaDoc<SeedRecord> getSeedRecordsSortedByStatusCode() {
672         TreeSet JavaDoc<SeedRecord> sortedSet = new TreeSet JavaDoc<SeedRecord>(
673           new Comparator JavaDoc<SeedRecord>() {
674             public int compare(SeedRecord sr1, SeedRecord sr2) {
675                 int code1 = sr1.getStatusCode();
676                 int code2 = sr2.getStatusCode();
677                 if (code1 == code2) {
678                     // If the values are equal, sort by URIs.
679
return sr1.getUri().compareTo(sr2.getUri());
680                 }
681                 // mirror and shift the nubmer line so as to
682
// place zero at the beginning, then all negatives
683
// in order of ascending absolute value, then all
684
// positives descending
685
code1 = -code1 - Integer.MAX_VALUE;
686                 code2 = -code2 - Integer.MAX_VALUE;
687                 
688                 return new Integer JavaDoc(code1).compareTo(new Integer JavaDoc(code2));
689             }
690         });
691         for (SeedRecord sr: processedSeedsRecords.values()) {
692             sortedSet.add(sr);
693         }
694         
695         return sortedSet.iterator();
696     }
697     
698     /**
699      * Reads seed data from seeds-report.txt.
700      * @return True if stats found.
701      */

702     private boolean readSeedReport() {
703         File JavaDoc f = new File JavaDoc(cjob.getDirectory(), "seeds-report.txt");
704         if (!f.exists()) {
705             return false;
706         }
707         BufferedReader JavaDoc br = null;
708         try {
709             FileReader JavaDoc reader = new FileReader JavaDoc(f);
710             br = new BufferedReader JavaDoc(reader);
711             
712             // Ignore heading: [code] [status] [seed] [redirect]
713
String JavaDoc line = br.readLine();
714             line = br.readLine();
715             while (line != null) {
716                 // Example lines:
717
// 302 CRAWLED http://www.ashlandcitytimes.com/ http://www.ashlandcitytimes.com/apps/pbcs.dll/section?Category=MTCN01
718
// 200 CRAWLED http://noleeo.com/
719

720                 String JavaDoc[] items = line.split(" ");
721
722                 if (items.length < 3) {
723                     logger.log(Level.WARNING,
724                             "Unexpected formatting on line [" + line + "]");
725                 }
726                 else {
727                     String JavaDoc statusCode = items[0];
728                     String JavaDoc crawlStatus = items[1];
729                     String JavaDoc seed = items[2];
730                     String JavaDoc redirect = items.length > 3 ? items[3] : null;
731
732                     // All values should be CRAWLED or NOTCRAWLED
733
if (crawlStatus.equals("CRAWLED")) {
734                         crawlStatus =org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_SUCCESS;
735                     }
736                     else {
737                         crawlStatus = org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_FAILURE;
738                     }
739                     SeedRecord sr = new SeedRecord(seed, crawlStatus,
740                             Integer.parseInt(statusCode), redirect);
741                     processedSeedsRecords.put(seed, sr);
742                 }
743
744                 line = br.readLine();
745             }
746         } catch (IOException JavaDoc e) {
747             logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
748         } finally {
749             if (br != null) {
750                 try {
751                     br.close();
752                 } catch (IOException JavaDoc e) {
753                     logger.log(Level.SEVERE,
754                         "Closing " + f.getAbsolutePath(), e);
755                 }
756             }
757         }
758         return true;
759     }
760         
761     /**
762      * Return a copy of the hosts distribution in reverse-sorted
763      * (largest first) order.
764      *
765      * @return SortedMap of hosts distribution
766      */

767     public SortedMap JavaDoc getReverseSortedHostsDistribution() {
768         return getReverseSortedCopy(hostsDistribution);
769     }
770     
771     /**
772      * @return True if we compiled stats, false if none to compile (e.g.
773      * there are no reports files on disk).
774      */

775     public boolean isStats() {
776         return this.stats;
777     }
778 }
779
Popular Tags