package org.archive.crawler.admin;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.Comparator;
import java.util.Date;
import java.util.EventObject;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.event.CrawlURIDispositionListener;
import org.archive.crawler.framework.AbstractTracker;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.net.UURI;
import org.archive.util.ArchiveUtils;
import org.archive.util.LongWrapper;
import org.archive.util.MimetypeUtils;
import org.archive.util.PaddingStringBuffer;

/**
 * Statistics tracker for a crawl: samples progress figures on each
 * progress-statistics event, accumulates per-host / per-mimetype /
 * per-status-code distributions as URIs are dispositioned, and writes the
 * end-of-crawl report files (hosts, mimetypes, response codes, seeds, etc.).
 *
 * Snapshot fields (discoveredUriCount, queuedUriCount, ...) cache the last
 * sampled frontier values so they remain readable after the crawl stops;
 * the corresponding accessor methods fall back to these caches when the
 * controller/frontier is no longer available (see e.g. discoveredUriCount()).
 */
public class StatisticsTracker extends AbstractTracker
implements CrawlURIDispositionListener, Serializable {
    private static final long serialVersionUID = 8004878315916392305L;

    private final static Logger logger =
        Logger.getLogger(StatisticsTracker.class.getName());

    // Values remembered from the previous sampling interval, used to compute
    // the "current" (per-interval) rates in progressStatisticsEvent().
    protected long lastPagesFetchedCount = 0;
    protected long lastProcessedBytesCount = 0;

    // Cached snapshots of frontier counters, refreshed on each
    // progressStatisticsEvent(); served after crawl end when the frontier
    // is gone.
    protected long discoveredUriCount = 0;
    protected long queuedUriCount = 0;
    protected long finishedUriCount = 0;

    protected long downloadedUriCount = 0;
    protected long downloadFailures = 0;
    protected long downloadDisregards = 0;
    protected double docsPerSecond = 0;        // docs/sec over whole crawl
    protected double currentDocsPerSecond = 0; // docs/sec over last interval
    protected int currentKBPerSec = 0;         // KB/sec over last interval
    protected long totalKBPerSec = 0;          // KB/sec over whole crawl
    protected int busyThreads = 0;
    protected long totalProcessedBytes = 0;
    protected float congestionRatio = 0;
    protected long deepestUri;
    protected long averageDepth;

    // Distribution maps keyed by mimetype; Hashtable gives synchronized
    // single-op access (cross-operation updates are additionally guarded
    // via incrementMapCount callers).
    protected Hashtable<String,LongWrapper> mimeTypeDistribution
        = new Hashtable<String,LongWrapper>();
    protected Hashtable<String,LongWrapper> mimeTypeBytes
        = new Hashtable<String,LongWrapper>();

    // URI counts keyed by fetch-status code (stored as decimal strings).
    protected Hashtable<String,LongWrapper> statusCodeDistribution
        = new Hashtable<String,LongWrapper>();

    // Transient: re-created from the controller's BigMap store in
    // initialize(); not serialized with this object.
    protected transient Map<String,LongWrapper> hostsDistribution = null;
    protected transient Map<String,LongWrapper> hostsBytes = null;
    protected transient Map<String,Long> hostsLastFinished = null;

    // Per-seed-source breakdown: source tag -> (host -> URI count).
    protected transient
        Map<String,HashMap<String,LongWrapper>> sourceHostDistribution = null;

    // Last-known disposition record per seed URI.
    protected transient Map<String,SeedRecord> processedSeedsRecords;

    private int seedsCrawled;
    private int seedsNotCrawled;
    private String sExitMessage = "Before crawl end";

    /**
     * @param name module name as registered with the settings framework.
     */
    public StatisticsTracker(String name) {
        super( name, "A statistics tracker thats integrated into " +
            "the web UI and that creates the progress-statistics log.");
    }

    /**
     * Binds the BigMap-backed distribution maps to this crawl's controller
     * and registers for CrawlURI disposition callbacks.
     *
     * @throws FatalConfigurationException if any BigMap cannot be obtained.
     */
    public void initialize(CrawlController c)
    throws FatalConfigurationException {
        super.initialize(c);
        try {
            this.sourceHostDistribution = c.getBigMap("sourceHostDistribution",
                String.class, HashMap.class);
            this.hostsDistribution = c.getBigMap("hostsDistribution",
                String.class, LongWrapper.class);
            this.hostsBytes = c.getBigMap("hostsBytes", String.class,
                LongWrapper.class);
            this.hostsLastFinished = c.getBigMap("hostsLastFinished",
                String.class, Long.class);
            this.processedSeedsRecords = c.getBigMap("processedSeedsRecords",
                String.class, SeedRecord.class);
        } catch (Exception e) {
            throw new FatalConfigurationException("Failed setup of" +
                " StatisticsTracker: " + e);
        }
        controller.addCrawlURIDispositionListener(this);
    }

    /**
     * Releases the (potentially disk-backed) distribution maps at end of
     * crawl so their resources can be reclaimed.
     */
    protected void finalCleanup() {
        super.finalCleanup();
        if (this.hostsBytes != null) {
            this.hostsBytes.clear();
            this.hostsBytes = null;
        }
        if (this.hostsDistribution != null) {
            this.hostsDistribution.clear();
            this.hostsDistribution = null;
        }
        if (this.hostsLastFinished != null) {
            this.hostsLastFinished.clear();
            this.hostsLastFinished = null;
        }
        if (this.processedSeedsRecords != null) {
            this.processedSeedsRecords.clear();
            this.processedSeedsRecords = null;
        }
        if (this.sourceHostDistribution != null) {
            this.sourceHostDistribution.clear();
            this.sourceHostDistribution = null;
        }
    }

    /**
     * Samples the frontier/controller counters into the snapshot fields,
     * recomputes crawl-wide and per-interval rates, and emits one line to
     * the progress-statistics log.
     *
     * NOTE(review): when 0 &lt; elapsed time &lt; 1s this returns early
     * WITHOUT logging or calling super — presumably to avoid divide-by-zero
     * in the rate math; confirm that skipping super.progressStatisticsEvent()
     * here is intended.
     */
    protected synchronized void progressStatisticsEvent(final EventObject e) {
        discoveredUriCount = discoveredUriCount();
        downloadedUriCount = successfullyFetchedCount();
        finishedUriCount = finishedUriCount();
        queuedUriCount = queuedUriCount();
        downloadFailures = failedFetchAttempts();
        downloadDisregards = disregardedFetchAttempts();
        totalProcessedBytes = totalBytesWritten();
        congestionRatio = congestionRatio();
        deepestUri = deepestUri();
        averageDepth = averageDepth();

        if (finishedUriCount() == 0) {
            docsPerSecond = 0;
            totalKBPerSec = 0;
        } else if (getCrawlerTotalElapsedTime() < 1000) {
            return; // too little elapsed time for meaningful rates
        } else {
            docsPerSecond = (double) downloadedUriCount /
                (double)(getCrawlerTotalElapsedTime() / 1000);
            // +.5 rounds to nearest whole KB/sec.
            totalKBPerSec = (long)(((totalProcessedBytes / 1024) /
                ((getCrawlerTotalElapsedTime()) / 1000)) + .5 );
        }

        busyThreads = activeThreadCount();

        if(shouldrun ||
            (System.currentTimeMillis() - lastLogPointTime) >= 1000) {
            // Rates over the just-elapsed sample interval; left at 0 when
            // the interval is shorter than one second.
            currentDocsPerSecond = 0;
            currentKBPerSec = 0;

            long currentTime = System.currentTimeMillis();
            long sampleTime = currentTime - lastLogPointTime;

            if (sampleTime >= 1000) {
                long currentPageCount = successfullyFetchedCount();
                long samplePageCount = currentPageCount - lastPagesFetchedCount;

                currentDocsPerSecond =
                    (double) samplePageCount / (double)(sampleTime / 1000);

                lastPagesFetchedCount = currentPageCount;

                long currentProcessedBytes = totalProcessedBytes;
                long sampleProcessedBytes =
                    currentProcessedBytes - lastProcessedBytesCount;

                currentKBPerSec =
                    (int)(((sampleProcessedBytes/1024)/(sampleTime/1000)) + .5);

                lastProcessedBytesCount = currentProcessedBytes;
            }
        }

        if (this.controller != null) {
            this.controller.logProgressStatistics(getProgressStatisticsLine());
        }
        lastLogPointTime = System.currentTimeMillis();
        super.progressStatisticsEvent(e);
    }

    /**
     * Formats one progress-statistics log line for the given timestamp,
     * right-aligning each figure at a fixed column.
     *
     * @param now timestamp to lead the line with.
     * @return the formatted, column-aligned statistics line.
     */
    public String getProgressStatisticsLine(Date now) {
        return new PaddingStringBuffer()
            .append(ArchiveUtils.TIMESTAMP14ISO8601Z.format(now))
            .raAppend(32, discoveredUriCount)
            .raAppend(44, queuedUriCount)
            .raAppend(57, downloadedUriCount)
            .raAppend(74, ArchiveUtils.
                doubleToString(currentDocsPerSecond, 2) +
                "(" + ArchiveUtils.doubleToString(docsPerSecond, 2) + ")")
            .raAppend(85, currentKBPerSec + "(" + totalKBPerSec + ")")
            .raAppend(99, downloadFailures)
            .raAppend(113, busyThreads)
            .raAppend(126, (Runtime.getRuntime().totalMemory() -
                Runtime.getRuntime().freeMemory()) / 1024)
            .raAppend(140, Runtime.getRuntime().totalMemory() / 1024)
            .raAppend(153, ArchiveUtils.doubleToString(congestionRatio, 2))
            .raAppend(165, deepestUri)
            .raAppend(177, averageDepth)
            .toString();
    }

    /**
     * @return the current progress figures as a name-&gt;number map
     *     (for programmatic consumers such as the web UI).
     */
    public Map<String,Number> getProgressStatistics() {
        Map<String,Number> stats = new HashMap<String,Number>();
        stats.put("discoveredUriCount", new Long(discoveredUriCount));
        stats.put("queuedUriCount", new Long(queuedUriCount));
        stats.put("downloadedUriCount", new Long(downloadedUriCount));
        stats.put("currentDocsPerSecond", new Double(currentDocsPerSecond));
        stats.put("docsPerSecond", new Double(docsPerSecond));
        stats.put("totalKBPerSec", new Long(totalKBPerSec));
        stats.put("totalProcessedBytes", new Long(totalProcessedBytes));
        stats.put("currentKBPerSec", new Long(currentKBPerSec));
        stats.put("downloadFailures", new Long(downloadFailures));
        stats.put("busyThreads", new Integer(busyThreads));
        stats.put("congestionRatio", new Double(congestionRatio));
        stats.put("deepestUri", new Long(deepestUri));
        stats.put("averageDepth", new Long(averageDepth));
        stats.put("totalMemory", new Long(Runtime.getRuntime().totalMemory()));
        stats.put("freeMemory", new Long(Runtime.getRuntime().freeMemory()));
        return stats;
    }

    /**
     * @return the progress-statistics line stamped with the current time.
     */
    public String getProgressStatisticsLine() {
        return getProgressStatisticsLine(new Date());
    }

    /** @return crawl-wide documents-per-second rate. */
    public double processedDocsPerSec(){
        return docsPerSecond;
    }

    /** @return documents-per-second rate over the last sample interval. */
    public double currentProcessedDocsPerSec(){
        return currentDocsPerSecond;
    }

    /** @return crawl-wide KB-per-second rate. */
    public long processedKBPerSec(){
        return totalKBPerSec;
    }

    /** @return KB-per-second rate over the last sample interval. */
    public int currentProcessedKBPerSec(){
        return currentKBPerSec;
    }

    /**
     * @return live map of mimetype -&gt; URI count (not a copy; callers
     *     should treat it as read-only).
     */
    public Hashtable<String,LongWrapper> getFileDistribution() {
        return mimeTypeDistribution;
    }

    /**
     * Increments the counter for {@code key} in {@code map} by one.
     */
    protected static void incrementMapCount(Map<String,LongWrapper> map,
            String key) {
        incrementMapCount(map,key,1);
    }

    /**
     * Adds {@code increment} to the counter for {@code key} in {@code map},
     * creating the entry if absent. A null key is counted under "unknown".
     * Callers are responsible for any synchronization across the
     * get/put sequence.
     */
    protected static void incrementMapCount(Map<String,LongWrapper> map,
            String key, long increment) {
        if (key == null) {
            key = "unknown";
        }
        LongWrapper lw = (LongWrapper)map.get(key);
        if(lw == null) {
            map.put(key, new LongWrapper(increment));
        } else {
            lw.longValue += increment;
        }
    }

    /**
     * Returns a TreeMap copy of the given counter map, ordered by
     * descending count (ties broken by key). If the source map rejects
     * putAll (some BigMap views do), falls back to copying entry by entry.
     *
     * NOTE(review): the comparator reads counts from the live source map,
     * so concurrent modification during the copy could perturb ordering.
     */
    public TreeMap<String,LongWrapper> getReverseSortedCopy(
            final Map<String,LongWrapper> mapOfLongWrapperValues) {
        TreeMap<String,LongWrapper> sortedMap =
            new TreeMap<String,LongWrapper>(new Comparator<String>() {
                public int compare(String e1, String e2) {
                    long firstVal = mapOfLongWrapperValues.get(e1).
                        longValue;
                    long secondVal = mapOfLongWrapperValues.get(e2).
                        longValue;
                    if (firstVal < secondVal) {
                        return 1;
                    }
                    if (secondVal < firstVal) {
                        return -1;
                    }
                    // Equal counts: fall back to key order so no entries
                    // are collapsed by the TreeMap.
                    return e1.compareTo(e2);
                }
            });
        try {
            sortedMap.putAll(mapOfLongWrapperValues);
        } catch (UnsupportedOperationException e) {
            Iterator<String> i = mapOfLongWrapperValues.keySet().iterator();
            for (;i.hasNext();) {
                String key = i.next();
                sortedMap.put(key, mapOfLongWrapperValues.get(key));
            }
        }
        return sortedMap;
    }

    /**
     * @return live map of fetch-status code (as string) -&gt; URI count.
     */
    public Hashtable<String,LongWrapper> getStatusCodeDistribution() {
        return statusCodeDistribution;
    }

    /**
     * @param host hostname to look up.
     * @return epoch millis when a URI for this host last finished, or -1
     *     if the host has no recorded finish.
     */
    public long getHostLastFinished(String host){
        Long l = null;
        synchronized(hostsLastFinished){
            l = (Long)hostsLastFinished.get(host);
        }
        return (l != null)? l.longValue(): -1;
    }

    /**
     * @return total bytes downloaded from the given host.
     *     NOTE(review): throws NullPointerException for unknown hosts.
     */
    public long getBytesPerHost(String host){
        synchronized(hostsBytes){
            return ((LongWrapper)hostsBytes.get(host)).longValue;
        }
    }

    /**
     * @return total bytes downloaded for the given mimetype.
     *     NOTE(review): throws NullPointerException for unknown types.
     */
    public long getBytesPerFileType(String filetype){
        return ((LongWrapper)mimeTypeBytes.get(filetype)).longValue;
    }

    /** @return configured toe-thread count, or 0 when no controller. */
    public int threadCount() {
        return this.controller != null? controller.getToeCount(): 0;
    }

    /** @return currently-active toe-thread count, or 0 when no controller. */
    public int activeThreadCount() {
        return this.controller != null? controller.getActiveToeCount(): 0;
    }

    /**
     * @return integer percentage of discovered URIs that have finished
     *     (0 when nothing has been discovered yet).
     */
    public int percentOfDiscoveredUrisCompleted() {
        long completed = finishedUriCount();
        long total = discoveredUriCount();

        if (total == 0) {
            return 0;
        }

        return (int) (100 * completed / total);
    }

    /**
     * @return discovered-URI count, live from the frontier while the crawl
     *     runs, otherwise the last cached snapshot.
     */
    public long discoveredUriCount() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().discoveredUriCount() : discoveredUriCount;
    }

    /**
     * @return finished-URI count, live or cached as above.
     */
    public long finishedUriCount() {
        return shouldrun && this.controller != null &&
            this.controller.getFrontier() != null ?
                controller.getFrontier().finishedUriCount() : finishedUriCount;
    }

    /**
     * @return failed-fetch count, live or cached as above.
     */
    public long failedFetchAttempts() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().failedFetchCount() : downloadFailures;
    }

    /**
     * @return disregarded-fetch count, live or cached as above.
     */
    public long disregardedFetchAttempts() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().disregardedUriCount() : downloadDisregards;
    }

    /**
     * @return successfully-fetched count, live or cached as above.
     */
    public long successfullyFetchedCount() {
        return shouldrun && this.controller != null &&
            this.controller.getFrontier() != null?
            controller.getFrontier().succeededFetchCount() : downloadedUriCount;
    }

    /**
     * @return queued + in-process + succeeded URIs, as an overall tally.
     */
    public long totalCount() {
        return queuedUriCount() + activeThreadCount() +
            successfullyFetchedCount();
    }

    /**
     * @return frontier congestion ratio, live or cached as above.
     */
    public float congestionRatio() {
        return shouldrun && this.controller != null &&
            this.controller.getFrontier() != null ?
            controller.getFrontier().congestionRatio() : congestionRatio;
    }

    /**
     * @return deepest queue depth seen, live or cached as above.
     */
    public long deepestUri() {
        return shouldrun && this.controller != null &&
            this.controller.getFrontier() != null ?
            controller.getFrontier().deepestUri() : deepestUri;
    }

    /**
     * @return average queue depth, live or cached as above.
     */
    public long averageDepth() {
        return shouldrun && this.controller != null &&
            this.controller.getFrontier() != null ?
            controller.getFrontier().averageDepth() : averageDepth;
    }

    /**
     * @return queued-URI count, live or cached as above.
     */
    public long queuedUriCount() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().queuedUriCount() : queuedUriCount;
    }

    /**
     * @return total bytes written, live or cached as above.
     */
    public long totalBytesWritten() {
        return shouldrun && this.controller != null &&
            this.controller.getFrontier() != null?
            controller.getFrontier().totalBytesWritten() : totalProcessedBytes;
    }

    /**
     * Records the disposition of a seed URI; non-seeds are ignored.
     * Each new disposition replaces the seed's previous record.
     */
    private void handleSeed(CrawlURI curi, String disposition) {
        if(curi.isSeed()){
            SeedRecord sr = new SeedRecord(curi, disposition);
            processedSeedsRecords.put(sr.getUri(), sr);
        }
    }

    /**
     * Disposition callback for a successful fetch: updates seed record,
     * status-code, mimetype, per-host and per-source tallies.
     */
    public void crawledURISuccessful(CrawlURI curi) {
        handleSeed(curi,SEED_DISPOSITION_SUCCESS);
        incrementMapCount(statusCodeDistribution,
            Integer.toString(curi.getFetchStatus()));

        String mime = MimetypeUtils.truncate(curi.getContentType());
        incrementMapCount(mimeTypeDistribution, mime);
        incrementMapCount(mimeTypeBytes, mime, curi.getContentSize());

        // Fetch status 1 is treated as a DNS lookup and tallied under the
        // pseudo-host "dns:" — presumably S_DNS_SUCCESS; verify constant.
        saveHostStats((curi.getFetchStatus() == 1)? "dns:":
                this.controller.getServerCache().
                getHostFor(curi).getHostName(),
                curi.getContentSize());

        if (curi.containsKey(CrawlURI.A_SOURCE_TAG)){
            saveSourceStats(curi.getString(CrawlURI.A_SOURCE_TAG),
                this.controller.getServerCache().getHostFor(curi).
                    getHostName());
        }
    }

    /**
     * Bumps the per-source, per-host URI tally. The re-put after the
     * increment pushes the updated inner map back into the (possibly
     * disk-backed) BigMap.
     */
    protected void saveSourceStats(String source, String hostname) {
        synchronized(sourceHostDistribution) {
            HashMap<String,LongWrapper> hostUriCount =
                sourceHostDistribution.get(source);
            if (hostUriCount == null) {
                hostUriCount = new HashMap<String,LongWrapper>();
            }
            incrementMapCount(hostUriCount, hostname);
            sourceHostDistribution.put(source, hostUriCount);
        }
    }

    /**
     * Updates per-host URI count, byte count and last-finished timestamp
     * for one completed fetch. Each map is locked individually.
     */
    protected void saveHostStats(String hostname, long size) {
        synchronized(hostsDistribution){
            incrementMapCount(hostsDistribution, hostname);
        }
        synchronized(hostsBytes){
            incrementMapCount(hostsBytes, hostname, size);
        }
        synchronized(hostsLastFinished){
            hostsLastFinished.put(hostname,
                new Long(System.currentTimeMillis()));
        }
    }

    /** Disposition callback: fetch will be retried. */
    public void crawledURINeedRetry(CrawlURI curi) {
        handleSeed(curi,SEED_DISPOSITION_RETRY);
    }

    /** Disposition callback: URI disregarded (e.g. out of scope). */
    public void crawledURIDisregard(CrawlURI curi) {
        handleSeed(curi,SEED_DISPOSITION_DISREGARD);
    }

    /** Disposition callback: fetch failed permanently. */
    public void crawledURIFailure(CrawlURI curi) {
        handleSeed(curi,SEED_DISPOSITION_FAILURE);
    }

    /**
     * @return iterator over a point-in-time copy of the scope's seed list
     *     (copied so callers aren't exposed to the live seeds iterator).
     */
    public Iterator<String> getSeeds() {
        List<String> seedsCopy = new Vector<String>();
        Iterator<UURI> i = controller.getScope().seedsIterator();
        while (i.hasNext()) {
            seedsCopy.add(i.next().toString());
        }
        return seedsCopy.iterator();
    }

    /** @return all seed records, sorted as per the Iterator overload. */
    public Iterator getSeedRecordsSortedByStatusCode() {
        return getSeedRecordsSortedByStatusCode(getSeeds());
    }

    /**
     * Sorts seed records by status code (ties broken by URI). Seeds with no
     * record yet get a NOT_PROCESSED record created on the fly.
     */
    protected Iterator<SeedRecord> getSeedRecordsSortedByStatusCode(
            Iterator<String> i) {
        TreeSet<SeedRecord> sortedSet =
          new TreeSet<SeedRecord>(new Comparator<SeedRecord>() {
            public int compare(SeedRecord sr1, SeedRecord sr2) {
                int code1 = sr1.getStatusCode();
                int code2 = sr2.getStatusCode();
                if (code1 == code2) {
                    // Equal codes: order by URI so distinct seeds are
                    // never collapsed by the TreeSet.
                    return sr1.getUri().compareTo(sr2.getUri());
                }
                // Shift codes so the desired display order falls out of a
                // plain ascending compare. NOTE(review): for codes > 1 the
                // subtraction deliberately wraps around int range; the
                // resulting order depends on that overflow — verify intent
                // before "fixing" this arithmetic.
                code1 = -code1 - Integer.MAX_VALUE;
                code2 = -code2 - Integer.MAX_VALUE;

                return new Integer(code1).compareTo(new Integer(code2));
            }
        });
        while (i.hasNext()) {
            String seed = i.next();
            SeedRecord sr = (SeedRecord) processedSeedsRecords.get(seed);
            if(sr==null) {
                sr = new SeedRecord(seed,SEED_DISPOSITION_NOT_PROCESSED);
                processedSeedsRecords.put(seed,sr);
            }
            sortedSet.add(sr);
        }
        return sortedSet.iterator();
    }

    /**
     * Crawl-end hook: remembers the exit message for the crawl report.
     */
    public void crawlEnded(String message) {
        logger.info("Entered crawlEnded");
        this.sExitMessage = message;
        super.crawlEnded(message);
        logger.info("Leaving crawlEnded");
    }

    /**
     * Writes the seeds report and, as a side effect, recomputes the
     * seedsCrawled / seedsNotCrawled totals used by the crawl report.
     */
    protected void writeSeedsReportTo(PrintWriter writer) {
        writer.print("[code] [status] [seed] [redirect]\n");

        seedsCrawled = 0;
        seedsNotCrawled = 0;
        for (Iterator i = getSeedRecordsSortedByStatusCode(getSeeds());
                i.hasNext();) {
            SeedRecord sr = (SeedRecord)i.next();
            writer.print(sr.getStatusCode());
            writer.print(" ");
            if((sr.getStatusCode() > 0)) {
                seedsCrawled++;
                writer.print("CRAWLED");
            } else {
                seedsNotCrawled++;
                writer.print("NOTCRAWLED");
            }
            writer.print(" ");
            writer.print(sr.getUri());
            if(sr.getRedirectUri()!=null) {
                writer.print(" ");
                writer.print(sr.getRedirectUri());
            }
            writer.print("\n");
        }
    }

    /**
     * Writes the per-source report: one line per (source tag, host) pair
     * with its URI count, hosts ordered by descending count.
     */
    protected void writeSourceReportTo(PrintWriter writer) {

        writer.print("[source] [host] [#urls]\n");
        for (Iterator i = sourceHostDistribution.keySet().iterator(); i.hasNext();) {
            Object sourceKey = i.next();
            Map<String,LongWrapper> hostCounts
             = (Map<String,LongWrapper>)sourceHostDistribution.get(sourceKey);
            SortedMap sortedHostCounts = getReverseSortedHostCounts(hostCounts);
            for (Iterator j = sortedHostCounts.keySet().iterator(); j.hasNext();) {
                Object hostKey = j.next();
                LongWrapper hostCount = (LongWrapper) hostCounts.get(hostKey);
                writer.print(sourceKey.toString());
                writer.print(" ");
                writer.print(hostKey.toString());
                writer.print(" ");
                writer.print(hostCount.longValue);
                writer.print("\n");
            }
        }
    }

    /**
     * @return descending-count copy of a per-source host-count map, taken
     *     under the map's own lock.
     */
    public SortedMap getReverseSortedHostCounts(
            Map<String,LongWrapper> hostCounts) {
        synchronized(hostCounts){
            return getReverseSortedCopy(hostCounts);
        }
    }

    /**
     * Writes the hosts report: URI count, byte count and name per host,
     * ordered by descending URI count.
     */
    protected void writeHostsReportTo(PrintWriter writer) {
        SortedMap hd = getReverseSortedHostsDistribution();
        writer.print("[#urls] [#bytes] [host]\n");
        for (Iterator i = hd.keySet().iterator(); i.hasNext();) {
            Object key = i.next();
            if (hd.get(key)!=null) {
                writer.print(((LongWrapper)hd.get(key)).longValue);
            } else {
                writer.print("-");
            }
            writer.print(" ");
            writer.print(getBytesPerHost((String)key));
            writer.print(" ");
            writer.print((String)key);
            writer.print("\n");
        }
    }

    /**
     * @return descending-count copy of the hosts distribution, taken under
     *     the map's lock.
     */
    public SortedMap getReverseSortedHostsDistribution() {
        synchronized(hostsDistribution){
            return getReverseSortedCopy(hostsDistribution);
        }
    }

    /**
     * Writes the mimetypes report: URI count, byte count and type, ordered
     * by descending URI count.
     */
    protected void writeMimetypesReportTo(PrintWriter writer) {
        writer.print("[#urls] [#bytes] [mime-types]\n");
        TreeMap fd = getReverseSortedCopy(getFileDistribution());
        for (Iterator i = fd.keySet().iterator(); i.hasNext();) {
            Object key = i.next();
            writer.print(Long.toString(((LongWrapper)fd.get(key)).longValue));
            writer.print(" ");
            writer.print(Long.toString(getBytesPerFileType((String)key)));
            writer.print(" ");
            writer.print((String)key);
            writer.print("\n");
        }
    }

    /**
     * Writes the response-code report, ordered by descending URI count.
     */
    protected void writeResponseCodeReportTo(PrintWriter writer) {
        writer.print("[rescode] [#urls]\n");
        TreeMap scd = getReverseSortedCopy(getStatusCodeDistribution());
        for (Iterator i = scd.keySet().iterator(); i.hasNext();) {
            Object key = i.next();
            writer.print((String)key);
            writer.print(" ");
            writer.print(Long.toString(((LongWrapper)scd.get(key)).longValue));
            writer.print("\n");
        }
    }

    /**
     * Writes the summary crawl report (name, status, duration, totals).
     * NOTE(review): "Total Hosts Crawled" is size()-1 — presumably to
     * exclude the "dns:" pseudo-host; confirm.
     */
    protected void writeCrawlReportTo(PrintWriter writer) {
        writer.print("Crawl Name: " + controller.getOrder().getCrawlOrderName());
        writer.print("\nCrawl Status: " + sExitMessage);
        writer.print("\nDuration Time: " +
                ArchiveUtils.formatMillisecondsToConventional(crawlDuration()));
        writer.print("\nTotal Seeds Crawled: " + seedsCrawled);
        writer.print("\nTotal Seeds not Crawled: " + seedsNotCrawled);
        writer.print("\nTotal Hosts Crawled: " + (hostsDistribution.size()-1));
        writer.print("\nTotal Documents Crawled: " + finishedUriCount);
        writer.print("\nProcessed docs/sec: " +
                ArchiveUtils.doubleToString(docsPerSecond,2));
        writer.print("\nBandwidth in Kbytes/sec: " + totalKBPerSec);
        writer.print("\nTotal Raw Data Size in Bytes: " + totalProcessedBytes +
                " (" + ArchiveUtils.formatBytesForDisplay(totalProcessedBytes) +
                ") \n");
    }

    /** Delegates the processors report to the controller. */
    protected void writeProcessorsReportTo(PrintWriter writer) {
        controller.reportTo(CrawlController.PROCESSORS_REPORT,writer);
    }

    /**
     * Writes the named report to a file in the crawl's disk directory and
     * adds it to the crawl manifest. IO failures are logged, not thrown.
     * NOTE(review): the writer is not closed if writeReportTo throws, and
     * the "wrote report" log line is emitted even after a failure.
     */
    protected void writeReportFile(String reportName, String filename) {
        File f = new File(controller.getDisk().getPath(), filename);
        try {
            PrintWriter bw = new PrintWriter(new FileWriter(f));
            writeReportTo(reportName, bw);
            bw.close();
            controller.addToManifest(f.getAbsolutePath(),
                CrawlController.MANIFEST_REPORT_FILE, true);
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Unable to write " + f.getAbsolutePath() +
                " at the end of crawl.", e);
        }
        logger.info("wrote report: " + f.getAbsolutePath());
    }

    /** Delegates the manifest report to the controller. */
    protected void writeManifestReportTo(PrintWriter writer) {
        controller.reportTo(CrawlController.MANIFEST_REPORT, writer);
    }

    /**
     * Dispatches a report name to its writer; unknown names are silently
     * ignored.
     */
    private void writeReportTo(String reportName, PrintWriter w) {
        if("hosts".equals(reportName)) {
            writeHostsReportTo(w);
        } else if ("mime types".equals(reportName)) {
            writeMimetypesReportTo(w);
        } else if ("response codes".equals(reportName)) {
            writeResponseCodeReportTo(w);
        } else if ("seeds".equals(reportName)) {
            writeSeedsReportTo(w);
        } else if ("crawl".equals(reportName)) {
            writeCrawlReportTo(w);
        } else if ("processors".equals(reportName)) {
            writeProcessorsReportTo(w);
        } else if ("manifest".equals(reportName)) {
            writeManifestReportTo(w);
        } else if ("frontier".equals(reportName)) {
            writeFrontierReportTo(w);
        } else if ("source".equals(reportName)) {
            writeSourceReportTo(w);
        }
    }

    /**
     * Writes the frontier's "nonempty" report, or a placeholder line when
     * the frontier is empty.
     */
    protected void writeFrontierReportTo(PrintWriter writer) {
        if(controller.getFrontier().isEmpty()) {
            writer.println("frontier empty");
        } else {
            controller.getFrontier().reportTo("nonempty", writer);
        }
    }

    /**
     * End-of-crawl entry point: writes every report file (the seeds report
     * must run before the crawl report, which uses its totals) and adds the
     * crawl order to the manifest. The source report is only written when
     * source tagging produced data.
     */
    public void dumpReports() {
        controller.addOrderToManifest();
        writeReportFile("hosts","hosts-report.txt");
        writeReportFile("mime types","mimetype-report.txt");
        writeReportFile("response codes","responsecode-report.txt");
        writeReportFile("seeds","seeds-report.txt");
        writeReportFile("crawl","crawl-report.txt");
        writeReportFile("processors","processors-report.txt");
        writeReportFile("manifest","crawl-manifest.txt");
        writeReportFile("frontier","frontier-report.txt");
        if (!sourceHostDistribution.isEmpty()) {
            writeReportFile("source","source-report.txt");
        }
    }

    /**
     * Checkpoint hook — currently only logs; no tracker state is written
     * to the checkpoint directory here.
     */
    public void crawlCheckpoint(File cpDir) throws Exception {
        logNote("CRAWL CHECKPOINTING TO " + cpDir.toString());
    }
}