1 25 package org.archive.crawler.admin; 26 27 import java.io.File ; 28 import java.io.FileReader ; 29 import java.io.BufferedReader ; 30 import java.io.IOException ; 31 import java.util.Comparator ; 32 import java.util.Hashtable ; 33 import java.util.Iterator ; 34 import java.util.Map ; 35 import java.util.SortedMap ; 36 import java.util.TreeMap ; 37 import java.util.TreeSet ; 38 import java.util.logging.Level ; 39 import java.util.logging.Logger ; 40 41 import org.archive.util.LongWrapper; 42 43 44 69 public class StatisticsSummary { 70 73 private final static Logger logger = 74 Logger.getLogger(StatisticsSummary.class.getName()); 75 76 private boolean stats = true; 77 78 79 private CrawlJob cjob; 80 81 protected long totalDnsStatusCodeDocuments = 0; 82 protected long totalStatusCodeDocuments = 0; 83 protected long totalFileTypeDocuments = 0; 84 protected long totalMimeTypeDocuments = 0; 85 protected long totalDnsMimeTypeDocuments = 0; 86 protected long totalDnsHostDocuments = 0; 87 protected long totalHostDocuments = 0; 88 protected long totalMimeSize = 0; 89 protected long totalDnsMimeSize = 0; 90 protected long totalHostSize = 0; 91 protected long totalDnsHostSize = 0; 92 protected long totalTldDocuments = 0; 93 protected long totalTldSize = 0; 94 protected long totalHosts = 0; 95 96 protected String durationTime; 97 protected String processedDocsPerSec; 98 protected String bandwidthKbytesPerSec; 99 protected String totalDataWritten; 100 101 102 protected Hashtable <String ,LongWrapper> mimeTypeDistribution = new Hashtable <String ,LongWrapper>(); 103 protected Hashtable <String ,LongWrapper> mimeTypeBytes = new Hashtable <String ,LongWrapper>(); 104 protected Hashtable <String ,LongWrapper> mimeTypeDnsDistribution = new Hashtable <String ,LongWrapper>(); 105 protected Hashtable <String ,LongWrapper> mimeTypeDnsBytes = new Hashtable <String ,LongWrapper>(); 106 107 108 protected Hashtable <String ,LongWrapper> statusCodeDistribution = new Hashtable <String ,LongWrapper>(); 109 protected Hashtable <String ,LongWrapper> dnsStatusCodeDistribution 110 = new Hashtable <String ,LongWrapper>(); 111 112 113 protected Hashtable <String ,LongWrapper> hostsDistribution = new Hashtable <String ,LongWrapper>(); 114 protected Hashtable <String ,LongWrapper> hostsBytes = new Hashtable <String ,LongWrapper>(); 115 protected Hashtable <String ,LongWrapper> hostsDnsDistribution = new Hashtable <String ,LongWrapper>(); 116 protected Hashtable <String ,LongWrapper> hostsDnsBytes = new Hashtable <String ,LongWrapper>(); 117 118 119 protected Hashtable <String ,LongWrapper> tldDistribution = new Hashtable <String ,LongWrapper>(); 120 protected Hashtable <String ,LongWrapper> tldBytes = new Hashtable <String ,LongWrapper>(); 121 protected Hashtable <String ,LongWrapper> tldHostDistribution = new Hashtable <String ,LongWrapper>(); 122 123 124 protected transient Map <String ,SeedRecord> processedSeedsRecords 125 = new Hashtable <String ,SeedRecord>(); 126 127 133 public StatisticsSummary(CrawlJob cjob) { 134 this.cjob = cjob; 135 136 this.stats = calculateStatusCodeDistribution(); 138 if (calculateMimeTypeDistribution()) { 139 this.stats = true; 140 } 141 if (calculateHostsDistribution()) { 142 this.stats = true; 143 } 144 if (readCrawlReport()) { 145 this.stats = true; 146 } 147 if (readSeedReport()) { 148 this.stats = true; 149 } 150 } 151 152 153 162 protected static void incrementMapCount(Map <String ,LongWrapper> map, 163 String key) { 164 incrementMapCount(map,key,1); 165 } 166 167 182 protected static void incrementMapCount(Map <String ,LongWrapper> map, 183 String key, long increment) { 184 if (key == null) { 185 key = "unknown"; 186 } 187 LongWrapper lw = map.get(key); 188 if(lw == null) { 189 map.put(key, new LongWrapper(increment)); 190 } else { 191 lw.longValue += increment; 192 } 193 } 194 195 203 public Hashtable getMimeDistribution() { 204 return mimeTypeDistribution; 205 } 206 207 public long getTotalMimeTypeDocuments() { 208 return totalMimeTypeDocuments; 209 } 210 211 public long getTotalDnsMimeTypeDocuments() { 212 return totalDnsMimeTypeDocuments; 213 } 214 215 public long getTotalMimeSize() { 216 return totalMimeSize; 217 } 218 219 public long getTotalDnsMimeSize() { 220 return totalDnsMimeSize; 221 } 222 223 233 public Hashtable getStatusCodeDistribution() { 234 return statusCodeDistribution; 235 } 236 237 247 public Hashtable getDnsStatusCodeDistribution() { 248 return dnsStatusCodeDistribution; 249 } 250 251 public Hashtable getDnsMimeDistribution() { 252 return mimeTypeDnsDistribution; 253 } 254 255 public long getTotalDnsStatusCodeDocuments() { 256 return totalDnsStatusCodeDocuments; 257 } 258 259 public long getTotalStatusCodeDocuments() { 260 return totalStatusCodeDocuments; 261 } 262 263 public long getTotalHostDocuments() { 264 return totalHostDocuments; 265 } 266 267 public long getTotalDnsHostDocuments() { 268 return totalDnsHostDocuments; 269 } 270 271 public Hashtable getHostsDnsDistribution() { 272 return hostsDnsDistribution; 273 } 274 275 public long getTotalHostDnsDocuments() { 276 return totalDnsHostDocuments; 277 } 278 279 public long getTotalHostSize() { 280 return totalHostSize; 281 } 282 283 public long getTotalDnsHostSize() { 284 return totalDnsHostSize; 285 } 286 287 public Hashtable getTldDistribution() { 288 return tldDistribution; 289 } 290 291 public Hashtable getTldBytes() { 292 return tldBytes; 293 } 294 295 public long getTotalTldDocuments() { 296 return totalTldDocuments; 297 } 298 299 public long getTotalTldSize() { 300 return totalTldSize; 301 } 302 303 public Hashtable getTldHostDistribution() { 304 return tldHostDistribution; 305 } 306 307 public long getTotalHosts() { 308 return totalHosts; 309 } 310 311 public String getDurationTime() { 312 return durationTime; 313 } 314 315 public String getProcessedDocsPerSec() { 316 return processedDocsPerSec; 317 } 318 319 public String getBandwidthKbytesPerSec() { 320 return bandwidthKbytesPerSec; 321 } 322 323 public String getTotalDataWritten() { 324 return totalDataWritten; 325 } 326 327 342 public TreeMap <String ,LongWrapper> getReverseSortedCopy( 343 final Map <String ,LongWrapper> mapOfLongWrapperValues) { 344 TreeMap <String ,LongWrapper> sortedMap = new TreeMap <String ,LongWrapper>( 345 new Comparator <String >() { 346 public int compare(String e1, String e2) { 347 long firstVal = mapOfLongWrapperValues.get(e1).longValue; 348 long secondVal = mapOfLongWrapperValues.get(e2).longValue; 349 if (firstVal < secondVal) { 350 return 1; 351 } 352 if (secondVal < firstVal) { 353 return -1; 354 } 355 return e1.compareTo(e2); 357 } 358 }); 359 try { 360 sortedMap.putAll(mapOfLongWrapperValues); 361 } catch (UnsupportedOperationException e) { 362 for (String key: mapOfLongWrapperValues.keySet()) { 363 sortedMap.put(key, mapOfLongWrapperValues.get(key)); 364 } 365 } 366 return sortedMap; 367 } 368 369 375 public long getHostsPerTld(String tld) { 376 LongWrapper lw = (LongWrapper)tldHostDistribution.get(tld); 377 return (lw == null ? 0 : lw.longValue); 378 } 379 380 385 private boolean calculateStatusCodeDistribution() { 386 File f = new File (cjob.getDirectory(), "responsecode-report.txt"); 388 if (!f.exists()) { 389 return false; 390 } 391 BufferedReader br = null; 392 try { 393 FileReader reader = new FileReader (f); 394 br = new BufferedReader (reader); 395 String line = br.readLine(); line = br.readLine(); 397 while (line != null) { 398 400 String [] items = line.split(" "); 401 if (items.length < 2) { 402 logger.log(Level.WARNING, 403 "Unexpected formatting on line [" + line + "]"); 404 } 405 else { 406 if (items[0].length() < 3) { 408 long total = Long.parseLong(items[1]); 410 dnsStatusCodeDistribution.put(items[0], 411 new LongWrapper(total)); 412 totalDnsStatusCodeDocuments += total; 413 } 414 else { 415 long total = Long.parseLong(items[1]); 417 statusCodeDistribution.put(items[0], 418 new LongWrapper(total)); 419 totalStatusCodeDocuments += total; 420 } 421 } 422 line = br.readLine(); 423 } 424 } catch (IOException e) { 425 logger.log(Level.SEVERE, "Unable to read " + f.getAbsolutePath(), 426 e); 427 } finally { 428 if (br != null) { 429 try { 430 br.close(); 431 } catch (IOException e) { 432 logger.log(Level.SEVERE, 433 "Closing " + f.getAbsolutePath(), e); 434 } 435 } 436 } 437 return true; 438 } 439 440 445 private boolean calculateMimeTypeDistribution() { 446 File f = new File (cjob.getDirectory(), "mimetype-report.txt"); 447 if (!f.exists()) { 448 return false; 449 } 450 BufferedReader br = null; 451 try { 452 FileReader reader = new FileReader (f); 453 br = new BufferedReader (reader); 454 String line = br.readLine(); line = br.readLine(); 456 while (line != null) { 457 460 String [] items = line.split(" "); 461 if (items.length < 3) { 462 logger.log(Level.WARNING, 463 "Unexpected formatting on line [" + line + "]"); 464 } 465 else { 466 long total = Long.parseLong(items[0]); 467 long bytes = Long.parseLong(items[1]); 468 String mime = items[2]; 469 470 if (mime.equalsIgnoreCase("text/dns")) { 472 mimeTypeDnsDistribution.put(mime, 473 new LongWrapper(total)); 474 mimeTypeDnsBytes.put(mime, new LongWrapper(bytes)); 475 totalDnsMimeTypeDocuments += total; 476 totalDnsMimeSize += bytes; 477 } 478 else { 479 mimeTypeDistribution.put(mime, new LongWrapper(total)); 480 mimeTypeBytes.put(mime, new LongWrapper(bytes)); 481 totalMimeTypeDocuments += total; 482 totalMimeSize += bytes; 483 } 484 } 485 line = br.readLine(); 486 } 487 } catch (IOException e) { 488 logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e); 489 } finally { 490 if (br != null) { 491 try { 492 br.close(); 493 } catch (IOException e) { 494 logger.log(Level.SEVERE, 495 "Closing " + f.getAbsolutePath(), e); 496 } 497 } 498 } 499 return true; 500 } 501 502 508 private boolean calculateHostsDistribution() { 509 File f = new File (cjob.getDirectory(), "hosts-report.txt"); 510 if (!f.exists()) { 511 return false; 512 } 513 BufferedReader br = null; 514 try { 515 FileReader reader = new FileReader (f); 516 br = new BufferedReader (reader); 517 String line = br.readLine(); line = br.readLine(); 519 while (line != null) { 520 523 String [] items = line.split(" "); 524 if (items.length < 3) { 525 logger.log(Level.WARNING, 526 "Unexpected formatting on line [" + line + "]"); 527 } 528 else { 529 long total = Long.parseLong(items[0]); 530 long bytes = Long.parseLong(items[1]); 531 String host = items[2]; 532 533 if (host.startsWith("dns:", 0)) { 535 hostsDnsDistribution.put(host, new LongWrapper(total)); 536 hostsDnsBytes.put(host, new LongWrapper(bytes)); 537 totalDnsHostDocuments += total; 538 totalDnsHostSize += bytes; 539 } 540 else { 541 hostsDistribution.put(host, new LongWrapper(total)); 542 hostsBytes.put(host, new LongWrapper(bytes)); 543 totalHostDocuments += total; 544 totalHostSize += bytes; 545 546 String tld = host.substring(host.lastIndexOf('.')+1); 548 incrementMapCount(tldDistribution, tld, total); 549 incrementMapCount(tldBytes, tld, bytes); 550 incrementMapCount(tldHostDistribution, tld); 551 totalTldDocuments += total; 552 totalTldSize += bytes; 553 554 totalHosts++; 555 } 556 } 557 line = br.readLine(); 558 } 559 } catch (IOException e) { 560 logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e); 561 } finally { 562 if (br != null) { 563 try { 564 br.close(); 565 } catch (IOException e) { 566 logger.log(Level.SEVERE, 567 "Closing " + f.getAbsolutePath(), e); 568 } 569 } 570 } 571 return true; 572 } 573 574 579 public long getBytesPerHost(String host) { 580 long bytes = -1; 581 582 bytes = host != null && host.startsWith("dns:", 0) ? 583 ((LongWrapper)hostsDnsBytes.get(host)).longValue : 584 ((LongWrapper)hostsBytes.get(host)).longValue; 585 586 return bytes; 587 } 588 589 594 public long getBytesPerTld(String tld) { 595 LongWrapper lw = (LongWrapper)tldBytes.get(tld); 596 return (lw == null ? 0 : lw.longValue); 597 } 598 599 604 public long getBytesPerMimeType(String filetype) { 605 long bytes = -1; 606 607 if (filetype != null) { 608 if (filetype.equals("text/dns")) { 609 bytes = mimeTypeDnsBytes.get(filetype) == null ? 0 : 610 ((LongWrapper)mimeTypeDnsBytes.get(filetype)).longValue; 611 } 612 else { 613 bytes = mimeTypeBytes.get(filetype) == null ? 0 : 614 ((LongWrapper)mimeTypeBytes.get(filetype)).longValue; 615 } 616 } 617 return bytes; 618 } 619 620 625 public boolean readCrawlReport() { 626 File f = new File (cjob.getDirectory(), "crawl-report.txt"); 627 if (!f.exists()) { 628 return false; 629 } 630 BufferedReader br = null; 631 try { 632 FileReader reader = new FileReader (f); 633 br = new BufferedReader (reader); 634 String line = br.readLine(); 635 while (line != null) { 636 if (line.startsWith("Duration Time")) { 637 durationTime = line.substring(line.indexOf(':')+1); 638 } 639 else if (line.startsWith("Processed docs/sec")) { 640 processedDocsPerSec = line.substring(line.indexOf(':')+1); 641 } 642 else if (line.startsWith("Bandwidth in Kbytes/sec")) { 643 bandwidthKbytesPerSec = line.substring(line.indexOf(':')+1); 644 } 645 else if (line.startsWith("Total Raw Data Size in Bytes")) { 646 totalDataWritten = line.substring(line.indexOf(':')+1); 647 } 648 649 line = br.readLine(); 650 } 651 } 652 catch (IOException e) { 653 logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e); 654 } finally { 655 if (br != null) { 656 try { 657 br.close(); 658 } catch (IOException e) { 659 logger.log(Level.SEVERE, 660 "Failed close of " + f.getAbsolutePath(), e); 661 } 662 } 663 } 664 return true; 665 } 666 667 671 public Iterator <SeedRecord> getSeedRecordsSortedByStatusCode() { 672 TreeSet <SeedRecord> sortedSet = new TreeSet <SeedRecord>( 673 new Comparator <SeedRecord>() { 674 public int compare(SeedRecord sr1, SeedRecord sr2) { 675 int code1 = sr1.getStatusCode(); 676 int code2 = sr2.getStatusCode(); 677 if (code1 == code2) { 678 return sr1.getUri().compareTo(sr2.getUri()); 680 } 681 code1 = -code1 - Integer.MAX_VALUE; 686 code2 = -code2 - Integer.MAX_VALUE; 687 688 return new Integer (code1).compareTo(new Integer (code2)); 689 } 690 }); 691 for (SeedRecord sr: processedSeedsRecords.values()) { 692 sortedSet.add(sr); 693 } 694 695 return sortedSet.iterator(); 696 } 697 698 702 private boolean readSeedReport() { 703 File f = new File (cjob.getDirectory(), "seeds-report.txt"); 704 if (!f.exists()) { 705 return false; 706 } 707 BufferedReader br = null; 708 try { 709 FileReader reader = new FileReader (f); 710 br = new BufferedReader (reader); 711 712 String line = br.readLine(); 714 line = br.readLine(); 715 while (line != null) { 716 720 String [] items = line.split(" "); 721 722 if (items.length < 3) { 723 logger.log(Level.WARNING, 724 "Unexpected formatting on line [" + line + "]"); 725 } 726 else { 727 String statusCode = items[0]; 728 String crawlStatus = items[1]; 729 String seed = items[2]; 730 String redirect = items.length > 3 ? items[3] : null; 731 732 if (crawlStatus.equals("CRAWLED")) { 734 crawlStatus =org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_SUCCESS; 735 } 736 else { 737 crawlStatus = org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_FAILURE; 738 } 739 SeedRecord sr = new SeedRecord(seed, crawlStatus, 740 Integer.parseInt(statusCode), redirect); 741 processedSeedsRecords.put(seed, sr); 742 } 743 744 line = br.readLine(); 745 } 746 } catch (IOException e) { 747 logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e); 748 } finally { 749 if (br != null) { 750 try { 751 br.close(); 752 } catch (IOException e) { 753 logger.log(Level.SEVERE, 754 "Closing " + f.getAbsolutePath(), e); 755 } 756 } 757 } 758 return true; 759 } 760 761 767 public SortedMap getReverseSortedHostsDistribution() { 768 return getReverseSortedCopy(hostsDistribution); 769 } 770 771 775 public boolean isStats() { 776 return this.stats; 777 } 778 } 779 | Popular Tags |