1 2 3 4 package net.nutch.tools; 5 6 import java.io.*; 7 import java.net.*; 8 import java.util.*; 9 import java.text.*; 10 import java.util.logging.*; 11 12 import net.nutch.db.*; 13 import net.nutch.io.*; 14 import net.nutch.fs.*; 15 import net.nutch.util.*; 16 import net.nutch.pagedb.*; 17 import net.nutch.linkdb.*; 18 19 25 public class FetchListTool { 26 public static final Logger LOG = LogFormatter.getLogger("net.nutch.tools.FetchListTool"); 27 private static String TOP_N_SORTER = "topNSorter"; 28 29 private static final long FETCH_GENERATION_DELAY_MS = 7 * 24 * 60 * 60 * 1000; 30 private boolean scoreByLinkCount = 31 NutchConf.getBoolean("fetchlist.score.by.link.count", false); 32 33 NutchFileSystem nfs; 34 File dbDir; 35 boolean refetchOnly, anchorOptimize; 36 float cutoffScore; 37 int seed; 38 39 59 class TableSet { 60 Vector outputPaths = new Vector(); 61 Vector tables = new Vector(); 62 long appendCounts[]; 63 boolean hasAppended = false; 64 65 67 public TableSet() { 68 } 69 70 74 public synchronized boolean add(String outputPath) throws IOException { 75 if (hasAppended) { 76 return false; 77 } 78 79 outputPaths.add(outputPath); 85 tables.add(new SequenceFile.Writer(nfs, outputPath + ".unsorted", MD5Hash.class, FetchListEntry.class)); 86 return true; 87 } 88 89 93 public synchronized boolean append(FetchListEntry newEntry) throws IOException { 94 hasAppended = true; 95 if (appendCounts == null) { 96 appendCounts = new long[outputPaths.size()]; 97 } 98 99 Page fetchPage = newEntry.getPage(); 100 101 String host = null; 103 try { 104 host = new URL(fetchPage.getURL().toString()).getHost().toLowerCase(); 105 } catch (MalformedURLException e) { 106 return false; 108 } 109 110 MD5Hash hash = MD5Hash.digest(host); 112 int index = Math.abs(hash.hashCode()^seed) % tables.size(); 113 114 SequenceFile.Writer writer = (SequenceFile.Writer) tables.elementAt(index); 116 writer.append(fetchPage.getMD5(), newEntry); 117 appendCounts[index]++; 118 119 return true; 120 } 121 122 129 public synchronized void close() throws IOException { 130 hasAppended = true; 131 132 for (Enumeration e = tables.elements(); e.hasMoreElements(); ) { 134 ((SequenceFile.Writer) e.nextElement()).close(); 135 } 136 137 SequenceFile.Sorter sorter = new SequenceFile.Sorter(nfs, new MD5Hash.Comparator(), FetchListEntry.class); 139 140 long totalEntries = 0; 146 double totalTime = 0; 147 int i = 0; 148 for (Enumeration e = outputPaths.elements(); e.hasMoreElements(); i++) { 149 String name = (String ) e.nextElement(); 150 String unsortedName = name + ".unsorted"; 151 152 long localStart = System.currentTimeMillis(); 153 sorter.sort(unsortedName, name + ".sorted"); 154 long localEnd = System.currentTimeMillis(); 155 156 if (appendCounts != null) { 157 double localSecs = ((localEnd - localStart) / 1000.0); 158 LOG.info("Processing " + unsortedName + ": Sorted " + appendCounts[i] + " entries in " + localSecs + " seconds."); 159 LOG.info("Processing " + unsortedName + ": Sorted " + (appendCounts[i] / localSecs) + " entries/second"); 160 161 totalEntries += appendCounts[i]; 162 totalTime += localSecs; 163 } 164 165 nfs.delete(new File(name + ".unsorted")); 166 } 167 168 LOG.info("Overall processing: Sorted " + totalEntries + " entries in " + totalTime + " seconds."); 169 LOG.info("Overall processing: Sorted " + (totalTime / totalEntries) + " entries/second"); 170 171 for (Enumeration e = outputPaths.elements(); e.hasMoreElements(); ) { 173 String name = (String ) e.nextElement(); 174 SequenceFile.Reader reader = new SequenceFile.Reader(nfs, name + ".sorted"); 175 ArrayFile.Writer af = new ArrayFile.Writer(nfs, name, FetchListEntry.class); 176 try { 177 MD5Hash key = new MD5Hash(); 178 FetchListEntry fle = new FetchListEntry(); 179 while (reader.next(key, fle)) { 180 af.append(fle); 181 } 182 } finally { 183 af.close(); 184 reader.close(); 185 nfs.delete(new File(name + ".sorted")); 186 } 187 } 188 } 189 } 190 191 194 public static class SortableScore implements WritableComparable { 195 float score; 196 197 199 public SortableScore() { 200 } 201 202 204 public void set(float score) { 205 this.score = score; 206 } 207 208 210 public float getFloat() { 211 return score; 212 } 213 214 215 219 222 public int compareTo(Object o) { 223 SortableScore otherScore = (SortableScore) o; 224 225 if (score < otherScore.score) { 226 return 1; 227 } else if (score == otherScore.score) { 228 return 0; 229 } else { 230 return -1; 231 } 232 } 233 234 236 public void write(DataOutput out) throws IOException { 237 out.writeFloat(score); 238 } 239 240 242 public void readFields(DataInput in) throws IOException { 243 this.score = in.readFloat(); 244 } 245 } 246 247 251 public FetchListTool(NutchFileSystem nfs, File dbDir, boolean refetchOnly, boolean anchorOptimize, float cutoffScore, int seed) throws IOException, FileNotFoundException { 252 this.nfs = nfs; 253 this.dbDir = dbDir; 254 this.refetchOnly = refetchOnly; 255 this.anchorOptimize = anchorOptimize; 256 this.cutoffScore = cutoffScore; 257 this.seed = seed; 258 } 259 260 264 public void emitMultipleLists(File dir, int numLists, long topN, long curTime) throws IOException { 265 TableSet tables = new TableSet(); 270 try { 271 String datePrefix = getDate(); 272 273 File workingDir = new File(dir, "tmp_" + getDate()); 274 nfs.mkdirs(workingDir); 275 try { 276 for (int i = 0; i < numLists; i++) { 277 File subdir = new File(dir, datePrefix + "-" + i); 278 nfs.mkdirs(subdir); 279 File file = new File(subdir, FetchListEntry.DIR_NAME); 280 tables.add(file.getPath()); 281 } 282 283 emitFetchList(tables, workingDir, topN, curTime); 285 } finally { 286 FileUtil.fullyDelete(nfs, workingDir); 287 } 288 } finally { 289 tables.close(); 290 } 291 } 292 293 296 public void emitFetchList(File segmentDir, long topN, long curTime) throws IOException { 297 TableSet tables = new TableSet(); 298 File workingDir = new File(segmentDir, "tmp_" + getDate()); 299 nfs.mkdirs(workingDir); 300 File subdir = new File(segmentDir, getDate()); 301 nfs.mkdirs(subdir); 302 303 try { 304 tables.add(new File(subdir, FetchListEntry.DIR_NAME).getPath()); 305 306 try { 307 emitFetchList(tables, workingDir, topN, curTime); 308 } finally { 309 tables.close(); 310 } 311 } finally { 312 FileUtil.fullyDelete(nfs, workingDir); 313 } 314 } 315 316 private static String getDate() { 317 return new SimpleDateFormat("yyyyMMddHHmmss").format 318 (new Date(System.currentTimeMillis())); 319 } 320 321 326 void emitFetchList(TableSet tables, File workingDir, long topN, long curTime) throws IOException { 327 long count = 0; 341 TreeMap anchorTable = new TreeMap(); 342 Vector unknownDomainLinks = new Vector(); 343 344 Comparator domainComparator = new Comparator() { 349 public int compare(Object o1, Object o2) { 350 Link l1 = (Link) o1; 351 Link l2 = (Link) o2; 352 if (l1.getDomainID() < l2.getDomainID()) { 353 return -1; 354 } else if (l1.getDomainID() == l2.getDomainID()) { 355 return 0; 356 } else { 357 return 1; 358 } 359 } 360 }; 361 362 SortableScore curScore = new SortableScore(); 368 File unsortedFile = new File(workingDir, TOP_N_SORTER + ".unsorted"); 369 SequenceFile.Writer writer = new SequenceFile.Writer(nfs, unsortedFile.getPath(), SortableScore.class, FetchListEntry.class); 370 try { 371 IWebDBReader webdb = new WebDBReader(nfs, dbDir); 372 try { 373 for (Enumeration e = webdb.pages(); e.hasMoreElements(); count++) { 374 Page page = (Page) e.nextElement(); 376 boolean shouldFetch = true; 377 378 if (((count % 10000) == 0) && (count != 0)) { 379 LOG.info("Processing page " + count + "..."); 380 } 381 382 if ((cutoffScore >= 0) && (page.getScore() < cutoffScore)) { 387 continue; 388 } 389 390 if (page.getNextFetchTime() > curTime || 397 page.getNextFetchTime() == Long.MAX_VALUE) { 398 continue; 399 } 400 401 if (refetchOnly) { 408 MD5Hash urlHash = MD5Hash.digest(page.getURL()); 409 if (page.getMD5().equals(urlHash)) { 410 shouldFetch = false; 411 } 412 } 413 414 Link inlinks[] = webdb.getLinks(page.getURL()); 423 if ((! shouldFetch) && anchorOptimize) { 424 boolean foundUsefulAnchor = false; 425 for (int i = 0; i < inlinks.length; i++) { 426 UTF8 anchorText = inlinks[i].getAnchorText(); 427 if ((anchorText != null) && 428 (anchorText.toString().trim().length() > 0)) { 429 foundUsefulAnchor = true; 430 break; 431 } 432 } 433 if (! foundUsefulAnchor) { 434 continue; 435 } 436 } 437 438 int uniqueAnchors = 0; 448 for (int i = 0; i < inlinks.length; i++) { 449 String anchor = inlinks[i].getAnchorText().toString().trim(); 450 451 if (anchor.length() > 0) { 452 if (inlinks[i].getDomainID() == 0) { 453 unknownDomainLinks.add(anchor); 454 } else { 455 Set domainUniqueLinks = (Set) anchorTable.get(anchor); 456 if (domainUniqueLinks == null) { 457 domainUniqueLinks = new TreeSet(domainComparator); 458 anchorTable.put(anchor, domainUniqueLinks); 459 } 460 if (domainUniqueLinks.add(inlinks[i])) { 461 uniqueAnchors++; 462 } 463 } 464 } 465 } 466 467 int i = 0; 475 String results[] = new String [uniqueAnchors + unknownDomainLinks.size()]; 476 for (Enumeration e2 = unknownDomainLinks.elements(); e2.hasMoreElements(); i++) { 477 results[i] = (String ) e2.nextElement(); 478 } 479 unknownDomainLinks.clear(); 480 481 for (Iterator it = anchorTable.keySet().iterator(); it.hasNext(); ) { 486 String key = (String ) it.next(); 487 Set domainUniqueLinks = (Set) anchorTable.get(key); 488 489 for (int j = 0; j < domainUniqueLinks.size(); j++) { 490 results[i++] = key; 491 } 492 } 493 anchorTable.clear(); 494 495 curScore.set(scoreByLinkCount ? 507 (float)Math.log(results.length) : page.getScore()); 508 page.setNextFetchTime(System.currentTimeMillis() + FETCH_GENERATION_DELAY_MS); 509 writer.append(curScore, new FetchListEntry(shouldFetch, page, results)); 510 } 511 } finally { 512 webdb.close(); 513 } 514 } catch (Exception ex) { 515 ex.printStackTrace(); 516 } finally { 517 writer.close(); 518 } 519 520 File sortedFile = new File(workingDir, TOP_N_SORTER + ".sorted"); 526 SequenceFile.Sorter topNSorter = new SequenceFile.Sorter(nfs, SortableScore.class, FetchListEntry.class); 527 topNSorter.sort(unsortedFile.getPath(), sortedFile.getPath()); 528 529 WebDBWriter dbwriter = new WebDBWriter(nfs, dbDir); 538 try { 539 SequenceFile.Reader reader = new SequenceFile.Reader(nfs, sortedFile.getPath()); 540 try { 541 SortableScore key = new SortableScore(); 542 FetchListEntry value = new FetchListEntry(); 543 while (topN > 0 && reader.next(key, value)) { 544 tables.append(value); 545 topN--; 546 547 dbwriter.addPage(value.getPage()); 559 } 560 } finally { 561 reader.close(); 562 } 563 } finally { 564 dbwriter.close(); 565 } 566 } 567 568 571 public static void main(String argv[]) throws IOException, FileNotFoundException { 572 if (argv.length < 2) { 573 System.out.println("Usage: FetchListTool (-local | -ndfs <namenode:port>) <db> <segment_dir> [-refetchonly] [-anchoroptimize linkdb] [-topN N] [-cutoff cutoffscore] [-numFetchers numFetchers] [-adddays numDays]"); 574 return; 575 } 576 577 int i = 0; 581 NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i); 582 File dbDir = new File(argv[i++]); 583 File segmentDir = new File(argv[i++]); 584 long curTime = System.currentTimeMillis(); 585 586 boolean refetchOnly = false, anchorOptimize = false; 590 long topN = Long.MAX_VALUE; 591 float cutoffScore = -1.0f; 592 int numFetchers = 1; 593 int seed = new Random().nextInt(); 594 595 596 try { 597 for (; i < argv.length; i++) { 598 if ("-refetchonly".equals(argv[i])) { 599 refetchOnly = true; 600 } else if ("-anchoroptimize".equals(argv[i])) { 601 anchorOptimize = true; 602 } else if ("-topN".equals(argv[i])) { 603 if (i+1 < argv.length) { 604 topN = Long.parseLong(argv[i+1]); 605 i++; 606 } else { 607 System.out.println("No argument present for -topN"); 608 return; 609 } 610 } else if ("-cutoff".equals(argv[i])) { 611 if (i+1 < argv.length) { 612 cutoffScore = Float.parseFloat(argv[i+1]); 613 i++; 614 } else { 615 System.out.println("No argument present for -cutoffscore"); 616 return; 617 } 618 } else if ("-numFetchers".equals(argv[i])) { 619 if (i+1 < argv.length) { 620 numFetchers = Integer.parseInt(argv[i+1]); 621 i++; 622 } else { 623 System.out.println("No argument present for -numFetchers"); 624 return; 625 } 626 } else if ("-adddays".equals(argv[i])) { 627 if (i+1 < argv.length) { 628 long numDays = Integer.parseInt(argv[i+1]); 629 curTime += numDays * 1000L * 60 * 60 * 24; 630 } else { 631 System.out.println("No argument present for -adddays"); 632 return; 633 } 634 } 635 } 636 } catch (NumberFormatException nfe) { 637 System.out.println("Badly-formatted number:: " + nfe); 638 return; 639 } 640 641 642 if (anchorOptimize && !refetchOnly) { 646 System.out.println("Tool cannot use -anchoroptimize option without -refetchonly option as well."); 647 return; 648 } 649 650 LOG.info("FetchListTool started"); 654 if (topN != Long.MAX_VALUE) { 655 LOG.info("topN:" + topN); 656 } 657 if (cutoffScore >= 0) { 658 LOG.info("cutoffscore:" + cutoffScore); 659 } 660 if (numFetchers > 1) { 661 LOG.info("seed:" + seed); 662 } 663 664 FetchListTool flt = new FetchListTool(nfs, dbDir, refetchOnly, anchorOptimize, cutoffScore, seed); 665 if (numFetchers > 1) { 666 flt.emitMultipleLists(segmentDir, numFetchers, topN, curTime); 667 } else { 668 flt.emitFetchList(segmentDir, topN, curTime); 669 } 670 nfs.close(); 671 LOG.info("FetchListTool completed"); 672 } 673 } 674 | Popular Tags |