package net.nutch.db;

import java.io.*;
import java.util.*;

import net.nutch.db.*;
import net.nutch.io.*;
import net.nutch.fs.*;
import net.nutch.util.*;
import net.nutch.linkdb.*;
import net.nutch.pagedb.*;

/**
 * DBTester exercises a WebDB by building a random in-memory model of pages
 * and links, writing that model into a freshly created WebDB through
 * {@link WebDBWriter}, and then reading it back through {@link WebDBReader}
 * to verify that the database and the model agree.
 *
 * All random choices are drawn from a single {@link Random} created from a
 * seed that is printed at construction time, so a run can be repeated by
 * passing the same seed again.
 */
public class DBTester {
    /** Upper bound on the number of outlinks attempted per page. */
    static int MAX_OUTLINKS = 20;

    NutchFileSystem nfs;
    long seed;
    Random rand;
    File webdb;
    int maxPages;

    /** Every Link currently present in the model. */
    TreeSet seenLinks = new TreeSet();

    /** MD5Hash -> Integer count of model pages that carry that hash. */
    TreeMap md5Hashes = new TreeMap();

    long pageCount = 0, linkCount = 0, totalLinksEver = 0;

    /** The model pages, indexed 0..maxPages-1. */
    Page pages[];

    /** outlinks[i] holds the Links whose source is pages[i]. */
    Vector outlinks[];

    /** Target URL (String) -> Vector of Links that point at that URL. */
    Hashtable inlinks;

    /**
     * Creates a tester with a randomly chosen seed.
     *
     * @param nfs      file system the WebDB lives on
     * @param dir      directory under which "webdb_test" is created
     * @param maxPages number of pages in the model
     * @throws IOException if the test db directory already exists or the
     *                     WebDB cannot be created
     */
    public DBTester(NutchFileSystem nfs, File dir, int maxPages) throws IOException {
        this(nfs, dir, new Random().nextLong(), maxPages);
    }

    /**
     * Creates a tester with an explicit seed, creates an empty WebDB under
     * <code>dir/webdb_test</code>, and allocates the (still empty) model.
     *
     * @param nfs      file system the WebDB lives on
     * @param dir      directory under which "webdb_test" is created
     * @param seed     seed for the random number generator
     * @param maxPages number of pages in the model
     * @throws IOException if the test db directory already exists or the
     *                     WebDB cannot be created
     */
    public DBTester(NutchFileSystem nfs, File dir, long seed, int maxPages) throws IOException {
        this.nfs = nfs;
        this.maxPages = maxPages;
        this.webdb = new File(dir, "webdb_test");
        // NOTE(review): exists()/mkdirs() go through java.io.File, not nfs;
        // confirm this is intended when nfs is not the local file system.
        if (webdb.exists()) {
            throw new IOException("File " + webdb + " already exists");
        }
        webdb.mkdirs();
        this.seed = seed;

        WebDBWriter.createWebDB(nfs, webdb);

        this.rand = new Random(seed);
        System.out.println("-----------------------------------------------");
        System.out.println("DBTester created at " + new Date(System.currentTimeMillis()));
        System.out.println("WebDB: " + webdb);
        System.out.println("Seed: " + seed);
        System.out.println("-----------------------------------------------");

        // Allocate the in-memory model: one slot and one outlink list per page.
        pages = new Page[maxPages];
        outlinks = new Vector[maxPages];
        for (int i = 0; i < outlinks.length; i++) {
            outlinks[i] = new Vector();
        }
        inlinks = new Hashtable();
    }

    /**
     * Runs the full test sequence: build the graph and check it, apply ten
     * rounds of random edits with a consistency check after each, run the
     * reader API test, and finally delete a random subset of pages from the
     * db and verify that neither the pages nor links to them remain.
     *
     * @throws IOException on any db error or detected inconsistency
     */
    public void runTest() throws IOException {
        System.out.println("CREATING WEB MODEL, CHECKING CONSISTENCY");
        createGraph();
        checkConsistency();

        int maxTests = 10;
        for (int i = 1; i <= maxTests; i++) {
            System.out.println("EDIT-CONSISTENCY TEST (" + i + " of " + maxTests + ")");
            makeEdits();
            checkConsistency();
        }

        System.out.println("API TEST");
        apiTest();

        System.out.println("DB PAGE-DELETE TEST");

        // Pass 1: choose the pages to remove while enumerating the db.
        IWebDBReader db = new WebDBReader(nfs, webdb);
        Vector toRemove = new Vector();
        try {
            for (Enumeration e = db.pages(); e.hasMoreElements(); ) {
                Page p = (Page) e.nextElement();

                if (Math.abs(rand.nextInt()) % 100 == 0) {
                    toRemove.add(p);
                }
            }
        } finally {
            db.close();
        }

        // Pass 2: delete the chosen pages.
        IWebDBWriter dbwriter = new WebDBWriter(nfs, webdb);
        try {
            for (Enumeration e = toRemove.elements(); e.hasMoreElements(); ) {
                Page p = (Page) e.nextElement();
                dbwriter.deletePage(p.getURL().toString());
            }
        } finally {
            dbwriter.close();
        }

        // Pass 3: the deleted pages, and any links to them, must be gone.
        db = new WebDBReader(nfs, webdb);
        try {
            for (Enumeration e = toRemove.elements(); e.hasMoreElements(); ) {
                Page p = (Page) e.nextElement();

                Page result = db.getPage(p.getURL().toString());
                if (result != null) {
                    throw new IOException("Found a Page that should have been deleted: " + result);
                }

                Link results[] = db.getLinks(p.getURL());
                if (results.length != 0) {
                    throw new IOException("Should find no inlinks for deleted URL " + p.getURL() + ", but found " + results.length + " of them.");
                }
            }
        } finally {
            db.close();
        }

        System.out.println("*** TEST COMPLETE ***");
    }

    /**
     * Removes the test WebDB directory.
     *
     * @throws IOException if the delete fails
     */
    public void cleanup() throws IOException {
        FileUtil.fullyDelete(nfs, webdb);
    }

    /**
     * Populates the model and the db: first creates and adds maxPages random
     * pages, then gives every page a random set of outlinks.
     */
    private void createGraph() throws IOException {
        IWebDBWriter writer = new WebDBWriter(nfs, webdb);
        try {
            for (int i = 0; i < maxPages; i++) {
                pages[i] = createRandomPage();
                writer.addPage(pages[i]);
                pageCount++;
            }

            // Links are made only after all pages exist, so any page can be a target.
            for (int i = 0; i < maxPages; i++) {
                pages[i].setNumOutlinks(makeOutlinkSet(writer, i));
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Applies one random edit to every page, mirrored in the model and the db.
     * Action 0 deletes the page and puts a brand-new random page in its slot;
     * action 1 keeps the page but gives it a fresh, unused MD5. In both cases
     * the page's old outlinks are dropped from the model, and in a second pass
     * every edited page gets a new outlink set and is (re)added to the db.
     */
    private void makeEdits() throws IOException {
        IWebDBWriter writer = new WebDBWriter(nfs, webdb);
        try {
            int actions[] = new int[pages.length];

            for (int i = 0; i < maxPages; i++) {
                Page curPage = pages[i];

                int action = Math.abs(rand.nextInt() % 2);
                actions[i] = action;
                if (action == 0) {
                    // Delete the page, then replace it with a new random one.
                    releaseMD5(curPage.getMD5());
                    pages[i] = null;
                    writer.deletePage(curPage.getURL().toString());
                    detachOutlinks(i);
                    pages[i] = createRandomPage();

                } else if (action == 1) {
                    // Same URL, new content hash.
                    releaseMD5(curPage.getMD5());

                    // Draw hashes until one is found that the model does not use yet.
                    MD5Hash md5Hash = null;
                    Integer hashCount = null;
                    do {
                        md5Hash = MD5Hash.digest(createRandomString(Math.abs(rand.nextInt() % 2048)));
                        hashCount = (Integer) md5Hashes.get(md5Hash);
                    } while (hashCount != null);

                    md5Hashes.put(md5Hash, new Integer(1));
                    pages[i].setMD5(md5Hash);

                    detachOutlinks(i);
                    pages[i].setNumOutlinks(0);
                }
            }

            // Second pass: every edited page gets fresh outlinks and is written out.
            for (int i = 0; i < maxPages; i++) {
                if ((actions[i] == 0) || (actions[i] == 1)) {
                    pages[i].setNumOutlinks(makeOutlinkSet(writer, i));
                    writer.addPage(pages[i]);
                }
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Decrements the model's usage count for the given MD5, removing the
     * entry entirely when the count was 1.
     */
    private void releaseMD5(MD5Hash md5) {
        Integer hashCount = (Integer) md5Hashes.get(md5);
        if (hashCount.intValue() == 1) {
            md5Hashes.remove(md5);
        } else {
            md5Hashes.put(md5, new Integer(hashCount.intValue() - 1));
        }
    }

    /**
     * Removes all outlinks of pages[srcIndex] from the model: from seenLinks,
     * from the inlink list of each target URL, and from outlinks[srcIndex].
     * This touches only the model, not the db, and draws no random numbers.
     */
    private void detachOutlinks(int srcIndex) {
        linkCount -= outlinks[srcIndex].size();

        for (Enumeration e = outlinks[srcIndex].elements(); e.hasMoreElements(); ) {
            Link curOutlink = (Link) e.nextElement();

            seenLinks.remove(curOutlink);

            // Find the first inlink of the target that comes from the same source ID.
            int removeIndex = -1, pos = 0;
            Vector inlinkList = (Vector) inlinks.get(curOutlink.getURL().toString());
            for (Enumeration e2 = inlinkList.elements(); e2.hasMoreElements(); pos++) {
                Link curInlink = (Link) e2.nextElement();
                if (curInlink.getFromID().equals(curOutlink.getFromID())) {
                    removeIndex = pos;
                    break;
                }
            }

            if (removeIndex >= 0) {
                inlinkList.removeElementAt(removeIndex);
            }
        }

        outlinks[srcIndex].clear();
    }

    /**
     * Returns true if some element of dbLinks compares equal (compareTo == 0)
     * to the wanted link.
     */
    private static boolean containsLink(Link dbLinks[], Link wanted) {
        for (int j = 0; j < dbLinks.length; j++) {
            if (dbLinks[j].compareTo(wanted) == 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Verifies that the db matches the model: page and link totals, every
     * page's URL and MD5, that each page's outlinks in the db are exactly the
     * model's outlinks, and that every modelled inlink is returned by the db.
     * Resets totalLinksEver when the check passes.
     *
     * @throws IOException describing the first inconsistency found
     */
    private void checkConsistency() throws IOException {
        IWebDBReader reader = new WebDBReader(nfs, webdb);
        try {
            if (pageCount != reader.numPages()) {
                throw new IOException("DB claims " + reader.numPages() + " pages, but should be " + pageCount);
            }

            if (seenLinks.size() != reader.numLinks()) {
                throw new IOException("DB claims " + reader.numLinks() + " links, but should be " + seenLinks.size() + ". Total links since last checkConsistency: " + totalLinksEver);
            }

            for (int i = 0; i < pageCount; i++) {
                Page dbPage = reader.getPage(pages[i].getURL().toString());
                if (dbPage == null) {
                    throw new IOException("DB could not find page " + pages[i].getURL());
                }
                if (! dbPage.getURL().equals(pages[i].getURL())) {
                    throw new IOException("DB's page " + dbPage.getURL() + " should be " + pages[i].getURL());
                }
                if (! dbPage.getMD5().equals(pages[i].getMD5())) {
                    throw new IOException("Page " + pages[i].getURL() + " in the DB has an MD5 of " + dbPage.getMD5() + ", but should be " + pages[i].getMD5());
                }

                // Every modelled outlink must be in the db...
                Link dbOutlinks[] = reader.getLinks(pages[i].getMD5());
                for (Enumeration e = outlinks[i].elements(); e.hasMoreElements(); ) {
                    Link curOutlink = (Link) e.nextElement();
                    if (! containsLink(dbOutlinks, curOutlink)) {
                        throw new IOException("DB did not return Link " + curOutlink + " when asked for all links from " + pages[i].getMD5());
                    }
                }

                // ...and the db must not hold outlinks the model does not know.
                // All excess links are printed before failing.
                int numTooMany = 0;
                boolean excessLinks = false;
                for (int j = 0; j < dbOutlinks.length; j++) {
                    boolean foundLink = false;
                    for (Enumeration e = outlinks[i].elements(); e.hasMoreElements(); ) {
                        Link curOutlink = (Link) e.nextElement();
                        if (dbOutlinks[j].compareTo(curOutlink) == 0) {
                            foundLink = true;
                            break;
                        }
                    }

                    if (! foundLink) {
                        System.out.println("Found excess link in WebDB: " + dbOutlinks[j]);
                        excessLinks = true;
                        numTooMany++;
                    }
                }
                if (excessLinks) {
                    throw new IOException("DB has " + numTooMany + " too many outlinks.");
                }

                // Every modelled inlink must be returned when asking by URL.
                Vector inlinkList = (Vector) inlinks.get(pages[i].getURL().toString());
                if (inlinkList != null) {
                    Link dbInlinks[] = reader.getLinks(pages[i].getURL());
                    for (Enumeration e = inlinkList.elements(); e.hasMoreElements(); ) {
                        Link curInlink = (Link) e.nextElement();
                        if (! containsLink(dbInlinks, curInlink)) {
                            throw new IOException("DB did not return Link " + curInlink + " when asked for all links to " + pages[i].getURL());
                        }
                    }
                }
            }
        } finally {
            reader.close();
        }
        totalLinksEver = 0;
    }

    /**
     * Exercises the IWebDBReader calls one by one (pages, pagesByMD5, getPage,
     * getPages, pageExists, links, getLinks by URL and by MD5) and compares
     * their results and ordering against the model.
     *
     * @throws IOException describing the first failed expectation
     */
    private void apiTest() throws IOException {
        long urlEnumCount = 0, md5EnumCount = 0, linkEnumCount = 0;
        IWebDBReader reader = new WebDBReader(nfs, webdb);
        try {
            // pages(): strictly increasing by URL, and as many as numPages().
            System.out.println("Testing IWebDBReader.pages()...");
            Page prevPage = null;
            for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
                if (prevPage == null) {
                    prevPage = (Page) e.nextElement();
                } else {
                    Page curPage = (Page) e.nextElement();
                    if (! (prevPage.getURL().compareTo(curPage.getURL()) < 0)) {
                        throw new IOException("While enumerating by URL, page " + prevPage + " comes before " + curPage);
                    }
                    prevPage = curPage;
                }
                urlEnumCount++;
            }
            if (urlEnumCount != reader.numPages()) {
                throw new IOException("IWebDBReader call to pages() results in " + urlEnumCount + ", but IWebDBReader reports " + reader.numPages() + " items.");
            }

            // pagesByMD5(): strictly increasing by Page.compareTo, same count.
            System.out.println("Testing IWebDBReader.pagesByMD5()...");
            prevPage = null;
            for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); ) {
                if (prevPage == null) {
                    prevPage = (Page) e.nextElement();
                } else {
                    Page curPage = (Page) e.nextElement();
                    if (! (prevPage.compareTo(curPage) < 0)) {
                        throw new IOException("While enumerating by MD5, page " + prevPage + " comes before " + curPage);
                    }
                    prevPage = curPage;
                }
                md5EnumCount++;
            }
            if (md5EnumCount != reader.numPages()) {
                throw new IOException("IWebDBReader call to pagesByMD5() results in " + md5EnumCount + ", but IWebDBReader reports " + reader.numPages() + " items.");
            }

            // getPage(): every model page is retrievable by URL and compares equal.
            System.out.println("Testing IWebDBReader.getPage()...");
            for (int i = 0; i < pages.length; i++) {
                Page curPage = pages[i];
                Page resultPage = reader.getPage(curPage.getURL().toString());

                if (resultPage == null || (resultPage.compareTo(curPage) != 0)) {
                    throw new IOException("Call to IWebDBReader.getPage(" + curPage.getURL() + ") should have returned " + curPage + ", but returned " + resultPage + " instead.");
                }
            }

            // getPages(): per-MD5 page counts match the model's counts.
            System.out.println("Testing IWebDBReader.getPages()...");
            for (Iterator it = md5Hashes.keySet().iterator(); it.hasNext(); ) {
                MD5Hash curHash = (MD5Hash) it.next();
                Page pageSet[] = reader.getPages(curHash);
                int numItems = ((Integer) md5Hashes.get(curHash)).intValue();
                if (pageSet.length != numItems) {
                    throw new IOException("There should be " + numItems + " item(s) with MD5Hash " + curHash + " in the db, but IWebDBReader.getPages() reports " + pageSet.length);
                }
            }

            // pageExists(): true for the MD5 of every model page.
            System.out.println("Testing IWebDBReader.pageExists()...");
            for (int i = 0; i < pages.length; i++) {
                Page curPage = pages[i];
                if (! reader.pageExists(curPage.getMD5())) {
                    throw new IOException("IWebDBReader.pageExists() reports that a page with MD5 " + curPage.getMD5() + " is not found. It should be!");
                }
            }

            // links(): strictly increasing by Link.compareTo, count equals numLinks().
            System.out.println("Testing IWebDBReader.links()...");
            Link prevLink = null;
            for (Enumeration e = reader.links(); e.hasMoreElements(); ) {
                if (prevLink == null) {
                    prevLink = (Link) e.nextElement();
                } else {
                    Link curLink = (Link) e.nextElement();
                    if (! (prevLink.compareTo(curLink) < 0)) {
                        throw new IOException("While enumerating by Link, link " + prevLink + " comes before " + curLink);
                    }
                    prevLink = curLink;
                }
                linkEnumCount++;
            }
            if (linkEnumCount != reader.numLinks()) {
                throw new IOException("IWebDBReader call to links() results in " + linkEnumCount + ", but IWebDBReader reports " + reader.numLinks() + " items.");
            }

            // getLinks(UTF8): inlink counts per URL match the model.
            System.out.println("Testing IWebDBReader.getLinks(UTF8)...");
            for (int i = 0; i < pages.length; i++) {
                Page curPage = pages[i];
                Vector inlinkList = (Vector) inlinks.get(curPage.getURL().toString());
                Link dbInlinks[] = reader.getLinks(curPage.getURL());

                // A null array from the reader is counted as "no links" so that
                // it is compared against the model instead of being dereferenced
                // (or silently skipped when the model expects links).
                int dbInlinkCount = (dbInlinks == null) ? 0 : dbInlinks.length;

                if (inlinkList == null) {
                    if (dbInlinkCount != 0) {
                        throw new IOException("Call to IWebDBReader.getLinks(" + curPage.getURL()+ ") should return 0 links, but returns " + dbInlinkCount + " instead.");
                    }
                } else {
                    if (dbInlinkCount != inlinkList.size()) {
                        throw new IOException("Call to IWebDBReader.getLinks(" + curPage.getURL() + ") should return " + inlinkList.size() + " inlinks, but returns " + dbInlinkCount + " instead.");
                    }
                }
            }

            // getLinks(MD5Hash): outlink counts match both the model's list
            // and the page's own outlink counter.
            System.out.println("Testing IWebDBReader.getLinks(MD5Hash)...");
            for (int i = 0; i < pages.length; i++) {
                Page curPage = pages[i];
                Link dbOutlinks[] = reader.getLinks(curPage.getMD5());
                if (dbOutlinks.length != outlinks[i].size()) {
                    throw new IOException("Call to IWebDBReader.getLinks(" + curPage.getMD5() + ") should return " + outlinks[i].size() + " outlinks, but returns " + dbOutlinks.length + " instead.");
                }
                if (dbOutlinks.length != curPage.getNumOutlinks()) {
                    throw new IOException("Call to IWebDBReader.getLinks(" + curPage.getMD5() + ") should (according to Page.getNumOutlinks() return " + curPage.getNumOutlinks() + ", but returns " + dbOutlinks.length + " instead.");
                }
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Builds a string of numChars random upper-case letters 'A'..'Z'.
     *
     * @param numChars length of the result; 0 yields the empty string
     */
    private String createRandomString(int numChars) {
        StringBuffer buf = new StringBuffer();
        for (int i = 0; i < numChars; i++) {
            buf.append((char) ('A' + Math.abs(rand.nextInt() % 26)));
        }
        return buf.toString();
    }

    /**
     * Creates a page with a random URL and an MD5 that no other model page
     * uses, and records that MD5 in md5Hashes with a count of 1.
     */
    private Page createRandomPage() throws IOException {
        String curURL = "http://www.somePage." + createRandomString(20) + ".com/index.html";
        MD5Hash md5Hash = null;
        Integer hashCount = null;

        // Draw content hashes until one is found that is not in use yet.
        do {
            md5Hash = MD5Hash.digest(createRandomString(Math.abs(rand.nextInt() % 2048) + 1));
            hashCount = (Integer) md5Hashes.get(md5Hash);
        } while (hashCount != null);

        md5Hashes.put(md5Hash, new Integer(1));
        return new Page(curURL, md5Hash);
    }

    /**
     * Creates a page with a random URL that shares the MD5 of cloneSrc, and
     * increments that MD5's count in md5Hashes.
     */
    private Page createClonePage(Page cloneSrc) throws IOException {
        String curURL = "http://www.somePage." + createRandomString(20) + ".com/index.html";
        MD5Hash md5Hash = cloneSrc.getMD5();
        Integer hashCount = (Integer) md5Hashes.get(md5Hash);
        md5Hashes.put(md5Hash, (hashCount == null) ? new Integer(1) : new Integer(hashCount.intValue() + 1));
        return new Page(curURL, md5Hash);
    }

    /**
     * Creates a Link from src to dst with 1-16 characters of random anchor
     * text. The source MD5 is copied into a new MD5Hash rather than shared
     * with the page.
     */
    private Link createLink(Page src, Page dst) throws IOException {
        UTF8 targetURL = dst.getURL();
        MD5Hash srcMD5 = new MD5Hash();
        srcMD5.set(src.getMD5());
        String linkText = createRandomString(Math.abs(rand.nextInt() % 16) + 1);
        return new Link(srcMD5, src.computeDomainID(), targetURL.toString(), linkText);
    }

    /**
     * Attempts between 1 and MAX_OUTLINKS links from pages[srcIndex] to
     * randomly chosen target pages. A candidate that is already in seenLinks
     * is skipped; every other candidate is recorded in the model (outlinks,
     * inlinks, seenLinks, counters) and written through the given writer.
     *
     * @param writer   open db writer that receives the new links
     * @param srcIndex index into pages[] of the source page
     * @return the number of links actually inserted
     */
    private int makeOutlinkSet(IWebDBWriter writer, int srcIndex) throws IOException {
        int numOutlinks = Math.abs(rand.nextInt() % MAX_OUTLINKS) + 1;
        int numInserted = 0;
        for (int j = 0; j < numOutlinks; j++) {
            int targetPageIndex = Math.abs(rand.nextInt() % (maxPages));
            Page targetPage = pages[targetPageIndex];
            Link lr = createLink(pages[srcIndex], targetPage);

            if (! seenLinks.contains(lr)) {
                outlinks[srcIndex].add(lr);
                Vector inlinkList = (Vector) inlinks.get(targetPage.getURL().toString());
                if (inlinkList == null) {
                    inlinkList = new Vector();
                    inlinks.put(targetPage.getURL().toString(), inlinkList);
                }
                inlinkList.add(lr);
                writer.addLink(lr);

                linkCount++;
                totalLinksEver++;
                numInserted++;
                seenLinks.add(lr);
            }
        }
        return numInserted;
    }

    /**
     * Command-line entry point. Builds a DBTester from the arguments, runs
     * the test, and always removes the test db and closes the file system.
     *
     * @param argv (-local | -ndfs &lt;namenode:port&gt;) &lt;workingdir&gt; &lt;numPages&gt; [-seed &lt;seed&gt;]
     */
    public static void main(String argv[]) throws IOException {
        String usage = "Usage: java net.nutch.db.DBTester (-local | -ndfs <namenode:port>) <workingdir> <numPages> [-seed <seed>]";
        if (argv.length < 2) {
            System.out.println(usage);
            return;
        }

        int i = 0;
        // NOTE(review): the code below reads the working dir at argv[0], so
        // parseArgs presumably strips the file-system flags it consumes from
        // argv -- confirm against NutchFileSystem.parseArgs.
        NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
        try {
            // Fail with the usage text rather than an NPE/NumberFormatException
            // when the working dir or page count is absent.
            if (argv[i] == null || argv[i + 1] == null) {
                System.out.println(usage);
                return;
            }
            File dbDir = new File(argv[i++]);
            int numPages = Integer.parseInt(argv[i++]);

            boolean gotSeed = false;
            long seed = 0;
            for (; i < argv.length; i++) {
                if ("-seed".equals(argv[i])) {
                    // "-seed" must be followed by a value.
                    if (i + 1 >= argv.length || argv[i + 1] == null) {
                        System.out.println(usage);
                        return;
                    }
                    gotSeed = true;
                    seed = Long.parseLong(argv[i + 1]);
                    i++;
                }
            }

            DBTester tester = (gotSeed) ? new DBTester(nfs, dbDir, seed, numPages) : new DBTester(nfs, dbDir, numPages);
            try {
                tester.runTest();
            } finally {
                tester.cleanup();
            }
        } finally {
            nfs.close();
        }
    }
}