DBTester


1   /* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
2   /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3   
4   package net.nutch.db;
5   
6   import java.io.*;
7   import java.util.*;
8   
9   import net.nutch.db.*;
10  import net.nutch.io.*;
11  import net.nutch.fs.*;
12  import net.nutch.util.*;
13  import net.nutch.linkdb.*;
14  import net.nutch.pagedb.*;
15  
16  /***********************************************
17   * DBTester runs a test suite against 
18   * net.nutch.db.IWebDBWriter and net.nutch.db.IWebDBReader.
19   *
20   * It tests things by repeatedly:
21   * 1.  Adding new items and editing existing items in the WebDB,
22   * 2.  Closing down the db
23   * 3.  Making sure it's still coherent, via WebDBReader
24   * 4.  Goto 1 a bunch of times.
25   * 5.  Test the full IWebDBReader API.
26   *
27   * @author Mike Cafarella
28   ***********************************************/
29  public class DBTester {
    // Upper bound on the number of outlinks attempted per page; see
    // makeOutlinkSet(), which tries between 1 and MAX_OUTLINKS links.
    static int MAX_OUTLINKS = 20;

    // Filesystem handle handed to every WebDBWriter/WebDBReader and to
    // FileUtil.fullyDelete() in cleanup().
    NutchFileSystem nfs;
    // Seed used to construct 'rand'; printed at start-up so a run can be repeated.
    long seed;
    // Single source of randomness for all generated pages, links and edits.
    Random rand;
    // Location of the database under test (<dir>/webdb_test).
    File webdb;
    // Number of pages kept in the in-memory model.
    int maxPages;
    // Every Link currently present in the model (added in makeOutlinkSet(),
    // removed in makeEdits()).
    TreeSet seenLinks = new TreeSet();
    // Maps MD5Hash -> Integer: how many model pages currently carry that hash.
    TreeMap md5Hashes = new TreeMap();
    // pageCount: pages added by createGraph().
    // linkCount: running tally of model links; updated but not read by any check here.
    // totalLinksEver: links added since the last checkConsistency(), which
    // reports it on failure and then resets it to zero.
    long pageCount = 0, linkCount = 0, totalLinksEver = 0;
    // The model's pages, indexed 0..maxPages-1.
    Page pages[];
    // outlinks[i] is a Vector of the Link objects leaving pages[i].
    Vector outlinks[];
    // Maps a target URL (as String) -> Vector of Link objects pointing at it.
    Hashtable inlinks;
43  
    /**
     * Create a tester whose random seed is itself chosen at random.
     * Delegates to the four-argument constructor, passing
     * new Random().nextLong() as the seed.
     *
     * @param nfs       filesystem handle used for all WebDB access
     * @param dir       directory under which the "webdb_test" db is created
     * @param maxPages  number of pages to keep in the model
     * @throws IOException propagated from the four-argument constructor
     */
    public DBTester(NutchFileSystem nfs, File dir, int maxPages) throws IOException {
    this(nfs, dir, new Random().nextLong(), maxPages);
    }
49  
50      /**
51       * Create a tester object by passing in the location of the
52       * webdb and a few parameters.
53       */
54      public DBTester(NutchFileSystem nfs, File dir, long seed, int maxPages) throws IOException {
55          this.nfs = nfs;
56          this.maxPages = maxPages;
57          this.webdb = new File(dir, "webdb_test");
58          if (webdb.exists()) {
59              throw new IOException("File " + webdb + " already exists");
60          }
61          webdb.mkdirs();
62          this.seed = seed;
63  
64          WebDBWriter.createWebDB(nfs, webdb);
65  
66          this.rand = new Random(seed);
67          System.out.println("-----------------------------------------------");
68          System.out.println("DBTester created at " + new Date(System.currentTimeMillis()));
69          System.out.println("WebDB: " + webdb);
70          System.out.println("Seed: " + seed);
71          System.out.println("-----------------------------------------------");
72  
73  
74          //
75          // Create structures to hold our model
76          // webgraph.  As we build our own mini-web, 
77          // we also make changes to the WebDB.  Then we
78          // read the WebDB back and check if it matches
79          // our model.
80          pages = new Page[maxPages];
81          outlinks = new Vector[maxPages];
82          for (int i = 0; i < outlinks.length; i++) {
83              outlinks[i] = new Vector();
84          }
85          inlinks = new Hashtable();
86      }
87  
88      /**
89       * We run a series of tests against the WebDB.  We do
90       * this by first creating a model of our mini web-graph.
91       * We insert this into the webdb, close it, and then
92       * read back to make sure it matches our model.
93       *
94       * Then, we make a series of edits to the graph, making
95       * then simultaneously to the IWebDBWriter.  Again we
96       * close it down, then read it back to see if it matches.
97       *
98       * We do this repeatedly.
99       */
100     public void runTest() throws IOException {
101         // Round 1.
102         // First thing, we create a large number of 
103         // brand-new pages.  We just add them to the
104         // webdb.  Their interlink-stats are very roughly 
105         // reflective of the real world.
106         System.out.println("CREATING WEB MODEL, CHECKING CONSISTENCY");
107         createGraph();
108         // Check to see if it's correct.
109         checkConsistency();
110 
111         // Round 2 (repeated)
112         //
113         // Next, we create a number of edits.  We 
114         // select an existing page with some probability
115         // p, and choose a brand-new page with (1 - p).
116         // Perform some kind of edit to our model's page,
117         // then make the appropriate call to IWebDBWriter.
118         //
119         // After all that, check the db consistency
120         //
121         int maxTests = 10;
122         for (int i = 1; i <= maxTests; i++) {
123             System.out.println("EDIT-CONSISTENCY TEST  (" + i + " of " + maxTests + ")");
124             makeEdits();
125             checkConsistency();
126         }
127 
128         // At this point we're sure that the db is consistent
129         // with our own model.  But is it self-consistent?
130         // The final step is do an API-coverage test.
131         System.out.println("API TEST");
132         apiTest();
133 
134         //
135         // Finally, we make a bunch of random page deletes from
136         // the db, and test to make sure all Pages and Links are
137         // removed properly.  This tests the "pageGone" scenario.
138         //
139         System.out.println("DB PAGE-DELETE TEST");
140         IWebDBReader db = new WebDBReader(nfs, webdb);
141         Vector toRemove = new Vector();
142         try {
143             for (Enumeration e = db.pages(); e.hasMoreElements(); ) {
144                 Page p = (Page) e.nextElement();
145                 
146                 if (Math.abs(rand.nextInt()) % 100 == 0) {
147                     toRemove.add(p);
148                 }
149             }
150         } finally {
151             db.close();
152         }
153 
154         //
155         // Remove the randomly-chosen elements
156         //
157         IWebDBWriter dbwriter = new WebDBWriter(nfs, webdb);
158         try {
159             for (Enumeration e = toRemove.elements(); e.hasMoreElements(); ) {
160                 Page p = (Page) e.nextElement();
161                 dbwriter.deletePage(p.getURL().toString());
162             }
163         } finally {
164             dbwriter.close();
165         }
166 
167         // Test that the Pages and any inlinks are gone
168         db = new WebDBReader(nfs, webdb);
169         try {
170             for (Enumeration e = toRemove.elements(); e.hasMoreElements(); ) {
171                 Page p = (Page) e.nextElement();
172 
173                 Page result = db.getPage(p.getURL().toString());
174                 if (result != null) {
175                     // error
176                     throw new IOException("Found a Page that should have been deleted: " + result);
177                 }
178 
179                 Link results[] = db.getLinks(p.getURL());
180                 if (results.length != 0) {
181                     // error
182                     throw new IOException("Should find no inlinks for deleted URL " + p.getURL() + ", but found " + results.length + " of them.");
183                 }
184             }
185         } finally {
186             db.close();
187         }            
188 
189         System.out.println("*** TEST COMPLETE ***");
190     }
191 
    /**
     * Do away with the database.  Only do this if you
     * no longer need the evidence!
     *
     * Removes the test db location (the 'webdb' field) by handing
     * it, together with the filesystem handle, to
     * FileUtil.fullyDelete().
     *
     * @throws IOException propagated from FileUtil.fullyDelete()
     */
    public void cleanup() throws IOException {
        FileUtil.fullyDelete(nfs, webdb);
    }
199 
200     /**
201      * We create the 1st iteration of the web graph.  That
202      * means no edits or modifications.  Just adds.
203      */
204     private void createGraph() throws IOException {
205         IWebDBWriter writer = new WebDBWriter(nfs, webdb);
206         try {
207             for (int i = 0; i < maxPages; i++) {
208                 // Make some pages
209                 pages[i] = createRandomPage();
210                 writer.addPage(pages[i]);
211                 pageCount++;
212             }
213 
214             // Make some links that interconnect them
215             for (int i = 0; i < maxPages; i++) {
216                 pages[i].setNumOutlinks(makeOutlinkSet(writer, i));
217             }
218         } finally {
219             writer.close();
220         }
221     }
222 
223     /**
224      * We make a set of adds, deletes, and mods to the
225      * internal web graph.  All of these are also applied
226      * to the WebDB.
227      */
228     private void makeEdits() throws IOException {
229         IWebDBWriter writer = new WebDBWriter(nfs, webdb);
230         try {
231             int actions[] = new int[pages.length];
232             
233             for (int i = 0; i < maxPages; i++) {
234                 Page curPage = pages[i];
235 
236                 // We will either delete, edit, or leave it alone
237                 int action = Math.abs(rand.nextInt() % 2);
238                 actions[i] = action;
239                 if (action == 0) {
240                     // Get rid of the page
241                     Integer   hashCount = (Integer  ) md5Hashes.get(curPage.getMD5());
242                     if (hashCount.intValue() == 1) {
243                         md5Hashes.remove(curPage.getMD5());
244                     } else {
245                         md5Hashes.put(curPage.getMD5(), new Integer  (hashCount.intValue() - 1));
246                     }
247                     pages[i] = null;
248                     writer.deletePage(curPage.getURL().toString());
249                     linkCount -= outlinks[i].size();
250                     
251                     // Delete all the outlinks from our webgraph
252                     // structures.
253                     // 
254                     // First, iterate through the list of all the outlinks
255                     // we're about to get rid of.
256                     for (Enumeration e = outlinks[i].elements(); e.hasMoreElements(); ) {
257                         Link curOutlink = (Link) e.nextElement();
258 
259                         // Remove each outlink from the "seenLinks" table.
260                         seenLinks.remove(curOutlink);
261 
262                         // Remove each outlink from the "inlink" tables.
263                         // We need to find the target URL's inlinkList,
264                         // and remove the handle to the curOutlink.
265                         //
266                         // First, get the target's inlinkList.
267                         int removeIndex = -1, pos = 0;
268                         Vector inlinkList = (Vector) inlinks.get(curOutlink.getURL().toString());
269                         // Find the position where the curOutlink appears.
270                         for (Enumeration e2 = inlinkList.elements(); e2.hasMoreElements(); pos++) {
271                             Link curInlink = (Link) e2.nextElement();
272                             if (curInlink.getFromID().equals(curOutlink.getFromID())) {
273                                 removeIndex = pos;
274                                 break;
275                             }
276                         }
277 
278                         // Remove the curOutlink from the target's inlink list
279                         if (removeIndex >= 0) {
280                             inlinkList.removeElementAt(removeIndex);
281                         }
282                     }
283 
284                     // Just clear all the links out.
285                     outlinks[i].clear();
286 
287                     // Create a new one to replace it!
288                     pages[i] = createRandomPage();
289 
290                     // Will add new links after this loop, 
291                     // once all pages are created.
292                 } else if (action == 1) {
293                     // Modify the page's MD5.
294                     Integer   hashCount = (Integer  ) md5Hashes.get(curPage.getMD5());
295                     if (hashCount.intValue() == 1) {
296                         md5Hashes.remove(curPage.getMD5());
297                     } else {
298                         md5Hashes.put(curPage.getMD5(), new Integer  (hashCount.intValue() - 1));
299                     }
300 
301                     // We need a unique md5 hash, because
302                     // otherwise we need to maintain models
303                     // of page contents.  That is too much
304                     // for now, though might eventually
305                     // be a good idea.
306                     MD5Hash md5Hash = null;
307                     do {
308                         md5Hash = MD5Hash.digest(createRandomString(Math.abs(rand.nextInt() % 2048)));
309                         hashCount = (Integer  ) md5Hashes.get(md5Hash);
310                     } while (hashCount != null);
311 
312                     md5Hashes.put(md5Hash, new Integer  (1));
313                     pages[i].setMD5(md5Hash);
314 
315                     // We're going to generate new Outlinks.
316                     // (However, the Page's URL stays the same,
317                     // so all inlinks to this URL remain untouched.)
318                     linkCount -= outlinks[i].size();
319 
320                     //
321                     // Delete all of the outlinks from our webgraph
322                     // structures.
323                     //
324                     // First, iterate through the list of all the outlinks
325                     // we're about to get rid of.
326                     for (Enumeration e = outlinks[i].elements(); e.hasMoreElements(); ) {
327                         Link curOutlink = (Link) e.nextElement();
328 
329                         // Remove each outlink from the "seenLinks" table
330                         seenLinks.remove(curOutlink);
331 
332                         // Remove each outlink from the "inlink" tables.
333                         // We need to find the target URL's inlinkList,
334                         // and remove the handle to the curOutlink.
335                         //
336                         // First, get the target's inlinkList
337                         int removeIndex = -1, pos = 0;                        
338                         Vector inlinkList = (Vector) inlinks.get(curOutlink.getURL().toString());
339                         // Find the position where the curOutlink appears
340                         for (Enumeration e2 = inlinkList.elements(); e2.hasMoreElements(); pos++) {
341                             Link curLink = (Link) e2.nextElement();
342                             if (curLink.getFromID().equals(curOutlink.getFromID())) {
343                                 removeIndex = pos;
344                                 break;
345                             }
346                         }
347                         
348                         // Remove the curOutlink from the target's inlink list
349                         if (removeIndex >= 0) {
350                             inlinkList.removeElementAt(removeIndex);
351                         }
352                     }
353 
354                     // Clear all the links out.
355                     // Set the Page's number of outlinks to zero.
356                     outlinks[i].clear();
357                     pages[i].setNumOutlinks(0);
358 
359                     // Will add new links after this loop...
360                 } 
361                 // Otherwise, leave things alone!
362             }
363 
364             // Now that we've built all the pages, add in the
365             // outlinks
366             for (int i = 0; i < maxPages; i++) {
367                 if ((actions[i] == 0) || (actions[i] == 1)) {
368                     // Make the necessary outlinks for this new page!
369                     pages[i].setNumOutlinks(makeOutlinkSet(writer, i));
370                     writer.addPage(pages[i]);
371                 }
372             }
373         } finally {
374             writer.close();
375         }
376     }
377 
378     /**
379      * The checkConsistency() function will load in the
380      * db from disk, and match it against the in-memory
381      * representation.
382      */
383     private void checkConsistency() throws IOException {
384         IWebDBReader reader = new WebDBReader(nfs, webdb);
385         try {
386             // Make sure counts match.
387             if (pageCount != reader.numPages()) {
388                 throw new IOException("DB claims " + reader.numPages() + " pages, but should be " + pageCount);
389             }
390 
391             if (seenLinks.size() != reader.numLinks()) {
392                 throw new IOException("DB claims " + reader.numLinks() + " links, but should be " + seenLinks.size() + ".  Total links since last checkConsistency: " + totalLinksEver);
393             }
394 
395             // Go through every page....
396             for (int i = 0; i < pageCount; i++) {
397                 // First, check coverage of the page set.
398                 Page dbPage = reader.getPage(pages[i].getURL().toString());
399                 if (dbPage == null) {
400                     throw new IOException("DB could not find page " + pages[i].getURL());
401                 }
402                 if (! dbPage.getURL().equals(pages[i].getURL())) {
403                     throw new IOException("DB's page " + dbPage.getURL() + " should be " + pages[i].getURL());
404                 }
405                 if (! dbPage.getMD5().equals(pages[i].getMD5())) {
406                     throw new IOException("Page " + pages[i].getURL() + " in the DB has an MD5 of " + dbPage.getMD5() + ", but should be " + pages[i].getMD5());
407                 }
408 
409                 // Next, the outlinks from that page.  Go through
410                 // every one of the links we think it should have,
411                 // and make sure it is there.
412                 Link dbOutlinks[] = reader.getLinks(pages[i].getMD5());
413                 for (Enumeration e = outlinks[i].elements(); e.hasMoreElements(); ) {
414                     Link curOutlink = (Link) e.nextElement();
415                     boolean foundLink = false;
416                     for (int j = 0; j < dbOutlinks.length; j++) {
417                         if (dbOutlinks[j].compareTo(curOutlink) == 0) {
418                             foundLink = true;
419                             break;
420                         }
421                     }
422                     if (! foundLink) {
423                         throw new IOException("DB did not return Link " + curOutlink + " when asked for all links from " + pages[i].getMD5());
424                     }
425                 }
426 
427                 // We also want to test whether there are some 
428                 // links in the DB which should not be there.
429                 // (Yes, this is caught by the above counting
430                 // test, but we want to find out *which* urls
431                 // are the "extra" ones.)
432                 int numTooMany = 0;
433                 boolean excessLinks = false;
434                 for (int j = 0; j < dbOutlinks.length; j++) {
435                     boolean foundLink = false;
436                     for (Enumeration e = outlinks[i].elements(); e.hasMoreElements(); ) {
437                         Link curOutlink = (Link) e.nextElement();
438                         if (dbOutlinks[j].compareTo(curOutlink) == 0) {
439                             foundLink = true;
440                             break;
441                         }
442                     }
443 
444                     if (! foundLink) {
445                         System.out.println("Found excess link in WebDB: " + dbOutlinks[j]);
446                         excessLinks = true;
447                         numTooMany++;
448                     }
449                 }
450                 if (excessLinks) {
451                     throw new IOException("DB has " + numTooMany + " too many outlinks.");
452                 }
453 
454 
455 
456                 // Finally, the links *to* that page.
457                 Vector inlinkList = (Vector) inlinks.get(pages[i].getURL().toString());
458                 if (inlinkList != null) {
459                     Link dbInlinks[] = reader.getLinks(pages[i].getURL());
460                     for (Enumeration e = inlinkList.elements(); e.hasMoreElements(); ) {
461                         Link curInlink = (Link) e.nextElement();
462                         boolean foundLink = false;
463                         for (int j = 0; j < dbInlinks.length; j++) {
464                             if (dbInlinks[j].compareTo(curInlink) == 0) {
465                                 foundLink = true;
466                                 break;
467                             }
468                         }
469                         if (! foundLink) {
470                             throw new IOException("DB did not return Link " + curInlink + " when asked for all links to " + pages[i].getURL());
471                         }
472                     }
473                 }
474             }
475         } finally {
476             reader.close();
477         }
478         totalLinksEver = 0;
479     }
480 
481     /**
482      * apiTest() will run through all the methods of
483      * IWebDBReader and make sure they give correct
484      * answers.  We might use the internal model as a 
485      * source of items that are in the webdb, but we aren't 
486      * trying to perform a full consistency check - that's 
487      * done in checkConsistency().
488      */
489     private void apiTest() throws IOException {
490         long urlEnumCount = 0, md5EnumCount = 0, linkEnumCount = 0;
491         IWebDBReader reader = new WebDBReader(nfs, webdb);
492         try {
493             //
494             // PAGE OPERATIONS
495             //
496 
497             // 1.  Test pages() and numPages()
498             System.out.println("Testing IWebDBReader.pages()...");
499             Page prevPage = null;
500             for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
501                 if (prevPage == null) {
502                     prevPage = (Page) e.nextElement();
503                 } else {
504                     Page curPage = (Page) e.nextElement();
505                     if (! (prevPage.getURL().compareTo(curPage.getURL()) < 0)) {
506                         throw new IOException("While enumerating by URL, page " + prevPage + " comes before " + curPage);
507                     }
508                     prevPage = curPage;
509                 }
510                 urlEnumCount++;
511             }
512             if (urlEnumCount != reader.numPages()) {
513                 throw new IOException("IWebDBReader call to pages() results in " + urlEnumCount + ", but IWebDBReader reports " + reader.numPages() + " items.");
514             }
515 
516             // 2.  Test pagesByMD5().
517             System.out.println("Testing IWebDBReader.pagesByMD5()...");
518             prevPage = null;
519             for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); ) {
520                 if (prevPage == null) {
521                     prevPage = (Page) e.nextElement();
522                 } else {
523                     Page curPage = (Page) e.nextElement();
524                     if (! (prevPage.compareTo(curPage) < 0)) {
525                         throw new IOException("While enumerating by MD5, page " + prevPage + " comes before " + curPage);
526                     }
527                     prevPage = curPage;
528                 }
529                 md5EnumCount++;
530             }
531             if (md5EnumCount != reader.numPages()) {
532                 throw new IOException("IWebDBReader call to pagesByMD5() results in " + md5EnumCount + ", but IWebDBReader reports " + reader.numPages() + " items.");
533             }
534 
535             // 3.  Test getPage(String) method.
536             System.out.println("Testing IWebDBReader.getPage()...");
537             for (int i = 0; i < pages.length; i++) {
538                 Page curPage = pages[i];
539                 Page resultPage = reader.getPage(curPage.getURL().toString());
540 
541                 if (resultPage == null || (resultPage.compareTo(curPage) != 0)) {
542                     throw new IOException("Call to IWebDBReader.getPage(" + curPage.getURL() + ") should have returned " + curPage + ", but returned " + resultPage + " instead.");
543                 }
544             }
545 
546             // 4.  Test getPages(MD5Hash)
547             System.out.println("Testing IWebDBReader.getPages()...");
548             for (Iterator it = md5Hashes.keySet().iterator(); it.hasNext(); ) {
549                 MD5Hash curHash = (MD5Hash) it.next();
550                 Page pageSet[] = reader.getPages(curHash);
551                 int numItems = ((Integer  ) md5Hashes.get(curHash)).intValue();
552                 if (pageSet.length != numItems) {
553                     throw new IOException("There should be " + numItems + " item(s) with MD5Hash " + curHash + " in the db, but IWebDBReader.getPages() reports " + pageSet.length);
554                 }
555             }
556 
557             // 5.  Test pageExists(MD5Hash)
558             System.out.println("Testing IWebDBReader.pageExists()...");
559             for (int i = 0; i < pages.length; i++) {
560                 Page curPage = pages[i];
561                 if (! reader.pageExists(curPage.getMD5())) {
562                     throw new IOException("IWebDBReader.pageExists() reports that a page with MD5 " + curPage.getMD5() + " is not found.  It should be!");
563                 }
564             }
565 
566 
567             //
568             // LINK OPERATIONS
569             //
570             
571             // 1.  Test links() and numLinks(), and generate a list
572             // of items to test later on.
573             System.out.println("Testing IWebDBReader.links()...");
574             Link prevLink = null;
575             for (Enumeration e = reader.links(); e.hasMoreElements(); ) {
576                 if (prevLink == null) {
577                     prevLink = (Link) e.nextElement();
578                 } else {
579                     Link curLink = (Link) e.nextElement();
580                     if (! (prevLink.compareTo(curLink) < 0)) {
581                         throw new IOException("While enumerating by Link, link " + prevLink + " comes before " + curLink);
582                     }
583                     prevLink = curLink;
584                 }
585                 linkEnumCount++;
586             }
587             if (linkEnumCount != reader.numLinks()) {
588                 throw new IOException("IWebDBReader call to links() results in " + linkEnumCount + ", but IWebDBReader reports " + reader.numLinks() + " items.");
589             }
590 
591             // 2.  Test getLinks(UTF8)
592             System.out.println("Testing IWebDBReader.getLinks(UTF8)...");
593             for (int i = 0; i < pages.length; i++) {
594                 Page curPage = pages[i];
595                 Vector inlinkList = (Vector) inlinks.get(curPage.getURL().toString());
596                 Link dbInlinks[] = reader.getLinks(curPage.getURL());
597 
598                 if (inlinkList == null || dbInlinks == null) {
599                     if ((inlinkList == null || inlinkList.size() == 0) &&
600                         (dbInlinks.length != 0)) {
601                         throw new IOException("Call to IWebDBReader.getLinks(" + curPage.getURL()+ ") should return 0 links, but returns " + dbInlinks.length + " instead.");
602                     }
603                 } else {
604                     if (dbInlinks.length != inlinkList.size()) {
605                         throw new IOException("Call to IWebDBReader.getLinks(" + curPage.getURL() + ") should return " + inlinkList.size() + " inlinks, but returns " + dbInlinks.length + " instead.");
606                     }
607                 }
608             }
609 
610             // 3.  Test getLinks(MD5Hash)
611             System.out.println("Testing IWebDBReader.getLinks(MD5Hash)...");
612             for (int i = 0; i < pages.length; i++) {
613                 Page curPage = pages[i];
614                 Link dbOutlinks[] = reader.getLinks(curPage.getMD5());
615                 if (dbOutlinks.length != outlinks[i].size()) {
616                     throw new IOException("Call to IWebDBReader.getLinks(" + curPage.getMD5() + ") should return " + outlinks[i].size() + " outlinks, but returns " + dbOutlinks.length + " instead.");
617                 }
618                 if (dbOutlinks.length != curPage.getNumOutlinks()) {
619                     throw new IOException("Call to IWebDBReader.getLinks(" + curPage.getMD5() + ") should (according to Page.getNumOutlinks() return " + curPage.getNumOutlinks() + ", but returns " + dbOutlinks.length + " instead.");
620                 }
621             }
622         } finally {
623             reader.close();
624         }
625     }
626 
627     /**
628      * Return a string of numChars length.  This is good for
629      * anchors and URLs.
630      */
631     private String   createRandomString(int numChars) {
632         StringBuffer   buf = new StringBuffer  ();
633         for (int i = 0; i < numChars; i++) {
634             buf.append((char) ('A' + Math.abs(rand.nextInt() % 26)));
635         }
636         return buf.toString();
637     }
638 
639     /**
640      * Internal utility method that manufactures a brand-new
641      * novel page.
642      */
643     private Page createRandomPage() throws IOException {
644         String   curURL = "http://www.somePage." + createRandomString(20) + ".com/index.html";
645         MD5Hash md5Hash = null;
646         Integer   hashCount = null;
647 
648         // Keep generating random contents until we have a unique
649         // one.  otherwise we need to maintain models of page contents.
650         // As mentioned, that's too much for now, but maybe someday.
651         do {
652             md5Hash = MD5Hash.digest(createRandomString(Math.abs(rand.nextInt() % 2048) + 1));
653             hashCount = (Integer  ) md5Hashes.get(md5Hash);
654         } while (hashCount != null);
655 
656         md5Hashes.put(md5Hash, new Integer  (1));
657         return new Page(curURL, md5Hash);
658     }
659 
660     /**
661      * The createClonePage() is used to test duplicate MD5s
662      * in our webdb.  This is why we worry about tracking MD5s
663      * in the md5Hashes table.  REMIND - It is not used yet
664      */
665     private Page createClonePage(Page cloneSrc) throws IOException {
666         String   curURL = "http://www.somePage." + createRandomString(20) + ".com/index.html";
667         MD5Hash md5Hash = cloneSrc.getMD5();
668         Integer   hashCount = (Integer  ) md5Hashes.get(md5Hash);
669         md5Hashes.put(md5Hash, (hashCount == null) ? new Integer  (1) : new Integer  (hashCount.intValue() + 1));
670         return new Page(curURL, md5Hash);
671     }
672 
673     /**
674      * Internal method that makes a Link between two
675      * pages.
676      */
677     private Link createLink(Page src, Page dst) throws IOException {
678         UTF8 targetURL = dst.getURL();
679         MD5Hash srcMD5 = new MD5Hash();
680         srcMD5.set(src.getMD5());
681         String   linkText = createRandomString(Math.abs(rand.nextInt() % 16) + 1);
682         return new Link(srcMD5, src.computeDomainID(), targetURL.toString(), linkText);
683     }
684 
685     /**
686      * We make a randomized set of outlinks from the given
687      * page to any number of other pages.
688      *
689      * Returns number of outlinks generated.
690      */
691     private int makeOutlinkSet(IWebDBWriter writer, int srcIndex) throws IOException {
692         // Create the links for this new page!
693         int numOutlinks = Math.abs(rand.nextInt() % MAX_OUTLINKS) + 1;
694         int numInserted = 0;
695         for (int j = 0; j < numOutlinks; j++) {
696             int targetPageIndex = Math.abs(rand.nextInt() % (maxPages));
697             Page targetPage = pages[targetPageIndex];
698             Link lr = createLink(pages[srcIndex], targetPage);
699 
700             // See if we've made this link before
701             if (! seenLinks.contains(lr)) {
702                 outlinks[srcIndex].add(lr);
703                 Vector inlinkList = (Vector) inlinks.get(targetPage.getURL().toString());
704                 if (inlinkList == null) {
705                     inlinkList = new Vector();
706                     inlinks.put(targetPage.getURL().toString(), inlinkList);
707                 }
708                 inlinkList.add(lr);
709                 writer.addLink(lr);
710 
711                 linkCount++;
712                 totalLinksEver++;
713                 numInserted++;
714                 seenLinks.add(lr);
715             }
716         }
717         return numInserted;
718     }
719 
720     /**
721      * The command-line takes a location to put temporary work
722      * files, the number of pages to use in the test set, and
723      * (optionally) a seed for the random num-generator.
724      */
725     public static void main(String   argv[]) throws IOException {
726         if (argv.length < 2) {
727         System.out.println("Usage: java net.nutch.db.DBTester (-local | -ndfs <namenode:port>) <workingdir> <numPages> [-seed <seed>]");
728             return;
729         }
730 
731         // Parse args
732         int i = 0;
733         NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
734         try {
735             File dbDir = new File(argv[i++]);
736             int numPages = Integer.parseInt(argv[i++]);
737 
738             boolean gotSeed = false;
739             long seed = 0;
740             for (; i < argv.length; i++) {
741                 if ("-seed".equals(argv[i])) {
742                     gotSeed = true;
743                     seed = Long.parseLong(argv[i+1]);
744                     i++;
745                 }
746             }
747 
748             DBTester tester = (gotSeed) ? new DBTester(nfs, dbDir, seed, numPages) : new DBTester(nfs, dbDir, numPages);
749             try {
750                 tester.runTest();
751             } finally {
752                 tester.cleanup();
753             }
754         } finally {
755             nfs.close();
756         }
757     }
758 }
759
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Free Books Free Magazines
Popular Tags