KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > db > DBTester


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.db;
5
6 import java.io.*;
7 import java.util.*;
8
9 import net.nutch.db.*;
10 import net.nutch.io.*;
11 import net.nutch.fs.*;
12 import net.nutch.util.*;
13 import net.nutch.linkdb.*;
14 import net.nutch.pagedb.*;
15
16 /***********************************************
17  * DBTester runs a test suite against
18  * net.nutch.db.IWebDBWriter and net.nutch.db.IWebDBReader.
19  *
20  * It tests things by repeatedly:
21  * 1. Adding new items and editing existing items in the WebDB,
22  * 2. Closing down the db
23  * 3. Making sure it's still coherent, via WebDBReader
24  * 4. Goto 1 a bunch of times.
25  * 5. Test the full IWebDBReader API.
26  *
27  * @author Mike Cafarella
28  ***********************************************/

29 public class DBTester {
30     static int MAX_OUTLINKS = 20;
31
32     NutchFileSystem nfs;
33     long seed;
34     Random rand;
35     File webdb;
36     int maxPages;
37     TreeSet seenLinks = new TreeSet();
38     TreeMap md5Hashes = new TreeMap();
39     long pageCount = 0, linkCount = 0, totalLinksEver = 0;
40     Page pages[];
41     Vector outlinks[];
42     Hashtable inlinks;
43
44     /**
45      */

46     public DBTester(NutchFileSystem nfs, File dir, int maxPages) throws IOException {
47     this(nfs, dir, new Random().nextLong(), maxPages);
48     }
49
50     /**
51      * Create a tester object by passing in the location of the
52      * webdb and a few parameters.
53      */

54     public DBTester(NutchFileSystem nfs, File dir, long seed, int maxPages) throws IOException {
55         this.nfs = nfs;
56         this.maxPages = maxPages;
57         this.webdb = new File(dir, "webdb_test");
58         if (webdb.exists()) {
59             throw new IOException("File " + webdb + " already exists");
60         }
61         webdb.mkdirs();
62         this.seed = seed;
63
64         WebDBWriter.createWebDB(nfs, webdb);
65
66         this.rand = new Random(seed);
67         System.out.println("-----------------------------------------------");
68         System.out.println("DBTester created at " + new Date(System.currentTimeMillis()));
69         System.out.println("WebDB: " + webdb);
70         System.out.println("Seed: " + seed);
71         System.out.println("-----------------------------------------------");
72
73
74         //
75
// Create structures to hold our model
76
// webgraph. As we build our own mini-web,
77
// we also make changes to the WebDB. Then we
78
// read the WebDB back and check if it matches
79
// our model.
80
pages = new Page[maxPages];
81         outlinks = new Vector[maxPages];
82         for (int i = 0; i < outlinks.length; i++) {
83             outlinks[i] = new Vector();
84         }
85         inlinks = new Hashtable();
86     }
87
88     /**
89      * We run a series of tests against the WebDB. We do
90      * this by first creating a model of our mini web-graph.
91      * We insert this into the webdb, close it, and then
92      * read back to make sure it matches our model.
93      *
94      * Then, we make a series of edits to the graph, making
95      * then simultaneously to the IWebDBWriter. Again we
96      * close it down, then read it back to see if it matches.
97      *
98      * We do this repeatedly.
99      */

100     public void runTest() throws IOException {
101         // Round 1.
102
// First thing, we create a large number of
103
// brand-new pages. We just add them to the
104
// webdb. Their interlink-stats are very roughly
105
// reflective of the real world.
106
System.out.println("CREATING WEB MODEL, CHECKING CONSISTENCY");
107         createGraph();
108         // Check to see if it's correct.
109
checkConsistency();
110
111         // Round 2 (repeated)
112
//
113
// Next, we create a number of edits. We
114
// select an existing page with some probability
115
// p, and choose a brand-new page with (1 - p).
116
// Perform some kind of edit to our model's page,
117
// then make the appropriate call to IWebDBWriter.
118
//
119
// After all that, check the db consistency
120
//
121
int maxTests = 10;
122         for (int i = 1; i <= maxTests; i++) {
123             System.out.println("EDIT-CONSISTENCY TEST (" + i + " of " + maxTests + ")");
124             makeEdits();
125             checkConsistency();
126         }
127
128         // At this point we're sure that the db is consistent
129
// with our own model. But is it self-consistent?
130
// The final step is do an API-coverage test.
131
System.out.println("API TEST");
132         apiTest();
133
134         //
135
// Finally, we make a bunch of random page deletes from
136
// the db, and test to make sure all Pages and Links are
137
// removed properly. This tests the "pageGone" scenario.
138
//
139
System.out.println("DB PAGE-DELETE TEST");
140         IWebDBReader db = new WebDBReader(nfs, webdb);
141         Vector toRemove = new Vector();
142         try {
143             for (Enumeration e = db.pages(); e.hasMoreElements(); ) {
144                 Page p = (Page) e.nextElement();
145                 
146                 if (Math.abs(rand.nextInt()) % 100 == 0) {
147                     toRemove.add(p);
148                 }
149             }
150         } finally {
151             db.close();
152         }
153
154         //
155
// Remove the randomly-chosen elements
156
//
157
IWebDBWriter dbwriter = new WebDBWriter(nfs, webdb);
158         try {
159             for (Enumeration e = toRemove.elements(); e.hasMoreElements(); ) {
160                 Page p = (Page) e.nextElement();
161                 dbwriter.deletePage(p.getURL().toString());
162             }
163         } finally {
164             dbwriter.close();
165         }
166
167         // Test that the Pages and any inlinks are gone
168
db = new WebDBReader(nfs, webdb);
169         try {
170             for (Enumeration e = toRemove.elements(); e.hasMoreElements(); ) {
171                 Page p = (Page) e.nextElement();
172
173                 Page result = db.getPage(p.getURL().toString());
174                 if (result != null) {
175                     // error
176
throw new IOException("Found a Page that should have been deleted: " + result);
177                 }
178
179                 Link results[] = db.getLinks(p.getURL());
180                 if (results.length != 0) {
181                     // error
182
throw new IOException("Should find no inlinks for deleted URL " + p.getURL() + ", but found " + results.length + " of them.");
183                 }
184             }
185         } finally {
186             db.close();
187         }
188
189         System.out.println("*** TEST COMPLETE ***");
190     }
191
192     /**
193      * Do away with the database. Only do this if you
194      * no longer need the evidence!
195      */

196     public void cleanup() throws IOException {
197         FileUtil.fullyDelete(nfs, webdb);
198     }
199
200     /**
201      * We create the 1st iteration of the web graph. That
202      * means no edits or modifications. Just adds.
203      */

204     private void createGraph() throws IOException {
205         IWebDBWriter writer = new WebDBWriter(nfs, webdb);
206         try {
207             for (int i = 0; i < maxPages; i++) {
208                 // Make some pages
209
pages[i] = createRandomPage();
210                 writer.addPage(pages[i]);
211                 pageCount++;
212             }
213
214             // Make some links that interconnect them
215
for (int i = 0; i < maxPages; i++) {
216                 pages[i].setNumOutlinks(makeOutlinkSet(writer, i));
217             }
218         } finally {
219             writer.close();
220         }
221     }
222
223     /**
224      * We make a set of adds, deletes, and mods to the
225      * internal web graph. All of these are also applied
226      * to the WebDB.
227      */

228     private void makeEdits() throws IOException {
229         IWebDBWriter writer = new WebDBWriter(nfs, webdb);
230         try {
231             int actions[] = new int[pages.length];
232             
233             for (int i = 0; i < maxPages; i++) {
234                 Page curPage = pages[i];
235
236                 // We will either delete, edit, or leave it alone
237
int action = Math.abs(rand.nextInt() % 2);
238                 actions[i] = action;
239                 if (action == 0) {
240                     // Get rid of the page
241
Integer JavaDoc hashCount = (Integer JavaDoc) md5Hashes.get(curPage.getMD5());
242                     if (hashCount.intValue() == 1) {
243                         md5Hashes.remove(curPage.getMD5());
244                     } else {
245                         md5Hashes.put(curPage.getMD5(), new Integer JavaDoc(hashCount.intValue() - 1));
246                     }
247                     pages[i] = null;
248                     writer.deletePage(curPage.getURL().toString());
249                     linkCount -= outlinks[i].size();
250                     
251                     // Delete all the outlinks from our webgraph
252
// structures.
253
//
254
// First, iterate through the list of all the outlinks
255
// we're about to get rid of.
256
for (Enumeration e = outlinks[i].elements(); e.hasMoreElements(); ) {
257                         Link curOutlink = (Link) e.nextElement();
258
259                         // Remove each outlink from the "seenLinks" table.
260
seenLinks.remove(curOutlink);
261
262                         // Remove each outlink from the "inlink" tables.
263
// We need to find the target URL's inlinkList,
264
// and remove the handle to the curOutlink.
265
//
266
// First, get the target's inlinkList.
267
int removeIndex = -1, pos = 0;
268                         Vector inlinkList = (Vector) inlinks.get(curOutlink.getURL().toString());
269                         // Find the position where the curOutlink appears.
270
for (Enumeration e2 = inlinkList.elements(); e2.hasMoreElements(); pos++) {
271                             Link curInlink = (Link) e2.nextElement();
272                             if (curInlink.getFromID().equals(curOutlink.getFromID())) {
273                                 removeIndex = pos;
274                                 break;
275                             }
276                         }
277
278                         // Remove the curOutlink from the target's inlink list
279
if (removeIndex >= 0) {
280                             inlinkList.removeElementAt(removeIndex);
281                         }
282                     }
283
284                     // Just clear all the links out.
285
outlinks[i].clear();
286
287                     // Create a new one to replace it!
288
pages[i] = createRandomPage();
289
290                     // Will add new links after this loop,
291
// once all pages are created.
292
} else if (action == 1) {
293                     // Modify the page's MD5.
294
Integer JavaDoc hashCount = (Integer JavaDoc) md5Hashes.get(curPage.getMD5());
295                     if (hashCount.intValue() == 1) {
296                         md5Hashes.remove(curPage.getMD5());
297                     } else {
298                         md5Hashes.put(curPage.getMD5(), new Integer JavaDoc(hashCount.intValue() - 1));
299                     }
300
301                     // We need a unique md5 hash, because
302
// otherwise we need to maintain models
303
// of page contents. That is too much
304
// for now, though might eventually
305
// be a good idea.
306
MD5Hash md5Hash = null;
307                     do {
308                         md5Hash = MD5Hash.digest(createRandomString(Math.abs(rand.nextInt() % 2048)));
309                         hashCount = (Integer JavaDoc) md5Hashes.get(md5Hash);
310                     } while (hashCount != null);
311
312                     md5Hashes.put(md5Hash, new Integer JavaDoc(1));
313                     pages[i].setMD5(md5Hash);
314
315                     // We're going to generate new Outlinks.
316
// (However, the Page's URL stays the same,
317
// so all inlinks to this URL remain untouched.)
318
linkCount -= outlinks[i].size();
319
320                     //
321
// Delete all of the outlinks from our webgraph
322
// structures.
323
//
324
// First, iterate through the list of all the outlinks
325
// we're about to get rid of.
326
for (Enumeration e = outlinks[i].elements(); e.hasMoreElements(); ) {
327                         Link curOutlink = (Link) e.nextElement();
328
329                         // Remove each outlink from the "seenLinks" table
330
seenLinks.remove(curOutlink);
331
332                         // Remove each outlink from the "inlink" tables.
333
// We need to find the target URL's inlinkList,
334
// and remove the handle to the curOutlink.
335
//
336
// First, get the target's inlinkList
337
int removeIndex = -1, pos = 0;
338                         Vector inlinkList = (Vector) inlinks.get(curOutlink.getURL().toString());
339                         // Find the position where the curOutlink appears
340
for (Enumeration e2 = inlinkList.elements(); e2.hasMoreElements(); pos++) {
341                             Link curLink = (Link) e2.nextElement();
342                             if (curLink.getFromID().equals(curOutlink.getFromID())) {
343                                 removeIndex = pos;
344                                 break;
345                             }
346                         }
347                         
348                         // Remove the curOutlink from the target's inlink list
349
if (removeIndex >= 0) {
350                             inlinkList.removeElementAt(removeIndex);
351                         }
352                     }
353
354                     // Clear all the links out.
355
// Set the Page's number of outlinks to zero.
356
outlinks[i].clear();
357                     pages[i].setNumOutlinks(0);
358
359                     // Will add new links after this loop...
360
}
361                 // Otherwise, leave things alone!
362
}
363
364             // Now that we've built all the pages, add in the
365
// outlinks
366
for (int i = 0; i < maxPages; i++) {
367                 if ((actions[i] == 0) || (actions[i] == 1)) {
368                     // Make the necessary outlinks for this new page!
369
pages[i].setNumOutlinks(makeOutlinkSet(writer, i));
370                     writer.addPage(pages[i]);
371                 }
372             }
373         } finally {
374             writer.close();
375         }
376     }
377
378     /**
379      * The checkConsistency() function will load in the
380      * db from disk, and match it against the in-memory
381      * representation.
382      */

383     private void checkConsistency() throws IOException {
384         IWebDBReader reader = new WebDBReader(nfs, webdb);
385         try {
386             // Make sure counts match.
387
if (pageCount != reader.numPages()) {
388                 throw new IOException("DB claims " + reader.numPages() + " pages, but should be " + pageCount);
389             }
390
391             if (seenLinks.size() != reader.numLinks()) {
392                 throw new IOException("DB claims " + reader.numLinks() + " links, but should be " + seenLinks.size() + ". Total links since last checkConsistency: " + totalLinksEver);
393             }
394
395             // Go through every page....
396
for (int i = 0; i < pageCount; i++) {
397                 // First, check coverage of the page set.
398
Page dbPage = reader.getPage(pages[i].getURL().toString());
399                 if (dbPage == null) {
400                     throw new IOException("DB could not find page " + pages[i].getURL());
401                 }
402                 if (! dbPage.getURL().equals(pages[i].getURL())) {
403                     throw new IOException("DB's page " + dbPage.getURL() + " should be " + pages[i].getURL());
404                 }
405                 if (! dbPage.getMD5().equals(pages[i].getMD5())) {
406                     throw new IOException("Page " + pages[i].getURL() + " in the DB has an MD5 of " + dbPage.getMD5() + ", but should be " + pages[i].getMD5());
407                 }
408
409                 // Next, the outlinks from that page. Go through
410
// every one of the links we think it should have,
411
// and make sure it is there.
412
Link dbOutlinks[] = reader.getLinks(pages[i].getMD5());
413                 for (Enumeration e = outlinks[i].elements(); e.hasMoreElements(); ) {
414                     Link curOutlink = (Link) e.nextElement();
415                     boolean foundLink = false;
416                     for (int j = 0; j < dbOutlinks.length; j++) {
417                         if (dbOutlinks[j].compareTo(curOutlink) == 0) {
418                             foundLink = true;
419                             break;
420                         }
421                     }
422                     if (! foundLink) {
423                         throw new IOException("DB did not return Link " + curOutlink + " when asked for all links from " + pages[i].getMD5());
424                     }
425                 }
426
427                 // We also want to test whether there are some
428
// links in the DB which should not be there.
429
// (Yes, this is caught by the above counting
430
// test, but we want to find out *which* urls
431
// are the "extra" ones.)
432
int numTooMany = 0;
433                 boolean excessLinks = false;
434                 for (int j = 0; j < dbOutlinks.length; j++) {
435                     boolean foundLink = false;
436                     for (Enumeration e = outlinks[i].elements(); e.hasMoreElements(); ) {
437                         Link curOutlink = (Link) e.nextElement();
438                         if (dbOutlinks[j].compareTo(curOutlink) == 0) {
439                             foundLink = true;
440                             break;
441                         }
442                     }
443
444                     if (! foundLink) {
445                         System.out.println("Found excess link in WebDB: " + dbOutlinks[j]);
446                         excessLinks = true;
447                         numTooMany++;
448                     }
449                 }
450                 if (excessLinks) {
451                     throw new IOException("DB has " + numTooMany + " too many outlinks.");
452                 }
453
454
455
456                 // Finally, the links *to* that page.
457
Vector inlinkList = (Vector) inlinks.get(pages[i].getURL().toString());
458                 if (inlinkList != null) {
459                     Link dbInlinks[] = reader.getLinks(pages[i].getURL());
460                     for (Enumeration e = inlinkList.elements(); e.hasMoreElements(); ) {
461                         Link curInlink = (Link) e.nextElement();
462                         boolean foundLink = false;
463                         for (int j = 0; j < dbInlinks.length; j++) {
464                             if (dbInlinks[j].compareTo(curInlink) == 0) {
465                                 foundLink = true;
466                                 break;
467                             }
468                         }
469                         if (! foundLink) {
470                             throw new IOException("DB did not return Link " + curInlink + " when asked for all links to " + pages[i].getURL());
471                         }
472                     }
473                 }
474             }
475         } finally {
476             reader.close();
477         }
478         totalLinksEver = 0;
479     }
480
481     /**
482      * apiTest() will run through all the methods of
483      * IWebDBReader and make sure they give correct
484      * answers. We might use the internal model as a
485      * source of items that are in the webdb, but we aren't
486      * trying to perform a full consistency check - that's
487      * done in checkConsistency().
488      */

489     private void apiTest() throws IOException {
490         long urlEnumCount = 0, md5EnumCount = 0, linkEnumCount = 0;
491         IWebDBReader reader = new WebDBReader(nfs, webdb);
492         try {
493             //
494
// PAGE OPERATIONS
495
//
496

497             // 1. Test pages() and numPages()
498
System.out.println("Testing IWebDBReader.pages()...");
499             Page prevPage = null;
500             for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
501                 if (prevPage == null) {
502                     prevPage = (Page) e.nextElement();
503                 } else {
504                     Page curPage = (Page) e.nextElement();
505                     if (! (prevPage.getURL().compareTo(curPage.getURL()) < 0)) {
506                         throw new IOException("While enumerating by URL, page " + prevPage + " comes before " + curPage);
507                     }
508                     prevPage = curPage;
509                 }
510                 urlEnumCount++;
511             }
512             if (urlEnumCount != reader.numPages()) {
513                 throw new IOException("IWebDBReader call to pages() results in " + urlEnumCount + ", but IWebDBReader reports " + reader.numPages() + " items.");
514             }
515
516             // 2. Test pagesByMD5().
517
System.out.println("Testing IWebDBReader.pagesByMD5()...");
518             prevPage = null;
519             for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); ) {
520                 if (prevPage == null) {
521                     prevPage = (Page) e.nextElement();
522                 } else {
523                     Page curPage = (Page) e.nextElement();
524                     if (! (prevPage.compareTo(curPage) < 0)) {
525                         throw new IOException("While enumerating by MD5, page " + prevPage + " comes before " + curPage);
526                     }
527                     prevPage = curPage;
528                 }
529                 md5EnumCount++;
530             }
531             if (md5EnumCount != reader.numPages()) {
532                 throw new IOException("IWebDBReader call to pagesByMD5() results in " + md5EnumCount + ", but IWebDBReader reports " + reader.numPages() + " items.");
533             }
534
535             // 3. Test getPage(String) method.
536
System.out.println("Testing IWebDBReader.getPage()...");
537             for (int i = 0; i < pages.length; i++) {
538                 Page curPage = pages[i];
539                 Page resultPage = reader.getPage(curPage.getURL().toString());
540
541                 if (resultPage == null || (resultPage.compareTo(curPage) != 0)) {
542                     throw new IOException("Call to IWebDBReader.getPage(" + curPage.getURL() + ") should have returned " + curPage + ", but returned " + resultPage + " instead.");
543                 }
544             }
545
546             // 4. Test getPages(MD5Hash)
547
System.out.println("Testing IWebDBReader.getPages()...");
548             for (Iterator it = md5Hashes.keySet().iterator(); it.hasNext(); ) {
549                 MD5Hash curHash = (MD5Hash) it.next();
550                 Page pageSet[] = reader.getPages(curHash);
551                 int numItems = ((Integer JavaDoc) md5Hashes.get(curHash)).intValue();
552                 if (pageSet.length != numItems) {
553                     throw new IOException("There should be " + numItems + " item(s) with MD5Hash " + curHash + " in the db, but IWebDBReader.getPages() reports " + pageSet.length);
554                 }
555             }
556
557             // 5. Test pageExists(MD5Hash)
558
System.out.println("Testing IWebDBReader.pageExists()...");
559             for (int i = 0; i < pages.length; i++) {
560                 Page curPage = pages[i];
561                 if (! reader.pageExists(curPage.getMD5())) {
562                     throw new IOException("IWebDBReader.pageExists() reports that a page with MD5 " + curPage.getMD5() + " is not found. It should be!");
563                 }
564             }
565
566
567             //
568
// LINK OPERATIONS
569
//
570

571             // 1. Test links() and numLinks(), and generate a list
572
// of items to test later on.
573
System.out.println("Testing IWebDBReader.links()...");
574             Link prevLink = null;
575             for (Enumeration e = reader.links(); e.hasMoreElements(); ) {
576                 if (prevLink == null) {
577                     prevLink = (Link) e.nextElement();
578                 } else {
579                     Link curLink = (Link) e.nextElement();
580                     if (! (prevLink.compareTo(curLink) < 0)) {
581                         throw new IOException("While enumerating by Link, link " + prevLink + " comes before " + curLink);
582                     }
583                     prevLink = curLink;
584                 }
585                 linkEnumCount++;
586             }
587             if (linkEnumCount != reader.numLinks()) {
588                 throw new IOException("IWebDBReader call to links() results in " + linkEnumCount + ", but IWebDBReader reports " + reader.numLinks() + " items.");
589             }
590
591             // 2. Test getLinks(UTF8)
592
System.out.println("Testing IWebDBReader.getLinks(UTF8)...");
593             for (int i = 0; i < pages.length; i++) {
594                 Page curPage = pages[i];
595                 Vector inlinkList = (Vector) inlinks.get(curPage.getURL().toString());
596                 Link dbInlinks[] = reader.getLinks(curPage.getURL());
597
598                 if (inlinkList == null || dbInlinks == null) {
599                     if ((inlinkList == null || inlinkList.size() == 0) &&
600                         (dbInlinks.length != 0)) {
601                         throw new IOException("Call to IWebDBReader.getLinks(" + curPage.getURL()+ ") should return 0 links, but returns " + dbInlinks.length + " instead.");
602                     }
603                 } else {
604                     if (dbInlinks.length != inlinkList.size()) {
605                         throw new IOException("Call to IWebDBReader.getLinks(" + curPage.getURL() + ") should return " + inlinkList.size() + " inlinks, but returns " + dbInlinks.length + " instead.");
606                     }
607                 }
608             }
609
610             // 3. Test getLinks(MD5Hash)
611
System.out.println("Testing IWebDBReader.getLinks(MD5Hash)...");
612             for (int i = 0; i < pages.length; i++) {
613                 Page curPage = pages[i];
614                 Link dbOutlinks[] = reader.getLinks(curPage.getMD5());
615                 if (dbOutlinks.length != outlinks[i].size()) {
616                     throw new IOException("Call to IWebDBReader.getLinks(" + curPage.getMD5() + ") should return " + outlinks[i].size() + " outlinks, but returns " + dbOutlinks.length + " instead.");
617                 }
618                 if (dbOutlinks.length != curPage.getNumOutlinks()) {
619                     throw new IOException("Call to IWebDBReader.getLinks(" + curPage.getMD5() + ") should (according to Page.getNumOutlinks() return " + curPage.getNumOutlinks() + ", but returns " + dbOutlinks.length + " instead.");
620                 }
621             }
622         } finally {
623             reader.close();
624         }
625     }
626
627     /**
628      * Return a string of numChars length. This is good for
629      * anchors and URLs.
630      */

631     private String JavaDoc createRandomString(int numChars) {
632         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
633         for (int i = 0; i < numChars; i++) {
634             buf.append((char) ('A' + Math.abs(rand.nextInt() % 26)));
635         }
636         return buf.toString();
637     }
638
639     /**
640      * Internal utility method that manufactures a brand-new
641      * novel page.
642      */

643     private Page createRandomPage() throws IOException {
644         String JavaDoc curURL = "http://www.somePage." + createRandomString(20) + ".com/index.html";
645         MD5Hash md5Hash = null;
646         Integer JavaDoc hashCount = null;
647
648         // Keep generating random contents until we have a unique
649
// one. otherwise we need to maintain models of page contents.
650
// As mentioned, that's too much for now, but maybe someday.
651
do {
652             md5Hash = MD5Hash.digest(createRandomString(Math.abs(rand.nextInt() % 2048) + 1));
653             hashCount = (Integer JavaDoc) md5Hashes.get(md5Hash);
654         } while (hashCount != null);
655
656         md5Hashes.put(md5Hash, new Integer JavaDoc(1));
657         return new Page(curURL, md5Hash);
658     }
659
660     /**
661      * The createClonePage() is used to test duplicate MD5s
662      * in our webdb. This is why we worry about tracking MD5s
663      * in the md5Hashes table. REMIND - It is not used yet
664      */

665     private Page createClonePage(Page cloneSrc) throws IOException {
666         String JavaDoc curURL = "http://www.somePage." + createRandomString(20) + ".com/index.html";
667         MD5Hash md5Hash = cloneSrc.getMD5();
668         Integer JavaDoc hashCount = (Integer JavaDoc) md5Hashes.get(md5Hash);
669         md5Hashes.put(md5Hash, (hashCount == null) ? new Integer JavaDoc(1) : new Integer JavaDoc(hashCount.intValue() + 1));
670         return new Page(curURL, md5Hash);
671     }
672
673     /**
674      * Internal method that makes a Link between two
675      * pages.
676      */

677     private Link createLink(Page src, Page dst) throws IOException {
678         UTF8 targetURL = dst.getURL();
679         MD5Hash srcMD5 = new MD5Hash();
680         srcMD5.set(src.getMD5());
681         String JavaDoc linkText = createRandomString(Math.abs(rand.nextInt() % 16) + 1);
682         return new Link(srcMD5, src.computeDomainID(), targetURL.toString(), linkText);
683     }
684
685     /**
686      * We make a randomized set of outlinks from the given
687      * page to any number of other pages.
688      *
689      * Returns number of outlinks generated.
690      */

691     private int makeOutlinkSet(IWebDBWriter writer, int srcIndex) throws IOException {
692         // Create the links for this new page!
693
int numOutlinks = Math.abs(rand.nextInt() % MAX_OUTLINKS) + 1;
694         int numInserted = 0;
695         for (int j = 0; j < numOutlinks; j++) {
696             int targetPageIndex = Math.abs(rand.nextInt() % (maxPages));
697             Page targetPage = pages[targetPageIndex];
698             Link lr = createLink(pages[srcIndex], targetPage);
699
700             // See if we've made this link before
701
if (! seenLinks.contains(lr)) {
702                 outlinks[srcIndex].add(lr);
703                 Vector inlinkList = (Vector) inlinks.get(targetPage.getURL().toString());
704                 if (inlinkList == null) {
705                     inlinkList = new Vector();
706                     inlinks.put(targetPage.getURL().toString(), inlinkList);
707                 }
708                 inlinkList.add(lr);
709                 writer.addLink(lr);
710
711                 linkCount++;
712                 totalLinksEver++;
713                 numInserted++;
714                 seenLinks.add(lr);
715             }
716         }
717         return numInserted;
718     }
719
720     /**
721      * The command-line takes a location to put temporary work
722      * files, the number of pages to use in the test set, and
723      * (optionally) a seed for the random num-generator.
724      */

725     public static void main(String JavaDoc argv[]) throws IOException {
726         if (argv.length < 2) {
727         System.out.println("Usage: java net.nutch.db.DBTester (-local | -ndfs <namenode:port>) <workingdir> <numPages> [-seed <seed>]");
728             return;
729         }
730
731         // Parse args
732
int i = 0;
733         NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
734         try {
735             File dbDir = new File(argv[i++]);
736             int numPages = Integer.parseInt(argv[i++]);
737
738             boolean gotSeed = false;
739             long seed = 0;
740             for (; i < argv.length; i++) {
741                 if ("-seed".equals(argv[i])) {
742                     gotSeed = true;
743                     seed = Long.parseLong(argv[i+1]);
744                     i++;
745                 }
746             }
747
748             DBTester tester = (gotSeed) ? new DBTester(nfs, dbDir, seed, numPages) : new DBTester(nfs, dbDir, numPages);
749             try {
750                 tester.runTest();
751             } finally {
752                 tester.cleanup();
753             }
754         } finally {
755             nfs.close();
756         }
757     }
758 }
759
Popular Tags