KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > db > DistributedWebDBReader


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3 package net.nutch.db;
4
5 import java.io.*;
6 import java.util.*;
7
8 import net.nutch.io.*;
9 import net.nutch.fs.*;
10 import net.nutch.util.*;
11 import net.nutch.pagedb.*;
12 import net.nutch.linkdb.*;
13
14 /**********************************************
15  * The WebDBReader implements all the read-only
16  * parts of accessing our web database.
17  * All the writing ones can be found in WebDBWriter.
18  *
19  * @author Mike Cafarella
20  **********************************************/

21 public class DistributedWebDBReader implements IWebDBReader {
22     static final Page[] PAGE_RECORDS = new Page[0];
23     static final Link[] LINK_RECORDS = new Link[0];
24
25     // filenames
26
static final String JavaDoc PAGES_BY_URL = "pagesByURL";
27     static final String JavaDoc PAGES_BY_MD5 = "pagesByMD5";
28     static final String JavaDoc LINKS_BY_URL = "linksByURL";
29     static final String JavaDoc LINKS_BY_MD5 = "linksByMD5";
30
31     static final String JavaDoc STATS_FILE = "stats";
32     static final String JavaDoc META_FILE = "metainfo";
33
34     // For different enumeration types
35
static final EnumCall PAGE_ENUMS = new PageEnumCall();
36     static final EnumCall PAGE_MD5_ENUMS = new PageByMD5EnumCall();
37     static final EnumCall LINK_ENUMS = new LinkEnumCall();
38
39     // Utility array for Vector conversion
40
static final DBSectionReader[] STATIC_SR_ARRAY = new DBSectionReader[0];
41
42     // Structures for multi-file db structures
43
File root, dbDir;
44     File globalWriteLock;
45     DBSectionReader pagesByURL[], pagesByMD5[], linksByURL[], linksByMD5[];
46     long totalPages = 0, totalLinks = 0;
47     int numMachines = 0;
48
49     /**
50      * Open a web db reader for the named directory.
51      */

52     public DistributedWebDBReader(NutchFileSystem nfs, File root) throws IOException, FileNotFoundException {
53         //
54
// Get the current db from the given nutchfs. It consists
55
// of a bunch of directories full of files.
56
//
57
this.root = root;
58         this.dbDir = new File(new File(root, "standard"), "webdb");
59
60         //
61
// Wait until the webdb is complete, by waiting till a given
62
// file exists.
63
//
64
File dirIsComplete = new File(dbDir, "dbIsComplete");
65         while (! nfs.exists(dirIsComplete)) {
66             try {
67                 Thread.sleep(2000);
68             } catch (InterruptedException JavaDoc ie) {
69             }
70         }
71
72         //
73
// Obtain non-exclusive lock on the webdb's globalWriteLock
74
// so writers don't move it out from under us.
75
//
76

77         // REMIND - mjc - I think the locking here is suspect.
78
/**
79         this.globalWriteLock = new File("standard", new File("globalWriteLock"));
80         nfs.lock(globalWriteLock, false);
81         **/

82
83         //
84
// Load in how many segments we can expect
85
//
86
File machineInfo = new File(new File(root, "standard"), "machineinfo");
87         DataInputStream in = new DataInputStream(nfs.open(machineInfo));
88         try {
89             in.readByte(); // version
90
this.numMachines = in.readInt();
91         } finally {
92             in.close();
93         }
94
95         //
96
// Find all the "section" subdirs. Each section will contain
97
// one of the 4 tables we're after. Create one DBSectionReader
98
// object for each table in each section.
99
//
100
Vector pagesByURL = new Vector(), pagesByMD5 = new Vector(), linksByMD5 = new Vector(), linksByURL = new Vector();
101         for (int i = 0; i < numMachines; i++) {
102             // The relevant NutchFiles for each part of this db section
103
File sectionDir = new File(dbDir, "dbsection." + i);
104             File pagesByURLFile = new File(sectionDir, PAGES_BY_URL);
105             File pagesByMD5File = new File(sectionDir, PAGES_BY_MD5);
106             File linksByURLFile = new File(sectionDir, LINKS_BY_URL);
107             File linksByMD5File = new File(sectionDir, LINKS_BY_MD5);
108
109             // Create DBSectionReader object for each subtype
110
pagesByURL.add(new DBSectionReader(nfs, pagesByURLFile, new UTF8.Comparator()));
111             pagesByMD5.add(new DBSectionReader(nfs, pagesByMD5File, new Page.Comparator()));
112             linksByURL.add(new DBSectionReader(nfs, linksByURLFile, new Link.UrlComparator()));
113             linksByMD5.add(new DBSectionReader(nfs, linksByMD5File, new Link.MD5Comparator()));
114
115             // Load in the stats file for the section
116
File sectionStats = new File(sectionDir, STATS_FILE);
117             in = new DataInputStream(nfs.open(sectionStats));
118             try {
119                 in.read(); // version
120
this.totalPages += in.readLong();
121                 this.totalLinks += in.readLong();
122             } finally {
123                 in.close();
124             }
125         }
126
127         // Put lists into array form
128
this.pagesByURL = (DBSectionReader[]) pagesByURL.toArray(STATIC_SR_ARRAY);
129         this.pagesByMD5 = (DBSectionReader[]) pagesByMD5.toArray(STATIC_SR_ARRAY);
130         this.linksByURL = (DBSectionReader[]) linksByURL.toArray(STATIC_SR_ARRAY);
131         this.linksByMD5 = (DBSectionReader[]) linksByMD5.toArray(STATIC_SR_ARRAY);
132     }
133
134     /**
135      * Shutdown
136      */

137     public void close() throws IOException {
138         for (int i = 0; i < pagesByURL.length; i++) {
139             pagesByURL[i].close();
140             pagesByMD5[i].close();
141             linksByURL[i].close();
142             linksByMD5[i].close();
143         }
144     }
145
146     /**
147      * How many sections (machines) there are in this distributed db.
148      */

149     public int numMachines() {
150         return numMachines;
151     }
152
153     /**
154      * Return the number of pages we're dealing with.
155      */

156     public long numPages() {
157         return totalPages;
158     }
159
160     /**
161      * Return the number of links in our db.
162      */

163     public long numLinks() {
164         return totalLinks;
165     }
166
167     /**
168      * Get Page from the pagedb with the given URL.
169      */

170     public Page getPage(String JavaDoc url) throws IOException {
171         Page result = null, target = new Page();
172         UTF8 searchURL = new UTF8(url);
173
174         // Don't do linear search. Instead, jump to the
175
// chunk that will have it.
176
return pagesByURL[DBKeyDivision.findURLSection(url, numMachines)].getPage(searchURL, target);
177     }
178
179     /**
180      * Get all the Pages according to their content hash.
181      * Since items in the pagesByMD5 DBSectionReader array will
182      * be sorted by ascending blocks of the content hash,
183      * we know the results will come in sorted order.
184      */

185     public Page[] getPages(MD5Hash md5) throws IOException {
186         Vector resultSet = pagesByMD5[DBKeyDivision.findMD5Section(md5, numMachines)].getPages(md5);
187         Page resultArray[] = new Page[resultSet.size()];
188         int i = 0;
189         for (Enumeration e = resultSet.elements(); e.hasMoreElements(); i++) {
190             resultArray[i] = (Page) e.nextElement();
191         }
192         return resultArray;
193     }
194
195     /**
196      * Test whether a certain piece of content is in the
197      * database, but don't bother returning the Page(s) itself.
198      * We need to test every DBSectionReader in pagesByMD5 until
199      * we reach the end, or find a positive.
200      */

201     public boolean pageExists(MD5Hash md5) throws IOException {
202         return pagesByMD5[DBKeyDivision.findMD5Section(md5, numMachines)].pageExists(md5);
203     }
204
205     /**
206      * Iterate through all the Pages, sorted by URL.
207      * We need to enumerate all the Enumerations given
208      * to us via a call to pages() for each DBSectionReader.
209      */

210     public Enumeration pages() throws IOException {
211         return new MetaEnumerator(pagesByURL, PAGE_ENUMS);
212     }
213
214     /**
215      * Iterate through all the Pages, sorted by MD5.
216      * We enumerate all the DBSectionReader Enumerations,
217      * just as above.
218      */

219     public Enumeration pagesByMD5() throws IOException {
220         return new MetaEnumerator(pagesByMD5, PAGE_MD5_ENUMS);
221     }
222
223     /**
224      * Get all the hyperlinks that link TO the indicated URL.
225      */

226     public Link[] getLinks(UTF8 url) throws IOException {
227         Vector resultSet = linksByURL[DBKeyDivision.findURLSection(url.toString(), numMachines)].getLinks(url);
228         Link resultArray[] = new Link[resultSet.size()];
229         int i = 0;
230         for (Enumeration e = resultSet.elements(); e.hasMoreElements(); ) {
231             resultArray[i++] = (Link) e.nextElement();
232         }
233         return resultArray;
234     }
235
236     /**
237      * Grab all the links from the given MD5 hash.
238      */

239     public Link[] getLinks(MD5Hash md5) throws IOException {
240         Vector resultSet = linksByMD5[DBKeyDivision.findMD5Section(md5, numMachines)].getLinks(md5);
241         Link resultArray[] = new Link[resultSet.size()];
242         int i = 0;
243         for (Enumeration e = resultSet.elements(); e.hasMoreElements(); ) {
244             resultArray[i++] = (Link) e.nextElement();
245         }
246         return resultArray;
247     }
248
249     /**
250      * Return all the links, by target URL
251      */

252     public Enumeration links() throws IOException {
253         return new MetaEnumerator(linksByURL, LINK_ENUMS);
254     }
255
256     //
257
// The EnumCall class allows the creator of MetaEnumerator
258
// to indicate how to get each enumeration. Will it be pages
259
// or links?
260
//
261
static abstract class EnumCall {
262         /**
263          */

264         public EnumCall() {
265         }
266
267         /**
268          * Subclasses override this for different kinds of MetaEnumerator
269          * behavior.
270          */

271         public abstract Enumeration getEnumeration(DBSectionReader reader) throws IOException;
272     }
273
274     //
275
// For enumerating Pages
276
//
277
static class PageEnumCall extends EnumCall {
278         /**
279          */

280         public PageEnumCall() {
281         }
282
283         /**
284          * Get the enum of Pages
285          */

286         public Enumeration getEnumeration(DBSectionReader reader) throws IOException {
287             return reader.pages();
288         }
289     }
290
291     //
292
// For enumerating Pages
293
//
294
static class PageByMD5EnumCall extends EnumCall {
295         /**
296          */

297         public PageByMD5EnumCall() {
298         }
299
300         /**
301          * Get the enum of Pages
302          */

303         public Enumeration getEnumeration(DBSectionReader reader) throws IOException {
304             return reader.pagesByMD5();
305         }
306     }
307
308     //
309
// For enumerating Links
310
//
311
static class LinkEnumCall extends EnumCall {
312         /**
313          */

314         public LinkEnumCall() {
315         }
316
317         /**
318          * Get the enum of Links
319          */

320         public Enumeration getEnumeration(DBSectionReader reader) throws IOException {
321             return reader.links();
322         }
323     }
324
325     //
326
// MetaEnumerator uses the Enumerations from each
327
// DBSectionReader in the passed-in DBSectionReader array.
328
//
329
class MetaEnumerator implements Enumeration {
330         Enumeration enumerations[];
331         int curEnum = 0;
332
333         /**
334          * Create all the Enumerations from the given Sections
335          */

336         public MetaEnumerator(DBSectionReader sections[], EnumCall enumCall) throws IOException {
337             this.enumerations = new Enumeration[sections.length];
338
339             for (int i = 0; i < enumerations.length; i++) {
340                 enumerations[i] = enumCall.getEnumeration(sections[i]);
341             }
342         }
343
344         /**
345          * Go through all the DBSectionReader items in
346          * pagesByURL, until we find one that hasMoreElements.
347          * Or until we hit the end.
348          */

349         public boolean hasMoreElements() {
350             boolean result = false;
351
352             //
353
// Go through Enumerations until we find one with
354
// hasMoreElements() == true. (Or until we run out
355
// of Enumerations.)
356
//
357
for (; curEnum < enumerations.length; curEnum++) {
358                 result = enumerations[curEnum].hasMoreElements();
359                 
360                 if (result) {
361                     break;
362                 }
363             }
364             return result;
365         }
366
367         /**
368          * Exhaust the Objects we can receive from the
369          * Enumerations array, via calls to nextElement();
370          */

371         public Object JavaDoc nextElement() {
372             Object JavaDoc obj = null;
373
374             //
375
// Go through Enumerations until we find one with
376
// a nextElement() to return. (Or until we run out.)
377
//
378
for (; curEnum < enumerations.length; curEnum++) {
379                 if (enumerations[curEnum].hasMoreElements()) {
380                     obj = enumerations[curEnum].nextElement();
381
382                     if (obj != null) {
383                         break;
384                     }
385                 }
386             }
387             return obj;
388         }
389     }
390
391     /**
392      * The DistributedWebDBReader.main() provides some handy utility methods
393      * for looking through the contents of the webdb. Hoo-boy!
394      *
395      * Note this only works for a completely-NFS deployment.
396      */

397     public static void main(String JavaDoc argv[]) throws FileNotFoundException, IOException {
398         if (argv.length < 2) {
399             System.out.println("Usage: java net.nutch.db.DistributedWebDBReader (-local | -ndfs <namenode:port>) <root> [-pageurl url] | [-pagemd5 md5] | [-dumppageurl] | [-dumppagemd5] | [-toppages <k>] | [-linkurl url] | [-linkmd5 md5] | [-dumplinks] | [-stats]");
400             return;
401         }
402
403         int i = 0;
404         NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
405         File root = new File(argv[i++]);
406         DistributedWebDBReader reader = new DistributedWebDBReader(nfs, root);
407         try {
408             String JavaDoc cmd = argv[i++];
409
410             if ("-pageurl".equals(cmd)) {
411                 String JavaDoc url = argv[i++];
412                 System.out.println(reader.getPage(url.trim()));
413             } else if ("-pagemd5".equals(cmd)) {
414                 MD5Hash md5 = new MD5Hash(argv[i++]);
415                 Page pages[] = reader.getPages(md5);
416                 System.out.println("Found " + pages.length + " pages.");
417                 for (int j = 0; j < pages.length; j++) {
418                     System.out.println("Page " + j + ": " + pages[j]);
419                 }
420             } else if ("-dumppageurl".equals(cmd)) {
421                 int j = 1;
422                 for (Enumeration e = reader.pages(); e.hasMoreElements(); j++) {
423                     Page page = (Page) e.nextElement();
424                     System.out.println("Page " + j + ": " + page);
425                     System.out.println();
426                 }
427             } else if ("-dumppagemd5".equals(cmd)) {
428                 int j = 1;
429                 for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); j++) {
430                     Page page = (Page) e.nextElement();
431                     System.out.println("Page " + j + ": " + page);
432                     System.out.println();
433                 }
434             } else if ("-toppages".equals(cmd)) {
435                 int topSize = Integer.parseInt(argv[i++]);
436
437                 // Create a sorted list
438
SortedSet topSet = new TreeSet(new Comparator() {
439                     public int compare(Object JavaDoc o1, Object JavaDoc o2) {
440                         Page p1 = (Page) o1;
441                         Page p2 = (Page) o2;
442                         if (p1.getScore() < p2.getScore()) {
443                             return -1;
444                         } else if (p1.getScore() == p2.getScore()) {
445                             // If two scores are equal, we will
446
// use regular Page comparison (which
447
// uses URL as the primary key). We
448
// don't want to uniquify by score!
449
return p1.compareTo(p2);
450                         } else {
451                             return 1;
452                         }
453                     }
454                 }
455                     );
456
457                 // Find the top "topSize" elts
458
Page lowestPage = null;
459                 for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
460                     Page curPage = (Page) e.nextElement();
461                     if (topSet.size() < topSize) {
462                         topSet.add(curPage);
463                         lowestPage = (Page) topSet.first();
464                     } else if (lowestPage.getScore() < curPage.getScore()) {
465                         topSet.remove(lowestPage);
466                         topSet.add(curPage);
467                         lowestPage = (Page) topSet.first();
468                     }
469                 }
470             
471                 // Print them out
472
int j = 0;
473                 for (Iterator it = topSet.iterator(); it.hasNext(); j++) {
474                     System.out.println("Page " + j + ": " + (Page) it.next());
475                     System.out.println();
476                 }
477             } else if ("-linkurl".equals(cmd)) {
478                 String JavaDoc url = argv[i++];
479                 Link links[] = reader.getLinks(new UTF8(url.trim()));
480                 System.out.println("Found " + links.length + " links.");
481                 for (int j = 0; j < links.length; j++) {
482                     System.out.println("Link " + j + ": " + links[j]);
483                 }
484             } else if ("-linkmd5".equals(cmd)) {
485                 MD5Hash fromID = new MD5Hash(argv[i++]);
486                 Link links[] = reader.getLinks(fromID);
487                 System.out.println("Found " + links.length + " links.");
488                 for (int j = 0; j < links.length; j++) {
489                     System.out.println("Link " + j + ": " + links[j]);
490                 }
491             } else if ("-dumplinks".equals(cmd)) {
492                 int j = 1;
493                 for (Enumeration e = reader.links(); e.hasMoreElements(); j++) {
494                     Link link = (Link) e.nextElement();
495                     System.out.println("Link " + j + ": " + link);
496                     System.out.println();
497                 }
498             } else if ("-stats".equals(cmd)) {
499                 System.out.println("Stats for " + reader);
500                 System.out.println("-------------------------------");
501                 System.out.println("Number of pages: " + reader.numPages());
502                 System.out.println("Number of links: " + reader.numLinks());
503                 System.out.println("Number of machines (sections): " + reader.numMachines());
504             } else {
505                 System.out.println("Sorry, no command with name " + cmd);
506             }
507         } finally {
508             reader.close();
509         }
510     }
511 }
512
Popular Tags