DistributedWebDBReader


1   /* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
2   /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3   package net.nutch.db;
4   
5   import java.io.*;
6   import java.util.*;
7   
8   import net.nutch.io.*;
9   import net.nutch.fs.*;
10  import net.nutch.util.*;
11  import net.nutch.pagedb.*;
12  import net.nutch.linkdb.*;
13  
14  /**********************************************
15   * The WebDBReader implements all the read-only
16   * parts of accessing our web database.
17   * All the writing ones can be found in WebDBWriter.
18   *
19   * @author Mike Cafarella
20   **********************************************/
21  public class DistributedWebDBReader implements IWebDBReader {
22      static final Page[] PAGE_RECORDS = new Page[0];
23      static final Link[] LINK_RECORDS = new Link[0];
24  
25      // filenames
26      static final String   PAGES_BY_URL = "pagesByURL";
27      static final String   PAGES_BY_MD5 = "pagesByMD5";
28      static final String   LINKS_BY_URL = "linksByURL";
29      static final String   LINKS_BY_MD5 = "linksByMD5";
30  
31      static final String   STATS_FILE = "stats";
32      static final String   META_FILE = "metainfo";
33  
34      // For different enumeration types
35      static final EnumCall PAGE_ENUMS = new PageEnumCall();
36      static final EnumCall PAGE_MD5_ENUMS = new PageByMD5EnumCall();
37      static final EnumCall LINK_ENUMS = new LinkEnumCall();    
38  
39      // Utility array for Vector conversion
40      static final DBSectionReader[] STATIC_SR_ARRAY = new DBSectionReader[0];
41  
42      // Structures for multi-file db structures
43      File root, dbDir;
44      File globalWriteLock;
45      DBSectionReader pagesByURL[], pagesByMD5[], linksByURL[], linksByMD5[];
46      long totalPages = 0, totalLinks = 0;
47      int numMachines = 0;
48  
49      /**
50       * Open a web db reader for the named directory.
51       */    
52      public DistributedWebDBReader(NutchFileSystem nfs, File root) throws IOException, FileNotFoundException {
53          //
54          // Get the current db from the given nutchfs.  It consists
55          // of a bunch of directories full of files.  
56          //
57          this.root = root;
58          this.dbDir = new File(new File(root, "standard"), "webdb");
59  
60          //
61          // Wait until the webdb is complete, by waiting till a given
62          // file exists.
63          //
64          File dirIsComplete = new File(dbDir, "dbIsComplete");
65          while (! nfs.exists(dirIsComplete)) {
66              try {
67                  Thread.sleep(2000);
68              } catch (InterruptedException   ie) {
69              }
70          }
71  
72          //
73          // Obtain non-exclusive lock on the webdb's globalWriteLock 
74          // so writers don't move it out from under us.
75          //
76  
77          // REMIND - mjc - I think the locking here is suspect.
78          /**
79          this.globalWriteLock = new File("standard", new File("globalWriteLock"));
80          nfs.lock(globalWriteLock, false);
81          **/
82  
83          //
84          // Load in how many segments we can expect
85          //
86          File machineInfo = new File(new File(root, "standard"), "machineinfo");
87          DataInputStream in = new DataInputStream(nfs.open(machineInfo));
88          try {
89              in.readByte();  // version
90              this.numMachines = in.readInt();
91          } finally {
92              in.close();
93          }
94  
95          // 
96          // Find all the "section" subdirs.  Each section will contain 
97          // one of the 4 tables we're after.  Create one DBSectionReader 
98          // object for each table in each section.
99          //
100         Vector pagesByURL = new Vector(), pagesByMD5 = new Vector(), linksByMD5 = new Vector(), linksByURL = new Vector();
101         for (int i = 0; i < numMachines; i++) {
102             // The relevant NutchFiles for each part of this db section
103             File sectionDir = new File(dbDir, "dbsection." + i);
104             File pagesByURLFile = new File(sectionDir, PAGES_BY_URL);
105             File pagesByMD5File = new File(sectionDir, PAGES_BY_MD5);
106             File linksByURLFile = new File(sectionDir, LINKS_BY_URL);
107             File linksByMD5File = new File(sectionDir, LINKS_BY_MD5);
108 
109             // Create DBSectionReader object for each subtype
110             pagesByURL.add(new DBSectionReader(nfs, pagesByURLFile, new UTF8.Comparator()));
111             pagesByMD5.add(new DBSectionReader(nfs, pagesByMD5File, new Page.Comparator()));
112             linksByURL.add(new DBSectionReader(nfs, linksByURLFile, new Link.UrlComparator()));
113             linksByMD5.add(new DBSectionReader(nfs, linksByMD5File, new Link.MD5Comparator()));
114 
115             // Load in the stats file for the section
116             File sectionStats = new File(sectionDir, STATS_FILE);
117             in = new DataInputStream(nfs.open(sectionStats));
118             try {
119                 in.read(); // version
120                 this.totalPages += in.readLong();
121                 this.totalLinks += in.readLong();
122             } finally {
123                 in.close();
124             }
125         }
126 
127         // Put lists into array form
128         this.pagesByURL = (DBSectionReader[]) pagesByURL.toArray(STATIC_SR_ARRAY);
129         this.pagesByMD5 = (DBSectionReader[]) pagesByMD5.toArray(STATIC_SR_ARRAY);
130         this.linksByURL = (DBSectionReader[]) linksByURL.toArray(STATIC_SR_ARRAY);
131         this.linksByMD5 = (DBSectionReader[]) linksByMD5.toArray(STATIC_SR_ARRAY);
132     }
133 
134     /**
135      * Shutdown
136      */
137     public void close() throws IOException {
138         for (int i = 0; i < pagesByURL.length; i++) {
139             pagesByURL[i].close();
140             pagesByMD5[i].close();
141             linksByURL[i].close();
142             linksByMD5[i].close();
143         }
144     }
145 
146     /**
147      * How many sections (machines) there are in this distributed db.
148      */
149     public int numMachines() {
150         return numMachines;
151     }
152 
153     /**
154      * Return the number of pages we're dealing with.
155      */
156     public long numPages() {
157         return totalPages;
158     }
159 
160     /**
161      * Return the number of links in our db.
162      */
163     public long numLinks() {
164         return totalLinks;
165     }
166 
167     /**
168      * Get Page from the pagedb with the given URL.
169      */
170     public Page getPage(String   url) throws IOException {
171         Page result = null, target = new Page();
172         UTF8 searchURL = new UTF8(url);
173 
174         // Don't do linear search.  Instead, jump to the
175         // chunk that will have it.
176         return pagesByURL[DBKeyDivision.findURLSection(url, numMachines)].getPage(searchURL, target);
177     }
178 
179     /**
180      * Get all the Pages according to their content hash.
181      * Since items in the pagesByMD5 DBSectionReader array will 
182      * be sorted by ascending blocks of the content hash, 
183      * we know the results will come in sorted order.
184      */
185     public Page[] getPages(MD5Hash md5) throws IOException {
186         Vector resultSet = pagesByMD5[DBKeyDivision.findMD5Section(md5, numMachines)].getPages(md5);
187         Page resultArray[] = new Page[resultSet.size()];
188         int i = 0;
189         for (Enumeration e = resultSet.elements(); e.hasMoreElements(); i++) {
190             resultArray[i] = (Page) e.nextElement();
191         }
192         return resultArray;
193     }
194 
195     /**
196      * Test whether a certain piece of content is in the
197      * database, but don't bother returning the Page(s) itself.
198      * We need to test every DBSectionReader in pagesByMD5 until
199      * we reach the end, or find a positive.
200      */
201     public boolean pageExists(MD5Hash md5) throws IOException {
202         return pagesByMD5[DBKeyDivision.findMD5Section(md5, numMachines)].pageExists(md5);
203     }
204 
205     /**
206      * Iterate through all the Pages, sorted by URL.
207      * We need to enumerate all the Enumerations given 
208      * to us via a call to pages() for each DBSectionReader.
209      */
210     public Enumeration pages() throws IOException {
211         return new MetaEnumerator(pagesByURL, PAGE_ENUMS);
212     }
213 
214     /**
215      * Iterate through all the Pages, sorted by MD5.
216      * We enumerate all the DBSectionReader Enumerations,
217      * just as above.
218      */
219     public Enumeration pagesByMD5() throws IOException {
220         return new MetaEnumerator(pagesByMD5, PAGE_MD5_ENUMS);
221     }
222 
223     /**
224      * Get all the hyperlinks that link TO the indicated URL.
225      */     
226     public Link[] getLinks(UTF8 url) throws IOException {
227         Vector resultSet = linksByURL[DBKeyDivision.findURLSection(url.toString(), numMachines)].getLinks(url);
228         Link resultArray[] = new Link[resultSet.size()];
229         int i = 0;
230         for (Enumeration e = resultSet.elements(); e.hasMoreElements(); ) {
231             resultArray[i++] = (Link) e.nextElement();
232         }
233         return resultArray;
234     }
235 
236     /**
237      * Grab all the links from the given MD5 hash.
238      */
239     public Link[] getLinks(MD5Hash md5) throws IOException {
240         Vector resultSet = linksByMD5[DBKeyDivision.findMD5Section(md5, numMachines)].getLinks(md5);
241         Link resultArray[] = new Link[resultSet.size()];
242         int i = 0;
243         for (Enumeration e = resultSet.elements(); e.hasMoreElements(); ) {
244             resultArray[i++] = (Link) e.nextElement();
245         }
246         return resultArray;
247     }
248 
249     /**
250      * Return all the links, by target URL
251      */
252     public Enumeration links() throws IOException {
253         return new MetaEnumerator(linksByURL, LINK_ENUMS);
254     }
255 
256     //
257     // The EnumCall class allows the creator of MetaEnumerator
258     // to indicate how to get each enumeration.  Will it be pages
259     // or links?
260     //
261     static abstract class EnumCall {
262         /**
263          */
264         public EnumCall() {
265         }
266 
267         /**
268          * Subclasses override this for different kinds of MetaEnumerator
269          * behavior.
270          */
271         public abstract Enumeration getEnumeration(DBSectionReader reader) throws IOException;
272     }
273 
274     //
275     // For enumerating Pages
276     //
277     static class PageEnumCall extends EnumCall {
278         /**
279          */
280         public PageEnumCall() {
281         }
282 
283         /**
284          * Get the enum of Pages
285          */
286         public Enumeration getEnumeration(DBSectionReader reader) throws IOException {
287             return reader.pages();
288         }
289     }
290 
291     //
292     // For enumerating Pages
293     //
294     static class PageByMD5EnumCall extends EnumCall {
295         /**
296          */
297         public PageByMD5EnumCall() {
298         }
299 
300         /**
301          * Get the enum of Pages
302          */
303         public Enumeration getEnumeration(DBSectionReader reader) throws IOException {
304             return reader.pagesByMD5();
305         }
306     }
307 
308     //
309     // For enumerating Links
310     //
311     static class LinkEnumCall extends EnumCall {
312         /**
313          */
314         public LinkEnumCall() {
315         }
316 
317         /**
318          * Get the enum of Links
319          */
320         public Enumeration getEnumeration(DBSectionReader reader) throws IOException {
321             return reader.links();
322         }
323     }
324 
325     //
326     // MetaEnumerator uses the Enumerations from each
327     // DBSectionReader in the passed-in DBSectionReader array.
328     //
329     class MetaEnumerator implements Enumeration {
330         Enumeration enumerations[];
331         int curEnum = 0;
332 
333         /**
334          * Create all the Enumerations from the given Sections
335          */
336         public MetaEnumerator(DBSectionReader sections[], EnumCall enumCall) throws IOException {
337             this.enumerations = new Enumeration[sections.length];
338 
339             for (int i = 0; i < enumerations.length; i++) {
340                 enumerations[i] = enumCall.getEnumeration(sections[i]);
341             }
342         }
343 
344         /**
345          * Go through all the DBSectionReader items in
346          * pagesByURL, until we find one that hasMoreElements.
347          * Or until we hit the end.
348          */
349         public boolean hasMoreElements() {
350             boolean result = false;
351 
352             //
353             // Go through Enumerations until we find one with
354             // hasMoreElements() == true.  (Or until we run out
355             // of Enumerations.)
356             //
357             for (; curEnum < enumerations.length; curEnum++) {
358                 result = enumerations[curEnum].hasMoreElements();
359                 
360                 if (result) {
361                     break;
362                 }
363             }
364             return result;
365         }
366 
367         /**
368          * Exhaust the Objects we can receive from the 
369          * Enumerations array, via calls to nextElement();
370          */
371         public Object   nextElement() {
372             Object   obj = null;
373 
374             //
375             // Go through Enumerations until we find one with
376             // a nextElement() to return.  (Or until we run out.)
377             //
378             for (; curEnum < enumerations.length; curEnum++) {
379                 if (enumerations[curEnum].hasMoreElements()) {
380                     obj = enumerations[curEnum].nextElement();
381 
382                     if (obj != null) {
383                         break;
384                     }
385                 }
386             }
387             return obj;
388         }
389     }
390 
391     /**
392      * The DistributedWebDBReader.main() provides some handy utility methods
393      * for looking through the contents of the webdb.  Hoo-boy!
394      *
395      * Note this only works for a completely-NFS deployment.
396      */
397     public static void main(String   argv[]) throws FileNotFoundException, IOException {
398         if (argv.length < 2) {
399             System.out.println("Usage: java net.nutch.db.DistributedWebDBReader (-local | -ndfs <namenode:port>) <root> [-pageurl url] | [-pagemd5 md5] | [-dumppageurl] | [-dumppagemd5] | [-toppages <k>] | [-linkurl url] | [-linkmd5 md5] | [-dumplinks] | [-stats]");
400             return;
401         }
402 
403         int i = 0;
404         NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
405         File root = new File(argv[i++]);
406         DistributedWebDBReader reader = new DistributedWebDBReader(nfs, root);
407         try {
408             String   cmd = argv[i++];
409 
410             if ("-pageurl".equals(cmd)) {
411                 String   url = argv[i++];
412                 System.out.println(reader.getPage(url.trim()));
413             } else if ("-pagemd5".equals(cmd)) {
414                 MD5Hash md5 = new MD5Hash(argv[i++]);
415                 Page pages[] = reader.getPages(md5);
416                 System.out.println("Found " + pages.length + " pages.");
417                 for (int j = 0; j < pages.length; j++) {
418                     System.out.println("Page " + j + ": " + pages[j]);
419                 }
420             } else if ("-dumppageurl".equals(cmd)) {
421                 int j = 1;
422                 for (Enumeration e = reader.pages(); e.hasMoreElements(); j++) {
423                     Page page = (Page) e.nextElement();
424                     System.out.println("Page " + j + ": " + page);
425                     System.out.println();
426                 }
427             } else if ("-dumppagemd5".equals(cmd)) {
428                 int j = 1;
429                 for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); j++) {
430                     Page page = (Page) e.nextElement();
431                     System.out.println("Page " + j + ": " + page);
432                     System.out.println();
433                 }
434             } else if ("-toppages".equals(cmd)) {
435                 int topSize = Integer.parseInt(argv[i++]);
436 
437                 // Create a sorted list
438                 SortedSet topSet = new TreeSet(new Comparator() {
439                     public int compare(Object   o1, Object   o2) {
440                         Page p1 = (Page) o1;
441                         Page p2 = (Page) o2;
442                         if (p1.getScore() < p2.getScore()) {
443                             return -1;
444                         } else if (p1.getScore() == p2.getScore()) {
445                             // If two scores are equal, we will
446                             // use regular Page comparison (which
447                             // uses URL as the primary key).  We
448                             // don't want to uniquify by score!
449                             return p1.compareTo(p2);
450                         } else {
451                             return 1;
452                         }
453                     }
454                 }
455                     );
456 
457                 // Find the top "topSize" elts
458                 Page lowestPage = null;
459                 for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
460                     Page curPage = (Page) e.nextElement();
461                     if (topSet.size() < topSize) {
462                         topSet.add(curPage);
463                         lowestPage = (Page) topSet.first();
464                     } else if (lowestPage.getScore() < curPage.getScore()) {
465                         topSet.remove(lowestPage);
466                         topSet.add(curPage);
467                         lowestPage = (Page) topSet.first();
468                     }
469                 }
470             
471                 // Print them out
472                 int j = 0;
473                 for (Iterator it = topSet.iterator(); it.hasNext(); j++) {
474                     System.out.println("Page " + j + ": " + (Page) it.next());
475                     System.out.println();
476                 }
477             } else if ("-linkurl".equals(cmd)) {
478                 String   url = argv[i++];
479                 Link links[] = reader.getLinks(new UTF8(url.trim()));
480                 System.out.println("Found " + links.length + " links.");
481                 for (int j = 0; j < links.length; j++) {
482                     System.out.println("Link " + j + ": " + links[j]);
483                 }
484             } else if ("-linkmd5".equals(cmd)) {
485                 MD5Hash fromID = new MD5Hash(argv[i++]);
486                 Link links[] = reader.getLinks(fromID);
487                 System.out.println("Found " + links.length + " links.");
488                 for (int j = 0; j < links.length; j++) {
489                     System.out.println("Link " + j + ": " + links[j]);
490                 }
491             } else if ("-dumplinks".equals(cmd)) {
492                 int j = 1;
493                 for (Enumeration e = reader.links(); e.hasMoreElements(); j++) {
494                     Link link = (Link) e.nextElement();
495                     System.out.println("Link " + j + ": " + link);
496                     System.out.println();
497                 }
498             } else if ("-stats".equals(cmd)) {
499                 System.out.println("Stats for " + reader);
500                 System.out.println("-------------------------------");
501                 System.out.println("Number of pages: " + reader.numPages());
502                 System.out.println("Number of links: " + reader.numLinks());
503                 System.out.println("Number of machines (sections): " + reader.numMachines());
504             } else {
505                 System.out.println("Sorry, no command with name " + cmd);
506             }
507         } finally {
508             reader.close();
509         }
510     }
511 }
512
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Free Books Free Magazines
Popular Tags