WebDBReader


1   /* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
2   /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3   package net.nutch.db;
4   
5   import java.io.*;
6   import java.util.*;
7   import java.nio.channels.*;
8   
9   import net.nutch.io.*;
10  import net.nutch.fs.*;
11  import net.nutch.util.*;
12  import net.nutch.pagedb.*;
13  import net.nutch.linkdb.*;
14  
15  /**********************************************
16   * The WebDBReader implements all the read-only
17   * parts of accessing our web database.
18   * All the writing ones can be found in WebDBWriter.
19   *
20   * @author Mike Cafarella
21   **********************************************/
22  public class WebDBReader implements IWebDBReader {
23      static final Page[] PAGE_RECORDS = new Page[0];
24      static final Link[] LINK_RECORDS = new Link[0];
25  
26      // filenames
27      static final String   PAGES_BY_URL = "pagesByURL";
28      static final String   PAGES_BY_MD5 = "pagesByMD5";
29      static final String   LINKS_BY_URL = "linksByURL";
30      static final String   LINKS_BY_MD5 = "linksByMD5";
31      static final String   STATS_FILE = "stats";
32  
33      NutchFileSystem nfs;
34      File dbDir, dbFile;
35      MapFile.Reader pagesByURL, pagesByMD5, linksByURL, linksByMD5;
36      long totalPages = 0, totalLinks = 0;
37      Vector mapReaders = null, setReaders = null;
38      FileInputStream dbReadLockData;
39      FileLock dbReadLock;
40  
41      /**
42       * Open a web db reader for the named directory.
43       */    
44      public WebDBReader(NutchFileSystem nfs, File dbDir) throws IOException, FileNotFoundException {
45          this.nfs = nfs;
46          this.dbDir = dbDir;
47          this.dbFile = new File(dbDir, "webdb");
48  
49          // Obtain read lock on db so writers don't try to 
50          // move it out from under us.  This obtains a non-exclusive
51          // lock on the directory that holds the dbs (old and new)
52          nfs.lock(new File(dbDir, "dbreadlock"), true);
53  
54          this.pagesByURL = new MapFile.Reader(nfs, new File(dbFile, PAGES_BY_URL).getPath(), new UTF8.Comparator());
55          this.pagesByMD5 = new MapFile.Reader(nfs, new File(dbFile, PAGES_BY_MD5).getPath(), new Page.Comparator());
56  
57          this.linksByURL = new MapFile.Reader(nfs, new File(dbFile, LINKS_BY_URL).getPath(), new Link.UrlComparator());
58          this.linksByMD5 = new MapFile.Reader(nfs, new File(dbFile, LINKS_BY_MD5).getPath(), new Link.MD5Comparator());
59  
60          // Load in statistics
61          File stats = new File(dbFile, STATS_FILE);
62          if (nfs.exists(stats)) {
63              DataInputStream in = new DataInputStream(nfs.open(stats));
64              try {
65                  int version = (byte) in.read();
66                  this.totalPages = in.readLong();
67                  this.totalLinks = in.readLong();
68              } finally {
69                  in.close();
70              }
71          }
72  
73          // Create vectors so we can GC readers used by 
74          // enum() calls.  We do this so we can have multiple
75          // simultaneous enum users.  However, since we keep
76          // a handle to each one, we're assuming that we don't
77          // create too many before WebDBReader.close() is called.
78          this.mapReaders = new Vector();
79          this.setReaders = new Vector();
80      }
81  
82      /**
83       * Shutdown
84       */
85      public void close() throws IOException {
86          pagesByURL.close();
87          pagesByMD5.close();
88          linksByURL.close();
89          linksByMD5.close();
90  
91          for (Enumeration e = mapReaders.elements(); e.hasMoreElements(); ) {
92              MapFile.Reader tmp = (MapFile.Reader) e.nextElement();
93              tmp.close();
94          }
95          for (Enumeration e = setReaders.elements(); e.hasMoreElements(); ) {
96              SetFile.Reader tmp = (SetFile.Reader) e.nextElement();
97              tmp.close();
98          }
99  
100         // release the lock
101         nfs.release(new File(dbDir, "dbreadlock"));
102     }
103 
104     /**
105      * Get Page from the pagedb with the given URL
106      */
107     public Page getPage(String   url) throws IOException {
108         return (Page) pagesByURL.get(new UTF8(url), new Page());
109     }
110 
111     /**
112      * Get Pages from the pagedb according to their
113      * content hash.
114      */
115     public Page[] getPages(MD5Hash md5) throws IOException {
116         Vector records = new Vector(3);
117         Page p = new Page();
118         p.getMD5().set(md5);
119 
120         pagesByMD5.seek(p);
121         while (pagesByMD5.next(p, NullWritable.get())) {
122             if (p.getMD5().compareTo(md5) == 0) {
123                 records.add(p);
124                 p = new Page();
125             } else {
126                 break;
127             }
128         }
129 
130         // Xfer from the vector into an array
131         return (Page[]) records.toArray(PAGE_RECORDS);
132     }
133 
134     /**
135      * Test whether a certain piece of content is in the
136      * database, but don't bother returning the Page(s) itself.
137      */
138     public boolean pageExists(MD5Hash md5) throws IOException {
139         Page p = new Page();
140         p.getMD5().set(md5);
141         pagesByMD5.seek(p);
142         if (pagesByMD5.next(p, NullWritable.get()) && p.getMD5().compareTo(md5) == 0) {
143             return true;
144         } else {
145             return false;
146         }
147     }
148 
149     /**
150      * Iterate through all the Pages, sorted by URL
151      */
152     public Enumeration pages() throws IOException {
153         MapFile.Reader tmpReader = new MapFile.Reader(nfs, new File(dbFile, "pagesByURL").getPath());
154         mapReaders.add(tmpReader);
155         return new TableEnumerator(tmpReader);
156     }
157 
158     //
159     // The TableEnumerator goes through all the entries
160     // in the Table (which is a MapFile).
161     //
162     class TableEnumerator implements Enumeration {
163         MapFile.Reader reader;
164         Page nextItem;
165 
166         /**
167          * Start the cursor and find the first item.
168          * Store it for later return.
169          */
170         public TableEnumerator(MapFile.Reader reader) {
171             this.reader = reader;
172             this.nextItem = new Page();
173             try {
174                 if (! reader.next(new UTF8(), this.nextItem)) {
175                     this.nextItem = null;
176                 }
177             } catch (IOException ie) {
178                 this.nextItem = null;
179             }
180         }
181 
182         /**
183          * If there's no item left in store, we've hit the end.
184          */
185         public boolean hasMoreElements() {
186             return (nextItem != null);
187         }
188 
189         /**
190          * Set aside the item we have in store.  Then retrieve
191          * another for the next time we're called.  Finally, return
192          * the set-aside item.
193          */
194         public Object   nextElement() {
195             if (nextItem == null) {
196                 throw new NoSuchElementException("PageDB Enumeration");
197             }
198             Page toReturn = nextItem;
199             this.nextItem = new Page();
200             try {
201                 if (! reader.next(new UTF8(), nextItem)) {
202                     this.nextItem = null;
203                 }
204             } catch (IOException ie) {
205                 this.nextItem = null;
206             }
207             return toReturn;
208         }
209     }
210 
211 
212     /**
213      * Iterate through all the Pages, sorted by MD5
214      */
215     public Enumeration pagesByMD5() throws IOException {
216         SetFile.Reader tmpReader = new SetFile.Reader(nfs, new File(dbFile, "pagesByMD5").getPath());
217         setReaders.add(tmpReader);
218         return new IndexEnumerator(tmpReader);
219     }
220 
221     /**
222      * Return the number of pages we're dealing with
223      */
224     public long numPages() {
225         return totalPages;
226     }
227 
228     //
229     // The IndexEnumerator goes through all the entries
230     // in the index (which is a SequenceFile).
231     //
232     class IndexEnumerator implements Enumeration {
233         SetFile.Reader reader;
234         Page nextItem;
235 
236         /**
237          * Start the cursor and find the first item.
238          * Store it for later return.
239          */
240         public IndexEnumerator(SetFile.Reader reader) {
241             this.reader = reader;
242             this.nextItem = new Page();
243             try {
244                 if (! reader.next(nextItem)) {
245                     this.nextItem = null;
246                 }
247             } catch (IOException ie) {
248                 this.nextItem = null;
249             }
250         }
251 
252         /**
253          * If there's no item left in store, we've hit the end.
254          */
255         public boolean hasMoreElements() {
256             return (nextItem != null);
257         }
258 
259         /**
260          * Set aside the item we have in store.  Then retrieve
261          * another for the next time we're called.  Finally, return
262          * the set-aside item.
263          */
264         public Object   nextElement() {
265             if (nextItem == null) {
266                 throw new NoSuchElementException("PageDB Enumeration");
267             }
268 
269             Page toReturn = nextItem;
270             this.nextItem = new Page();
271             try {
272                 if (! reader.next(nextItem)) {
273                     this.nextItem = null;
274                 }
275             } catch (IOException ie) {
276                 this.nextItem = null;
277             }
278             return toReturn;
279         }
280     }
281 
282     /**
283      * Get all the hyperlinks that link TO the indicated URL.
284      */     
285     public Link[] getLinks(UTF8 url) throws IOException {
286         Vector records = new Vector(3);
287         Link l = new Link();
288         l.getURL().set(url);
289 
290         linksByURL.seek(l);
291         while (linksByURL.next(l, NullWritable.get())) {
292             if (url.equals(l.getURL())) {
293                 records.add(l);
294                 l = new Link();
295             } else {
296                 break;
297             }
298         }
299         
300         // Xfer from the vector into an array
301         return (Link[]) records.toArray(LINK_RECORDS);
302     }
303 
304     /**
305      * Grab all the links from the given MD5 hash.
306      */
307     public Link[] getLinks(MD5Hash md5) throws IOException {
308         Vector records = new Vector(3);
309         Link l = new Link();
310         l.getFromID().set(md5);
311 
312         linksByMD5.seek(l);
313         while (linksByMD5.next(l, NullWritable.get())) {
314             if (md5.equals(l.getFromID())) {
315                 records.add(l);
316                 l = new Link();
317             } else {
318                 break;
319             }
320         }
321         
322         // Xfer from the vector into an array
323         return (Link[]) records.toArray(LINK_RECORDS);
324     }
325 
326     /**
327      * Return all the links, by target URL
328      */
329     public Enumeration links() {
330         return new MapEnumerator(linksByURL);
331     }
332 
333     /**
334      * Return the number of links in our db.
335      */
336     public long numLinks() {
337         return totalLinks;
338     }
339 
340     //
341     // Here's the class for the above function
342     //
343     class MapEnumerator implements Enumeration {
344         MapFile.Reader reader;
345         Link nextItem;
346 
347         /**
348          * Start the cursor and find the first item.
349          * Store it for later return.
350          */
351         public MapEnumerator(MapFile.Reader reader) {
352             this.reader = reader;
353             this.nextItem = new Link();
354             try {
355                 if (! reader.next(this.nextItem, NullWritable.get())) {
356                     this.nextItem = null;
357                 }
358             } catch (IOException ie) {
359                 this.nextItem = null;
360             }
361         }
362 
363         /**
364          * If there's no item left in store, we've hit the end.
365          */
366         public boolean hasMoreElements() {
367             return (nextItem != null);
368         }
369 
370         /**
371          * Set aside the item we have in store.  Then retrieve
372          * another for the next time we're called.  Finally, return
373          * the set-aside item.
374          */
375         public Object   nextElement() {
376             if (nextItem == null) {
377                 throw new NoSuchElementException("PageDB Enumeration");
378             }
379 
380             Link toReturn = nextItem;
381             this.nextItem = new Link();
382             try {
383                 if (! reader.next(nextItem, NullWritable.get())) {
384                     this.nextItem = null;
385                 }
386             } catch (IOException ie) {
387                 this.nextItem = null;
388             }
389             return toReturn;
390         }
391     }
392 
393     /**
394      * The WebDBReader.main() provides some handy utility methods
395      * for looking through the contents of the webdb.  Hoo-boy!
396      */
397     public static void main(String   argv[]) throws FileNotFoundException, IOException {
398         if (argv.length < 2) {
399             System.out.println("Usage: java net.nutch.db.WebDBReader (-local | -ndfs <namenode:port>) <db> [-pageurl url] | [-pagemd5 md5] | [-dumppageurl] | [-dumppagemd5] | [-toppages <k>] | [-linkurl url] | [-linkmd5 md5] | [-dumplinks] | [-stats]");
400             return;
401 
402         }
403 
404         int i = 0;
405         NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
406         File dbDir = new File(argv[i++]);
407         WebDBReader reader = new WebDBReader(nfs, dbDir);
408         try {
409             String   cmd = argv[i++];
410 
411             if ("-pageurl".equals(cmd)) {
412                 String   url = argv[i++];
413                 System.out.println(reader.getPage(url.trim()));
414             } else if ("-pagemd5".equals(cmd)) {
415                 MD5Hash md5 = new MD5Hash(argv[i++]);
416                 Page pages[] = reader.getPages(md5);
417                 System.out.println("Found " + pages.length + " pages.");
418                 for (int j = 0; j < pages.length; j++) {
419                     System.out.println("Page " + j + ": " + pages[j]);
420                 }
421             } else if ("-dumppageurl".equals(cmd)) {
422                 System.out.println(reader);
423                 System.out.println();
424                 int j = 1;
425                 for (Enumeration e = reader.pages(); e.hasMoreElements(); j++) {
426                     Page page = (Page) e.nextElement();
427                     System.out.println("Page " + j + ": " + page);
428                     System.out.println();
429                 }
430             } else if ("-dumppagemd5".equals(cmd)) {
431                 System.out.println(reader);
432                 System.out.println();
433                 int j = 1;
434                 for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); j++) {
435                     Page page = (Page) e.nextElement();
436                     System.out.println("Page " + j + ": " + page);
437                     System.out.println();
438                 }
439             } else if ("-toppages".equals(cmd)) {
440                 int topSize = Integer.parseInt(argv[i++]);
441 
442                 // Create a sorted list
443                 SortedSet topSet = new TreeSet(new Comparator() {
444                     public int compare(Object   o1, Object   o2) {
445                         Page p1 = (Page) o1;
446                         Page p2 = (Page) o2;
447                         if (p1.getScore() < p2.getScore()) {
448                             return -1;
449                         } else if (p1.getScore() == p2.getScore()) {
450                             // If two scores are equal, we will
451                             // use regular Page comparison (which
452                             // uses URL as the primary key).  We
453                             // don't want to uniquify by score!
454                             return p1.compareTo(p2);
455                         } else {
456                             return 1;
457                         }
458                     }
459                 }
460                     );
461 
462                 // Find the top "topSize" elts
463                 Page lowestPage = null;
464                 for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
465                     Page curPage = (Page) e.nextElement();
466                     if (topSet.size() < topSize) {
467                         topSet.add(curPage);
468                         lowestPage = (Page) topSet.first();
469                     } else if (lowestPage.getScore() < curPage.getScore()) {
470                         topSet.remove(lowestPage);
471                         topSet.add(curPage);
472                         lowestPage = (Page) topSet.first();
473                     }
474                 }
475             
476                 // Print them out
477                 int j = 0;
478                 for (Iterator it = topSet.iterator(); it.hasNext(); j++) {
479                     System.out.println("Page " + j + ": " + (Page) it.next());
480                     System.out.println();
481                 }
482             } else if ("-linkurl".equals(cmd)) {
483                 String   url = argv[i++];
484                 Link links[] = reader.getLinks(new UTF8(url.trim()));
485                 System.out.println("Found " + links.length + " links.");
486                 for (int j = 0; j < links.length; j++) {
487                     System.out.println("Link " + j + ": " + links[j]);
488                 }
489             } else if ("-linkmd5".equals(cmd)) {
490                 MD5Hash fromID = new MD5Hash(argv[i++]);
491                 Link links[] = reader.getLinks(fromID);
492                 System.out.println("Found " + links.length + " links.");
493                 for (int j = 0; j < links.length; j++) {
494                     System.out.println("Link " + j + ": " + links[j]);
495                 }
496             } else if ("-dumplinks".equals(cmd)) {
497                 System.out.println(reader);
498                 System.out.println();
499                 Enumeration e = reader.pagesByMD5();
500                 while (e.hasMoreElements()) {
501                   Page page = (Page) e.nextElement();
502                   Link[] links = reader.getLinks(page.getMD5());
503                   if (links.length > 0) {
504                     System.out.println("from " + page.getURL());
505                     for (int j = 0; j < links.length; j++) {
506                       System.out.println(" to " + links[j].getURL());
507                     }
508                     System.out.println();
509                   }
510                 }
511             } else if ("-stats".equals(cmd)) {
512                 System.out.println("Stats for " + reader);
513                 System.out.println("-------------------------------");
514                 System.out.println("Number of pages: " + reader.numPages());
515                 System.out.println("Number of links: " + reader.numLinks());
516             } else {
517                 System.out.println("Sorry, no command with name " + cmd);
518             }
519         } finally {
520             reader.close();
521             nfs.close();
522         }
523     }
524 }
525
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags