KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > db > WebDBReader


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3 package net.nutch.db;
4
5 import java.io.*;
6 import java.util.*;
7 import java.nio.channels.*;
8
9 import net.nutch.io.*;
10 import net.nutch.fs.*;
11 import net.nutch.util.*;
12 import net.nutch.pagedb.*;
13 import net.nutch.linkdb.*;
14
15 /**********************************************
16  * The WebDBReader implements all the read-only
17  * parts of accessing our web database.
18  * All the writing ones can be found in WebDBWriter.
19  *
20  * @author Mike Cafarella
21  **********************************************/

22 public class WebDBReader implements IWebDBReader {
23     static final Page[] PAGE_RECORDS = new Page[0];
24     static final Link[] LINK_RECORDS = new Link[0];
25
26     // filenames
27
static final String JavaDoc PAGES_BY_URL = "pagesByURL";
28     static final String JavaDoc PAGES_BY_MD5 = "pagesByMD5";
29     static final String JavaDoc LINKS_BY_URL = "linksByURL";
30     static final String JavaDoc LINKS_BY_MD5 = "linksByMD5";
31     static final String JavaDoc STATS_FILE = "stats";
32
33     NutchFileSystem nfs;
34     File dbDir, dbFile;
35     MapFile.Reader pagesByURL, pagesByMD5, linksByURL, linksByMD5;
36     long totalPages = 0, totalLinks = 0;
37     Vector mapReaders = null, setReaders = null;
38     FileInputStream dbReadLockData;
39     FileLock dbReadLock;
40
41     /**
42      * Open a web db reader for the named directory.
43      */

44     public WebDBReader(NutchFileSystem nfs, File dbDir) throws IOException, FileNotFoundException {
45         this.nfs = nfs;
46         this.dbDir = dbDir;
47         this.dbFile = new File(dbDir, "webdb");
48
49         // Obtain read lock on db so writers don't try to
50
// move it out from under us. This obtains a non-exclusive
51
// lock on the directory that holds the dbs (old and new)
52
nfs.lock(new File(dbDir, "dbreadlock"), true);
53
54         this.pagesByURL = new MapFile.Reader(nfs, new File(dbFile, PAGES_BY_URL).getPath(), new UTF8.Comparator());
55         this.pagesByMD5 = new MapFile.Reader(nfs, new File(dbFile, PAGES_BY_MD5).getPath(), new Page.Comparator());
56
57         this.linksByURL = new MapFile.Reader(nfs, new File(dbFile, LINKS_BY_URL).getPath(), new Link.UrlComparator());
58         this.linksByMD5 = new MapFile.Reader(nfs, new File(dbFile, LINKS_BY_MD5).getPath(), new Link.MD5Comparator());
59
60         // Load in statistics
61
File stats = new File(dbFile, STATS_FILE);
62         if (nfs.exists(stats)) {
63             DataInputStream in = new DataInputStream(nfs.open(stats));
64             try {
65                 int version = (byte) in.read();
66                 this.totalPages = in.readLong();
67                 this.totalLinks = in.readLong();
68             } finally {
69                 in.close();
70             }
71         }
72
73         // Create vectors so we can GC readers used by
74
// enum() calls. We do this so we can have multiple
75
// simultaneous enum users. However, since we keep
76
// a handle to each one, we're assuming that we don't
77
// create too many before WebDBReader.close() is called.
78
this.mapReaders = new Vector();
79         this.setReaders = new Vector();
80     }
81
82     /**
83      * Shutdown
84      */

85     public void close() throws IOException {
86         pagesByURL.close();
87         pagesByMD5.close();
88         linksByURL.close();
89         linksByMD5.close();
90
91         for (Enumeration e = mapReaders.elements(); e.hasMoreElements(); ) {
92             MapFile.Reader tmp = (MapFile.Reader) e.nextElement();
93             tmp.close();
94         }
95         for (Enumeration e = setReaders.elements(); e.hasMoreElements(); ) {
96             SetFile.Reader tmp = (SetFile.Reader) e.nextElement();
97             tmp.close();
98         }
99
100         // release the lock
101
nfs.release(new File(dbDir, "dbreadlock"));
102     }
103
104     /**
105      * Get Page from the pagedb with the given URL
106      */

107     public Page getPage(String JavaDoc url) throws IOException {
108         return (Page) pagesByURL.get(new UTF8(url), new Page());
109     }
110
111     /**
112      * Get Pages from the pagedb according to their
113      * content hash.
114      */

115     public Page[] getPages(MD5Hash md5) throws IOException {
116         Vector records = new Vector(3);
117         Page p = new Page();
118         p.getMD5().set(md5);
119
120         pagesByMD5.seek(p);
121         while (pagesByMD5.next(p, NullWritable.get())) {
122             if (p.getMD5().compareTo(md5) == 0) {
123                 records.add(p);
124                 p = new Page();
125             } else {
126                 break;
127             }
128         }
129
130         // Xfer from the vector into an array
131
return (Page[]) records.toArray(PAGE_RECORDS);
132     }
133
134     /**
135      * Test whether a certain piece of content is in the
136      * database, but don't bother returning the Page(s) itself.
137      */

138     public boolean pageExists(MD5Hash md5) throws IOException {
139         Page p = new Page();
140         p.getMD5().set(md5);
141         pagesByMD5.seek(p);
142         if (pagesByMD5.next(p, NullWritable.get()) && p.getMD5().compareTo(md5) == 0) {
143             return true;
144         } else {
145             return false;
146         }
147     }
148
149     /**
150      * Iterate through all the Pages, sorted by URL
151      */

152     public Enumeration pages() throws IOException {
153         MapFile.Reader tmpReader = new MapFile.Reader(nfs, new File(dbFile, "pagesByURL").getPath());
154         mapReaders.add(tmpReader);
155         return new TableEnumerator(tmpReader);
156     }
157
158     //
159
// The TableEnumerator goes through all the entries
160
// in the Table (which is a MapFile).
161
//
162
class TableEnumerator implements Enumeration {
163         MapFile.Reader reader;
164         Page nextItem;
165
166         /**
167          * Start the cursor and find the first item.
168          * Store it for later return.
169          */

170         public TableEnumerator(MapFile.Reader reader) {
171             this.reader = reader;
172             this.nextItem = new Page();
173             try {
174                 if (! reader.next(new UTF8(), this.nextItem)) {
175                     this.nextItem = null;
176                 }
177             } catch (IOException ie) {
178                 this.nextItem = null;
179             }
180         }
181
182         /**
183          * If there's no item left in store, we've hit the end.
184          */

185         public boolean hasMoreElements() {
186             return (nextItem != null);
187         }
188
189         /**
190          * Set aside the item we have in store. Then retrieve
191          * another for the next time we're called. Finally, return
192          * the set-aside item.
193          */

194         public Object JavaDoc nextElement() {
195             if (nextItem == null) {
196                 throw new NoSuchElementException("PageDB Enumeration");
197             }
198             Page toReturn = nextItem;
199             this.nextItem = new Page();
200             try {
201                 if (! reader.next(new UTF8(), nextItem)) {
202                     this.nextItem = null;
203                 }
204             } catch (IOException ie) {
205                 this.nextItem = null;
206             }
207             return toReturn;
208         }
209     }
210
211
212     /**
213      * Iterate through all the Pages, sorted by MD5
214      */

215     public Enumeration pagesByMD5() throws IOException {
216         SetFile.Reader tmpReader = new SetFile.Reader(nfs, new File(dbFile, "pagesByMD5").getPath());
217         setReaders.add(tmpReader);
218         return new IndexEnumerator(tmpReader);
219     }
220
221     /**
222      * Return the number of pages we're dealing with
223      */

224     public long numPages() {
225         return totalPages;
226     }
227
228     //
229
// The IndexEnumerator goes through all the entries
230
// in the index (which is a SequenceFile).
231
//
232
class IndexEnumerator implements Enumeration {
233         SetFile.Reader reader;
234         Page nextItem;
235
236         /**
237          * Start the cursor and find the first item.
238          * Store it for later return.
239          */

240         public IndexEnumerator(SetFile.Reader reader) {
241             this.reader = reader;
242             this.nextItem = new Page();
243             try {
244                 if (! reader.next(nextItem)) {
245                     this.nextItem = null;
246                 }
247             } catch (IOException ie) {
248                 this.nextItem = null;
249             }
250         }
251
252         /**
253          * If there's no item left in store, we've hit the end.
254          */

255         public boolean hasMoreElements() {
256             return (nextItem != null);
257         }
258
259         /**
260          * Set aside the item we have in store. Then retrieve
261          * another for the next time we're called. Finally, return
262          * the set-aside item.
263          */

264         public Object JavaDoc nextElement() {
265             if (nextItem == null) {
266                 throw new NoSuchElementException("PageDB Enumeration");
267             }
268
269             Page toReturn = nextItem;
270             this.nextItem = new Page();
271             try {
272                 if (! reader.next(nextItem)) {
273                     this.nextItem = null;
274                 }
275             } catch (IOException ie) {
276                 this.nextItem = null;
277             }
278             return toReturn;
279         }
280     }
281
282     /**
283      * Get all the hyperlinks that link TO the indicated URL.
284      */

285     public Link[] getLinks(UTF8 url) throws IOException {
286         Vector records = new Vector(3);
287         Link l = new Link();
288         l.getURL().set(url);
289
290         linksByURL.seek(l);
291         while (linksByURL.next(l, NullWritable.get())) {
292             if (url.equals(l.getURL())) {
293                 records.add(l);
294                 l = new Link();
295             } else {
296                 break;
297             }
298         }
299         
300         // Xfer from the vector into an array
301
return (Link[]) records.toArray(LINK_RECORDS);
302     }
303
304     /**
305      * Grab all the links from the given MD5 hash.
306      */

307     public Link[] getLinks(MD5Hash md5) throws IOException {
308         Vector records = new Vector(3);
309         Link l = new Link();
310         l.getFromID().set(md5);
311
312         linksByMD5.seek(l);
313         while (linksByMD5.next(l, NullWritable.get())) {
314             if (md5.equals(l.getFromID())) {
315                 records.add(l);
316                 l = new Link();
317             } else {
318                 break;
319             }
320         }
321         
322         // Xfer from the vector into an array
323
return (Link[]) records.toArray(LINK_RECORDS);
324     }
325
326     /**
327      * Return all the links, by target URL
328      */

329     public Enumeration links() {
330         return new MapEnumerator(linksByURL);
331     }
332
333     /**
334      * Return the number of links in our db.
335      */

336     public long numLinks() {
337         return totalLinks;
338     }
339
340     //
341
// Here's the class for the above function
342
//
343
class MapEnumerator implements Enumeration {
344         MapFile.Reader reader;
345         Link nextItem;
346
347         /**
348          * Start the cursor and find the first item.
349          * Store it for later return.
350          */

351         public MapEnumerator(MapFile.Reader reader) {
352             this.reader = reader;
353             this.nextItem = new Link();
354             try {
355                 if (! reader.next(this.nextItem, NullWritable.get())) {
356                     this.nextItem = null;
357                 }
358             } catch (IOException ie) {
359                 this.nextItem = null;
360             }
361         }
362
363         /**
364          * If there's no item left in store, we've hit the end.
365          */

366         public boolean hasMoreElements() {
367             return (nextItem != null);
368         }
369
370         /**
371          * Set aside the item we have in store. Then retrieve
372          * another for the next time we're called. Finally, return
373          * the set-aside item.
374          */

375         public Object JavaDoc nextElement() {
376             if (nextItem == null) {
377                 throw new NoSuchElementException("PageDB Enumeration");
378             }
379
380             Link toReturn = nextItem;
381             this.nextItem = new Link();
382             try {
383                 if (! reader.next(nextItem, NullWritable.get())) {
384                     this.nextItem = null;
385                 }
386             } catch (IOException ie) {
387                 this.nextItem = null;
388             }
389             return toReturn;
390         }
391     }
392
393     /**
394      * The WebDBReader.main() provides some handy utility methods
395      * for looking through the contents of the webdb. Hoo-boy!
396      */

397     public static void main(String JavaDoc argv[]) throws FileNotFoundException, IOException {
398         if (argv.length < 2) {
399             System.out.println("Usage: java net.nutch.db.WebDBReader (-local | -ndfs <namenode:port>) <db> [-pageurl url] | [-pagemd5 md5] | [-dumppageurl] | [-dumppagemd5] | [-toppages <k>] | [-linkurl url] | [-linkmd5 md5] | [-dumplinks] | [-stats]");
400             return;
401
402         }
403
404         int i = 0;
405         NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
406         File dbDir = new File(argv[i++]);
407         WebDBReader reader = new WebDBReader(nfs, dbDir);
408         try {
409             String JavaDoc cmd = argv[i++];
410
411             if ("-pageurl".equals(cmd)) {
412                 String JavaDoc url = argv[i++];
413                 System.out.println(reader.getPage(url.trim()));
414             } else if ("-pagemd5".equals(cmd)) {
415                 MD5Hash md5 = new MD5Hash(argv[i++]);
416                 Page pages[] = reader.getPages(md5);
417                 System.out.println("Found " + pages.length + " pages.");
418                 for (int j = 0; j < pages.length; j++) {
419                     System.out.println("Page " + j + ": " + pages[j]);
420                 }
421             } else if ("-dumppageurl".equals(cmd)) {
422                 System.out.println(reader);
423                 System.out.println();
424                 int j = 1;
425                 for (Enumeration e = reader.pages(); e.hasMoreElements(); j++) {
426                     Page page = (Page) e.nextElement();
427                     System.out.println("Page " + j + ": " + page);
428                     System.out.println();
429                 }
430             } else if ("-dumppagemd5".equals(cmd)) {
431                 System.out.println(reader);
432                 System.out.println();
433                 int j = 1;
434                 for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); j++) {
435                     Page page = (Page) e.nextElement();
436                     System.out.println("Page " + j + ": " + page);
437                     System.out.println();
438                 }
439             } else if ("-toppages".equals(cmd)) {
440                 int topSize = Integer.parseInt(argv[i++]);
441
442                 // Create a sorted list
443
SortedSet topSet = new TreeSet(new Comparator() {
444                     public int compare(Object JavaDoc o1, Object JavaDoc o2) {
445                         Page p1 = (Page) o1;
446                         Page p2 = (Page) o2;
447                         if (p1.getScore() < p2.getScore()) {
448                             return -1;
449                         } else if (p1.getScore() == p2.getScore()) {
450                             // If two scores are equal, we will
451
// use regular Page comparison (which
452
// uses URL as the primary key). We
453
// don't want to uniquify by score!
454
return p1.compareTo(p2);
455                         } else {
456                             return 1;
457                         }
458                     }
459                 }
460                     );
461
462                 // Find the top "topSize" elts
463
Page lowestPage = null;
464                 for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
465                     Page curPage = (Page) e.nextElement();
466                     if (topSet.size() < topSize) {
467                         topSet.add(curPage);
468                         lowestPage = (Page) topSet.first();
469                     } else if (lowestPage.getScore() < curPage.getScore()) {
470                         topSet.remove(lowestPage);
471                         topSet.add(curPage);
472                         lowestPage = (Page) topSet.first();
473                     }
474                 }
475             
476                 // Print them out
477
int j = 0;
478                 for (Iterator it = topSet.iterator(); it.hasNext(); j++) {
479                     System.out.println("Page " + j + ": " + (Page) it.next());
480                     System.out.println();
481                 }
482             } else if ("-linkurl".equals(cmd)) {
483                 String JavaDoc url = argv[i++];
484                 Link links[] = reader.getLinks(new UTF8(url.trim()));
485                 System.out.println("Found " + links.length + " links.");
486                 for (int j = 0; j < links.length; j++) {
487                     System.out.println("Link " + j + ": " + links[j]);
488                 }
489             } else if ("-linkmd5".equals(cmd)) {
490                 MD5Hash fromID = new MD5Hash(argv[i++]);
491                 Link links[] = reader.getLinks(fromID);
492                 System.out.println("Found " + links.length + " links.");
493                 for (int j = 0; j < links.length; j++) {
494                     System.out.println("Link " + j + ": " + links[j]);
495                 }
496             } else if ("-dumplinks".equals(cmd)) {
497                 System.out.println(reader);
498                 System.out.println();
499                 Enumeration e = reader.pagesByMD5();
500                 while (e.hasMoreElements()) {
501                   Page page = (Page) e.nextElement();
502                   Link[] links = reader.getLinks(page.getMD5());
503                   if (links.length > 0) {
504                     System.out.println("from " + page.getURL());
505                     for (int j = 0; j < links.length; j++) {
506                       System.out.println(" to " + links[j].getURL());
507                     }
508                     System.out.println();
509                   }
510                 }
511             } else if ("-stats".equals(cmd)) {
512                 System.out.println("Stats for " + reader);
513                 System.out.println("-------------------------------");
514                 System.out.println("Number of pages: " + reader.numPages());
515                 System.out.println("Number of links: " + reader.numLinks());
516             } else {
517                 System.out.println("Sorry, no command with name " + cmd);
518             }
519         } finally {
520             reader.close();
521             nfs.close();
522         }
523     }
524 }
525
Popular Tags