KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > db > DBSectionReader


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3 package net.nutch.db;
4
5 import java.io.*;
6 import java.util.*;
7
8 import net.nutch.io.*;
9 import net.nutch.fs.*;
10 import net.nutch.util.*;
11
12 /**********************************************************
13  * DBSectionReader reads a discrete portion of a WebDB.
14  * It may implement its methods with either a local
15  * MapFile.Reader object or (eventually) a remote-
16  * machine network interface. For the moment, we
17  * do only the MapFile.Reader implementation (much of
18  * the code for this was moved from the earlier
19  * pre-distributed version of WebDBReadaer).
20  *
21  * @author Mike Cafarella
22  ***********************************************/

23 public class DBSectionReader {
24     NutchFileSystem nfs;
25     File sectionFile;
26     WritableComparator comparator;
27     MapFile.Reader reader;
28
29     /**
30      * Right now we assume we're getting a File that is a
31      * MapFile.Reader directory. But in the future we could
32      * also check for existence of a "remote-network" file, similar
33      * to the way we do now for distributed index reading.
34      * Then, we would either create a MapFile.Reader or a network
35      * client for one.
36      */

37     public DBSectionReader(NutchFileSystem nfs, File sectionFile, WritableComparator comparator) throws IOException {
38         this.nfs = nfs;
39         this.sectionFile = sectionFile;
40         this.comparator = comparator;
41         this.reader = new MapFile.Reader(nfs, sectionFile.getPath(), comparator);
42     }
43
44     /**
45      * Fetch a Page with the given URL, and fill it into
46      * the pre-allocated Page 'p'.
47      */

48     public Page getPage(UTF8 url, Page p) throws IOException {
49         return (Page) reader.get(url, p);
50     }
51
52     /**
53      * Get Pages from the db according to their
54      * content hash.
55      */

56     public Vector getPages(MD5Hash md5) throws IOException {
57         Vector records = new Vector(3);
58         Page p = new Page();
59         p.getMD5().set(md5);
60
61         reader.seek(p);
62         while (reader.next(p, NullWritable.get())) {
63             if (p.getMD5().compareTo(md5) == 0) {
64                 records.add(p);
65                 p = new Page();
66             } else {
67                 break;
68             }
69         }
70
71         return records;
72     }
73
74     /**
75      * Test whether a certain piece of content is in the
76      * db, but don't bother returning it.
77      */

78     public boolean pageExists(MD5Hash md5) throws IOException {
79         Page p = new Page();
80         p.getMD5().set(md5);
81         reader.seek(p);
82         if (reader.next(p, NullWritable.get()) && p.getMD5().compareTo(md5) == 0) {
83             return true;
84         } else {
85             return false;
86         }
87     }
88
89     /**
90      * Iterate through all the Pages, sorted by URL
91      */

92     public Enumeration pages() throws IOException {
93         return new TableEnumerator(new MapFile.Reader(nfs, sectionFile.getPath(), comparator));
94     }
95
96     //
97
// The TableEnumerator goes through all the entries
98
// in the Table (which is a MapFile).
99
//
100
class TableEnumerator implements Enumeration {
101         MapFile.Reader reader;
102         Page nextItem;
103
104         /**
105          * Start the cursor and find the first item.
106          * Store it for later return.
107          */

108         public TableEnumerator(MapFile.Reader reader) {
109             this.reader = reader;
110             this.nextItem = new Page();
111             try {
112                 if (! reader.next(new UTF8(), this.nextItem)) {
113                     this.nextItem = null;
114                 }
115             } catch (IOException ie) {
116                 ie.printStackTrace();
117                 this.nextItem = null;
118             }
119         }
120
121         /**
122          * If there's no item left in store, we've hit the end.
123          */

124         public boolean hasMoreElements() {
125             return (nextItem != null);
126         }
127
128         /**
129          * Set aside the item we have in store. Then retrieve
130          * another for the next time we're called. Finally, return
131          * the set-aside item.
132          */

133         public Object JavaDoc nextElement() {
134             if (nextItem == null) {
135                 throw new NoSuchElementException("PageDB Enumeration");
136             }
137             Page toReturn = nextItem;
138             this.nextItem = new Page();
139             try {
140                 if (! reader.next(new UTF8(), nextItem)) {
141                     this.nextItem = null;
142                 }
143             } catch (IOException ie) {
144                 this.nextItem = null;
145             }
146             return toReturn;
147         }
148     }
149
150     /**
151      * Iterate through all the Pages, sorted by MD5
152      */

153     public Enumeration pagesByMD5() throws IOException {
154         return new IndexEnumerator(new SetFile.Reader(nfs, sectionFile.getPath(), comparator));
155     }
156
157     //
158
// The IndexEnumerator goes through all the entries
159
// in the index (which is a SequenceFile).
160
//
161
class IndexEnumerator implements Enumeration {
162         SetFile.Reader reader;
163         Page nextItem;
164
165         /**
166          * Start the cursor and find the first item.
167          * Store it for later return.
168          */

169         public IndexEnumerator(SetFile.Reader reader) {
170             this.reader = reader;
171             this.nextItem = new Page();
172             try {
173                 if (! reader.next(nextItem)) {
174                     this.nextItem = null;
175                 }
176             } catch (IOException ie) {
177                 this.nextItem = null;
178             }
179         }
180
181         /**
182          * If there's no item left in store, we've hit the end.
183          */

184         public boolean hasMoreElements() {
185             return (nextItem != null);
186         }
187
188         /**
189          * Set aside the item we have in store. Then retrieve
190          * another for the next time we're called. Finally, return
191          * the set-aside item.
192          */

193         public Object JavaDoc nextElement() {
194             if (nextItem == null) {
195                 throw new NoSuchElementException("PageDB Enumeration");
196             }
197
198             Page toReturn = nextItem;
199             this.nextItem = new Page();
200             try {
201                 if (! reader.next(nextItem)) {
202                     this.nextItem = null;
203                 }
204             } catch (IOException ie) {
205                 this.nextItem = null;
206             }
207             return toReturn;
208         }
209     }
210
211     /**
212      * Get all the hyperlinks that link TO the indicated URL.
213      */

214     public Vector getLinks(UTF8 url) throws IOException {
215         Vector records = new Vector(3);
216         Link l = new Link();
217         l.getURL().set(url);
218
219         reader.seek(l);
220         while (reader.next(l, NullWritable.get())) {
221             if (url.equals(l.getURL())) {
222                 records.add(l);
223                 l = new Link();
224             } else {
225                 break;
226             }
227         }
228         
229         return records;
230     }
231
232     /**
233      * Grab all the links from the given MD5 hash.
234      */

235     public Vector getLinks(MD5Hash md5) throws IOException {
236         Vector records = new Vector(3);
237         Link l = new Link();
238         l.getFromID().set(md5);
239
240         reader.seek(l);
241         while (reader.next(l, NullWritable.get())) {
242             if (md5.equals(l.getFromID())) {
243                 records.add(l);
244                 l = new Link();
245             } else {
246                 break;
247             }
248         }
249         
250         return records;
251     }
252
253     /**
254      * Return all the links, by target URL
255      */

256     public Enumeration links() throws IOException {
257         return new MapEnumerator(new MapFile.Reader(nfs, sectionFile.getPath(), comparator));
258     }
259     
260     //
261
// Here's the class for the above function
262
//
263
class MapEnumerator implements Enumeration {
264         MapFile.Reader reader;
265         Link nextItem;
266
267         /**
268          * Start the cursor and find the first item.
269          * Store it for later return.
270          */

271         public MapEnumerator(MapFile.Reader reader) {
272             this.reader = reader;
273             this.nextItem = new Link();
274             try {
275                 if (! reader.next(this.nextItem, NullWritable.get())) {
276                     this.nextItem = null;
277                 }
278             } catch (IOException ie) {
279                 this.nextItem = null;
280             }
281         }
282
283         /**
284          * If there's no item left in store, we've hit the end.
285          */

286         public boolean hasMoreElements() {
287             return (nextItem != null);
288         }
289
290         /**
291          * Set aside the item we have in store. Then retrieve
292          * another for the next time we're called. Finally, return
293          * the set-aside item.
294          */

295         public Object JavaDoc nextElement() {
296             if (nextItem == null) {
297                 throw new NoSuchElementException("PageDB Enumeration");
298             }
299
300             Link toReturn = nextItem;
301             this.nextItem = new Link();
302             try {
303                 if (! reader.next(nextItem, NullWritable.get())) {
304                     this.nextItem = null;
305                 }
306             } catch (IOException ie) {
307                 this.nextItem = null;
308             }
309             return toReturn;
310         }
311     }
312
313     /**
314      */

315     public void close() throws IOException {
316         reader.close();
317     }
318 }
319
Popular Tags