KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > db > Page


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.db;
5
6 import java.io.*;
7 import java.net.*;
8 import java.util.*;
9
10 import net.nutch.io.*;
11 import net.nutch.util.*;
12 import net.nutch.net.UrlNormalizerFactory;
13
14 /*********************************************
15  * A row in the Page Database.
16  * <pre>
17  * type name description
18  * ---------------------------------------------------------------
19  * byte VERSION - A byte indicating the version of this entry.
20  * String URL - The url of a page. This is the primary key.
21  * 128bit ID - The MD5 hash of the contents of the page.
22  * 64bit DATE - The date this page should be refetched.
23  * byte RETRIES - The number of times we've failed to fetch this page.
24  * byte INTERVAL - Frequency, in days, this page should be refreshed.
25  * float SCORE - Multiplied into the score for hits on this page.
26  * float NEXTSCORE - Multiplied into the score for hits on this page.
27  * </pre>
28  *
29  * @author Mike Cafarella
30  * @author Doug Cutting
31  *********************************************/

32 public class Page implements WritableComparable, Cloneable JavaDoc {
33   private final static byte CUR_VERSION = 4;
34
35   private static final byte DEFAULT_INTERVAL =
36     (byte)NutchConf.getInt("db.default.fetch.interval", 30);
37
38   private UTF8 url;
39   private MD5Hash md5;
40   private long nextFetch = System.currentTimeMillis();
41   private byte retries;
42   private byte fetchInterval = DEFAULT_INTERVAL;
43   private int numOutlinks;
44   private float score = 1.0f;
45   private float nextScore = 1.0f;
46
47   /** Construct a page ready to be read by {@link
48    * #readFields(DataInput)}.*/

49   public Page() {
50     url = new UTF8(); // initialize for readFields()
51
md5 = new MD5Hash(); // initialize for readFields()
52
}
53
54   /** Construct a new, default page, due to be fetched. */
55   public Page(String JavaDoc urlString, MD5Hash md5) throws MalformedURLException {
56     setURL(urlString);
57     this.md5 = md5;
58   }
59
60   public Page(String JavaDoc urlString, float score)
61     throws MalformedURLException {
62     this(urlString, score, score, System.currentTimeMillis());
63   }
64     
65   public Page(String JavaDoc urlString, float score, long nextFetch)
66     throws MalformedURLException {
67     this(urlString, score, score, nextFetch);
68   }
69
70   public Page(String JavaDoc urlString, float score, float nextScore, long nextFetch)
71     throws MalformedURLException {
72     setURL(urlString);
73     this.md5 = MD5Hash.digest(url); // hash url, by default
74
this.score = score;
75     this.nextScore = nextScore;
76     this.nextFetch = nextFetch;
77   }
78
79   public void readFields(DataInput in) throws IOException {
80     byte version = in.readByte(); // read version
81
if (version > CUR_VERSION) // check version
82
throw new VersionMismatchException(CUR_VERSION, version);
83
84     url.readFields(in);
85     md5.readFields(in);
86     nextFetch = in.readLong();
87     retries = in.readByte();
88     fetchInterval = in.readByte();
89     numOutlinks = (version > 2) ? in.readInt() : 0; // added in Version 3
90
score = (version>1) ? in.readFloat() : 1.0f; // score added in version 2
91
nextScore = (version>3) ? in.readFloat() : 1.0f; // 2nd score added in V4
92
}
93
94   /** Copy the contents of another instance into this instance. */
95   public void set(Page that) {
96     this.url.set(that.url);
97     this.md5.set(that.md5);
98     this.nextFetch = that.nextFetch;
99     this.retries = that.retries;
100     this.fetchInterval = that.fetchInterval;
101     this.numOutlinks = that.numOutlinks;
102     this.score = that.score;
103     this.nextScore = that.nextScore;
104   }
105
106   /**
107    * Write the bytes out to the bytestream
108    */

109   public void write(DataOutput out) throws IOException {
110     out.writeByte(CUR_VERSION); // store current version
111
url.write(out);
112     md5.write(out);
113     out.writeLong(nextFetch);
114     out.write(retries);
115     out.write(fetchInterval);
116     out.writeInt(numOutlinks);
117     out.writeFloat(score);
118     out.writeFloat(nextScore);
119   }
120
121     /**
122      * Compare to another Page object
123      */

124     public int compareTo(Object JavaDoc o) {
125         int md5Result = this.md5.compareTo(((Page) o).md5);
126         if (md5Result != 0) {
127             return md5Result;
128         }
129         return this.url.compareTo(((Page) o).url);
130     }
131
132
133   /** Compares pages by MD5, then by URL. */
134   public static class Comparator extends WritableComparator {
135     public Comparator() { super(Page.class); }
136     
137     /** Optimized comparator. */
138     public int compare(byte[] b1, int s1, int l1,
139                        byte[] b2, int s2, int l2) {
140       int urlLen1 = readUnsignedShort(b1, s1+1); // skip version byte
141
int urlLen2 = readUnsignedShort(b2, s2+1);
142       int urlStart1 = s1+1+2;
143       int urlStart2 = s2+1+2;
144       int md5Start1 = urlStart1 + urlLen1;
145       int md5Start2 = urlStart2 + urlLen2;
146       int c = compareBytes(b1, md5Start1, MD5Hash.MD5_LEN, // compare md5
147
b2, md5Start2, MD5Hash.MD5_LEN);
148       if (c != 0)
149         return c;
150       return compareBytes(b1, urlStart1, urlLen1, b2, urlStart2, urlLen2);
151     }
152   }
153
154   /** Compares pages by URL only. */
155   public static class UrlComparator extends WritableComparator {
156     public UrlComparator() { super(Page.class); }
157     
158     public int compare(WritableComparable a, WritableComparable b) {
159       Page pageA = (Page)a;
160       Page pageB = (Page)b;
161       
162       return pageA.getURL().compareTo(pageB.getURL());
163     }
164
165
166     /** Optimized comparator. */
167     public int compare(byte[] b1, int s1, int l1,
168                        byte[] b2, int s2, int l2) {
169       int urlLen1 = readUnsignedShort(b1, s1+1); // skip version byte
170
int urlLen2 = readUnsignedShort(b2, s2+1);
171       int urlStart1 = s1+1+2;
172       int urlStart2 = s2+1+2;
173       return compareBytes(b1, urlStart1, urlLen1, b2, urlStart2, urlLen2);
174     }
175   }
176
177   public static Page read(DataInput in) throws IOException {
178     Page page = new Page();
179     page.readFields(in);
180     return page;
181   }
182
183   //
184
// Accessor methods
185
//
186
public UTF8 getURL() { return url; }
187   public void setURL(String JavaDoc url) throws MalformedURLException {
188     this.url = new UTF8(UrlNormalizerFactory.getNormalizer().normalize(url));
189   }
190
191   public MD5Hash getMD5() { return md5; }
192   public void setMD5(MD5Hash md5) { this.md5 = md5; }
193
194   public long getNextFetchTime() { return nextFetch; }
195   public void setNextFetchTime(long nextFetch) { this.nextFetch = nextFetch; }
196
197   public byte getRetriesSinceFetch() { return retries; }
198   public void setRetriesSinceFetch(int retries) {this.retries = (byte)retries;}
199
200   public byte getFetchInterval() { return fetchInterval; }
201   public void setFetchInterval(byte fetchInterval) {
202     this.fetchInterval = fetchInterval;
203   }
204
205   public int getNumOutlinks() { return numOutlinks; }
206   public void setNumOutlinks(int numOutlinks) {
207     this.numOutlinks = numOutlinks;
208   }
209
210   public float getScore() { return score; }
211   public float getNextScore() { return nextScore; }
212   public void setScore(float score, float nextScore) {
213     this.score = score;
214     this.nextScore = nextScore;
215   }
216
217   /**
218    * Compute domain ID from URL
219    */

220   public long computeDomainID() throws MalformedURLException {
221     return MD5Hash.digest(new URL(url.toString()).getHost()).halfDigest();
222   }
223
224
225   /**
226    * Print out the Page
227    */

228   public String JavaDoc toString() {
229     StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
230     buf.append("Version: " + CUR_VERSION + "\n");
231     buf.append("URL: " + getURL() + "\n");
232     buf.append("ID: " + getMD5() + "\n");
233     buf.append("Next fetch: " + new Date(getNextFetchTime()) + "\n");
234     buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");
235     buf.append("Retry interval: " + getFetchInterval() + " days\n");
236     buf.append("Num outlinks: " + getNumOutlinks() + "\n");
237     buf.append("Score: " + getScore() + "\n");
238     buf.append("NextScore: " + getNextScore() + "\n");
239     return buf.toString();
240   }
241
242   /**
243    * A tab-delimited text version of the Page's data.
244    */

245   public String JavaDoc toTabbedString() {
246       StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
247       buf.append(CUR_VERSION); buf.append("\t");
248       buf.append(getURL()); buf.append("\t");
249       buf.append(getMD5()); buf.append("\t");
250       buf.append(getNextFetchTime()); buf.append("\t");
251       buf.append(getRetriesSinceFetch()); buf.append("\t");
252       buf.append(getFetchInterval()); buf.append("\t");
253       buf.append(getNumOutlinks()); buf.append("\t");
254       buf.append(getScore()); buf.append("\t");
255       buf.append(getNextScore()); buf.append("\t");
256       return buf.toString();
257   }
258
259   public boolean equals(Object JavaDoc o) {
260     if (!(o instanceof Page))
261       return false;
262     Page other = (Page)o;
263     return
264       this.url.equals(other.url) &&
265       this.md5.equals(other.md5) &&
266       (this.nextFetch == other.nextFetch) &&
267       (this.retries == other.retries) &&
268       (this.fetchInterval == other.fetchInterval) &&
269       (this.score == other.score) &&
270       (this.nextScore == other.nextScore);
271   }
272
273   public int hashCode() {
274     return
275       url.hashCode() ^
276       md5.hashCode() ^
277       ((int)nextFetch) ^
278       retries ^
279       fetchInterval ^
280       Float.floatToIntBits(score) ^
281       Float.floatToIntBits(nextScore);
282   }
283
284   public Object JavaDoc clone() {
285     try {
286       return super.clone();
287     } catch (CloneNotSupportedException JavaDoc e) {
288       throw new RuntimeException JavaDoc(e);
289     }
290   }
291
292 }
293
Popular Tags