KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > db > Link


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.db;
5
import java.io.*;
import java.net.*;
import java.net.MalformedURLException;

import net.nutch.io.*;
import net.nutch.util.*;
import net.nutch.net.UrlNormalizerFactory;
13
14 /*********************************************
15  * This is the field in the Link Database.
16  * Each row is a Link:
17  * type name description
18  * ---------------------------------------------------------------
19  * byte VERSION - A byte indicating the version of this entry.
20  * 128bit FROM_ID - The MD5 hash of the source of the link.
21  * 64bit DOMAIN_ID - The 8-byte MD5Hash of the source's domain.
22  * string TO_URL - The URL destination of the link.
23  * string ANCHOR - The anchor text of the link.
24  * boolean TARGET_HAS_OUTLINK - Whether the target of the link has outlinks.
25  *
26  * @author Mike Cafarella
27  *************************************************/

28 public class Link implements WritableComparable {
29     public static final int MAX_ANCHOR_LENGTH =
30       NutchConf.getInt("db.max.anchor.length", 100);
31
32     private final static byte VERSION_1 = 1;
33     private final static byte VERSION_2 = 2;
34     private final static byte CUR_VERSION = 5;
35
36     private MD5Hash fromID;
37     private UTF8 url;
38     private long domainID;
39     private UTF8 anchor;
40     private boolean targetHasOutlink;
41
42     /**
43      * Create the Link with no data
44      */

45     public Link() {
46         this.fromID = new MD5Hash();
47         this.url = new UTF8();
48         this.domainID = 0;
49         this.anchor = new UTF8();
50         this.targetHasOutlink = false;
51     }
52
53     /**
54      * Create the record
55      */

56     public Link(MD5Hash fromID, long domainID, String JavaDoc urlString, String JavaDoc anchorText)
57       throws MalformedURLException JavaDoc {
58         this.fromID = fromID;
59         this.url = new UTF8(UrlNormalizerFactory.getNormalizer().normalize(urlString));
60         this.domainID = domainID;
61         
62         // truncate long anchors
63
if (anchorText.length() > MAX_ANCHOR_LENGTH)
64           anchorText = anchorText.substring(0, MAX_ANCHOR_LENGTH);
65
66         this.anchor = new UTF8(anchorText);
67         this.targetHasOutlink = false;
68     }
69
70     /**
71      * Read in fields from a bytestream
72      */

73     public void readFields(DataInput in) throws IOException {
74         byte version = in.readByte();
75         
76         if (version > CUR_VERSION)
77           throw new VersionMismatchException(CUR_VERSION, version);
78
79         if (fromID == null)
80           fromID = new MD5Hash();
81         fromID.readFields(in);
82
83         if (url == null)
84           url = new UTF8();
85         url.readFields(in);
86
87         // 'domainID' was addded in Version 4
88
domainID = (version > 4) ? in.readLong() : 0;
89         
90         if (anchor == null)
91           anchor = new UTF8();
92         anchor.readFields(in);
93
94         // 'targetHasOutlink' added in Version 3.
95
targetHasOutlink = (version > 3) ? in.readBoolean() : false;
96     }
97
98     /**
99      */

100     public void set(Link that) {
101         this.fromID.set(that.fromID);
102         this.url.set(that.url);
103         this.domainID = that.getDomainID();
104         this.anchor.set(that.anchor);
105         this.targetHasOutlink = that.targetHasOutlink;
106     }
107
108     /**
109      * Write bytes out to stream
110      */

111     public void write(DataOutput out) throws IOException {
112         out.write(CUR_VERSION);
113         fromID.write(out);
114         url.write(out);
115         out.writeLong(domainID);
116         anchor.write(out);
117         out.writeBoolean(targetHasOutlink);
118     }
119
120     public static Link read(DataInput in) throws IOException {
121         Link lr = new Link();
122         lr.readFields(in);
123         return lr;
124     }
125
126     //
127
// Accessors
128
//
129
public MD5Hash getFromID() {
130         return fromID;
131     }
132     public UTF8 getURL() {
133         return url;
134     }
135     public long getDomainID() {
136         return domainID;
137     }
138     public UTF8 getAnchorText() {
139         return anchor;
140     }
141     public boolean targetHasOutlink() {
142         return targetHasOutlink;
143     }
144     public void setTargetHasOutlink(boolean targetHasOutlink) {
145         this.targetHasOutlink = targetHasOutlink;
146     }
147
148     /**
149      * Print out the record
150      */

151     public String JavaDoc toString() {
152         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
153         buf.append("Version: " + CUR_VERSION + "\n");
154         buf.append("ID: " + getFromID() + "\n");
155         buf.append("DomainID: " + getDomainID() + "\n");
156         buf.append("URL: " + getURL() + "\n");
157         buf.append("AnchorText: " + getAnchorText() + "\n");
158         buf.append("targetHasOutlink: " + targetHasOutlink() + "\n");
159         return buf.toString();
160     }
161
162     /**
163      * Get a tab-delimited version of the text data.
164      */

165     public String JavaDoc toTabbedString() {
166         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
167         buf.append("" + CUR_VERSION); buf.append("\t");
168         buf.append(getFromID().toString()); buf.append("\t");
169         buf.append(getDomainID()); buf.append("\t");
170         buf.append(getURL()); buf.append("\t");
171         buf.append(getAnchorText()); buf.append("\t");
172         buf.append(targetHasOutlink()); buf.append("\t");
173
174         return buf.toString();
175     }
176
177     /**
178      */

179     public int compareTo(Object JavaDoc o) {
180         return urlCompare(o);
181     }
182
183     /**
184      * Compare URLs, then compare MD5s.
185      */

186     public int urlCompare(Object JavaDoc o) {
187         int urlResult = this.url.compareTo(((Link) o).url);
188         if (urlResult != 0) {
189             return urlResult;
190         }
191
192         return this.fromID.compareTo(((Link) o).fromID);
193     }
194
195     /**
196      * Compare MD5s, then compare URLs.
197      */

198     public int md5Compare(Object JavaDoc o) {
199         int md5Result = this.fromID.compareTo(((Link) o).fromID);
200         if (md5Result != 0) {
201             return md5Result;
202         }
203
204         return this.url.compareTo(((Link) o).url);
205     }
206
207     /**
208      * URLComparator uses the standard method where, uh,
209      * the URL comes first.
210      */

211     public static class UrlComparator extends WritableComparator {
212         public UrlComparator() {
213             super(Link.class);
214         }
215         public int compare(WritableComparable a, WritableComparable b) {
216             return ((Link) a).urlCompare(b);
217         }
218
219         /** Optimized comparator. */
220         public int compare(byte[] b1, int s1, int l1,
221                            byte[] b2, int s2, int l2) {
222           int md5Start1 = s1 + 1; // skip version
223
int md5Start2 = s2 + 1;
224           int urlLenStart1 = md5Start1 + MD5Hash.MD5_LEN;
225           int urlLenStart2 = md5Start2 + MD5Hash.MD5_LEN;
226           int urlLen1 = readUnsignedShort(b1, urlLenStart1);
227           int urlLen2 = readUnsignedShort(b2, urlLenStart2);
228           int urlStart1 = urlLenStart1+2;
229           int urlStart2 = urlLenStart2+2;
230           // compare urls
231
int c = compareBytes(b1, urlStart1, urlLen1, b2, urlStart2, urlLen2);
232           if (c != 0)
233             return c;
234           // compare md5s
235
return compareBytes(b1, md5Start1, MD5Hash.MD5_LEN,
236                               b2, md5Start2, MD5Hash.MD5_LEN);
237         }
238     }
239
240     /**
241      * MD5Comparator is the opposite.
242      */

243     public static class MD5Comparator extends WritableComparator {
244         public MD5Comparator() {
245             super(Link.class);
246         }
247         public int compare(WritableComparable a, WritableComparable b) {
248             return ((Link) a).md5Compare(b);
249         }
250       
251         /** Optimized comparator. */
252         public int compare(byte[] b1, int s1, int l1,
253                            byte[] b2, int s2, int l2) {
254           // compare md5s
255
int md5Start1 = s1 + 1; // skip version
256
int md5Start2 = s2 + 1;
257           int c = compareBytes(b1, md5Start1, MD5Hash.MD5_LEN,
258                                b2, md5Start2, MD5Hash.MD5_LEN);
259           if (c != 0)
260             return c;
261
262           // compare urls
263
int urlLenStart1 = md5Start1 + MD5Hash.MD5_LEN;
264           int urlLenStart2 = md5Start2 + MD5Hash.MD5_LEN;
265           int urlLen1 = readUnsignedShort(b1, urlLenStart1);
266           int urlLen2 = readUnsignedShort(b2, urlLenStart2);
267           int urlStart1 = urlLenStart1+2;
268           int urlStart2 = urlLenStart2+2;
269           return compareBytes(b1, urlStart1, urlLen1, b2, urlStart2, urlLen2);
270         }
271     }
272 }
273
274
275
Popular Tags