1 2 3 4 package net.nutch.db; 5 6 import java.io.*; 7 import java.net.*; 8 import java.util.*; 9 10 import net.nutch.io.*; 11 import net.nutch.util.*; 12 import net.nutch.net.UrlNormalizerFactory; 13 14 32 public class Page implements WritableComparable, Cloneable { 33 private final static byte CUR_VERSION = 4; 34 35 private static final byte DEFAULT_INTERVAL = 36 (byte)NutchConf.getInt("db.default.fetch.interval", 30); 37 38 private UTF8 url; 39 private MD5Hash md5; 40 private long nextFetch = System.currentTimeMillis(); 41 private byte retries; 42 private byte fetchInterval = DEFAULT_INTERVAL; 43 private int numOutlinks; 44 private float score = 1.0f; 45 private float nextScore = 1.0f; 46 47 49 public Page() { 50 url = new UTF8(); md5 = new MD5Hash(); } 53 54 55 public Page(String urlString, MD5Hash md5) throws MalformedURLException { 56 setURL(urlString); 57 this.md5 = md5; 58 } 59 60 public Page(String urlString, float score) 61 throws MalformedURLException { 62 this(urlString, score, score, System.currentTimeMillis()); 63 } 64 65 public Page(String urlString, float score, long nextFetch) 66 throws MalformedURLException { 67 this(urlString, score, score, nextFetch); 68 } 69 70 public Page(String urlString, float score, float nextScore, long nextFetch) 71 throws MalformedURLException { 72 setURL(urlString); 73 this.md5 = MD5Hash.digest(url); this.score = score; 75 this.nextScore = nextScore; 76 this.nextFetch = nextFetch; 77 } 78 79 public void readFields(DataInput in) throws IOException { 80 byte version = in.readByte(); if (version > CUR_VERSION) throw new VersionMismatchException(CUR_VERSION, version); 83 84 url.readFields(in); 85 md5.readFields(in); 86 nextFetch = in.readLong(); 87 retries = in.readByte(); 88 fetchInterval = in.readByte(); 89 numOutlinks = (version > 2) ? in.readInt() : 0; score = (version>1) ? in.readFloat() : 1.0f; nextScore = (version>3) ? in.readFloat() : 1.0f; } 93 94 95 public void set(Page that) { 96 this.url.set(that.url); 97 this.md5.set(that.md5); 98 this.nextFetch = that.nextFetch; 99 this.retries = that.retries; 100 this.fetchInterval = that.fetchInterval; 101 this.numOutlinks = that.numOutlinks; 102 this.score = that.score; 103 this.nextScore = that.nextScore; 104 } 105 106 109 public void write(DataOutput out) throws IOException { 110 out.writeByte(CUR_VERSION); url.write(out); 112 md5.write(out); 113 out.writeLong(nextFetch); 114 out.write(retries); 115 out.write(fetchInterval); 116 out.writeInt(numOutlinks); 117 out.writeFloat(score); 118 out.writeFloat(nextScore); 119 } 120 121 124 public int compareTo(Object o) { 125 int md5Result = this.md5.compareTo(((Page) o).md5); 126 if (md5Result != 0) { 127 return md5Result; 128 } 129 return this.url.compareTo(((Page) o).url); 130 } 131 132 133 134 public static class Comparator extends WritableComparator { 135 public Comparator() { super(Page.class); } 136 137 138 public int compare(byte[] b1, int s1, int l1, 139 byte[] b2, int s2, int l2) { 140 int urlLen1 = readUnsignedShort(b1, s1+1); int urlLen2 = readUnsignedShort(b2, s2+1); 142 int urlStart1 = s1+1+2; 143 int urlStart2 = s2+1+2; 144 int md5Start1 = urlStart1 + urlLen1; 145 int md5Start2 = urlStart2 + urlLen2; 146 int c = compareBytes(b1, md5Start1, MD5Hash.MD5_LEN, b2, md5Start2, MD5Hash.MD5_LEN); 148 if (c != 0) 149 return c; 150 return compareBytes(b1, urlStart1, urlLen1, b2, urlStart2, urlLen2); 151 } 152 } 153 154 155 public static class UrlComparator extends WritableComparator { 156 public UrlComparator() { super(Page.class); } 157 158 public int compare(WritableComparable a, WritableComparable b) { 159 Page pageA = (Page)a; 160 Page pageB = (Page)b; 161 162 return pageA.getURL().compareTo(pageB.getURL()); 163 } 164 165 166 167 public int compare(byte[] b1, int s1, int l1, 168 byte[] b2, int s2, int l2) { 169 int urlLen1 = readUnsignedShort(b1, s1+1); int urlLen2 = readUnsignedShort(b2, s2+1); 171 int urlStart1 = s1+1+2; 172 int urlStart2 = s2+1+2; 173 return compareBytes(b1, urlStart1, urlLen1, b2, urlStart2, urlLen2); 174 } 175 } 176 177 public static Page read(DataInput in) throws IOException { 178 Page page = new Page(); 179 page.readFields(in); 180 return page; 181 } 182 183 public UTF8 getURL() { return url; } 187 public void setURL(String url) throws MalformedURLException { 188 this.url = new UTF8(UrlNormalizerFactory.getNormalizer().normalize(url)); 189 } 190 191 public MD5Hash getMD5() { return md5; } 192 public void setMD5(MD5Hash md5) { this.md5 = md5; } 193 194 public long getNextFetchTime() { return nextFetch; } 195 public void setNextFetchTime(long nextFetch) { this.nextFetch = nextFetch; } 196 197 public byte getRetriesSinceFetch() { return retries; } 198 public void setRetriesSinceFetch(int retries) {this.retries = (byte)retries;} 199 200 public byte getFetchInterval() { return fetchInterval; } 201 public void setFetchInterval(byte fetchInterval) { 202 this.fetchInterval = fetchInterval; 203 } 204 205 public int getNumOutlinks() { return numOutlinks; } 206 public void setNumOutlinks(int numOutlinks) { 207 this.numOutlinks = numOutlinks; 208 } 209 210 public float getScore() { return score; } 211 public float getNextScore() { return nextScore; } 212 public void setScore(float score, float nextScore) { 213 this.score = score; 214 this.nextScore = nextScore; 215 } 216 217 220 public long computeDomainID() throws MalformedURLException { 221 return MD5Hash.digest(new URL(url.toString()).getHost()).halfDigest(); 222 } 223 224 225 228 public String toString() { 229 StringBuffer buf = new StringBuffer (); 230 buf.append("Version: " + CUR_VERSION + "\n"); 231 buf.append("URL: " + getURL() + "\n"); 232 buf.append("ID: " + getMD5() + "\n"); 233 buf.append("Next fetch: " + new Date(getNextFetchTime()) + "\n"); 234 buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n"); 235 buf.append("Retry interval: " + getFetchInterval() + " days\n"); 236 buf.append("Num outlinks: " + getNumOutlinks() + "\n"); 237 buf.append("Score: " + getScore() + "\n"); 238 buf.append("NextScore: " + getNextScore() + "\n"); 239 return buf.toString(); 240 } 241 242 245 public String toTabbedString() { 246 StringBuffer buf = new StringBuffer (); 247 buf.append(CUR_VERSION); buf.append("\t"); 248 buf.append(getURL()); buf.append("\t"); 249 buf.append(getMD5()); buf.append("\t"); 250 buf.append(getNextFetchTime()); buf.append("\t"); 251 buf.append(getRetriesSinceFetch()); buf.append("\t"); 252 buf.append(getFetchInterval()); buf.append("\t"); 253 buf.append(getNumOutlinks()); buf.append("\t"); 254 buf.append(getScore()); buf.append("\t"); 255 buf.append(getNextScore()); buf.append("\t"); 256 return buf.toString(); 257 } 258 259 public boolean equals(Object o) { 260 if (!(o instanceof Page)) 261 return false; 262 Page other = (Page)o; 263 return 264 this.url.equals(other.url) && 265 this.md5.equals(other.md5) && 266 (this.nextFetch == other.nextFetch) && 267 (this.retries == other.retries) && 268 (this.fetchInterval == other.fetchInterval) && 269 (this.score == other.score) && 270 (this.nextScore == other.nextScore); 271 } 272 273 public int hashCode() { 274 return 275 url.hashCode() ^ 276 md5.hashCode() ^ 277 ((int)nextFetch) ^ 278 retries ^ 279 fetchInterval ^ 280 Float.floatToIntBits(score) ^ 281 Float.floatToIntBits(nextScore); 282 } 283 284 public Object clone() { 285 try { 286 return super.clone(); 287 } catch (CloneNotSupportedException e) { 288 throw new RuntimeException (e); 289 } 290 } 291 292 } 293 | Popular Tags |