1 2 3 4 package net.nutch.db; 5 6 import java.io.*; 7 import java.net.*; 8 import java.net.MalformedURLException ; 9 10 import net.nutch.io.*; 11 import net.nutch.util.*; 12 import net.nutch.net.UrlNormalizerFactory; 13 14 28 public class Link implements WritableComparable { 29 public static final int MAX_ANCHOR_LENGTH = 30 NutchConf.getInt("db.max.anchor.length", 100); 31 32 private final static byte VERSION_1 = 1; 33 private final static byte VERSION_2 = 2; 34 private final static byte CUR_VERSION = 5; 35 36 private MD5Hash fromID; 37 private UTF8 url; 38 private long domainID; 39 private UTF8 anchor; 40 private boolean targetHasOutlink; 41 42 45 public Link() { 46 this.fromID = new MD5Hash(); 47 this.url = new UTF8(); 48 this.domainID = 0; 49 this.anchor = new UTF8(); 50 this.targetHasOutlink = false; 51 } 52 53 56 public Link(MD5Hash fromID, long domainID, String urlString, String anchorText) 57 throws MalformedURLException { 58 this.fromID = fromID; 59 this.url = new UTF8(UrlNormalizerFactory.getNormalizer().normalize(urlString)); 60 this.domainID = domainID; 61 62 if (anchorText.length() > MAX_ANCHOR_LENGTH) 64 anchorText = anchorText.substring(0, MAX_ANCHOR_LENGTH); 65 66 this.anchor = new UTF8(anchorText); 67 this.targetHasOutlink = false; 68 } 69 70 73 public void readFields(DataInput in) throws IOException { 74 byte version = in.readByte(); 75 76 if (version > CUR_VERSION) 77 throw new VersionMismatchException(CUR_VERSION, version); 78 79 if (fromID == null) 80 fromID = new MD5Hash(); 81 fromID.readFields(in); 82 83 if (url == null) 84 url = new UTF8(); 85 url.readFields(in); 86 87 domainID = (version > 4) ? in.readLong() : 0; 89 90 if (anchor == null) 91 anchor = new UTF8(); 92 anchor.readFields(in); 93 94 targetHasOutlink = (version > 3) ? in.readBoolean() : false; 96 } 97 98 100 public void set(Link that) { 101 this.fromID.set(that.fromID); 102 this.url.set(that.url); 103 this.domainID = that.getDomainID(); 104 this.anchor.set(that.anchor); 105 this.targetHasOutlink = that.targetHasOutlink; 106 } 107 108 111 public void write(DataOutput out) throws IOException { 112 out.write(CUR_VERSION); 113 fromID.write(out); 114 url.write(out); 115 out.writeLong(domainID); 116 anchor.write(out); 117 out.writeBoolean(targetHasOutlink); 118 } 119 120 public static Link read(DataInput in) throws IOException { 121 Link lr = new Link(); 122 lr.readFields(in); 123 return lr; 124 } 125 126 public MD5Hash getFromID() { 130 return fromID; 131 } 132 public UTF8 getURL() { 133 return url; 134 } 135 public long getDomainID() { 136 return domainID; 137 } 138 public UTF8 getAnchorText() { 139 return anchor; 140 } 141 public boolean targetHasOutlink() { 142 return targetHasOutlink; 143 } 144 public void setTargetHasOutlink(boolean targetHasOutlink) { 145 this.targetHasOutlink = targetHasOutlink; 146 } 147 148 151 public String toString() { 152 StringBuffer buf = new StringBuffer (); 153 buf.append("Version: " + CUR_VERSION + "\n"); 154 buf.append("ID: " + getFromID() + "\n"); 155 buf.append("DomainID: " + getDomainID() + "\n"); 156 buf.append("URL: " + getURL() + "\n"); 157 buf.append("AnchorText: " + getAnchorText() + "\n"); 158 buf.append("targetHasOutlink: " + targetHasOutlink() + "\n"); 159 return buf.toString(); 160 } 161 162 165 public String toTabbedString() { 166 StringBuffer buf = new StringBuffer (); 167 buf.append("" + CUR_VERSION); buf.append("\t"); 168 buf.append(getFromID().toString()); buf.append("\t"); 169 buf.append(getDomainID()); buf.append("\t"); 170 buf.append(getURL()); buf.append("\t"); 171 buf.append(getAnchorText()); buf.append("\t"); 172 buf.append(targetHasOutlink()); buf.append("\t"); 173 174 return buf.toString(); 175 } 176 177 179 public int compareTo(Object o) { 180 return urlCompare(o); 181 } 182 183 186 public int urlCompare(Object o) { 187 int urlResult = this.url.compareTo(((Link) o).url); 188 if (urlResult != 0) { 189 return urlResult; 190 } 191 192 return this.fromID.compareTo(((Link) o).fromID); 193 } 194 195 198 public int md5Compare(Object o) { 199 int md5Result = this.fromID.compareTo(((Link) o).fromID); 200 if (md5Result != 0) { 201 return md5Result; 202 } 203 204 return this.url.compareTo(((Link) o).url); 205 } 206 207 211 public static class UrlComparator extends WritableComparator { 212 public UrlComparator() { 213 super(Link.class); 214 } 215 public int compare(WritableComparable a, WritableComparable b) { 216 return ((Link) a).urlCompare(b); 217 } 218 219 220 public int compare(byte[] b1, int s1, int l1, 221 byte[] b2, int s2, int l2) { 222 int md5Start1 = s1 + 1; int md5Start2 = s2 + 1; 224 int urlLenStart1 = md5Start1 + MD5Hash.MD5_LEN; 225 int urlLenStart2 = md5Start2 + MD5Hash.MD5_LEN; 226 int urlLen1 = readUnsignedShort(b1, urlLenStart1); 227 int urlLen2 = readUnsignedShort(b2, urlLenStart2); 228 int urlStart1 = urlLenStart1+2; 229 int urlStart2 = urlLenStart2+2; 230 int c = compareBytes(b1, urlStart1, urlLen1, b2, urlStart2, urlLen2); 232 if (c != 0) 233 return c; 234 return compareBytes(b1, md5Start1, MD5Hash.MD5_LEN, 236 b2, md5Start2, MD5Hash.MD5_LEN); 237 } 238 } 239 240 243 public static class MD5Comparator extends WritableComparator { 244 public MD5Comparator() { 245 super(Link.class); 246 } 247 public int compare(WritableComparable a, WritableComparable b) { 248 return ((Link) a).md5Compare(b); 249 } 250 251 252 public int compare(byte[] b1, int s1, int l1, 253 byte[] b2, int s2, int l2) { 254 int md5Start1 = s1 + 1; int md5Start2 = s2 + 1; 257 int c = compareBytes(b1, md5Start1, MD5Hash.MD5_LEN, 258 b2, md5Start2, MD5Hash.MD5_LEN); 259 if (c != 0) 260 return c; 261 262 int urlLenStart1 = md5Start1 + MD5Hash.MD5_LEN; 264 int urlLenStart2 = md5Start2 + MD5Hash.MD5_LEN; 265 int urlLen1 = readUnsignedShort(b1, urlLenStart1); 266 int urlLen2 = readUnsignedShort(b2, urlLenStart2); 267 int urlStart1 = urlLenStart1+2; 268 int urlStart2 = urlLenStart2+2; 269 return compareBytes(b1, urlStart1, urlLen1, b2, urlStart2, urlLen2); 270 } 271 } 272 } 273 274 275 | Popular Tags |