1 2 3 package net.nutch.db; 4 5 import java.io.*; 6 import java.util.*; 7 8 import net.nutch.io.*; 9 import net.nutch.fs.*; 10 import net.nutch.util.*; 11 import net.nutch.pagedb.*; 12 import net.nutch.linkdb.*; 13 14 21 public class DistributedWebDBReader implements IWebDBReader { 22 static final Page[] PAGE_RECORDS = new Page[0]; 23 static final Link[] LINK_RECORDS = new Link[0]; 24 25 static final String PAGES_BY_URL = "pagesByURL"; 27 static final String PAGES_BY_MD5 = "pagesByMD5"; 28 static final String LINKS_BY_URL = "linksByURL"; 29 static final String LINKS_BY_MD5 = "linksByMD5"; 30 31 static final String STATS_FILE = "stats"; 32 static final String META_FILE = "metainfo"; 33 34 static final EnumCall PAGE_ENUMS = new PageEnumCall(); 36 static final EnumCall PAGE_MD5_ENUMS = new PageByMD5EnumCall(); 37 static final EnumCall LINK_ENUMS = new LinkEnumCall(); 38 39 static final DBSectionReader[] STATIC_SR_ARRAY = new DBSectionReader[0]; 41 42 File root, dbDir; 44 File globalWriteLock; 45 DBSectionReader pagesByURL[], pagesByMD5[], linksByURL[], linksByMD5[]; 46 long totalPages = 0, totalLinks = 0; 47 int numMachines = 0; 48 49 52 public DistributedWebDBReader(NutchFileSystem nfs, File root) throws IOException, FileNotFoundException { 53 this.root = root; 58 this.dbDir = new File(new File(root, "standard"), "webdb"); 59 60 File dirIsComplete = new File(dbDir, "dbIsComplete"); 65 while (! nfs.exists(dirIsComplete)) { 66 try { 67 Thread.sleep(2000); 68 } catch (InterruptedException ie) { 69 } 70 } 71 72 77 82 83 File machineInfo = new File(new File(root, "standard"), "machineinfo"); 87 DataInputStream in = new DataInputStream(nfs.open(machineInfo)); 88 try { 89 in.readByte(); this.numMachines = in.readInt(); 91 } finally { 92 in.close(); 93 } 94 95 Vector pagesByURL = new Vector(), pagesByMD5 = new Vector(), linksByMD5 = new Vector(), linksByURL = new Vector(); 101 for (int i = 0; i < numMachines; i++) { 102 File sectionDir = new File(dbDir, "dbsection." + i); 104 File pagesByURLFile = new File(sectionDir, PAGES_BY_URL); 105 File pagesByMD5File = new File(sectionDir, PAGES_BY_MD5); 106 File linksByURLFile = new File(sectionDir, LINKS_BY_URL); 107 File linksByMD5File = new File(sectionDir, LINKS_BY_MD5); 108 109 pagesByURL.add(new DBSectionReader(nfs, pagesByURLFile, new UTF8.Comparator())); 111 pagesByMD5.add(new DBSectionReader(nfs, pagesByMD5File, new Page.Comparator())); 112 linksByURL.add(new DBSectionReader(nfs, linksByURLFile, new Link.UrlComparator())); 113 linksByMD5.add(new DBSectionReader(nfs, linksByMD5File, new Link.MD5Comparator())); 114 115 File sectionStats = new File(sectionDir, STATS_FILE); 117 in = new DataInputStream(nfs.open(sectionStats)); 118 try { 119 in.read(); this.totalPages += in.readLong(); 121 this.totalLinks += in.readLong(); 122 } finally { 123 in.close(); 124 } 125 } 126 127 this.pagesByURL = (DBSectionReader[]) pagesByURL.toArray(STATIC_SR_ARRAY); 129 this.pagesByMD5 = (DBSectionReader[]) pagesByMD5.toArray(STATIC_SR_ARRAY); 130 this.linksByURL = (DBSectionReader[]) linksByURL.toArray(STATIC_SR_ARRAY); 131 this.linksByMD5 = (DBSectionReader[]) linksByMD5.toArray(STATIC_SR_ARRAY); 132 } 133 134 137 public void close() throws IOException { 138 for (int i = 0; i < pagesByURL.length; i++) { 139 pagesByURL[i].close(); 140 pagesByMD5[i].close(); 141 linksByURL[i].close(); 142 linksByMD5[i].close(); 143 } 144 } 145 146 149 public int numMachines() { 150 return numMachines; 151 } 152 153 156 public long numPages() { 157 return totalPages; 158 } 159 160 163 public long numLinks() { 164 return totalLinks; 165 } 166 167 170 public Page getPage(String url) throws IOException { 171 Page result = null, target = new Page(); 172 UTF8 searchURL = new UTF8(url); 173 174 return pagesByURL[DBKeyDivision.findURLSection(url, numMachines)].getPage(searchURL, target); 177 } 178 179 185 public Page[] getPages(MD5Hash md5) throws IOException { 186 Vector resultSet = pagesByMD5[DBKeyDivision.findMD5Section(md5, numMachines)].getPages(md5); 187 Page resultArray[] = new Page[resultSet.size()]; 188 int i = 0; 189 for (Enumeration e = resultSet.elements(); e.hasMoreElements(); i++) { 190 resultArray[i] = (Page) e.nextElement(); 191 } 192 return resultArray; 193 } 194 195 201 public boolean pageExists(MD5Hash md5) throws IOException { 202 return pagesByMD5[DBKeyDivision.findMD5Section(md5, numMachines)].pageExists(md5); 203 } 204 205 210 public Enumeration pages() throws IOException { 211 return new MetaEnumerator(pagesByURL, PAGE_ENUMS); 212 } 213 214 219 public Enumeration pagesByMD5() throws IOException { 220 return new MetaEnumerator(pagesByMD5, PAGE_MD5_ENUMS); 221 } 222 223 226 public Link[] getLinks(UTF8 url) throws IOException { 227 Vector resultSet = linksByURL[DBKeyDivision.findURLSection(url.toString(), numMachines)].getLinks(url); 228 Link resultArray[] = new Link[resultSet.size()]; 229 int i = 0; 230 for (Enumeration e = resultSet.elements(); e.hasMoreElements(); ) { 231 resultArray[i++] = (Link) e.nextElement(); 232 } 233 return resultArray; 234 } 235 236 239 public Link[] getLinks(MD5Hash md5) throws IOException { 240 Vector resultSet = linksByMD5[DBKeyDivision.findMD5Section(md5, numMachines)].getLinks(md5); 241 Link resultArray[] = new Link[resultSet.size()]; 242 int i = 0; 243 for (Enumeration e = resultSet.elements(); e.hasMoreElements(); ) { 244 resultArray[i++] = (Link) e.nextElement(); 245 } 246 return resultArray; 247 } 248 249 252 public Enumeration links() throws IOException { 253 return new MetaEnumerator(linksByURL, LINK_ENUMS); 254 } 255 256 static abstract class EnumCall { 262 264 public EnumCall() { 265 } 266 267 271 public abstract Enumeration getEnumeration(DBSectionReader reader) throws IOException; 272 } 273 274 static class PageEnumCall extends EnumCall { 278 280 public PageEnumCall() { 281 } 282 283 286 public Enumeration getEnumeration(DBSectionReader reader) throws IOException { 287 return reader.pages(); 288 } 289 } 290 291 static class PageByMD5EnumCall extends EnumCall { 295 297 public PageByMD5EnumCall() { 298 } 299 300 303 public Enumeration getEnumeration(DBSectionReader reader) throws IOException { 304 return reader.pagesByMD5(); 305 } 306 } 307 308 static class LinkEnumCall extends EnumCall { 312 314 public LinkEnumCall() { 315 } 316 317 320 public Enumeration getEnumeration(DBSectionReader reader) throws IOException { 321 return reader.links(); 322 } 323 } 324 325 class MetaEnumerator implements Enumeration { 330 Enumeration enumerations[]; 331 int curEnum = 0; 332 333 336 public MetaEnumerator(DBSectionReader sections[], EnumCall enumCall) throws IOException { 337 this.enumerations = new Enumeration[sections.length]; 338 339 for (int i = 0; i < enumerations.length; i++) { 340 enumerations[i] = enumCall.getEnumeration(sections[i]); 341 } 342 } 343 344 349 public boolean hasMoreElements() { 350 boolean result = false; 351 352 for (; curEnum < enumerations.length; curEnum++) { 358 result = enumerations[curEnum].hasMoreElements(); 359 360 if (result) { 361 break; 362 } 363 } 364 return result; 365 } 366 367 371 public Object nextElement() { 372 Object obj = null; 373 374 for (; curEnum < enumerations.length; curEnum++) { 379 if (enumerations[curEnum].hasMoreElements()) { 380 obj = enumerations[curEnum].nextElement(); 381 382 if (obj != null) { 383 break; 384 } 385 } 386 } 387 return obj; 388 } 389 } 390 391 397 public static void main(String argv[]) throws FileNotFoundException, IOException { 398 if (argv.length < 2) { 399 System.out.println("Usage: java net.nutch.db.DistributedWebDBReader (-local | -ndfs <namenode:port>) <root> [-pageurl url] | [-pagemd5 md5] | [-dumppageurl] | [-dumppagemd5] | [-toppages <k>] | [-linkurl url] | [-linkmd5 md5] | [-dumplinks] | [-stats]"); 400 return; 401 } 402 403 int i = 0; 404 NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i); 405 File root = new File(argv[i++]); 406 DistributedWebDBReader reader = new DistributedWebDBReader(nfs, root); 407 try { 408 String cmd = argv[i++]; 409 410 if ("-pageurl".equals(cmd)) { 411 String url = argv[i++]; 412 System.out.println(reader.getPage(url.trim())); 413 } else if ("-pagemd5".equals(cmd)) { 414 MD5Hash md5 = new MD5Hash(argv[i++]); 415 Page pages[] = reader.getPages(md5); 416 System.out.println("Found " + pages.length + " pages."); 417 for (int j = 0; j < pages.length; j++) { 418 System.out.println("Page " + j + ": " + pages[j]); 419 } 420 } else if ("-dumppageurl".equals(cmd)) { 421 int j = 1; 422 for (Enumeration e = reader.pages(); e.hasMoreElements(); j++) { 423 Page page = (Page) e.nextElement(); 424 System.out.println("Page " + j + ": " + page); 425 System.out.println(); 426 } 427 } else if ("-dumppagemd5".equals(cmd)) { 428 int j = 1; 429 for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); j++) { 430 Page page = (Page) e.nextElement(); 431 System.out.println("Page " + j + ": " + page); 432 System.out.println(); 433 } 434 } else if ("-toppages".equals(cmd)) { 435 int topSize = Integer.parseInt(argv[i++]); 436 437 SortedSet topSet = new TreeSet(new Comparator() { 439 public int compare(Object o1, Object o2) { 440 Page p1 = (Page) o1; 441 Page p2 = (Page) o2; 442 if (p1.getScore() < p2.getScore()) { 443 return -1; 444 } else if (p1.getScore() == p2.getScore()) { 445 return p1.compareTo(p2); 450 } else { 451 return 1; 452 } 453 } 454 } 455 ); 456 457 Page lowestPage = null; 459 for (Enumeration e = reader.pages(); e.hasMoreElements(); ) { 460 Page curPage = (Page) e.nextElement(); 461 if (topSet.size() < topSize) { 462 topSet.add(curPage); 463 lowestPage = (Page) topSet.first(); 464 } else if (lowestPage.getScore() < curPage.getScore()) { 465 topSet.remove(lowestPage); 466 topSet.add(curPage); 467 lowestPage = (Page) topSet.first(); 468 } 469 } 470 471 int j = 0; 473 for (Iterator it = topSet.iterator(); it.hasNext(); j++) { 474 System.out.println("Page " + j + ": " + (Page) it.next()); 475 System.out.println(); 476 } 477 } else if ("-linkurl".equals(cmd)) { 478 String url = argv[i++]; 479 Link links[] = reader.getLinks(new UTF8(url.trim())); 480 System.out.println("Found " + links.length + " links."); 481 for (int j = 0; j < links.length; j++) { 482 System.out.println("Link " + j + ": " + links[j]); 483 } 484 } else if ("-linkmd5".equals(cmd)) { 485 MD5Hash fromID = new MD5Hash(argv[i++]); 486 Link links[] = reader.getLinks(fromID); 487 System.out.println("Found " + links.length + " links."); 488 for (int j = 0; j < links.length; j++) { 489 System.out.println("Link " + j + ": " + links[j]); 490 } 491 } else if ("-dumplinks".equals(cmd)) { 492 int j = 1; 493 for (Enumeration e = reader.links(); e.hasMoreElements(); j++) { 494 Link link = (Link) e.nextElement(); 495 System.out.println("Link " + j + ": " + link); 496 System.out.println(); 497 } 498 } else if ("-stats".equals(cmd)) { 499 System.out.println("Stats for " + reader); 500 System.out.println("-------------------------------"); 501 System.out.println("Number of pages: " + reader.numPages()); 502 System.out.println("Number of links: " + reader.numLinks()); 503 System.out.println("Number of machines (sections): " + reader.numMachines()); 504 } else { 505 System.out.println("Sorry, no command with name " + cmd); 506 } 507 } finally { 508 reader.close(); 509 } 510 } 511 } 512
| Popular Tags
|