1 2 3 package net.nutch.db; 4 5 import java.io.*; 6 import java.util.*; 7 import java.nio.channels.*; 8 9 import net.nutch.io.*; 10 import net.nutch.fs.*; 11 import net.nutch.util.*; 12 import net.nutch.pagedb.*; 13 import net.nutch.linkdb.*; 14 15 22 public class WebDBReader implements IWebDBReader { 23 static final Page[] PAGE_RECORDS = new Page[0]; 24 static final Link[] LINK_RECORDS = new Link[0]; 25 26 static final String PAGES_BY_URL = "pagesByURL"; 28 static final String PAGES_BY_MD5 = "pagesByMD5"; 29 static final String LINKS_BY_URL = "linksByURL"; 30 static final String LINKS_BY_MD5 = "linksByMD5"; 31 static final String STATS_FILE = "stats"; 32 33 NutchFileSystem nfs; 34 File dbDir, dbFile; 35 MapFile.Reader pagesByURL, pagesByMD5, linksByURL, linksByMD5; 36 long totalPages = 0, totalLinks = 0; 37 Vector mapReaders = null, setReaders = null; 38 FileInputStream dbReadLockData; 39 FileLock dbReadLock; 40 41 44 public WebDBReader(NutchFileSystem nfs, File dbDir) throws IOException, FileNotFoundException { 45 this.nfs = nfs; 46 this.dbDir = dbDir; 47 this.dbFile = new File(dbDir, "webdb"); 48 49 nfs.lock(new File(dbDir, "dbreadlock"), true); 53 54 this.pagesByURL = new MapFile.Reader(nfs, new File(dbFile, PAGES_BY_URL).getPath(), new UTF8.Comparator()); 55 this.pagesByMD5 = new MapFile.Reader(nfs, new File(dbFile, PAGES_BY_MD5).getPath(), new Page.Comparator()); 56 57 this.linksByURL = new MapFile.Reader(nfs, new File(dbFile, LINKS_BY_URL).getPath(), new Link.UrlComparator()); 58 this.linksByMD5 = new MapFile.Reader(nfs, new File(dbFile, LINKS_BY_MD5).getPath(), new Link.MD5Comparator()); 59 60 File stats = new File(dbFile, STATS_FILE); 62 if (nfs.exists(stats)) { 63 DataInputStream in = new DataInputStream(nfs.open(stats)); 64 try { 65 int version = (byte) in.read(); 66 this.totalPages = in.readLong(); 67 this.totalLinks = in.readLong(); 68 } finally { 69 in.close(); 70 } 71 } 72 73 this.mapReaders = new Vector(); 79 this.setReaders = new Vector(); 80 } 81 82 85 public void close() throws IOException { 86 pagesByURL.close(); 87 pagesByMD5.close(); 88 linksByURL.close(); 89 linksByMD5.close(); 90 91 for (Enumeration e = mapReaders.elements(); e.hasMoreElements(); ) { 92 MapFile.Reader tmp = (MapFile.Reader) e.nextElement(); 93 tmp.close(); 94 } 95 for (Enumeration e = setReaders.elements(); e.hasMoreElements(); ) { 96 SetFile.Reader tmp = (SetFile.Reader) e.nextElement(); 97 tmp.close(); 98 } 99 100 nfs.release(new File(dbDir, "dbreadlock")); 102 } 103 104 107 public Page getPage(String url) throws IOException { 108 return (Page) pagesByURL.get(new UTF8(url), new Page()); 109 } 110 111 115 public Page[] getPages(MD5Hash md5) throws IOException { 116 Vector records = new Vector(3); 117 Page p = new Page(); 118 p.getMD5().set(md5); 119 120 pagesByMD5.seek(p); 121 while (pagesByMD5.next(p, NullWritable.get())) { 122 if (p.getMD5().compareTo(md5) == 0) { 123 records.add(p); 124 p = new Page(); 125 } else { 126 break; 127 } 128 } 129 130 return (Page[]) records.toArray(PAGE_RECORDS); 132 } 133 134 138 public boolean pageExists(MD5Hash md5) throws IOException { 139 Page p = new Page(); 140 p.getMD5().set(md5); 141 pagesByMD5.seek(p); 142 if (pagesByMD5.next(p, NullWritable.get()) && p.getMD5().compareTo(md5) == 0) { 143 return true; 144 } else { 145 return false; 146 } 147 } 148 149 152 public Enumeration pages() throws IOException { 153 MapFile.Reader tmpReader = new MapFile.Reader(nfs, new File(dbFile, "pagesByURL").getPath()); 154 mapReaders.add(tmpReader); 155 return new TableEnumerator(tmpReader); 156 } 157 158 class TableEnumerator implements Enumeration { 163 MapFile.Reader reader; 164 Page nextItem; 165 166 170 public TableEnumerator(MapFile.Reader reader) { 171 this.reader = reader; 172 this.nextItem = new Page(); 173 try { 174 if (! reader.next(new UTF8(), this.nextItem)) { 175 this.nextItem = null; 176 } 177 } catch (IOException ie) { 178 this.nextItem = null; 179 } 180 } 181 182 185 public boolean hasMoreElements() { 186 return (nextItem != null); 187 } 188 189 194 public Object nextElement() { 195 if (nextItem == null) { 196 throw new NoSuchElementException("PageDB Enumeration"); 197 } 198 Page toReturn = nextItem; 199 this.nextItem = new Page(); 200 try { 201 if (! reader.next(new UTF8(), nextItem)) { 202 this.nextItem = null; 203 } 204 } catch (IOException ie) { 205 this.nextItem = null; 206 } 207 return toReturn; 208 } 209 } 210 211 212 215 public Enumeration pagesByMD5() throws IOException { 216 SetFile.Reader tmpReader = new SetFile.Reader(nfs, new File(dbFile, "pagesByMD5").getPath()); 217 setReaders.add(tmpReader); 218 return new IndexEnumerator(tmpReader); 219 } 220 221 224 public long numPages() { 225 return totalPages; 226 } 227 228 class IndexEnumerator implements Enumeration { 233 SetFile.Reader reader; 234 Page nextItem; 235 236 240 public IndexEnumerator(SetFile.Reader reader) { 241 this.reader = reader; 242 this.nextItem = new Page(); 243 try { 244 if (! reader.next(nextItem)) { 245 this.nextItem = null; 246 } 247 } catch (IOException ie) { 248 this.nextItem = null; 249 } 250 } 251 252 255 public boolean hasMoreElements() { 256 return (nextItem != null); 257 } 258 259 264 public Object nextElement() { 265 if (nextItem == null) { 266 throw new NoSuchElementException("PageDB Enumeration"); 267 } 268 269 Page toReturn = nextItem; 270 this.nextItem = new Page(); 271 try { 272 if (! reader.next(nextItem)) { 273 this.nextItem = null; 274 } 275 } catch (IOException ie) { 276 this.nextItem = null; 277 } 278 return toReturn; 279 } 280 } 281 282 285 public Link[] getLinks(UTF8 url) throws IOException { 286 Vector records = new Vector(3); 287 Link l = new Link(); 288 l.getURL().set(url); 289 290 linksByURL.seek(l); 291 while (linksByURL.next(l, NullWritable.get())) { 292 if (url.equals(l.getURL())) { 293 records.add(l); 294 l = new Link(); 295 } else { 296 break; 297 } 298 } 299 300 return (Link[]) records.toArray(LINK_RECORDS); 302 } 303 304 307 public Link[] getLinks(MD5Hash md5) throws IOException { 308 Vector records = new Vector(3); 309 Link l = new Link(); 310 l.getFromID().set(md5); 311 312 linksByMD5.seek(l); 313 while (linksByMD5.next(l, NullWritable.get())) { 314 if (md5.equals(l.getFromID())) { 315 records.add(l); 316 l = new Link(); 317 } else { 318 break; 319 } 320 } 321 322 return (Link[]) records.toArray(LINK_RECORDS); 324 } 325 326 329 public Enumeration links() { 330 return new MapEnumerator(linksByURL); 331 } 332 333 336 public long numLinks() { 337 return totalLinks; 338 } 339 340 class MapEnumerator implements Enumeration { 344 MapFile.Reader reader; 345 Link nextItem; 346 347 351 public MapEnumerator(MapFile.Reader reader) { 352 this.reader = reader; 353 this.nextItem = new Link(); 354 try { 355 if (! reader.next(this.nextItem, NullWritable.get())) { 356 this.nextItem = null; 357 } 358 } catch (IOException ie) { 359 this.nextItem = null; 360 } 361 } 362 363 366 public boolean hasMoreElements() { 367 return (nextItem != null); 368 } 369 370 375 public Object nextElement() { 376 if (nextItem == null) { 377 throw new NoSuchElementException("PageDB Enumeration"); 378 } 379 380 Link toReturn = nextItem; 381 this.nextItem = new Link(); 382 try { 383 if (! reader.next(nextItem, NullWritable.get())) { 384 this.nextItem = null; 385 } 386 } catch (IOException ie) { 387 this.nextItem = null; 388 } 389 return toReturn; 390 } 391 } 392 393 397 public static void main(String argv[]) throws FileNotFoundException, IOException { 398 if (argv.length < 2) { 399 System.out.println("Usage: java net.nutch.db.WebDBReader (-local | -ndfs <namenode:port>) <db> [-pageurl url] | [-pagemd5 md5] | [-dumppageurl] | [-dumppagemd5] | [-toppages <k>] | [-linkurl url] | [-linkmd5 md5] | [-dumplinks] | [-stats]"); 400 return; 401 402 } 403 404 int i = 0; 405 NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i); 406 File dbDir = new File(argv[i++]); 407 WebDBReader reader = new WebDBReader(nfs, dbDir); 408 try { 409 String cmd = argv[i++]; 410 411 if ("-pageurl".equals(cmd)) { 412 String url = argv[i++]; 413 System.out.println(reader.getPage(url.trim())); 414 } else if ("-pagemd5".equals(cmd)) { 415 MD5Hash md5 = new MD5Hash(argv[i++]); 416 Page pages[] = reader.getPages(md5); 417 System.out.println("Found " + pages.length + " pages."); 418 for (int j = 0; j < pages.length; j++) { 419 System.out.println("Page " + j + ": " + pages[j]); 420 } 421 } else if ("-dumppageurl".equals(cmd)) { 422 System.out.println(reader); 423 System.out.println(); 424 int j = 1; 425 for (Enumeration e = reader.pages(); e.hasMoreElements(); j++) { 426 Page page = (Page) e.nextElement(); 427 System.out.println("Page " + j + ": " + page); 428 System.out.println(); 429 } 430 } else if ("-dumppagemd5".equals(cmd)) { 431 System.out.println(reader); 432 System.out.println(); 433 int j = 1; 434 for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); j++) { 435 Page page = (Page) e.nextElement(); 436 System.out.println("Page " + j + ": " + page); 437 System.out.println(); 438 } 439 } else if ("-toppages".equals(cmd)) { 440 int topSize = Integer.parseInt(argv[i++]); 441 442 SortedSet topSet = new TreeSet(new Comparator() { 444 public int compare(Object o1, Object o2) { 445 Page p1 = (Page) o1; 446 Page p2 = (Page) o2; 447 if (p1.getScore() < p2.getScore()) { 448 return -1; 449 } else if (p1.getScore() == p2.getScore()) { 450 return p1.compareTo(p2); 455 } else { 456 return 1; 457 } 458 } 459 } 460 ); 461 462 Page lowestPage = null; 464 for (Enumeration e = reader.pages(); e.hasMoreElements(); ) { 465 Page curPage = (Page) e.nextElement(); 466 if (topSet.size() < topSize) { 467 topSet.add(curPage); 468 lowestPage = (Page) topSet.first(); 469 } else if (lowestPage.getScore() < curPage.getScore()) { 470 topSet.remove(lowestPage); 471 topSet.add(curPage); 472 lowestPage = (Page) topSet.first(); 473 } 474 } 475 476 int j = 0; 478 for (Iterator it = topSet.iterator(); it.hasNext(); j++) { 479 System.out.println("Page " + j + ": " + (Page) it.next()); 480 System.out.println(); 481 } 482 } else if ("-linkurl".equals(cmd)) { 483 String url = argv[i++]; 484 Link links[] = reader.getLinks(new UTF8(url.trim())); 485 System.out.println("Found " + links.length + " links."); 486 for (int j = 0; j < links.length; j++) { 487 System.out.println("Link " + j + ": " + links[j]); 488 } 489 } else if ("-linkmd5".equals(cmd)) { 490 MD5Hash fromID = new MD5Hash(argv[i++]); 491 Link links[] = reader.getLinks(fromID); 492 System.out.println("Found " + links.length + " links."); 493 for (int j = 0; j < links.length; j++) { 494 System.out.println("Link " + j + ": " + links[j]); 495 } 496 } else if ("-dumplinks".equals(cmd)) { 497 System.out.println(reader); 498 System.out.println(); 499 Enumeration e = reader.pagesByMD5(); 500 while (e.hasMoreElements()) { 501 Page page = (Page) e.nextElement(); 502 Link[] links = reader.getLinks(page.getMD5()); 503 if (links.length > 0) { 504 System.out.println("from " + page.getURL()); 505 for (int j = 0; j < links.length; j++) { 506 System.out.println(" to " + links[j].getURL()); 507 } 508 System.out.println(); 509 } 510 } 511 } else if ("-stats".equals(cmd)) { 512 System.out.println("Stats for " + reader); 513 System.out.println("-------------------------------"); 514 System.out.println("Number of pages: " + reader.numPages()); 515 System.out.println("Number of links: " + reader.numLinks()); 516 } else { 517 System.out.println("Sorry, no command with name " + cmd); 518 } 519 } finally { 520 reader.close(); 521 nfs.close(); 522 } 523 } 524 } 525 | Popular Tags |