package net.nutch.db;

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.logging.*;
import java.net.MalformedURLException;
import java.util.regex.*;

import javax.xml.parsers.*;
import org.xml.sax.*;
import org.xml.sax.helpers.*;
import org.apache.xerces.util.XMLChar;

import net.nutch.io.*;
import net.nutch.fs.*;
import net.nutch.net.*;
import net.nutch.util.*;
import net.nutch.pagedb.*;
import net.nutch.linkdb.*;
import net.nutch.util.NutchConf;

/**
 * Adds pages to a web database through an {@link IWebDBWriter}.
 *
 * Pages come from one of two sources: a plain text file with one URL per
 * line ({@link #injectURLFile}), or an XML file whose <code>Topic</code> and
 * <code>ExternalPage</code> elements are read with a SAX parser
 * ({@link #injectDmozFile}).  Every URL is passed through the configured
 * URL filter before it is written.
 */
public class WebDBInjector {
  /** URL of the placeholder page used as the source of all description links. */
  private static final String DMOZ_PAGENAME = "http://www.dmoz.org/";

  /**
   * Value of the "db.default.fetch.interval" setting (default 30), narrowed
   * to a byte.  Not referenced anywhere else in this class.
   */
  private static final byte DEFAULT_INTERVAL =
    (byte)NutchConf.getInt("db.default.fetch.interval", 30);

  /** Score given to every injected page ("db.score.injected", default 2.0). */
  private static final float NEW_INJECTED_PAGE_SCORE =
    NutchConf.getFloat("db.score.injected", 2.0f);

  public static final Logger LOG = LogFormatter.getLogger("net.nutch.db.WebDBInjector");

  /**
   * Reader wrapper that replaces characters the XML parser would reject.
   *
   * Any character for which {@link XMLChar#isValid} is false becomes 'X'.
   * In addition, a '<' that directly follows U+FFFD and is not the start of
   * a closing tag ("</") is also replaced by 'X'.
   */
  private static class XMLCharFilter extends FilterReader {
    /** Code point U+FFFD; a '<' right after it is treated as suspect. */
    private static final int REPLACEMENT_CHAR = 65533;

    /** True when the previously read character was U+FFFD. */
    private boolean lastBad = false;

    public XMLCharFilter(Reader reader) {
      super(reader);
    }

    /**
     * Reads one character, substituting 'X' as described on the class.
     * Uses mark/reset on the wrapped reader to peek at the character after
     * a suspect '<', so the wrapped reader must support marking (the only
     * caller in this file wraps a BufferedReader).
     */
    public int read() throws IOException {
      int c = in.read();
      int value = c;
      if (c != -1 && !(XMLChar.isValid(c))) {
        // invalid XML character
        value = 'X';
      } else if (lastBad && c == '<') {
        // '<' after U+FFFD: keep it only if it opens a closing tag
        in.mark(1);
        if (in.read() != '/') {
          value = 'X';
        }
        in.reset();
      }
      lastBad = (c == REPLACEMENT_CHAR);

      return value;
    }

    /**
     * Bulk variant of {@link #read()}; filters the characters in place.
     * A suspect '<' that is the last character of the chunk is left alone
     * because the following character is not available for inspection.
     */
    public int read(char[] cbuf, int off, int len)
      throws IOException {
      int n = in.read(cbuf, off, len);
      if (n != -1) {
        for (int i = 0; i < n; i++) {
          char c = cbuf[off + i];
          char value = c;
          if (!(XMLChar.isValid(c))) {
            value = 'X';
          } else if (lastBad && c == '<') {
            if (i != n - 1 && cbuf[off + i + 1] != '/') {
              value = 'X';
            }
          }
          lastBad = (c == REPLACEMENT_CHAR);
          cbuf[off + i] = value;
        }
      }
      return n;
    }
  }

  /**
   * SAX handler that turns <code>ExternalPage</code> elements into pages.
   *
   * It remembers the most recent <code>Topic</code> id as the current
   * section, filters pages by section and by a hash-based subset test, and
   * collects the <code>d:Title</code> / <code>d:Description</code> text of
   * each accepted page.  It is an inner class because it writes through the
   * enclosing injector's writer and updates its page counter.
   */
  class RDFProcessor extends DefaultHandler {
    String curURL = null, curSection = null;
    boolean titlePending = false, descPending = false, insideAdultSection = false;
    Pattern topicPattern = null;
    StringBuffer title = new StringBuffer(), desc = new StringBuffer();
    XMLReader reader;
    int subsetDenom;
    int hashSkew;
    boolean includeAdult, includeDmozDesc;
    MD5Hash srcDmozID;
    long srcDmozDomainID;
    Locator location;

    /**
     * Creates the handler and writes the placeholder source page.
     *
     * @param reader          the XML reader this handler is attached to (only stored)
     * @param subsetDenom     a page is kept when its skewed hash is divisible by this
     * @param includeAdult    when false, pages in sections starting with "Top/Adult" are skipped
     * @param includeDmozDesc when true, a link carrying title and description is added per page
     * @param skew            value XOR-ed into each hash; 0 selects a random skew
     * @param topicPattern    if non-null, only sections matching it are kept
     * @throws IOException if the placeholder page cannot be created or written
     */
    public RDFProcessor(XMLReader reader, int subsetDenom, boolean includeAdult, boolean includeDmozDesc, int skew, Pattern topicPattern) throws IOException {
      this.reader = reader;
      this.subsetDenom = subsetDenom;
      this.includeAdult = includeAdult;
      this.includeDmozDesc = includeDmozDesc;
      this.topicPattern = topicPattern;

      // The placeholder page is the source of every description link.  Its
      // id includes nextFetch, and its next fetch time is set to
      // Long.MAX_VALUE.
      this.srcDmozID = MD5Hash.digest(DMOZ_PAGENAME + "_" + nextFetch);
      Page dmozPage = new Page(DMOZ_PAGENAME, srcDmozID);
      dmozPage.setNextFetchTime(Long.MAX_VALUE);
      dbWriter.addPageIfNotPresent(dmozPage);

      this.srcDmozDomainID = MD5Hash.digest(new URL(DMOZ_PAGENAME).getHost()).halfDigest();

      this.hashSkew = skew != 0 ? skew : new Random().nextInt();
    }

    /**
     * Tracks the current section and decides whether an
     * <code>ExternalPage</code> is accepted.  An accepted page sets
     * <code>curURL</code>, which enables title/description capture.
     */
    public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
      if ("Topic".equals(qName)) {
        curSection = atts.getValue("r:id");
      } else if ("ExternalPage".equals(qName)) {
        // A page seen before any Topic (or after a Topic without r:id) has
        // no known section.  It cannot pass a section-based filter, so it
        // is skipped instead of failing on a null section.
        if (!includeAdult
            && (curSection == null || curSection.startsWith("Top/Adult"))) {
          return;
        }

        if (topicPattern != null
            && (curSection == null || !topicPattern.matcher(curSection).matches())) {
          return;
        }

        String url = atts.getValue("about");
        if (url == null) {
          // no URL to inject
          return;
        }

        // Subset selection: keep the page only when its skewed hash is a
        // multiple of subsetDenom.
        int hashValue = MD5Hash.digest(url).hashCode();
        hashValue = Math.abs(hashValue ^ hashSkew);
        if ((hashValue % subsetDenom) != 0) {
          return;
        }

        curURL = url;
      } else if (curURL != null && "d:Title".equals(qName)) {
        titlePending = true;
      } else if (curURL != null && "d:Description".equals(qName)) {
        descPending = true;
      }
    }

    /** Appends text to the title or description buffer while one is open. */
    public void characters(char ch[], int start, int length) {
      if (titlePending) {
        title.append(ch, start, length);
      } else if (descPending) {
        desc.append(ch, start, length);
      }
    }

    /**
     * On the end of an accepted <code>ExternalPage</code>, adds the page
     * (and optionally its description link), reports progress and resets
     * the per-page state.  Also closes title/description capture.
     */
    public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
      if (curURL == null) {
        return;
      }

      if ("ExternalPage".equals(qName)) {
        try {
          if (addPage(curURL)) {
            if (includeDmozDesc) {
              String fullDesc = title + " " + desc;
              Link descLink = new Link(srcDmozID, srcDmozDomainID, curURL, fullDesc);
              dbWriter.addLink(descLink);
            }
            pages++;
          }
        } catch (MalformedURLException e) {
          LOG.fine("skipping " + curURL + ":" + e);
        } catch (IOException ie) {
          LOG.severe("problem adding url " + curURL + ": " + ie);
        }
        printStatusBar(2000, 50000);

        // reset per-page state
        if (title.length() > 0) {
          title.delete(0, title.length());
        }
        if (desc.length() > 0) {
          desc.delete(0, desc.length());
        }
        curURL = null;
      } else if ("d:Title".equals(qName)) {
        titlePending = false;
      } else if ("d:Description".equals(qName)) {
        descPending = false;
      }
    }

    /** Logs the start of parsing. */
    public void startDocument() {
      LOG.info("Begin parse");
    }

    /** Logs the end of parsing together with the number of pages added. */
    public void endDocument() {
      LOG.info("Completed parse. Added " + pages + " pages.");
    }

    /** Stores the locator so that fatal errors can report a position. */
    public void setDocumentLocator(Locator locator) {
      location = locator;
    }

    /** Logs a recoverable parse error and prints its stack trace to stdout. */
    public void error(SAXParseException spe) {
      LOG.severe("Error: " + spe.toString() + ": " + spe.getMessage());
      spe.printStackTrace(System.out);
    }

    /**
     * Logs a fatal parse error, the last known position if a locator was
     * supplied, and prints the stack trace to stdout.
     */
    public void fatalError(SAXParseException spe) {
      LOG.severe("Fatal error: " + spe.toString() + ": " + spe.getMessage());
      // The locator is only set if the parser called setDocumentLocator.
      if (location != null) {
        LOG.severe("Last known line is " + location.getLineNumber() + ", column " + location.getColumnNumber());
      }
      spe.printStackTrace(System.out);
    }

    /** Logs a parse warning and prints its stack trace to stdout. */
    public void warning(SAXParseException spe) {
      LOG.warning("Warning: " + spe.toString() + ": " + spe.getMessage());
      spe.printStackTrace(System.out);
    }
  }

  private IWebDBWriter dbWriter;

  /**
   * Creates an injector that writes through the given writer.
   *
   * @param dbWriter destination for pages and links
   */
  public WebDBInjector(IWebDBWriter dbWriter) {
    this.dbWriter = dbWriter;
  }

  /**
   * Closes the underlying writer.
   *
   * @throws IOException if the writer fails to close
   */
  public void close() throws IOException {
    dbWriter.close();
  }

  /**
   * Prints progress: a "." to stdout when the page count is a multiple of
   * <code>small</code>, and a throughput line when it is a multiple of
   * <code>big</code>.
   */
  public void printStatusBar(int small, int big) {
    if ((pages % small) == 0) {
      System.out.print(".");
    }
    if ((pages % big) == 0) {
      printStatus();
    }
  }

  long startTime = System.currentTimeMillis();
  long pages = 0;
  long nextFetch = System.currentTimeMillis();

  /**
   * Logs the number of pages added so far and the rate in pages per second.
   * Logs nothing while no page has been added.
   */
  public void printStatus() {
    if (this.pages == 0) {
      return;
    }
    // Clamp to 1 ms: the division below is integer division and would
    // throw ArithmeticException for an elapsed time of zero.
    long elapsed = Math.max(1L, System.currentTimeMillis() - this.startTime);
    LOG.info("\t" + this.pages + "\t" +
             (int)((1000 * pages) / elapsed) + " pages/second\t");
  }

  /**
   * Injects every line of a text file as a URL.  Each line is trimmed and
   * handed to the URL filter; accepted URLs are added to the database.
   * The file's modification time becomes the fetch time of the new pages.
   *
   * Exceptions raised while reading or adding are logged and not rethrown;
   * only a failure to open or to close the file propagates.
   *
   * @param urlList file containing one URL per line
   * @throws IOException if the file cannot be opened or closed
   */
  public void injectURLFile(File urlList) throws IOException {
    nextFetch = urlList.lastModified();
    BufferedReader reader = new BufferedReader(new FileReader(urlList));
    try {
      String curStr = null;
      LOG.info("Starting URL processing");
      while ((curStr = reader.readLine()) != null) {
        String url = curStr.trim();
        if (addPage(url)) {
          this.pages++;
        }
        printStatusBar(2000, 50000);
      }
      LOG.info("Added " + pages + " pages");
    } catch (Exception e) {
      LOG.severe("error while injecting:" + e);
      e.printStackTrace();
    } finally {
      reader.close();
    }
  }

  /**
   * Parses an XML file with {@link RDFProcessor} and injects the accepted
   * pages.  The file is read as UTF-8 through {@link XMLCharFilter}, and its
   * modification time becomes the fetch time of the new pages.
   *
   * @param dmozFile        the XML file to parse
   * @param subsetDenom     subset denominator; must not be zero
   * @param includeAdult    whether pages under "Top/Adult" are kept
   * @param includeDmozDesc whether description links are added
   * @param skew            hash skew; 0 selects a random one
   * @param topicPattern    optional section filter, may be null
   * @throws IllegalArgumentException if <code>subsetDenom</code> is zero
   */
  public void injectDmozFile(File dmozFile, int subsetDenom, boolean includeAdult, boolean includeDmozDesc, int skew, Pattern topicPattern) throws IOException, SAXException, ParserConfigurationException {
    // A zero denominator would otherwise surface as a "% 0"
    // ArithmeticException inside the SAX callback, after the placeholder
    // page has already been written.
    if (subsetDenom == 0) {
      throw new IllegalArgumentException("subsetDenom must not be zero");
    }

    nextFetch = dmozFile.lastModified();

    SAXParserFactory parserFactory = SAXParserFactory.newInstance();
    SAXParser parser = parserFactory.newSAXParser();
    XMLReader reader = parser.getXMLReader();

    RDFProcessor rp =
      new RDFProcessor(reader, subsetDenom, includeAdult, includeDmozDesc, skew, topicPattern);
    reader.setContentHandler(rp);
    reader.setErrorHandler(rp);
    LOG.info("skew = " + rp.hashSkew);

    // The BufferedReader provides the mark/reset support that
    // XMLCharFilter.read() relies on.
    XMLCharFilter in = new XMLCharFilter(new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(dmozFile)), "UTF-8")));
    try {
      InputSource is = new InputSource(in);
      reader.parse(is);
    } catch (Exception e) {
      LOG.severe(e.toString());
      e.printStackTrace(System.out);
      // NOTE(review): exits with status 0 on failure, and System.exit does
      // not return, so neither the finally block below nor the caller's
      // close() runs.  Confirm whether this is intended.
      System.exit(0);
    } finally {
      in.close();
    }
  }

  /**
   * Filters a URL and, if it survives, adds it as a new page with the
   * injected-page score and the current fetch time.
   *
   * @return true if the page was handed to the writer; false if the filter
   *         rejected the URL or the URL was malformed
   * @throws IOException if the writer fails
   */
  private boolean addPage(String url) throws IOException {
    url = URLFilterFactory.getFilter().filter(url);
    if (url != null) {
      try {
        Page page = new Page(url, NEW_INJECTED_PAGE_SCORE, nextFetch);
        dbWriter.addPageIfNotPresent(page);
        return true;
      } catch (MalformedURLException e) {
        LOG.warning("bad url: "+url);
      }
    }
    return false;
  }

  /**
   * Appends every line of a UTF-8 text file to <code>topics</code>.
   * Any failure is logged and terminates the JVM.
   *
   * @param topicFile path of the file to read
   * @param topics    receives one String per line
   */
  private static void addTopicsFromFile(String topicFile, Vector topics) throws IOException {
    BufferedReader in = null;
    try {
      in = new BufferedReader(new InputStreamReader(new FileInputStream(topicFile), "UTF-8"));
      String line = null;
      while ((line = in.readLine()) != null) {
        topics.addElement(line);
      }
    } catch (Exception e) {
      LOG.severe(e.toString());
      e.printStackTrace(System.out);
      // NOTE(review): exit status 0 on failure -- confirm this is intended.
      System.exit(0);
    } finally {
      // 'in' is still null when opening the file failed.
      if (in != null) {
        in.close();
      }
    }
  }

  /**
   * Returns the value that follows the option at <code>argv[i]</code>.
   *
   * @throws IllegalArgumentException if the option is the last argument or
   *         the following slot is null
   */
  private static String optionValue(String[] argv, int i) {
    if (i + 1 >= argv.length || argv[i + 1] == null) {
      throw new IllegalArgumentException("Missing value for option " + argv[i]);
    }
    return argv[i + 1];
  }

  /**
   * Command-line entry point.  Parses the options, opens a writer on the
   * given database directory and runs either the URL-file or the DMOZ-file
   * injection.  The writer and the file system are closed in all cases
   * where control returns here.
   */
  public static void main(String argv[]) throws Exception {
    if (argv.length < 3) {
      System.out.println("Usage: WebDBInjector (-local | -ndfs <namenode:port>) <db_dir> (-urlfile <url_file> | -dmozfile <dmoz_file>) [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-noDmozDesc] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]");
      return;
    }

    int subsetDenom = 1;
    int skew = 0;
    String command = null, loadfile = null;
    boolean includeAdult = false, includeDmozDesc = true;
    Pattern topicPattern = null;
    Vector topics = new Vector();

    int i = 0;
    // NOTE(review): i is passed by value, so parseArgs cannot advance it;
    // presumably it removes the file-system arguments from argv in place so
    // that argv[0] is the database directory afterwards -- confirm.
    NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
    try {
      File root = new File(argv[i++]);

      for (; i < argv.length; i++) {
        String arg = argv[i];
        if ("-urlfile".equals(arg) || "-dmozfile".equals(arg)) {
          command = arg;
          loadfile = optionValue(argv, i);
          i++;
        } else if ("-includeAdultMaterial".equals(arg)) {
          includeAdult = true;
        } else if ("-noDmozDesc".equals(arg)) {
          includeDmozDesc = false;
        } else if ("-subset".equals(arg)) {
          subsetDenom = Integer.parseInt(optionValue(argv, i));
          i++;
        } else if ("-topic".equals(arg)) {
          topics.addElement(optionValue(argv, i));
          i++;
        } else if ("-topicFile".equals(arg)) {
          addTopicsFromFile(optionValue(argv, i), topics);
          i++;
        } else if ("-skew".equals(arg)) {
          skew = Integer.parseInt(optionValue(argv, i));
          i++;
        }
      }

      IWebDBWriter writer = new WebDBWriter(nfs, root);
      WebDBInjector injector = new WebDBInjector(writer);
      try {
        if ("-urlfile".equals(command)) {
          if (!topics.isEmpty()) {
            System.out.println("You can't select URLs based on a topic when usin a URL-file");
          }
          injector.injectURLFile(new File(loadfile));
        } else if ("-dmozfile".equals(command)) {
          if (!topics.isEmpty()) {
            // Build "^(t1|t2|...).*" from the requested topics.
            StringBuffer regExp = new StringBuffer("^(");
            for (int j = 0; j < topics.size(); j++) {
              if (j > 0) {
                regExp.append("|");
              }
              regExp.append((String) topics.get(j));
            }
            regExp.append(").*");
            LOG.info("Topic selection pattern = " + regExp);
            topicPattern = Pattern.compile(regExp.toString());
          }
          injector.injectDmozFile(new File(loadfile), subsetDenom, includeAdult, includeDmozDesc, skew, topicPattern);
        } else {
          System.out.println("No command indicated.");
          return;
        }
      } finally {
        injector.close();
      }
    } finally {
      nfs.close();
    }
  }
}