1 6 package net.nutch.tools; 7 8 import java.io.BufferedReader ; 9 import java.io.BufferedWriter ; 10 import java.io.File ; 11 import java.io.FileFilter ; 12 import java.io.FileInputStream ; 13 import java.io.FileOutputStream ; 14 import java.io.IOException ; 15 import java.io.InputStream ; 16 import java.io.InputStreamReader ; 17 import java.io.OutputStreamWriter ; 18 import java.io.PrintStream ; 19 import java.util.BitSet ; 20 import java.util.StringTokenizer ; 21 import java.util.Vector ; 22 import java.util.logging.Logger ; 23 24 import net.nutch.io.UTF8; 25 import net.nutch.util.LogFormatter; 26 import net.nutch.util.NutchConf; 27 28 import org.apache.lucene.analysis.WhitespaceAnalyzer; 29 import org.apache.lucene.document.Document; 30 import org.apache.lucene.index.IndexReader; 31 import org.apache.lucene.index.MultiReader; 32 import org.apache.lucene.queryParser.QueryParser; 33 import org.apache.lucene.search.HitCollector; 34 import org.apache.lucene.search.IndexSearcher; 35 import org.apache.lucene.search.Query; 36 import org.apache.lucene.store.Directory; 37 import org.apache.lucene.store.FSDirectory; 38 39 82 public class PruneIndexTool implements Runnable { 83 public static final Logger LOG = LogFormatter.getLogger("net.nutch.tools.PruneIndexTool"); 84 85 public static int LOG_STEP = 50000; 86 87 92 public static interface PruneChecker { 93 100 public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception ; 101 104 public void close(); 105 } 106 107 114 public static class PrintFieldsChecker implements PruneChecker { 115 private PrintStream ps = null; 116 private String [] fields = null; 117 118 124 public PrintFieldsChecker(PrintStream ps, String [] fields) { 125 this.ps = ps; 126 this.fields = fields; 127 } 128 129 public void close() { 130 ps.flush(); 131 } 132 133 public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception { 134 Document doc = reader.document(docNum); 135 StringBuffer sb = new StringBuffer ("#" + docNum + ":"); 136 for (int i = 0; i < fields.length; i++) { 137 String [] values = doc.getValues(fields[i]); 138 sb.append(" " + fields[i] + "="); 139 if (values != null) { 140 for (int k = 0; k < values.length; k++) { 141 sb.append("[" + values[k] + "]"); 142 } 143 } else sb.append("[null]"); 144 } 145 ps.println(sb.toString()); 146 return true; 147 } 148 } 149 150 156 public static class StoreUrlsChecker implements PruneChecker { 157 private BufferedWriter output = null; 158 private boolean storeHomeUrl = false; 159 160 164 public StoreUrlsChecker(File out, boolean storeHomeUrl) throws Exception { 165 this.output = new BufferedWriter (new OutputStreamWriter (new FileOutputStream (out), "UTF-8")); 166 this.storeHomeUrl = storeHomeUrl; 167 } 168 169 public void close() { 170 try { 171 output.flush(); 172 output.close(); 173 } catch (Exception e) { 174 LOG.warning("Error closing: " + e.getMessage()); 175 } 176 } 177 178 public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception { 179 Document doc = reader.document(docNum); 180 String url = doc.get("url"); 181 output.write(url); output.write('\n'); 182 if (storeHomeUrl) { 183 int idx = url.indexOf("://"); 185 if (idx != -1) { 186 idx = url.indexOf('/', idx + 3); 187 if (idx != -1) { 188 output.write(url.substring(0, idx + 1) + "\n"); 189 } 190 } 191 } 192 return true; 193 } 194 } 195 196 private Query[] queries = null; 197 private IndexReader reader = null; 198 private IndexSearcher searcher = null; 199 private PruneChecker[] checkers = null; 200 private boolean dryrun = false; 201 private String dr = ""; 202 203 223 public PruneIndexTool(File [] indexDirs, Query[] queries, PruneChecker[] checkers, 224 boolean unlock, boolean dryrun) throws Exception { 225 if (indexDirs == null || queries == null) 226 throw new Exception ("Invalid arguments."); 227 if (indexDirs.length == 0 || queries.length == 0) 228 throw new Exception ("Nothing to do."); 229 this.queries = queries; 230 this.checkers = checkers; 231 this.dryrun = dryrun; 232 if (dryrun) dr = "[DRY RUN] "; 233 int numIdx = 0; 234 if (indexDirs.length == 1) { 235 Directory dir = FSDirectory.getDirectory(indexDirs[0], false); 236 if (IndexReader.isLocked(dir)) { 237 if (!unlock) { 238 throw new Exception ("Index " + indexDirs[0] + " is locked."); 239 } 240 if (!dryrun) { 241 IndexReader.unlock(dir); 242 LOG.fine(" - had to unlock index in " + dir); 243 } 244 } 245 reader = IndexReader.open(dir); 246 numIdx = 1; 247 } else { 248 Directory dir; 249 Vector indexes = new Vector (indexDirs.length); 250 for (int i = 0; i < indexDirs.length; i++) { 251 try { 252 dir = FSDirectory.getDirectory(indexDirs[i], false); 253 if (IndexReader.isLocked(dir)) { 254 if (!unlock) { 255 LOG.warning(dr + "Index " + indexDirs[i] + " is locked. Skipping..."); 256 continue; 257 } 258 if (!dryrun) { 259 IndexReader.unlock(dir); 260 LOG.fine(" - had to unlock index in " + dir); 261 } 262 } 263 IndexReader r = IndexReader.open(dir); 264 indexes.add(r); 265 numIdx++; 266 } catch (Exception e) { 267 LOG.warning(dr + "Invalid index in " + indexDirs[i] + " - skipping..."); 268 } 269 } 270 if (indexes.size() == 0) throw new Exception ("No input indexes."); 271 IndexReader[] readers = (IndexReader[])indexes.toArray(new IndexReader[0]); 272 reader = new MultiReader(readers); 273 } 274 LOG.info(dr + "Opened " + numIdx + " index(es) with total " + reader.numDocs() + " documents."); 275 searcher = new IndexSearcher(reader); 276 } 277 278 286 private static class AllHitsCollector extends HitCollector { 287 private BitSet bits; 288 289 public AllHitsCollector(BitSet bits) { 290 this.bits = bits; 291 } 292 public void collect(int doc, float score) { 293 bits.set(doc); 294 } 295 } 296 297 302 public void run() { 303 BitSet bits = new BitSet (reader.maxDoc()); 304 AllHitsCollector ahc = new AllHitsCollector(bits); 305 boolean doDelete = false; 306 UTF8 url = new UTF8(); 307 for (int i = 0; i < queries.length; i++) { 308 LOG.info(dr + "Processing query: " + queries[i].toString()); 309 bits.clear(); 310 try { 311 searcher.search(queries[i], ahc); 312 } catch (IOException e) { 313 LOG.warning(dr + " - failed: " + e.getMessage()); 314 continue; 315 } 316 if (bits.cardinality() == 0) { 317 LOG.info(dr + " - no matching documents."); 318 continue; 319 } 320 LOG.info(dr + " - found " + bits.cardinality() + " document(s)."); 321 int docNum = -1, start = 0, cnt = 0; 323 while ((docNum = bits.nextSetBit(start)) != -1) { 325 if (reader.isDeleted(docNum)) continue; 327 try { 328 if (checkers != null && checkers.length > 0) { 329 boolean check = true; 330 for (int k = 0; k < checkers.length; k++) { 331 check &= checkers[k].isPrunable(queries[i], reader, docNum); 333 } 334 doDelete = check; 335 } else doDelete = true; 336 if (doDelete) { 337 if (!dryrun) reader.delete(docNum); 338 cnt++; 339 } 340 } catch (Exception e) { 341 LOG.warning(dr + " - failed to delete doc #" + docNum); 342 } 343 start = docNum + 1; 344 } 345 LOG.info(dr + " - deleted " + cnt + " document(s)."); 346 } 347 if (checkers != null) { 349 for (int i = 0; i < checkers.length; i++) { 350 checkers[i].close(); 351 } 352 } 353 try { 354 reader.close(); 355 } catch (IOException e) { 356 LOG.warning(dr + "Exception when closing reader(s): " + e.getMessage()); 357 } 358 } 359 360 public static void main(String [] args) throws Exception { 361 if (args.length == 0) { 362 usage(); 363 LOG.severe("Missing arguments"); 364 return; 365 } 366 File idx = new File (args[0]); 367 if (!idx.isDirectory()) { 368 usage(); 369 LOG.severe("Not a directory: " + idx); 370 return; 371 } 372 Vector paths = new Vector (); 373 if (IndexReader.indexExists(idx)) { 374 paths.add(idx); 375 } else { 376 File [] dirs = idx.listFiles(new FileFilter () { 378 public boolean accept(File f) { 379 return f.isDirectory(); 380 } 381 }); 382 if (dirs == null || dirs.length == 0) { 383 usage(); 384 LOG.severe("No indexes in " + idx); 385 return; 386 } 387 for (int i = 0; i < dirs.length; i++) { 388 File sidx = new File (dirs[i], "index"); 389 if (sidx.exists() && sidx.isDirectory() && IndexReader.indexExists(sidx)) { 390 paths.add(sidx); 391 } 392 } 393 if (paths.size() == 0) { 394 usage(); 395 LOG.severe("No indexes in " + idx + " or its subdirs."); 396 return; 397 } 398 } 399 File [] indexes = (File [])paths.toArray(new File [0]); 400 boolean force = false; 401 boolean dryrun = false; 402 String qPath = null; 403 String outPath = null; 404 String fList = null; 405 for (int i = 1; i < args.length; i++) { 406 if (args[i].equals("-force")) { 407 force = true; 408 } else if (args[i].equals("-queries")) { 409 qPath = args[++i]; 410 } else if (args[i].equals("-output")) { 411 outPath = args[++i]; 412 } else if (args[i].equals("-showfields")) { 413 fList = args[++i]; 414 } else if (args[i].equals("-dryrun")) { 415 dryrun = true; 416 } else { 417 usage(); 418 LOG.severe("Unrecognized option: " + args[i]); 419 return; 420 } 421 } 422 Vector cv = new Vector (); 423 if (fList != null) { 424 StringTokenizer st = new StringTokenizer (fList, ","); 425 Vector tokens = new Vector (); 426 while (st.hasMoreTokens()) tokens.add(st.nextToken()); 427 String [] fields = (String [])tokens.toArray(new String [0]); 428 PruneChecker pc = new PrintFieldsChecker(System.out, fields); 429 cv.add(pc); 430 } 431 432 if (outPath != null) { 433 StoreUrlsChecker luc = new StoreUrlsChecker(new File (outPath), false); 434 cv.add(luc); 435 } 436 437 PruneChecker[] checkers = null; 438 if (cv.size() > 0) { 439 checkers = (PruneChecker[])cv.toArray(new PruneChecker[0]); 440 } 441 Query[] queries = null; 442 InputStream is = null; 443 if (qPath != null) { 444 is = new FileInputStream (qPath); 445 } else { 446 qPath = NutchConf.get("prune.index.tool.queries"); 447 is = NutchConf.getConfResourceAsInputStream(qPath); 448 } 449 if (is == null) { 450 LOG.severe("Can't load queries from " + qPath); 451 return; 452 } 453 try { 454 queries = parseQueries(is); 455 } catch (Exception e) { 456 LOG.severe("Error parsing queries: " + e.getMessage()); 457 return; 458 } 459 try { 460 PruneIndexTool pit = new PruneIndexTool(indexes, queries, checkers, true, dryrun); 461 pit.run(); 462 } catch (Exception e) { 463 LOG.severe("Error running PruneIndexTool: " + e.getMessage()); 464 return; 465 } 466 } 467 468 478 public static Query[] parseQueries(InputStream is) throws Exception { 479 BufferedReader br = new BufferedReader (new InputStreamReader (is, "UTF-8")); 480 String line = null; 481 QueryParser qp = new QueryParser("url", new WhitespaceAnalyzer()); 482 Vector queries = new Vector (); 483 while ((line = br.readLine()) != null) { 484 line = line.trim(); 485 if (line.length() == 0 || line.charAt(0) == '#') continue; 487 Query q = qp.parse(line); 488 queries.add(q); 489 } 490 return (Query[])queries.toArray(new Query[0]); 491 } 492 493 private static void usage() { 494 System.err.println("PruneIndexTool <indexDir | segmentsDir> [-dryrun] [-force] [-queries filename] [-output filename] [-showfields field1,field2,field3...]"); 495 System.err.println("\tNOTE: exactly one of <indexDir> or <segmentsDir> MUST be provided!\n"); 496 System.err.println("\t-dryrun\t\t\tdon't do anything, just show what would be done."); 497 System.err.println("\t-force\t\t\tforce index unlock, if locked. Use with caution!"); 498 System.err.println("\t-queries filename\tread pruning queries from this file, instead of the"); 499 System.err.println("\t\t\t\tdefault defined in Nutch config files under 'prune.index.tool.queries' key.\n"); 500 System.err.println("\t-output filename\tstore pruned URLs in a text file"); 501 System.err.println("\t-showfields field1,field2...\tfor each deleted document show the values of the selected fields."); 502 System.err.println("\t\t\t\tNOTE 1: this will slow down processing by orders of magnitude."); 503 System.err.println("\t\t\t\tNOTE 2: only values of stored fields will be shown."); 504 } 505 } 506 | Popular Tags |