PruneIndexTool


1   /*
2    * Created on Nov 2, 2004
3    * Author: Andrzej Bialecki <ab@getopt.org>
4    *
5    */
6   package net.nutch.tools;
7   
8   import java.io.BufferedReader  ;
9   import java.io.BufferedWriter  ;
10  import java.io.File  ;
11  import java.io.FileFilter  ;
12  import java.io.FileInputStream  ;
13  import java.io.FileOutputStream  ;
14  import java.io.IOException  ;
15  import java.io.InputStream  ;
16  import java.io.InputStreamReader  ;
17  import java.io.OutputStreamWriter  ;
18  import java.io.PrintStream  ;
19  import java.util.BitSet  ;
20  import java.util.StringTokenizer  ;
21  import java.util.Vector  ;
22  import java.util.logging.Logger  ;
23  
24  import net.nutch.io.UTF8;
25  import net.nutch.util.LogFormatter;
26  import net.nutch.util.NutchConf;
27  
28  import org.apache.lucene.analysis.WhitespaceAnalyzer;
29  import org.apache.lucene.document.Document;
30  import org.apache.lucene.index.IndexReader;
31  import org.apache.lucene.index.MultiReader;
32  import org.apache.lucene.queryParser.QueryParser;
33  import org.apache.lucene.search.HitCollector;
34  import org.apache.lucene.search.IndexSearcher;
35  import org.apache.lucene.search.Query;
36  import org.apache.lucene.store.Directory;
37  import org.apache.lucene.store.FSDirectory;
38  
39  /**
40   * This tool prunes existing Nutch indexes of unwanted content. The main method
41   * accepts a list of segment directories (containing indexes). These indexes will
42   * be pruned of any content that matches one or more query from a list of Lucene
43   * queries read from a file (defined in standard config file, or explicitly
44   * overridden from command-line). Segments should already be indexed, if some
45   * of them are missing indexes then these segments will be skipped.
46   * 
47   * <p>NOTE 1: Queries are expressed in Lucene's QueryParser syntax, so a knowledge
48   * of available Lucene document fields is required. This can be obtained by reading sources
49   * of <code>index-basic</code> and <code>index-more</code> plugins, or using tools
50   * like <a HREF="http://www.getopt.org/luke">Luke</a>. During query parsing a
51   * WhitespaceAnalyzer is used - this choice has been made to minimize side effects of
52   * Analyzer on the final set of query terms. You can use {@link net.nutch.searcher.Query#main(String[])}
53   * method to translate queries in Nutch syntax to queries in Lucene syntax.<br>
54   * If additional level of control is required, an instance of {@link PruneChecker} can
55   * be provided to check each document before it's deleted. The results of all
56   * checkers are logically AND-ed, which means that any checker in the chain
57   * can veto the deletion of the current document. Two example checker implementations
58   * are provided - PrintFieldsChecker prints the values of selected index fields,
59   * StoreUrlsChecker stores the URLs of deleted documents to a file. Any of them can
60   * be activated by providing respective command-line options.
61   * </p>
62   * <p>The typical command-line usage is as follows:<br>
63   * <blockquote>
64   * <code>PruneIndexTool index_dir -dryrun -queries queries.txt -showfields url,title</code><br>
65   * This command will just print out fields of matching documents.<br>
66   * <code>PruneIndexTool index_dir -queries queries.txt</code><br>
67   * This command will actually remove all matching entries, according to the
68   * queries read from <code>queries.txt</code> file.
69   * </blockquote></p>
70   * <p>NOTE 2: This tool removes matching documents ONLY from segment indexes (or
71   * from a merged index). In particular it does NOT remove the pages and links
72   * from WebDB. This means that unwanted URLs may pop up again when new segments
73   * are created. To prevent this, use your own {@link net.nutch.net.URLFilter},
74   * or PruneDBTool (under construction...).</p>
75   * <p>NOTE 3: This tool uses a low-level Lucene interface to collect all matching
76   * documents. For large indexes and broad queries this may result in high memory
77   * consumption. If you encounter OutOfMemory exceptions, try to narrow down your
78   * queries, or increase the heap size.</p>
79   * 
80   * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
81   */
82  public class PruneIndexTool implements Runnable   {
83    public static final Logger   LOG = LogFormatter.getLogger("net.nutch.tools.PruneIndexTool");
84    /** Log the progress every LOG_STEP number of processed documents. */
85    public static int LOG_STEP = 50000;
86    
87    /**
88     * This interface can be used to implement additional checking on matching
89     * documents.
90     * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
91     */
92    public static interface PruneChecker {
93      /**
94       * Check whether this document should be pruned. NOTE: this method
95       * MUST NOT modify the IndexReader.
96       * @param reader index reader to read documents from
97       * @param docNum document ID
98       * @return true if the document should be deleted, false otherwise.
99       */
100     public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception  ;
101     /**
102      * Close the checker - this could involve flushing output files or somesuch.
103      */
104     public void close();
105   }
106 
107   /**
108    * This checker's main function is just to print out
109    * selected field values from each document, just before
110    * they are deleted.
111    * 
112    * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
113    */
114   public static class PrintFieldsChecker implements PruneChecker {
115     private PrintStream   ps = null;
116     private String  [] fields = null;
117     
118     /**
119      * 
120      * @param ps an instance of PrintStream to print the information to
121      * @param fields a list of Lucene index field names. Values from these
122      * fields will be printed for every matching document.
123      */
124     public PrintFieldsChecker(PrintStream   ps, String  [] fields) {
125       this.ps = ps;
126       this.fields = fields;
127     }
128 
129     public void close() {
130       ps.flush();
131     }
132     
133     public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception   {
134       Document doc = reader.document(docNum);
135       StringBuffer   sb = new StringBuffer  ("#" + docNum + ":");
136       for (int i = 0; i < fields.length; i++) {
137         String  [] values = doc.getValues(fields[i]);
138         sb.append(" " + fields[i] + "=");
139         if (values != null) {
140           for (int k = 0; k < values.length; k++) {
141             sb.append("[" + values[k] + "]");
142           }
143         } else sb.append("[null]");
144       }
145       ps.println(sb.toString());
146       return true;
147     }
148   }
149 
150   /**
151    * This checker's main function is just to store
152    * the URLs of each document to be deleted in a text file.
153    * 
154    * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
155    */
156   public static class StoreUrlsChecker implements PruneChecker {
157     private BufferedWriter   output = null;
158     private boolean storeHomeUrl = false;
159     
160     /**
161      * Store the list in a file
162      * @param out name of the output file
163      */
164     public StoreUrlsChecker(File   out, boolean storeHomeUrl) throws Exception   {
165       this.output = new BufferedWriter  (new OutputStreamWriter  (new FileOutputStream  (out), "UTF-8"));
166       this.storeHomeUrl = storeHomeUrl;
167     }
168     
169     public void close() {
170       try {
171         output.flush();
172         output.close();
173       } catch (Exception   e) {
174         LOG.warning("Error closing: " + e.getMessage());
175       }
176     }
177     
178     public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception   {
179       Document doc = reader.document(docNum);
180       String   url = doc.get("url");
181       output.write(url); output.write('\n');
182       if (storeHomeUrl) {
183         // store also the main url
184         int idx = url.indexOf("://");
185         if (idx != -1) {
186           idx = url.indexOf('/', idx + 3);
187           if (idx != -1) {
188             output.write(url.substring(0, idx + 1) + "\n");
189           }
190         }
191       }
192       return true;
193     }
194   }
195 
196   private Query[] queries = null;
197   private IndexReader reader = null;
198   private IndexSearcher searcher = null;
199   private PruneChecker[] checkers = null;
200   private boolean dryrun = false;
201   private String   dr = "";
202   
203   /**
204    * Create an instance of the tool, and open all input indexes.
205    * @param indexDirs directories with input indexes. At least one valid index must
206    * exist, otherwise an Exception is thrown.
207    * @param queries pruning queries. Each query will be processed in turn, and the
208    * length of the array must be at least one, otherwise an Exception is thrown.
209    * @param checkers if not null, they will be used to perform additional
210    * checks on matching documents - each checker's method {@link PruneChecker#isPrunable(Query, IndexReader, int)}
211    * will be called in turn, for each matching document, and if it returns true this means that
212    * the document should be deleted. A logical AND is performed on the results returned
213    * by all checkers (which means that if one of them returns false, the document will
214    * not be deleted).
215    * @param unlock if true, and if any of the input indexes is locked, forcibly
216    * unlock it. Use with care, only when you are sure that other processes don't
217    * modify the index at the same time.
218    * @param dryrun if set to true, don't change the index, just show what would be done.
219    * If false, perform all actions, changing indexes as needed. Note: dryrun doesn't prevent
220    * PruneCheckers from performing changes or causing any other side-effects.
221    * @throws Exception
222    */
223   public PruneIndexTool(File  [] indexDirs, Query[] queries, PruneChecker[] checkers,
224           boolean unlock, boolean dryrun) throws Exception   {
225     if (indexDirs == null || queries == null)
226       throw new Exception  ("Invalid arguments.");
227     if (indexDirs.length == 0 || queries.length == 0)
228       throw new Exception  ("Nothing to do.");
229     this.queries = queries;
230     this.checkers = checkers;
231     this.dryrun = dryrun;
232     if (dryrun) dr = "[DRY RUN] ";
233     int numIdx = 0;
234     if (indexDirs.length == 1) {
235       Directory dir = FSDirectory.getDirectory(indexDirs[0], false);
236       if (IndexReader.isLocked(dir)) {
237         if (!unlock) {
238           throw new Exception  ("Index " + indexDirs[0] + " is locked.");
239         }
240         if (!dryrun) {
241           IndexReader.unlock(dir);
242           LOG.fine(" - had to unlock index in " + dir);
243         }
244       }
245       reader = IndexReader.open(dir);
246       numIdx = 1;
247     } else {
248       Directory dir;
249       Vector   indexes = new Vector  (indexDirs.length);
250       for (int i = 0; i < indexDirs.length; i++) {
251         try {
252           dir = FSDirectory.getDirectory(indexDirs[i], false);
253           if (IndexReader.isLocked(dir)) {
254             if (!unlock) {
255               LOG.warning(dr + "Index " + indexDirs[i] + " is locked. Skipping...");
256               continue;
257             }
258             if (!dryrun) {
259               IndexReader.unlock(dir);
260               LOG.fine(" - had to unlock index in " + dir);
261             }
262           }
263           IndexReader r = IndexReader.open(dir);
264           indexes.add(r);
265           numIdx++;
266         } catch (Exception   e) {
267           LOG.warning(dr + "Invalid index in " + indexDirs[i] + " - skipping...");
268         }
269       }
270       if (indexes.size() == 0) throw new Exception  ("No input indexes.");
271       IndexReader[] readers = (IndexReader[])indexes.toArray(new IndexReader[0]);
272       reader = new MultiReader(readers);
273     }
274     LOG.info(dr + "Opened " + numIdx + " index(es) with total " + reader.numDocs() + " documents.");
275     searcher = new IndexSearcher(reader);
276   }
277   
278   /**
279    * This class collects all matching document IDs in a BitSet.
280    * <p>NOTE: the reason to use this API is that the most common way of
281    * performing Lucene queries (Searcher.search(Query)::Hits) does NOT
282    * return all matching documents, because it skips very low scoring hits.</p>
283    * 
284    * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
285    */
286   private static class AllHitsCollector extends HitCollector {
287     private BitSet   bits;
288     
289     public AllHitsCollector(BitSet   bits) {
290       this.bits = bits;
291     }
292     public void collect(int doc, float score) {
293       bits.set(doc);
294     }
295   }
296   
297   /**
298    * For each query, find all matching documents and delete them from all input
299    * indexes. Optionally, an additional check can be performed by using {@link PruneChecker}
300    * implementations.
301    */
302   public void run() {
303     BitSet   bits = new BitSet  (reader.maxDoc());
304     AllHitsCollector ahc = new AllHitsCollector(bits);
305     boolean doDelete = false;
306     UTF8 url = new UTF8();
307     for (int i = 0; i < queries.length; i++) {
308       LOG.info(dr + "Processing query: " + queries[i].toString());
309       bits.clear();
310       try {
311         searcher.search(queries[i], ahc);
312       } catch (IOException   e) {
313         LOG.warning(dr + " - failed: " + e.getMessage());
314         continue;
315       }
316       if (bits.cardinality() == 0) {
317         LOG.info(dr + " - no matching documents.");
318         continue;
319       }
320       LOG.info(dr + " - found " + bits.cardinality() + " document(s).");
321       // Now delete all matching documents
322       int docNum = -1, start = 0, cnt = 0;
323       // probably faster than looping sequentially through all index values?
324       while ((docNum = bits.nextSetBit(start)) != -1) {
325         // don't delete the same document multiple times
326         if (reader.isDeleted(docNum)) continue;
327         try {
328           if (checkers != null && checkers.length > 0) {
329             boolean check = true;
330             for (int k = 0; k < checkers.length; k++) {
331               // fail if any checker returns false
332               check &= checkers[k].isPrunable(queries[i], reader, docNum);
333             }
334             doDelete = check;
335           } else doDelete = true;
336           if (doDelete) {
337             if (!dryrun) reader.delete(docNum);
338             cnt++;
339           }
340         } catch (Exception   e) {
341           LOG.warning(dr + " - failed to delete doc #" + docNum);
342         }
343         start = docNum + 1;
344       }
345       LOG.info(dr + " - deleted " + cnt + " document(s).");
346     }
347     // close checkers
348     if (checkers != null) {
349       for (int i = 0; i < checkers.length; i++) {
350         checkers[i].close();
351       }
352     }
353     try {
354       reader.close();
355     } catch (IOException   e) {
356       LOG.warning(dr + "Exception when closing reader(s): " + e.getMessage());
357     }
358   }
359   
360   public static void main(String  [] args) throws Exception   {
361     if (args.length == 0) {
362       usage();
363       LOG.severe("Missing arguments");
364       return;
365     }
366     File   idx = new File  (args[0]);
367     if (!idx.isDirectory()) {
368       usage();
369       LOG.severe("Not a directory: " + idx);
370       return;
371     }
372     Vector   paths = new Vector  ();
373     if (IndexReader.indexExists(idx)) {
374       paths.add(idx);
375     } else {
376       // try and see if there are segments inside, with index dirs
377       File  [] dirs = idx.listFiles(new FileFilter  () {
378         public boolean accept(File   f) {
379           return f.isDirectory();
380         }
381       });
382       if (dirs == null || dirs.length == 0) {
383         usage();
384         LOG.severe("No indexes in " + idx);
385         return;
386       }
387       for (int i = 0; i < dirs.length; i++) {
388         File   sidx = new File  (dirs[i], "index");
389         if (sidx.exists() && sidx.isDirectory() && IndexReader.indexExists(sidx)) {
390           paths.add(sidx);
391         }
392       }
393       if (paths.size() == 0) {
394         usage();
395         LOG.severe("No indexes in " + idx + " or its subdirs.");
396         return;
397       }
398     }
399     File  [] indexes = (File  [])paths.toArray(new File  [0]);
400     boolean force = false;
401     boolean dryrun = false;
402     String   qPath = null;
403     String   outPath = null;
404     String   fList = null;
405     for (int i = 1; i < args.length; i++) {
406       if (args[i].equals("-force")) {
407         force = true;
408       } else if (args[i].equals("-queries")) {
409         qPath = args[++i];
410       } else if (args[i].equals("-output")) {
411         outPath = args[++i];
412       } else if (args[i].equals("-showfields")) {
413         fList = args[++i];
414       } else if (args[i].equals("-dryrun")) {
415         dryrun = true;
416       } else {
417         usage();
418         LOG.severe("Unrecognized option: " + args[i]);
419         return;
420       }
421     }
422     Vector   cv = new Vector  ();
423     if (fList != null) {
424       StringTokenizer   st = new StringTokenizer  (fList, ",");
425       Vector   tokens = new Vector  ();
426       while (st.hasMoreTokens()) tokens.add(st.nextToken());
427       String  [] fields = (String  [])tokens.toArray(new String  [0]);
428       PruneChecker pc = new PrintFieldsChecker(System.out, fields);
429       cv.add(pc);
430     }
431     
432     if (outPath != null) {
433       StoreUrlsChecker luc = new StoreUrlsChecker(new File  (outPath), false);
434       cv.add(luc);
435     }
436 
437     PruneChecker[] checkers = null;
438     if (cv.size() > 0) {
439       checkers = (PruneChecker[])cv.toArray(new PruneChecker[0]);
440     }
441     Query[] queries = null;
442     InputStream   is = null;
443     if (qPath != null) {
444       is = new FileInputStream  (qPath);
445     } else {
446       qPath = NutchConf.get("prune.index.tool.queries");
447       is = NutchConf.getConfResourceAsInputStream(qPath);
448     }
449     if (is == null) {
450       LOG.severe("Can't load queries from " + qPath);
451       return;
452     }
453     try {
454       queries = parseQueries(is);
455     } catch (Exception   e) {
456       LOG.severe("Error parsing queries: " + e.getMessage());
457       return;
458     }
459     try {
460       PruneIndexTool pit = new PruneIndexTool(indexes, queries, checkers, true, dryrun);
461       pit.run();
462     } catch (Exception   e) {
463       LOG.severe("Error running PruneIndexTool: " + e.getMessage());
464       return;
465     }
466   }
467   
468   /**
469    * Read a list of Lucene queries from the stream (UTF-8 encoding is assumed).
470    * There should be a single Lucene query per line. Blank lines and comments
471    * starting with '#' are allowed.
472    * <p>NOTE: you may wish to use {@link net.nutch.searcher.Query#main(String[])}
473    * method to translate queries from Nutch format to Lucene format.</p>
474    * @param is InputStream to read from
475    * @return array of Lucene queries
476    * @throws Exception
477    */
478   public static Query[] parseQueries(InputStream   is) throws Exception   {
479     BufferedReader   br = new BufferedReader  (new InputStreamReader  (is, "UTF-8"));
480     String   line = null;
481     QueryParser qp = new QueryParser("url", new WhitespaceAnalyzer());
482     Vector   queries = new Vector  ();
483     while ((line = br.readLine()) != null) {
484       line = line.trim();
485       //skip blanks and comments
486       if (line.length() == 0 || line.charAt(0) == '#') continue;
487       Query q = qp.parse(line);
488       queries.add(q);
489     }
490     return (Query[])queries.toArray(new Query[0]);
491   }
492   
493   private static void usage() {
494     System.err.println("PruneIndexTool <indexDir | segmentsDir> [-dryrun] [-force] [-queries filename] [-output filename] [-showfields field1,field2,field3...]");
495     System.err.println("\tNOTE: exactly one of <indexDir> or <segmentsDir> MUST be provided!\n");
496     System.err.println("\t-dryrun\t\t\tdon't do anything, just show what would be done.");
497     System.err.println("\t-force\t\t\tforce index unlock, if locked. Use with caution!");
498     System.err.println("\t-queries filename\tread pruning queries from this file, instead of the");
499     System.err.println("\t\t\t\tdefault defined in Nutch config files under 'prune.index.tool.queries' key.\n");
500     System.err.println("\t-output filename\tstore pruned URLs in a text file");
501     System.err.println("\t-showfields field1,field2...\tfor each deleted document show the values of the selected fields.");
502     System.err.println("\t\t\t\tNOTE 1: this will slow down processing by orders of magnitude.");
503     System.err.println("\t\t\t\tNOTE 2: only values of stored fields will be shown.");
504   }
505 }
506
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags