KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > tools > PruneIndexTool


1 /*
2  * Created on Nov 2, 2004
3  * Author: Andrzej Bialecki <ab@getopt.org>
4  *
5  */

6 package net.nutch.tools;
7
8 import java.io.BufferedReader JavaDoc;
9 import java.io.BufferedWriter JavaDoc;
10 import java.io.File JavaDoc;
11 import java.io.FileFilter JavaDoc;
12 import java.io.FileInputStream JavaDoc;
13 import java.io.FileOutputStream JavaDoc;
14 import java.io.IOException JavaDoc;
15 import java.io.InputStream JavaDoc;
16 import java.io.InputStreamReader JavaDoc;
17 import java.io.OutputStreamWriter JavaDoc;
18 import java.io.PrintStream JavaDoc;
19 import java.util.BitSet JavaDoc;
20 import java.util.StringTokenizer JavaDoc;
21 import java.util.Vector JavaDoc;
22 import java.util.logging.Logger JavaDoc;
23
24 import net.nutch.io.UTF8;
25 import net.nutch.util.LogFormatter;
26 import net.nutch.util.NutchConf;
27
28 import org.apache.lucene.analysis.WhitespaceAnalyzer;
29 import org.apache.lucene.document.Document;
30 import org.apache.lucene.index.IndexReader;
31 import org.apache.lucene.index.MultiReader;
32 import org.apache.lucene.queryParser.QueryParser;
33 import org.apache.lucene.search.HitCollector;
34 import org.apache.lucene.search.IndexSearcher;
35 import org.apache.lucene.search.Query;
36 import org.apache.lucene.store.Directory;
37 import org.apache.lucene.store.FSDirectory;
38
39 /**
40  * This tool prunes existing Nutch indexes of unwanted content. The main method
41  * accepts a list of segment directories (containing indexes). These indexes will
42  * be pruned of any content that matches one or more query from a list of Lucene
43  * queries read from a file (defined in standard config file, or explicitly
44  * overridden from command-line). Segments should already be indexed, if some
45  * of them are missing indexes then these segments will be skipped.
46  *
47  * <p>NOTE 1: Queries are expressed in Lucene's QueryParser syntax, so a knowledge
48  * of available Lucene document fields is required. This can be obtained by reading sources
49  * of <code>index-basic</code> and <code>index-more</code> plugins, or using tools
50  * like <a HREF="http://www.getopt.org/luke">Luke</a>. During query parsing a
51  * WhitespaceAnalyzer is used - this choice has been made to minimize side effects of
52  * Analyzer on the final set of query terms. You can use {@link net.nutch.searcher.Query#main(String[])}
53  * method to translate queries in Nutch syntax to queries in Lucene syntax.<br>
54  * If additional level of control is required, an instance of {@link PruneChecker} can
55  * be provided to check each document before it's deleted. The results of all
56  * checkers are logically AND-ed, which means that any checker in the chain
57  * can veto the deletion of the current document. Two example checker implementations
58  * are provided - PrintFieldsChecker prints the values of selected index fields,
59  * StoreUrlsChecker stores the URLs of deleted documents to a file. Any of them can
60  * be activated by providing respective command-line options.
61  * </p>
62  * <p>The typical command-line usage is as follows:<br>
63  * <blockquote>
64  * <code>PruneIndexTool index_dir -dryrun -queries queries.txt -showfields url,title</code><br>
65  * This command will just print out fields of matching documents.<br>
66  * <code>PruneIndexTool index_dir -queries queries.txt</code><br>
67  * This command will actually remove all matching entries, according to the
68  * queries read from <code>queries.txt</code> file.
69  * </blockquote></p>
70  * <p>NOTE 2: This tool removes matching documents ONLY from segment indexes (or
71  * from a merged index). In particular it does NOT remove the pages and links
72  * from WebDB. This means that unwanted URLs may pop up again when new segments
73  * are created. To prevent this, use your own {@link net.nutch.net.URLFilter},
74  * or PruneDBTool (under construction...).</p>
75  * <p>NOTE 3: This tool uses a low-level Lucene interface to collect all matching
76  * documents. For large indexes and broad queries this may result in high memory
77  * consumption. If you encounter OutOfMemory exceptions, try to narrow down your
78  * queries, or increase the heap size.</p>
79  *
80  * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
81  */

82 public class PruneIndexTool implements Runnable JavaDoc {
83   public static final Logger JavaDoc LOG = LogFormatter.getLogger("net.nutch.tools.PruneIndexTool");
84   /** Log the progress every LOG_STEP number of processed documents. */
85   public static int LOG_STEP = 50000;
86   
87   /**
88    * This interface can be used to implement additional checking on matching
89    * documents.
90    * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
91    */

92   public static interface PruneChecker {
93     /**
94      * Check whether this document should be pruned. NOTE: this method
95      * MUST NOT modify the IndexReader.
96      * @param reader index reader to read documents from
97      * @param docNum document ID
98      * @return true if the document should be deleted, false otherwise.
99      */

100     public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception JavaDoc;
101     /**
102      * Close the checker - this could involve flushing output files or somesuch.
103      */

104     public void close();
105   }
106
107   /**
108    * This checker's main function is just to print out
109    * selected field values from each document, just before
110    * they are deleted.
111    *
112    * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
113    */

114   public static class PrintFieldsChecker implements PruneChecker {
115     private PrintStream JavaDoc ps = null;
116     private String JavaDoc[] fields = null;
117     
118     /**
119      *
120      * @param ps an instance of PrintStream to print the information to
121      * @param fields a list of Lucene index field names. Values from these
122      * fields will be printed for every matching document.
123      */

124     public PrintFieldsChecker(PrintStream JavaDoc ps, String JavaDoc[] fields) {
125       this.ps = ps;
126       this.fields = fields;
127     }
128
129     public void close() {
130       ps.flush();
131     }
132     
133     public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception JavaDoc {
134       Document doc = reader.document(docNum);
135       StringBuffer JavaDoc sb = new StringBuffer JavaDoc("#" + docNum + ":");
136       for (int i = 0; i < fields.length; i++) {
137         String JavaDoc[] values = doc.getValues(fields[i]);
138         sb.append(" " + fields[i] + "=");
139         if (values != null) {
140           for (int k = 0; k < values.length; k++) {
141             sb.append("[" + values[k] + "]");
142           }
143         } else sb.append("[null]");
144       }
145       ps.println(sb.toString());
146       return true;
147     }
148   }
149
150   /**
151    * This checker's main function is just to store
152    * the URLs of each document to be deleted in a text file.
153    *
154    * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
155    */

156   public static class StoreUrlsChecker implements PruneChecker {
157     private BufferedWriter JavaDoc output = null;
158     private boolean storeHomeUrl = false;
159     
160     /**
161      * Store the list in a file
162      * @param out name of the output file
163      */

164     public StoreUrlsChecker(File JavaDoc out, boolean storeHomeUrl) throws Exception JavaDoc {
165       this.output = new BufferedWriter JavaDoc(new OutputStreamWriter JavaDoc(new FileOutputStream JavaDoc(out), "UTF-8"));
166       this.storeHomeUrl = storeHomeUrl;
167     }
168     
169     public void close() {
170       try {
171         output.flush();
172         output.close();
173       } catch (Exception JavaDoc e) {
174         LOG.warning("Error closing: " + e.getMessage());
175       }
176     }
177     
178     public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception JavaDoc {
179       Document doc = reader.document(docNum);
180       String JavaDoc url = doc.get("url");
181       output.write(url); output.write('\n');
182       if (storeHomeUrl) {
183         // store also the main url
184
int idx = url.indexOf("://");
185         if (idx != -1) {
186           idx = url.indexOf('/', idx + 3);
187           if (idx != -1) {
188             output.write(url.substring(0, idx + 1) + "\n");
189           }
190         }
191       }
192       return true;
193     }
194   }
195
196   private Query[] queries = null;
197   private IndexReader reader = null;
198   private IndexSearcher searcher = null;
199   private PruneChecker[] checkers = null;
200   private boolean dryrun = false;
201   private String JavaDoc dr = "";
202   
203   /**
204    * Create an instance of the tool, and open all input indexes.
205    * @param indexDirs directories with input indexes. At least one valid index must
206    * exist, otherwise an Exception is thrown.
207    * @param queries pruning queries. Each query will be processed in turn, and the
208    * length of the array must be at least one, otherwise an Exception is thrown.
209    * @param checkers if not null, they will be used to perform additional
210    * checks on matching documents - each checker's method {@link PruneChecker#isPrunable(Query, IndexReader, int)}
211    * will be called in turn, for each matching document, and if it returns true this means that
212    * the document should be deleted. A logical AND is performed on the results returned
213    * by all checkers (which means that if one of them returns false, the document will
214    * not be deleted).
215    * @param unlock if true, and if any of the input indexes is locked, forcibly
216    * unlock it. Use with care, only when you are sure that other processes don't
217    * modify the index at the same time.
218    * @param dryrun if set to true, don't change the index, just show what would be done.
219    * If false, perform all actions, changing indexes as needed. Note: dryrun doesn't prevent
220    * PruneCheckers from performing changes or causing any other side-effects.
221    * @throws Exception
222    */

223   public PruneIndexTool(File JavaDoc[] indexDirs, Query[] queries, PruneChecker[] checkers,
224           boolean unlock, boolean dryrun) throws Exception JavaDoc {
225     if (indexDirs == null || queries == null)
226       throw new Exception JavaDoc("Invalid arguments.");
227     if (indexDirs.length == 0 || queries.length == 0)
228       throw new Exception JavaDoc("Nothing to do.");
229     this.queries = queries;
230     this.checkers = checkers;
231     this.dryrun = dryrun;
232     if (dryrun) dr = "[DRY RUN] ";
233     int numIdx = 0;
234     if (indexDirs.length == 1) {
235       Directory dir = FSDirectory.getDirectory(indexDirs[0], false);
236       if (IndexReader.isLocked(dir)) {
237         if (!unlock) {
238           throw new Exception JavaDoc("Index " + indexDirs[0] + " is locked.");
239         }
240         if (!dryrun) {
241           IndexReader.unlock(dir);
242           LOG.fine(" - had to unlock index in " + dir);
243         }
244       }
245       reader = IndexReader.open(dir);
246       numIdx = 1;
247     } else {
248       Directory dir;
249       Vector JavaDoc indexes = new Vector JavaDoc(indexDirs.length);
250       for (int i = 0; i < indexDirs.length; i++) {
251         try {
252           dir = FSDirectory.getDirectory(indexDirs[i], false);
253           if (IndexReader.isLocked(dir)) {
254             if (!unlock) {
255               LOG.warning(dr + "Index " + indexDirs[i] + " is locked. Skipping...");
256               continue;
257             }
258             if (!dryrun) {
259               IndexReader.unlock(dir);
260               LOG.fine(" - had to unlock index in " + dir);
261             }
262           }
263           IndexReader r = IndexReader.open(dir);
264           indexes.add(r);
265           numIdx++;
266         } catch (Exception JavaDoc e) {
267           LOG.warning(dr + "Invalid index in " + indexDirs[i] + " - skipping...");
268         }
269       }
270       if (indexes.size() == 0) throw new Exception JavaDoc("No input indexes.");
271       IndexReader[] readers = (IndexReader[])indexes.toArray(new IndexReader[0]);
272       reader = new MultiReader(readers);
273     }
274     LOG.info(dr + "Opened " + numIdx + " index(es) with total " + reader.numDocs() + " documents.");
275     searcher = new IndexSearcher(reader);
276   }
277   
278   /**
279    * This class collects all matching document IDs in a BitSet.
280    * <p>NOTE: the reason to use this API is that the most common way of
281    * performing Lucene queries (Searcher.search(Query)::Hits) does NOT
282    * return all matching documents, because it skips very low scoring hits.</p>
283    *
284    * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
285    */

286   private static class AllHitsCollector extends HitCollector {
287     private BitSet JavaDoc bits;
288     
289     public AllHitsCollector(BitSet JavaDoc bits) {
290       this.bits = bits;
291     }
292     public void collect(int doc, float score) {
293       bits.set(doc);
294     }
295   }
296   
297   /**
298    * For each query, find all matching documents and delete them from all input
299    * indexes. Optionally, an additional check can be performed by using {@link PruneChecker}
300    * implementations.
301    */

302   public void run() {
303     BitSet JavaDoc bits = new BitSet JavaDoc(reader.maxDoc());
304     AllHitsCollector ahc = new AllHitsCollector(bits);
305     boolean doDelete = false;
306     UTF8 url = new UTF8();
307     for (int i = 0; i < queries.length; i++) {
308       LOG.info(dr + "Processing query: " + queries[i].toString());
309       bits.clear();
310       try {
311         searcher.search(queries[i], ahc);
312       } catch (IOException JavaDoc e) {
313         LOG.warning(dr + " - failed: " + e.getMessage());
314         continue;
315       }
316       if (bits.cardinality() == 0) {
317         LOG.info(dr + " - no matching documents.");
318         continue;
319       }
320       LOG.info(dr + " - found " + bits.cardinality() + " document(s).");
321       // Now delete all matching documents
322
int docNum = -1, start = 0, cnt = 0;
323       // probably faster than looping sequentially through all index values?
324
while ((docNum = bits.nextSetBit(start)) != -1) {
325         // don't delete the same document multiple times
326
if (reader.isDeleted(docNum)) continue;
327         try {
328           if (checkers != null && checkers.length > 0) {
329             boolean check = true;
330             for (int k = 0; k < checkers.length; k++) {
331               // fail if any checker returns false
332
check &= checkers[k].isPrunable(queries[i], reader, docNum);
333             }
334             doDelete = check;
335           } else doDelete = true;
336           if (doDelete) {
337             if (!dryrun) reader.delete(docNum);
338             cnt++;
339           }
340         } catch (Exception JavaDoc e) {
341           LOG.warning(dr + " - failed to delete doc #" + docNum);
342         }
343         start = docNum + 1;
344       }
345       LOG.info(dr + " - deleted " + cnt + " document(s).");
346     }
347     // close checkers
348
if (checkers != null) {
349       for (int i = 0; i < checkers.length; i++) {
350         checkers[i].close();
351       }
352     }
353     try {
354       reader.close();
355     } catch (IOException JavaDoc e) {
356       LOG.warning(dr + "Exception when closing reader(s): " + e.getMessage());
357     }
358   }
359   
360   public static void main(String JavaDoc[] args) throws Exception JavaDoc {
361     if (args.length == 0) {
362       usage();
363       LOG.severe("Missing arguments");
364       return;
365     }
366     File JavaDoc idx = new File JavaDoc(args[0]);
367     if (!idx.isDirectory()) {
368       usage();
369       LOG.severe("Not a directory: " + idx);
370       return;
371     }
372     Vector JavaDoc paths = new Vector JavaDoc();
373     if (IndexReader.indexExists(idx)) {
374       paths.add(idx);
375     } else {
376       // try and see if there are segments inside, with index dirs
377
File JavaDoc[] dirs = idx.listFiles(new FileFilter JavaDoc() {
378         public boolean accept(File JavaDoc f) {
379           return f.isDirectory();
380         }
381       });
382       if (dirs == null || dirs.length == 0) {
383         usage();
384         LOG.severe("No indexes in " + idx);
385         return;
386       }
387       for (int i = 0; i < dirs.length; i++) {
388         File JavaDoc sidx = new File JavaDoc(dirs[i], "index");
389         if (sidx.exists() && sidx.isDirectory() && IndexReader.indexExists(sidx)) {
390           paths.add(sidx);
391         }
392       }
393       if (paths.size() == 0) {
394         usage();
395         LOG.severe("No indexes in " + idx + " or its subdirs.");
396         return;
397       }
398     }
399     File JavaDoc[] indexes = (File JavaDoc[])paths.toArray(new File JavaDoc[0]);
400     boolean force = false;
401     boolean dryrun = false;
402     String JavaDoc qPath = null;
403     String JavaDoc outPath = null;
404     String JavaDoc fList = null;
405     for (int i = 1; i < args.length; i++) {
406       if (args[i].equals("-force")) {
407         force = true;
408       } else if (args[i].equals("-queries")) {
409         qPath = args[++i];
410       } else if (args[i].equals("-output")) {
411         outPath = args[++i];
412       } else if (args[i].equals("-showfields")) {
413         fList = args[++i];
414       } else if (args[i].equals("-dryrun")) {
415         dryrun = true;
416       } else {
417         usage();
418         LOG.severe("Unrecognized option: " + args[i]);
419         return;
420       }
421     }
422     Vector JavaDoc cv = new Vector JavaDoc();
423     if (fList != null) {
424       StringTokenizer JavaDoc st = new StringTokenizer JavaDoc(fList, ",");
425       Vector JavaDoc tokens = new Vector JavaDoc();
426       while (st.hasMoreTokens()) tokens.add(st.nextToken());
427       String JavaDoc[] fields = (String JavaDoc[])tokens.toArray(new String JavaDoc[0]);
428       PruneChecker pc = new PrintFieldsChecker(System.out, fields);
429       cv.add(pc);
430     }
431     
432     if (outPath != null) {
433       StoreUrlsChecker luc = new StoreUrlsChecker(new File JavaDoc(outPath), false);
434       cv.add(luc);
435     }
436
437     PruneChecker[] checkers = null;
438     if (cv.size() > 0) {
439       checkers = (PruneChecker[])cv.toArray(new PruneChecker[0]);
440     }
441     Query[] queries = null;
442     InputStream JavaDoc is = null;
443     if (qPath != null) {
444       is = new FileInputStream JavaDoc(qPath);
445     } else {
446       qPath = NutchConf.get("prune.index.tool.queries");
447       is = NutchConf.getConfResourceAsInputStream(qPath);
448     }
449     if (is == null) {
450       LOG.severe("Can't load queries from " + qPath);
451       return;
452     }
453     try {
454       queries = parseQueries(is);
455     } catch (Exception JavaDoc e) {
456       LOG.severe("Error parsing queries: " + e.getMessage());
457       return;
458     }
459     try {
460       PruneIndexTool pit = new PruneIndexTool(indexes, queries, checkers, true, dryrun);
461       pit.run();
462     } catch (Exception JavaDoc e) {
463       LOG.severe("Error running PruneIndexTool: " + e.getMessage());
464       return;
465     }
466   }
467   
468   /**
469    * Read a list of Lucene queries from the stream (UTF-8 encoding is assumed).
470    * There should be a single Lucene query per line. Blank lines and comments
471    * starting with '#' are allowed.
472    * <p>NOTE: you may wish to use {@link net.nutch.searcher.Query#main(String[])}
473    * method to translate queries from Nutch format to Lucene format.</p>
474    * @param is InputStream to read from
475    * @return array of Lucene queries
476    * @throws Exception
477    */

478   public static Query[] parseQueries(InputStream JavaDoc is) throws Exception JavaDoc {
479     BufferedReader JavaDoc br = new BufferedReader JavaDoc(new InputStreamReader JavaDoc(is, "UTF-8"));
480     String JavaDoc line = null;
481     QueryParser qp = new QueryParser("url", new WhitespaceAnalyzer());
482     Vector JavaDoc queries = new Vector JavaDoc();
483     while ((line = br.readLine()) != null) {
484       line = line.trim();
485       //skip blanks and comments
486
if (line.length() == 0 || line.charAt(0) == '#') continue;
487       Query q = qp.parse(line);
488       queries.add(q);
489     }
490     return (Query[])queries.toArray(new Query[0]);
491   }
492   
493   private static void usage() {
494     System.err.println("PruneIndexTool <indexDir | segmentsDir> [-dryrun] [-force] [-queries filename] [-output filename] [-showfields field1,field2,field3...]");
495     System.err.println("\tNOTE: exactly one of <indexDir> or <segmentsDir> MUST be provided!\n");
496     System.err.println("\t-dryrun\t\t\tdon't do anything, just show what would be done.");
497     System.err.println("\t-force\t\t\tforce index unlock, if locked. Use with caution!");
498     System.err.println("\t-queries filename\tread pruning queries from this file, instead of the");
499     System.err.println("\t\t\t\tdefault defined in Nutch config files under 'prune.index.tool.queries' key.\n");
500     System.err.println("\t-output filename\tstore pruned URLs in a text file");
501     System.err.println("\t-showfields field1,field2...\tfor each deleted document show the values of the selected fields.");
502     System.err.println("\t\t\t\tNOTE 1: this will slow down processing by orders of magnitude.");
503     System.err.println("\t\t\t\tNOTE 2: only values of stored fields will be shown.");
504   }
505 }
506
Popular Tags