KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > creativecommons > nutch > CCDeleteUnlicensedTool


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package org.creativecommons.nutch;
5
6 import net.nutch.io.*;
7 import net.nutch.util.LogFormatter;
8 import net.nutch.indexer.IndexSegment;
9
10 import org.apache.lucene.index.IndexReader;
11 import org.apache.lucene.document.Document;
12
13 import java.io.*;
14 import java.util.Vector JavaDoc;
15 import java.util.logging.Logger JavaDoc;
16
17 /** Deletes documents in a set of Lucene indexes that do not have a Creative
18  * Commons license. */

19 public class CCDeleteUnlicensedTool {
20   private static final Logger JavaDoc LOG =
21     LogFormatter.getLogger("org.creativecommons.nutch.CCDeleteUnlicensedTool");
22
23   private IndexReader[] readers;
24
25   /** Constructs a duplicate detector for the provided indexes. */
26   public CCDeleteUnlicensedTool(IndexReader[] readers) {
27     this.readers = readers;
28   }
29
30   /** Closes the indexes, saving changes. */
31   public void close() throws IOException {
32     for (int i = 0; i < readers.length; i++)
33       readers[i].close();
34   }
35
36   /** Delete pages without CC licenes. */
37   public int deleteUnlicensed() throws IOException {
38     int deleteCount = 0;
39     for (int index = 0; index < readers.length; index++) {
40       IndexReader reader = readers[index];
41       int readerMax = reader.maxDoc();
42       for (int doc = 0; doc < readerMax; doc++) {
43         if (!reader.isDeleted(doc)) {
44           Document document = reader.document(doc);
45           if (document.get(CCIndexingFilter.FIELD)==null){ // no CC fields
46
reader.delete(doc); // delete it
47
deleteCount++;
48           }
49         }
50       }
51     }
52     return deleteCount;
53   }
54
55   /** Delete duplicates in the indexes in the named directory. */
56   public static void main(String JavaDoc[] args) throws Exception JavaDoc {
57     String JavaDoc usage = "CCDeleteUnlicensedTool <segmentsDir>";
58
59     if (args.length != 1) {
60       System.err.println("Usage: " + usage);
61       return;
62     }
63
64     String JavaDoc segmentsDir = args[0];
65
66     File[] directories = new File(segmentsDir).listFiles();
67     Vector JavaDoc vReaders=new Vector JavaDoc();
68     int maxDoc = 0;
69     for (int i = 0; i < directories.length; i++) {
70       File indexDone = new File(directories[i], IndexSegment.DONE_NAME);
71       if (indexDone.exists() && indexDone.isFile()){
72         File indexDir = new File(directories[i], "index");
73         IndexReader reader = IndexReader.open(indexDir);
74         maxDoc += reader.maxDoc();
75         vReaders.add(reader);
76       }
77     }
78
79     IndexReader[] readers=new IndexReader[vReaders.size()];
80     for(int i = 0; vReaders.size()>0; i++) {
81       readers[i]=(IndexReader)vReaders.remove(0);
82     }
83
84     CCDeleteUnlicensedTool dd = new CCDeleteUnlicensedTool(readers);
85     int count = dd.deleteUnlicensed();
86     LOG.info("CC: deleted "+count+" out of "+maxDoc);
87     dd.close();
88   }
89 }
90
Popular Tags