Your browser does not support JavaScript and this site utilizes JavaScript to build content and provide links to additional information. You should either enable JavaScript in your browser settings or use a browser that supports JavaScript in order to take full advantage of this site.
1 2 3 4 package org.creativecommons.nutch; 5 6 import net.nutch.io.*; 7 import net.nutch.util.LogFormatter; 8 import net.nutch.indexer.IndexSegment; 9 10 import org.apache.lucene.index.IndexReader; 11 import org.apache.lucene.document.Document; 12 13 import java.io.*; 14 import java.util.Vector ; 15 import java.util.logging.Logger ; 16 17 19 public class CCDeleteUnlicensedTool { 20 private static final Logger LOG = 21 LogFormatter.getLogger("org.creativecommons.nutch.CCDeleteUnlicensedTool"); 22 23 private IndexReader[] readers; 24 25 26 public CCDeleteUnlicensedTool(IndexReader[] readers) { 27 this.readers = readers; 28 } 29 30 31 public void close() throws IOException { 32 for (int i = 0; i < readers.length; i++) 33 readers[i].close(); 34 } 35 36 37 public int deleteUnlicensed() throws IOException { 38 int deleteCount = 0; 39 for (int index = 0; index < readers.length; index++) { 40 IndexReader reader = readers[index]; 41 int readerMax = reader.maxDoc(); 42 for (int doc = 0; doc < readerMax; doc++) { 43 if (!reader.isDeleted(doc)) { 44 Document document = reader.document(doc); 45 if (document.get(CCIndexingFilter.FIELD)==null){ reader.delete(doc); deleteCount++; 48 } 49 } 50 } 51 } 52 return deleteCount; 53 } 54 55 56 public static void main(String [] args) throws Exception { 57 String usage = "CCDeleteUnlicensedTool <segmentsDir>"; 58 59 if (args.length != 1) { 60 System.err.println("Usage: " + usage); 61 return; 62 } 63 64 String segmentsDir = args[0]; 65 66 File[] directories = new File(segmentsDir).listFiles(); 67 Vector vReaders=new Vector (); 68 int maxDoc = 0; 69 for (int i = 0; i < directories.length; i++) { 70 File indexDone = new File(directories[i], IndexSegment.DONE_NAME); 71 if (indexDone.exists() && indexDone.isFile()){ 72 File indexDir = new File(directories[i], "index"); 73 IndexReader reader = IndexReader.open(indexDir); 74 maxDoc += reader.maxDoc(); 75 vReaders.add(reader); 76 } 77 } 78 79 IndexReader[] readers=new IndexReader[vReaders.size()]; 80 for(int i = 0; vReaders.size()>0; i++) { 81 readers[i]=(IndexReader)vReaders.remove(0); 82 } 83 84 CCDeleteUnlicensedTool dd = new CCDeleteUnlicensedTool(readers); 85 int count = dd.deleteUnlicensed(); 86 LOG.info("CC: deleted "+count+" out of "+maxDoc); 87 dd.close(); 88 } 89 } 90
| Popular Tags
|