1 2 3 4 package net.nutch.indexer; 5 6 import java.util.*; 7 import java.io.*; 8 9 import org.apache.lucene.util.*; 10 import org.apache.lucene.document.*; 11 import org.apache.lucene.index.*; 12 import org.apache.lucene.search.*; 13 14 15 public class IndexOptimizer { 16 public static final String DONE_NAME = "optimize.done"; 17 18 private static final float IDF_THRESHOLD = 6.0f; 19 private static final float FRACTION = 0.1f; 20 21 private static class FilterTermDocs implements TermDocs { 22 protected TermDocs in; 23 24 public FilterTermDocs(TermDocs in) { this.in = in; } 25 26 public void seek(Term term) throws IOException { in.seek(term); } 27 public void seek(TermEnum e) throws IOException { in.seek(e); } 28 public int doc() { return in.doc(); } 29 public int freq() { return in.freq(); } 30 public boolean next() throws IOException { return in.next(); } 31 public int read(int[] docs, int[] freqs) throws IOException { 32 return in.read(docs, freqs); 33 } 34 public boolean skipTo(int i) throws IOException { return in.skipTo(i); } 35 public void close() throws IOException { in.close(); } 36 } 37 38 private static class FilterTermPositions 39 extends FilterTermDocs implements TermPositions { 40 41 public FilterTermPositions(TermPositions in) { super(in); } 42 43 public int nextPosition() throws IOException { 44 return ((TermPositions)in).nextPosition(); 45 } 46 } 47 48 private static class FilterTermEnum extends TermEnum { 49 protected TermEnum in; 50 51 public FilterTermEnum(TermEnum in) { this.in = in; } 52 53 public boolean next() throws IOException { return in.next(); } 54 public Term term() { return in.term(); } 55 public int docFreq() { return in.docFreq(); } 56 public void close() throws IOException { in.close(); } 57 } 58 59 private static class OptimizingTermEnum extends FilterTermEnum { 60 private IndexReader reader; 61 private Similarity similarity; 62 63 public OptimizingTermEnum(IndexReader reader, Similarity similarity) 64 throws IOException { 65 super(reader.terms()); 66 this.reader = reader; 67 this.similarity = similarity; 68 } 69 70 public boolean next() throws IOException { 71 while (in.next()) { 72 float idf = similarity.idf(in.docFreq(), reader.maxDoc()); 73 74 if (idf <= IDF_THRESHOLD) 75 return true; 76 } 77 return false; 78 } 79 } 80 81 private static class ScoreDocQueue extends PriorityQueue { 82 ScoreDocQueue(int size) { 83 initialize(size); 84 } 85 86 protected final boolean lessThan(Object a, Object b) { 87 ScoreDoc hitA = (ScoreDoc)a; 88 ScoreDoc hitB = (ScoreDoc)b; 89 if (hitA.score == hitB.score) 90 return hitA.doc > hitB.doc; 91 else 92 return hitA.score < hitB.score; 93 } 94 } 95 96 private static class OptimizingTermPositions extends FilterTermPositions { 97 private IndexReader reader; 98 private TermDocs termDocs; 99 private int docFreq; 100 private ScoreDocQueue sdq; 101 private BitSet docs; 102 private Similarity similarity; 103 104 public OptimizingTermPositions(IndexReader reader, Similarity similarity) 105 throws IOException { 106 super(reader.termPositions()); 107 this.reader = reader; 108 this.termDocs = reader.termDocs(); 109 this.similarity = similarity; 110 this.sdq = new ScoreDocQueue((int)Math.ceil(reader.maxDoc() * FRACTION)); 111 this.docs = new BitSet(reader.maxDoc()); 112 } 113 114 public void seek(TermEnum e) throws IOException { 115 super.seek(e); 116 termDocs.seek(e); 117 118 byte[] norms = reader.norms(e.term().field()); 119 120 sdq.clear(); 121 float minScore = 0.0f; 122 int count = (int)Math.ceil(e.docFreq() * FRACTION); 123 System.out.println("Optimizing " + e.term() 124 + " from " + e.docFreq() 125 + " to " + count); 126 while (termDocs.next()) { 127 int doc = termDocs.doc(); 128 float score = 129 similarity.tf(termDocs.freq()) * similarity.decodeNorm(norms[doc]); 130 131 if (score > minScore) { 132 sdq.put(new ScoreDoc(doc, score)); 133 if (sdq.size() > count) { sdq.pop(); minScore = ((ScoreDoc)sdq.top()).score; } 137 } 138 } 139 140 docs.clear(); 141 while (sdq.size() != 0) { 142 docs.set(((ScoreDoc)sdq.pop()).doc); 143 } 144 145 } 146 147 public boolean next() throws IOException { 148 while (in.next()) { 149 if (docs.get(in.doc())) 150 return true; 151 } 152 return false; 153 } 154 155 } 156 157 private static class OptimizingReader extends FilterIndexReader { 158 private Similarity similarity = new NutchSimilarity(); 159 160 161 public OptimizingReader(IndexReader reader) { 162 super(reader); 163 } 164 165 public int numDocs() { return 0; } 167 public int maxDoc() { return 0; } 168 169 public TermEnum terms() throws IOException { 171 return new OptimizingTermEnum(in, similarity); 172 } 173 174 public TermPositions termPositions() throws IOException { 176 return new OptimizingTermPositions(in, similarity); 177 } 178 179 public boolean hasDeletions() { return false; } 180 } 181 182 183 private File directory; 184 185 public IndexOptimizer(File directory) { 186 this.directory = directory; 187 } 188 189 public void optimize() throws IOException { 190 IndexReader reader = IndexReader.open(new File(directory, "index")); 191 OptimizingReader optimizer = new OptimizingReader(reader); 192 IndexWriter writer = new IndexWriter(new File(directory, "index-opt"), 193 null, true); 194 writer.addIndexes(new IndexReader[] { optimizer }); 195 } 196 197 198 public static void main(String [] args) throws Exception { 199 File directory; 200 201 String usage = "IndexOptimizer directory"; 202 203 if (args.length < 1) { 204 System.err.println("Usage: " + usage); 205 return; 206 } 207 208 directory = new File(args[0]); 209 210 IndexOptimizer optimizer = new IndexOptimizer(directory); 211 212 Date start = new Date(); 213 214 optimizer.optimize(); 215 216 Date end = new Date(); 217 218 System.out.print(end.getTime() - start.getTime()); 219 System.out.println(" total milliseconds"); 220 } 221 222 } 223 | Popular Tags |