KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > indexer > IndexOptimizer


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.indexer;
5
6 import java.util.*;
7 import java.io.*;
8
9 import org.apache.lucene.util.*;
10 import org.apache.lucene.document.*;
11 import org.apache.lucene.index.*;
12 import org.apache.lucene.search.*;
13
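/**
 * Writes a pruned copy of a Lucene index. The source index is read from the
 * "index" subdirectory of the given directory and the result is written to
 * "index-opt". Only terms whose idf is at most IDF_THRESHOLD are copied, and
 * for each such term only the top FRACTION of its postings, ranked by
 * tf * norm, are kept. Per-document data is not copied.
 */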
15 public class IndexOptimizer {
16   public static final String JavaDoc DONE_NAME = "optimize.done";
17
18   private static final float IDF_THRESHOLD = 6.0f;
19   private static final float FRACTION = 0.1f;
20
21   private static class FilterTermDocs implements TermDocs {
22     protected TermDocs in;
23
24     public FilterTermDocs(TermDocs in) { this.in = in; }
25
26     public void seek(Term term) throws IOException { in.seek(term); }
27     public void seek(TermEnum e) throws IOException { in.seek(e); }
28     public int doc() { return in.doc(); }
29     public int freq() { return in.freq(); }
30     public boolean next() throws IOException { return in.next(); }
31     public int read(int[] docs, int[] freqs) throws IOException {
32       return in.read(docs, freqs);
33     }
34     public boolean skipTo(int i) throws IOException { return in.skipTo(i); }
35     public void close() throws IOException { in.close(); }
36   }
37
38   private static class FilterTermPositions
39      extends FilterTermDocs implements TermPositions {
40
41     public FilterTermPositions(TermPositions in) { super(in); }
42
43     public int nextPosition() throws IOException {
44       return ((TermPositions)in).nextPosition();
45     }
46   }
47
48   private static class FilterTermEnum extends TermEnum {
49     protected TermEnum in;
50
51     public FilterTermEnum(TermEnum in) { this.in = in; }
52
53     public boolean next() throws IOException { return in.next(); }
54     public Term term() { return in.term(); }
55     public int docFreq() { return in.docFreq(); }
56     public void close() throws IOException { in.close(); }
57   }
58
59   private static class OptimizingTermEnum extends FilterTermEnum {
60     private IndexReader reader;
61     private Similarity similarity;
62
63     public OptimizingTermEnum(IndexReader reader, Similarity similarity)
64       throws IOException {
65       super(reader.terms());
66       this.reader = reader;
67       this.similarity = similarity;
68     }
69
70     public boolean next() throws IOException {
71       while (in.next()) {
72         float idf = similarity.idf(in.docFreq(), reader.maxDoc());
73
74         if (idf <= IDF_THRESHOLD)
75           return true;
76       }
77       return false;
78     }
79   }
80     
81   private static class ScoreDocQueue extends PriorityQueue {
82     ScoreDocQueue(int size) {
83       initialize(size);
84     }
85     
86     protected final boolean lessThan(Object JavaDoc a, Object JavaDoc b) {
87       ScoreDoc hitA = (ScoreDoc)a;
88       ScoreDoc hitB = (ScoreDoc)b;
89       if (hitA.score == hitB.score)
90         return hitA.doc > hitB.doc;
91       else
92         return hitA.score < hitB.score;
93     }
94   }
95
96   private static class OptimizingTermPositions extends FilterTermPositions {
97     private IndexReader reader;
98     private TermDocs termDocs;
99     private int docFreq;
100     private ScoreDocQueue sdq;
101     private BitSet docs;
102     private Similarity similarity;
103
104     public OptimizingTermPositions(IndexReader reader, Similarity similarity)
105       throws IOException {
106       super(reader.termPositions());
107       this.reader = reader;
108       this.termDocs = reader.termDocs();
109       this.similarity = similarity;
110       this.sdq = new ScoreDocQueue((int)Math.ceil(reader.maxDoc() * FRACTION));
111       this.docs = new BitSet(reader.maxDoc());
112     }
113
114     public void seek(TermEnum e) throws IOException {
115       super.seek(e);
116       termDocs.seek(e);
117
118       byte[] norms = reader.norms(e.term().field());
119
120       sdq.clear();
121       float minScore = 0.0f;
122       int count = (int)Math.ceil(e.docFreq() * FRACTION);
123       System.out.println("Optimizing " + e.term()
124                          + " from " + e.docFreq()
125                          + " to " + count);
126       while (termDocs.next()) {
127         int doc = termDocs.doc();
128         float score =
129           similarity.tf(termDocs.freq()) * similarity.decodeNorm(norms[doc]);
130
131         if (score > minScore) {
132           sdq.put(new ScoreDoc(doc, score));
133           if (sdq.size() > count) { // if sdq overfull
134
sdq.pop(); // remove lowest in sdq
135
minScore = ((ScoreDoc)sdq.top()).score; // reset minScore
136
}
137         }
138       }
139
140       docs.clear();
141       while (sdq.size() != 0) {
142         docs.set(((ScoreDoc)sdq.pop()).doc);
143       }
144
145     }
146         
147     public boolean next() throws IOException {
148       while (in.next()) {
149         if (docs.get(in.doc()))
150           return true;
151       }
152       return false;
153     }
154       
155   }
156
157   private static class OptimizingReader extends FilterIndexReader {
158     private Similarity similarity = new NutchSimilarity();
159
160     
161     public OptimizingReader(IndexReader reader) {
162       super(reader);
163     }
164
165     // don't copy any per-document data
166
public int numDocs() { return 0; }
167     public int maxDoc() { return 0; }
168
169     // filter out low frequency terms
170
public TermEnum terms() throws IOException {
171       return new OptimizingTermEnum(in, similarity);
172     }
173
174     // filter out low-scoring postings
175
public TermPositions termPositions() throws IOException {
176       return new OptimizingTermPositions(in, similarity);
177     }
178
179     public boolean hasDeletions() { return false; }
180   }
181
182
183   private File directory;
184
185   public IndexOptimizer(File directory) {
186     this.directory = directory;
187   }
188
189   public void optimize() throws IOException {
190     IndexReader reader = IndexReader.open(new File(directory, "index"));
191     OptimizingReader optimizer = new OptimizingReader(reader);
192     IndexWriter writer = new IndexWriter(new File(directory, "index-opt"),
193                                          null, true);
194     writer.addIndexes(new IndexReader[] { optimizer });
195   }
196
197   /** */
198   public static void main(String JavaDoc[] args) throws Exception JavaDoc {
199     File directory;
200       
201     String JavaDoc usage = "IndexOptimizer directory";
202
203     if (args.length < 1) {
204       System.err.println("Usage: " + usage);
205       return;
206     }
207
208     directory = new File(args[0]);
209
210     IndexOptimizer optimizer = new IndexOptimizer(directory);
211
212     Date start = new Date();
213
214     optimizer.optimize();
215
216     Date end = new Date();
217
218     System.out.print(end.getTime() - start.getTime());
219     System.out.println(" total milliseconds");
220   }
221
222 }
223
Popular Tags