package org.apache.lucene.index;
2
/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.Vector;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
/**
 * The SegmentMerger class combines two or more Segments, represented by an
 * IndexReader ({@link #add}), into a single Segment.  After adding the
 * appropriate readers, call the merge method to combine the segments.
 * <P>
 * If the compoundFile flag is set, then the segments will be merged into a
 * compound file.
 *
 * @see #merge
 * @see #add
 */

39 final class SegmentMerger {
40   private Directory directory;
41   private String JavaDoc segment;
42   private int termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
43
44   private Vector JavaDoc readers = new Vector JavaDoc();
45   private FieldInfos fieldInfos;
46
47   /** This ctor used only by test code.
48    *
49    * @param dir The Directory to merge the other segments into
50    * @param name The name of the new segment
51    */

52   SegmentMerger(Directory dir, String JavaDoc name) {
53     directory = dir;
54     segment = name;
55   }
56
57   SegmentMerger(IndexWriter writer, String JavaDoc name) {
58     directory = writer.getDirectory();
59     segment = name;
60     termIndexInterval = writer.getTermIndexInterval();
61   }
62
63   /**
64    * Add an IndexReader to the collection of readers that are to be merged
65    * @param reader
66    */

67   final void add(IndexReader reader) {
68     readers.addElement(reader);
69   }
70
71   /**
72    *
73    * @param i The index of the reader to return
74    * @return The ith reader to be merged
75    */

76   final IndexReader segmentReader(int i) {
77     return (IndexReader) readers.elementAt(i);
78   }
79
80   /**
81    * Merges the readers specified by the {@link #add} method into the directory passed to the constructor
82    * @return The number of documents that were merged
83    * @throws IOException
84    */

85   final int merge() throws IOException JavaDoc {
86     int value;
87     
88     value = mergeFields();
89     mergeTerms();
90     mergeNorms();
91
92     if (fieldInfos.hasVectors())
93       mergeVectors();
94
95     return value;
96   }
97   
98   /**
99    * close all IndexReaders that have been added.
100    * Should not be called before merge().
101    * @throws IOException
102    */

103   final void closeReaders() throws IOException JavaDoc {
104     for (int i = 0; i < readers.size(); i++) { // close readers
105
IndexReader reader = (IndexReader) readers.elementAt(i);
106       reader.close();
107     }
108   }
109
110   final Vector JavaDoc createCompoundFile(String JavaDoc fileName)
111           throws IOException JavaDoc {
112     CompoundFileWriter cfsWriter =
113             new CompoundFileWriter(directory, fileName);
114
115     Vector JavaDoc files =
116       new Vector JavaDoc(IndexFileNames.COMPOUND_EXTENSIONS.length + fieldInfos.size());
117     
118     // Basic files
119
for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.length; i++) {
120       files.add(segment + "." + IndexFileNames.COMPOUND_EXTENSIONS[i]);
121     }
122
123     // Field norm files
124
for (int i = 0; i < fieldInfos.size(); i++) {
125       FieldInfo fi = fieldInfos.fieldInfo(i);
126       if (fi.isIndexed && !fi.omitNorms) {
127         files.add(segment + ".f" + i);
128       }
129     }
130
131     // Vector files
132
if (fieldInfos.hasVectors()) {
133       for (int i = 0; i < IndexFileNames.VECTOR_EXTENSIONS.length; i++) {
134         files.add(segment + "." + IndexFileNames.VECTOR_EXTENSIONS[i]);
135       }
136     }
137
138     // Now merge all added files
139
Iterator JavaDoc it = files.iterator();
140     while (it.hasNext()) {
141       cfsWriter.addFile((String JavaDoc) it.next());
142     }
143     
144     // Perform the merge
145
cfsWriter.close();
146    
147     return files;
148   }
149
150   private void addIndexed(IndexReader reader, FieldInfos fieldInfos, Collection JavaDoc names, boolean storeTermVectors, boolean storePositionWithTermVector,
151                          boolean storeOffsetWithTermVector) throws IOException JavaDoc {
152     Iterator JavaDoc i = names.iterator();
153     while (i.hasNext()) {
154       String JavaDoc field = (String JavaDoc)i.next();
155       fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field));
156     }
157   }
158
159   /**
160    *
161    * @return The number of documents in all of the readers
162    * @throws IOException
163    */

164   private final int mergeFields() throws IOException JavaDoc {
165     fieldInfos = new FieldInfos(); // merge field names
166
int docCount = 0;
167     for (int i = 0; i < readers.size(); i++) {
168       IndexReader reader = (IndexReader) readers.elementAt(i);
169       addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true);
170       addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false);
171       addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true);
172       addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false);
173       addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false);
174       fieldInfos.add(reader.getFieldNames(IndexReader.FieldOption.UNINDEXED), false);
175     }
176     fieldInfos.write(directory, segment + ".fnm");
177
178     FieldsWriter fieldsWriter = // merge field values
179
new FieldsWriter(directory, segment, fieldInfos);
180     try {
181       for (int i = 0; i < readers.size(); i++) {
182         IndexReader reader = (IndexReader) readers.elementAt(i);
183         int maxDoc = reader.maxDoc();
184         for (int j = 0; j < maxDoc; j++)
185           if (!reader.isDeleted(j)) { // skip deleted docs
186
fieldsWriter.addDocument(reader.document(j));
187             docCount++;
188           }
189       }
190     } finally {
191       fieldsWriter.close();
192     }
193     return docCount;
194   }
195
196   /**
197    * Merge the TermVectors from each of the segments into the new one.
198    * @throws IOException
199    */

200   private final void mergeVectors() throws IOException JavaDoc {
201     TermVectorsWriter termVectorsWriter =
202       new TermVectorsWriter(directory, segment, fieldInfos);
203
204     try {
205       for (int r = 0; r < readers.size(); r++) {
206         IndexReader reader = (IndexReader) readers.elementAt(r);
207         int maxDoc = reader.maxDoc();
208         for (int docNum = 0; docNum < maxDoc; docNum++) {
209           // skip deleted docs
210
if (reader.isDeleted(docNum))
211             continue;
212           termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(docNum));
213         }
214       }
215     } finally {
216       termVectorsWriter.close();
217     }
218   }
219
220   private IndexOutput freqOutput = null;
221   private IndexOutput proxOutput = null;
222   private TermInfosWriter termInfosWriter = null;
223   private int skipInterval;
224   private SegmentMergeQueue queue = null;
225
226   private final void mergeTerms() throws IOException JavaDoc {
227     try {
228       freqOutput = directory.createOutput(segment + ".frq");
229       proxOutput = directory.createOutput(segment + ".prx");
230       termInfosWriter =
231               new TermInfosWriter(directory, segment, fieldInfos,
232                                   termIndexInterval);
233       skipInterval = termInfosWriter.skipInterval;
234       queue = new SegmentMergeQueue(readers.size());
235
236       mergeTermInfos();
237
238     } finally {
239       if (freqOutput != null) freqOutput.close();
240       if (proxOutput != null) proxOutput.close();
241       if (termInfosWriter != null) termInfosWriter.close();
242       if (queue != null) queue.close();
243     }
244   }
245
246   private final void mergeTermInfos() throws IOException JavaDoc {
247     int base = 0;
248     for (int i = 0; i < readers.size(); i++) {
249       IndexReader reader = (IndexReader) readers.elementAt(i);
250       TermEnum termEnum = reader.terms();
251       SegmentMergeInfo smi = new SegmentMergeInfo(base, termEnum, reader);
252       base += reader.numDocs();
253       if (smi.next())
254         queue.put(smi); // initialize queue
255
else
256         smi.close();
257     }
258
259     SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()];
260
261     while (queue.size() > 0) {
262       int matchSize = 0; // pop matching terms
263
match[matchSize++] = (SegmentMergeInfo) queue.pop();
264       Term term = match[0].term;
265       SegmentMergeInfo top = (SegmentMergeInfo) queue.top();
266
267       while (top != null && term.compareTo(top.term) == 0) {
268         match[matchSize++] = (SegmentMergeInfo) queue.pop();
269         top = (SegmentMergeInfo) queue.top();
270       }
271
272       mergeTermInfo(match, matchSize); // add new TermInfo
273

274       while (matchSize > 0) {
275         SegmentMergeInfo smi = match[--matchSize];
276         if (smi.next())
277           queue.put(smi); // restore queue
278
else
279           smi.close(); // done with a segment
280
}
281     }
282   }
283
284   private final TermInfo termInfo = new TermInfo(); // minimize consing
285

286   /** Merge one term found in one or more segments. The array <code>smis</code>
287    * contains segments that are positioned at the same term. <code>N</code>
288    * is the number of cells in the array actually occupied.
289    *
290    * @param smis array of segments
291    * @param n number of cells in the array actually occupied
292    */

293   private final void mergeTermInfo(SegmentMergeInfo[] smis, int n)
294           throws IOException JavaDoc {
295     long freqPointer = freqOutput.getFilePointer();
296     long proxPointer = proxOutput.getFilePointer();
297
298     int df = appendPostings(smis, n); // append posting data
299

300     long skipPointer = writeSkip();
301
302     if (df > 0) {
303       // add an entry to the dictionary with pointers to prox and freq files
304
termInfo.set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
305       termInfosWriter.add(smis[0].term, termInfo);
306     }
307   }
308
309   /** Process postings from multiple segments all positioned on the
310    * same term. Writes out merged entries into freqOutput and
311    * the proxOutput streams.
312    *
313    * @param smis array of segments
314    * @param n number of cells in the array actually occupied
315    * @return number of documents across all segments where this term was found
316    */

317   private final int appendPostings(SegmentMergeInfo[] smis, int n)
318           throws IOException JavaDoc {
319     int lastDoc = 0;
320     int df = 0; // number of docs w/ term
321
resetSkip();
322     for (int i = 0; i < n; i++) {
323       SegmentMergeInfo smi = smis[i];
324       TermPositions postings = smi.getPositions();
325       int base = smi.base;
326       int[] docMap = smi.getDocMap();
327       postings.seek(smi.termEnum);
328       while (postings.next()) {
329         int doc = postings.doc();
330         if (docMap != null)
331           doc = docMap[doc]; // map around deletions
332
doc += base; // convert to merged space
333

334         if (doc < lastDoc)
335           throw new IllegalStateException JavaDoc("docs out of order");
336
337         df++;
338
339         if ((df % skipInterval) == 0) {
340           bufferSkip(lastDoc);
341         }
342
343         int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
344
lastDoc = doc;
345
346         int freq = postings.freq();
347         if (freq == 1) {
348           freqOutput.writeVInt(docCode | 1); // write doc & freq=1
349
} else {
350           freqOutput.writeVInt(docCode); // write doc
351
freqOutput.writeVInt(freq); // write frequency in doc
352
}
353
354         int lastPosition = 0; // write position deltas
355
for (int j = 0; j < freq; j++) {
356           int position = postings.nextPosition();
357           proxOutput.writeVInt(position - lastPosition);
358           lastPosition = position;
359         }
360       }
361     }
362     return df;
363   }
364
365   private RAMOutputStream skipBuffer = new RAMOutputStream();
366   private int lastSkipDoc;
367   private long lastSkipFreqPointer;
368   private long lastSkipProxPointer;
369
370   private void resetSkip() {
371     skipBuffer.reset();
372     lastSkipDoc = 0;
373     lastSkipFreqPointer = freqOutput.getFilePointer();
374     lastSkipProxPointer = proxOutput.getFilePointer();
375   }
376
377   private void bufferSkip(int doc) throws IOException JavaDoc {
378     long freqPointer = freqOutput.getFilePointer();
379     long proxPointer = proxOutput.getFilePointer();
380
381     skipBuffer.writeVInt(doc - lastSkipDoc);
382     skipBuffer.writeVInt((int) (freqPointer - lastSkipFreqPointer));
383     skipBuffer.writeVInt((int) (proxPointer - lastSkipProxPointer));
384
385     lastSkipDoc = doc;
386     lastSkipFreqPointer = freqPointer;
387     lastSkipProxPointer = proxPointer;
388   }
389
390   private long writeSkip() throws IOException JavaDoc {
391     long skipPointer = freqOutput.getFilePointer();
392     skipBuffer.writeTo(freqOutput);
393     return skipPointer;
394   }
395
396   private void mergeNorms() throws IOException JavaDoc {
397     for (int i = 0; i < fieldInfos.size(); i++) {
398       FieldInfo fi = fieldInfos.fieldInfo(i);
399       if (fi.isIndexed && !fi.omitNorms) {
400         IndexOutput output = directory.createOutput(segment + ".f" + i);
401         try {
402           for (int j = 0; j < readers.size(); j++) {
403             IndexReader reader = (IndexReader) readers.elementAt(j);
404             int maxDoc = reader.maxDoc();
405             byte[] input = new byte[maxDoc];
406             reader.norms(fi.name, input, 0);
407             for (int k = 0; k < maxDoc; k++) {
408               if (!reader.isDeleted(k)) {
409                 output.writeByte(input[k]);
410               }
411             }
412           }
413         } finally {
414           output.close();
415         }
416       }
417     }
418   }
419
420 }