KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > index > DocumentWriter


1 package org.apache.lucene.index;
2
3 /**
4  * Copyright 2004 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.io.StringReader;

import java.util.Arrays;
import java.util.Enumeration;
import java.util.Hashtable;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
35
36 final class DocumentWriter {
37   private Analyzer analyzer;
38   private Directory directory;
39   private Similarity similarity;
40   private FieldInfos fieldInfos;
41   private int maxFieldLength;
42   private int termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
43   private PrintStream JavaDoc infoStream;
44
45   /** This ctor used by test code only.
46    *
47    * @param directory The directory to write the document information to
48    * @param analyzer The analyzer to use for the document
49    * @param similarity The Similarity function
50    * @param maxFieldLength The maximum number of tokens a field may have
51    */

52   DocumentWriter(Directory directory, Analyzer analyzer,
53                  Similarity similarity, int maxFieldLength) {
54     this.directory = directory;
55     this.analyzer = analyzer;
56     this.similarity = similarity;
57     this.maxFieldLength = maxFieldLength;
58   }
59
60   DocumentWriter(Directory directory, Analyzer analyzer, IndexWriter writer) {
61     this.directory = directory;
62     this.analyzer = analyzer;
63     this.similarity = writer.getSimilarity();
64     this.maxFieldLength = writer.getMaxFieldLength();
65     this.termIndexInterval = writer.getTermIndexInterval();
66   }
67
68   final void addDocument(String JavaDoc segment, Document doc)
69           throws IOException JavaDoc {
70     // write field names
71
fieldInfos = new FieldInfos();
72     fieldInfos.add(doc);
73     fieldInfos.write(directory, segment + ".fnm");
74
75     // write field values
76
FieldsWriter fieldsWriter =
77             new FieldsWriter(directory, segment, fieldInfos);
78     try {
79       fieldsWriter.addDocument(doc);
80     } finally {
81       fieldsWriter.close();
82     }
83
84     // invert doc into postingTable
85
postingTable.clear(); // clear postingTable
86
fieldLengths = new int[fieldInfos.size()]; // init fieldLengths
87
fieldPositions = new int[fieldInfos.size()]; // init fieldPositions
88
fieldOffsets = new int[fieldInfos.size()]; // init fieldOffsets
89

90     fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts
91
Arrays.fill(fieldBoosts, doc.getBoost());
92
93     invertDocument(doc);
94
95     // sort postingTable into an array
96
Posting[] postings = sortPostingTable();
97
98     /*
99     for (int i = 0; i < postings.length; i++) {
100       Posting posting = postings[i];
101       System.out.print(posting.term);
102       System.out.print(" freq=" + posting.freq);
103       System.out.print(" pos=");
104       System.out.print(posting.positions[0]);
105       for (int j = 1; j < posting.freq; j++)
106     System.out.print("," + posting.positions[j]);
107       System.out.println("");
108     }
109     */

110
111     // write postings
112
writePostings(postings, segment);
113
114     // write norms of indexed fields
115
writeNorms(segment);
116
117   }
118
119   // Keys are Terms, values are Postings.
120
// Used to buffer a document before it is written to the index.
121
private final Hashtable JavaDoc postingTable = new Hashtable JavaDoc();
122   private int[] fieldLengths;
123   private int[] fieldPositions;
124   private int[] fieldOffsets;
125   private float[] fieldBoosts;
126
127   // Tokenizes the fields of a document into Postings.
128
private final void invertDocument(Document doc)
129           throws IOException JavaDoc {
130     Enumeration JavaDoc fields = doc.fields();
131     while (fields.hasMoreElements()) {
132       Field field = (Field) fields.nextElement();
133       String JavaDoc fieldName = field.name();
134       int fieldNumber = fieldInfos.fieldNumber(fieldName);
135
136       int length = fieldLengths[fieldNumber]; // length of field
137
int position = fieldPositions[fieldNumber]; // position in field
138
if (length>0) position+=analyzer.getPositionIncrementGap(fieldName);
139       int offset = fieldOffsets[fieldNumber]; // offset field
140

141       if (field.isIndexed()) {
142         if (!field.isTokenized()) { // un-tokenized field
143
String JavaDoc stringValue = field.stringValue();
144           if(field.isStoreOffsetWithTermVector())
145             addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
146           else
147             addPosition(fieldName, stringValue, position++, null);
148           offset += stringValue.length();
149           length++;
150         } else
151         {
152           Reader JavaDoc reader; // find or make Reader
153
if (field.readerValue() != null)
154             reader = field.readerValue();
155           else if (field.stringValue() != null)
156             reader = new StringReader JavaDoc(field.stringValue());
157           else
158             throw new IllegalArgumentException JavaDoc
159                     ("field must have either String or Reader value");
160
161           // Tokenize field and add to postingTable
162
TokenStream stream = analyzer.tokenStream(fieldName, reader);
163           try {
164             Token lastToken = null;
165             for (Token t = stream.next(); t != null; t = stream.next()) {
166               position += (t.getPositionIncrement() - 1);
167               
168               if(field.isStoreOffsetWithTermVector())
169                 addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()));
170               else
171                 addPosition(fieldName, t.termText(), position++, null);
172               
173               lastToken = t;
174               if (++length > maxFieldLength) {
175                 if (infoStream != null)
176                   infoStream.println("maxFieldLength " +maxFieldLength+ " reached, ignoring following tokens");
177                 break;
178               }
179             }
180             
181             if(lastToken != null)
182               offset += lastToken.endOffset() + 1;
183             
184           } finally {
185             stream.close();
186           }
187         }
188
189         fieldLengths[fieldNumber] = length; // save field length
190
fieldPositions[fieldNumber] = position; // save field position
191
fieldBoosts[fieldNumber] *= field.getBoost();
192         fieldOffsets[fieldNumber] = offset;
193       }
194     }
195   }
196
197   private final Term termBuffer = new Term("", ""); // avoid consing
198

199   private final void addPosition(String JavaDoc field, String JavaDoc text, int position, TermVectorOffsetInfo offset) {
200     termBuffer.set(field, text);
201     //System.out.println("Offset: " + offset);
202
Posting ti = (Posting) postingTable.get(termBuffer);
203     if (ti != null) { // word seen before
204
int freq = ti.freq;
205       if (ti.positions.length == freq) { // positions array is full
206
int[] newPositions = new int[freq * 2]; // double size
207
int[] positions = ti.positions;
208         for (int i = 0; i < freq; i++) // copy old positions to new
209
newPositions[i] = positions[i];
210         ti.positions = newPositions;
211       }
212       ti.positions[freq] = position; // add new position
213

214       if (offset != null) {
215         if (ti.offsets.length == freq){
216           TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2];
217           TermVectorOffsetInfo [] offsets = ti.offsets;
218           for (int i = 0; i < freq; i++)
219           {
220             newOffsets[i] = offsets[i];
221           }
222           ti.offsets = newOffsets;
223         }
224         ti.offsets[freq] = offset;
225       }
226       ti.freq = freq + 1; // update frequency
227
} else { // word not seen before
228
Term term = new Term(field, text, false);
229       postingTable.put(term, new Posting(term, position, offset));
230     }
231   }
232
233   private final Posting[] sortPostingTable() {
234     // copy postingTable into an array
235
Posting[] array = new Posting[postingTable.size()];
236     Enumeration JavaDoc postings = postingTable.elements();
237     for (int i = 0; postings.hasMoreElements(); i++)
238       array[i] = (Posting) postings.nextElement();
239
240     // sort the array
241
quickSort(array, 0, array.length - 1);
242
243     return array;
244   }
245
246   private static final void quickSort(Posting[] postings, int lo, int hi) {
247     if (lo >= hi)
248       return;
249
250     int mid = (lo + hi) / 2;
251
252     if (postings[lo].term.compareTo(postings[mid].term) > 0) {
253       Posting tmp = postings[lo];
254       postings[lo] = postings[mid];
255       postings[mid] = tmp;
256     }
257
258     if (postings[mid].term.compareTo(postings[hi].term) > 0) {
259       Posting tmp = postings[mid];
260       postings[mid] = postings[hi];
261       postings[hi] = tmp;
262
263       if (postings[lo].term.compareTo(postings[mid].term) > 0) {
264         Posting tmp2 = postings[lo];
265         postings[lo] = postings[mid];
266         postings[mid] = tmp2;
267       }
268     }
269
270     int left = lo + 1;
271     int right = hi - 1;
272
273     if (left >= right)
274       return;
275
276     Term partition = postings[mid].term;
277
278     for (; ;) {
279       while (postings[right].term.compareTo(partition) > 0)
280         --right;
281
282       while (left < right && postings[left].term.compareTo(partition) <= 0)
283         ++left;
284
285       if (left < right) {
286         Posting tmp = postings[left];
287         postings[left] = postings[right];
288         postings[right] = tmp;
289         --right;
290       } else {
291         break;
292       }
293     }
294
295     quickSort(postings, lo, left);
296     quickSort(postings, left + 1, hi);
297   }
298
299   private final void writePostings(Posting[] postings, String JavaDoc segment)
300           throws IOException JavaDoc {
301     IndexOutput freq = null, prox = null;
302     TermInfosWriter tis = null;
303     TermVectorsWriter termVectorWriter = null;
304     try {
305       //open files for inverse index storage
306
freq = directory.createOutput(segment + ".frq");
307       prox = directory.createOutput(segment + ".prx");
308       tis = new TermInfosWriter(directory, segment, fieldInfos,
309                                 termIndexInterval);
310       TermInfo ti = new TermInfo();
311       String JavaDoc currentField = null;
312
313       for (int i = 0; i < postings.length; i++) {
314         Posting posting = postings[i];
315
316         // add an entry to the dictionary with pointers to prox and freq files
317
ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1);
318         tis.add(posting.term, ti);
319
320         // add an entry to the freq file
321
int postingFreq = posting.freq;
322         if (postingFreq == 1) // optimize freq=1
323
freq.writeVInt(1); // set low bit of doc num.
324
else {
325           freq.writeVInt(0); // the document number
326
freq.writeVInt(postingFreq); // frequency in doc
327
}
328
329         int lastPosition = 0; // write positions
330
int[] positions = posting.positions;
331         for (int j = 0; j < postingFreq; j++) { // use delta-encoding
332
int position = positions[j];
333           prox.writeVInt(position - lastPosition);
334           lastPosition = position;
335         }
336         // check to see if we switched to a new field
337
String JavaDoc termField = posting.term.field();
338         if (currentField != termField) {
339           // changing field - see if there is something to save
340
currentField = termField;
341           FieldInfo fi = fieldInfos.fieldInfo(currentField);
342           if (fi.storeTermVector) {
343             if (termVectorWriter == null) {
344               termVectorWriter =
345                 new TermVectorsWriter(directory, segment, fieldInfos);
346               termVectorWriter.openDocument();
347             }
348             termVectorWriter.openField(currentField);
349
350           } else if (termVectorWriter != null) {
351             termVectorWriter.closeField();
352           }
353         }
354         if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
355             termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
356         }
357       }
358       if (termVectorWriter != null)
359         termVectorWriter.closeDocument();
360     } finally {
361       // make an effort to close all streams we can but remember and re-throw
362
// the first exception encountered in this process
363
IOException JavaDoc keep = null;
364       if (freq != null) try { freq.close(); } catch (IOException JavaDoc e) { if (keep == null) keep = e; }
365       if (prox != null) try { prox.close(); } catch (IOException JavaDoc e) { if (keep == null) keep = e; }
366       if (tis != null) try { tis.close(); } catch (IOException JavaDoc e) { if (keep == null) keep = e; }
367       if (termVectorWriter != null) try { termVectorWriter.close(); } catch (IOException JavaDoc e) { if (keep == null) keep = e; }
368       if (keep != null) throw (IOException JavaDoc) keep.fillInStackTrace();
369     }
370   }
371
372   private final void writeNorms(String JavaDoc segment) throws IOException JavaDoc {
373     for(int n = 0; n < fieldInfos.size(); n++){
374       FieldInfo fi = fieldInfos.fieldInfo(n);
375       if(fi.isIndexed && !fi.omitNorms){
376         float norm = fieldBoosts[n] * similarity.lengthNorm(fi.name, fieldLengths[n]);
377         IndexOutput norms = directory.createOutput(segment + ".f" + n);
378         try {
379           norms.writeByte(Similarity.encodeNorm(norm));
380         } finally {
381           norms.close();
382         }
383       }
384     }
385   }
386   
387   /** If non-null, a message will be printed to this if maxFieldLength is reached.
388    */

389   void setInfoStream(PrintStream JavaDoc infoStream) {
390     this.infoStream = infoStream;
391   }
392
393 }
394
395 final class Posting { // info about a Term in a doc
396
Term term; // the Term
397
int freq; // its frequency in doc
398
int[] positions; // positions it occurs at
399
TermVectorOffsetInfo [] offsets;
400
401   Posting(Term t, int position, TermVectorOffsetInfo offset) {
402     term = t;
403     freq = 1;
404     positions = new int[1];
405     positions[0] = position;
406     if(offset != null){
407     offsets = new TermVectorOffsetInfo[1];
408     offsets[0] = offset;
409     }
410     else
411       offsets = null;
412   }
413 }
414
Popular Tags