KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > index > TermVectorsWriter


1 package org.apache.lucene.index;
2
3 /**
4  * Copyright 2004 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
19 import org.apache.lucene.store.Directory;
20 import org.apache.lucene.store.IndexOutput;
21 import org.apache.lucene.util.StringHelper;
22
23 import java.io.IOException JavaDoc;
24 import java.util.Vector JavaDoc;
25
/**
 * Writer works by opening a document, then opening each field within the document, and then
 * writing out the term vectors for that field.
 *
 * Rough usage:
 *
 * <pre>
 * for each document
 * {
 *   writer.openDocument();
 *   for each field on the document
 *   {
 *     writer.openField(field);
 *     for all of the terms
 *     {
 *       writer.addTerm(...)
 *     }
 *     writer.closeField()
 *   }
 *   writer.closeDocument()
 * }
 * </pre>
 *
 * @version $Id: TermVectorsWriter.java 150689 2004-11-29 21:42:02Z bmesser $
 *
 */

52 final class TermVectorsWriter {
53   static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
54   static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
55   
56   static final int FORMAT_VERSION = 2;
57   //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
58
static final int FORMAT_SIZE = 4;
59   
60   static final String JavaDoc TVX_EXTENSION = ".tvx";
61   static final String JavaDoc TVD_EXTENSION = ".tvd";
62   static final String JavaDoc TVF_EXTENSION = ".tvf";
63   
64   private IndexOutput tvx = null, tvd = null, tvf = null;
65   private Vector JavaDoc fields = null;
66   private Vector JavaDoc terms = null;
67   private FieldInfos fieldInfos;
68
69   private TVField currentField = null;
70   private long currentDocPointer = -1;
71
/**
 * Creates the three term vector files for a segment and writes the format version at the
 * start of each one.
 *
 * If anything fails before construction completes, every stream that was already opened
 * is closed again; otherwise those streams would leak, because the caller never receives
 * an instance on which it could call {@link #close()}.
 *
 * @param directory the directory in which the output files are created
 * @param segment the segment name; the file extensions are appended to it
 * @param fieldInfos used to look up field numbers and term vector options by field name
 * @throws IOException if a file cannot be created or its header cannot be written
 */
public TermVectorsWriter(Directory directory, String segment,
                         FieldInfos fieldInfos)
    throws IOException {
  boolean success = false;
  try {
    // Open files for TermVector storage
    tvx = directory.createOutput(segment + TVX_EXTENSION);
    tvx.writeInt(FORMAT_VERSION);
    tvd = directory.createOutput(segment + TVD_EXTENSION);
    tvd.writeInt(FORMAT_VERSION);
    tvf = directory.createOutput(segment + TVF_EXTENSION);
    tvf.writeInt(FORMAT_VERSION);

    this.fieldInfos = fieldInfos;
    fields = new Vector(fieldInfos.size());
    terms = new Vector();
    success = true;
  } finally {
    if (!success) {
      // Release whatever was opened; the exception that got us here keeps propagating.
      closeQuietly(tvx);
      closeQuietly(tvd);
      closeQuietly(tvf);
    }
  }
}

/**
 * Closes a stream during failed construction. A failure to close is deliberately ignored
 * so that it cannot replace the exception that caused construction to fail.
 */
private static void closeQuietly(IndexOutput out) {
  if (out == null) {
    return;
  }
  try {
    out.close();
  } catch (IOException ignored) {
    // intentionally ignored, see method comment
  }
}
87
88
89   public final void openDocument()
90           throws IOException JavaDoc {
91     closeDocument();
92     currentDocPointer = tvd.getFilePointer();
93   }
94
95
96   public final void closeDocument()
97           throws IOException JavaDoc {
98     if (isDocumentOpen()) {
99       closeField();
100       writeDoc();
101       fields.clear();
102       currentDocPointer = -1;
103     }
104   }
105
106
107   public final boolean isDocumentOpen() {
108     return currentDocPointer != -1;
109   }
110
111
112   /** Start processing a field. This can be followed by a number of calls to
113    * addTerm, and a final call to closeField to indicate the end of
114    * processing of this field. If a field was previously open, it is
115    * closed automatically.
116    */

117   public final void openField(String JavaDoc field) throws IOException JavaDoc {
118     FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
119     openField(fieldInfo.number, fieldInfo.storePositionWithTermVector, fieldInfo.storeOffsetWithTermVector);
120   }
121   
122   private void openField(int fieldNumber, boolean storePositionWithTermVector,
123       boolean storeOffsetWithTermVector) throws IOException JavaDoc{
124     if (!isDocumentOpen())
125       throw new IllegalStateException JavaDoc("Cannot open field when no document is open.");
126     closeField();
127     currentField = new TVField(fieldNumber, storePositionWithTermVector, storeOffsetWithTermVector);
128   }
129
130   /** Finished processing current field. This should be followed by a call to
131    * openField before future calls to addTerm.
132    */

133   public final void closeField()
134           throws IOException JavaDoc {
135     if (isFieldOpen()) {
136       /* DEBUG */
137       //System.out.println("closeField()");
138
/* DEBUG */
139
140       // save field and terms
141
writeField();
142       fields.add(currentField);
143       terms.clear();
144       currentField = null;
145     }
146   }
147
148   /** Return true if a field is currently open. */
149   public final boolean isFieldOpen() {
150     return currentField != null;
151   }
152
153   /** Add term to the field's term vector. Field must already be open.
154    * Terms should be added in
155    * increasing order of terms, one call per unique termNum. ProxPointer
156    * is a pointer into the TermPosition file (prx). Freq is the number of
157    * times this term appears in this field, in this document.
158    * @throws IllegalStateException if document or field is not open
159    */

160   public final void addTerm(String JavaDoc termText, int freq) {
161     addTerm(termText, freq, null, null);
162   }
163   
164   public final void addTerm(String JavaDoc termText, int freq, int [] positions, TermVectorOffsetInfo [] offsets)
165   {
166     if (!isDocumentOpen())
167       throw new IllegalStateException JavaDoc("Cannot add terms when document is not open");
168     if (!isFieldOpen())
169       throw new IllegalStateException JavaDoc("Cannot add terms when field is not open");
170     
171     addTermInternal(termText, freq, positions, offsets);
172   }
173
174   private final void addTermInternal(String JavaDoc termText, int freq, int [] positions, TermVectorOffsetInfo [] offsets) {
175     TVTerm term = new TVTerm();
176     term.termText = termText;
177     term.freq = freq;
178     term.positions = positions;
179     term.offsets = offsets;
180     terms.add(term);
181   }
182
183   /**
184    * Add a complete document specified by all its term vectors. If document has no
185    * term vectors, add value for tvx.
186    *
187    * @param vectors
188    * @throws IOException
189    */

190   public final void addAllDocVectors(TermFreqVector[] vectors)
191       throws IOException JavaDoc {
192     openDocument();
193
194     if (vectors != null) {
195       for (int i = 0; i < vectors.length; i++) {
196         boolean storePositionWithTermVector = false;
197         boolean storeOffsetWithTermVector = false;
198
199         try {
200
201           TermPositionVector tpVector = (TermPositionVector) vectors[i];
202
203           if (tpVector.size() > 0 && tpVector.getTermPositions(0) != null)
204             storePositionWithTermVector = true;
205           if (tpVector.size() > 0 && tpVector.getOffsets(0) != null)
206             storeOffsetWithTermVector = true;
207
208           FieldInfo fieldInfo = fieldInfos.fieldInfo(tpVector.getField());
209           openField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);
210
211           for (int j = 0; j < tpVector.size(); j++)
212             addTermInternal(tpVector.getTerms()[j], tpVector.getTermFrequencies()[j], tpVector.getTermPositions(j),
213                 tpVector.getOffsets(j));
214
215           closeField();
216
217         } catch (ClassCastException JavaDoc ignore) {
218
219           TermFreqVector tfVector = vectors[i];
220
221           FieldInfo fieldInfo = fieldInfos.fieldInfo(tfVector.getField());
222           openField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);
223
224           for (int j = 0; j < tfVector.size(); j++)
225             addTermInternal(tfVector.getTerms()[j], tfVector.getTermFrequencies()[j], null, null);
226
227           closeField();
228
229         }
230       }
231     }
232
233     closeDocument();
234   }
235   
236   /** Close all streams. */
237   final void close() throws IOException JavaDoc {
238     try {
239       closeDocument();
240     } finally {
241       // make an effort to close all streams we can but remember and re-throw
242
// the first exception encountered in this process
243
IOException JavaDoc keep = null;
244       if (tvx != null)
245         try {
246           tvx.close();
247         } catch (IOException JavaDoc e) {
248           if (keep == null) keep = e;
249         }
250       if (tvd != null)
251         try {
252           tvd.close();
253         } catch (IOException JavaDoc e) {
254           if (keep == null) keep = e;
255         }
256       if (tvf != null)
257         try {
258           tvf.close();
259         } catch (IOException JavaDoc e) {
260           if (keep == null) keep = e;
261         }
262       if (keep != null) throw (IOException JavaDoc) keep.fillInStackTrace();
263     }
264   }
265
266   
267
268   private void writeField() throws IOException JavaDoc {
269     // remember where this field is written
270
currentField.tvfPointer = tvf.getFilePointer();
271     //System.out.println("Field Pointer: " + currentField.tvfPointer);
272

273     final int size = terms.size();
274     tvf.writeVInt(size);
275     
276     boolean storePositions = currentField.storePositions;
277     boolean storeOffsets = currentField.storeOffsets;
278     byte bits = 0x0;
279     if (storePositions)
280       bits |= STORE_POSITIONS_WITH_TERMVECTOR;
281     if (storeOffsets)
282       bits |= STORE_OFFSET_WITH_TERMVECTOR;
283     tvf.writeByte(bits);
284     
285     String JavaDoc lastTermText = "";
286     for (int i = 0; i < size; i++) {
287       TVTerm term = (TVTerm) terms.elementAt(i);
288       int start = StringHelper.stringDifference(lastTermText, term.termText);
289       int length = term.termText.length() - start;
290       tvf.writeVInt(start); // write shared prefix length
291
tvf.writeVInt(length); // write delta length
292
tvf.writeChars(term.termText, start, length); // write delta chars
293
tvf.writeVInt(term.freq);
294       lastTermText = term.termText;
295       
296       if(storePositions){
297         if(term.positions == null)
298           throw new IllegalStateException JavaDoc("Trying to write positions that are null!");
299         
300         // use delta encoding for positions
301
int position = 0;
302         for (int j = 0; j < term.freq; j++){
303           tvf.writeVInt(term.positions[j] - position);
304           position = term.positions[j];
305         }
306       }
307       
308       if(storeOffsets){
309         if(term.offsets == null)
310           throw new IllegalStateException JavaDoc("Trying to write offsets that are null!");
311         
312         // use delta encoding for offsets
313
int position = 0;
314         for (int j = 0; j < term.freq; j++) {
315           tvf.writeVInt(term.offsets[j].getStartOffset() - position);
316           tvf.writeVInt(term.offsets[j].getEndOffset() - term.offsets[j].getStartOffset()); //Save the diff between the two.
317
position = term.offsets[j].getEndOffset();
318         }
319       }
320     }
321   }
322
323   private void writeDoc() throws IOException JavaDoc {
324     if (isFieldOpen())
325       throw new IllegalStateException JavaDoc("Field is still open while writing document");
326     //System.out.println("Writing doc pointer: " + currentDocPointer);
327
// write document index record
328
tvx.writeLong(currentDocPointer);
329
330     // write document data record
331
final int size = fields.size();
332
333     // write the number of fields
334
tvd.writeVInt(size);
335
336     // write field numbers
337
for (int i = 0; i < size; i++) {
338       TVField field = (TVField) fields.elementAt(i);
339       tvd.writeVInt(field.number);
340     }
341
342     // write field pointers
343
long lastFieldPointer = 0;
344     for (int i = 0; i < size; i++) {
345       TVField field = (TVField) fields.elementAt(i);
346       tvd.writeVLong(field.tvfPointer - lastFieldPointer);
347       lastFieldPointer = field.tvfPointer;
348     }
349     //System.out.println("After writing doc pointer: " + tvx.getFilePointer());
350
}
351
352
353   private static class TVField {
354     int number;
355     long tvfPointer = 0;
356     boolean storePositions = false;
357     boolean storeOffsets = false;
358     TVField(int number, boolean storePos, boolean storeOff) {
359       this.number = number;
360       storePositions = storePos;
361       storeOffsets = storeOff;
362     }
363   }
364
365   private static class TVTerm {
366     String JavaDoc termText;
367     int freq = 0;
368     int positions[] = null;
369     TermVectorOffsetInfo [] offsets = null;
370   }
371
372
373 }
374
Popular Tags