KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > index > TermVectorsReader


1 package org.apache.lucene.index;
2
3 /**
4  * Copyright 2004 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
19 import org.apache.lucene.store.Directory;
20 import org.apache.lucene.store.IndexInput;
21
22 import java.io.IOException JavaDoc;
23
24 /**
25  * @version $Id: TermVectorsReader.java 170226 2005-05-15 15:04:39Z bmesser $
26  */

27 class TermVectorsReader implements Cloneable JavaDoc {
28   private FieldInfos fieldInfos;
29
30   private IndexInput tvx;
31   private IndexInput tvd;
32   private IndexInput tvf;
33   private int size;
34   
35   private int tvdFormat;
36   private int tvfFormat;
37
38   TermVectorsReader(Directory d, String JavaDoc segment, FieldInfos fieldInfos)
39     throws IOException JavaDoc {
40     if (d.fileExists(segment + TermVectorsWriter.TVX_EXTENSION)) {
41       tvx = d.openInput(segment + TermVectorsWriter.TVX_EXTENSION);
42       checkValidFormat(tvx);
43       tvd = d.openInput(segment + TermVectorsWriter.TVD_EXTENSION);
44       tvdFormat = checkValidFormat(tvd);
45       tvf = d.openInput(segment + TermVectorsWriter.TVF_EXTENSION);
46       tvfFormat = checkValidFormat(tvf);
47       size = (int) tvx.length() / 8;
48     }
49
50     this.fieldInfos = fieldInfos;
51   }
52   
53   private int checkValidFormat(IndexInput in) throws IOException JavaDoc
54   {
55     int format = in.readInt();
56     if (format > TermVectorsWriter.FORMAT_VERSION)
57     {
58       throw new IOException JavaDoc("Incompatible format version: " + format + " expected "
59               + TermVectorsWriter.FORMAT_VERSION + " or less");
60     }
61     return format;
62   }
63
64   void close() throws IOException JavaDoc {
65     // make all effort to close up. Keep the first exception
66
// and throw it as a new one.
67
IOException JavaDoc keep = null;
68     if (tvx != null) try { tvx.close(); } catch (IOException JavaDoc e) { if (keep == null) keep = e; }
69     if (tvd != null) try { tvd.close(); } catch (IOException JavaDoc e) { if (keep == null) keep = e; }
70     if (tvf != null) try { tvf.close(); } catch (IOException JavaDoc e) { if (keep == null) keep = e; }
71     if (keep != null) throw (IOException JavaDoc) keep.fillInStackTrace();
72   }
73
74   /**
75    *
76    * @return The number of documents in the reader
77    */

78   int size() {
79     return size;
80   }
81
82   /**
83    * Retrieve the term vector for the given document and field
84    * @param docNum The document number to retrieve the vector for
85    * @param field The field within the document to retrieve
86    * @return The TermFreqVector for the document and field or null if there is no termVector for this field.
87    * @throws IOException if there is an error reading the term vector files
88    */

89   TermFreqVector get(int docNum, String JavaDoc field) throws IOException JavaDoc {
90     // Check if no term vectors are available for this segment at all
91
int fieldNumber = fieldInfos.fieldNumber(field);
92     TermFreqVector result = null;
93     if (tvx != null) {
94       //We need to account for the FORMAT_SIZE at when seeking in the tvx
95
//We don't need to do this in other seeks because we already have the
96
// file pointer
97
//that was written in another file
98
tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
99       //System.out.println("TVX Pointer: " + tvx.getFilePointer());
100
long position = tvx.readLong();
101
102       tvd.seek(position);
103       int fieldCount = tvd.readVInt();
104       //System.out.println("Num Fields: " + fieldCount);
105
// There are only a few fields per document. We opt for a full scan
106
// rather then requiring that they be ordered. We need to read through
107
// all of the fields anyway to get to the tvf pointers.
108
int number = 0;
109       int found = -1;
110       for (int i = 0; i < fieldCount; i++) {
111         if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
112           number = tvd.readVInt();
113         else
114           number += tvd.readVInt();
115         
116         if (number == fieldNumber)
117           found = i;
118       }
119
120       // This field, although valid in the segment, was not found in this
121
// document
122
if (found != -1) {
123         // Compute position in the tvf file
124
position = 0;
125         for (int i = 0; i <= found; i++)
126           position += tvd.readVLong();
127
128         result = readTermVector(field, position);
129       } else {
130         //System.out.println("Field not found");
131
}
132     } else {
133       //System.out.println("No tvx file");
134
}
135     return result;
136   }
137
138   /**
139    * Return all term vectors stored for this document or null if the could not be read in.
140    *
141    * @param docNum The document number to retrieve the vector for
142    * @return All term frequency vectors
143    * @throws IOException if there is an error reading the term vector files
144    */

145   TermFreqVector[] get(int docNum) throws IOException JavaDoc {
146     TermFreqVector[] result = null;
147     // Check if no term vectors are available for this segment at all
148
if (tvx != null) {
149       //We need to offset by
150
tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
151       long position = tvx.readLong();
152
153       tvd.seek(position);
154       int fieldCount = tvd.readVInt();
155
156       // No fields are vectorized for this document
157
if (fieldCount != 0) {
158         int number = 0;
159         String JavaDoc[] fields = new String JavaDoc[fieldCount];
160         
161         for (int i = 0; i < fieldCount; i++) {
162           if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
163             number = tvd.readVInt();
164           else
165             number += tvd.readVInt();
166
167           fields[i] = fieldInfos.fieldName(number);
168         }
169
170         // Compute position in the tvf file
171
position = 0;
172         long[] tvfPointers = new long[fieldCount];
173         for (int i = 0; i < fieldCount; i++) {
174           position += tvd.readVLong();
175           tvfPointers[i] = position;
176         }
177
178         result = readTermVectors(fields, tvfPointers);
179       }
180     } else {
181       //System.out.println("No tvx file");
182
}
183     return result;
184   }
185
186
187   private SegmentTermVector[] readTermVectors(String JavaDoc fields[], long tvfPointers[])
188           throws IOException JavaDoc {
189     SegmentTermVector res[] = new SegmentTermVector[fields.length];
190     for (int i = 0; i < fields.length; i++) {
191       res[i] = readTermVector(fields[i], tvfPointers[i]);
192     }
193     return res;
194   }
195
196   /**
197    *
198    * @param field The field to read in
199    * @param tvfPointer The pointer within the tvf file where we should start reading
200    * @return The TermVector located at that position
201    * @throws IOException
202    */

203   private SegmentTermVector readTermVector(String JavaDoc field, long tvfPointer)
204           throws IOException JavaDoc {
205
206     // Now read the data from specified position
207
//We don't need to offset by the FORMAT here since the pointer already includes the offset
208
tvf.seek(tvfPointer);
209
210     int numTerms = tvf.readVInt();
211     //System.out.println("Num Terms: " + numTerms);
212
// If no terms - return a constant empty termvector. However, this should never occur!
213
if (numTerms == 0)
214       return new SegmentTermVector(field, null, null);
215     
216     boolean storePositions;
217     boolean storeOffsets;
218     
219     if(tvfFormat == TermVectorsWriter.FORMAT_VERSION){
220       byte bits = tvf.readByte();
221       storePositions = (bits & TermVectorsWriter.STORE_POSITIONS_WITH_TERMVECTOR) != 0;
222       storeOffsets = (bits & TermVectorsWriter.STORE_OFFSET_WITH_TERMVECTOR) != 0;
223     }
224     else{
225       tvf.readVInt();
226       storePositions = false;
227       storeOffsets = false;
228     }
229
230     String JavaDoc terms[] = new String JavaDoc[numTerms];
231     int termFreqs[] = new int[numTerms];
232     
233     // we may not need these, but declare them
234
int positions[][] = null;
235     TermVectorOffsetInfo offsets[][] = null;
236     if(storePositions)
237       positions = new int[numTerms][];
238     if(storeOffsets)
239       offsets = new TermVectorOffsetInfo[numTerms][];
240     
241     int start = 0;
242     int deltaLength = 0;
243     int totalLength = 0;
244     char [] buffer = new char[10]; // init the buffer with a length of 10 character
245
char[] previousBuffer = {};
246     
247     for (int i = 0; i < numTerms; i++) {
248       start = tvf.readVInt();
249       deltaLength = tvf.readVInt();
250       totalLength = start + deltaLength;
251       if (buffer.length < totalLength) { // increase buffer
252
buffer = null; // give a hint to garbage collector
253
buffer = new char[totalLength];
254         
255         if (start > 0) // just copy if necessary
256
System.arraycopy(previousBuffer, 0, buffer, 0, start);
257       }
258       
259       tvf.readChars(buffer, start, deltaLength);
260       terms[i] = new String JavaDoc(buffer, 0, totalLength);
261       previousBuffer = buffer;
262       int freq = tvf.readVInt();
263       termFreqs[i] = freq;
264       
265       if (storePositions) { //read in the positions
266
int [] pos = new int[freq];
267         positions[i] = pos;
268         int prevPosition = 0;
269         for (int j = 0; j < freq; j++)
270         {
271           pos[j] = prevPosition + tvf.readVInt();
272           prevPosition = pos[j];
273         }
274       }
275       
276       if (storeOffsets) {
277         TermVectorOffsetInfo[] offs = new TermVectorOffsetInfo[freq];
278         offsets[i] = offs;
279         int prevOffset = 0;
280         for (int j = 0; j < freq; j++) {
281           int startOffset = prevOffset + tvf.readVInt();
282           int endOffset = startOffset + tvf.readVInt();
283           offs[j] = new TermVectorOffsetInfo(startOffset, endOffset);
284           prevOffset = endOffset;
285         }
286       }
287     }
288     
289     SegmentTermVector tv;
290     if (storePositions || storeOffsets){
291       tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
292     }
293     else {
294       tv = new SegmentTermVector(field, terms, termFreqs);
295     }
296     return tv;
297   }
298
299   protected Object JavaDoc clone() {
300     
301     if (tvx == null || tvd == null || tvf == null)
302       return null;
303     
304     TermVectorsReader clone = null;
305     try {
306       clone = (TermVectorsReader) super.clone();
307     } catch (CloneNotSupportedException JavaDoc e) {}
308
309     clone.tvx = (IndexInput) tvx.clone();
310     clone.tvd = (IndexInput) tvd.clone();
311     clone.tvf = (IndexInput) tvf.clone();
312     
313     return clone;
314   }
315 }
316
Popular Tags