KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > index > SegmentReader


1 package org.apache.lucene.index;
2
3 /**
4  * Copyright 2004 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
19 import java.io.IOException JavaDoc;
20 import java.util.*;
21
22 import org.apache.lucene.document.Document;
23 import org.apache.lucene.document.Field;
24 import org.apache.lucene.store.IndexInput;
25 import org.apache.lucene.store.IndexOutput;
26 import org.apache.lucene.store.Directory;
27 import org.apache.lucene.util.BitVector;
28 import org.apache.lucene.search.DefaultSimilarity;
29
30 /**
31  * @version $Id: SegmentReader.java 329523 2005-10-30 05:37:11Z yonik $
32  */

33 class SegmentReader extends IndexReader {
34   private String JavaDoc segment;
35
36   FieldInfos fieldInfos;
37   private FieldsReader fieldsReader;
38
39   TermInfosReader tis;
40   TermVectorsReader termVectorsReaderOrig = null;
41   ThreadLocal JavaDoc termVectorsLocal = new ThreadLocal JavaDoc();
42
43   BitVector deletedDocs = null;
44   private boolean deletedDocsDirty = false;
45   private boolean normsDirty = false;
46   private boolean undeleteAll = false;
47
48   IndexInput freqStream;
49   IndexInput proxStream;
50
51   // Compound File Reader when based on a compound file segment
52
CompoundFileReader cfsReader = null;
53
54   private class Norm {
55     public Norm(IndexInput in, int number)
56     {
57       this.in = in;
58       this.number = number;
59     }
60
61     private IndexInput in;
62     private byte[] bytes;
63     private boolean dirty;
64     private int number;
65
66     private void reWrite() throws IOException JavaDoc {
67       // NOTE: norms are re-written in regular directory, not cfs
68
IndexOutput out = directory().createOutput(segment + ".tmp");
69       try {
70         out.writeBytes(bytes, maxDoc());
71       } finally {
72         out.close();
73       }
74       String JavaDoc fileName;
75       if(cfsReader == null)
76           fileName = segment + ".f" + number;
77       else{
78           // use a different file name if we have compound format
79
fileName = segment + ".s" + number;
80       }
81       directory().renameFile(segment + ".tmp", fileName);
82       this.dirty = false;
83     }
84   }
85
86   private Hashtable norms = new Hashtable();
87
88   /** The class which implements SegmentReader. */
89   private static Class JavaDoc IMPL;
90   static {
91     try {
92       String JavaDoc name =
93         System.getProperty("org.apache.lucene.SegmentReader.class",
94                            SegmentReader.class.getName());
95       IMPL = Class.forName(name);
96     } catch (ClassNotFoundException JavaDoc e) {
97       throw new RuntimeException JavaDoc("cannot load SegmentReader class: " + e);
98     } catch (SecurityException JavaDoc se) {
99       try {
100         IMPL = Class.forName(SegmentReader.class.getName());
101       } catch (ClassNotFoundException JavaDoc e) {
102         throw new RuntimeException JavaDoc("cannot load default SegmentReader class: " + e);
103       }
104     }
105   }
106
107   protected SegmentReader() { super(null); }
108
109   public static SegmentReader get(SegmentInfo si) throws IOException JavaDoc {
110     return get(si.dir, si, null, false, false);
111   }
112
113   public static SegmentReader get(SegmentInfos sis, SegmentInfo si,
114                                   boolean closeDir) throws IOException JavaDoc {
115     return get(si.dir, si, sis, closeDir, true);
116   }
117
118   public static SegmentReader get(Directory dir, SegmentInfo si,
119                                   SegmentInfos sis,
120                                   boolean closeDir, boolean ownDir)
121     throws IOException JavaDoc {
122     SegmentReader instance;
123     try {
124       instance = (SegmentReader)IMPL.newInstance();
125     } catch (Exception JavaDoc e) {
126       throw new RuntimeException JavaDoc("cannot load SegmentReader class: " + e);
127     }
128     instance.init(dir, sis, closeDir, ownDir);
129     instance.initialize(si);
130     return instance;
131   }
132
133    private void initialize(SegmentInfo si) throws IOException JavaDoc {
134     segment = si.name;
135
136     // Use compound file directory for some files, if it exists
137
Directory cfsDir = directory();
138     if (directory().fileExists(segment + ".cfs")) {
139       cfsReader = new CompoundFileReader(directory(), segment + ".cfs");
140       cfsDir = cfsReader;
141     }
142
143     // No compound file exists - use the multi-file format
144
fieldInfos = new FieldInfos(cfsDir, segment + ".fnm");
145     fieldsReader = new FieldsReader(cfsDir, segment, fieldInfos);
146
147     tis = new TermInfosReader(cfsDir, segment, fieldInfos);
148
149     // NOTE: the bitvector is stored using the regular directory, not cfs
150
if (hasDeletions(si))
151       deletedDocs = new BitVector(directory(), segment + ".del");
152
153     // make sure that all index files have been read or are kept open
154
// so that if an index update removes them we'll still have them
155
freqStream = cfsDir.openInput(segment + ".frq");
156     proxStream = cfsDir.openInput(segment + ".prx");
157     openNorms(cfsDir);
158
159     if (fieldInfos.hasVectors()) { // open term vector files only as needed
160
termVectorsReaderOrig = new TermVectorsReader(cfsDir, segment, fieldInfos);
161     }
162   }
163
164    protected void finalize() {
165      // patch for pre-1.4.2 JVMs, whose ThreadLocals leak
166
termVectorsLocal.set(null);
167      super.finalize();
168    }
169
170   protected void doCommit() throws IOException JavaDoc {
171     if (deletedDocsDirty) { // re-write deleted
172
deletedDocs.write(directory(), segment + ".tmp");
173       directory().renameFile(segment + ".tmp", segment + ".del");
174     }
175     if(undeleteAll && directory().fileExists(segment + ".del")){
176       directory().deleteFile(segment + ".del");
177     }
178     if (normsDirty) { // re-write norms
179
Enumeration values = norms.elements();
180       while (values.hasMoreElements()) {
181         Norm norm = (Norm) values.nextElement();
182         if (norm.dirty) {
183           norm.reWrite();
184         }
185       }
186     }
187     deletedDocsDirty = false;
188     normsDirty = false;
189     undeleteAll = false;
190   }
191
192   protected void doClose() throws IOException JavaDoc {
193     fieldsReader.close();
194     tis.close();
195
196     if (freqStream != null)
197       freqStream.close();
198     if (proxStream != null)
199       proxStream.close();
200
201     closeNorms();
202
203     if (termVectorsReaderOrig != null)
204       termVectorsReaderOrig.close();
205
206     if (cfsReader != null)
207       cfsReader.close();
208   }
209
210   static boolean hasDeletions(SegmentInfo si) throws IOException JavaDoc {
211     return si.dir.fileExists(si.name + ".del");
212   }
213
214   public boolean hasDeletions() {
215     return deletedDocs != null;
216   }
217
218
219   static boolean usesCompoundFile(SegmentInfo si) throws IOException JavaDoc {
220     return si.dir.fileExists(si.name + ".cfs");
221   }
222
223   static boolean hasSeparateNorms(SegmentInfo si) throws IOException JavaDoc {
224     String JavaDoc[] result = si.dir.list();
225     String JavaDoc pattern = si.name + ".s";
226     int patternLength = pattern.length();
227     for(int i = 0; i < result.length; i++){
228       if(result[i].startsWith(pattern) && Character.isDigit(result[i].charAt(patternLength)))
229         return true;
230     }
231     return false;
232   }
233
234   protected void doDelete(int docNum) {
235     if (deletedDocs == null)
236       deletedDocs = new BitVector(maxDoc());
237     deletedDocsDirty = true;
238     undeleteAll = false;
239     deletedDocs.set(docNum);
240   }
241
242   protected void doUndeleteAll() {
243       deletedDocs = null;
244       deletedDocsDirty = false;
245       undeleteAll = true;
246   }
247
248   Vector files() throws IOException JavaDoc {
249     Vector files = new Vector(16);
250
251     for (int i = 0; i < IndexFileNames.INDEX_EXTENSIONS.length; i++) {
252       String JavaDoc name = segment + "." + IndexFileNames.INDEX_EXTENSIONS[i];
253       if (directory().fileExists(name))
254         files.addElement(name);
255     }
256
257     for (int i = 0; i < fieldInfos.size(); i++) {
258       FieldInfo fi = fieldInfos.fieldInfo(i);
259       if (fi.isIndexed && !fi.omitNorms){
260         String JavaDoc name;
261         if(cfsReader == null)
262             name = segment + ".f" + i;
263         else
264             name = segment + ".s" + i;
265         if (directory().fileExists(name))
266             files.addElement(name);
267       }
268     }
269     return files;
270   }
271
272   public TermEnum terms() {
273     return tis.terms();
274   }
275
276   public TermEnum terms(Term t) throws IOException JavaDoc {
277     return tis.terms(t);
278   }
279
280   public synchronized Document document(int n) throws IOException JavaDoc {
281     if (isDeleted(n))
282       throw new IllegalArgumentException JavaDoc
283               ("attempt to access a deleted document");
284     return fieldsReader.doc(n);
285   }
286
287   public synchronized boolean isDeleted(int n) {
288     return (deletedDocs != null && deletedDocs.get(n));
289   }
290
291   public TermDocs termDocs() throws IOException JavaDoc {
292     return new SegmentTermDocs(this);
293   }
294
295   public TermPositions termPositions() throws IOException JavaDoc {
296     return new SegmentTermPositions(this);
297   }
298
299   public int docFreq(Term t) throws IOException JavaDoc {
300     TermInfo ti = tis.get(t);
301     if (ti != null)
302       return ti.docFreq;
303     else
304       return 0;
305   }
306
307   public int numDocs() {
308     int n = maxDoc();
309     if (deletedDocs != null)
310       n -= deletedDocs.count();
311     return n;
312   }
313
314   public int maxDoc() {
315     return fieldsReader.size();
316   }
317
318   /**
319    * @see IndexReader#getFieldNames()
320    * @deprecated Replaced by {@link #getFieldNames (IndexReader.FieldOption fldOption)}
321    */

322   public Collection getFieldNames() {
323     // maintain a unique set of field names
324
Set fieldSet = new HashSet();
325     for (int i = 0; i < fieldInfos.size(); i++) {
326       FieldInfo fi = fieldInfos.fieldInfo(i);
327       fieldSet.add(fi.name);
328     }
329     return fieldSet;
330   }
331
332   /**
333    * @see IndexReader#getFieldNames(boolean)
334    * @deprecated Replaced by {@link #getFieldNames (IndexReader.FieldOption fldOption)}
335    */

336   public Collection getFieldNames(boolean indexed) {
337     // maintain a unique set of field names
338
Set fieldSet = new HashSet();
339     for (int i = 0; i < fieldInfos.size(); i++) {
340       FieldInfo fi = fieldInfos.fieldInfo(i);
341       if (fi.isIndexed == indexed)
342         fieldSet.add(fi.name);
343     }
344     return fieldSet;
345   }
346
347   /**
348    * @see IndexReader#getIndexedFieldNames(Field.TermVector tvSpec)
349    * @deprecated Replaced by {@link #getFieldNames (IndexReader.FieldOption fldOption)}
350    */

351   public Collection getIndexedFieldNames (Field.TermVector tvSpec){
352     boolean storedTermVector;
353     boolean storePositionWithTermVector;
354     boolean storeOffsetWithTermVector;
355
356     if(tvSpec == Field.TermVector.NO){
357       storedTermVector = false;
358       storePositionWithTermVector = false;
359       storeOffsetWithTermVector = false;
360     }
361     else if(tvSpec == Field.TermVector.YES){
362       storedTermVector = true;
363       storePositionWithTermVector = false;
364       storeOffsetWithTermVector = false;
365     }
366     else if(tvSpec == Field.TermVector.WITH_POSITIONS){
367       storedTermVector = true;
368       storePositionWithTermVector = true;
369       storeOffsetWithTermVector = false;
370     }
371     else if(tvSpec == Field.TermVector.WITH_OFFSETS){
372       storedTermVector = true;
373       storePositionWithTermVector = false;
374       storeOffsetWithTermVector = true;
375     }
376     else if(tvSpec == Field.TermVector.WITH_POSITIONS_OFFSETS){
377       storedTermVector = true;
378       storePositionWithTermVector = true;
379       storeOffsetWithTermVector = true;
380     }
381     else{
382       throw new IllegalArgumentException JavaDoc("unknown termVector parameter " + tvSpec);
383     }
384
385     // maintain a unique set of field names
386
Set fieldSet = new HashSet();
387     for (int i = 0; i < fieldInfos.size(); i++) {
388       FieldInfo fi = fieldInfos.fieldInfo(i);
389       if (fi.isIndexed && fi.storeTermVector == storedTermVector &&
390           fi.storePositionWithTermVector == storePositionWithTermVector &&
391           fi.storeOffsetWithTermVector == storeOffsetWithTermVector){
392         fieldSet.add(fi.name);
393       }
394     }
395     return fieldSet;
396   }
397
398   /**
399    * @see IndexReader#getFieldNames(IndexReader.FieldOption fldOption)
400    */

401   public Collection getFieldNames(IndexReader.FieldOption fieldOption) {
402
403     Set fieldSet = new HashSet();
404     for (int i = 0; i < fieldInfos.size(); i++) {
405       FieldInfo fi = fieldInfos.fieldInfo(i);
406       if (fieldOption == IndexReader.FieldOption.ALL) {
407         fieldSet.add(fi.name);
408       }
409       else if (!fi.isIndexed && fieldOption == IndexReader.FieldOption.UNINDEXED) {
410         fieldSet.add(fi.name);
411       }
412       else if (fi.isIndexed && fieldOption == IndexReader.FieldOption.INDEXED) {
413         fieldSet.add(fi.name);
414       }
415       else if (fi.isIndexed && fi.storeTermVector == false && fieldOption == IndexReader.FieldOption.INDEXED_NO_TERMVECTOR) {
416         fieldSet.add(fi.name);
417       }
418       else if (fi.storeTermVector == true &&
419                fi.storePositionWithTermVector == false &&
420                fi.storeOffsetWithTermVector == false &&
421                fieldOption == IndexReader.FieldOption.TERMVECTOR) {
422         fieldSet.add(fi.name);
423       }
424       else if (fi.isIndexed && fi.storeTermVector && fieldOption == IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR) {
425         fieldSet.add(fi.name);
426       }
427       else if (fi.storePositionWithTermVector && fi.storeOffsetWithTermVector == false && fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_POSITION) {
428         fieldSet.add(fi.name);
429       }
430       else if (fi.storeOffsetWithTermVector && fi.storePositionWithTermVector == false && fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET) {
431         fieldSet.add(fi.name);
432       }
433       else if ((fi.storeOffsetWithTermVector && fi.storePositionWithTermVector) &&
434                 fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET) {
435         fieldSet.add(fi.name);
436       }
437     }
438     return fieldSet;
439   }
440
441
442   public synchronized boolean hasNorms(String JavaDoc field) {
443     return norms.containsKey(field);
444   }
445
446   static byte[] createFakeNorms(int size) {
447     byte[] ones = new byte[size];
448     Arrays.fill(ones, DefaultSimilarity.encodeNorm(1.0f));
449     return ones;
450   }
451
452   private byte[] ones;
453   private byte[] fakeNorms() {
454     if (ones==null) ones=createFakeNorms(maxDoc());
455     return ones;
456   }
457
458   // can return null if norms aren't stored
459
protected synchronized byte[] getNorms(String JavaDoc field) throws IOException JavaDoc {
460     Norm norm = (Norm) norms.get(field);
461     if (norm == null) return null; // not indexed, or norms not stored
462

463     if (norm.bytes == null) { // value not yet read
464
byte[] bytes = new byte[maxDoc()];
465       norms(field, bytes, 0);
466       norm.bytes = bytes; // cache it
467
}
468     return norm.bytes;
469   }
470
471   // returns fake norms if norms aren't available
472
public synchronized byte[] norms(String JavaDoc field) throws IOException JavaDoc {
473     byte[] bytes = getNorms(field);
474     if (bytes==null) bytes=fakeNorms();
475     return bytes;
476   }
477
478   protected void doSetNorm(int doc, String JavaDoc field, byte value)
479           throws IOException JavaDoc {
480     Norm norm = (Norm) norms.get(field);
481     if (norm == null) // not an indexed field
482
return;
483     norm.dirty = true; // mark it dirty
484
normsDirty = true;
485
486     norms(field)[doc] = value; // set the value
487
}
488
489   /** Read norms into a pre-allocated array. */
490   public synchronized void norms(String JavaDoc field, byte[] bytes, int offset)
491     throws IOException JavaDoc {
492
493     Norm norm = (Norm) norms.get(field);
494     if (norm == null) {
495       System.arraycopy(fakeNorms(), 0, bytes, offset, maxDoc());
496       return;
497     }
498
499     if (norm.bytes != null) { // can copy from cache
500
System.arraycopy(norm.bytes, 0, bytes, offset, maxDoc());
501       return;
502     }
503
504     IndexInput normStream = (IndexInput) norm.in.clone();
505     try { // read from disk
506
normStream.seek(0);
507       normStream.readBytes(bytes, offset, maxDoc());
508     } finally {
509       normStream.close();
510     }
511   }
512
513
514   private void openNorms(Directory cfsDir) throws IOException JavaDoc {
515     for (int i = 0; i < fieldInfos.size(); i++) {
516       FieldInfo fi = fieldInfos.fieldInfo(i);
517       if (fi.isIndexed && !fi.omitNorms) {
518         // look first if there are separate norms in compound format
519
String JavaDoc fileName = segment + ".s" + fi.number;
520         Directory d = directory();
521         if(!d.fileExists(fileName)){
522             fileName = segment + ".f" + fi.number;
523             d = cfsDir;
524         }
525         norms.put(fi.name, new Norm(d.openInput(fileName), fi.number));
526       }
527     }
528   }
529
530   private void closeNorms() throws IOException JavaDoc {
531     synchronized (norms) {
532       Enumeration enumerator = norms.elements();
533       while (enumerator.hasMoreElements()) {
534         Norm norm = (Norm) enumerator.nextElement();
535         norm.in.close();
536       }
537     }
538   }
539   
540   /**
541    * Create a clone from the initial TermVectorsReader and store it in the ThreadLocal.
542    * @return TermVectorsReader
543    */

544   private TermVectorsReader getTermVectorsReader() {
545     TermVectorsReader tvReader = (TermVectorsReader)termVectorsLocal.get();
546     if (tvReader == null) {
547       tvReader = (TermVectorsReader)termVectorsReaderOrig.clone();
548       termVectorsLocal.set(tvReader);
549     }
550     return tvReader;
551   }
552   
553   /** Return a term frequency vector for the specified document and field. The
554    * vector returned contains term numbers and frequencies for all terms in
555    * the specified field of this document, if the field had storeTermVector
556    * flag set. If the flag was not set, the method returns null.
557    * @throws IOException
558    */

559   public TermFreqVector getTermFreqVector(int docNumber, String JavaDoc field) throws IOException JavaDoc {
560     // Check if this field is invalid or has no stored term vector
561
FieldInfo fi = fieldInfos.fieldInfo(field);
562     if (fi == null || !fi.storeTermVector || termVectorsReaderOrig == null)
563       return null;
564     
565     TermVectorsReader termVectorsReader = getTermVectorsReader();
566     if (termVectorsReader == null)
567       return null;
568     
569     return termVectorsReader.get(docNumber, field);
570   }
571
572
573   /** Return an array of term frequency vectors for the specified document.
574    * The array contains a vector for each vectorized field in the document.
575    * Each vector vector contains term numbers and frequencies for all terms
576    * in a given vectorized field.
577    * If no such fields existed, the method returns null.
578    * @throws IOException
579    */

580   public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException JavaDoc {
581     if (termVectorsReaderOrig == null)
582       return null;
583     
584     TermVectorsReader termVectorsReader = getTermVectorsReader();
585     if (termVectorsReader == null)
586       return null;
587     
588     return termVectorsReader.get(docNumber);
589   }
590 }
591
Popular Tags