KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > document > Field


1 package org.apache.lucene.document;
2
3 /**
4  * Copyright 2004 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
19 import java.io.Reader JavaDoc;
20 import java.io.Serializable JavaDoc;
21 import java.util.Date JavaDoc;
22
23 import org.apache.lucene.index.IndexReader;
24 import org.apache.lucene.search.Hits;
25 import org.apache.lucene.search.Similarity;
26 import org.apache.lucene.util.Parameter;
27
28 /**
29   A field is a section of a Document. Each field has two parts, a name and a
30   value. Values may be free text, provided as a String or as a Reader, or they
31   may be atomic keywords, which are not further processed. Such keywords may
32   be used to represent dates, urls, etc. Fields are optionally stored in the
33   index, so that they may be returned with hits on the document.
34   */

35
36 public final class Field implements Serializable JavaDoc {
37   private String JavaDoc name = "body";
38   
39   // the one and only data object for all different kind of field values
40
private Object JavaDoc fieldsData = null;
41   
42   private boolean storeTermVector = false;
43   private boolean storeOffsetWithTermVector = false;
44   private boolean storePositionWithTermVector = false;
45   private boolean omitNorms = false;
46   private boolean isStored = false;
47   private boolean isIndexed = true;
48   private boolean isTokenized = true;
49   private boolean isBinary = false;
50   private boolean isCompressed = false;
51   
52   private float boost = 1.0f;
53   
54   /** Specifies whether and how a field should be stored. */
55   public static final class Store extends Parameter implements Serializable JavaDoc {
56     
57     private Store(String JavaDoc name) {
58       super(name);
59     }
60     
61     /** Store the original field value in the index in a compressed form. This is
62      * useful for long documents and for binary valued fields.
63      */

64     public static final Store COMPRESS = new Store("COMPRESS");
65     
66     /** Store the original field value in the index. This is useful for short texts
67      * like a document's title which should be displayed with the results. The
68      * value is stored in its original form, i.e. no analyzer is used before it is
69      * stored.
70      */

71     public static final Store YES = new Store("YES");
72     
73     /** Do not store the field value in the index. */
74     public static final Store NO = new Store("NO");
75   }
76   
77   /** Specifies whether and how a field should be indexed. */
78   public static final class Index extends Parameter implements Serializable JavaDoc {
79     
80     private Index(String JavaDoc name) {
81       super(name);
82     }
83     
84     /** Do not index the field value. This field can thus not be searched,
85      * but one can still access its contents provided it is
86      * {@link Field.Store stored}. */

87     public static final Index NO = new Index("NO");
88     
89     /** Index the field's value so it can be searched. An Analyzer will be used
90      * to tokenize and possibly further normalize the text before its
91      * terms will be stored in the index. This is useful for common text.
92      */

93     public static final Index TOKENIZED = new Index("TOKENIZED");
94     
95     /** Index the field's value without using an Analyzer, so it can be searched.
96      * As no analyzer is used the value will be stored as a single term. This is
97      * useful for unique Ids like product numbers.
98      */

99     public static final Index UN_TOKENIZED = new Index("UN_TOKENIZED");
100
101     /** Index the field's value without an Analyzer, and disable
102      * the storing of norms. No norms means that index-time boosting
103      * and field length normalization will be disabled. The benefit is
104      * less memory usage as norms take up one byte per indexed field
105      * for every document in the index.
106      */

107     public static final Index NO_NORMS = new Index("NO_NORMS");
108
109   }
110
111   /** Specifies whether and how a field should have term vectors. */
112   public static final class TermVector extends Parameter implements Serializable JavaDoc {
113     
114     private TermVector(String JavaDoc name) {
115       super(name);
116     }
117     
118     /** Do not store term vectors.
119      */

120     public static final TermVector NO = new TermVector("NO");
121     
122     /** Store the term vectors of each document. A term vector is a list
123      * of the document's terms and their number of occurences in that document. */

124     public static final TermVector YES = new TermVector("YES");
125     
126     /**
127      * Store the term vector + token position information
128      *
129      * @see #YES
130      */

131     public static final TermVector WITH_POSITIONS = new TermVector("WITH_POSITIONS");
132     
133     /**
134      * Store the term vector + Token offset information
135      *
136      * @see #YES
137      */

138     public static final TermVector WITH_OFFSETS = new TermVector("WITH_OFFSETS");
139     
140     /**
141      * Store the term vector + Token position and offset information
142      *
143      * @see #YES
144      * @see #WITH_POSITIONS
145      * @see #WITH_OFFSETS
146      */

147     public static final TermVector WITH_POSITIONS_OFFSETS = new TermVector("WITH_POSITIONS_OFFSETS");
148   }
149   
150   /** Sets the boost factor hits on this field. This value will be
151    * multiplied into the score of all hits on this this field of this
152    * document.
153    *
154    * <p>The boost is multiplied by {@link Document#getBoost()} of the document
155    * containing this field. If a document has multiple fields with the same
156    * name, all such values are multiplied together. This product is then
157    * multipled by the value {@link Similarity#lengthNorm(String,int)}, and
158    * rounded by {@link Similarity#encodeNorm(float)} before it is stored in the
159    * index. One should attempt to ensure that this product does not overflow
160    * the range of that encoding.
161    *
162    * @see Document#setBoost(float)
163    * @see Similarity#lengthNorm(String, int)
164    * @see Similarity#encodeNorm(float)
165    */

166   public void setBoost(float boost) {
167     this.boost = boost;
168   }
169
170   /** Returns the boost factor for hits for this field.
171    *
172    * <p>The default value is 1.0.
173    *
174    * <p>Note: this value is not stored directly with the document in the index.
175    * Documents returned from {@link IndexReader#document(int)} and
176    * {@link Hits#doc(int)} may thus not have the same value present as when
177    * this field was indexed.
178    *
179    * @see #setBoost(float)
180    */

181   public float getBoost() {
182     return boost;
183   }
184
185   /** Constructs a String-valued Field that is not tokenized, but is indexed
186     and stored. Useful for non-text fields, e.g. date or url.
187     @deprecated use {@link #Field(String, String, Field.Store, Field.Index)
188       Field(name, value, Field.Store.YES, Field.Index.UN_TOKENIZED)} instead */

189   public static final Field Keyword(String JavaDoc name, String JavaDoc value) {
190     return new Field(name, value, true, true, false);
191   }
192
193   /** Constructs a String-valued Field that is not tokenized nor indexed,
194     but is stored in the index, for return with hits.
195     @deprecated use {@link #Field(String, String, Field.Store, Field.Index)
196       Field(name, value, Field.Store.YES, Field.Index.NO)} instead */

197   public static final Field UnIndexed(String JavaDoc name, String JavaDoc value) {
198     return new Field(name, value, true, false, false);
199   }
200
201   /** Constructs a String-valued Field that is tokenized and indexed,
202     and is stored in the index, for return with hits. Useful for short text
203     fields, like "title" or "subject". Term vector will not be stored for this field.
204   @deprecated use {@link #Field(String, String, Field.Store, Field.Index)
205     Field(name, value, Field.Store.YES, Field.Index.TOKENIZED)} instead */

206   public static final Field Text(String JavaDoc name, String JavaDoc value) {
207     return Text(name, value, false);
208   }
209
210   /** Constructs a Date-valued Field that is not tokenized and is indexed,
211       and stored in the index, for return with hits.
212       @deprecated use {@link #Field(String, String, Field.Store, Field.Index)
213       Field(name, value, Field.Store.YES, Field.Index.UN_TOKENIZED)} instead */

214   public static final Field Keyword(String JavaDoc name, Date JavaDoc value) {
215     return new Field(name, DateField.dateToString(value), true, true, false);
216   }
217
218   /** Constructs a String-valued Field that is tokenized and indexed,
219     and is stored in the index, for return with hits. Useful for short text
220     fields, like "title" or "subject".
221     @deprecated use {@link #Field(String, String, Field.Store, Field.Index, Field.TermVector)
222       Field(name, value, Field.Store.YES, Field.Index.TOKENIZED, storeTermVector)} instead */

223   public static final Field Text(String JavaDoc name, String JavaDoc value, boolean storeTermVector) {
224     return new Field(name, value, true, true, true, storeTermVector);
225   }
226
227   /** Constructs a String-valued Field that is tokenized and indexed,
228     but that is not stored in the index. Term vector will not be stored for this field.
229     @deprecated use {@link #Field(String, String, Field.Store, Field.Index)
230       Field(name, value, Field.Store.NO, Field.Index.TOKENIZED)} instead */

231   public static final Field UnStored(String JavaDoc name, String JavaDoc value) {
232     return UnStored(name, value, false);
233   }
234
235   /** Constructs a String-valued Field that is tokenized and indexed,
236     but that is not stored in the index.
237     @deprecated use {@link #Field(String, String, Field.Store, Field.Index, Field.TermVector)
238       Field(name, value, Field.Store.NO, Field.Index.TOKENIZED, storeTermVector)} instead */

239   public static final Field UnStored(String JavaDoc name, String JavaDoc value, boolean storeTermVector) {
240     return new Field(name, value, false, true, true, storeTermVector);
241   }
242
243   /** Constructs a Reader-valued Field that is tokenized and indexed, but is
244     not stored in the index verbatim. Useful for longer text fields, like
245     "body". Term vector will not be stored for this field.
246     @deprecated use {@link #Field(String, Reader) Field(name, value)} instead */

247   public static final Field Text(String JavaDoc name, Reader JavaDoc value) {
248     return Text(name, value, false);
249   }
250
251   /** Constructs a Reader-valued Field that is tokenized and indexed, but is
252     not stored in the index verbatim. Useful for longer text fields, like
253     "body".
254     @deprecated use {@link #Field(String, Reader, Field.TermVector)
255       Field(name, value, storeTermVector)} instead */

256   public static final Field Text(String JavaDoc name, Reader JavaDoc value, boolean storeTermVector) {
257     Field f = new Field(name, value);
258     f.storeTermVector = storeTermVector;
259     return f;
260   }
261   
262   /** Returns the name of the field as an interned string.
263    * For example "date", "title", "body", ...
264    */

265   public String JavaDoc name() { return name; }
266
267   /** The value of the field as a String, or null. If null, the Reader value
268    * or binary value is used. Exactly one of stringValue(), readerValue(), and
269    * binaryValue() must be set. */

270   public String JavaDoc stringValue() { return fieldsData instanceof String JavaDoc ? (String JavaDoc)fieldsData : null; }
271   
272   /** The value of the field as a Reader, or null. If null, the String value
273    * or binary value is used. Exactly one of stringValue(), readerValue(),
274    * and binaryValue() must be set. */

275   public Reader JavaDoc readerValue() { return fieldsData instanceof Reader JavaDoc ? (Reader JavaDoc)fieldsData : null; }
276   
277   /** The value of the field in Binary, or null. If null, the Reader or
278    * String value is used. Exactly one of stringValue(), readerValue() and
279    * binaryValue() must be set. */

280   public byte[] binaryValue() { return fieldsData instanceof byte[] ? (byte[])fieldsData : null; }
281   
282   /**
283    * Create a field by specifying its name, value and how it will
284    * be saved in the index. Term vectors will not be stored in the index.
285    *
286    * @param name The name of the field
287    * @param value The string to process
288    * @param store Whether <code>value</code> should be stored in the index
289    * @param index Whether the field should be indexed, and if so, if it should
290    * be tokenized before indexing
291    * @throws NullPointerException if name or value is <code>null</code>
292    * @throws IllegalArgumentException if the field is neither stored nor indexed
293    */

294   public Field(String JavaDoc name, String JavaDoc value, Store store, Index index) {
295     this(name, value, store, index, TermVector.NO);
296   }
297   
298   /**
299    * Create a field by specifying its name, value and how it will
300    * be saved in the index.
301    *
302    * @param name The name of the field
303    * @param value The string to process
304    * @param store Whether <code>value</code> should be stored in the index
305    * @param index Whether the field should be indexed, and if so, if it should
306    * be tokenized before indexing
307    * @param termVector Whether term vector should be stored
308    * @throws NullPointerException if name or value is <code>null</code>
309    * @throws IllegalArgumentException in any of the following situations:
310    * <ul>
311    * <li>the field is neither stored nor indexed</li>
312    * <li>the field is not indexed but termVector is <code>TermVector.YES</code></li>
313    * </ul>
314    */

315   public Field(String JavaDoc name, String JavaDoc value, Store store, Index index, TermVector termVector) {
316     if (name == null)
317       throw new NullPointerException JavaDoc("name cannot be null");
318     if (value == null)
319       throw new NullPointerException JavaDoc("value cannot be null");
320     if (index == Index.NO && store == Store.NO)
321       throw new IllegalArgumentException JavaDoc("it doesn't make sense to have a field that "
322          + "is neither indexed nor stored");
323     if (index == Index.NO && termVector != TermVector.NO)
324       throw new IllegalArgumentException JavaDoc("cannot store term vector information "
325          + "for a field that is not indexed");
326           
327     this.name = name.intern(); // field names are interned
328
this.fieldsData = value;
329
330     if (store == Store.YES){
331       this.isStored = true;
332       this.isCompressed = false;
333     }
334     else if (store == Store.COMPRESS) {
335       this.isStored = true;
336       this.isCompressed = true;
337     }
338     else if (store == Store.NO){
339       this.isStored = false;
340       this.isCompressed = false;
341     }
342     else
343       throw new IllegalArgumentException JavaDoc("unknown store parameter " + store);
344    
345     if (index == Index.NO) {
346       this.isIndexed = false;
347       this.isTokenized = false;
348     } else if (index == Index.TOKENIZED) {
349       this.isIndexed = true;
350       this.isTokenized = true;
351     } else if (index == Index.UN_TOKENIZED) {
352       this.isIndexed = true;
353       this.isTokenized = false;
354     } else if (index == Index.NO_NORMS) {
355       this.isIndexed = true;
356       this.isTokenized = false;
357       this.omitNorms = true;
358     } else {
359       throw new IllegalArgumentException JavaDoc("unknown index parameter " + index);
360     }
361     
362     this.isBinary = false;
363
364     setStoreTermVector(termVector);
365   }
366
367   /**
368    * Create a tokenized and indexed field that is not stored. Term vectors will
369    * not be stored.
370    *
371    * @param name The name of the field
372    * @param reader The reader with the content
373    * @throws NullPointerException if name or reader is <code>null</code>
374    */

375   public Field(String JavaDoc name, Reader JavaDoc reader) {
376     this(name, reader, TermVector.NO);
377   }
378
379   /**
380    * Create a tokenized and indexed field that is not stored, optionally with
381    * storing term vectors.
382    *
383    * @param name The name of the field
384    * @param reader The reader with the content
385    * @param termVector Whether term vector should be stored
386    * @throws NullPointerException if name or reader is <code>null</code>
387    */

388   public Field(String JavaDoc name, Reader JavaDoc reader, TermVector termVector) {
389     if (name == null)
390       throw new NullPointerException JavaDoc("name cannot be null");
391     if (reader == null)
392       throw new NullPointerException JavaDoc("reader cannot be null");
393     
394     this.name = name.intern(); // field names are interned
395
this.fieldsData = reader;
396     
397     this.isStored = false;
398     this.isCompressed = false;
399     
400     this.isIndexed = true;
401     this.isTokenized = true;
402     
403     this.isBinary = false;
404     
405     setStoreTermVector(termVector);
406   }
407
408   /** Create a field by specifying all parameters except for <code>storeTermVector</code>,
409    * which is set to <code>false</code>.
410    *
411    * @deprecated use {@link #Field(String, String, Field.Store, Field.Index)} instead
412    */

413   public Field(String JavaDoc name, String JavaDoc string,
414          boolean store, boolean index, boolean token) {
415     this(name, string, store, index, token, false);
416   }
417
418   
419   /**
420    * Create a stored field with binary value. Optionally the value may be compressed.
421    *
422    * @param name The name of the field
423    * @param value The binary value
424    * @param store How <code>value</code> should be stored (compressed or not)
425    * @throws IllegalArgumentException if store is <code>Store.NO</code>
426    */

427   public Field(String JavaDoc name, byte[] value, Store store) {
428     if (name == null)
429       throw new IllegalArgumentException JavaDoc("name cannot be null");
430     if (value == null)
431       throw new IllegalArgumentException JavaDoc("value cannot be null");
432     
433     this.name = name.intern();
434     this.fieldsData = value;
435     
436     if (store == Store.YES){
437       this.isStored = true;
438       this.isCompressed = false;
439     }
440     else if (store == Store.COMPRESS) {
441       this.isStored = true;
442       this.isCompressed = true;
443     }
444     else if (store == Store.NO)
445       throw new IllegalArgumentException JavaDoc("binary values can't be unstored");
446     else
447       throw new IllegalArgumentException JavaDoc("unknown store parameter " + store);
448     
449     this.isIndexed = false;
450     this.isTokenized = false;
451     
452     this.isBinary = true;
453     
454     setStoreTermVector(TermVector.NO);
455   }
456   
457   /**
458    *
459    * @param name The name of the field
460    * @param string The string to process
461    * @param store true if the field should store the string
462    * @param index true if the field should be indexed
463    * @param token true if the field should be tokenized
464    * @param storeTermVector true if we should store the Term Vector info
465    *
466    * @deprecated use {@link #Field(String, String, Field.Store, Field.Index, Field.TermVector)} instead
467    */

468   public Field(String JavaDoc name, String JavaDoc string,
469          boolean store, boolean index, boolean token, boolean storeTermVector) {
470     if (name == null)
471       throw new NullPointerException JavaDoc("name cannot be null");
472     if (string == null)
473       throw new NullPointerException JavaDoc("value cannot be null");
474     if (!index && storeTermVector)
475       throw new IllegalArgumentException JavaDoc("cannot store a term vector for fields that are not indexed");
476
477     this.name = name.intern(); // field names are interned
478
this.fieldsData = string;
479     this.isStored = store;
480     this.isIndexed = index;
481     this.isTokenized = token;
482     this.storeTermVector = storeTermVector;
483   }
484
485   private void setStoreTermVector(TermVector termVector) {
486     if (termVector == TermVector.NO) {
487       this.storeTermVector = false;
488       this.storePositionWithTermVector = false;
489       this.storeOffsetWithTermVector = false;
490     }
491     else if (termVector == TermVector.YES) {
492       this.storeTermVector = true;
493       this.storePositionWithTermVector = false;
494       this.storeOffsetWithTermVector = false;
495     }
496     else if (termVector == TermVector.WITH_POSITIONS) {
497       this.storeTermVector = true;
498       this.storePositionWithTermVector = true;
499       this.storeOffsetWithTermVector = false;
500     }
501     else if (termVector == TermVector.WITH_OFFSETS) {
502       this.storeTermVector = true;
503       this.storePositionWithTermVector = false;
504       this.storeOffsetWithTermVector = true;
505     }
506     else if (termVector == TermVector.WITH_POSITIONS_OFFSETS) {
507       this.storeTermVector = true;
508       this.storePositionWithTermVector = true;
509       this.storeOffsetWithTermVector = true;
510     }
511     else {
512       throw new IllegalArgumentException JavaDoc("unknown termVector parameter " + termVector);
513     }
514   }
515   
516   /** True iff the value of the field is to be stored in the index for return
517     with search hits. It is an error for this to be true if a field is
518     Reader-valued. */

519   public final boolean isStored() { return isStored; }
520
521   /** True iff the value of the field is to be indexed, so that it may be
522     searched on. */

523   public final boolean isIndexed() { return isIndexed; }
524
525   /** True iff the value of the field should be tokenized as text prior to
526     indexing. Un-tokenized fields are indexed as a single word and may not be
527     Reader-valued. */

528   public final boolean isTokenized() { return isTokenized; }
529   
530   /** True if the value of the field is stored and compressed within the index */
531   public final boolean isCompressed() { return isCompressed; }
532
533   /** True iff the term or terms used to index this field are stored as a term
534    * vector, available from {@link IndexReader#getTermFreqVector(int,String)}.
535    * These methods do not provide access to the original content of the field,
536    * only to terms used to index it. If the original content must be
537    * preserved, use the <code>stored</code> attribute instead.
538    *
539    * @see IndexReader#getTermFreqVector(int, String)
540    */

541   public final boolean isTermVectorStored() { return storeTermVector; }
542   
543   /**
544    * True iff terms are stored as term vector together with their offsets
545    * (start and end positon in source text).
546    */

547   public boolean isStoreOffsetWithTermVector(){
548     return storeOffsetWithTermVector;
549   }
550   
551   /**
552    * True iff terms are stored as term vector together with their token positions.
553    */

554   public boolean isStorePositionWithTermVector(){
555     return storePositionWithTermVector;
556   }
557       
558   /** True iff the value of the filed is stored as binary */
559   public final boolean isBinary() { return isBinary; }
560   
561   /** True if norms are omitted for this indexed field */
562   public boolean getOmitNorms() { return omitNorms; }
563
564   /** Expert:
565    *
566    * If set, omit normalization factors associated with this indexed field.
567    * This effectively disables indexing boosts and length normalization for this field.
568    */

569   public void setOmitNorms(boolean omitNorms) { this.omitNorms=omitNorms; }
570   
571   /** Prints a Field for human consumption. */
572   public final String JavaDoc toString() {
573     StringBuffer JavaDoc result = new StringBuffer JavaDoc();
574     if (isStored) {
575       result.append("stored");
576       if (isCompressed)
577         result.append("/compressed");
578       else
579         result.append("/uncompressed");
580     }
581     if (isIndexed) {
582       if (result.length() > 0)
583         result.append(",");
584       result.append("indexed");
585     }
586     if (isTokenized) {
587       if (result.length() > 0)
588         result.append(",");
589       result.append("tokenized");
590     }
591     if (storeTermVector) {
592       if (result.length() > 0)
593         result.append(",");
594       result.append("termVector");
595     }
596     if (storeOffsetWithTermVector) {
597       if (result.length() > 0)
598         result.append(",");
599       result.append("termVectorOffsets");
600     }
601     if (storePositionWithTermVector) {
602       if (result.length() > 0)
603         result.append(",");
604       result.append("termVectorPosition");
605     }
606     if (isBinary) {
607       if (result.length() > 0)
608         result.append(",");
609       result.append("binary");
610     }
611     if (omitNorms) {
612       result.append(",omitNorms");
613     }
614     result.append('<');
615     result.append(name);
616     result.append(':');
617     
618     if (fieldsData != null) {
619       result.append(fieldsData);
620     }
621     
622     result.append('>');
623     return result.toString();
624   }
625
626 }
627
Popular Tags