Similarity


1   package org.apache.lucene.search;
2   
3   /**
4    * Copyright 2004 The Apache Software Foundation
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  import java.io.IOException  ;
20  import java.io.Serializable  ;
21  
22  import java.util.Collection  ;
23  import java.util.Iterator  ;
24  
25  import org.apache.lucene.index.Term;
26  
27  import org.apache.lucene.index.IndexReader;       // for javadoc
28  import org.apache.lucene.index.IndexWriter;       // for javadoc
29  import org.apache.lucene.document.Field;          // for javadoc
30  import org.apache.lucene.util.SmallFloat;
31  
32  /** Expert: Scoring API.
33   * <p>Subclasses implement search scoring.
34   *
35   * <p>The score of query <code>q</code> for document <code>d</code> is defined
36   * in terms of these methods as follows:
37   *
38   * <table cellpadding="0" cellspacing="0" border="0">
39   *  <tr>
40   *    <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td>
41   *    <td valign="middle" align="center">
42   *    <big><big><big><big><big>&Sigma;</big></big></big></big></big></td>
43   *    <td valign="middle"><small>
44   *    ( {@link #tf(int) tf}(t in d) *
45   *    {@link #idf(Term,Searcher) idf}(t)^2 *
46   *    {@link Query#getBoost getBoost}(t in q) *
47   *    {@link Field#getBoost getBoost}(t.field in d) *
48   *    {@link #lengthNorm(String,int) lengthNorm}(t.field in d) )
49   *    </small></td>
50   *    <td valign="middle" rowspan="2">&nbsp;*
51   *    {@link #coord(int,int) coord}(q,d) *
52   *    {@link #queryNorm(float) queryNorm}(sumOfSqaredWeights)
53   *    </td>
54   *  </tr>
55   *  <tr>
56   *   <td valign="top" align="right">
57   *    <small>t in q</small>
58   *    </td>
59   *  </tr>
60   * </table>
61   * 
62   * <p> where
63   * 
64   * <table cellpadding="0" cellspacing="0" border="0">
65   *  <tr>
66   *    <td valign="middle" align="right" rowspan="2">sumOfSqaredWeights =<br></td>
67   *    <td valign="middle" align="center">
68   *    <big><big><big><big><big>&Sigma;</big></big></big></big></big></td>
69   *    <td valign="middle"><small>
70   *    ( {@link #idf(Term,Searcher) idf}(t) *
71   *    {@link Query#getBoost getBoost}(t in q) )^2
72   *    </small></td>
73   *  </tr>
74   *  <tr>
75   *   <td valign="top" align="right">
76   *    <small>t in q</small>
77   *    </td>
78   *  </tr>
79   * </table>
80   * 
81   * <p> Note that the above formula is motivated by the cosine-distance or dot-product
82   * between document and query vector, which is implemented by {@link DefaultSimilarity}.
83   *
84   * @see #setDefault(Similarity)
85   * @see IndexWriter#setSimilarity(Similarity)
86   * @see Searcher#setSimilarity(Similarity)
87   */
88  public abstract class Similarity implements Serializable   {
89    /** The Similarity implementation used by default. */
90    private static Similarity defaultImpl = new DefaultSimilarity();
91  
92    /** Set the default Similarity implementation used by indexing and search
93     * code.
94     *
95     * @see Searcher#setSimilarity(Similarity)
96     * @see IndexWriter#setSimilarity(Similarity)
97     */
98    public static void setDefault(Similarity similarity) {
99      Similarity.defaultImpl = similarity;
100   }
101 
102   /** Return the default Similarity implementation used by indexing and search
103    * code.
104    *
105    * <p>This is initially an instance of {@link DefaultSimilarity}.
106    *
107    * @see Searcher#setSimilarity(Similarity)
108    * @see IndexWriter#setSimilarity(Similarity)
109    */
110   public static Similarity getDefault() {
111     return Similarity.defaultImpl;
112   }
113 
114   /** Cache of decoded bytes. */
115   private static final float[] NORM_TABLE = new float[256];
116 
117   static {
118     for (int i = 0; i < 256; i++)
119       NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
120   }
121 
122   /** Decodes a normalization factor stored in an index.
123    * @see #encodeNorm(float)
124    */
125   public static float decodeNorm(byte b) {
126     return NORM_TABLE[b & 0xFF];  // & 0xFF maps negative bytes to positive above 127
127   }
128 
129   /** Returns a table for decoding normalization bytes.
130    * @see #encodeNorm(float)
131    */
132   public static float[] getNormDecoder() {
133     return NORM_TABLE;
134   }
135 
136   /** Computes the normalization value for a field given the total number of
137    * terms contained in a field.  These values, together with field boosts, are
138    * stored in an index and multipled into scores for hits on each field by the
139    * search code.
140    *
141    * <p>Matches in longer fields are less precise, so implementations of this
142    * method usually return smaller values when <code>numTokens</code> is large,
143    * and larger values when <code>numTokens</code> is small.
144    *
145    * <p>That these values are computed under {@link
146    * IndexWriter#addDocument(org.apache.lucene.document.Document)} and stored then using
147    * {@link #encodeNorm(float)}.  Thus they have limited precision, and documents
148    * must be re-indexed if this method is altered.
149    *
150    * @param fieldName the name of the field
151    * @param numTokens the total number of tokens contained in fields named
152    * <i>fieldName</i> of <i>doc</i>.
153    * @return a normalization factor for hits on this field of this document
154    *
155    * @see Field#setBoost(float)
156    */
157   public abstract float lengthNorm(String   fieldName, int numTokens);
158 
159   /** Computes the normalization value for a query given the sum of the squared
160    * weights of each of the query terms.  This value is then multipled into the
161    * weight of each query term.
162    *
163    * <p>This does not affect ranking, but rather just attempts to make scores
164    * from different queries comparable.
165    *
166    * @param sumOfSquaredWeights the sum of the squares of query term weights
167    * @return a normalization factor for query weights
168    */
169   public abstract float queryNorm(float sumOfSquaredWeights);
170 
171   /** Encodes a normalization factor for storage in an index.
172    *
173    * <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
174    * the zero-exponent point at 15, thus
175    * representing values from around 7x10^9 to 2x10^-9 with about one
176    * significant decimal digit of accuracy.  Zero is also represented.
177    * Negative numbers are rounded up to zero.  Values too large to represent
178    * are rounded down to the largest representable value.  Positive values too
179    * small to represent are rounded up to the smallest positive representable
180    * value.
181    *
182    * @see Field#setBoost(float)
183    * @see SmallFloat
184    */
185   public static byte encodeNorm(float f) {
186     return SmallFloat.floatToByte315(f);
187   }
188 
189 
190   /** Computes a score factor based on a term or phrase's frequency in a
191    * document.  This value is multiplied by the {@link #idf(Term, Searcher)}
192    * factor for each term in the query and these products are then summed to
193    * form the initial score for a document.
194    *
195    * <p>Terms and phrases repeated in a document indicate the topic of the
196    * document, so implementations of this method usually return larger values
197    * when <code>freq</code> is large, and smaller values when <code>freq</code>
198    * is small.
199    *
200    * <p>The default implementation calls {@link #tf(float)}.
201    *
202    * @param freq the frequency of a term within a document
203    * @return a score factor based on a term's within-document frequency
204    */
205   public float tf(int freq) {
206     return tf((float)freq);
207   }
208 
209   /** Computes the amount of a sloppy phrase match, based on an edit distance.
210    * This value is summed for each sloppy phrase match in a document to form
211    * the frequency that is passed to {@link #tf(float)}.
212    *
213    * <p>A phrase match with a small edit distance to a document passage more
214    * closely matches the document, so implementations of this method usually
215    * return larger values when the edit distance is small and smaller values
216    * when it is large.
217    *
218    * @see PhraseQuery#setSlop(int)
219    * @param distance the edit distance of this sloppy phrase match
220    * @return the frequency increment for this match
221    */
222   public abstract float sloppyFreq(int distance);
223 
224   /** Computes a score factor based on a term or phrase's frequency in a
225    * document.  This value is multiplied by the {@link #idf(Term, Searcher)}
226    * factor for each term in the query and these products are then summed to
227    * form the initial score for a document.
228    *
229    * <p>Terms and phrases repeated in a document indicate the topic of the
230    * document, so implementations of this method usually return larger values
231    * when <code>freq</code> is large, and smaller values when <code>freq</code>
232    * is small.
233    *
234    * @param freq the frequency of a term within a document
235    * @return a score factor based on a term's within-document frequency
236    */
237   public abstract float tf(float freq);
238 
239   /** Computes a score factor for a simple term.
240    *
241    * <p>The default implementation is:<pre>
242    *   return idf(searcher.docFreq(term), searcher.maxDoc());
243    * </pre>
244    *
245    * Note that {@link Searcher#maxDoc()} is used instead of
246    * {@link IndexReader#numDocs()} because it is proportional to
247    * {@link Searcher#docFreq(Term)} , i.e., when one is inaccurate,
248    * so is the other, and in the same direction.
249    *
250    * @param term the term in question
251    * @param searcher the document collection being searched
252    * @return a score factor for the term
253    */
254   public float idf(Term term, Searcher searcher) throws IOException   {
255     return idf(searcher.docFreq(term), searcher.maxDoc());
256   }
257 
258   /** Computes a score factor for a phrase.
259    *
260    * <p>The default implementation sums the {@link #idf(Term,Searcher)} factor
261    * for each term in the phrase.
262    *
263    * @param terms the terms in the phrase
264    * @param searcher the document collection being searched
265    * @return a score factor for the phrase
266    */
267   public float idf(Collection   terms, Searcher searcher) throws IOException   {
268     float idf = 0.0f;
269     Iterator   i = terms.iterator();
270     while (i.hasNext()) {
271       idf += idf((Term)i.next(), searcher);
272     }
273     return idf;
274   }
275 
276   /** Computes a score factor based on a term's document frequency (the number
277    * of documents which contain the term).  This value is multiplied by the
278    * {@link #tf(int)} factor for each term in the query and these products are
279    * then summed to form the initial score for a document.
280    *
281    * <p>Terms that occur in fewer documents are better indicators of topic, so
282    * implementations of this method usually return larger values for rare terms,
283    * and smaller values for common terms.
284    *
285    * @param docFreq the number of documents which contain the term
286    * @param numDocs the total number of documents in the collection
287    * @return a score factor based on the term's document frequency
288    */
289   public abstract float idf(int docFreq, int numDocs);
290 
291   /** Computes a score factor based on the fraction of all query terms that a
292    * document contains.  This value is multiplied into scores.
293    *
294    * <p>The presence of a large portion of the query terms indicates a better
295    * match with the query, so implementations of this method usually return
296    * larger values when the ratio between these parameters is large and smaller
297    * values when the ratio between them is small.
298    *
299    * @param overlap the number of query terms matched in the document
300    * @param maxOverlap the total number of terms in the query
301    * @return a score factor based on term overlap with the query
302    */
303   public abstract float coord(int overlap, int maxOverlap);
304 }
305
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags