KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > search > FuzzyQuery


1 package org.apache.lucene.search;
2
3 /**
4  * Copyright 2004 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
19 import org.apache.lucene.index.IndexReader;
20 import org.apache.lucene.index.Term;
21 import org.apache.lucene.util.PriorityQueue;
22 import org.apache.lucene.util.ToStringUtils;
23
24 import java.io.IOException JavaDoc;
25
26 /** Implements the fuzzy search query. The similiarity measurement
27  * is based on the Levenshtein (edit distance) algorithm.
28  */

29 public final class FuzzyQuery extends MultiTermQuery {
30   
31   public final static float defaultMinSimilarity = 0.5f;
32   public final static int defaultPrefixLength = 0;
33   
34   private float minimumSimilarity;
35   private int prefixLength;
36   
37   /**
38    * Create a new FuzzyQuery that will match terms with a similarity
39    * of at least <code>minimumSimilarity</code> to <code>term</code>.
40    * If a <code>prefixLength</code> &gt; 0 is specified, a common prefix
41    * of that length is also required.
42    *
43    * @param term the term to search for
44    * @param minimumSimilarity a value between 0 and 1 to set the required similarity
45    * between the query term and the matching terms. For example, for a
46    * <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
47    * as the query term is considered similar to the query term if the edit distance
48    * between both terms is less than <code>length(term)*0.5</code>
49    * @param prefixLength length of common (non-fuzzy) prefix
50    * @throws IllegalArgumentException if minimumSimilarity is &gt;= 1 or &lt; 0
51    * or if prefixLength &lt; 0
52    */

53   public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) throws IllegalArgumentException JavaDoc {
54     super(term);
55     
56     if (minimumSimilarity >= 1.0f)
57       throw new IllegalArgumentException JavaDoc("minimumSimilarity >= 1");
58     else if (minimumSimilarity < 0.0f)
59       throw new IllegalArgumentException JavaDoc("minimumSimilarity < 0");
60     if (prefixLength < 0)
61       throw new IllegalArgumentException JavaDoc("prefixLength < 0");
62     
63     this.minimumSimilarity = minimumSimilarity;
64     this.prefixLength = prefixLength;
65   }
66   
67   /**
68    * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0)}.
69    */

70   public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException JavaDoc {
71       this(term, minimumSimilarity, defaultPrefixLength);
72   }
73
74   /**
75    * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0)}.
76    */

77   public FuzzyQuery(Term term) {
78     this(term, defaultMinSimilarity, defaultPrefixLength);
79   }
80   
81   /**
82    * Returns the minimum similarity that is required for this query to match.
83    * @return float value between 0.0 and 1.0
84    */

85   public float getMinSimilarity() {
86     return minimumSimilarity;
87   }
88     
89   /**
90    * Returns the non-fuzzy prefix length. This is the number of characters at the start
91    * of a term that must be identical (not fuzzy) to the query term if the query
92    * is to match that term.
93    */

94   public int getPrefixLength() {
95     return prefixLength;
96   }
97
98   protected FilteredTermEnum getEnum(IndexReader reader) throws IOException JavaDoc {
99     return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength);
100   }
101   
102   public Query rewrite(IndexReader reader) throws IOException JavaDoc {
103     FilteredTermEnum enumerator = getEnum(reader);
104     int maxClauseCount = BooleanQuery.getMaxClauseCount();
105     ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);
106     
107     try {
108       do {
109         float minScore = 0.0f;
110         float score = 0.0f;
111         Term t = enumerator.term();
112         if (t != null) {
113           score = enumerator.difference();
114           // terms come in alphabetical order, therefore if queue is full and score
115
// not bigger than minScore, we can skip
116
if(stQueue.size() < maxClauseCount || score > minScore){
117             stQueue.insert(new ScoreTerm(t, score));
118             minScore = ((ScoreTerm)stQueue.top()).score; // maintain minScore
119
}
120         }
121       } while (enumerator.next());
122     } finally {
123       enumerator.close();
124     }
125     
126     BooleanQuery query = new BooleanQuery(true);
127     int size = stQueue.size();
128     for(int i = 0; i < size; i++){
129       ScoreTerm st = (ScoreTerm) stQueue.pop();
130       TermQuery tq = new TermQuery(st.term); // found a match
131
tq.setBoost(getBoost() * st.score); // set the boost
132
query.add(tq, BooleanClause.Occur.SHOULD); // add to query
133
}
134
135     return query;
136   }
137     
138   public String JavaDoc toString(String JavaDoc field) {
139     StringBuffer JavaDoc buffer = new StringBuffer JavaDoc();
140     Term term = getTerm();
141     if (!term.field().equals(field)) {
142         buffer.append(term.field());
143         buffer.append(":");
144     }
145     buffer.append(term.text());
146     buffer.append('~');
147     buffer.append(Float.toString(minimumSimilarity));
148     buffer.append(ToStringUtils.boost(getBoost()));
149     return buffer.toString();
150   }
151   
152   private static class ScoreTerm{
153     public Term term;
154     public float score;
155     
156     public ScoreTerm(Term term, float score){
157       this.term = term;
158       this.score = score;
159     }
160   }
161   
162   private static class ScoreTermQueue extends PriorityQueue {
163     
164     public ScoreTermQueue(int size){
165       initialize(size);
166     }
167     
168     /* (non-Javadoc)
169      * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
170      */

171     protected boolean lessThan(Object JavaDoc a, Object JavaDoc b) {
172       ScoreTerm termA = (ScoreTerm)a;
173       ScoreTerm termB = (ScoreTerm)b;
174       if (termA.score == termB.score)
175         return termA.term.compareTo(termB.term) > 0;
176       else
177         return termA.score < termB.score;
178     }
179     
180   }
181
182   public boolean equals(Object JavaDoc o) {
183     if (this == o) return true;
184     if (!(o instanceof FuzzyQuery)) return false;
185     if (!super.equals(o)) return false;
186
187     final FuzzyQuery fuzzyQuery = (FuzzyQuery) o;
188
189     if (minimumSimilarity != fuzzyQuery.minimumSimilarity) return false;
190     if (prefixLength != fuzzyQuery.prefixLength) return false;
191
192     return true;
193   }
194
195   public int hashCode() {
196     int result = super.hashCode();
197     result = 29 * result + minimumSimilarity != +0.0f ? Float.floatToIntBits(minimumSimilarity) : 0;
198     result = 29 * result + prefixLength;
199     return result;
200   }
201 }
202
Popular Tags