/*******************************************************************************
 * Copyright (c) 2000, 2007 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *******************************************************************************/

package org.eclipse.help.internal.search;

import java.io.*;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;

import org.apache.lucene.analysis.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.eclipse.help.internal.base.*;
/**
 * Builds a query acceptable to the search engine.
 */
public class QueryBuilder {
    // Maximum allowed number of terms
    private static final int MAX_TERMS = 10;
    // Maximum allowed number of ORs
    private static final int MAX_UNIONS = 4;
    // Maximum allowed number of terms with wild cards
    private static final int MAX_WILD_TERMS = 2;
    // Query from user
    private String searchWords;
    // Descriptor of the Analyzer used to process the query words
    private AnalyzerDescriptor analyzerDesc;
    // Analyzer to process the query words
    private Analyzer analyzer;
    // List of QueryWordsToken
    private List analyzedTokens;
    // List of words to highlight
    private List highlightWords = new ArrayList();
    private Locale locale;
    /**
     * Creates a query builder for the search word. The search word is processed
     * by a lexical analyzer.
     */
    public QueryBuilder(String searchWords, AnalyzerDescriptor analyzerDesc) {
        this.searchWords = searchWords;
        String language = analyzerDesc.getLang();
        if (language.length() >= 5) {
            this.locale = new Locale(language.substring(0, 2), language.substring(3, 5));
        } else {
            this.locale = new Locale(language.substring(0, 2), ""); //$NON-NLS-1$
        }
        this.analyzerDesc = analyzerDesc;
        this.analyzer = analyzerDesc.getAnalyzer();
    }
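    // Example (added for illustration): a language code such as "en_US" returned by
    // AnalyzerDescriptor.getLang() yields new Locale("en", "US"), while a plain "en"
    // yields new Locale("en", "").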
    /**
     * Splits the user query into tokens and returns a list of QueryWordsToken's.
     */
    private List tokenizeUserQuery(String searchWords) {
        List tokenList = new ArrayList();
        // Divide along quotation marks
        //StringTokenizer qTokenizer = new StringTokenizer(searchWords.trim(),
        //      "\"", true); //$NON-NLS-1$
        boolean withinQuotation = false;
        String quotedString = ""; //$NON-NLS-1$
        int termCount = 0; // keep track of number of terms to disallow too many

        int fromIndex = -1;
        searchWords = searchWords.trim();
        while ((fromIndex = searchWords.indexOf("\"", fromIndex + 1)) != -1) { //$NON-NLS-1$
            withinQuotation = !withinQuotation;
        }
        if (withinQuotation) {
            // unbalanced quote: close the quotation at the end of the query
            searchWords = searchWords + "\""; //$NON-NLS-1$
            withinQuotation = !withinQuotation;
        }

        StringTokenizer qTokenizer = new StringTokenizer(searchWords, "\"", true); //$NON-NLS-1$
        int orCount = 0; // keep track of number of ORs to disallow too many
        while (qTokenizer.hasMoreTokens()) {
            String curToken = qTokenizer.nextToken();
            if (curToken.equals("\"")) { //$NON-NLS-1$
                if (withinQuotation) {
                    // check for too many terms
                    if (BaseHelpSystem.getMode() == BaseHelpSystem.MODE_INFOCENTER
                            && ++termCount > MAX_TERMS) {
                        throw new QueryTooComplexException();
                    }
                    tokenList.add(QueryWordsToken.exactPhrase(quotedString));
                } else {
                    quotedString = ""; //$NON-NLS-1$
                }
                withinQuotation = !withinQuotation;
                continue;
            } else if (withinQuotation) {
                quotedString = curToken;
                continue;
            } else {
                // divide unquoted strings along white space
                StringTokenizer parser = new StringTokenizer(curToken.trim());
                while (parser.hasMoreTokens()) {
                    String token = parser.nextToken();
                    if (token.equalsIgnoreCase(QueryWordsToken.AND().value)) {
                        tokenList.add(QueryWordsToken.AND());
                    } else if (token.equalsIgnoreCase(QueryWordsToken.OR().value)) {
                        // Check for too many OR terms
                        if (BaseHelpSystem.getMode() == BaseHelpSystem.MODE_INFOCENTER
                                && ++orCount > MAX_UNIONS) {
                            throw new QueryTooComplexException();
                        }
                        tokenList.add(QueryWordsToken.OR());
                    } else if (token.equalsIgnoreCase(QueryWordsToken.NOT().value)) {
                        tokenList.add(QueryWordsToken.NOT());
                    } else {
                        // check for too many terms
                        if (BaseHelpSystem.getMode() == BaseHelpSystem.MODE_INFOCENTER
                                && ++termCount > MAX_TERMS) {
                            throw new QueryTooComplexException();
                        }
                        tokenList.add(QueryWordsToken.word(token));
                    }
                }
            }
        }
        return tokenList;
    }
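    // Illustrative sketch (added, not part of the original source): for the user query
    //   web "application server" OR tomcat
    // tokenizeUserQuery returns roughly
    //   [ WORD(web), EXACT_PHRASE(application server), OR, WORD(tomcat) ]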
    /**
     * Apply the Analyzer to the search tokens and return the list of processed
     * QueryWordsToken's.
     */
    private List analyzeTokens(List tokens) {
        boolean isTokenAfterNot = false;
        List newTokens = new ArrayList();
        int wildCardTermCount = 0;
        for (int i = 0; i < tokens.size(); i++) {
            QueryWordsToken token = (QueryWordsToken) tokens.get(i);
            if (token.type == QueryWordsToken.WORD) {
                int questionMIndex = token.value.indexOf('?');
                int starIndex = token.value.indexOf('*');
                if (starIndex >= 0 || questionMIndex >= 0) {
                    if (BaseHelpSystem.getMode() == BaseHelpSystem.MODE_INFOCENTER
                            && ++wildCardTermCount > MAX_WILD_TERMS) {
                        throw new QueryTooComplexException();
                    }
                    if (questionMIndex != 0 && starIndex != 0) {
                        newTokens.add(QueryWordsToken.word(token.value.toLowerCase(locale)));
                        // add word to the list of words to highlight
                        if (!isTokenAfterNot && !highlightWords.contains(token.value)) {
                            highlightWords.add(token.value);
                        }
                    } else {
                        // wild card not allowed as the first character
                    }
                } else {
                    List wordList = analyzeText(analyzer, "contents", //$NON-NLS-1$
                            token.value);
                    if (wordList.size() > 0) {
                        if (!isTokenAfterNot && !highlightWords.contains(token.value)) {
                            // add original word to the list of words to highlight
                            highlightWords.add(token.value);
                        }
                        if (wordList.size() == 1) {
                            String word = (String) wordList.get(0);
                            newTokens.add(QueryWordsToken.word(word));
                            // add analyzed word to the list of words to highlight;
                            // this is required to highlight stemmed words
                            if (!isTokenAfterNot && !highlightWords.contains(word)) {
                                highlightWords.add(word);
                            }
                        } else {
                            QueryWordsPhrase phrase = QueryWordsToken.phrase();
                            for (Iterator it = wordList.iterator(); it.hasNext();) {
                                String word = (String) it.next();
                                phrase.addWord(word);
                                // add each analyzed word to the list of words to highlight;
                                // this is only required to highlight stemmed words.
                                // Adding words should not be done when DefaultAnalyzer is
                                // used, because it does not perform stemming and common
                                // words removal, which would result in common characters
                                // highlighted all over (bug 30263)
                                if (!analyzerDesc.getId().startsWith(
                                        HelpBasePlugin.PLUGIN_ID + "#")) { //$NON-NLS-1$
                                    if (!isTokenAfterNot && !highlightWords.contains(word)) {
                                        highlightWords.add(word);
                                    }
                                }
                            }
                            newTokens.add(phrase);
                        }
                    }
                }
            } else if (// forget ANDs
            /*
             * token.type == SearchQueryToken.AND ||
             */
            token.type == QueryWordsToken.OR
                    || token.type == QueryWordsToken.NOT)
                newTokens.add(token);
            else if (token.type == QueryWordsToken.EXACT_PHRASE) {
                List wordList = analyzeText(analyzer, "exact_contents", //$NON-NLS-1$
                        token.value);
                if (wordList.size() > 0) {
                    if (!isTokenAfterNot && !highlightWords.contains(token.value)) {
                        // add original word to the list of words to highlight
                        highlightWords.add(token.value);
                    }
                }
                QueryWordsExactPhrase phrase = QueryWordsToken.exactPhrase();
                for (Iterator it = wordList.iterator(); it.hasNext();) {
                    String word = (String) it.next();
                    phrase.addWord(word);
                    // add analyzed word to the list of words to highlight
                    // if (!highlightWords.contains(word))
                    //     highlightWords.add(word);
                }
                // add phrase only if not empty
                if (phrase.getWords().size() > 0) {
                    newTokens.add(phrase);
                }
            }
            isTokenAfterNot = (token.type == QueryWordsToken.NOT);
        }
        return newTokens;
    }
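    // Illustrative sketch (added; the exact output depends on the configured analyzer):
    // with an English stemming analyzer, the word token "installing" would typically be
    // analyzed to "install", and both forms end up in highlightWords so that stemmed
    // matches can be highlighted. A wildcard token such as "instal*" bypasses analysis
    // and is only lowercased.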
    /**
     * Get a list of tokens corresponding to a search word or phrase.
     *
     * @return List of String
     */
    private List analyzeText(Analyzer analyzer, String fieldName, String text) {
        List words = new ArrayList(1);
        Reader reader = new StringReader(text);
        TokenStream tStream = analyzer.tokenStream(fieldName, reader);
        Token tok;
        try {
            while (null != (tok = tStream.next())) {
                words.add(tok.termText());
            }
            reader.close();
        } catch (IOException ioe) {
            // ignore and return the words collected so far
        }
        return words;
    }
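    // Note (added): TokenStream.next() and Token.termText() belong to the token-iteration
    // API of the Lucene 1.x/2.x line this class was written against; later Lucene releases
    // moved to attribute-based iteration. Illustratively, analyzeText(analyzer, "contents",
    // "Web Applications") might return ["web", "application"], depending on the analyzer.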
    /**
     * Obtains Lucene Query from tokens.
     *
     * @return Query or null if no query could be created
     */
    private Query createLuceneQuery(List searchTokens, String[] fieldNames,
            float[] boosts) {
        // Get queries for parts separated by OR
        List requiredQueries = getRequiredQueries(searchTokens, fieldNames, boosts);
        if (requiredQueries.size() == 0)
            return null;
        else if (requiredQueries.size() <= 1)
            return (Query) requiredQueries.get(0);
        else
            /* if (requiredQueries.size() > 1) */
            // OR queries
            return (orQueries(requiredQueries));
    }
    /**
     * Obtains Lucene queries for token sequences separated at OR.
     *
     * @return List of Query (could be empty)
     */
    private List getRequiredQueries(List tokens, String[] fieldNames,
            float[] boosts) {
        List oredQueries = new ArrayList();
        ArrayList requiredQueryTokens = new ArrayList();
        for (int i = 0; i < tokens.size(); i++) {
            QueryWordsToken token = (QueryWordsToken) tokens.get(i);
            if (token.type != QueryWordsToken.OR) {
                requiredQueryTokens.add(token);
            } else {
                Query reqQuery = getRequiredQuery(requiredQueryTokens,
                        fieldNames, boosts);
                if (reqQuery != null)
                    oredQueries.add(reqQuery);
                requiredQueryTokens = new ArrayList();
            }
        }
        Query reqQuery = getRequiredQuery(requiredQueryTokens, fieldNames, boosts);
        if (reqQuery != null)
            oredQueries.add(reqQuery);
        return oredQueries;
    }
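    // Illustrative sketch (added): for the token sequence [ WORD(a), WORD(b), OR, WORD(c) ]
    // the list is split at OR, so getRequiredQuery is called once for [a, b] and once for
    // [c]; the two resulting queries are then OR-ed together by orQueries().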
    private Query orQueries(Collection queries) {
        BooleanQuery bq = new BooleanQuery();
        for (Iterator it = queries.iterator(); it.hasNext();) {
            Query q = (Query) it.next();
            bq.add(q, BooleanClause.Occur.SHOULD);
        }
        return bq;
    }
    /**
     * Obtains Lucene Query for tokens containing only AND and NOT operators.
     *
     * @return BooleanQuery or null if no query could be created from the tokens
     */
    private Query getRequiredQuery(List requiredTokens, String[] fieldNames,
            float[] boosts) {
        BooleanQuery retQuery = new BooleanQuery();
        boolean requiredTermExist = false;
        // Parse tokens left to right
        QueryWordsToken operator = null;
        for (int i = 0; i < requiredTokens.size(); i++) {
            QueryWordsToken token = (QueryWordsToken) requiredTokens.get(i);
            if (token.type == QueryWordsToken.AND
                    || token.type == QueryWordsToken.NOT) {
                operator = token;
                continue;
            }
            // Create queries for all fields
            Query qs[] = new Query[fieldNames.length];
            for (int f = 0; f < fieldNames.length; f++) {
                qs[f] = token.createLuceneQuery(fieldNames[f], boosts[f]);
            }
            // Create the boolean query over all fields
            Query q = qs[0];
            if (fieldNames.length > 1) {
                BooleanQuery allFieldsQuery = new BooleanQuery();
                for (int f = 0; f < fieldNames.length; f++)
                    allFieldsQuery.add(qs[f], BooleanClause.Occur.SHOULD);
                q = allFieldsQuery;
            }
            if (operator != null && operator.type == QueryWordsToken.NOT) {
                retQuery.add(q, BooleanClause.Occur.MUST_NOT); // add as prohibited
            } else {
                retQuery.add(q, BooleanClause.Occur.MUST); // add as required
                requiredTermExist = true;
            }
        }
        if (!requiredTermExist) {
            return null; // cannot search for prohibited terms only
        }
        return retQuery;
    }
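    // Illustrative sketch (added): for tokens [ WORD(a), NOT, WORD(b) ] and fields
    // {"h1", "contents"}, each word expands to a SHOULD-combined per-field query, and the
    // result is roughly
    //   +(h1:a contents:a) -(h1:b contents:b)
    // If every clause is prohibited (e.g. [ NOT, WORD(b) ]), null is returned.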
    private Query getLuceneQuery(String[] fieldNames, float[] boosts) {
        Query luceneQuery = createLuceneQuery(analyzedTokens, fieldNames, boosts);
        return luceneQuery;
    }
    /**
     * @param fieldNames -
     *            Collection of field names of type String (e.g. "h1"); the
     *            search will be performed on the given fields
     * @param fieldSearchOnly -
     *            boolean indicating if field-only search should be performed;
     *            if set to false, the default field "contents" and all other
     *            fields will be searched
     */
    public Query getLuceneQuery(Collection fieldNames, boolean fieldSearchOnly)
            throws QueryTooComplexException {
        // split search query into tokens
        List userTokens = tokenizeUserQuery(searchWords);
        analyzedTokens = analyzeTokens(userTokens);
        return buildLuceneQuery(fieldNames, fieldSearchOnly);
    }
    /**
     * @param fieldNames -
     *            Collection of field names of type String (e.g. "h1"); the
     *            search will be performed on the given fields
     * @param fieldSearchOnly -
     *            boolean indicating if field-only search should be performed;
     *            if set to false, the default field "contents" and all other
     *            fields will be searched
     */
    private Query buildLuceneQuery(Collection fieldNames,
            boolean fieldSearchOnly) {
        String[] fields;
        float[] boosts;
        if (fieldSearchOnly) {
            fields = new String[fieldNames.size()];
            boosts = new float[fieldNames.size()];
            Iterator fieldNamesIt = fieldNames.iterator();
            for (int i = 0; i < fieldNames.size(); i++) {
                fields[i] = (String) fieldNamesIt.next();
                boosts[i] = 5.0f;
            }
        } else {
            fields = new String[fieldNames.size() + 1];
            boosts = new float[fieldNames.size() + 1];
            Iterator fieldNamesIt = fieldNames.iterator();
            for (int i = 0; i < fieldNames.size(); i++) {
                fields[i] = (String) fieldNamesIt.next();
                boosts[i] = 5.0f;
            }
            fields[fieldNames.size()] = "contents"; //$NON-NLS-1$
            boosts[fieldNames.size()] = 1.0f;
        }
        Query query = getLuceneQuery(fields, boosts);
        query = improveRankingForUnqotedPhrase(query, fields, boosts);
        return query;
    }
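    // Illustrative sketch (added): for fieldNames = {"h1"} and fieldSearchOnly = false,
    // the arrays become fields = {"h1", "contents"} and boosts = {5.0f, 1.0f}, i.e.
    // matches in the explicitly listed fields are weighted five times higher than matches
    // in the default "contents" field.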
    /**
     * If the user query contained only words (no quotations or operators), extends
     * the query with a term phrase representing the entire user query, i.e. for the
     * user string a b, the query a AND b will be extended to "a b" OR (a AND b).
     */
    private Query improveRankingForUnqotedPhrase(Query query, String[] fields,
            float[] boosts) {
        if (query == null)
            return query;
        // check if all tokens are words
        for (int i = 0; i < analyzedTokens.size(); i++)
            if (((QueryWordsToken) analyzedTokens.get(i)).type != QueryWordsToken.WORD)
                return query;
        // Create phrase query for all tokens and OR with original query
        BooleanQuery booleanQuery = new BooleanQuery();
        booleanQuery.add(query, BooleanClause.Occur.SHOULD);
        PhraseQuery[] phraseQueries = new PhraseQuery[fields.length];
        for (int f = 0; f < fields.length; f++) {
            phraseQueries[f] = new PhraseQuery();
            for (int i = 0; i < analyzedTokens.size(); i++) {
                Term t = new Term(fields[f], ((QueryWordsToken) analyzedTokens.get(i)).value);
                phraseQueries[f].add(t);
            }
            phraseQueries[f].setBoost(10 * boosts[f]);
            booleanQuery.add(phraseQueries[f], BooleanClause.Occur.SHOULD);
        }
        return booleanQuery;
    }
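    // Illustrative sketch (added): for the analyzed tokens [ web, application ] and fields
    // {"h1", "contents"}, the original boolean query is OR-ed with the phrase queries
    // h1:"web application" and contents:"web application", each boosted to 10 times its
    // field boost, so documents containing the words as a phrase rank above documents
    // that merely contain both words.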
    /**
     * Obtains the analyzed terms from the query as one string. Words are double
     * quoted and separated by spaces. The analyzed words are needed for
     * highlighting word roots.
     */
    public String gethighlightTerms() {
        StringBuffer buf = new StringBuffer();
        for (Iterator it = highlightWords.iterator(); it.hasNext();) {
            buf.append('"');
            buf.append(it.next());
            buf.append("\" "); //$NON-NLS-1$
        }
        return buf.toString();
    }
}
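// Usage sketch (added for illustration, not part of the original file). It assumes
// AnalyzerDescriptor can be constructed from a locale string such as "en_US", as it is
// elsewhere in org.eclipse.help.internal.search; adjust to the actual constructor.
//
//     AnalyzerDescriptor desc = new AnalyzerDescriptor("en_US");
//     QueryBuilder builder = new QueryBuilder("\"context help\" OR infopop", desc);
//     List fieldNames = new ArrayList();
//     fieldNames.add("h1"); // also searches the default "contents" field below
//     Query query = builder.getLuceneQuery(fieldNames, false);
//     String termsToHighlight = builder.gethighlightTerms();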