package org.eclipse.help.internal.search;

import java.io.*;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;

import org.apache.lucene.analysis.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.eclipse.help.internal.base.*;

/**
 * Builds a Lucene {@link Query} from a user-entered search expression.
 * <p>
 * The raw query string is first tokenized (honoring quoted phrases and the
 * AND/OR/NOT operators), then each token is passed through the configured
 * {@link Analyzer}, and finally the analyzed tokens are assembled into a
 * boolean query over the requested index fields. Words that should be
 * highlighted in result documents are collected as a side effect.
 * <p>
 * In infocenter mode the query complexity is capped (number of terms, OR
 * unions, and wildcard terms) and {@link QueryTooComplexException} is thrown
 * when a cap is exceeded, to protect a shared server from expensive queries.
 */
public class QueryBuilder {
	/** Maximum number of search terms allowed in infocenter mode. */
	private static final int MAX_TERMS = 10;
	/** Maximum number of OR-separated unions allowed in infocenter mode. */
	private static final int MAX_UNIONS = 4;
	/** Maximum number of wildcard ('*' or '?') terms allowed in infocenter mode. */
	private static final int MAX_WILD_TERMS = 2;
	/** The raw user query string as passed to the constructor. */
	private String searchWords;
	/** Descriptor identifying which analyzer produced the tokens. */
	private AnalyzerDescriptor analyzerDesc;
	/** Analyzer used to process both words and quoted phrases. */
	private Analyzer analyzer;
	/** Tokens (raw List of QueryWordsToken) after analysis; set by getLuceneQuery. */
	private List analyzedTokens;
	/** Words to highlight in matched documents (raw List of String). */
	private List highlightWords = new ArrayList();
	/** Locale derived from the analyzer's language, used for lowercasing. */
	private Locale locale;

	/**
	 * Creates a query builder for the given search expression.
	 *
	 * @param searchWords the user search expression, may contain quoted
	 *            phrases and AND/OR/NOT operators
	 * @param analyzerDesc descriptor supplying the analyzer and its language
	 *            code (e.g. "en" or "en_US"; a 5+ character code is assumed
	 *            to have a country part at positions 3-4)
	 */
	public QueryBuilder(String searchWords, AnalyzerDescriptor analyzerDesc) {
		this.searchWords = searchWords;
		String language = analyzerDesc.getLang();
		if (language.length() >= 5) {
			// "ll_CC" style code: split into language and country parts.
			this.locale = new Locale(language.substring(0, 2), language
					.substring(3, 5));
		} else {
			this.locale = new Locale(language.substring(0, 2), "");
		}
		this.analyzerDesc = analyzerDesc;
		this.analyzer = analyzerDesc.getAnalyzer();
	}

	/**
	 * Splits the user query into a list of QueryWordsToken, recognizing
	 * double-quoted phrases and the AND/OR/NOT keywords (case-insensitive).
	 * An unbalanced quote is repaired by appending a closing quote.
	 *
	 * @param searchWords the raw user query
	 * @return raw List of QueryWordsToken in input order
	 * @throws QueryTooComplexException in infocenter mode when the term or
	 *             union caps are exceeded
	 */
	private List tokenizeUserQuery(String searchWords) {
		List tokenList = new ArrayList();
		boolean withinQuotation = false;
		String quotedString = "";
		int termCount = 0;

		// Count quotes; if unbalanced, append a closing quote so the
		// tokenizer below always sees matched pairs.
		int fromIndex = -1;
		searchWords = searchWords.trim();
		while ((fromIndex = searchWords.indexOf("\"", fromIndex + 1)) != -1) {
			withinQuotation = !withinQuotation;
		}
		if (withinQuotation) {
			searchWords = searchWords + "\"";
			withinQuotation = !withinQuotation;
		}

		// Return the quote characters as tokens so phrase boundaries are seen.
		StringTokenizer qTokenizer = new StringTokenizer(searchWords, "\"", true);
		int orCount = 0; // keeps track of number of OR-separated unions
		while (qTokenizer.hasMoreTokens()) {
			String curToken = qTokenizer.nextToken();
			if (curToken.equals("\"")) {
				if (withinQuotation) {
					// Closing quote: emit the accumulated phrase.
					if (BaseHelpSystem.getMode() == BaseHelpSystem.MODE_INFOCENTER
							&& ++termCount > MAX_TERMS) {
						throw new QueryTooComplexException();
					}
					tokenList.add(QueryWordsToken.exactPhrase(quotedString));
				} else {
					// Opening quote: start accumulating a phrase.
					quotedString = "";
				}
				withinQuotation = !withinQuotation;
				continue;
			} else if (withinQuotation) {
				quotedString = curToken;
				continue;
			} else {
				// Unquoted text: split on whitespace and classify each word.
				StringTokenizer parser = new StringTokenizer(curToken.trim());
				while (parser.hasMoreTokens()) {
					String token = parser.nextToken();
					if (token.equalsIgnoreCase(QueryWordsToken.AND().value)) {
						tokenList.add(QueryWordsToken.AND());
					} else if (token
							.equalsIgnoreCase(QueryWordsToken.OR().value)) {
						if (BaseHelpSystem.getMode() == BaseHelpSystem.MODE_INFOCENTER
								&& ++orCount > MAX_UNIONS) {
							throw new QueryTooComplexException();
						}
						tokenList.add(QueryWordsToken.OR());
					} else if (token
							.equalsIgnoreCase(QueryWordsToken.NOT().value)) {
						tokenList.add(QueryWordsToken.NOT());
					} else {
						if (BaseHelpSystem.getMode() == BaseHelpSystem.MODE_INFOCENTER
								&& ++termCount > MAX_TERMS) {
							throw new QueryTooComplexException();
						}
						tokenList.add(QueryWordsToken.word(token));
					}
				}
			}
		}
		return tokenList;
	}

	/**
	 * Runs each WORD and EXACT_PHRASE token through the analyzer, producing a
	 * new token list. Wildcard words ('*' or '?') are lowercased instead of
	 * analyzed; a word that the analyzer splits into several terms becomes a
	 * phrase. Tokens immediately following NOT are excluded from the
	 * highlight-word list.
	 *
	 * @param tokens raw List of QueryWordsToken from tokenizeUserQuery
	 * @return raw List of analyzed QueryWordsToken
	 * @throws QueryTooComplexException in infocenter mode when the wildcard
	 *             cap is exceeded
	 */
	private List analyzeTokens(List tokens) {
		boolean isTokenAfterNot = false;
		List newTokens = new ArrayList();
		int wildCardTermCount = 0;
		for (int i = 0; i < tokens.size(); i++) {
			QueryWordsToken token = (QueryWordsToken) tokens.get(i);
			if (token.type == QueryWordsToken.WORD) {
				int questionMIndex = token.value.indexOf('?');
				int starIndex = token.value.indexOf('*');
				if (starIndex >= 0 || questionMIndex >= 0) {
					// Wildcard term: not analyzed, just lowercased.
					if (BaseHelpSystem.getMode() == BaseHelpSystem.MODE_INFOCENTER
							&& ++wildCardTermCount > MAX_WILD_TERMS) {
						throw new QueryTooComplexException();
					}
					// A leading wildcard is dropped entirely (too expensive).
					if (questionMIndex != 0 && starIndex != 0) {
						newTokens.add(QueryWordsToken.word(token.value
								.toLowerCase(locale)));
						if (!isTokenAfterNot && !highlightWords.contains(token.value)) {
							highlightWords.add(token.value);
						}
					}
				} else {
					List wordList = analyzeText(analyzer, "contents",
							token.value);
					if (wordList.size() > 0) {
						// Highlight the original (pre-analysis) form too.
						if (!isTokenAfterNot && !highlightWords.contains(token.value)) {
							highlightWords.add(token.value);
						}
						if (wordList.size() == 1) {
							String word = (String) wordList.get(0);
							newTokens.add(QueryWordsToken.word(word));
							if (!isTokenAfterNot && !highlightWords.contains(word)) {
								highlightWords.add(word);
							}
						} else {
							// The analyzer split one word into several terms;
							// search them as a phrase.
							QueryWordsPhrase phrase = QueryWordsToken.phrase();
							for (Iterator it = wordList.iterator(); it
									.hasNext();) {
								String word = (String) it.next();
								phrase.addWord(word);
								// Only highlight analyzed sub-terms when a
								// non-default (third-party) analyzer is used.
								if (!analyzerDesc.getId().startsWith(
										HelpBasePlugin.PLUGIN_ID + "#")) {
									if (!isTokenAfterNot && !highlightWords.contains(word)) {
										highlightWords.add(word);
									}
								}
							}
							newTokens.add(phrase);
						}
					}
				}
			} else if (token.type == QueryWordsToken.AND
					|| token.type == QueryWordsToken.OR
					|| token.type == QueryWordsToken.NOT) {
				// Operators pass through unchanged.
				newTokens.add(token);
			} else if (token.type == QueryWordsToken.EXACT_PHRASE) {
				List wordList = analyzeText(analyzer, "exact_contents",
						token.value);
				if (wordList.size() > 0) {
					if (!isTokenAfterNot && !highlightWords.contains(token.value)) {
						highlightWords.add(token.value);
					}
				}
				QueryWordsExactPhrase phrase = QueryWordsToken.exactPhrase();
				for (Iterator it = wordList.iterator(); it.hasNext();) {
					String word = (String) it.next();
					phrase.addWord(word);
				}
				// Skip phrases the analyzer reduced to nothing (e.g. all
				// stop words).
				if (phrase.getWords().size() > 0) {
					newTokens.add(phrase);
				}
			}
			isTokenAfterNot = (token.type == QueryWordsToken.NOT);
		}
		return newTokens;
	}

	/**
	 * Analyzes a piece of text with the given analyzer and collects the
	 * resulting term texts.
	 *
	 * @param analyzer the analyzer to use
	 * @param fieldName the index field the text is analyzed for
	 * @param text the text to analyze
	 * @return raw List of String terms; empty if analysis produced nothing
	 *         or failed with an I/O error
	 */
	private List analyzeText(Analyzer analyzer, String fieldName, String text) {
		List words = new ArrayList(1);
		Reader reader = new StringReader(text);
		try {
			TokenStream tStream = analyzer.tokenStream(fieldName, reader);
			Token tok;
			while (null != (tok = tStream.next())) {
				words.add(tok.termText());
			}
		} catch (IOException ioe) {
			// Reading from an in-memory StringReader should not fail; a
			// misbehaving analyzer degrades to an empty term list.
		} finally {
			// Always release the reader, even when next() throws
			// (previously the close was skipped on exception).
			try {
				reader.close();
			} catch (IOException ignored) {
			}
		}
		return words;
	}

	/**
	 * Obtains a Lucene query for the analyzed tokens over the given fields.
	 *
	 * @param searchTokens raw List of analyzed QueryWordsToken
	 * @param fieldNames index fields to search
	 * @param boosts per-field boost factors, parallel to fieldNames
	 * @return the query, or null when no required terms exist
	 */
	private Query createLuceneQuery(List searchTokens, String[] fieldNames,
			float[] boosts) {
		// Each element corresponds to one OR-separated group of tokens.
		List requiredQueries = getRequiredQueries(searchTokens, fieldNames,
				boosts);
		if (requiredQueries.size() == 0)
			return null;
		else if (requiredQueries.size() <= 1)
			return (Query) requiredQueries.get(0);
		else
			// OR the groups together.
			return (orQueries(requiredQueries));
	}

	/**
	 * Splits the token list at OR operators and builds one required
	 * (AND/NOT) query per group.
	 *
	 * @param tokens raw List of analyzed QueryWordsToken
	 * @param fieldNames index fields to search
	 * @param boosts per-field boost factors, parallel to fieldNames
	 * @return raw List of Query, one per non-empty OR group
	 */
	private List getRequiredQueries(List tokens, String[] fieldNames,
			float[] boosts) {
		List oredQueries = new ArrayList();
		ArrayList requiredQueryTokens = new ArrayList();
		for (int i = 0; i < tokens.size(); i++) {
			QueryWordsToken token = (QueryWordsToken) tokens.get(i);
			if (token.type != QueryWordsToken.OR) {
				requiredQueryTokens.add(token);
			} else {
				// OR closes the current group.
				Query reqQuery = getRequiredQuery(requiredQueryTokens,
						fieldNames, boosts);
				if (reqQuery != null)
					oredQueries.add(reqQuery);
				requiredQueryTokens = new ArrayList();
			}
		}
		// Flush the trailing group.
		Query reqQuery = getRequiredQuery(requiredQueryTokens, fieldNames,
				boosts);
		if (reqQuery != null)
			oredQueries.add(reqQuery);
		return oredQueries;
	}

	/**
	 * Combines the given queries with SHOULD (boolean OR) semantics.
	 *
	 * @param queries Collection of Query
	 * @return a BooleanQuery matching any of the inputs
	 */
	private Query orQueries(Collection queries) {
		BooleanQuery bq = new BooleanQuery();
		for (Iterator it = queries.iterator(); it.hasNext();) {
			Query q = (Query) it.next();
			bq.add(q, BooleanClause.Occur.SHOULD);
		}
		return bq;
	}

	/**
	 * Builds a boolean query in which every non-NOT token is required
	 * (MUST) and every token following NOT is prohibited (MUST_NOT).
	 * Each token is searched across all fields (SHOULD across fields).
	 *
	 * @param requiredTokens raw List of QueryWordsToken without OR operators
	 * @param fieldNames index fields to search
	 * @param boosts per-field boost factors, parallel to fieldNames
	 * @return the query, or null when no required (MUST) term exists —
	 *         a query of only prohibitions would match everything else
	 */
	private Query getRequiredQuery(List requiredTokens, String[] fieldNames,
			float[] boosts) {
		BooleanQuery retQuery = new BooleanQuery();
		boolean requiredTermExist = false;
		// Operator (AND or NOT) applying to the next term; AND is the default.
		QueryWordsToken operator = null;
		for (int i = 0; i < requiredTokens.size(); i++) {
			QueryWordsToken token = (QueryWordsToken) requiredTokens.get(i);
			if (token.type == QueryWordsToken.AND
					|| token.type == QueryWordsToken.NOT) {
				operator = token;
				continue;
			}
			// Create a query for the token in each field.
			Query qs[] = new Query[fieldNames.length];
			for (int f = 0; f < fieldNames.length; f++) {
				qs[f] = token.createLuceneQuery(fieldNames[f], boosts[f]);
			}
			// Combine the per-field queries (SHOULD) when searching
			// multiple fields.
			Query q = qs[0];
			if (fieldNames.length > 1) {
				BooleanQuery allFieldsQuery = new BooleanQuery();
				for (int f = 0; f < fieldNames.length; f++)
					allFieldsQuery.add(qs[f], BooleanClause.Occur.SHOULD);
				q = allFieldsQuery;
			}
			if (operator != null && operator.type == QueryWordsToken.NOT) {
				retQuery.add(q, BooleanClause.Occur.MUST_NOT);
			} else {
				retQuery.add(q, BooleanClause.Occur.MUST);
				requiredTermExist = true;
			}
		}
		if (!requiredTermExist) {
			return null;
		}
		return retQuery;
	}

	/**
	 * Builds the query for the already-analyzed tokens over the given
	 * fields with the given boosts.
	 */
	private Query getLuceneQuery(String[] fieldNames, float[] boosts) {
		Query luceneQuery = createLuceneQuery(analyzedTokens, fieldNames,
				boosts);
		return luceneQuery;
	}

	/**
	 * Obtains the Lucene query for the search expression this builder was
	 * constructed with.
	 *
	 * @param fieldNames Collection of String field names to search
	 * @param fieldSearchOnly when true, only the given fields are searched;
	 *            when false, the "contents" field is searched as well
	 * @return the query, or null when the expression contains no searchable
	 *         terms
	 * @throws QueryTooComplexException in infocenter mode when complexity
	 *             caps are exceeded
	 */
	public Query getLuceneQuery(Collection fieldNames, boolean fieldSearchOnly)
			throws QueryTooComplexException {
		// Tokenize and analyze once; analyzedTokens is reused by
		// improveRankingForUnqotedPhrase.
		List userTokens = tokenizeUserQuery(searchWords);
		analyzedTokens = analyzeTokens(userTokens);
		return buildLuceneQuery(fieldNames, fieldSearchOnly);
	}

	/**
	 * Assembles the field/boost arrays and builds the final query. Named
	 * fields get a boost of 5.0; the implicit "contents" field (when not in
	 * field-only mode) gets 1.0.
	 *
	 * @param fieldNames Collection of String field names to search
	 * @param fieldSearchOnly whether to restrict the search to fieldNames
	 * @return the query, possibly ranking-improved for unquoted phrases,
	 *         or null when there is nothing to search
	 */
	private Query buildLuceneQuery(Collection fieldNames,
			boolean fieldSearchOnly) {
		String[] fields;
		float[] boosts;
		if (fieldSearchOnly) {
			fields = new String[fieldNames.size()];
			boosts = new float[fieldNames.size()];
			Iterator fieldNamesIt = fieldNames.iterator();
			for (int i = 0; i < fieldNames.size(); i++) {
				fields[i] = (String) fieldNamesIt.next();
				boosts[i] = 5.0f;
			}
		} else {
			// Reserve one extra slot for the default "contents" field.
			fields = new String[fieldNames.size() + 1];
			boosts = new float[fieldNames.size() + 1];
			Iterator fieldNamesIt = fieldNames.iterator();
			for (int i = 0; i < fieldNames.size(); i++) {
				fields[i] = (String) fieldNamesIt.next();
				boosts[i] = 5.0f;
			}
			fields[fieldNames.size()] = "contents";
			boosts[fieldNames.size()] = 1.0f;
		}
		Query query = getLuceneQuery(fields, boosts);
		query = improveRankingForUnqotedPhrase(query, fields, boosts);
		return query;
	}

	/**
	 * If the query consists purely of plain words (no phrases or
	 * operators), adds a boosted phrase query per field so that documents
	 * containing the words adjacent to each other rank higher.
	 *
	 * @param query the query built so far, may be null
	 * @param fields fields being searched
	 * @param boosts per-field boost factors, parallel to fields
	 * @return the original query, or a BooleanQuery wrapping it together
	 *         with boosted phrase queries
	 */
	private Query improveRankingForUnqotedPhrase(Query query, String[] fields,
			float[] boosts) {
		if (query == null)
			return query;
		// Applicable only when all tokens are plain WORDs.
		for (int i = 0; i < analyzedTokens.size(); i++)
			if (((QueryWordsToken) analyzedTokens.get(i)).type != QueryWordsToken.WORD)
				return query;
		// Keep the original query and add a strongly boosted (10x) phrase
		// query for each field.
		BooleanQuery booleanQuery = new BooleanQuery();
		booleanQuery.add(query, BooleanClause.Occur.SHOULD);
		PhraseQuery[] phraseQueries = new PhraseQuery[fields.length];
		for (int f = 0; f < fields.length; f++) {
			phraseQueries[f] = new PhraseQuery();
			for (int i = 0; i < analyzedTokens.size(); i++) {
				Term t = new Term(fields[f], ((QueryWordsToken) analyzedTokens
						.get(i)).value);
				phraseQueries[f].add(t);
			}
			phraseQueries[f].setBoost(10 * boosts[f]);
			booleanQuery.add(phraseQueries[f], BooleanClause.Occur.SHOULD);
		}
		return booleanQuery;
	}

	/**
	 * Returns the words collected for highlighting, each wrapped in double
	 * quotes and separated by spaces (trailing space included).
	 * Must be called after {@link #getLuceneQuery(Collection, boolean)}.
	 *
	 * @return quoted, space-separated highlight words; empty string if none
	 */
	public String gethighlightTerms() {
		StringBuffer buf = new StringBuffer();
		for (Iterator it = highlightWords.iterator(); it.hasNext();) {
			buf.append('"');
			buf.append(it.next());
			buf.append("\" ");
		}
		return buf.toString();
	}
}