package org.opencms.search.documents;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.PriorityQueue;

/**
 * Finds the text fragments that contain the most distinct query terms and
 * returns them with every matching term passed through a term highlighter.<p>
 *
 * The terms to look for are collected once, in the constructor, from the
 * boolean, phrase and term queries that make up the given query.<p>
 */
public final class CmsHighlightFinder {

    /** The analyzer used to tokenize the text that is searched for fragments. */
    private Analyzer m_analyzer;

    /** The highlighter applied to every token that matches a query term. */
    private I_CmsTermHighlighter m_highlighter;

    /** The query the terms were extracted from. */
    private Query m_query;

    /** The lower-cased term texts (Strings) extracted from the query. */
    private HashSet m_terms = new HashSet();

    /**
     * Creates a new highlight finder and extracts the non-prohibited terms of the query.<p>
     *
     * @param highlighter the highlighter applied to matching tokens
     * @param query the query whose terms are highlighted
     * @param analyzer the analyzer used to tokenize the text
     * @throws IOException declared by the term extraction
     */
    public CmsHighlightFinder(I_CmsTermHighlighter highlighter, Query query, Analyzer analyzer)
    throws IOException {

        this.m_highlighter = highlighter;
        this.m_query = query;
        this.m_analyzer = analyzer;
        getTerms(m_query, m_terms, false);
    }

    /**
     * Adds the term texts found in the given query to the given set.<p>
     *
     * Only boolean, phrase and term queries are inspected; every other
     * query type is ignored and contributes no terms.<p>
     *
     * @param query the query to extract the terms from
     * @param terms the set the lower-cased term texts are added to
     * @param prohibited if <code>true</code>, terms of prohibited boolean clauses are included as well
     * @throws IOException declared for the recursive extraction
     */
    public static void getTerms(Query query, HashSet terms, boolean prohibited) throws IOException {

        if (query instanceof BooleanQuery) {
            getTermsFromBooleanQuery((BooleanQuery)query, terms, prohibited);
        } else if (query instanceof PhraseQuery) {
            getTermsFromPhraseQuery((PhraseQuery)query, terms);
        } else if (query instanceof TermQuery) {
            getTermsFromTermQuery((TermQuery)query, terms);
        }
    }

    /**
     * Recurses into the clauses of a boolean query and collects their terms.<p>
     *
     * @param query the boolean query to inspect
     * @param terms the set the term texts are added to
     * @param prohibited if <code>true</code>, prohibited clauses are inspected as well
     * @throws IOException declared for the recursive extraction
     */
    private static void getTermsFromBooleanQuery(BooleanQuery query, HashSet terms, boolean prohibited)
    throws IOException {

        BooleanClause[] queryClauses = query.getClauses();
        for (int i = 0; i < queryClauses.length; i++) {
            // prohibited clauses are skipped unless the caller explicitly asked for them
            if (prohibited || !queryClauses[i].isProhibited()) {
                getTerms(queryClauses[i].getQuery(), terms, prohibited);
            }
        }
    }

    /**
     * Adds the text of every term of a phrase query to the given set.<p>
     *
     * @param query the phrase query to inspect
     * @param terms the set the term texts are added to
     */
    private static void getTermsFromPhraseQuery(PhraseQuery query, HashSet terms) {

        Term[] queryTerms = query.getTerms();
        for (int i = 0; i < queryTerms.length; i++) {
            terms.add(getTermsFromTerm(queryTerms[i]));
        }
    }

    /**
     * Returns the lower-cased text of the given term.<p>
     *
     * @param term the term to read
     * @return the term text converted with {@link String#toLowerCase()}
     */
    private static String getTermsFromTerm(Term term) {

        return term.text().toLowerCase();
    }

    /**
     * Adds the text of the single term of a term query to the given set.<p>
     *
     * @param query the term query to inspect
     * @param terms the set the term text is added to
     */
    private static void getTermsFromTermQuery(TermQuery query, HashSet terms) {

        terms.add(getTermsFromTerm(query.getTerm()));
    }

    /**
     * Returns the fragments of the given text that contain the most distinct query terms,
     * best fragment first, with matching terms replaced by their highlighted form.<p>
     *
     * The text is rebuilt token by token: any gap between two tokens is replaced by a single
     * space, tokens matching a query term are passed through the highlighter, and other tokens
     * longer than <code>fragmentSize / 2</code> are cut to that length. Text following the last
     * token is appended unchanged. The rebuilt text is split into fragments whenever its length
     * reaches the next fragment boundary.<p>
     *
     * @param text the text to search for fragments
     * @param fragmentSize the approximate fragment size in characters
     * @param maxNumFragments the maximum number of fragments to return
     * @return the selected fragments ordered from highest to lowest score,
     *         or an empty array if <code>maxNumFragments</code> is not positive
     * @throws IOException if reading from the token stream fails
     */
    public String[] getBestFragments(String text, int fragmentSize, int maxNumFragments) throws IOException {

        if (maxNumFragments <= 0) {
            // Nothing can be selected. Without this guard the selection loop pops the queue
            // empty and dereferences a null top() (or overflows a zero-capacity queue).
            return new String[0];
        }

        StringBuffer newText = new StringBuffer();
        TokenStream stream = null;

        ArrayList docFrags = new ArrayList();

        DocumentFragment currentFrag = new DocumentFragment(newText.length(), docFrags.size());
        docFrags.add(currentFrag);

        // one extra slot so a candidate can be inserted before the weakest entry is dropped
        FragmentQueue fragQueue = new FragmentQueue(maxNumFragments + 1);

        try {
            org.apache.lucene.analysis.Token token;
            String tokenText;
            int startOffset;
            int endOffset;
            int lastEndOffset = 0;

            stream = m_analyzer.tokenStream(null, new StringReader(text));
            while ((token = stream.next()) != null) {
                startOffset = token.startOffset();
                endOffset = token.endOffset();
                // use the original spelling from the text, not the analyzed term text
                tokenText = text.substring(startOffset, endOffset);

                if (startOffset > lastEndOffset) {
                    // whatever was skipped between two tokens collapses to one space
                    newText.append(" ");
                }

                if (m_terms.contains(token.termText())) {
                    newText.append(m_highlighter.highlightTerm(tokenText));
                    currentFrag.addTerm(token.termText());
                } else {
                    if (tokenText.length() > fragmentSize / 2) {
                        // cut overlong tokens so a single token cannot dominate a fragment
                        newText.append(tokenText.substring(0, fragmentSize / 2));
                        newText.append(" ");
                    } else {
                        newText.append(tokenText);
                    }
                }

                // The list already contains the current fragment, so the boundary is
                // (count + 1) * fragmentSize; the first fragment can therefore reach
                // about twice the fragment size.
                if (newText.length() >= (fragmentSize * (docFrags.size() + 1))) {
                    currentFrag.m_textEndPos = newText.length();
                    currentFrag = new DocumentFragment(newText.length(), docFrags.size());
                    docFrags.add(currentFrag);
                }

                lastEndOffset = endOffset;
            }

            if (lastEndOffset < text.length()) {
                // keep the text that follows the last token
                newText.append(text.substring(lastEndOffset));
            }

            currentFrag.m_textEndPos = newText.length();

            // keep only the maxNumFragments best fragments in the queue
            int minScore = 0;
            for (Iterator i = docFrags.iterator(); i.hasNext();) {
                currentFrag = (DocumentFragment)i.next();
                if (currentFrag.getScore() >= minScore) {
                    fragQueue.put(currentFrag);
                    if (fragQueue.size() > maxNumFragments) {
                        // drop the weakest entry and raise the bar to the new weakest score
                        fragQueue.pop();
                        minScore = ((DocumentFragment)fragQueue.top()).getScore();
                    }
                }
            }

            // the queue pops the weakest fragment first, so fill the result back to front
            String[] fragText = new String[fragQueue.size()];
            for (int i = fragText.length - 1; i >= 0; i--) {
                DocumentFragment frag = (DocumentFragment)fragQueue.pop();
                fragText[i] = newText.substring(frag.m_textStartPos, frag.m_textEndPos);
            }
            return fragText;

        } finally {
            if (stream != null) {
                try {
                    stream.close();
                } catch (Exception e) {
                    // Closing is best effort: a failure here must neither discard the
                    // computed result nor mask an exception thrown by the try block.
                }
            }
        }
    }

    /**
     * Returns the best fragments of the given text joined into one String.<p>
     *
     * @param text the text to search for fragments
     * @param fragmentSize the approximate fragment size in characters
     * @param maxNumFragments the maximum number of fragments to include
     * @param separator the String inserted between two fragments
     * @return the joined fragments, or an empty String if no fragment was selected
     * @throws IOException if reading from the token stream fails
     */
    public String getBestFragments(String text, int fragmentSize, int maxNumFragments, String separator)
    throws IOException {

        String[] sections = getBestFragments(text, fragmentSize, maxNumFragments);
        StringBuffer result = new StringBuffer();
        for (int i = 0; i < sections.length; i++) {
            if (i > 0) {
                result.append(separator);
            }
            result.append(sections[i]);
        }
        return result.toString();
    }
}

/**
 * A fragment of the rebuilt text together with the distinct query terms found in it.<p>
 */
class DocumentFragment {

    /** The position of this fragment in the list of fragments. */
    protected int m_fragNum;

    /** Not read or written in this file; the score is derived from the unique terms instead. */
    protected int m_score;

    /** The end position (exclusive) of this fragment in the rebuilt text. */
    protected int m_textEndPos;

    /** The start position of this fragment in the rebuilt text. */
    protected int m_textStartPos;

    /** The distinct query terms found in this fragment. */
    protected HashSet m_uniqueTerms = new HashSet();

    /**
     * Creates a new fragment.<p>
     *
     * @param textStartPos the start position of the fragment in the rebuilt text
     * @param fragNum the position of the fragment in the list of fragments
     */
    public DocumentFragment(int textStartPos, int fragNum) {

        this.m_textStartPos = textStartPos;
        this.m_fragNum = fragNum;
    }

    /**
     * Records a query term found in this fragment; repeated terms count only once.<p>
     *
     * @param term the term text to record
     */
    void addTerm(String term) {

        m_uniqueTerms.add(term);
    }

    /**
     * Returns the score of this fragment.<p>
     *
     * @return the number of distinct query terms recorded for this fragment
     */
    int getScore() {

        return m_uniqueTerms.size();
    }
}

/**
 * Priority queue that orders document fragments by score.<p>
 */
class FragmentQueue extends PriorityQueue {

    /**
     * Creates a new queue with the given capacity.<p>
     *
     * @param size the maximum number of fragments the queue can hold
     */
    public FragmentQueue(int size) {

        initialize(size);
    }

    /**
     * Compares two fragments by score; for equal scores the fragment with the
     * higher fragment number is considered the lesser one.<p>
     *
     * @param a the first fragment
     * @param b the second fragment
     * @return <code>true</code> if <code>a</code> ranks below <code>b</code>
     */
    public final boolean lessThan(Object a, Object b) {

        DocumentFragment fragA = (DocumentFragment)a;
        DocumentFragment fragB = (DocumentFragment)b;
        if (fragA.getScore() == fragB.getScore()) {
            return fragA.m_fragNum > fragB.m_fragNum;
        } else {
            return fragA.getScore() < fragB.getScore();
        }
    }
}