1 package org.apache.lucene.search.highlight; 2 17 18 import java.io.IOException ; 19 import java.util.ArrayList ; 20 import java.util.Iterator ; 21 import org.apache.lucene.analysis.TokenStream; 22 import org.apache.lucene.util.PriorityQueue; 23 24 28 public class Highlighter 29 { 30 31 public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024; 32 private int maxDocBytesToAnalyze=DEFAULT_MAX_DOC_BYTES_TO_ANALYZE; 33 private Formatter formatter; 34 private Fragmenter textFragmenter=new SimpleFragmenter(); 35 private Scorer fragmentScorer=null; 36 37 public Highlighter(Scorer fragmentScorer) 38 { 39 this(new SimpleHTMLFormatter(),fragmentScorer); 40 } 41 42 43 public Highlighter(Formatter formatter, Scorer fragmentScorer) 44 { 45 this.formatter = formatter; 46 this.fragmentScorer = fragmentScorer; 47 } 48 49 50 51 52 67 public final String getBestFragment(TokenStream tokenStream, String text) 68 throws IOException 69 { 70 String [] results = getBestFragments(tokenStream,text, 1); 71 if (results.length > 0) 72 { 73 return results[0]; 74 } 75 return null; 76 } 77 89 public final String [] getBestFragments( 90 TokenStream tokenStream, 91 String text, 92 int maxNumFragments) 93 throws IOException 94 { 95 maxNumFragments = Math.max(1, maxNumFragments); 97 TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments); 98 99 ArrayList <String > fragTexts = new ArrayList <String >(); 101 for (int i = 0; i < frag.length; i++) 102 { 103 if ((frag[i] != null) && (frag[i].getScore() > 0)) 104 { 105 fragTexts.add(frag[i].toString()); 106 } 107 } 108 return (String []) fragTexts.toArray(new String [0]); 109 } 110 111 112 123 public final TextFragment[] getBestTextFragments( 124 TokenStream tokenStream, 125 String text, 126 boolean mergeContiguousFragments, 127 int maxNumFragments) 128 throws IOException 129 { 130 ArrayList <TextFragment> docFrags = new ArrayList <TextFragment>(); 131 StringBuffer newText=new StringBuffer (); 132 133 TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size()); 134 fragmentScorer.startFragment(currentFrag); 135 docFrags.add(currentFrag); 136 137 FragmentQueue fragQueue = new FragmentQueue(maxNumFragments); 138 139 try 140 { 141 org.apache.lucene.analysis.Token token; 142 String tokenText; 143 int startOffset; 144 int endOffset; 145 int lastEndOffset = 0; 146 textFragmenter.start(text); 147 148 TokenGroup tokenGroup=new TokenGroup(); 149 150 while ((token = tokenStream.next()) != null) 151 { 152 if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token))) 153 { 154 startOffset = tokenGroup.startOffset; 157 endOffset = tokenGroup.endOffset; 158 tokenText = text.substring(startOffset, endOffset); 159 String markedUpText=formatter.highlightTerm(tokenText, tokenGroup); 160 if (startOffset > lastEndOffset) 162 newText.append(text.substring(lastEndOffset, startOffset)); 163 newText.append(markedUpText); 164 lastEndOffset=endOffset; 165 tokenGroup.clear(); 166 167 if(textFragmenter.isNewFragment(token)) 169 { 170 currentFrag.setScore(fragmentScorer.getFragmentScore()); 171 currentFrag.textEndPos = newText.length(); 173 currentFrag =new TextFragment(newText, newText.length(), docFrags.size()); 174 fragmentScorer.startFragment(currentFrag); 175 docFrags.add(currentFrag); 176 } 177 } 178 179 tokenGroup.addToken(token,fragmentScorer.getTokenScore(token)); 180 181 if(lastEndOffset>maxDocBytesToAnalyze) 182 { 183 break; 184 } 185 } 186 currentFrag.setScore(fragmentScorer.getFragmentScore()); 187 188 if(tokenGroup.numTokens>0) 189 { 190 startOffset = tokenGroup.startOffset; 192 endOffset = tokenGroup.endOffset; 193 tokenText = text.substring(startOffset, endOffset); 194 String markedUpText=formatter.highlightTerm(tokenText, tokenGroup); 195 if (startOffset > lastEndOffset) 197 newText.append(text.substring(lastEndOffset, startOffset)); 198 newText.append(markedUpText); 199 lastEndOffset=endOffset; 200 } 201 202 if (lastEndOffset < text.length()) 204 newText.append(text.substring(lastEndOffset)); 205 206 currentFrag.textEndPos = newText.length(); 207 208 for (Iterator i = docFrags.iterator(); i.hasNext();) 210 { 211 currentFrag = (TextFragment) i.next(); 212 213 228 fragQueue.insert(currentFrag); 232 } 233 234 TextFragment frag[] = new TextFragment[fragQueue.size()]; 236 for (int i = frag.length - 1; i >= 0; i--) 237 { 238 frag[i] = (TextFragment) fragQueue.pop(); 239 } 240 241 if(mergeContiguousFragments) 243 { 244 mergeContiguousFragments(frag); 245 ArrayList <TextFragment> fragTexts = new ArrayList <TextFragment>(); 246 for (int i = 0; i < frag.length; i++) 247 { 248 if ((frag[i] != null) && (frag[i].getScore() > 0)) 249 { 250 fragTexts.add(frag[i]); 251 } 252 } 253 frag= (TextFragment[]) fragTexts.toArray(new TextFragment[0]); 254 } 255 256 return frag; 257 258 } 259 finally 260 { 261 if (tokenStream != null) 262 { 263 try 264 { 265 tokenStream.close(); 266 } 267 catch (Exception e) 268 { 269 } 270 } 271 } 272 } 273 274 275 281 private void mergeContiguousFragments(TextFragment[] frag) 282 { 283 boolean mergingStillBeingDone; 284 if (frag.length > 1) 285 do 286 { 287 mergingStillBeingDone = false; for (int i = 0; i < frag.length; i++) 290 { 291 if (frag[i] == null) 292 { 293 continue; 294 } 295 for (int x = 0; x < frag.length; x++) 297 { 298 if (frag[x] == null) 299 { 300 continue; 301 } 302 if (frag[i] == null) 303 { 304 break; 305 } 306 TextFragment frag1 = null; 307 TextFragment frag2 = null; 308 int frag1Num = 0; 309 int frag2Num = 0; 310 int bestScoringFragNum; 311 int worstScoringFragNum; 312 if (frag[i].follows(frag[x])) 314 { 315 frag1 = frag[x]; 316 frag1Num = x; 317 frag2 = frag[i]; 318 frag2Num = i; 319 } 320 else 321 if (frag[x].follows(frag[i])) 322 { 323 frag1 = frag[i]; 324 frag1Num = i; 325 frag2 = frag[x]; 326 frag2Num = x; 327 } 328 if (frag1 != null) 330 { 331 if (frag1.getScore() > frag2.getScore()) 332 { 333 bestScoringFragNum = frag1Num; 334 worstScoringFragNum = frag2Num; 335 } 336 else 337 { 338 bestScoringFragNum = frag2Num; 339 worstScoringFragNum = frag1Num; 340 } 341 frag1.merge(frag2); 342 frag[worstScoringFragNum] = null; 343 mergingStillBeingDone = true; 344 frag[bestScoringFragNum] = frag1; 345 } 346 } 347 } 348 } 349 while (mergingStillBeingDone); 350 } 351 352 353 366 public final String getBestFragments( 367 TokenStream tokenStream, 368 String text, 369 int maxNumFragments, 370 String separator) 371 throws IOException 372 { 373 String sections[] = getBestFragments(tokenStream,text, maxNumFragments); 374 StringBuffer result = new StringBuffer (); 375 for (int i = 0; i < sections.length; i++) 376 { 377 if (i > 0) 378 { 379 result.append(separator); 380 } 381 result.append(sections[i]); 382 } 383 return result.toString(); 384 } 385 386 390 public int getMaxDocBytesToAnalyze() 391 { 392 return maxDocBytesToAnalyze; 393 } 394 395 399 public void setMaxDocBytesToAnalyze(int byteCount) 400 { 401 maxDocBytesToAnalyze = byteCount; 402 } 403 404 408 public Fragmenter getTextFragmenter() 409 { 410 return textFragmenter; 411 } 412 413 417 public void setTextFragmenter(Fragmenter fragmenter) 418 { 419 textFragmenter = fragmenter; 420 } 421 422 426 public Scorer getFragmentScorer() 427 { 428 return fragmentScorer; 429 } 430 431 432 436 public void setFragmentScorer(Scorer scorer) 437 { 438 fragmentScorer = scorer; 439 } 440 441 442 } 443 class FragmentQueue extends PriorityQueue 444 { 445 public FragmentQueue(int size) 446 { 447 initialize(size); 448 } 449 450 public final boolean lessThan(Object a, Object b) 451 { 452 TextFragment fragA = (TextFragment) a; 453 TextFragment fragB = (TextFragment) b; 454 if (fragA.getScore() == fragB.getScore()) 455 return fragA.fragNum > fragB.fragNum; 456 else 457 return fragA.getScore() < fragB.getScore(); 458 } 459 } 460 | Popular Tags |