KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > search > highlight > Highlighter


1 package org.apache.lucene.search.highlight;
/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

17
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.PriorityQueue;
23
24 /**
25  * Class used to markup highlighted terms found in the best sections of a text, using configurable {@link Fragmenter} , {@link Scorer} , {@link Formatter} and tokenizers.
26  * @author mark@searcharea.co.uk
27  */

28 public class Highlighter
29 {
30
31     public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024;
32     private int maxDocBytesToAnalyze=DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
33     private Formatter formatter;
34     private Fragmenter textFragmenter=new SimpleFragmenter();
35     private Scorer fragmentScorer=null;
36
37     public Highlighter(Scorer fragmentScorer)
38     {
39         this(new SimpleHTMLFormatter(),fragmentScorer);
40     }
41     
42     
43     public Highlighter(Formatter formatter, Scorer fragmentScorer)
44     {
45         this.formatter = formatter;
46         this.fragmentScorer = fragmentScorer;
47     }
48     
49
50
51
52     /**
53      * Highlights chosen terms in a text, extracting the most relevant section.
54      * The document text is analysed in chunks to record hit statistics
55      * across the document. After accumulating stats, the fragment with the highest score
56      * is returned
57      *
58      * @param tokenStream a stream of tokens identified in the text parameter, including offset information.
59      * This is typically produced by an analyzer re-parsing a document's
60      * text. Some work may be done on retrieving TokenStreams more efficently
61      * by adding support for storing original text position data in the Lucene
62      * index but this support is not currently available (as of Lucene 1.4 rc2).
63      * @param text text to highlight terms in
64      *
65      * @return highlighted text fragment or null if no terms found
66      */

67     public final String JavaDoc getBestFragment(TokenStream tokenStream, String JavaDoc text)
68         throws IOException JavaDoc
69     {
70         String JavaDoc[] results = getBestFragments(tokenStream,text, 1);
71         if (results.length > 0)
72         {
73             return results[0];
74         }
75         return null;
76     }
77     /**
78      * Highlights chosen terms in a text, extracting the most relevant sections.
79      * The document text is analysed in chunks to record hit statistics
80      * across the document. After accumulating stats, the fragments with the highest scores
81      * are returned as an array of strings in order of score (contiguous fragments are merged into
82      * one in their original order to improve readability)
83      *
84      * @param text text to highlight terms in
85      * @param maxNumFragments the maximum number of fragments.
86      *
87      * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
88      */

89     public final String JavaDoc[] getBestFragments(
90         TokenStream tokenStream,
91         String JavaDoc text,
92         int maxNumFragments)
93         throws IOException JavaDoc
94     {
95         maxNumFragments = Math.max(1, maxNumFragments); //sanity check
96

97         TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);
98
99         //Get text
100
ArrayList JavaDoc<String JavaDoc> fragTexts = new ArrayList JavaDoc<String JavaDoc>();
101         for (int i = 0; i < frag.length; i++)
102         {
103             if ((frag[i] != null) && (frag[i].getScore() > 0))
104             {
105                 fragTexts.add(frag[i].toString());
106             }
107         }
108         return (String JavaDoc[]) fragTexts.toArray(new String JavaDoc[0]);
109     }
110     
111
112     /**
113      * Low level api to get the most relevant (formatted) sections of the document.
114      * This method has been made public to allow visibility of score information held in TextFragment objects.
115      * Thanks to Jason Calabrese for help in redefining the interface.
116      * @param tokenStream
117      * @param text
118      * @param maxNumFragments
119      * @param mergeContiguousFragments
120      * @return
121      * @throws IOException
122      */

123     public final TextFragment[] getBestTextFragments(
124         TokenStream tokenStream,
125         String JavaDoc text,
126         boolean mergeContiguousFragments,
127         int maxNumFragments)
128         throws IOException JavaDoc
129     {
130         ArrayList JavaDoc<TextFragment> docFrags = new ArrayList JavaDoc<TextFragment>();
131         StringBuffer JavaDoc newText=new StringBuffer JavaDoc();
132
133         TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
134         fragmentScorer.startFragment(currentFrag);
135         docFrags.add(currentFrag);
136     
137         FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
138
139         try
140         {
141             org.apache.lucene.analysis.Token token;
142             String JavaDoc tokenText;
143             int startOffset;
144             int endOffset;
145             int lastEndOffset = 0;
146             textFragmenter.start(text);
147         
148             TokenGroup tokenGroup=new TokenGroup();
149
150             while ((token = tokenStream.next()) != null)
151             {
152                 if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))
153                 {
154                     //the current token is distinct from previous tokens -
155
// markup the cached token group info
156
startOffset = tokenGroup.startOffset;
157                     endOffset = tokenGroup.endOffset;
158                     tokenText = text.substring(startOffset, endOffset);
159                     String JavaDoc markedUpText=formatter.highlightTerm(tokenText, tokenGroup);
160                     //store any whitespace etc from between this and last group
161
if (startOffset > lastEndOffset)
162                         newText.append(text.substring(lastEndOffset, startOffset));
163                     newText.append(markedUpText);
164                     lastEndOffset=endOffset;
165                     tokenGroup.clear();
166
167                     //check if current token marks the start of a new fragment
168
if(textFragmenter.isNewFragment(token))
169                     {
170                         currentFrag.setScore(fragmentScorer.getFragmentScore());
171                         //record stats for a new fragment
172
currentFrag.textEndPos = newText.length();
173                         currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
174                         fragmentScorer.startFragment(currentFrag);
175                         docFrags.add(currentFrag);
176                     }
177                 }
178                         
179                 tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
180                 
181                 if(lastEndOffset>maxDocBytesToAnalyze)
182                 {
183                     break;
184                 }
185             }
186             currentFrag.setScore(fragmentScorer.getFragmentScore());
187     
188             if(tokenGroup.numTokens>0)
189             {
190                 //flush the accumulated text (same code as in above loop)
191
startOffset = tokenGroup.startOffset;
192                 endOffset = tokenGroup.endOffset;
193                 tokenText = text.substring(startOffset, endOffset);
194                 String JavaDoc markedUpText=formatter.highlightTerm(tokenText, tokenGroup);
195                 //store any whitespace etc from between this and last group
196
if (startOffset > lastEndOffset)
197                     newText.append(text.substring(lastEndOffset, startOffset));
198                 newText.append(markedUpText);
199                 lastEndOffset=endOffset;
200             }
201
202             // append text after end of last token
203
if (lastEndOffset < text.length())
204                 newText.append(text.substring(lastEndOffset));
205
206             currentFrag.textEndPos = newText.length();
207
208             //sort the most relevant sections of the text
209
for (Iterator JavaDoc i = docFrags.iterator(); i.hasNext();)
210             {
211                 currentFrag = (TextFragment) i.next();
212
213                 //If you are running with a version of Lucene before 11th Sept 03
214
// you do not have PriorityQueue.insert() - so uncomment the code below
215
/*
216                                     if (currentFrag.getScore() >= minScore)
217                                     {
218                                         fragQueue.put(currentFrag);
219                                         if (fragQueue.size() > maxNumFragments)
220                                         { // if hit queue overfull
221                                             fragQueue.pop(); // remove lowest in hit queue
222                                             minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
223                                         }
224                                         
225                     
226                                     }
227                 */

228                 //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
229
//fix to PriorityQueue. The correct method to use here is the new "insert" method
230
// USE ABOVE CODE IF THIS DOES NOT COMPILE!
231
fragQueue.insert(currentFrag);
232             }
233
234             //return the most relevant fragments
235
TextFragment frag[] = new TextFragment[fragQueue.size()];
236             for (int i = frag.length - 1; i >= 0; i--)
237             {
238                 frag[i] = (TextFragment) fragQueue.pop();
239             }
240             
241             //merge any contiguous fragments to improve readability
242
if(mergeContiguousFragments)
243             {
244                 mergeContiguousFragments(frag);
245                 ArrayList JavaDoc<TextFragment> fragTexts = new ArrayList JavaDoc<TextFragment>();
246                 for (int i = 0; i < frag.length; i++)
247                 {
248                     if ((frag[i] != null) && (frag[i].getScore() > 0))
249                     {
250                         fragTexts.add(frag[i]);
251                     }
252                 }
253                 frag= (TextFragment[]) fragTexts.toArray(new TextFragment[0]);
254             }
255             
256             return frag;
257
258         }
259         finally
260         {
261             if (tokenStream != null)
262             {
263                 try
264                 {
265                     tokenStream.close();
266                 }
267                 catch (Exception JavaDoc e)
268                 {
269                 }
270             }
271         }
272     }
273
274
275     /** Improves readability of a score-sorted list of TextFragments by merging any fragments
276      * that were contiguous in the original text into one larger fragment with the correct order.
277      * This will leave a "null" in the array entry for the lesser scored fragment.
278      *
279      * @param frag An array of document fragments in descending score
280      */

281     private void mergeContiguousFragments(TextFragment[] frag)
282     {
283         boolean mergingStillBeingDone;
284         if (frag.length > 1)
285             do
286             {
287                 mergingStillBeingDone = false; //initialise loop control flag
288
//for each fragment, scan other frags looking for contiguous blocks
289
for (int i = 0; i < frag.length; i++)
290                 {
291                     if (frag[i] == null)
292                     {
293                         continue;
294                     }
295                     //merge any contiguous blocks
296
for (int x = 0; x < frag.length; x++)
297                     {
298                         if (frag[x] == null)
299                         {
300                             continue;
301                         }
302                         if (frag[i] == null)
303                         {
304                             break;
305                         }
306                         TextFragment frag1 = null;
307                         TextFragment frag2 = null;
308                         int frag1Num = 0;
309                         int frag2Num = 0;
310                         int bestScoringFragNum;
311                         int worstScoringFragNum;
312                         //if blocks are contiguous....
313
if (frag[i].follows(frag[x]))
314                         {
315                             frag1 = frag[x];
316                             frag1Num = x;
317                             frag2 = frag[i];
318                             frag2Num = i;
319                         }
320                         else
321                             if (frag[x].follows(frag[i]))
322                             {
323                                 frag1 = frag[i];
324                                 frag1Num = i;
325                                 frag2 = frag[x];
326                                 frag2Num = x;
327                             }
328                         //merging required..
329
if (frag1 != null)
330                         {
331                             if (frag1.getScore() > frag2.getScore())
332                             {
333                                 bestScoringFragNum = frag1Num;
334                                 worstScoringFragNum = frag2Num;
335                             }
336                             else
337                             {
338                                 bestScoringFragNum = frag2Num;
339                                 worstScoringFragNum = frag1Num;
340                             }
341                             frag1.merge(frag2);
342                             frag[worstScoringFragNum] = null;
343                             mergingStillBeingDone = true;
344                             frag[bestScoringFragNum] = frag1;
345                         }
346                     }
347                 }
348             }
349             while (mergingStillBeingDone);
350     }
351     
352     
353     /**
354      * Highlights terms in the text , extracting the most relevant sections
355      * and concatenating the chosen fragments with a separator (typically "...").
356      * The document text is analysed in chunks to record hit statistics
357      * across the document. After accumulating stats, the fragments with the highest scores
358      * are returned in order as "separator" delimited strings.
359      *
360      * @param text text to highlight terms in
361      * @param maxNumFragments the maximum number of fragments.
362      * @param separator the separator used to intersperse the document fragments (typically "...")
363      *
364      * @return highlighted text
365      */

366     public final String JavaDoc getBestFragments(
367         TokenStream tokenStream,
368         String JavaDoc text,
369         int maxNumFragments,
370         String JavaDoc separator)
371         throws IOException JavaDoc
372     {
373         String JavaDoc sections[] = getBestFragments(tokenStream,text, maxNumFragments);
374         StringBuffer JavaDoc result = new StringBuffer JavaDoc();
375         for (int i = 0; i < sections.length; i++)
376         {
377             if (i > 0)
378             {
379                 result.append(separator);
380             }
381             result.append(sections[i]);
382         }
383         return result.toString();
384     }
385
386     /**
387      * @return the maximum number of bytes to be tokenized per doc
388      * @uml.property name="maxDocBytesToAnalyze"
389      */

390     public int getMaxDocBytesToAnalyze()
391     {
392         return maxDocBytesToAnalyze;
393     }
394
395     /**
396      * @param byteCount the maximum number of bytes to be tokenized per doc (This can improve performance with large documents)
397      * @uml.property name="maxDocBytesToAnalyze"
398      */

399     public void setMaxDocBytesToAnalyze(int byteCount)
400     {
401         maxDocBytesToAnalyze = byteCount;
402     }
403
404     /**
405      * @return
406      * @uml.property name="textFragmenter"
407      */

408     public Fragmenter getTextFragmenter()
409     {
410         return textFragmenter;
411     }
412
413     /**
414      * @param fragmenter
415      * @uml.property name="textFragmenter"
416      */

417     public void setTextFragmenter(Fragmenter fragmenter)
418     {
419         textFragmenter = fragmenter;
420     }
421
422     /**
423      * @return Object used to score each text fragment
424      * @uml.property name="fragmentScorer"
425      */

426     public Scorer getFragmentScorer()
427     {
428         return fragmentScorer;
429     }
430
431
432     /**
433      * @param scorer
434      * @uml.property name="fragmentScorer"
435      */

436     public void setFragmentScorer(Scorer scorer)
437     {
438         fragmentScorer = scorer;
439     }
440
441
442 }
443 class FragmentQueue extends PriorityQueue
444 {
445     public FragmentQueue(int size)
446     {
447         initialize(size);
448     }
449
450     public final boolean lessThan(Object JavaDoc a, Object JavaDoc b)
451     {
452         TextFragment fragA = (TextFragment) a;
453         TextFragment fragB = (TextFragment) b;
454         if (fragA.getScore() == fragB.getScore())
455             return fragA.fragNum > fragB.fragNum;
456         else
457             return fragA.getScore() < fragB.getScore();
458     }
459 }
460
Popular Tags