KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > opencms > search > documents > CmsHighlightFinder


1 /*
2  * File : $Source: /usr/local/cvs/opencms/src/org/opencms/search/documents/CmsHighlightFinder.java,v $
3  * Date : $Date: 2006/03/27 14:53:05 $
4  * Version: $Revision: 1.8 $
5  *
6  * This library is part of OpenCms -
7  * the Open Source Content Management System
8  *
9  * Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
10  *
11  * This library is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public
13  * License as published by the Free Software Foundation; either
14  * version 2.1 of the License, or (at your option) any later version.
15  *
16  * This library is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * Lesser General Public License for more details.
20  *
21  * For further information about Alkacon Software GmbH, please see the
22  * company website: http://www.alkacon.com
23  *
24  * For further information about OpenCms, please see the
25  * project website: http://www.opencms.org
26  *
27  * You should have received a copy of the GNU Lesser General Public
28  * License along with this library; if not, write to the Free Software
29  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30  */

31
32 package org.opencms.search.documents;
33
34 import java.io.IOException JavaDoc;
35 import java.io.StringReader JavaDoc;
36 import java.util.ArrayList JavaDoc;
37 import java.util.HashSet JavaDoc;
38 import java.util.Iterator JavaDoc;
39
40 import org.apache.lucene.analysis.Analyzer;
41 import org.apache.lucene.analysis.TokenStream;
42 import org.apache.lucene.index.Term;
43 import org.apache.lucene.search.BooleanClause;
44 import org.apache.lucene.search.BooleanQuery;
45 import org.apache.lucene.search.PhraseQuery;
46 import org.apache.lucene.search.Query;
47 import org.apache.lucene.search.TermQuery;
48 import org.apache.lucene.util.PriorityQueue;
49
50 /**
51  * Adapted from Maik Schreiber's LuceneTools.java,v 1.5 2001/10/16 07:25:55.
52  *
53  * Alterations include:
54  * + Changed to support Lucene 1.3 release (requires no change to Lucene code
55  * base but consequently no longer supports MultiTermQuery, RangeQuery and
56  * PrefixQuery highlighting currently)
57  * + Performance enhancement - CmsHighlightExtractor caches m_query m_terms and can
58  * therefore be called repeatedly to highlight multiple results more efficently
59  * + New feature: can extract the most relevant parts of large bodies of text -
60  * with user defined size of extracts
61  *
62  * @author Maik Schreiber
63  */

64 public final class CmsHighlightFinder {
65
66     /** The analyzer. */
67     private Analyzer m_analyzer;
68
69     /** The term highlighter. */
70     private I_CmsTermHighlighter m_highlighter;
71
72     /** The query. */
73     private Query m_query;
74
75     /** A set of all terms. */
76     private HashSet JavaDoc m_terms = new HashSet JavaDoc();
77
78     /**
79      * @param highlighter
80      * I_TermHighlighter to use to highlight m_terms in the text
81      * @param query
82      * Query which contains the m_terms to be highlighted in the text
83      * @param analyzer
84      * Analyzer used to construct the Query
85      * @throws IOException if something goes wrong
86      */

87     public CmsHighlightFinder(I_CmsTermHighlighter highlighter, Query query, Analyzer analyzer)
88     throws IOException JavaDoc {
89
90         this.m_highlighter = highlighter;
91         this.m_query = query;
92         this.m_analyzer = analyzer;
93         // get m_terms in m_query
94
getTerms(m_query, m_terms, false);
95
96     }
97
98     /**
99      * Extracts all term texts of a given Query. Term texts will be returned in
100      * lower-case.
101      *
102      * @param query
103      * Query to extract term texts from
104      * @param terms
105      * HashSet where extracted term texts should be put into
106      * (Elements: String)
107      * @param prohibited
108      * <code>true</code> to extract "prohibited" m_terms, too
109      * @throws IOException if something goes wrong
110      */

111     public static void getTerms(Query query, HashSet JavaDoc terms, boolean prohibited) throws IOException JavaDoc {
112
113         if (query instanceof BooleanQuery) {
114             getTermsFromBooleanQuery((BooleanQuery)query, terms, prohibited);
115         } else if (query instanceof PhraseQuery) {
116             getTermsFromPhraseQuery((PhraseQuery)query, terms);
117         } else if (query instanceof TermQuery) {
118             getTermsFromTermQuery((TermQuery)query, terms);
119         }
120     }
121
122     /**
123      * Extracts all term texts of a given BooleanQuery. Term texts will be
124      * returned in lower-case.
125      *
126      * @param query
127      * BooleanQuery to extract term texts from
128      * @param terms
129      * HashSet where extracted term texts should be put into
130      * (Elements: String)
131      * @param prohibited
132      * <code>true</code> to extract "prohibited" m_terms, too
133      * @throws IOException if something goes wrong
134      */

135     private static void getTermsFromBooleanQuery(BooleanQuery query, HashSet JavaDoc terms, boolean prohibited)
136     throws IOException JavaDoc {
137
138         BooleanClause[] queryClauses = query.getClauses();
139         int i;
140
141         for (i = 0; i < queryClauses.length; i++) {
142             if (prohibited || !queryClauses[i].isProhibited()) {
143                 getTerms(queryClauses[i].getQuery(), terms, prohibited);
144             }
145         }
146     }
147
148     /**
149      * Extracts all term texts of a given PhraseQuery. Term texts will be
150      * returned in lower-case.
151      *
152      * @param query
153      * PhraseQuery to extract term texts from
154      * @param terms
155      * HashSet where extracted term texts should be put into
156      * (Elements: String)
157      */

158     private static void getTermsFromPhraseQuery(PhraseQuery query, HashSet JavaDoc terms) {
159
160         Term[] queryTerms = query.getTerms();
161         int i;
162
163         for (i = 0; i < queryTerms.length; i++) {
164             terms.add(getTermsFromTerm(queryTerms[i]));
165         }
166     }
167
168     /**
169      * Extracts the term of a given Term. The term will be returned in
170      * lower-case.
171      *
172      * @param term
173      * Term to extract term from
174      *
175      * @return the Term's term text
176      */

177     private static String JavaDoc getTermsFromTerm(Term term) {
178
179         return term.text().toLowerCase();
180     }
181
182     /**
183      * Extracts all term texts of a given TermQuery. Term texts will be
184      * returned in lower-case.
185      *
186      * @param query
187      * TermQuery to extract term texts from
188      * @param terms
189      * HashSet where extracted term texts should be put into
190      * (Elements: String)
191      */

192     private static void getTermsFromTermQuery(TermQuery query, HashSet JavaDoc terms) {
193
194         terms.add(getTermsFromTerm(query.getTerm()));
195     }
196
197     /**
198      * Highlights a text in accordance to the given m_query, extracting the most
199      * relevant sections. The document text is analysed in fragmentSize chunks
200      * to record hit statistics across the document. After accumulating stats,
201      * the fragments with the highest scores are returned as an array of
202      * strings in order of m_score.
203      *
204      * @param text
205      * text to highlight m_terms in
206      * @param fragmentSize
207      * the size in bytes of each fragment to be returned
208      * @param maxNumFragments
209      * the maximum number of fragments.
210      *
211      * @return highlighted text fragments (between 0 and maxNumFragments number
212      * of fragments)
213      * @throws IOException if something goes wrong
214      */

215     public String JavaDoc[] getBestFragments(String JavaDoc text, int fragmentSize, int maxNumFragments) throws IOException JavaDoc {
216
217         StringBuffer JavaDoc newText = new StringBuffer JavaDoc();
218         TokenStream stream = null;
219
220         ArrayList JavaDoc docFrags = new ArrayList JavaDoc();
221
222         DocumentFragment currentFrag = new DocumentFragment(newText.length(), docFrags.size());
223         docFrags.add(currentFrag);
224
225         FragmentQueue fragQueue = new FragmentQueue(maxNumFragments + 1);
226
227         try {
228             org.apache.lucene.analysis.Token token;
229             String JavaDoc tokenText;
230             int startOffset;
231             int endOffset;
232             int lastEndOffset = 0;
233
234             // long start=System.currentTimeMillis();
235
stream = m_analyzer.tokenStream(null, new StringReader JavaDoc(text));
236             while ((token = stream.next()) != null) {
237                 startOffset = token.startOffset();
238                 endOffset = token.endOffset();
239                 // make sure wildcards are removed else highlighting will not
240
// work
241
tokenText = text.substring(startOffset, endOffset);
242
243                 // append text between end of last token (or beginning of text)
244
// and start of current token
245
if (startOffset > lastEndOffset) {
246                     newText.append(" ");
247                     // newText.append(text.substring(lastEndOffset, startOffset));
248
}
249
250                 // does m_query contain current token?
251
if (m_terms.contains(token.termText())) {
252                     newText.append(m_highlighter.highlightTerm(tokenText));
253                     currentFrag.addTerm(token.termText());
254                 } else {
255                     if (tokenText.length() > fragmentSize / 2) {
256                         newText.append(tokenText.substring(0, fragmentSize / 2));
257                         newText.append(" ");
258                     } else {
259                         newText.append(tokenText);
260                     }
261                 }
262
263                 if (newText.length() >= (fragmentSize * (docFrags.size() + 1))) {
264                     //record stats for a new fragment
265
currentFrag.m_textEndPos = newText.length();
266                     currentFrag = new DocumentFragment(newText.length(), docFrags.size());
267                     docFrags.add(currentFrag);
268                 }
269
270                 lastEndOffset = endOffset;
271             }
272
273             // append text after end of last token
274
if (lastEndOffset < text.length()) {
275                 // int extend = lastEndOffset + fragmentSize;
276
// extend = (extend > text.length()) ? text.length() : extend;
277
// newText.append(text.substring(lastEndOffset, extend));
278
newText.append(text.substring(lastEndOffset));
279             }
280
281             currentFrag.m_textEndPos = newText.length();
282
283             //find the most relevant sections of the text
284
int minScore = 0;
285             for (Iterator JavaDoc i = docFrags.iterator(); i.hasNext();) {
286                 currentFrag = (DocumentFragment)i.next();
287                 if (currentFrag.getScore() >= minScore) {
288                     fragQueue.put(currentFrag);
289                     if (fragQueue.size() > maxNumFragments) {
290                         // if hit queue overfull
291
fragQueue.pop();
292                         // remove lowest in hit queue
293
minScore = ((DocumentFragment)fragQueue.top()).getScore();
294                         // reset minScore
295
}
296
297                 }
298             }
299
300             //extract the text
301
String JavaDoc[] fragText = new String JavaDoc[fragQueue.size()];
302             for (int i = fragText.length - 1; i >= 0; i--) {
303                 DocumentFragment frag = (DocumentFragment)fragQueue.pop();
304                 fragText[i] = newText.substring(frag.m_textStartPos, frag.m_textEndPos);
305             }
306             return fragText;
307
308         } finally {
309             if (stream != null) {
310                 try {
311                     stream.close();
312                 } catch (Exception JavaDoc e) {
313                     // noop
314
}
315             }
316         }
317     }
318
319     /**
320      * Highlights a text in accordance to the given m_query and extracting the most
321      * relevant sections. The document text is analysed in
322      * fragmentSize chunks to record hit statistics across the document. After
323      * accumulating stats, the fragments with the highest scores are returned
324      * in order as "separator" delimited strings.
325      *
326      * @param text
327      * text to highlight m_terms in
328      * @param fragmentSize
329      * the size in bytes of each fragment to be returned
330      * @param maxNumFragments
331      * the maximum number of fragments.
332      * @param separator
333      * the separator used to intersperse the document fragments
334      * (typically " ... ")
335      *
336      * @return highlighted text
337      * @throws IOException if something goes wrong
338      */

339     public String JavaDoc getBestFragments(String JavaDoc text, int fragmentSize, int maxNumFragments, String JavaDoc separator)
340     throws IOException JavaDoc {
341
342         String JavaDoc[] sections = getBestFragments(text, fragmentSize, maxNumFragments);
343         StringBuffer JavaDoc result = new StringBuffer JavaDoc();
344         for (int i = 0; i < sections.length; i++) {
345             if (i > 0) {
346                 result.append(separator);
347             }
348             result.append(sections[i]);
349         }
350         return result.toString();
351     }
352 }
353
/**
 * This class describes a fragment within a document.<p>
 *
 * A fragment's score is the number of unique query terms found in it.<p>
 *
 * @author Alexander Kandzior
 *
 * @version $Revision: 1.8 $
 *
 * @since 6.0.0
 */
class DocumentFragment {

    /** The fragment number (position of this fragment within the document). */
    protected int m_fragNum;

    /** The score. */
    protected int m_score;

    /** The text end position (exclusive, index into the highlighted text buffer). */
    protected int m_textEndPos;

    /** The text start position (inclusive, index into the highlighted text buffer). */
    protected int m_textStartPos;

    /** All unique terms found in this fragment (Elements: String). */
    protected HashSet m_uniqueTerms = new HashSet();

    /**
     * Creates a new document fragment.<p>
     *
     * @param textStartPos the start position of this fragment in the text buffer
     * @param fragNum the number of this fragment within the document
     */
    public DocumentFragment(int textStartPos, int fragNum) {

        m_textStartPos = textStartPos;
        m_fragNum = fragNum;
    }

    /**
     * Records a query term found in this fragment.<p>
     *
     * @param term the term to record
     */
    void addTerm(String term) {

        m_uniqueTerms.add(term);
    }

    /**
     * Returns the score of this fragment, i.e. the number of unique
     * query terms it contains.<p>
     *
     * @return the score
     */
    int getScore() {

        return m_uniqueTerms.size();
    }
}
407
408 /**
409  * This class implements a priority queue for document fragments.<p>
410  */

411
412 class FragmentQueue extends PriorityQueue {
413
414     /**
415      * @param size size
416      */

417     public FragmentQueue(int size) {
418
419         initialize(size);
420     }
421
422     /**
423      * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
424      */

425     public final boolean lessThan(Object JavaDoc a, Object JavaDoc b) {
426
427         DocumentFragment fragA = (DocumentFragment)a;
428         DocumentFragment fragB = (DocumentFragment)b;
429         if (fragA.getScore() == fragB.getScore()) {
430             return fragA.m_fragNum > fragB.m_fragNum;
431         } else {
432             return fragA.getScore() < fragB.getScore();
433         }
434     }
435 }
436
Popular Tags