KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > opencms > search > documents > CmsHighlightFinder


1 /*
2  * File : $Source: /usr/local/cvs/opencms/src/org/opencms/search/documents/CmsHighlightFinder.java,v $
3  * Date : $Date: 2006/03/27 14:53:05 $
4  * Version: $Revision: 1.8 $
5  *
6  * This library is part of OpenCms -
7  * the Open Source Content Management System
8  *
9  * Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
10  *
11  * This library is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public
13  * License as published by the Free Software Foundation; either
14  * version 2.1 of the License, or (at your option) any later version.
15  *
16  * This library is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * Lesser General Public License for more details.
20  *
21  * For further information about Alkacon Software GmbH, please see the
22  * company website: http://www.alkacon.com
23  *
24  * For further information about OpenCms, please see the
25  * project website: http://www.opencms.org
26  *
27  * You should have received a copy of the GNU Lesser General Public
28  * License along with this library; if not, write to the Free Software
29  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30  */

31
32 package org.opencms.search.documents;
33
34 import java.io.IOException JavaDoc;
35 import java.io.StringReader JavaDoc;
36 import java.util.ArrayList JavaDoc;
37 import java.util.HashSet JavaDoc;
38 import java.util.Iterator JavaDoc;
39
40 import org.apache.lucene.analysis.Analyzer;
41 import org.apache.lucene.analysis.TokenStream;
42 import org.apache.lucene.index.Term;
43 import org.apache.lucene.search.BooleanClause;
44 import org.apache.lucene.search.BooleanQuery;
45 import org.apache.lucene.search.PhraseQuery;
46 import org.apache.lucene.search.Query;
47 import org.apache.lucene.search.TermQuery;
48 import org.apache.lucene.util.PriorityQueue;
49
50 /**
51  * Adapted from Maik Schreiber's LuceneTools.java,v 1.5 2001/10/16 07:25:55.
52  *
53  * Alterations include:
54  * + Changed to support Lucene 1.3 release (requires no change to Lucene code
55  * base but consequently no longer supports MultiTermQuery, RangeQuery and
56  * PrefixQuery highlighting currently)
57  * + Performance enhancement - CmsHighlightExtractor caches m_query m_terms and can
58  * therefore be called repeatedly to highlight multiple results more efficently
59  * + New feature: can extract the most relevant parts of large bodies of text -
60  * with user defined size of extracts
61  *
62  * @author Maik Schreiber
63  */

64 public final class CmsHighlightFinder {
65
66     /** The analyzer. */
67     private Analyzer m_analyzer;
68
69     /** The term highlighter. */
70     private I_CmsTermHighlighter m_highlighter;
71
72     /** The query. */
73     private Query m_query;
74
75     /** A set of all terms. */
76     private HashSet JavaDoc m_terms = new HashSet JavaDoc();
77
78     /**
79      * @param highlighter
80      * I_TermHighlighter to use to highlight m_terms in the text
81      * @param query
82      * Query which contains the m_terms to be highlighted in the text
83      * @param analyzer
84      * Analyzer used to construct the Query
85      * @throws IOException if something goes wrong
86      */

87     public CmsHighlightFinder(I_CmsTermHighlighter highlighter, Query query, Analyzer analyzer)
88     throws IOException JavaDoc {
89
90         this.m_highlighter = highlighter;
91         this.m_query = query;
92         this.m_analyzer = analyzer;
93         // get m_terms in m_query
94
getTerms(m_query, m_terms, false);
95
96     }
97
98     /**
99      * Extracts all term texts of a given Query. Term texts will be returned in
100      * lower-case.
101      *
102      * @param query
103      * Query to extract term texts from
104      * @param terms
105      * HashSet where extracted term texts should be put into
106      * (Elements: String)
107      * @param prohibited
108      * <code>true</code> to extract "prohibited" m_terms, too
109      * @throws IOException if something goes wrong
110      */

111     public static void getTerms(Query query, HashSet JavaDoc terms, boolean prohibited) throws IOException JavaDoc {
112
113         if (query instanceof BooleanQuery) {
114             getTermsFromBooleanQuery((BooleanQuery)query, terms, prohibited);
115         } else if (query instanceof PhraseQuery) {
116             getTermsFromPhraseQuery((PhraseQuery)query, terms);
117         } else if (query instanceof TermQuery) {
118             getTermsFromTermQuery((TermQuery)query, terms);
119         }
120     }
121
122     /**
123      * Extracts all term texts of a given BooleanQuery. Term texts will be
124      * returned in lower-case.
125      *
126      * @param query
127      * BooleanQuery to extract term texts from
128      * @param terms
129      * HashSet where extracted term texts should be put into
130      * (Elements: String)
131      * @param prohibited
132      * <code>true</code> to extract "prohibited" m_terms, too
133      * @throws IOException if something goes wrong
134      */

135     private static void getTermsFromBooleanQuery(BooleanQuery query, HashSet JavaDoc terms, boolean prohibited)
136     throws IOException JavaDoc {
137
138         BooleanClause[] queryClauses = query.getClauses();
139         int i;
140
141         for (i = 0; i < queryClauses.length; i++) {
142             if (prohibited || !queryClauses[i].isProhibited()) {
143                 getTerms(queryClauses[i].getQuery(), terms, prohibited);
144             }
145         }
146     }
147
148     /**
149      * Extracts all term texts of a given PhraseQuery. Term texts will be
150      * returned in lower-case.
151      *
152      * @param query
153      * PhraseQuery to extract term texts from
154      * @param terms
155      * HashSet where extracted term texts should be put into
156      * (Elements: String)
157      */

158     private static void getTermsFromPhraseQuery(PhraseQuery query, HashSet JavaDoc terms) {
159
160         Term[] queryTerms = query.getTerms();
161         int i;
162
163         for (i = 0; i < queryTerms.length; i++) {
164             terms.add(getTermsFromTerm(queryTerms[i]));
165         }
166     }
167
168     /**
169      * Extracts the term of a given Term. The term will be returned in
170      * lower-case.
171      *
172      * @param term
173      * Term to extract term from
174      *
175      * @return the Term's term text
176      */

177     private static String JavaDoc getTermsFromTerm(Term term) {
178
179         return term.text().toLowerCase();
180     }
181
182     /**
183      * Extracts all term texts of a given TermQuery. Term texts will be
184      * returned in lower-case.
185      *
186      * @param query
187      * TermQuery to extract term texts from
188      * @param terms
189      * HashSet where extracted term texts should be put into
190      * (Elements: String)
191      */

192     private static void getTermsFromTermQuery(TermQuery query, HashSet JavaDoc terms) {
193
194         terms.add(getTermsFromTerm(query.getTerm()));
195     }
196
197     /**
198      * Highlights a text in accordance to the given m_query, extracting the most
199      * relevant sections. The document text is analysed in fragmentSize chunks
200      * to record hit statistics across the document. After accumulating stats,
201      * the fragments with the highest scores are returned as an array of
202      * strings in order of m_score.
203      *
204      * @param text
205      * text to highlight m_terms in
206      * @param fragmentSize
207      * the size in bytes of each fragment to be returned
208      * @param maxNumFragments
209      * the maximum number of fragments.
210      *
211      * @return highlighted text fragments (between 0 and maxNumFragments number
212      * of fragments)
213      * @throws IOException if something goes wrong
214      */

215     public String JavaDoc[] getBestFragments(String JavaDoc text, int fragmentSize, int maxNumFragments) throws IOException JavaDoc {
216
217         StringBuffer JavaDoc newText = new StringBuffer JavaDoc();
218         TokenStream stream = null;
219
220         ArrayList JavaDoc docFrags = new ArrayList JavaDoc();
221
222         DocumentFragment currentFrag = new DocumentFragment(newText.length(), docFrags.size());
223         docFrags.add(currentFrag);
224
225         FragmentQueue fragQueue = new FragmentQueue(maxNumFragments + 1);
226
227         try {
228             org.apache.lucene.analysis.Token token;
229             String JavaDoc tokenText;
230             int startOffset;
231             int endOffset;
232             int lastEndOffset = 0;
233
234             // long start=System.currentTimeMillis();
235
stream = m_analyzer.tokenStream(null, new StringReader JavaDoc(text));
236             while ((token = stream.next()) != null) {
237                 startOffset = token.startOffset();
238                 endOffset = token.endOffset();
239                 // make sure wildcards are removed else highlighting will not
240
// work
241
tokenText = text.substring(startOffset, endOffset);
242
243                 // append text between end of last token (or beginning of text)
244
// and start of current token
245
if (startOffset > lastEndOffset) {
246                     newText.append(" ");
247                     // newText.append(text.substring(lastEndOffset, startOffset));
248
}
249
250                 // does m_query contain current token?
251
if (m_terms.contains(token.termText())) {
252                     newText.append(m_highlighter.highlightTerm(tokenText));
253                     currentFrag.addTerm(token.termText());
254                 } else {
255                     if (tokenText.length() > fragmentSize / 2) {
256                         newText.append(tokenText.substring(0, fragmentSize / 2));
257                         newText.append(" ");
258                     } else {
259                         newText.append(tokenText);
260                     }
261                 }
262
263                 if (newText.length() >= (fragmentSize * (docFrags.size() + 1))) {
264                     //record stats for a new fragment
265
currentFrag.m_textEndPos = newText.length();
266                     currentFrag = new DocumentFragment(newText.length(), docFrags.size());
267                     docFrags.add(currentFrag);
268                 }
269
270                 lastEndOffset = endOffset;
271             }
272
273             // append text after end of last token
274
if (lastEndOffset < text.length()) {
275                 // int extend = lastEndOffset + fragmentSize;
276
// extend = (extend > text.length()) ? text.length() : extend;
277
// newText.append(text.substring(lastEndOffset, extend));
278
newText.append(text.substring(lastEndOffset));
279             }
280
281             currentFrag.m_textEndPos = newText.length();
282
283             //find the most relevant sections of the text
284
int minScore = 0;
285             for (Iterator JavaDoc i = docFrags.iterator(); i.hasNext();) {
286                 currentFrag = (DocumentFragment)i.next();
287                 if (currentFrag.getScore() >= minScore) {
288                     fragQueue.put(currentFrag);
289                     if (fragQueue.size() > maxNumFragments) {
290                         // if hit queue overfull
291
fragQueue.pop();
292                         // remove lowest in hit queue
293
minScore = ((DocumentFragment)fragQueue.top()).getScore();
294                         // reset minScore
295
}
296
297                 }
298             }
299
300             //extract the text
301
String JavaDoc[] fragText = new String JavaDoc[fragQueue.size()];
302             for (int i = fragText.length - 1; i >= 0; i--) {
303                 DocumentFragment frag = (DocumentFragment)fragQueue.pop();
304                 fragText[i] = newText.substring(frag.m_textStartPos, frag.m_textEndPos);
305             }
306             return fragText;
307
308         } finally {
309             if (stream != null) {
310                 try {
311                     stream.close();
312                 } catch (Exception JavaDoc e) {
313                     // noop
314
}
315             }
316         }
317     }
318
319     /**
320      * Highlights a text in accordance to the given m_query and extracting the most
321      * relevant sections. The document text is analysed in
322      * fragmentSize chunks to record hit statistics across the document. After
323      * accumulating stats, the fragments with the highest scores are returned
324      * in order as "separator" delimited strings.
325      *
326      * @param text
327      * text to highlight m_terms in
328      * @param fragmentSize
329      * the size in bytes of each fragment to be returned
330      * @param maxNumFragments
331      * the maximum number of fragments.
332      * @param separator
333      * the separator used to intersperse the document fragments
334      * (typically " ... ")
335      *
336      * @return highlighted text
337      * @throws IOException if something goes wrong
338      */

339     public String JavaDoc getBestFragments(String JavaDoc text, int fragmentSize, int maxNumFragments, String JavaDoc separator)
340     throws IOException JavaDoc {
341
342         String JavaDoc[] sections = getBestFragments(text, fragmentSize, maxNumFragments);
343         StringBuffer JavaDoc result = new StringBuffer JavaDoc();
344         for (int i = 0; i < sections.length; i++) {
345             if (i > 0) {
346                 result.append(separator);
347             }
348             result.append(sections[i]);
349         }
350         return result.toString();
351     }
352 }
353
/**
 * This class describes a fragment within a document.<p>
 *
 * A fragment's score is the number of unique query terms found in it.<p>
 *
 * @author Alexander Kandzior
 *
 * @version $Revision: 1.8 $
 *
 * @since 6.0.0
 */
class DocumentFragment {

    /** The fragment number (position of this fragment within the document). */
    protected int m_fragNum;

    /** The score. */
    protected int m_score;

    /** The text end position (exclusive, index into the highlighted text buffer). */
    protected int m_textEndPos;

    /** The text start position (inclusive, index into the highlighted text buffer). */
    protected int m_textStartPos;

    /** All unique terms found in this fragment (Elements: String). */
    protected HashSet m_uniqueTerms = new HashSet();

    /**
     * Creates a new document fragment.<p>
     *
     * @param textStartPos the start position of this fragment in the text buffer
     * @param fragNum the number of this fragment within the document
     */
    public DocumentFragment(int textStartPos, int fragNum) {

        m_textStartPos = textStartPos;
        m_fragNum = fragNum;
    }

    /**
     * Records a query term found in this fragment.<p>
     *
     * @param term the term to record
     */
    void addTerm(String term) {

        m_uniqueTerms.add(term);
    }

    /**
     * Returns the score of this fragment, i.e. the number of unique
     * query terms it contains.<p>
     *
     * @return the score
     */
    int getScore() {

        return m_uniqueTerms.size();
    }
}
407
408 /**
409  * This class implements a priority queue for document fragments.<p>
410  */

411
412 class FragmentQueue extends PriorityQueue {
413
414     /**
415      * @param size size
416      */

417     public FragmentQueue(int size) {
418
419         initialize(size);
420     }
421
422     /**
423      * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
424      */

425     public final boolean lessThan(Object JavaDoc a, Object JavaDoc b) {
426
427         DocumentFragment fragA = (DocumentFragment)a;
428         DocumentFragment fragB = (DocumentFragment)b;
429         if (fragA.getScore() == fragB.getScore()) {
430             return fragA.m_fragNum > fragB.m_fragNum;
431         } else {
432             return fragA.getScore() < fragB.getScore();
433         }
434     }
435 }
436
Popular Tags