KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > searcher > Summarizer


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.searcher;
5
6 import java.io.*;
7 import java.util.*;
8
9 import org.apache.lucene.analysis.Token;
10 import org.apache.lucene.analysis.Analyzer;
11 import org.apache.lucene.analysis.TokenStream;
12
13 import net.nutch.util.NutchConf;
14 import net.nutch.searcher.Summary.*;
15 import net.nutch.analysis.NutchDocumentAnalyzer;
16
17 /** Implements hit summarization. */
18 public class Summarizer {
19
20   /** The number of context terms to display preceding and following matches.*/
21   private static final int SUM_CONTEXT =
22     NutchConf.getInt("searcher.summary.context", 5);
23
24   /** The total number of terms to display in a summary.*/
25   private static final int SUM_LENGTH =
26     NutchConf.getInt("searcher.summary.length", 20);
27
28   /** Converts text to tokens. */
29   private static final Analyzer ANALYZER = new NutchDocumentAnalyzer();
30
31   /**
32    * Class Excerpt represents a single passage found in the
33    * document, with some appropriate regions highlit.
34    */

35   class Excerpt {
36       Vector passages = new Vector();
37       SortedSet tokenSet = new TreeSet();
38       int numTerms = 0;
39
40       /**
41        */

42       public Excerpt() {
43       }
44
45       /**
46        */

47       public void addToken(String JavaDoc token) {
48           tokenSet.add(token);
49       }
50
51       /**
52        * Return how many unique toks we have
53        */

54       public int numUniqueTokens() {
55           return tokenSet.size();
56       }
57
58       /**
59        * How many fragments we have.
60        */

61       public int numFragments() {
62           return passages.size();
63       }
64
65       public void setNumTerms(int numTerms) {
66           this.numTerms = numTerms;
67       }
68
69       public int getNumTerms() {
70           return numTerms;
71       }
72
73       /**
74        * Add a frag to the list.
75        */

76       public void add(Fragment fragment) {
77           passages.add(fragment);
78       }
79
80       /**
81        * Return an Enum for all the fragments
82        */

83       public Enumeration elements() {
84           return passages.elements();
85       }
86   }
87
88   /** Returns a summary for the given pre-tokenized text. */
89   public Summary getSummary(String JavaDoc text, Query query) throws IOException {
90
91     // Simplistic implementation. Finds the first fragments in the document
92
// containing any query terms.
93
//
94
// TODO: check that phrases in the query are matched in the fragment
95

96     Token[] tokens = getTokens(text); // parse text to token array
97

98     if (tokens.length == 0)
99       return new Summary();
100
101     String JavaDoc[] terms = query.getTerms();
102     HashSet highlight = new HashSet(); // put query terms in table
103
for (int i = 0; i < terms.length; i++)
104       highlight.add(terms[i]);
105
106     //
107
// Create a SortedSet that ranks excerpts according to
108
// how many query terms are present. An excerpt is
109
// a Vector full of Fragments and Highlights
110
//
111
SortedSet excerptSet = new TreeSet(new Comparator() {
112         public int compare(Object JavaDoc o1, Object JavaDoc o2) {
113             Excerpt excerpt1 = (Excerpt) o1;
114             Excerpt excerpt2 = (Excerpt) o2;
115
116             if (excerpt1 == null && excerpt2 != null) {
117                 return -1;
118             } else if (excerpt1 != null && excerpt2 == null) {
119                 return 1;
120             } else if (excerpt1 == null && excerpt2 == null) {
121                 return 0;
122             }
123
124             int numToks1 = excerpt1.numUniqueTokens();
125             int numToks2 = excerpt2.numUniqueTokens();
126
127             if (numToks1 < numToks2) {
128                 return -1;
129             } else if (numToks1 == numToks2) {
130                 return excerpt1.numFragments() - excerpt2.numFragments();
131             } else {
132                 return 1;
133             }
134         }
135     }
136         );
137
138     //
139
// Iterate through all terms in the document
140
//
141
int lastExcerptPos = 0;
142     for (int i = 0; i < tokens.length; i++) {
143       //
144
// If we find a term that's in the query...
145
//
146
if (highlight.contains(tokens[i].termText())) {
147         //
148
// Start searching at a point SUM_CONTEXT terms back,
149
// and move SUM_CONTEXT terms into the future.
150
//
151
int startToken = (i > SUM_CONTEXT) ? i-SUM_CONTEXT : 0;
152         int endToken = Math.min(i+SUM_CONTEXT, tokens.length);
153         int offset = tokens[startToken].startOffset();
154         int j = startToken;
155
156         //
157
// Iterate from the start point to the finish, adding
158
// terms all the way. The end of the passage is always
159
// SUM_CONTEXT beyond the last query-term.
160
//
161
Excerpt excerpt = new Excerpt();
162         if (i != 0) {
163             excerpt.add(new Summary.Ellipsis());
164         }
165
166         //
167
// Iterate through as long as we're before the end of
168
// the document and we haven't hit the max-number-of-items
169
// -in-a-summary.
170
//
171
while ((j < endToken) && (j - startToken < SUM_LENGTH)) {
172           //
173
// Now grab the hit-element, if present
174
//
175
Token t = tokens[j];
176           if (highlight.contains(t.termText())) {
177             excerpt.addToken(t.termText());
178             excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
179             excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset())));
180             offset = t.endOffset();
181             endToken = Math.min(j+SUM_CONTEXT, tokens.length);
182           }
183
184           j++;
185         }
186
187         lastExcerptPos = endToken;
188
189         //
190
// We found the series of search-term hits and added
191
// them (with intervening text) to the excerpt. Now
192
// we need to add the trailing edge of text.
193
//
194
// So if (j < tokens.length) then there is still trailing
195
// text to add. (We haven't hit the end of the source doc.)
196
// Add the words since the last hit-term insert.
197
//
198
if (j < tokens.length) {
199           excerpt.add(new Fragment(text.substring(offset,tokens[j].endOffset())));
200         }
201
202         //
203
// Remember how many terms are in this excerpt
204
//
205
excerpt.setNumTerms(j - startToken);
206
207         //
208
// Store the excerpt for later sorting
209
//
210
excerptSet.add(excerpt);
211
212         //
213
// Start SUM_CONTEXT places away. The next
214
// search for relevant excerpts begins at i-SUM_CONTEXT
215
//
216
i = j+SUM_CONTEXT;
217       }
218     }
219
220     //
221
// If the target text doesn't appear, then we just
222
// excerpt the first SUM_LENGTH words from the document.
223
//
224
if (excerptSet.size() == 0) {
225         Excerpt excerpt = new Excerpt();
226         int excerptLen = Math.min(SUM_LENGTH, tokens.length);
227         lastExcerptPos = excerptLen;
228
229         excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
230         excerpt.setNumTerms(excerptLen);
231         excerptSet.add(excerpt);
232     }
233
234     //
235
// Now choose the best items from the excerpt set.
236
// Stop when our Summary grows too large.
237
//
238
double tokenCount = 0;
239     Summary s = new Summary();
240     while (tokenCount <= SUM_LENGTH && excerptSet.size() > 0) {
241         Excerpt excerpt = (Excerpt) excerptSet.last();
242         excerptSet.remove(excerpt);
243
244         double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
245         for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
246             Fragment f = (Fragment) e.nextElement();
247             // Don't add fragments if it takes us over the max-limit
248
if (tokenCount + tokenFraction <= SUM_LENGTH) {
249                 s.add(f);
250             }
251             tokenCount += tokenFraction;
252         }
253     }
254     
255     if (tokenCount > 0 && lastExcerptPos < tokens.length)
256       s.add(new Ellipsis());
257     return s;
258   }
259
260   private Token[] getTokens(String JavaDoc text) throws IOException {
261     ArrayList result = new ArrayList();
262     TokenStream ts = ANALYZER.tokenStream("content", new StringReader(text));
263     for (Token token = ts.next(); token != null; token = ts.next()) {
264       result.add(token);
265     }
266     return (Token[])result.toArray(new Token[result.size()]);
267   }
268
269     /**
270      * Tests Summary-generation. User inputs the name of a
271      * text file and a query string
272      */

273     public static void main(String JavaDoc argv[]) throws IOException {
274         // Test arglist
275
if (argv.length < 2) {
276             System.out.println("Usage: java net.nutch.searcher.Summarizer <textfile> <queryStr>");
277             return;
278         }
279
280         Summarizer s = new Summarizer();
281
282         //
283
// Parse the args
284
//
285
File textFile = new File(argv[0]);
286         StringBuffer JavaDoc queryBuf = new StringBuffer JavaDoc();
287         for (int i = 1; i < argv.length; i++) {
288             queryBuf.append(argv[i]);
289             queryBuf.append(" ");
290         }
291
292         //
293
// Load the text file into a single string.
294
//
295
StringBuffer JavaDoc body = new StringBuffer JavaDoc();
296         BufferedReader in = new BufferedReader(new FileReader(textFile));
297         try {
298             System.out.println("About to read " + textFile + " from " + in);
299             String JavaDoc str = in.readLine();
300             while (str != null) {
301                 body.append(str);
302                 str = in.readLine();
303             }
304         } finally {
305             in.close();
306         }
307
308         // Convert the query string into a proper Query
309
Query query = Query.parse(queryBuf.toString());
310         System.out.println("Summary: '" + s.getSummary(body.toString(), query) + "'");
311     }
312 }
313
Popular Tags