package net.nutch.searcher;

import java.io.*;
import java.util.*;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

import net.nutch.util.NutchConf;
import net.nutch.searcher.Summary.*;
import net.nutch.analysis.NutchDocumentAnalyzer;

/** Implements hit summarization: extracts query-term excerpts from text. */
public class Summarizer {

  /** The number of context terms to show before and after each match. */
  private static final int SUM_CONTEXT =
    NutchConf.getInt("searcher.summary.context", 5);

  /** The total number of terms to show in a summary. */
  private static final int SUM_LENGTH =
    NutchConf.getInt("searcher.summary.length", 20);

  /** Converts text to a token stream. */
  private static final Analyzer ANALYZER = new NutchDocumentAnalyzer();

  /**
   * A single passage found in the document, with query terms highlighted.
   */
  class Excerpt {
    Vector passages = new Vector();
    SortedSet tokenSet = new TreeSet();
    int numTerms = 0;

    public Excerpt() {
    }

    /** Records a query term that appears in this excerpt. */
    public void addToken(String token) {
      tokenSet.add(token);
    }

    /** Returns how many unique query terms this excerpt contains. */
    public int numUniqueTokens() {
      return tokenSet.size();
    }

    /** Returns how many fragments this excerpt contains. */
    public int numFragments() {
      return passages.size();
    }

    public void setNumTerms(int numTerms) {
      this.numTerms = numTerms;
    }

    public int getNumTerms() {
      return numTerms;
    }

    /** Adds a fragment to the list. */
    public void add(Fragment fragment) {
      passages.add(fragment);
    }

    /** Returns an Enumeration over all fragments. */
    public Enumeration elements() {
      return passages.elements();
    }
  }

  /** Returns a summary of the given text for the given query. */
  public Summary getSummary(String text, Query query) throws IOException {

    // Simplistic implementation: find the first fragments in the document
    // that contain any query terms.
    Token[] tokens = getTokens(text);             // parse text into tokens
    if (tokens.length == 0)
      return new Summary();

    String[] terms = query.getTerms();
    HashSet highlight = new HashSet();            // put query terms in a table
    for (int i = 0; i < terms.length; i++)
      highlight.add(terms[i]);

    // Rank excerpts by how many unique query terms they contain,
    // breaking ties by the number of fragments.
    SortedSet excerptSet = new TreeSet(new Comparator() {
      public int compare(Object o1, Object o2) {
        Excerpt excerpt1 = (Excerpt) o1;
        Excerpt excerpt2 = (Excerpt) o2;

        if (excerpt1 == null && excerpt2 != null) {
          return -1;
        } else if (excerpt1 != null && excerpt2 == null) {
          return 1;
        } else if (excerpt1 == null && excerpt2 == null) {
          return 0;
        }

        int numToks1 = excerpt1.numUniqueTokens();
        int numToks2 = excerpt2.numUniqueTokens();

        if (numToks1 < numToks2) {
          return -1;
        } else if (numToks1 == numToks2) {
          return excerpt1.numFragments() - excerpt2.numFragments();
        } else {
          return 1;
        }
      }
    });

    // Scan the document for query-term hits.
    int lastExcerptPos = 0;
    for (int i = 0; i < tokens.length; i++) {
      if (highlight.contains(tokens[i].termText())) {
        // Start the excerpt SUM_CONTEXT tokens before the hit (or at the
        // beginning of the text) and extend it SUM_CONTEXT tokens past it.
        int startToken = (i > SUM_CONTEXT) ? i - SUM_CONTEXT : 0;
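        // Each further query-term match found inside the window pushes
        // endToken out another SUM_CONTEXT tokens, so a single excerpt can
        // absorb a whole cluster of nearby hits; the while-loop below also
        // caps the excerpt at SUM_LENGTH tokens.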
        int endToken = Math.min(i + SUM_CONTEXT, tokens.length);
        int offset = tokens[startToken].startOffset();
        int j = startToken;

        Excerpt excerpt = new Excerpt();
        if (i != 0) {
          excerpt.add(new Summary.Ellipsis());
        }

        // Walk through the window, emitting plain Fragments for the text
        // between hits and Highlights for the hits themselves.
        while ((j < endToken) && (j - startToken < SUM_LENGTH)) {
          Token t = tokens[j];
          if (highlight.contains(t.termText())) {
            excerpt.addToken(t.termText());
            excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
            excerpt.add(new Highlight(text.substring(t.startOffset(), t.endOffset())));
            offset = t.endOffset();
            endToken = Math.min(j + SUM_CONTEXT, tokens.length);
          }

          j++;
        }

        lastExcerptPos = endToken;

        // Add the trailing edge of text, i.e. the words since the last
        // highlighted term, unless we ran off the end of the document.
        if (j < tokens.length) {
          excerpt.add(new Fragment(text.substring(offset, tokens[j].endOffset())));
        }

        // Remember how many terms this excerpt covers, then store it for
        // later ranking.
        excerpt.setNumTerms(j - startToken);
        excerptSet.add(excerpt);

        // Resume scanning SUM_CONTEXT tokens past the end of this excerpt.
        i = j + SUM_CONTEXT;
      }
    }

    // If no query term appears in the text, excerpt the first SUM_LENGTH
    // tokens of the document instead.
    if (excerptSet.size() == 0) {
      Excerpt excerpt = new Excerpt();
      int excerptLen = Math.min(SUM_LENGTH, tokens.length);
      lastExcerptPos = excerptLen;

      excerpt.add(new Fragment(text.substring(tokens[0].startOffset(),
                                              tokens[excerptLen-1].startOffset())));
      excerpt.setNumTerms(excerptLen);
      excerptSet.add(excerpt);
    }

    // Take the best excerpts until the summary grows too large.
    double tokenCount = 0;
    Summary s = new Summary();
    while (tokenCount <= SUM_LENGTH && excerptSet.size() > 0) {
      Excerpt excerpt = (Excerpt) excerptSet.last();
      excerptSet.remove(excerpt);

      double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
      for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
        Fragment f = (Fragment) e.nextElement();
        // Don't add a fragment if it takes us over the length limit.
        if (tokenCount + tokenFraction <= SUM_LENGTH) {
          s.add(f);
        }
        tokenCount += tokenFraction;
      }
    }

    if (tokenCount > 0 && lastExcerptPos < tokens.length)
      s.add(new Ellipsis());
    return s;
  }

  /** Tokenizes text with the document analyzer. */
  private Token[] getTokens(String text) throws IOException {
    ArrayList result = new ArrayList();
    TokenStream ts = ANALYZER.tokenStream("content", new StringReader(text));
    for (Token token = ts.next(); token != null; token = ts.next()) {
      result.add(token);
    }
    return (Token[]) result.toArray(new Token[result.size()]);
  }

  /**
   * Tests summary generation: reads a text file and summarizes it
   * for the given query string.
   */
  public static void main(String argv[]) throws IOException {
    if (argv.length < 2) {
      System.out.println("Usage: java net.nutch.searcher.Summarizer <textfile> <queryStr>");
      return;
    }

    Summarizer s = new Summarizer();

    // The first arg is the text file; the remaining args form the query.
    File textFile = new File(argv[0]);
    StringBuffer queryBuf = new StringBuffer();
    for (int i = 1; i < argv.length; i++) {
      queryBuf.append(argv[i]);
      queryBuf.append(" ");
    }

    // Load the text file into a single string.
    StringBuffer body = new StringBuffer();
    BufferedReader in = new BufferedReader(new FileReader(textFile));
    try {
      System.out.println("About to read " + textFile + " from " + in);
      String str = in.readLine();
      while (str != null) {
        body.append(str);
        str = in.readLine();
      }
    } finally {
      in.close();
    }

    // Parse the query and print the resulting summary.
    Query query = Query.parse(queryBuf.toString());
    System.out.println("Summary: '" + s.getSummary(body.toString(), query) + "'");
  }
}
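// Example invocation (a sketch, assuming a Nutch 0.x classpath with Lucene
// and the Nutch configuration available; the file name is hypothetical):
//
//   java net.nutch.searcher.Summarizer doc.txt nutch search
//
// This tokenizes doc.txt, builds excerpts around occurrences of "nutch" and
// "search", and prints the highest-ranked fragments as the summary.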