/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;

import java.io.*;
import java.util.*;
import java.util.logging.Logger;

import net.nutch.util.*;

import net.nutch.searcher.Query.*;
/** Construct n-grams for frequently occurring terms and phrases while indexing.
 * Optimize phrase queries to use the n-grams. Single terms are still indexed
 * too, with n-grams overlaid.  This is achieved through the use of {@link
 * Token#setPositionIncrement(int)}. */

public class CommonGrams {
  private static final Logger LOG =
    LogFormatter.getLogger("net.nutch.analysis.CommonGrams");
  private static final char SEPARATOR = '-';
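  // Added commentary: maps a field name (String) to the HashSet of common
  // terms and grams configured for that field; populated once by init().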
  private static final HashMap COMMON_TERMS = new HashMap();

  static { init(); }

  private CommonGrams() {}                  // no public ctor

  private static class Filter extends TokenFilter {
    private HashSet common;
    private Token previous;
    private LinkedList gramQueue = new LinkedList();
    private LinkedList nextQueue = new LinkedList();
    private StringBuffer buffer = new StringBuffer();

    /** Construct an n-gram producing filter. */
    public Filter(TokenStream input, HashSet common) {
      super(input);
      this.common = common;
    }

    /** Inserts n-grams into a token stream. */
    public Token next() throws IOException {
      if (gramQueue.size() != 0)                // consume any queued tokens
        return (Token)gramQueue.removeFirst();

      final Token token = popNext();
      if (token == null)
        return null;

      if (!isCommon(token)) {                   // optimize simple case
        previous = token;
        return token;
      }

      gramQueue.add(token);                     // queue the token

      ListIterator i = nextQueue.listIterator();
      Token gram = token;
      while (isCommon(gram)) {
        if (previous != null && !isCommon(previous)) // queue prev gram first
          gramQueue.addFirst(gramToken(previous, gram));

        Token next = peekNext(i);
        if (next == null)
          break;

        gram = gramToken(gram, next);           // queue next gram last
        gramQueue.addLast(gram);
      }

      previous = token;
      return (Token)gramQueue.removeFirst();
    }
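
    // Worked example (added commentary, not from the original source):
    // assuming "the" is the only common term for the field, the input
    // "eat the fish" is emitted as
    //   eat(+1), eat-the(+0), the(+1), the-fish(+0), fish(+1)
    // where (+n) is the position increment.  Each gram occupies the same
    // position as the first term it was built from, so plain terms remain
    // searchable at their normal positions with the grams overlaid.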

    /** True iff token is for a common term. */
    private boolean isCommon(Token token) {
      return common != null && common.contains(token.termText());
    }

    /** Pops nextQueue or, if empty, reads a new token. */
    private Token popNext() throws IOException {
      if (nextQueue.size() > 0)
        return (Token)nextQueue.removeFirst();
      else
        return input.next();
    }

    /** Return next token in nextQueue, extending it when empty. */
    private Token peekNext(ListIterator i) throws IOException {
      if (!i.hasNext()) {
        Token next = input.next();
        if (next == null)
          return null;
        i.add(next);
        i.previous();
      }
      return (Token)i.next();
    }

    /** Construct a compound token. */
    private Token gramToken(Token first, Token second) {
      buffer.setLength(0);
      buffer.append(first.termText());
      buffer.append(SEPARATOR);
      buffer.append(second.termText());
      Token result = new Token(buffer.toString(),
                               first.startOffset(), second.endOffset(),
                               "gram");
      result.setPositionIncrement(0);
      return result;
    }
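
    // Added note: the zero position increment makes the gram occupy the same
    // position as the token emitted just before it, while its offsets still
    // span from the first term's start to the second term's end, e.g. the
    // gram "the-fish" covers the source text of both underlying words.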
  }

  /** Initialize the common-terms table from the configured file. */
  private static void init() {
    try {
      Reader reader = NutchConf.getConfResourceAsReader
        (NutchConf.get("analysis.common.terms.file"));
      BufferedReader in = new BufferedReader(reader);
      String line;
      while ((line = in.readLine()) != null) {
        line = line.trim();
        if (line.startsWith("#") || "".equals(line)) // skip comments & blanks
          continue;

        TokenStream ts = new NutchDocumentTokenizer(new StringReader(line));
        Token token = ts.next();
        if (token == null) {
          LOG.warning("Line does not contain a field name: " + line);
          continue;
        }
        String field = token.termText();
        token = ts.next();
        if (token == null) {
          LOG.warning("Line contains only a field name, no word: " + line);
          continue;
        }
        String gram = token.termText();
        while ((token = ts.next()) != null) {
          gram = gram + SEPARATOR + token.termText();
        }
        HashSet table = (HashSet)COMMON_TERMS.get(field);
        if (table == null) {
          table = new HashSet();
          COMMON_TERMS.put(field, table);
        }
        table.add(gram);
      }
    } catch (IOException e) {
      throw new RuntimeException(e.toString());
    }
  }
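
  // Illustrative file format (inferred from the parsing above, not from an
  // actual analysis.common.terms.file): each non-comment line is tokenized;
  // the first token names the field, and any remaining tokens are joined
  // with SEPARATOR to form the common term or gram for that field.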

  /** Construct a token filter that inserts n-grams for common terms.  For use
   * while indexing documents. */
  public static TokenFilter getFilter(TokenStream ts, String field) {
    return new Filter(ts, (HashSet)COMMON_TERMS.get(field));
  }

  /** Utility to convert an array of Query.Terms into a token stream. */
  private static class ArrayTokens extends TokenStream {
    private Term[] terms;
    private int index;

    public ArrayTokens(Phrase phrase) { this.terms = phrase.getTerms(); }

    public Token next() {
      if (index == terms.length)
        return null;
      else
        return new Token(terms[index].toString(), index, ++index);
    }
  }
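
  // Added note: since Java evaluates arguments left to right, the Token above
  // gets startOffset == index and (via the ++index side effect) endOffset ==
  // index + 1, so offsets simply number the phrase terms rather than pointing
  // into any source text.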

  /** Optimizes phrase queries to use n-grams when possible. */
  public static String[] optimizePhrase(Phrase phrase, String field) {
    //LOG.info("Optimizing " + phrase + " for " + field);
    ArrayList result = new ArrayList();
    TokenStream ts = getFilter(new ArrayTokens(phrase), field);
    Token token, prev = null;
    int position = 0;
    try {
      while ((token = ts.next()) != null) {
        if (token.getPositionIncrement() != 0 && prev != null)
          result.add(prev.termText());
        prev = token;
        position += token.getPositionIncrement();
        if ((position + arity(token.termText())) == phrase.getTerms().length)
          break;
      }
    } catch (IOException e) {
      throw new RuntimeException(e.toString());
    }
    if (prev != null)
      result.add(prev.termText());

    // LOG.info("Optimized: ");
    // for (int i = 0; i < result.size(); i++) {
    //   LOG.info(result.get(i) + " ");
    // }

    return (String[])result.toArray(new String[result.size()]);
  }
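
  // Worked example (added commentary, assuming "the" is common for the
  // field): for the phrase "eat the fish" the filter emits
  //   eat(+1), eat-the(+0), the(+1), the-fish(+0), fish(+1)
  // and the loop keeps only the last token seen at each position, stopping
  // once position + arity reaches the phrase length.  The result is
  //   { "eat-the", "the-fish" }
  // a shorter phrase query over rarer compound terms.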

  private static int arity(String gram) {
    int index = 0;
    int arity = 0;
    while ((index = gram.indexOf(SEPARATOR, index+1)) != -1) {
      arity++;
    }
    return arity;
  }
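
  // Added note: arity counts the SEPARATOR characters in a gram, e.g.
  // arity("eat") == 0 and arity("eat-the") == 1, i.e. the number of extra
  // phrase positions a gram token spans beyond its own.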

  /** For debugging. */
  public static void main(String[] args) throws Exception {
    StringBuffer text = new StringBuffer();
    for (int i = 0; i < args.length; i++) {
      text.append(args[i]);
      text.append(' ');
    }
    TokenStream ts =
      new NutchDocumentTokenizer(new StringReader(text.toString()));
    ts = getFilter(ts, "url");
    Token token;
    while ((token = ts.next()) != null) {
      System.out.println("Token: " + token);
    }
    String[] optimized = optimizePhrase(new Phrase(args), "url");
    System.out.print("Optimized: ");
    for (int i = 0; i < optimized.length; i++) {
      System.out.print(optimized[i] + " ");
    }
    System.out.println();
  }
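
  // Example invocation (classpath setup omitted):
  //   java net.nutch.analysis.CommonGrams eat the fish
  // prints each token produced for the "url" field, followed by the
  // optimized phrase terms.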

}