package net.nutch.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;

import java.io.*;
import java.util.*;
import java.util.logging.Logger;

import net.nutch.util.*;

import net.nutch.searcher.Query.*;

/**
 * Builds n-grams ("common grams") for frequently occurring terms while
 * tokenizing, and optimizes phrase queries to use those grams.
 *
 * <p>The set of common terms is loaded per field from the configuration
 * resource named by the {@code analysis.common.terms.file} property.  Each
 * line of that file holds a field name followed by one or more words; the
 * words are joined with {@link #SEPARATOR} to form the gram recognized for
 * that field.
 */
public class CommonGrams {
  private static final Logger LOG =
    LogFormatter.getLogger("net.nutch.analysis.CommonGrams");

  /** Character used to join adjacent terms into a gram token. */
  private static final char SEPARATOR = '-';

  /** Maps field name (String) to the HashSet of common terms/grams for it. */
  private static final HashMap COMMON_TERMS = new HashMap();

  static { init(); }

  /** Utility class: not instantiable. */
  private CommonGrams() {}

  /**
   * A {@link TokenFilter} that, whenever it sees a common term, also emits
   * gram tokens joining it with adjacent common terms.  Gram tokens are
   * emitted with a position increment of zero, overlapping the terms they
   * were built from.
   */
  private static class Filter extends TokenFilter {
    private HashSet common;          // common terms for this field; may be null
    private Token previous;          // last token returned to the caller
    private LinkedList gramQueue = new LinkedList();  // grams pending emission
    private LinkedList nextQueue = new LinkedList();  // lookahead buffer
    private StringBuffer buffer = new StringBuffer(); // scratch for gramToken

    /**
     * Wraps {@code input}, recognizing the given set of common terms.
     *
     * @param input  the upstream token stream
     * @param common the common terms for the current field, or null when the
     *               field has none (the filter then passes tokens through)
     */
    public Filter(TokenStream input, HashSet common) {
      super(input);
      this.common = common;
    }

    /**
     * Returns the next token, inserting gram tokens around runs of common
     * terms.
     *
     * @return the next token, or null at end of stream
     * @throws IOException propagated from the wrapped stream
     */
    public Token next() throws IOException {
      // Drain any grams queued by a previous call before reading further.
      if (gramQueue.size() != 0)
        return (Token)gramQueue.removeFirst();

      final Token token = popNext();
      if (token == null)
        return null;

      // Uncommon tokens pass straight through.
      if (!isCommon(token)) {
        previous = token;
        return token;
      }

      // Token is common: queue it, then extend it with following common
      // terms, also pairing it with the preceding uncommon term if any.
      gramQueue.add(token);
      ListIterator i = nextQueue.listIterator();
      Token gram = token;
      while (isCommon(gram)) {
        if (previous != null && !isCommon(previous))
          gramQueue.addFirst(gramToken(previous, gram));

        Token next = peekNext(i);
        if (next == null)
          break;

        gram = gramToken(gram, next);
        gramQueue.addLast(gram);
      }

      previous = token;
      return (Token)gramQueue.removeFirst();
    }

    /** True when the token's text is in this field's common-term set. */
    private boolean isCommon(Token token) {
      return common != null && common.contains(token.termText());
    }

    /** Returns the next input token, preferring buffered lookahead tokens. */
    private Token popNext() throws IOException {
      if (nextQueue.size() > 0)
        return (Token)nextQueue.removeFirst();
      else
        return input.next();
    }

    /**
     * Returns the next token from the lookahead iterator without consuming
     * it from the stream, pulling from {@code input} and buffering when the
     * iterator is exhausted.  Returns null at end of stream.
     */
    private Token peekNext(ListIterator i) throws IOException {
      if (!i.hasNext()) {
        Token next = input.next();
        if (next == null)
          return null;
        i.add(next);
        i.previous();                // step back so i.next() returns it
      }
      return (Token)i.next();
    }

    /**
     * Joins two tokens into a single gram token spanning both, typed
     * "gram" and overlapping its constituents (position increment 0).
     */
    private Token gramToken(Token first, Token second) {
      buffer.setLength(0);
      buffer.append(first.termText());
      buffer.append(SEPARATOR);
      buffer.append(second.termText());
      Token result = new Token(buffer.toString(),
                               first.startOffset(), second.endOffset(),
                               "gram");
      result.setPositionIncrement(0);
      return result;
    }
  }

  /**
   * Loads {@link #COMMON_TERMS} from the resource named by the
   * {@code analysis.common.terms.file} property.  Blank lines and lines
   * starting with '#' are skipped; each remaining line is tokenized, with
   * the first token taken as the field name and the rest joined by
   * {@link #SEPARATOR} into the gram.
   *
   * @throws RuntimeException wrapping any IOException while reading
   */
  private static void init() {
    try {
      Reader reader = NutchConf.getConfResourceAsReader
        (NutchConf.get("analysis.common.terms.file"));
      BufferedReader in = new BufferedReader(reader);
      try {
        String line;
        while ((line = in.readLine()) != null) {
          line = line.trim();
          if (line.startsWith("#") || "".equals(line)) // skip comments, blanks
            continue;
          TokenStream ts = new NutchDocumentTokenizer(new StringReader(line));
          Token token = ts.next();
          if (token == null) {
            LOG.warning("Line does not contain a field name: " + line);
            continue;
          }
          String field = token.termText();
          token = ts.next();
          if (token == null) {
            LOG.warning("Line contains only a field name, no word: " + line);
            continue;
          }
          String gram = token.termText();
          while ((token = ts.next()) != null) {
            gram = gram + SEPARATOR + token.termText();
          }
          HashSet table = (HashSet)COMMON_TERMS.get(field);
          if (table == null) {
            table = new HashSet();
            COMMON_TERMS.put(field, table);
          }
          table.add(gram);
        }
      } finally {
        in.close();                  // don't leak the config reader
      }
    } catch (IOException e) {
      // Chain the cause so the original stack trace is preserved.
      throw new RuntimeException(e);
    }
  }

  /**
   * Constructs a token filter that inserts n-grams for common terms.
   *
   * @param ts    the input token stream
   * @param field the field whose common-term set should be used
   */
  public static TokenFilter getFilter(TokenStream ts, String field) {
    return new Filter(ts, (HashSet)COMMON_TERMS.get(field));
  }

  /** Exposes a phrase's terms as a {@link TokenStream}, one per position. */
  private static class ArrayTokens extends TokenStream {
    private Term[] terms;
    private int index;

    public ArrayTokens(Phrase phrase) { this.terms = phrase.getTerms(); }

    public Token next() {
      if (index == terms.length)
        return null;
      // Offsets are term positions: [index, index+1).
      Token token = new Token(terms[index].toString(), index, index + 1);
      index++;
      return token;
    }
  }

  /**
   * Optimizes a phrase query to use n-grams where possible: runs the
   * phrase's terms through the common-gram filter and keeps, for each
   * position, the widest token covering it.
   *
   * @param phrase the phrase to optimize
   * @param field  the field the phrase is searched in
   * @return the optimized term texts, possibly containing grams
   */
  public static String[] optimizePhrase(Phrase phrase, String field) {
    ArrayList result = new ArrayList();
    TokenStream ts = getFilter(new ArrayTokens(phrase), field);
    Token token, prev = null;
    int position = 0;
    try {
      while ((token = ts.next()) != null) {
        // A nonzero increment means prev is the widest token at its position.
        if (token.getPositionIncrement() != 0 && prev != null)
          result.add(prev.termText());
        prev = token;
        position += token.getPositionIncrement();
        // Stop once the current token reaches the end of the phrase.
        if ((position + arity(token.termText())) == phrase.getTerms().length)
          break;
      }
    } catch (IOException e) {
      // Chain the cause so the original stack trace is preserved.
      throw new RuntimeException(e);
    }
    if (prev != null)
      result.add(prev.termText());

    return (String[])result.toArray(new String[result.size()]);
  }

  /**
   * Returns the number of {@link #SEPARATOR} characters in a gram, i.e. how
   * many extra term positions it spans beyond its first.
   */
  private static int arity(String gram) {
    int index = 0;
    int arity = 0;
    while ((index = gram.indexOf(SEPARATOR, index + 1)) != -1) {
      arity++;
    }
    return arity;
  }

  /**
   * Command-line demo: tokenizes the arguments with the "url" field's
   * common grams, then prints the optimized phrase for them.
   */
  public static void main(String[] args) throws Exception {
    StringBuffer text = new StringBuffer();
    for (int i = 0; i < args.length; i++) {
      text.append(args[i]);
      text.append(' ');
    }
    TokenStream ts =
      new NutchDocumentTokenizer(new StringReader(text.toString()));
    ts = getFilter(ts, "url");
    Token token;
    while ((token = ts.next()) != null) {
      System.out.println("Token: " + token);
    }
    String[] optimized = optimizePhrase(new Phrase(args), "url");
    System.out.print("Optimized: ");
    for (int i = 0; i < optimized.length; i++) {
      System.out.print(optimized[i] + " ");
    }
    System.out.println();
  }

}