package net.nutch.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;

import java.io.*;
import java.util.*;
import java.util.logging.Logger;

import net.nutch.util.*;

import net.nutch.searcher.Query.*;

/**
 * Builds n-grams ("common grams") for frequently occurring terms while
 * tokenizing, and optimizes phrase queries to use those grams.
 *
 * <p>The set of common terms is loaded per field from the configuration
 * resource named by the {@code analysis.common.terms.file} property.  Each
 * line of that file holds a field name followed by one or more words; the
 * words are joined with {@link #SEPARATOR} to form the gram recognized for
 * that field.
 */
public class CommonGrams {
  private static final Logger LOG =
    LogFormatter.getLogger("net.nutch.analysis.CommonGrams");

  /** Character used to join adjacent terms into a gram token. */
  private static final char SEPARATOR = '-';

  /** Maps field name (String) to the HashSet of common terms/grams for it. */
  private static final HashMap COMMON_TERMS = new HashMap();

  static { init(); }

  /** Utility class: not instantiable. */
  private CommonGrams() {}

  /**
   * A {@link TokenFilter} that, whenever it sees a common term, also emits
   * gram tokens joining it with adjacent common terms.  Gram tokens are
   * emitted with a position increment of zero, overlapping the terms they
   * were built from.
   */
  private static class Filter extends TokenFilter {
    private HashSet common;          // common terms for this field; may be null
    private Token previous;          // last token returned to the caller
    private LinkedList gramQueue = new LinkedList();  // grams pending emission
    private LinkedList nextQueue = new LinkedList();  // lookahead buffer
    private StringBuffer buffer = new StringBuffer(); // scratch for gramToken

    /**
     * Wraps {@code input}, recognizing the given set of common terms.
     *
     * @param input  the upstream token stream
     * @param common the common terms for the current field, or null when the
     *               field has none (the filter then passes tokens through)
     */
    public Filter(TokenStream input, HashSet common) {
      super(input);
      this.common = common;
    }

    /**
     * Returns the next token, inserting gram tokens around runs of common
     * terms.
     *
     * @return the next token, or null at end of stream
     * @throws IOException propagated from the wrapped stream
     */
    public Token next() throws IOException {
      // Drain any grams queued by a previous call before reading further.
      if (gramQueue.size() != 0)
        return (Token)gramQueue.removeFirst();

      final Token token = popNext();
      if (token == null)
        return null;

      // Uncommon tokens pass straight through.
      if (!isCommon(token)) {
        previous = token;
        return token;
      }

      // Token is common: queue it, then extend it with following common
      // terms, also pairing it with the preceding uncommon term if any.
      gramQueue.add(token);
      ListIterator i = nextQueue.listIterator();
      Token gram = token;
      while (isCommon(gram)) {
        if (previous != null && !isCommon(previous))
          gramQueue.addFirst(gramToken(previous, gram));

        Token next = peekNext(i);
        if (next == null)
          break;

        gram = gramToken(gram, next);
        gramQueue.addLast(gram);
      }

      previous = token;
      return (Token)gramQueue.removeFirst();
    }

    /** True when the token's text is in this field's common-term set. */
    private boolean isCommon(Token token) {
      return common != null && common.contains(token.termText());
    }

    /** Returns the next input token, preferring buffered lookahead tokens. */
    private Token popNext() throws IOException {
      if (nextQueue.size() > 0)
        return (Token)nextQueue.removeFirst();
      else
        return input.next();
    }

    /**
     * Returns the next token from the lookahead iterator without consuming
     * it from the stream, pulling from {@code input} and buffering when the
     * iterator is exhausted.  Returns null at end of stream.
     */
    private Token peekNext(ListIterator i) throws IOException {
      if (!i.hasNext()) {
        Token next = input.next();
        if (next == null)
          return null;
        i.add(next);
        i.previous();                // step back so i.next() returns it
      }
      return (Token)i.next();
    }

    /**
     * Joins two tokens into a single gram token spanning both, typed
     * "gram" and overlapping its constituents (position increment 0).
     */
    private Token gramToken(Token first, Token second) {
      buffer.setLength(0);
      buffer.append(first.termText());
      buffer.append(SEPARATOR);
      buffer.append(second.termText());
      Token result = new Token(buffer.toString(),
                               first.startOffset(), second.endOffset(),
                               "gram");
      result.setPositionIncrement(0);
      return result;
    }
  }

  /**
   * Loads {@link #COMMON_TERMS} from the resource named by the
   * {@code analysis.common.terms.file} property.  Blank lines and lines
   * starting with '#' are skipped; each remaining line is tokenized, with
   * the first token taken as the field name and the rest joined by
   * {@link #SEPARATOR} into the gram.
   *
   * @throws RuntimeException wrapping any IOException while reading
   */
  private static void init() {
    try {
      Reader reader = NutchConf.getConfResourceAsReader
        (NutchConf.get("analysis.common.terms.file"));
      BufferedReader in = new BufferedReader(reader);
      try {
        String line;
        while ((line = in.readLine()) != null) {
          line = line.trim();
          if (line.startsWith("#") || "".equals(line)) // skip comments, blanks
            continue;
          TokenStream ts = new NutchDocumentTokenizer(new StringReader(line));
          Token token = ts.next();
          if (token == null) {
            LOG.warning("Line does not contain a field name: " + line);
            continue;
          }
          String field = token.termText();
          token = ts.next();
          if (token == null) {
            LOG.warning("Line contains only a field name, no word: " + line);
            continue;
          }
          String gram = token.termText();
          while ((token = ts.next()) != null) {
            gram = gram + SEPARATOR + token.termText();
          }
          HashSet table = (HashSet)COMMON_TERMS.get(field);
          if (table == null) {
            table = new HashSet();
            COMMON_TERMS.put(field, table);
          }
          table.add(gram);
        }
      } finally {
        in.close();                  // don't leak the config reader
      }
    } catch (IOException e) {
      // Chain the cause so the original stack trace is preserved.
      throw new RuntimeException(e);
    }
  }

  /**
   * Constructs a token filter that inserts n-grams for common terms.
   *
   * @param ts    the input token stream
   * @param field the field whose common-term set should be used
   */
  public static TokenFilter getFilter(TokenStream ts, String field) {
    return new Filter(ts, (HashSet)COMMON_TERMS.get(field));
  }

  /** Exposes a phrase's terms as a {@link TokenStream}, one per position. */
  private static class ArrayTokens extends TokenStream {
    private Term[] terms;
    private int index;

    public ArrayTokens(Phrase phrase) { this.terms = phrase.getTerms(); }

    public Token next() {
      if (index == terms.length)
        return null;
      // Offsets are term positions: [index, index+1).
      Token token = new Token(terms[index].toString(), index, index + 1);
      index++;
      return token;
    }
  }

  /**
   * Optimizes a phrase query to use n-grams where possible: runs the
   * phrase's terms through the common-gram filter and keeps, for each
   * position, the widest token covering it.
   *
   * @param phrase the phrase to optimize
   * @param field  the field the phrase is searched in
   * @return the optimized term texts, possibly containing grams
   */
  public static String[] optimizePhrase(Phrase phrase, String field) {
    ArrayList result = new ArrayList();
    TokenStream ts = getFilter(new ArrayTokens(phrase), field);
    Token token, prev = null;
    int position = 0;
    try {
      while ((token = ts.next()) != null) {
        // A nonzero increment means prev is the widest token at its position.
        if (token.getPositionIncrement() != 0 && prev != null)
          result.add(prev.termText());
        prev = token;
        position += token.getPositionIncrement();
        // Stop once the current token reaches the end of the phrase.
        if ((position + arity(token.termText())) == phrase.getTerms().length)
          break;
      }
    } catch (IOException e) {
      // Chain the cause so the original stack trace is preserved.
      throw new RuntimeException(e);
    }
    if (prev != null)
      result.add(prev.termText());

    return (String[])result.toArray(new String[result.size()]);
  }

  /**
   * Returns the number of {@link #SEPARATOR} characters in a gram, i.e. how
   * many extra term positions it spans beyond its first.
   */
  private static int arity(String gram) {
    int index = 0;
    int arity = 0;
    while ((index = gram.indexOf(SEPARATOR, index + 1)) != -1) {
      arity++;
    }
    return arity;
  }

  /**
   * Command-line demo: tokenizes the arguments with the "url" field's
   * common grams, then prints the optimized phrase for them.
   */
  public static void main(String[] args) throws Exception {
    StringBuffer text = new StringBuffer();
    for (int i = 0; i < args.length; i++) {
      text.append(args[i]);
      text.append(' ');
    }
    TokenStream ts =
      new NutchDocumentTokenizer(new StringReader(text.toString()));
    ts = getFilter(ts, "url");
    Token token;
    while ((token = ts.next()) != null) {
      System.out.println("Token: " + token);
    }
    String[] optimized = optimizePhrase(new Phrase(args), "url");
    System.out.print("Optimized: ");
    for (int i = 0; i < optimized.length; i++) {
      System.out.print(optimized[i] + " ");
    }
    System.out.println();
  }

}