1 2 3 4 package net.nutch.analysis; 5 6 import java.io.*; 7 8 import org.apache.lucene.analysis.Tokenizer; 9 import org.apache.lucene.analysis.Token; 10 11 15 public final class NutchDocumentTokenizer extends Tokenizer 16 implements NutchAnalysisConstants { 17 18 private NutchAnalysisTokenManager tokenManager; 19 20 21 public NutchDocumentTokenizer(Reader reader) { 22 super(reader); 23 tokenManager = new NutchAnalysisTokenManager(reader); 24 } 25 26 27 public final Token next() throws IOException { 28 29 net.nutch.analysis.Token t; 30 31 try { 32 loop: { 33 while (true) { 34 t = tokenManager.getNextToken(); 35 switch (t.kind) { case EOF: case WORD: case ACRONYM: case SIGRAM: 37 break loop; 38 default: 39 } 40 } 41 } 42 } catch (TokenMgrError e) { throw new IOException("Tokenizer error:" + e); 44 } 45 46 if (t.kind == EOF) return null; 48 else { 49 return new Token(t.image,t.beginColumn,t.endColumn,tokenImage[t.kind]); 50 } 51 } 52 53 54 public static void main(String [] args) throws Exception { 55 BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); 56 while (true) { 57 System.out.print("Text: "); 58 String line = in.readLine(); 59 Tokenizer tokenizer = new NutchDocumentTokenizer(new StringReader(line)); 60 Token token; 61 System.out.print("Tokens: "); 62 while ((token = tokenizer.next()) != null) { 63 System.out.print(token.termText()); 64 System.out.print(" "); 65 } 66 System.out.println(); 67 } 68 } 69 70 } 71 | Popular Tags |