NutchDocumentTokenizer


1   /* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
2   /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3   
4   package net.nutch.analysis;
5   
6   import java.io.*;
7   
8   import org.apache.lucene.analysis.Tokenizer;
9   import org.apache.lucene.analysis.Token;
10  
11  /** The tokenizer used for Nutch document text.  Implemented in terms of our
12   * JavaCC-generated lexical analyzer, {@link NutchAnalysisTokenManager}, shared
13   * with the query parser.
14   */
15  public final class NutchDocumentTokenizer extends Tokenizer
16    implements NutchAnalysisConstants {
17    
18    private NutchAnalysisTokenManager tokenManager;
19  
20    /** Construct a tokenizer for the text in a Reader. */
21    public NutchDocumentTokenizer(Reader reader) {
22      super(reader);
23      tokenManager = new NutchAnalysisTokenManager(reader); 
24    }
25    
26    /** Returns the next token in the stream, or null at EOF. */
27    public final Token next() throws IOException {
28  
29      net.nutch.analysis.Token t;
30  
31      try {
32        loop: {
33          while (true) {
34            t = tokenManager.getNextToken();
35            switch (t.kind) {                       // skip query syntax tokens
36            case EOF: case WORD: case ACRONYM: case SIGRAM:
37              break loop;
38            default:
39            }
40          }
41        }
42      } catch (TokenMgrError e) {                   // translate exceptions
43        throw new IOException("Tokenizer error:" + e);
44      }
45  
46      if (t.kind == EOF)                            // translate tokens
47        return null;
48      else {
49        return new Token(t.image,t.beginColumn,t.endColumn,tokenImage[t.kind]);
50      }
51    }
52  
53    /** For debugging. */
54    public static void main(String  [] args) throws Exception   {
55      BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
56      while (true) {
57        System.out.print("Text: ");
58        String   line = in.readLine();
59        Tokenizer tokenizer = new NutchDocumentTokenizer(new StringReader(line));
60        Token token;
61        System.out.print("Tokens: ");
62        while ((token = tokenizer.next()) != null) {
63          System.out.print(token.termText());
64          System.out.print(" ");
65        }
66        System.out.println();
67      }
68    }
69  
70  }
71
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags