KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > analysis > NutchDocumentTokenizer


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.analysis;
5
6 import java.io.*;
7
8 import org.apache.lucene.analysis.Tokenizer;
9 import org.apache.lucene.analysis.Token;
10
11 /** The tokenizer used for Nutch document text. Implemented in terms of our
12  * JavaCC-generated lexical analyzer, {@link NutchAnalysisTokenManager}, shared
13  * with the query parser.
14  */

15 public final class NutchDocumentTokenizer extends Tokenizer
16   implements NutchAnalysisConstants {
17   
18   private NutchAnalysisTokenManager tokenManager;
19
20   /** Construct a tokenizer for the text in a Reader. */
21   public NutchDocumentTokenizer(Reader reader) {
22     super(reader);
23     tokenManager = new NutchAnalysisTokenManager(reader);
24   }
25   
26   /** Returns the next token in the stream, or null at EOF. */
27   public final Token next() throws IOException {
28
29     net.nutch.analysis.Token t;
30
31     try {
32       loop: {
33         while (true) {
34           t = tokenManager.getNextToken();
35           switch (t.kind) { // skip query syntax tokens
36
case EOF: case WORD: case ACRONYM: case SIGRAM:
37             break loop;
38           default:
39           }
40         }
41       }
42     } catch (TokenMgrError e) { // translate exceptions
43
throw new IOException("Tokenizer error:" + e);
44     }
45
46     if (t.kind == EOF) // translate tokens
47
return null;
48     else {
49       return new Token(t.image,t.beginColumn,t.endColumn,tokenImage[t.kind]);
50     }
51   }
52
53   /** For debugging. */
54   public static void main(String JavaDoc[] args) throws Exception JavaDoc {
55     BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
56     while (true) {
57       System.out.print("Text: ");
58       String JavaDoc line = in.readLine();
59       Tokenizer tokenizer = new NutchDocumentTokenizer(new StringReader(line));
60       Token token;
61       System.out.print("Tokens: ");
62       while ((token = tokenizer.next()) != null) {
63         System.out.print(token.termText());
64         System.out.print(" ");
65       }
66       System.out.println();
67     }
68   }
69
70 }
71
Popular Tags