KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > snipsnap > snip > SnipTokenizer


package org.snipsnap.snip;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

import java.io.IOException;
import java.io.Reader;
8 /**
9  * Splits reader input into tokens.
10  * Tokens are made out of letters
11  * and digits
12  *
13  * @author stephan
14  * @version $Id: SnipTokenizer.java 864 2003-05-23 10:47:26Z stephan $
15  **/

16
17 public final class SnipTokenizer extends Tokenizer {
18
19   public SnipTokenizer(String JavaDoc field, Reader JavaDoc in) {
20     input = in;
21   }
22
23   private int offset = 0, bufferIndex = 0, dataLen = 0;
24   private final static int MAX_WORD_LEN = 255;
25   private final static int IO_BUFFER_SIZE = 1024;
26   private final char[] buffer = new char[MAX_WORD_LEN];
27   private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
28
29   public final Token next() throws java.io.IOException JavaDoc {
30     int length = 0;
31     int start = offset;
32
33     while (true) {
34       char c;
35
36       offset++;
37       if (bufferIndex >= dataLen) {
38         dataLen = input.read(ioBuffer);
39         bufferIndex = 0;
40       }
41       ;
42       if (dataLen == -1) {
43         if (length > 0) {
44           break;
45         } else {
46           return null;
47         }
48       } else
49         c = (char) ioBuffer[bufferIndex++];
50
51       if (Character.isLetterOrDigit(c)) {
52         if (length == 0) { // start of token
53
start = offset - 1;
54         }
55
56         buffer[length++] = Character.toLowerCase(c);
57         // buffer it
58
if (length == MAX_WORD_LEN) { // buffer overflow!
59
break;
60         }
61
62       } else if (length > 0){ // at non-Letter w/ chars
63
break; // return 'em
64
}
65
66     }
67
68     return new Token(new String JavaDoc(buffer, 0, length), start, start + length);
69   }
70 }
71
Popular Tags