package org.apache.lucene.analysis;

import java.io.IOException;
import java.io.Reader;

/**
 * Base class for tokenizers that decide token membership one character at a
 * time.
 *
 * <p>Subclasses supply {@link #isTokenChar(char)} to say which characters
 * belong to a token and may override {@link #normalize(char)} to transform
 * each accepted character before it is stored. A token is a maximal run of
 * consecutive token characters, capped at {@code MAX_WORD_LEN} characters;
 * a longer run is returned as several consecutive tokens.
 */
public abstract class CharTokenizer extends Tokenizer {
  /** Maximum number of characters collected into a single token. */
  private static final int MAX_WORD_LEN = 255;
  /** Number of characters requested from the reader per refill. */
  private static final int IO_BUFFER_SIZE = 1024;

  // offset:      count of characters consumed from the reader so far
  // bufferIndex: position of the next unread character in ioBuffer
  // dataLen:     result of the last read into ioBuffer (-1 at end of stream)
  private int offset = 0, bufferIndex = 0, dataLen = 0;

  /** Accumulates the (normalized) characters of the token being built. */
  private final char[] buffer = new char[MAX_WORD_LEN];
  /** Read-ahead buffer filled from the underlying reader. */
  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

  /**
   * Creates a tokenizer over the given reader.
   *
   * @param input the character source, passed to the superclass constructor
   */
  public CharTokenizer(Reader input) {
    super(input);
  }

  /**
   * Tells whether a character belongs inside a token. Characters for which
   * this returns {@code false} end the token in progress (if any) and are
   * otherwise skipped.
   *
   * @param c the character just read
   * @return {@code true} if {@code c} should be added to the current token
   */
  protected abstract boolean isTokenChar(char c);

  /**
   * Called on each token character before it is stored in the token. The
   * default implementation returns the character unchanged.
   *
   * @param c a character accepted by {@link #isTokenChar(char)}
   * @return the character to store in the token
   */
  protected char normalize(char c) {
    return c;
  }

  /**
   * Returns the next token from the input.
   *
   * <p>The token is constructed from the collected text, the offset of its
   * first character, and that offset plus the token length.
   *
   * @return the next token, or {@code null} when the end of the stream is
   *     reached without any token character having been collected
   * @throws IOException if reading from the underlying reader fails
   */
  public final Token next() throws IOException {
    int length = 0;
    int start = offset;
    while (true) {
      // Refill the read-ahead buffer once it has been fully consumed.
      if (bufferIndex >= dataLen) {
        dataLen = input.read(ioBuffer);
        bufferIndex = 0;
      }

      if (dataLen == -1) {
        // End of stream: flush a pending token, otherwise signal exhaustion.
        if (length > 0) {
          break;
        }
        return null;
      }

      final char c = ioBuffer[bufferIndex++];
      // Advance only after a character was really consumed, so that hitting
      // end of stream never inflates the running offset.
      offset++;

      if (isTokenChar(c)) {
        if (length == 0) {
          // First character of a new token: remember where it began.
          start = offset - 1;
        }
        buffer[length++] = normalize(c);
        if (length == MAX_WORD_LEN) {
          // Buffer is full; emit what we have and continue on the next call.
          break;
        }
      } else if (length > 0) {
        // A non-token character terminates the token in progress.
        break;
      }
    }

    return new Token(new String(buffer, 0, length), start, start + length);
  }
}