1 package org.snipsnap.snip; 2 3 import org.apache.lucene.analysis.Token; 4 import org.apache.lucene.analysis.Tokenizer; 5 6 import java.io.Reader ; 7 8 16 17 public final class SnipTokenizer extends Tokenizer { 18 19 public SnipTokenizer(String field, Reader in) { 20 input = in; 21 } 22 23 private int offset = 0, bufferIndex = 0, dataLen = 0; 24 private final static int MAX_WORD_LEN = 255; 25 private final static int IO_BUFFER_SIZE = 1024; 26 private final char[] buffer = new char[MAX_WORD_LEN]; 27 private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; 28 29 public final Token next() throws java.io.IOException { 30 int length = 0; 31 int start = offset; 32 33 while (true) { 34 char c; 35 36 offset++; 37 if (bufferIndex >= dataLen) { 38 dataLen = input.read(ioBuffer); 39 bufferIndex = 0; 40 } 41 ; 42 if (dataLen == -1) { 43 if (length > 0) { 44 break; 45 } else { 46 return null; 47 } 48 } else 49 c = (char) ioBuffer[bufferIndex++]; 50 51 if (Character.isLetterOrDigit(c)) { 52 if (length == 0) { start = offset - 1; 54 } 55 56 buffer[length++] = Character.toLowerCase(c); 57 if (length == MAX_WORD_LEN) { break; 60 } 61 62 } else if (length > 0){ break; } 65 66 } 67 68 return new Token(new String (buffer, 0, length), start, start + length); 69 } 70 } 71 | Popular Tags |