1 package org.apache.lucene.analysis.cn; 2 3 18 19 20 import java.io.Reader ; 21 import org.apache.lucene.analysis.*; 22 23 24 51 52 public final class ChineseTokenizer extends Tokenizer { 53 54 55 public ChineseTokenizer(Reader in) { 56 input = in; 57 } 58 59 private int offset = 0, bufferIndex=0, dataLen=0; 60 private final static int MAX_WORD_LEN = 255; 61 private final static int IO_BUFFER_SIZE = 1024; 62 private final char[] buffer = new char[MAX_WORD_LEN]; 63 private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; 64 65 66 private int length; 67 private int start; 68 69 70 private final void push(char c) { 71 72 if (length == 0) start = offset-1; buffer[length++] = Character.toLowerCase(c); 75 } 76 77 private final Token flush() { 78 79 if (length>0) { 80 return new Token(new String (buffer, 0, length), start, start+length); 82 } 83 else 84 return null; 85 } 86 87 public final Token next() throws java.io.IOException { 88 89 length = 0; 90 start = offset; 91 92 93 while (true) { 94 95 final char c; 96 offset++; 97 98 if (bufferIndex >= dataLen) { 99 dataLen = input.read(ioBuffer); 100 bufferIndex = 0; 101 }; 102 103 if (dataLen == -1) return flush(); 104 else 105 c = ioBuffer[bufferIndex++]; 106 107 108 switch(Character.getType(c)) { 109 110 case Character.DECIMAL_DIGIT_NUMBER: 111 case Character.LOWERCASE_LETTER: 112 case Character.UPPERCASE_LETTER: 113 push(c); 114 if (length == MAX_WORD_LEN) return flush(); 115 break; 116 117 case Character.OTHER_LETTER: 118 if (length>0) { 119 bufferIndex--; 120 offset--; 121 return flush(); 122 } 123 push(c); 124 return flush(); 125 126 default: 127 if (length>0) return flush(); 128 break; 129 } 130 } 131 132 } 133 } 134 | Popular Tags |