package org.apache.lucene.analysis.cjk;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

import java.io.Reader;

/**
 * Tokenizer that splits text into two kinds of tokens:
 * <ul>
 *   <li><b>"single"</b> tokens: maximal runs of Basic Latin (or fullwidth
 *       Latin folded to Basic Latin) letters, digits, '_', '+' and '#',
 *       lower-cased, at most {@code MAX_WORD_LEN} characters long;</li>
 *   <li><b>"double"</b> tokens: overlapping two-character windows (bigrams)
 *       over runs of letters outside those blocks, e.g. the run
 *       {@code C1C2C3} yields {@code C1C2} and {@code C2C3}. A letter that
 *       has no neighbour in its run is emitted on its own.</li>
 * </ul>
 * Any other character acts as a delimiter.
 *
 * <p>Instances keep read state between calls and are therefore meant to be
 * consumed by a single caller, one {@link #next()} call at a time.
 */
public final class CJKTokenizer extends Tokenizer {

    /** Maximum number of characters stored for one token. */
    private static final int MAX_WORD_LEN = 255;

    /** Number of characters requested from the reader per read. */
    private static final int IO_BUFFER_SIZE = 256;

    /** Token type held before any character has been classified. */
    private static final String WORD_TYPE = "word";

    /** Token type of Latin-style runs. */
    private static final String SINGLE_TYPE = "single";

    /** Token type of non-Latin letter bigrams. */
    private static final String DOUBLE_TYPE = "double";

    /** First fullwidth code point that mirrors a Basic Latin one (U+FF01, fullwidth '!'). */
    private static final int FULLWIDTH_ASCII_FIRST = 0xFF01;

    /** Last fullwidth code point that mirrors a Basic Latin one (U+FF5E, fullwidth '~'). */
    private static final int FULLWIDTH_ASCII_LAST = 0xFF5E;

    /** Distance between a fullwidth character and its Basic Latin twin (0xFEE0). */
    private static final int FULLWIDTH_TO_ASCII_DELTA = 65248;

    /** Position in the input stream of the next character to be consumed. */
    private int offset = 0;

    /** Index into {@link #ioBuffer} of the next character to be consumed. */
    private int bufferIndex = 0;

    /** Number of valid characters in {@link #ioBuffer}; -1 once the reader is exhausted. */
    private int dataLen = 0;

    /** Characters of the token currently being assembled. */
    private final char[] buffer = new char[MAX_WORD_LEN];

    /** Raw characters most recently read from the input. */
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

    /** Type of the token currently being assembled. */
    private String tokenType = WORD_TYPE;

    /**
     * True when the character that will be re-read next was already emitted
     * as the second half of a bigram. Such a character must not be emitted
     * again on its own; it may only start the following bigram.
     */
    private boolean preIsTokened = false;

    /**
     * Creates a tokenizer that reads from the given reader.
     *
     * @param in source of the characters to tokenize
     */
    public CJKTokenizer(Reader in) {
        input = in;
    }

    /**
     * Returns the next token, or {@code null} when the input is exhausted.
     * An empty token is never returned.
     *
     * @return the next token with its start offset, end offset
     *         ({@code start + length}) and type ("single" or "double"),
     *         or {@code null} at end of input
     * @throws java.io.IOException if reading from the underlying reader fails
     */
    public final Token next() throws java.io.IOException {
        // Outer loop: one scan can end with nothing to emit (the only pending
        // character had already been emitted inside the previous bigram), in
        // which case we scan again instead of handing out an empty token.
        while (true) {
            int length = 0;
            int start = offset;

            // Inner loop: consume characters until one token is complete.
            while (true) {
                char c;
                Character.UnicodeBlock ub;

                offset++;

                if (bufferIndex >= dataLen) {
                    dataLen = input.read(ioBuffer);
                    bufferIndex = 0;
                }

                if (dataLen == -1) {
                    if (length == 0) {
                        return null;
                    }
                    if (preIsTokened) {
                        // The pending character was already emitted as the
                        // tail of the previous bigram; drop it.
                        length = 0;
                        preIsTokened = false;
                    }
                    break;
                }

                c = ioBuffer[bufferIndex++];
                ub = Character.UnicodeBlock.of(c);

                if (ub == Character.UnicodeBlock.BASIC_LATIN
                        || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
                    c = foldFullwidth(c);

                    if (Character.isLetterOrDigit(c) || c == '_' || c == '+' || c == '#') {
                        if (length == 0) {
                            // First character of a Latin-style run.
                            start = offset - 1;
                        } else if (DOUBLE_TYPE.equals(tokenType)) {
                            // A Latin character ends a non-Latin run: push it
                            // back so the next scan starts with it. tokenType
                            // is left untouched so that a pending lone
                            // character is still reported as "double".
                            offset--;
                            bufferIndex--;

                            if (preIsTokened) {
                                length = 0;
                                preIsTokened = false;
                            }
                            break;
                        }

                        buffer[length++] = Character.toLowerCase(c);
                        tokenType = SINGLE_TYPE;

                        if (length == MAX_WORD_LEN) {
                            // Token buffer is full; emit what we have.
                            break;
                        }
                    } else if (length > 0) {
                        // Delimiter reached.
                        if (preIsTokened) {
                            // Pending character was already emitted; keep scanning.
                            length = 0;
                            preIsTokened = false;
                        } else {
                            break;
                        }
                    }
                } else if (Character.isLetter(c)) {
                    if (length == 0) {
                        // First character of a non-Latin run.
                        start = offset - 1;
                        buffer[length++] = c;
                        tokenType = DOUBLE_TYPE;
                    } else if (SINGLE_TYPE.equals(tokenType)) {
                        // A non-Latin letter ends a Latin run: push it back
                        // and emit the Latin token collected so far.
                        offset--;
                        bufferIndex--;
                        break;
                    } else {
                        buffer[length++] = c;
                        tokenType = DOUBLE_TYPE;

                        if (length == 2) {
                            // Bigram complete. Re-read its second character on
                            // the next scan so it can start the next bigram.
                            offset--;
                            bufferIndex--;
                            preIsTokened = true;
                            break;
                        }
                    }
                } else if (length > 0) {
                    // Non-Latin delimiter reached.
                    if (preIsTokened) {
                        length = 0;
                        preIsTokened = false;
                    } else {
                        break;
                    }
                }
            }

            if (length > 0) {
                return new Token(new String(buffer, 0, length), start, start + length, tokenType);
            }
            if (dataLen == -1) {
                return null;
            }
            // Nothing to emit and input remains: scan again.
        }
    }

    /**
     * Maps a fullwidth character that mirrors a Basic Latin one
     * (U+FF01..U+FF5E) to that Basic Latin character. Every other character,
     * including halfwidth Katakana and Hangul, is returned unchanged because
     * subtracting the delta from it would produce an unrelated code point.
     */
    private static char foldFullwidth(char c) {
        if (c >= FULLWIDTH_ASCII_FIRST && c <= FULLWIDTH_ASCII_LAST) {
            return (char) (c - FULLWIDTH_TO_ASCII_DELTA);
        }
        return c;
    }
}