

package org.apache.lucene.analysis.cn;

/**
 * Copyright 2004-2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


import java.io.Reader;
import org.apache.lucene.analysis.*;

/**
 * Title: ChineseTokenizer
 * Description: Extract tokens from the Stream using Character.getType()
 * Rule: A Chinese character as a single token
 * Copyright: Copyright (c) 2001
 * Company:
 *
 * The difference between the ChineseTokenizer and the
 * CJKTokenizer (id=23545) is that they have different
 * token parsing logic.
 *
 * For example, if the Chinese text "C1C2C3C4" is to be
 * indexed, the tokens returned from the ChineseTokenizer
 * are C1, C2, C3, C4. The tokens returned from the
 * CJKTokenizer are C1C2, C2C3, C3C4.
 *
 * Therefore the index the CJKTokenizer creates is much
 * larger.
 *
 * The problem is that when searching for C1, C1C2, C1C3,
 * C4C2, C1C2C3 ... the ChineseTokenizer works, but the
 * CJKTokenizer will not work.
 *
 * @author Yiyi Sun
 * @version 1.0
 *
 */


public final class ChineseTokenizer extends Tokenizer {


    public ChineseTokenizer(Reader in) {
        input = in;
    }

    private int offset = 0, bufferIndex = 0, dataLen = 0;
    private final static int MAX_WORD_LEN = 255;
    private final static int IO_BUFFER_SIZE = 1024;
    private final char[] buffer = new char[MAX_WORD_LEN];     // holds the token being built
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; // raw characters read from the input


    private int length;   // length of the token being built
    private int start;    // start offset of the token being built

    private final void push(char c) {

        if (length == 0) start = offset - 1;          // start of token
        buffer[length++] = Character.toLowerCase(c);  // buffer it

    }

    private final Token flush() {

        if (length > 0) {
            //System.out.println(new String(buffer, 0, length));
            return new Token(new String(buffer, 0, length), start, start + length);
        }
        else
            return null;
    }

    public final Token next() throws java.io.IOException {

        length = 0;
        start = offset;


        while (true) {

            final char c;
            offset++;

            // Refill the I/O buffer once it has been consumed.
            if (bufferIndex >= dataLen) {
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }

            // End of input: emit whatever has been buffered, if anything.
            if (dataLen == -1) return flush();
            else
                c = ioBuffer[bufferIndex++];


            switch(Character.getType(c)) {

            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
                // Latin letters and digits accumulate into a single token.
                push(c);
                if (length == MAX_WORD_LEN) return flush();
                break;

            case Character.OTHER_LETTER:
                // A CJK character: first flush any pending letter/digit token,
                // stepping back so this character is re-read on the next call;
                if (length > 0) {
                    bufferIndex--;
                    offset--;
                    return flush();
                }
                // otherwise emit the character as a token of its own.
                push(c);
                return flush();

            default:
                // Punctuation, whitespace, etc. end the current token.
                if (length > 0) return flush();
                break;
            }
        }

    }
}
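
Below is a minimal usage sketch, not part of the original listing. It assumes the Lucene 1.x/2.x-era API that the source above targets, in which the tokenizer wraps a java.io.Reader and Token still exposes termText(), startOffset(), and endOffset(); the demo class name and the sample input string are made up for illustration.

import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.cn.ChineseTokenizer;

public class ChineseTokenizerDemo {

    public static void main(String[] args) throws Exception {
        // Hypothetical mixed CJK/Latin input: per the javadoc above, each CJK
        // character is emitted as its own token, while a run of Latin letters
        // and digits is emitted as one lowercased token.
        ChineseTokenizer tokenizer =
                new ChineseTokenizer(new StringReader("中文Lucene2005"));

        for (Token t = tokenizer.next(); t != null; t = tokenizer.next()) {
            System.out.println(t.termText()
                    + " [" + t.startOffset() + "," + t.endOffset() + ")");
        }
        // Expected output (token text followed by offsets):
        //   中 [0,1)
        //   文 [1,2)
        //   lucene2005 [2,12)
    }
}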