CJKTokenizer


1   package org.apache.lucene.analysis.cjk;
2   
3   /**
4    * Copyright 2004-2005 The Apache Software Foundation
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  import org.apache.lucene.analysis.Token;
20  import org.apache.lucene.analysis.Tokenizer;
21  
22  import java.io.Reader  ;
23  
24  
25  /**
26   * CJKTokenizer was modified from StopTokenizer which does a decent job for
27   * most European languages. It performs other token methods for double-byte
28   * Characters: the token will return at each two charactors with overlap match.<br>
29   * Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
30   * also need filter filter zero length token ""<br>
31   * for Digit: digit, '+', '#' will token as letter<br>
32   * for more info on Asia language(Chinese Japanese Korean) text segmentation:
33   * please search  <a
34   * HREF="http://www.google.com/search?q=word+chinese+segment">google</a>
35   *
36   * @author Che, Dong
37   */
38  public final class CJKTokenizer extends Tokenizer {
39      //~ Static fields/initializers ---------------------------------------------
40  
41      /** Max word length */
42      private static final int MAX_WORD_LEN = 255;
43  
44      /** buffer size: */
45      private static final int IO_BUFFER_SIZE = 256;
46  
47      //~ Instance fields --------------------------------------------------------
48  
49      /** word offset, used to imply which character(in ) is parsed */
50      private int offset = 0;
51  
52      /** the index used only for ioBuffer */
53      private int bufferIndex = 0;
54  
55      /** data length */
56      private int dataLen = 0;
57  
58      /**
59       * character buffer, store the characters which are used to compose <br>
60       * the returned Token
61       */
62      private final char[] buffer = new char[MAX_WORD_LEN];
63  
64      /**
65       * I/O buffer, used to store the content of the input(one of the <br>
66       * members of Tokenizer)
67       */
68      private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
69  
70      /** word type: single=>ASCII  double=>non-ASCII word=>default */
71      private String   tokenType = "word";
72  
73      /**
74       * tag: previous character is a cached double-byte character  "C1C2C3C4"
75       * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
76       * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
77       */
78      private boolean preIsTokened = false;
79  
80      //~ Constructors -----------------------------------------------------------
81  
82      /**
83       * Construct a token stream processing the given input.
84       *
85       * @param in I/O reader
86       */
87      public CJKTokenizer(Reader   in) {
88          input = in;
89      }
90  
91      //~ Methods ----------------------------------------------------------------
92  
93      /**
94       * Returns the next token in the stream, or null at EOS.
95       * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
96       * for detail.
97       *
98       * @return Token
99       *
100      * @throws java.io.IOException - throw IOException when read error <br>
101      *         hanppened in the InputStream
102      *
103      */
104     public final Token next() throws java.io.IOException   {
105         /** how many character(s) has been stored in buffer */
106         int length = 0;
107 
108         /** the position used to create Token */
109         int start = offset;
110 
111         while (true) {
112             /** current charactor */
113             char c;
114 
115             /** unicode block of current charactor for detail */
116             Character.UnicodeBlock   ub;
117 
118             offset++;
119 
120             if (bufferIndex >= dataLen) {
121                 dataLen = input.read(ioBuffer);
122                 bufferIndex = 0;
123             }
124 
125             if (dataLen == -1) {
126                 if (length > 0) {
127                     if (preIsTokened == true) {
128                         length = 0;
129                         preIsTokened = false;
130                     }
131 
132                     break;
133                 } else {
134                     return null;
135                 }
136             } else {
137                 //get current character
138                 c = ioBuffer[bufferIndex++];
139 
140                 //get the UnicodeBlock of the current character
141                 ub = Character.UnicodeBlock.of(c);
142             }
143 
144             //if the current character is ASCII or Extend ASCII
145             if ((ub == Character.UnicodeBlock.BASIC_LATIN)
146                     || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
147                ) {
148                 if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
149                     /** convert  HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
150                     int i = (int) c;
151                     i = i - 65248;
152                     c = (char) i;
153                 }
154 
155                 // if the current character is a letter or "_" "+" "#"
156                 if (Character.isLetterOrDigit(c)
157                         || ((c == '_') || (c == '+') || (c == '#'))
158                    ) {
159                     if (length == 0) {
160                         // "javaC1C2C3C4linux" <br>
161                         //      ^--: the current character begin to token the ASCII
162                         // letter
163                         start = offset - 1;
164                     } else if (tokenType == "double") {
165                         // "javaC1C2C3C4linux" <br>
166                         //              ^--: the previous non-ASCII
167                         // : the current character
168                         offset--;
169                         bufferIndex--;
170                         tokenType = "single";
171 
172                         if (preIsTokened == true) {
173                             // there is only one non-ASCII has been stored
174                             length = 0;
175                             preIsTokened = false;
176 
177                             break;
178                         } else {
179                             break;
180                         }
181                     }
182 
183                     // store the LowerCase(c) in the buffer
184                     buffer[length++] = Character.toLowerCase(c);
185                     tokenType = "single";
186 
187                     // break the procedure if buffer overflowed!
188                     if (length == MAX_WORD_LEN) {
189                         break;
190                     }
191                 } else if (length > 0) {
192                     if (preIsTokened == true) {
193                         length = 0;
194                         preIsTokened = false;
195                     } else {
196                         break;
197                     }
198                 }
199             } else {
200                 // non-ASCII letter, eg."C1C2C3C4"
201                 if (Character.isLetter(c)) {
202                     if (length == 0) {
203                         start = offset - 1;
204                         buffer[length++] = c;
205                         tokenType = "double";
206                     } else {
207                         if (tokenType == "single") {
208                             offset--;
209                             bufferIndex--;
210 
211                             //return the previous ASCII characters
212                             break;
213                         } else {
214                             buffer[length++] = c;
215                             tokenType = "double";
216 
217                             if (length == 2) {
218                                 offset--;
219                                 bufferIndex--;
220                                 preIsTokened = true;
221 
222                                 break;
223                             }
224                         }
225                     }
226                 } else if (length > 0) {
227                     if (preIsTokened == true) {
228                         // empty the buffer
229                         length = 0;
230                         preIsTokened = false;
231                     } else {
232                         break;
233                     }
234                 }
235             }
236         }
237 
238         return new Token(new String  (buffer, 0, length), start, start + length,
239                          tokenType
240                         );
241     }
242 }
243
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags