

package org.apache.lucene.analysis.cn;

/**
 * Copyright 2004-2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


import java.io.Reader;
import org.apache.lucene.analysis.*;

/**
 * Title: ChineseTokenizer
 * Description: Extract tokens from the Stream using Character.getType()
 * Rule: A Chinese character as a single token
 * Copyright: Copyright (c) 2001
 * Company:
 *
 * The difference between the ChineseTokenizer and the
 * CJKTokenizer (id=23545) is that they have different
 * token parsing logic.
 *
 * For example, if the Chinese text "C1C2C3C4" is to be
 * indexed, the tokens returned from the ChineseTokenizer
 * are C1, C2, C3, C4. The tokens returned from the
 * CJKTokenizer are C1C2, C2C3, C3C4.
 *
 * Therefore the index the CJKTokenizer creates is much
 * larger.
 *
 * The problem is that when searching for C1, C1C2, C1C3,
 * C4C2, C1C2C3 ... the ChineseTokenizer works, but the
 * CJKTokenizer will not work.
 *
 * @author Yiyi Sun
 * @version 1.0
 *
 */


public final class ChineseTokenizer extends Tokenizer {


    public ChineseTokenizer(Reader in) {
        input = in;
    }

    private int offset = 0, bufferIndex = 0, dataLen = 0;
    private final static int MAX_WORD_LEN = 255;
    private final static int IO_BUFFER_SIZE = 1024;
    private final char[] buffer = new char[MAX_WORD_LEN];     // holds the token being built
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; // raw characters read from the input


    private int length;   // length of the token being built
    private int start;    // start offset of the token being built

    private final void push(char c) {

        if (length == 0) start = offset - 1;          // start of token
        buffer[length++] = Character.toLowerCase(c);  // buffer it

    }

    private final Token flush() {

        if (length > 0) {
            //System.out.println(new String(buffer, 0, length));
            return new Token(new String(buffer, 0, length), start, start + length);
        }
        else
            return null;
    }

    public final Token next() throws java.io.IOException {

        length = 0;
        start = offset;


        while (true) {

            final char c;
            offset++;

            // Refill the I/O buffer once it has been consumed.
            if (bufferIndex >= dataLen) {
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }

            // End of input: emit whatever has been buffered, if anything.
            if (dataLen == -1) return flush();
            else
                c = ioBuffer[bufferIndex++];


            switch(Character.getType(c)) {

            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
                // Latin letters and digits accumulate into a single token.
                push(c);
                if (length == MAX_WORD_LEN) return flush();
                break;

            case Character.OTHER_LETTER:
                // A CJK character: first flush any pending letter/digit token,
                // stepping back so this character is re-read on the next call;
                if (length > 0) {
                    bufferIndex--;
                    offset--;
                    return flush();
                }
                // otherwise emit the character as a token of its own.
                push(c);
                return flush();

            default:
                // Punctuation, whitespace, etc. end the current token.
                if (length > 0) return flush();
                break;
            }
        }

    }
}
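
Below is a minimal usage sketch, not part of the original listing. It assumes the Lucene 1.x/2.x-era API that the source above targets, in which the tokenizer wraps a java.io.Reader and Token still exposes termText(), startOffset(), and endOffset(); the demo class name and the sample input string are made up for illustration.

import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.cn.ChineseTokenizer;

public class ChineseTokenizerDemo {

    public static void main(String[] args) throws Exception {
        // Hypothetical mixed CJK/Latin input: per the javadoc above, each CJK
        // character is emitted as its own token, while a run of Latin letters
        // and digits is emitted as one lowercased token.
        ChineseTokenizer tokenizer =
                new ChineseTokenizer(new StringReader("中文Lucene2005"));

        for (Token t = tokenizer.next(); t != null; t = tokenizer.next()) {
            System.out.println(t.termText()
                    + " [" + t.startOffset() + "," + t.endOffset() + ")");
        }
        // Expected output (token text followed by offsets):
        //   中 [0,1)
        //   文 [1,2)
        //   lucene2005 [2,12)
    }
}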