KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > analysis > ru > RussianLetterTokenizer


1 package org.apache.lucene.analysis.ru;
2
3 /**
4  * Copyright 2004 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
19 import java.io.Reader JavaDoc;
20 import org.apache.lucene.analysis.CharTokenizer;
21
22 /**
23  * A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally looking up letters
24  * in a given "russian charset". The problem with LetterTokenizer is that it uses the Character.isLetter() method,
25  * which doesn't know how to detect letters in encodings like CP1252 and KOI8
26  * (well-known problems with 0xD7 and 0xF7 chars)
27  *
28  * @author Boris Okner, b.okner@rogers.com
29  * @version $Id: RussianLetterTokenizer.java 150998 2004-08-16 20:30:46Z dnaber $
30  */

31
32 public class RussianLetterTokenizer extends CharTokenizer
33 {
34     /** Construct a new LetterTokenizer. */
35     private char[] charset;
36
37     public RussianLetterTokenizer(Reader JavaDoc in, char[] charset)
38     {
39         super(in);
40         this.charset = charset;
41     }
42
43     /**
44      * Collects only characters which satisfy
45      * {@link Character#isLetter(char)}.
46      */

47     protected boolean isTokenChar(char c)
48     {
49         if (Character.isLetter(c))
50             return true;
51         for (int i = 0; i < charset.length; i++)
52         {
53             if (c == charset[i])
54                 return true;
55         }
56         return false;
57     }
58 }
59
Popular Tags