KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > analysis > cn > ChineseFilter


1 package org.apache.lucene.analysis.cn;
2
3 /**
4  * Copyright 2004 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
19 import java.util.Hashtable JavaDoc;
20 import org.apache.lucene.analysis.*;
21
22 /**
23  * Title: ChineseFilter
24  * Description: Filter with a stop word table
25  * Rule: No digits are allowed.
26  * An English word/token must be longer than 1 character.
27  * One Chinese character as one Chinese word.
28  * TO DO:
29  * 1. Add Chinese stop words, such as ?
30  * 2. Dictionary based Chinese word extraction
31  * 3. Intelligent Chinese word extraction
32  *
33  * Copyright: Copyright (c) 2001
34  * Company:
35  * @author Yiyi Sun
36  * @version 1.0
37  *
38  */

39
40 public final class ChineseFilter extends TokenFilter {
41
42
43     // Only English now, Chinese to be added later.
44
public static final String JavaDoc[] STOP_WORDS = {
45     "and", "are", "as", "at", "be", "but", "by",
46     "for", "if", "in", "into", "is", "it",
47     "no", "not", "of", "on", "or", "such",
48     "that", "the", "their", "then", "there", "these",
49     "they", "this", "to", "was", "will", "with"
50     };
51
52
53     private Hashtable JavaDoc stopTable;
54
55     public ChineseFilter(TokenStream in) {
56         super(in);
57
58         stopTable = new Hashtable JavaDoc(STOP_WORDS.length);
59         for (int i = 0; i < STOP_WORDS.length; i++)
60             stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
61     }
62
63     public final Token next() throws java.io.IOException JavaDoc {
64
65         for (Token token = input.next(); token != null; token = input.next()) {
66             String JavaDoc text = token.termText();
67
68           // why not key off token type here assuming ChineseTokenizer comes first?
69
if (stopTable.get(text) == null) {
70                 switch (Character.getType(text.charAt(0))) {
71
72                 case Character.LOWERCASE_LETTER:
73                 case Character.UPPERCASE_LETTER:
74
75                     // English word/token should larger than 1 character.
76
if (text.length()>1) {
77                         return token;
78                     }
79                     break;
80                 case Character.OTHER_LETTER:
81
82                     // One Chinese character as one Chinese word.
83
// Chinese word extraction to be added later here.
84

85                     return token;
86                 }
87
88             }
89
90         }
91         return null;
92     }
93
94 }
Popular Tags