package org.apache.lucene.analysis.cn;

import java.util.Hashtable;
import org.apache.lucene.analysis.*;

/**
 * Token filter that drops tokens which are not useful as index terms.
 *
 * <p>A token read from the wrapped stream is passed through only when:
 * <ul>
 *   <li>its text is not one of the {@link #STOP_WORDS}, and</li>
 *   <li>its first character is either
 *     <ul>
 *       <li>an upper- or lower-case letter and the text is longer than one
 *           character, or</li>
 *       <li>a letter of Unicode category "other letter"
 *           ({@link Character#OTHER_LETTER}), in which case any length is
 *           accepted.</li>
 *     </ul>
 *   </li>
 * </ul>
 * Every other token (digits, punctuation, single cased letters, stop words,
 * empty text) is silently discarded.
 */
public final class ChineseFilter extends TokenFilter {

    /**
     * English stop words removed by this filter. Matching is exact and
     * case-sensitive, so only the lower-case forms listed here are dropped.
     */
    public static final String[] STOP_WORDS = {
        "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };

    /** Lookup table of stop words; each word is stored as both key and value. */
    private Hashtable stopTable;

    /**
     * Creates a filter over the given token stream.
     *
     * <p>The stop-word table is populated from the contents of
     * {@link #STOP_WORDS} at construction time.
     *
     * @param in the token stream to filter
     */
    public ChineseFilter(TokenStream in) {
        super(in);

        stopTable = new Hashtable(STOP_WORDS.length);
        for (int i = 0; i < STOP_WORDS.length; i++) {
            stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
        }
    }

    /**
     * Returns the next token from the wrapped stream that survives filtering.
     *
     * @return the next accepted token, or {@code null} once the wrapped
     *         stream is exhausted
     * @throws java.io.IOException if the wrapped stream fails while reading
     */
    public final Token next() throws java.io.IOException {

        for (Token token = input.next(); token != null; token = input.next()) {
            String text = token.termText();

            // A token without any text has no first character to classify.
            // Skip it instead of letting charAt(0) (or Hashtable.get(null))
            // throw and abort the whole stream.
            if (text == null || text.length() == 0) {
                continue;
            }

            // Stop words are never emitted.
            if (stopTable.get(text) != null) {
                continue;
            }

            // Classify the token by the Unicode category of its first character.
            switch (Character.getType(text.charAt(0))) {

                case Character.LOWERCASE_LETTER:
                case Character.UPPERCASE_LETTER:
                    // Cased-letter tokens: keep only those longer than one
                    // character; a single letter is dropped.
                    if (text.length() > 1) {
                        return token;
                    }
                    break;

                case Character.OTHER_LETTER:
                    // Uncased letters are accepted at any length, including
                    // a single character.
                    return token;

                default:
                    // Anything else (digits, punctuation, symbols, ...) is dropped.
                    break;
            }
        }

        // Wrapped stream is exhausted.
        return null;
    }

}