1 package org.apache.lucene.analysis.ru; 2 3 18 19 import org.apache.lucene.analysis.Analyzer; 20 import org.apache.lucene.analysis.StopFilter; 21 import org.apache.lucene.analysis.TokenStream; 22 23 import java.io.Reader ; 24 import java.util.Hashtable ; 25 import java.util.Set ; 26 import java.util.HashSet ; 27 28 36 public final class RussianAnalyzer extends Analyzer 37 { 38 private final static char A = 0; 40 private final static char B = 1; 41 private final static char V = 2; 42 private final static char G = 3; 43 private final static char D = 4; 44 private final static char E = 5; 45 private final static char ZH = 6; 46 private final static char Z = 7; 47 private final static char I = 8; 48 private final static char I_ = 9; 49 private final static char K = 10; 50 private final static char L = 11; 51 private final static char M = 12; 52 private final static char N = 13; 53 private final static char O = 14; 54 private final static char P = 15; 55 private final static char R = 16; 56 private final static char S = 17; 57 private final static char T = 18; 58 private final static char U = 19; 59 private final static char X = 21; 61 private final static char CH = 23; 63 private final static char SH = 24; 64 private final static char SHCH = 25; 65 private final static char Y = 27; 67 private final static char SOFT = 28; 68 private final static char AE = 29; 69 private final static char IU = 30; 70 private final static char IA = 31; 71 72 75 private static char[][] RUSSIAN_STOP_WORDS = { 76 {A}, 77 {B, E, Z}, 78 {B, O, L, E, E}, 79 {B, Y}, 80 {B, Y, L}, 81 {B, Y, L, A}, 82 {B, Y, L, I}, 83 {B, Y, L, O}, 84 {B, Y, T, SOFT}, 85 {V}, 86 {V, A, M}, 87 {V, A, S}, 88 {V, E, S, SOFT}, 89 {V, O}, 90 {V, O, T}, 91 {V, S, E}, 92 {V, S, E, G, O}, 93 {V, S, E, X}, 94 {V, Y}, 95 {G, D, E}, 96 {D, A}, 97 {D, A, ZH, E}, 98 {D, L, IA}, 99 {D, O}, 100 {E, G, O}, 101 {E, E}, 102 {E, I_,}, 103 {E, IU}, 104 {E, S, L, I}, 105 {E, S, T, SOFT}, 106 {E, SHCH, E}, 107 {ZH, E}, 108 {Z, A}, 109 {Z, D, E, S, SOFT}, 110 {I}, 111 {I, Z}, 112 {I, L, I}, 113 {I, M}, 114 {I, X}, 115 {K}, 116 {K, A, K}, 117 {K, O}, 118 {K, O, G, D, A}, 119 {K, T, O}, 120 {L, I}, 121 {L, I, B, O}, 122 {M, N, E}, 123 {M, O, ZH, E, T}, 124 {M, Y}, 125 {N, A}, 126 {N, A, D, O}, 127 {N, A, SH}, 128 {N, E}, 129 {N, E, G, O}, 130 {N, E, E}, 131 {N, E, T}, 132 {N, I}, 133 {N, I, X}, 134 {N, O}, 135 {N, U}, 136 {O}, 137 {O, B}, 138 {O, D, N, A, K, O}, 139 {O, N}, 140 {O, N, A}, 141 {O, N, I}, 142 {O, N, O}, 143 {O, T}, 144 {O, CH, E, N, SOFT}, 145 {P, O}, 146 {P, O, D}, 147 {P, R, I}, 148 {S}, 149 {S, O}, 150 {T, A, K}, 151 {T, A, K, ZH, E}, 152 {T, A, K, O, I_}, 153 {T, A, M}, 154 {T, E}, 155 {T, E, M}, 156 {T, O}, 157 {T, O, G, O}, 158 {T, O, ZH, E}, 159 {T, O, I_}, 160 {T, O, L, SOFT, K, O}, 161 {T, O, M}, 162 {T, Y}, 163 {U}, 164 {U, ZH, E}, 165 {X, O, T, IA}, 166 {CH, E, G, O}, 167 {CH, E, I_}, 168 {CH, E, M}, 169 {CH, T, O}, 170 {CH, T, O, B, Y}, 171 {CH, SOFT, E}, 172 {CH, SOFT, IA}, 173 {AE, T, A}, 174 {AE, T, I}, 175 {AE, T, O}, 176 {IA} 177 }; 178 179 182 private Set stopSet = new HashSet (); 183 184 189 private char[] charset; 190 191 192 public RussianAnalyzer() { 193 charset = RussianCharsets.UnicodeRussian; 194 stopSet = StopFilter.makeStopSet( 195 makeStopWords(RussianCharsets.UnicodeRussian)); 196 } 197 198 201 public RussianAnalyzer(char[] charset) 202 { 203 this.charset = charset; 204 stopSet = StopFilter.makeStopSet(makeStopWords(charset)); 205 } 206 207 210 public RussianAnalyzer(char[] charset, String [] stopwords) 211 { 212 this.charset = charset; 213 stopSet = StopFilter.makeStopSet(stopwords); 214 } 215 216 private static String [] makeStopWords(char[] charset) 219 { 220 String [] res = new String [RUSSIAN_STOP_WORDS.length]; 221 for (int i = 0; i < res.length; i++) 222 { 223 char[] theStopWord = RUSSIAN_STOP_WORDS[i]; 224 StringBuffer theWord = new StringBuffer (); 226 for (int j = 0; j < theStopWord.length; j++) 227 { 228 theWord.append(charset[theStopWord[j]]); 229 } 230 res[i] = theWord.toString(); 231 } 232 return res; 233 } 234 235 239 public RussianAnalyzer(char[] charset, Hashtable stopwords) 240 { 241 this.charset = charset; 242 stopSet = new HashSet (stopwords.keySet()); 243 } 244 245 251 public TokenStream tokenStream(String fieldName, Reader reader) 252 { 253 TokenStream result = new RussianLetterTokenizer(reader, charset); 254 result = new RussianLowerCaseFilter(result, charset); 255 result = new StopFilter(result, stopSet); 256 result = new RussianStemFilter(result, charset); 257 return result; 258 } 259 } 260 | Popular Tags |