package org.apache.lucene.analysis.ru;

/**
 * Suffix-stripping stemmer for Russian words.
 *
 * <p>The stemmer never hard-codes concrete characters. Every letter is
 * referred to by an index constant ({@link #A}, {@link #V}, ...) and is
 * translated to a real character through the {@code charset} table supplied
 * by the caller, i.e. {@code charset[A]} must be the character that stands for
 * the letter "a" in the encoding of the words being stemmed. The table must
 * therefore cover at least indices 0..31.
 *
 * <p>NOTE(review): the sequence of steps (perfective gerund, reflexive,
 * adjectival / verb / noun, trailing "i", derivational, superlative, double
 * "n", soft sign) looks like the Snowball Russian algorithm - confirm before
 * relying on exact equivalence.
 *
 * <p>Instances are not thread-safe: {@link #stem(String)} stores the region
 * markers of the word being processed in instance fields.
 */
class RussianStemmer
{
    /** Character table used to turn letter indices into characters. */
    private char[] charset;

    /**
     * Region markers of the word currently being stemmed, as offsets into
     * that word; 0 means "not found". Written by {@link #markPositions}.
     * R1 is only assigned, it is not read anywhere in this class.
     */
    private int RV, R1, R2;

    // Letter indices into the charset table.
    private static final char A = 0;
    private static final char V = 2;
    private static final char G = 3;
    private static final char E = 5;
    private static final char I = 8;
    private static final char I_ = 9;
    private static final char L = 11;
    private static final char M = 12;
    private static final char N = 13;
    private static final char O = 14;
    private static final char S = 17;
    private static final char T = 18;
    private static final char U = 19;
    private static final char X = 21;
    private static final char SH = 24;
    private static final char SHCH = 25;
    private static final char Y = 27;
    private static final char SOFT = 28;
    private static final char AE = 29;
    private static final char IU = 30;
    private static final char IA = 31;

    /** Letter indices that count as vowels. */
    private static final char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };

    // Ending tables. findEnding() scans each table from its LAST entry to its
    // first and stops at the first hit, so longer endings are listed after
    // the shorter endings they contain.

    private static final char[][] perfectiveGerundEndings1 = {
        { V },
        { V, SH, I },
        { V, SH, I, S, SOFT }
    };

    private static final char[][] perfectiveGerund1Predecessors = {
        { A },
        { IA }
    };

    private static final char[][] perfectiveGerundEndings2 = {
        { I, V },
        { Y, V },
        { I, V, SH, I },
        { Y, V, SH, I },
        { I, V, SH, I, S, SOFT },
        { Y, V, SH, I, S, SOFT }
    };

    private static final char[][] adjectiveEndings = {
        { E, E },
        { I, E },
        { Y, E },
        { O, E },
        { E, I_ },
        { I, I_ },
        { Y, I_ },
        { O, I_ },
        { E, M },
        { I, M },
        { Y, M },
        { O, M },
        { I, X },
        { Y, X },
        { U, IU },
        { IU, IU },
        { A, IA },
        { IA, IA },
        { O, IU },
        { E, IU },
        { I, M, I },
        { Y, M, I },
        { E, G, O },
        { O, G, O },
        { E, M, U },
        { O, M, U }
    };

    private static final char[][] participleEndings1 = {
        { SHCH },
        { E, M },
        { N, N },
        { V, SH },
        { IU, SHCH }
    };

    private static final char[][] participleEndings2 = {
        { I, V, SH },
        { Y, V, SH },
        { U, IU, SHCH }
    };

    private static final char[][] participle1Predecessors = {
        { A },
        { IA }
    };

    private static final char[][] reflexiveEndings = {
        { S, IA },
        { S, SOFT }
    };

    private static final char[][] verbEndings1 = {
        { I_ },
        { L },
        { N },
        { L, O },
        { N, O },
        { E, T },
        { IU, T },
        { L, A },
        { N, A },
        { L, I },
        { E, M },
        { N, Y },
        { E, T, E },
        { I_, T, E },
        { T, SOFT },
        { E, SH, SOFT },
        { N, N, O }
    };

    private static final char[][] verbEndings2 = {
        { IU },
        { U, IU },
        { E, N },
        { E, I_ },
        { IA, T },
        { U, I_ },
        { I, L },
        { Y, L },
        { I, M },
        { Y, M },
        { I, T },
        { Y, T },
        { I, L, A },
        { Y, L, A },
        { E, N, A },
        { I, T, E },
        { I, L, I },
        { Y, L, I },
        { I, L, O },
        { Y, L, O },
        { E, N, O },
        { U, E, T },
        { U, IU, T },
        { E, N, Y },
        { I, T, SOFT },
        { Y, T, SOFT },
        { I, SH, SOFT },
        { E, I_, T, E },
        { U, I_, T, E }
    };

    private static final char[][] verb1Predecessors = {
        { A },
        { IA }
    };

    private static final char[][] nounEndings = {
        { A },
        { U },
        { I_ },
        { O },
        { E },
        { Y },
        { I },
        { SOFT },
        { IA },
        { E, V },
        { O, V },
        { I, E },
        { SOFT, E },
        { IA, X },
        { I, IU },
        { E, I },
        { I, I },
        { E, I_ },
        { O, I_ },
        { E, M },
        { A, M },
        { O, M },
        { A, X },
        { SOFT, IU },
        { I, IA },
        { SOFT, IA },
        { I, I_ },
        { IA, M },
        { IA, M, I },
        { A, M, I },
        { I, E, I_ },
        { I, IA, M },
        { I, E, M },
        { I, IA, X },
        { I, IA, M, I }
    };

    private static final char[][] superlativeEndings = {
        { E, I_, SH },
        { E, I_, SH, E }
    };

    private static final char[][] derivationalEndings = {
        { O, S, T },
        { O, S, T, SOFT }
    };

    /** The double "n" that {@link #undoubleN} shortens to a single "n". */
    private static final char[][] doubleNEndings = {
        { N, N }
    };

    /**
     * Creates a stemmer without a charset. {@link #setCharset(char[])} must be
     * called before stemming any non-empty word.
     */
    public RussianStemmer()
    {
    }

    /**
     * Creates a stemmer that uses the given charset table.
     *
     * @param charset table mapping letter indices to characters; the array is
     *                stored as is, not copied
     */
    public RussianStemmer(char[] charset)
    {
        this.charset = charset;
    }

    /**
     * Removes an adjective ending and, if one was removed, additionally tries
     * to remove a participle ending that preceded it.
     *
     * @param stemmingZone buffer to shorten in place
     * @return {@code true} if an adjective ending was removed, regardless of
     *         whether a participle ending was removed as well
     */
    private boolean adjectival(StringBuffer stemmingZone)
    {
        if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
        {
            return false;
        }
        // The participle part is optional; its outcome does not affect the result.
        if (!findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predecessors))
        {
            findAndRemoveEnding(stemmingZone, participleEndings2);
        }
        return true;
    }

    /**
     * Removes a derivational ending if the R2 check below allows it.
     *
     * @param stemmingZone buffer to shorten in place
     * @return {@code true} if an ending was removed
     */
    private boolean derivational(StringBuffer stemmingZone)
    {
        int endingLength = findEnding(stemmingZone, derivationalEndings);
        if (endingLength == 0)
        {
            return false;
        }
        int lengthWithoutEnding = stemmingZone.length() - endingLength;
        // R2 is an offset into the whole word, the zone starts at RV, so
        // R2 - RV is the R2 offset inside the zone; the ending must start at
        // or after it.
        // NOTE(review): when markPositions() found no R2 it stays 0, which
        // makes R2 - RV negative and this check always pass - confirm that
        // this is intended.
        if (R2 - RV > lengthWithoutEnding)
        {
            return false;
        }
        stemmingZone.setLength(lengthWithoutEnding);
        return true;
    }

    /**
     * Looks for an ending of the given class whose last character sits at
     * {@code startIndex} of the zone.
     *
     * @param stemmingZone   text to inspect
     * @param startIndex     index of the last character to compare; may be -1,
     *                       in which case nothing can match
     * @param theEndingClass candidate endings as letter indices; scanned from
     *                       the last entry to the first
     * @return length of the first matching ending, or 0 if none matches
     */
    private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
    {
        for (int i = theEndingClass.length - 1; i >= 0; i--)
        {
            char[] theEnding = theEndingClass[i];
            if (endsWith(stemmingZone, startIndex, theEnding))
            {
                return theEnding.length;
            }
        }
        return 0;
    }

    /**
     * Tells whether the characters of the zone that end at {@code lastIndex}
     * spell the given ending.
     */
    private boolean endsWith(StringBuffer stemmingZone, int lastIndex, char[] theEnding)
    {
        // Not enough characters in front of lastIndex to hold the ending.
        if (lastIndex < theEnding.length - 1)
        {
            return false;
        }
        int zoneIndex = lastIndex;
        for (int j = theEnding.length - 1; j >= 0; j--)
        {
            if (stemmingZone.charAt(zoneIndex--) != charset[theEnding[j]])
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Looks for an ending of the given class at the very end of the zone.
     *
     * @return length of the first matching ending, or 0 if none matches
     */
    private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
    {
        return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
    }

    /**
     * Removes an ending of the given class from the end of the zone.
     *
     * @return {@code true} if an ending was found and removed
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
        {
            return false;
        }
        stemmingZone.setLength(stemmingZone.length() - endingLength);
        return true;
    }

    /**
     * Removes an ending of the given class, but only if the text directly in
     * front of it matches one of the predecessors. The predecessor itself is
     * kept. Only the first matching ending is considered: if its predecessor
     * check fails, no other ending of the class is tried.
     *
     * @return {@code true} if an ending was found and removed
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone,
        char[][] theEndingClass, char[][] thePredessors)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
        {
            return false;
        }
        int lengthWithoutEnding = stemmingZone.length() - endingLength;
        if (findEnding(stemmingZone, lengthWithoutEnding - 1, thePredessors) == 0)
        {
            return false;
        }
        stemmingZone.setLength(lengthWithoutEnding);
        return true;
    }

    /**
     * Computes RV, R1 and R2 for the given word. All three are reset to 0
     * first; a marker keeps the value 0 when the scan runs off the end of the
     * word before that marker is reached.
     *
     * @param word word to analyse
     */
    private void markPositions(String word)
    {
        RV = 0;
        R1 = 0;
        R2 = 0;
        int lastIndex = word.length() - 1;

        // Skip leading non-vowels; RV is the position after the next character.
        int i = skipRun(word, 0, false) + 1;
        if (lastIndex < i)
        {
            return;
        }
        RV = i;

        // Skip vowels from RV on; R1 is the position after the next character.
        i = skipRun(word, i, true) + 1;
        if (lastIndex < i)
        {
            return;
        }
        R1 = i;

        // Repeat both scans starting at R1 to obtain R2.
        i = skipRun(word, i, false) + 1;
        if (lastIndex < i)
        {
            return;
        }
        i = skipRun(word, i, true) + 1;
        if (lastIndex < i)
        {
            return;
        }
        R2 = i;
    }

    /**
     * Returns the first index at or after {@code from} whose character is not
     * of the requested kind (vowel / non-vowel), or {@code word.length()} if
     * the run reaches the end of the word.
     */
    private int skipRun(String word, int from, boolean vowelRun)
    {
        int i = from;
        while (word.length() > i && isVowel(word.charAt(i)) == vowelRun)
        {
            i++;
        }
        return i;
    }

    /**
     * Tells whether the character is one of the vowels of the current charset.
     *
     * @param letter character to test
     */
    private boolean isVowel(char letter)
    {
        for (int i = 0; i < vowels.length; i++)
        {
            if (letter == charset[vowels[i]])
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Removes a noun ending.
     *
     * @return {@code true} if an ending was removed
     */
    private boolean noun(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, nounEndings);
    }

    /**
     * Removes a perfective gerund ending: first the group that needs a
     * predecessor, then the group that does not.
     *
     * @return {@code true} if an ending was removed
     */
    private boolean perfectiveGerund(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(
                stemmingZone,
                perfectiveGerundEndings1,
                perfectiveGerund1Predecessors)
            || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
    }

    /**
     * Removes a reflexive ending.
     *
     * @return {@code true} if an ending was removed
     */
    private boolean reflexive(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, reflexiveEndings);
    }

    /**
     * Removes a single trailing letter {@link #I}.
     *
     * @return {@code true} if the letter was removed
     */
    private boolean removeI(StringBuffer stemmingZone)
    {
        return removeTrailingLetter(stemmingZone, I);
    }

    /**
     * Removes a single trailing soft sign ({@link #SOFT}).
     *
     * @return {@code true} if the letter was removed
     */
    private boolean removeSoft(StringBuffer stemmingZone)
    {
        return removeTrailingLetter(stemmingZone, SOFT);
    }

    /**
     * Drops the last character of the zone if it is the given letter.
     *
     * @param letterIndex index of the letter in the charset table
     * @return {@code true} if the character was removed
     */
    private boolean removeTrailingLetter(StringBuffer stemmingZone, char letterIndex)
    {
        int lastIndex = stemmingZone.length() - 1;
        if (lastIndex < 0 || stemmingZone.charAt(lastIndex) != charset[letterIndex])
        {
            return false;
        }
        stemmingZone.setLength(lastIndex);
        return true;
    }

    /**
     * Replaces the charset table used by this stemmer.
     *
     * @param newCharset table mapping letter indices to characters; the array
     *                   is stored as is, not copied
     */
    public void setCharset(char[] newCharset)
    {
        charset = newCharset;
    }

    /**
     * Stems a single word.
     *
     * <p>The characters of the word are compared directly with the charset
     * entries, so the word must already be in the form (case, encoding) that
     * the charset table describes.
     *
     * @param input word to stem
     * @return the stemmed word, or {@code input} itself when no RV position
     *         could be found in it
     * @throws NullPointerException if {@code input} is {@code null}, or if it
     *         is non-empty and no charset has been set
     */
    public String stem(String input)
    {
        markPositions(input);
        if (RV == 0)
        {
            return input;
        }
        // Only the part of the word from RV onwards is ever shortened.
        StringBuffer stemmingZone = new StringBuffer(input.substring(RV));

        if (!perfectiveGerund(stemmingZone))
        {
            reflexive(stemmingZone);
            // At most one of the three is applied, in this order.
            if (!adjectival(stemmingZone) && !verb(stemmingZone))
            {
                noun(stemmingZone);
            }
        }
        removeI(stemmingZone);
        derivational(stemmingZone);
        superlative(stemmingZone);
        undoubleN(stemmingZone);
        removeSoft(stemmingZone);
        return input.substring(0, RV) + stemmingZone.toString();
    }

    /**
     * Removes a superlative ending.
     *
     * @return {@code true} if an ending was removed
     */
    private boolean superlative(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, superlativeEndings);
    }

    /**
     * Shortens a trailing double "n" to a single "n".
     *
     * @return {@code true} if the zone was shortened
     */
    private boolean undoubleN(StringBuffer stemmingZone)
    {
        if (findEnding(stemmingZone, doubleNEndings) == 0)
        {
            return false;
        }
        stemmingZone.setLength(stemmingZone.length() - 1);
        return true;
    }

    /**
     * Removes a verb ending: first the group that needs a predecessor, then
     * the group that does not.
     *
     * @return {@code true} if an ending was removed
     */
    private boolean verb(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(
                stemmingZone,
                verbEndings1,
                verb1Predecessors)
            || findAndRemoveEnding(stemmingZone, verbEndings2);
    }

    /**
     * Convenience method: stems one word with a throw-away stemmer instance.
     *
     * @param theWord word to stem
     * @param charset table mapping letter indices to characters
     * @return the stemmed word
     */
    public static String stem(String theWord, char[] charset)
    {
        return new RussianStemmer(charset).stem(theWord);
    }
}