1 package org.contineo.core.text.analyze.en; 2 3 18 19 29 30 import org.contineo.core.text.analyze.Stemmer; 31 32 40 41 class EnglishStemmer implements Stemmer { 42 private char[] b; 43 44 private int i; 45 private int j; 46 private int k; 47 private int k0; 48 49 private boolean dirty = false; 50 51 private static final int INC = 50; 52 53 private static final int EXTRA = 1; 54 55 public EnglishStemmer() { 56 b = new char[INC]; 57 i = 0; 58 } 59 60 65 public void reset() { 66 i = 0; 67 dirty = false; 68 } 69 70 74 public void add(char ch) { 75 if (b.length <= i + EXTRA) { 76 char[] new_b = new char[b.length + INC]; 77 for (int c = 0; c < b.length; c++) 78 new_b[c] = b[c]; 79 b = new_b; 80 } 81 b[i++] = ch; 82 } 83 84 89 public String toString() { 90 return new String (b, 0, i); 91 } 92 93 96 public int getResultLength() { 97 return i; 98 } 99 100 105 public char[] getResultBuffer() { 106 return b; 107 } 108 109 110 111 private final boolean cons(int i) { 112 switch (b[i]) { 113 case 'a': 114 case 'e': 115 case 'i': 116 case 'o': 117 case 'u': 118 return false; 119 case 'y': 120 return (i == k0) ? true : !cons(i - 1); 121 default: 122 return true; 123 } 124 } 125 126 134 135 private final int m() { 136 int n = 0; 137 int i = k0; 138 while (true) { 139 if (i > j) 140 return n; 141 if (!cons(i)) 142 break; 143 i++; 144 } 145 i++; 146 while (true) { 147 while (true) { 148 if (i > j) 149 return n; 150 if (cons(i)) 151 break; 152 i++; 153 } 154 i++; 155 n++; 156 while (true) { 157 if (i > j) 158 return n; 159 if (!cons(i)) 160 break; 161 i++; 162 } 163 i++; 164 } 165 } 166 167 168 169 private final boolean vowelinstem() { 170 int i; 171 for (i = k0; i <= j; i++) 172 if (!cons(i)) 173 return true; 174 return false; 175 } 176 177 178 179 private final boolean doublec(int j) { 180 if (j < k0 + 1) 181 return false; 182 if (b[j] != b[j - 1]) 183 return false; 184 return cons(j); 185 } 186 187 195 196 private final boolean cvc(int i) { 197 if (i < k0 + 2 || !cons(i) || cons(i - 1) || !cons(i - 2)) 198 return false; 199 else { 200 int ch = b[i]; 201 if (ch == 'w' || ch == 'x' || ch == 'y') 202 return false; 203 } 204 return true; 205 } 206 207 private final boolean ends(String s) { 208 int l = s.length(); 209 int o = k - l + 1; 210 if (o < k0) 211 return false; 212 for (int i = 0; i < l; i++) 213 if (b[o + i] != s.charAt(i)) 214 return false; 215 j = k - l; 216 return true; 217 } 218 219 223 224 void setto(String s) { 225 int l = s.length(); 226 int o = j + 1; 227 for (int i = 0; i < l; i++) 228 b[o + i] = s.charAt(i); 229 k = j + l; 230 dirty = true; 231 } 232 233 234 235 void r(String s) { 236 if (m() > 0) 237 setto(s); 238 } 239 240 253 254 private final void step1() { 255 if (b[k] == 's') { 256 if (ends("sses")) 257 k -= 2; 258 else if (ends("ies")) 259 setto("i"); 260 else if (b[k - 1] != 's') 261 k--; 262 } 263 if (ends("eed")) { 264 if (m() > 0) 265 k--; 266 } else if ((ends("ed") || ends("ing")) && vowelinstem()) { 267 k = j; 268 if (ends("at")) 269 setto("ate"); 270 else if (ends("bl")) 271 setto("ble"); 272 else if (ends("iz")) 273 setto("ize"); 274 else if (doublec(k)) { 275 int ch = b[k--]; 276 if (ch == 'l' || ch == 's' || ch == 'z') 277 k++; 278 } else if (m() == 1 && cvc(k)) 279 setto("e"); 280 } 281 } 282 283 284 285 private final void step2() { 286 if (ends("y") && vowelinstem()) { 287 b[k] = 'i'; 288 dirty = true; 289 } 290 } 291 292 297 298 private final void step3() { 299 if (k == k0) 300 return; 301 switch (b[k - 1]) { 302 case 'a': 303 if (ends("ational")) { 304 r("ate"); 305 break; 306 } 307 if (ends("tional")) { 308 r("tion"); 309 break; 310 } 311 break; 312 case 'c': 313 if (ends("enci")) { 314 r("ence"); 315 break; 316 } 317 if (ends("anci")) { 318 r("ance"); 319 break; 320 } 321 break; 322 case 'e': 323 if (ends("izer")) { 324 r("ize"); 325 break; 326 } 327 break; 328 case 'l': 329 if (ends("bli")) { 330 r("ble"); 331 break; 332 } 333 if (ends("alli")) { 334 r("al"); 335 break; 336 } 337 if (ends("entli")) { 338 r("ent"); 339 break; 340 } 341 if (ends("eli")) { 342 r("e"); 343 break; 344 } 345 if (ends("ousli")) { 346 r("ous"); 347 break; 348 } 349 break; 350 case 'o': 351 if (ends("ization")) { 352 r("ize"); 353 break; 354 } 355 if (ends("ation")) { 356 r("ate"); 357 break; 358 } 359 if (ends("ator")) { 360 r("ate"); 361 break; 362 } 363 break; 364 case 's': 365 if (ends("alism")) { 366 r("al"); 367 break; 368 } 369 if (ends("iveness")) { 370 r("ive"); 371 break; 372 } 373 if (ends("fulness")) { 374 r("ful"); 375 break; 376 } 377 if (ends("ousness")) { 378 r("ous"); 379 break; 380 } 381 break; 382 case 't': 383 if (ends("aliti")) { 384 r("al"); 385 break; 386 } 387 if (ends("iviti")) { 388 r("ive"); 389 break; 390 } 391 if (ends("biliti")) { 392 r("ble"); 393 break; 394 } 395 break; 396 case 'g': 397 if (ends("logi")) { 398 r("log"); 399 break; 400 } 401 } 402 } 403 404 405 406 private final void step4() { 407 switch (b[k]) { 408 case 'e': 409 if (ends("icate")) { 410 r("ic"); 411 break; 412 } 413 if (ends("ative")) { 414 r(""); 415 break; 416 } 417 if (ends("alize")) { 418 r("al"); 419 break; 420 } 421 break; 422 case 'i': 423 if (ends("iciti")) { 424 r("ic"); 425 break; 426 } 427 break; 428 case 'l': 429 if (ends("ical")) { 430 r("ic"); 431 break; 432 } 433 if (ends("ful")) { 434 r(""); 435 break; 436 } 437 break; 438 case 's': 439 if (ends("ness")) { 440 r(""); 441 break; 442 } 443 break; 444 } 445 } 446 447 448 449 private final void step5() { 450 if (k == k0) 451 return; 452 switch (b[k - 1]) { 453 case 'a': 454 if (ends("al")) 455 break; 456 return; 457 case 'c': 458 if (ends("ance")) 459 break; 460 if (ends("ence")) 461 break; 462 return; 463 case 'e': 464 if (ends("er")) 465 break; 466 return; 467 case 'i': 468 if (ends("ic")) 469 break; 470 return; 471 case 'l': 472 if (ends("able")) 473 break; 474 if (ends("ible")) 475 break; 476 return; 477 case 'n': 478 if (ends("ant")) 479 break; 480 if (ends("ement")) 481 break; 482 if (ends("ment")) 483 break; 484 485 if (ends("ent")) 486 break; 487 return; 488 case 'o': 489 if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) 490 break; 491 492 if (ends("ou")) 493 break; 494 return; 495 496 case 's': 497 if (ends("ism")) 498 break; 499 return; 500 case 't': 501 if (ends("ate")) 502 break; 503 if (ends("iti")) 504 break; 505 return; 506 case 'u': 507 if (ends("ous")) 508 break; 509 return; 510 case 'v': 511 if (ends("ive")) 512 break; 513 return; 514 case 'z': 515 if (ends("ize")) 516 break; 517 return; 518 default: 519 return; 520 } 521 if (m() > 1) 522 k = j; 523 } 524 525 526 527 private final void step6() { 528 j = k; 529 if (b[k] == 'e') { 530 int a = m(); 531 if (a > 1 || a == 1 && !cvc(k - 1)) 532 k--; 533 } 534 if (b[k] == 'l' && doublec(k) && m() > 1) 535 k--; 536 } 537 538 541 public String stem(String s) { 542 if (stem(s.toCharArray(), s.length())) 543 return toString(); 544 else 545 return s; 546 } 547 548 553 public boolean stem(char[] word) { 554 return stem(word, word.length); 555 } 556 557 563 public boolean stem(char[] wordBuffer, int offset, int wordLen) { 564 reset(); 565 if (b.length < wordLen) { 566 char[] new_b = new char[wordLen + EXTRA]; 567 b = new_b; 568 } 569 for (int j = 0; j < wordLen; j++) 570 b[j] = wordBuffer[offset + j]; 571 i = wordLen; 572 return stem(0); 573 } 574 575 581 public boolean stem(char[] word, int wordLen) { 582 return stem(word, 0, wordLen); 583 } 584 585 591 public boolean stem() { 592 return stem(0); 593 } 594 595 public boolean stem(int i0) { 596 k = i - 1; 597 k0 = i0; 598 if (k > k0 + 1) { 599 step1(); 600 step2(); 601 step3(); 602 step4(); 603 step5(); 604 step6(); 605 } 606 if (i != k + 1) 609 dirty = true; 610 i = k + 1; 611 return dirty; 612 } 613 } | Popular Tags |