package org.apache.lucene.analysis;

import java.io.*;

/**
 * Stemmer implementing the Porter suffix-stripping algorithm for English words.
 *
 * <p>A word is placed in an internal character buffer, either one character at
 * a time with {@link #add(char)} or in one go with one of the
 * {@code stem(...)} overloads that take a word. Stemming rewrites the buffer in
 * place; the result is read back with {@link #toString()} or with
 * {@link #getResultBuffer()} / {@link #getResultLength()}.
 *
 * <p>Instances keep mutable state between calls and contain no
 * synchronization.
 */
class PorterStemmer {

  /** Number of characters the buffer grows by when {@link #add(char)} runs out of room. */
  private static final int INC = 50;

  /** Spare capacity kept beyond the characters of the word. */
  private static final int EXTRA = 1;

  /** Working buffer holding the word being stemmed. */
  private char[] b;

  /** Number of valid characters in {@link #b}; after stemming, the length of the stem. */
  private int i;

  /** Index of the last character of the stem that precedes the suffix matched by {@link #ends}. */
  private int j;

  /** Index of the last character of the word as it currently stands. */
  private int k;

  /** Index of the first character of the word. */
  private int k0;

  /** Set once the buffer content has been modified by a stemming step. */
  private boolean dirty = false;

  /** Creates a stemmer with an empty buffer of {@code INC} characters. */
  public PorterStemmer() {
    b = new char[INC];
    i = 0;
  }

  /**
   * Empties the buffer and clears the "modified" flag so that a new word can
   * be fed in with {@link #add(char)}.
   */
  public void reset() {
    i = 0;
    dirty = false;
  }

  /**
   * Appends one character to the word being collected, growing the buffer by
   * {@code INC} characters when it is about to fill up.
   *
   * @param ch the character to append
   */
  public void add(char ch) {
    if (b.length <= i + EXTRA) {
      char[] grown = new char[b.length + INC];
      System.arraycopy(b, 0, grown, 0, b.length);
      b = grown;
    }
    b[i++] = ch;
  }

  /**
   * Returns the current buffer content (the stem, once a {@code stem} method
   * has run) as a new string.
   *
   * @return the first {@link #getResultLength()} characters of the buffer
   */
  public String toString() {
    return new String(b, 0, i);
  }

  /**
   * Returns the number of valid characters in the result buffer.
   *
   * @return the length of the current word or stem
   */
  public int getResultLength() {
    return i;
  }

  /**
   * Returns the internal buffer itself (not a copy). Only the first
   * {@link #getResultLength()} characters are meaningful.
   *
   * @return the internal character buffer
   */
  public char[] getResultBuffer() {
    return b;
  }

  /**
   * Tells whether {@code b[pos]} is a consonant. The letters a, e, i, o, u
   * are vowels; 'y' is a consonant at the start of the word or when the
   * preceding character is a vowel, and a vowel otherwise.
   */
  private boolean cons(int pos) {
    switch (b[pos]) {
      case 'a':
      case 'e':
      case 'i':
      case 'o':
      case 'u':
        return false;
      case 'y':
        return pos == k0 || !cons(pos - 1);
      default:
        return true;
    }
  }

  /**
   * Counts the vowel-run/consonant-run pairs in {@code b[k0..j]}. Writing c
   * for a consonant run and v for a vowel run, the region has the form
   * {@code [c](vc)^m[v]} and this method returns {@code m}.
   */
  private int m() {
    int n = 0;
    int pos = k0;

    // Skip the optional leading consonant run.
    while (true) {
      if (pos > j) {
        return n;
      }
      if (!cons(pos)) {
        break;
      }
      pos++;
    }
    pos++;

    while (true) {
      // Consume the rest of a vowel run.
      while (true) {
        if (pos > j) {
          return n;
        }
        if (cons(pos)) {
          break;
        }
        pos++;
      }
      pos++;
      n++; // a vowel run followed by a consonant: one more "vc" pair

      // Consume the rest of the consonant run.
      while (true) {
        if (pos > j) {
          return n;
        }
        if (!cons(pos)) {
          break;
        }
        pos++;
      }
      pos++;
    }
  }

  /** Tells whether {@code b[k0..j]} contains at least one vowel. */
  private boolean vowelinstem() {
    for (int pos = k0; pos <= j; pos++) {
      if (!cons(pos)) {
        return true;
      }
    }
    return false;
  }

  /** Tells whether {@code b[pos-1]} and {@code b[pos]} are the same consonant. */
  private boolean doublec(int pos) {
    if (pos < k0 + 1) {
      return false;
    }
    if (b[pos] != b[pos - 1]) {
      return false;
    }
    return cons(pos);
  }

  /**
   * Tells whether {@code b[pos-2..pos]} is consonant-vowel-consonant with the
   * final consonant not being 'w', 'x' or 'y'.
   */
  private boolean cvc(int pos) {
    if (pos < k0 + 2 || !cons(pos) || cons(pos - 1) || !cons(pos - 2)) {
      return false;
    }
    char last = b[pos];
    return last != 'w' && last != 'x' && last != 'y';
  }

  /**
   * Tells whether {@code b[k0..k]} ends with {@code s}. On success, and only
   * then, {@code j} is moved to the last character before that suffix.
   */
  private boolean ends(String s) {
    int len = s.length();
    int start = k - len + 1;
    if (start < k0) {
      return false;
    }
    for (int n = 0; n < len; n++) {
      if (b[start + n] != s.charAt(n)) {
        return false;
      }
    }
    j = k - len;
    return true;
  }

  /**
   * Overwrites the buffer from position {@code j+1} with {@code s}, moves
   * {@code k} to the end of the written text and marks the word as modified.
   */
  void setto(String s) {
    int len = s.length();
    s.getChars(0, len, b, j + 1);
    k = j + len;
    dirty = true;
  }

  /** Applies {@link #setto(String)} only when the stem {@code b[k0..j]} has measure {@code m() > 0}. */
  void r(String s) {
    if (m() > 0) {
      setto(s);
    }
  }

  /**
   * Removes plural endings and -ed / -ing, e.g. "caresses" becomes "caress",
   * "ponies" becomes "poni", "running" becomes "run".
   */
  private void step1() {
    if (b[k] == 's') {
      if (ends("sses")) {
        k -= 2;
      } else if (ends("ies")) {
        setto("i");
      } else if (b[k - 1] != 's') {
        k--;
      }
    }

    if (ends("eed")) {
      if (m() > 0) {
        k--;
      }
    } else if ((ends("ed") || ends("ing")) && vowelinstem()) {
      k = j;
      if (ends("at")) {
        setto("ate");
      } else if (ends("bl")) {
        setto("ble");
      } else if (ends("iz")) {
        setto("ize");
      } else if (doublec(k)) {
        // Undouble the final consonant unless it is l, s or z.
        char last = b[k];
        if (last != 'l' && last != 's' && last != 'z') {
          k--;
        }
      } else if (m() == 1 && cvc(k)) {
        setto("e");
      }
    }
  }

  /** Turns a terminal 'y' into 'i' when the preceding stem contains a vowel. */
  private void step2() {
    if (ends("y") && vowelinstem()) {
      b[k] = 'i';
      dirty = true;
    }
  }

  /**
   * Maps double suffixes to single ones (e.g. "-ization" to "-ize") when the
   * stem before the suffix has {@code m() > 0}. The switch on the penultimate
   * character only narrows down which suffixes need to be compared.
   */
  private void step3() {
    if (k == k0) {
      return;
    }
    switch (b[k - 1]) {
      case 'a':
        if (ends("ational")) {
          r("ate");
        } else if (ends("tional")) {
          r("tion");
        }
        break;
      case 'c':
        if (ends("enci")) {
          r("ence");
        } else if (ends("anci")) {
          r("ance");
        }
        break;
      case 'e':
        if (ends("izer")) {
          r("ize");
        }
        break;
      case 'l':
        if (ends("bli")) {
          r("ble");
        } else if (ends("alli")) {
          r("al");
        } else if (ends("entli")) {
          r("ent");
        } else if (ends("eli")) {
          r("e");
        } else if (ends("ousli")) {
          r("ous");
        }
        break;
      case 'o':
        if (ends("ization")) {
          r("ize");
        } else if (ends("ation")) {
          r("ate");
        } else if (ends("ator")) {
          r("ate");
        }
        break;
      case 's':
        if (ends("alism")) {
          r("al");
        } else if (ends("iveness")) {
          r("ive");
        } else if (ends("fulness")) {
          r("ful");
        } else if (ends("ousness")) {
          r("ous");
        }
        break;
      case 't':
        if (ends("aliti")) {
          r("al");
        } else if (ends("iviti")) {
          r("ive");
        } else if (ends("biliti")) {
          r("ble");
        }
        break;
      case 'g':
        if (ends("logi")) {
          r("log");
        }
        break;
      default:
        break;
    }
  }

  /** Handles -ic-, -full, -ness and similar endings, under the same {@code m() > 0} condition as {@link #step3()}. */
  private void step4() {
    switch (b[k]) {
      case 'e':
        if (ends("icate")) {
          r("ic");
        } else if (ends("ative")) {
          r("");
        } else if (ends("alize")) {
          r("al");
        }
        break;
      case 'i':
        if (ends("iciti")) {
          r("ic");
        }
        break;
      case 'l':
        if (ends("ical")) {
          r("ic");
        } else if (ends("ful")) {
          r("");
        }
        break;
      case 's':
        if (ends("ness")) {
          r("");
        }
        break;
      default:
        break;
    }
  }

  /**
   * Strips -ant, -ence and similar endings when the stem before the suffix has
   * {@code m() > 1}. "-ion" is only taken off after 's' or 't'.
   */
  private void step5() {
    if (k == k0) {
      return;
    }
    boolean matched;
    switch (b[k - 1]) {
      case 'a':
        matched = ends("al");
        break;
      case 'c':
        matched = ends("ance") || ends("ence");
        break;
      case 'e':
        matched = ends("er");
        break;
      case 'i':
        matched = ends("ic");
        break;
      case 'l':
        matched = ends("able") || ends("ible");
        break;
      case 'n':
        matched = ends("ant") || ends("ement") || ends("ment") || ends("ent");
        break;
      case 'o':
        // j >= k0 keeps the look-behind inside the word, like every other bounds check here.
        matched = (ends("ion") && j >= k0 && (b[j] == 's' || b[j] == 't')) || ends("ou");
        break;
      case 's':
        matched = ends("ism");
        break;
      case 't':
        matched = ends("ate") || ends("iti");
        break;
      case 'u':
        matched = ends("ous");
        break;
      case 'v':
        matched = ends("ive");
        break;
      case 'z':
        matched = ends("ize");
        break;
      default:
        matched = false;
        break;
    }
    if (matched && m() > 1) {
      k = j;
    }
  }

  /**
   * Removes a final 'e' when {@code m() > 1}, or when {@code m() == 1} and the
   * preceding letters are not consonant-vowel-consonant; then reduces a final
   * "ll" to "l" when {@code m() > 1}.
   */
  private void step6() {
    j = k;
    if (b[k] == 'e') {
      int measure = m();
      if (measure > 1 || (measure == 1 && !cvc(k - 1))) {
        k--;
      }
    }
    if (b[k] == 'l' && doublec(k) && m() > 1) {
      k--;
    }
  }

  /**
   * Stems a word given as a string.
   *
   * @param s the word to stem; this method does not change its case
   * @return the stem, or {@code s} itself when stemming left the word unchanged
   */
  public String stem(String s) {
    if (stem(s.toCharArray(), s.length())) {
      return toString();
    }
    return s;
  }

  /**
   * Stems the word held in the whole of {@code word}. The result is available
   * through {@link #toString()} or {@link #getResultBuffer()}.
   *
   * @param word the characters of the word
   * @return {@code true} if stemming changed the word
   */
  public boolean stem(char[] word) {
    return stem(word, word.length);
  }

  /**
   * Stems the word held in {@code wordBuffer[offset .. offset+wordLen-1]}.
   * Any previous buffer content is discarded; the input array is not modified.
   *
   * @param wordBuffer array containing the word
   * @param offset index of the first character of the word
   * @param wordLen number of characters in the word
   * @return {@code true} if stemming changed the word
   */
  public boolean stem(char[] wordBuffer, int offset, int wordLen) {
    reset();
    if (b.length < wordLen) {
      b = new char[wordLen + EXTRA];
    }
    System.arraycopy(wordBuffer, offset, b, 0, wordLen);
    i = wordLen;
    return stem(0);
  }

  /**
   * Stems the word held in the first {@code wordLen} characters of
   * {@code word}.
   *
   * @param word array containing the word
   * @param wordLen number of characters in the word
   * @return {@code true} if stemming changed the word
   */
  public boolean stem(char[] word, int wordLen) {
    return stem(word, 0, wordLen);
  }

  /**
   * Stems the word previously collected with {@link #add(char)}.
   *
   * @return {@code true} if stemming changed the word
   */
  public boolean stem() {
    return stem(0);
  }

  /**
   * Stems the part of the buffer that starts at index {@code i0} and runs to
   * the end of the collected characters. Words of one or two characters are
   * left untouched.
   *
   * @param i0 index of the first character of the word inside the buffer
   * @return {@code true} if a step modified the buffer or the length changed
   */
  public boolean stem(int i0) {
    k = i - 1;
    k0 = i0;
    if (k > k0 + 1) {
      step1();
      step2();
      step3();
      step4();
      step5();
      step6();
    }
    // A step may only have shortened the word without rewriting characters.
    if (i != k + 1) {
      dirty = true;
    }
    i = k + 1;
    return dirty;
  }

  /**
   * Command-line driver: for every file named in {@code args}, copies the file
   * to standard output with each run of letters lower-cased and replaced by
   * its stem. Every byte is treated as one character (its unsigned value), so
   * the text is effectively read as ISO-8859-1.
   *
   * @param args paths of the files to process
   */
  public static void main(String[] args) {
    PorterStemmer s = new PorterStemmer();

    for (int n = 0; n < args.length; n++) {
      try {
        InputStream in = new BufferedInputStream(new FileInputStream(args[n]));
        try {
          s.reset();
          while (true) {
            // read() yields 0..255, or -1 at end of file, so a byte >= 0x80
            // can never be mistaken for the end-of-file marker.
            int ch = in.read();
            if (ch >= 0 && Character.isLetter((char) ch)) {
              s.add(Character.toLowerCase((char) ch));
            } else {
              s.stem();
              System.out.print(s.toString());
              s.reset();
              if (ch < 0) {
                break;
              }
              System.out.print((char) ch);
            }
          }
        } finally {
          in.close();
        }
      } catch (IOException e) {
        System.out.println("error reading " + args[n]);
      }
    }
  }
}