1 package org.contineo.core.text.analyze.fr; 2 3 import org.contineo.core.text.analyze.Stemmer; 4 5 58 59 67 68 public class FrenchStemmer implements Stemmer { 69 70 73 private StringBuffer sb = new StringBuffer (); 74 75 78 private StringBuffer tb = new StringBuffer (); 79 80 83 private String R0; 84 85 91 private String RV; 92 93 98 private String R1; 99 100 105 private String R2; 106 107 108 111 private boolean suite; 112 113 116 private boolean modified; 117 118 119 125 public String stem( String term ) { 126 if ( !isStemmable( term ) ) { 127 return term; 128 } 129 130 term = term.toLowerCase(); 132 133 sb.delete( 0, sb.length() ); 135 sb.insert( 0, term ); 136 137 modified = false; 139 suite = false; 140 141 sb = treatVowels( sb ); 142 143 setStrings(); 144 145 step1(); 146 147 if (!modified || suite) 148 { 149 if (RV != null) 150 { 151 suite = step2a(); 152 if (!suite) 153 step2b(); 154 } 155 } 156 157 if (modified || suite) 158 step3(); 159 else 160 step4(); 161 162 step5(); 163 164 step6(); 165 166 return sb.toString(); 167 } 168 169 173 private void setStrings() { 174 R0 = sb.toString(); 176 RV = retrieveRV( sb ); 177 R1 = retrieveR( sb ); 178 if ( R1 != null ) 179 { 180 tb.delete( 0, tb.length() ); 181 tb.insert( 0, R1 ); 182 R2 = retrieveR( tb ); 183 } 184 else 185 R2 = null; 186 } 187 188 192 private void step1( ) { 193 String [] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" }; 194 deleteFrom( R2, suffix ); 195 196 replaceFrom( R2, new String [] { "logies", "logie" }, "log" ); 197 replaceFrom( R2, new String [] { "usions", "utions", "usion", "ution" }, "u" ); 198 replaceFrom( R2, new String [] { "ences", "ence" }, "ent" ); 199 200 String [] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"}; 201 deleteButSuffixFromElseReplace( R2, search, "ic", true, R0, "iqU" ); 202 203 deleteButSuffixFromElseReplace( R2, new String [] { "ements", "ement" }, "eus", false, R0, "eux" ); 204 deleteButSuffixFrom( R2, new String [] { "ements", "ement" }, "ativ", false ); 205 deleteButSuffixFrom( R2, new String [] { "ements", "ement" }, "iv", false ); 206 deleteButSuffixFrom( R2, new String [] { "ements", "ement" }, "abl", false ); 207 deleteButSuffixFrom( R2, new String [] { "ements", "ement" }, "iqU", false ); 208 209 deleteFromIfTestVowelBeforeIn( R1, new String [] { "issements", "issement" }, false, R0 ); 210 deleteFrom( RV, new String [] { "ements", "ement" } ); 211 212 deleteButSuffixFromElseReplace( R2, new String [] { "itës", "itë" }, "abil", false, R0, "abl" ); 213 deleteButSuffixFromElseReplace( R2, new String [] { "itës", "itë" }, "ic", false, R0, "iqU" ); 214 deleteButSuffixFrom( R2, new String [] { "itës", "itë" }, "iv", true ); 215 216 String [] autre = { "ifs", "ives", "if", "ive" }; 217 deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" ); 218 deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" ); 219 220 replaceFrom( R0, new String [] { "eaux" }, "eau" ); 221 222 replaceFrom( R1, new String [] { "aux" }, "al" ); 223 224 deleteButSuffixFromElseReplace( R2, new String [] { "euses", "euse" }, "", true, R1, "eux" ); 225 226 deleteFrom( R2, new String [] { "eux" } ); 227 228 boolean temp = false; 230 temp = replaceFrom( RV, new String [] { "amment" }, "ant" ); 231 if (temp == true) 232 suite = true; 233 temp = replaceFrom( RV, new String [] { "emment" }, "ent" ); 234 if (temp == true) 235 suite = true; 236 temp = deleteFromIfTestVowelBeforeIn( RV, new String [] { "ments", "ment" }, true, RV ); 237 if (temp == true) 238 suite = true; 239 240 } 241 242 250 private boolean step2a() { 251 String [] search = { "ïmes", "ïtes", "iraIent", "irait", "irais", "irai", "iras", "ira", 252 "irent", "iriez", "irez", "irions", "irons", "iront", 253 "issaIent", "issais", "issantes", "issante", "issants", "issant", 254 "issait", "issais", "issions", "issons", "issiez", "issez", "issent", 255 "isses", "isse", "ir", "is", "ït", "it", "ies", "ie", "i" }; 256 return deleteFromIfTestVowelBeforeIn( RV, search, false, RV ); 257 } 258 259 264 private void step2b() { 265 String [] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez", 266 "erons", "eront","erez", "êrent", "era", "ëes", "iez", 267 "ëe", "ës", "er", "ez", "ë" }; 268 deleteFrom( RV, suffix ); 269 270 String [] search = { "assions", "assiez", "assent", "asses", "asse", "aIent", 271 "antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant", 272 "ait", "aït", "ais", "Ait", "Aït", "Ais", "ât", "as", "ai", "Ai", "a" }; 273 deleteButSuffixFrom( RV, search, "e", true ); 274 275 deleteFrom( R2, new String [] { "ions" } ); 276 } 277 278 282 private void step3() { 283 if (sb.length()>0) 284 { 285 char ch = sb.charAt( sb.length()-1 ); 286 if (ch == 'Y') 287 { 288 sb.setCharAt( sb.length()-1, 'i' ); 289 setStrings(); 290 } 291 else if (ch == '§') 292 { 293 sb.setCharAt( sb.length()-1, 'c' ); 294 setStrings(); 295 } 296 } 297 } 298 299 303 private void step4() { 304 if (sb.length() > 1) 305 { 306 char ch = sb.charAt( sb.length()-1 ); 307 if (ch == 's') 308 { 309 char b = sb.charAt( sb.length()-2 ); 310 if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'ê' && b != 's') 311 { 312 sb.delete( sb.length() - 1, sb.length()); 313 setStrings(); 314 } 315 } 316 } 317 boolean found = deleteFromIfPrecededIn( R2, new String [] { "ion" }, RV, "s" ); 318 if (!found) 319 found = deleteFromIfPrecededIn( R2, new String [] { "ion" }, RV, "t" ); 320 321 replaceFrom( RV, new String [] { "Iêre", "iêre", "Ier", "ier" }, "i" ); 322 deleteFrom( RV, new String [] { "e" } ); 323 deleteFromIfPrecededIn( RV, new String [] { "ç" }, R0, "gu" ); 324 } 325 326 330 private void step5() { 331 if (R0 != null) 332 { 333 if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill")) 334 { 335 sb.delete( sb.length() - 1, sb.length() ); 336 setStrings(); 337 } 338 } 339 } 340 341 345 private void step6() { 346 if (R0!=null && R0.length()>0) 347 { 348 boolean seenVowel = false; 349 boolean seenConson = false; 350 int pos = -1; 351 for (int i = R0.length()-1; i > -1; i--) 352 { 353 char ch = R0.charAt(i); 354 if (isVowel(ch)) 355 { 356 if (!seenVowel) 357 { 358 if (ch == 'ë' || ch == 'ê') 359 { 360 pos = i; 361 break; 362 } 363 } 364 seenVowel = true; 365 } 366 else 367 { 368 if (seenVowel) 369 break; 370 else 371 seenConson = true; 372 } 373 } 374 if (pos > -1 && seenConson && !seenVowel) 375 sb.setCharAt(pos, 'e'); 376 } 377 } 378 379 388 private boolean deleteFromIfPrecededIn( String source, String [] search, String from, String prefix ) { 389 boolean found = false; 390 if (source!=null ) 391 { 392 for (int i = 0; i < search.length; i++) { 393 if ( source.endsWith( search[i] )) 394 { 395 if (from!=null && from.endsWith( prefix + search[i] )) 396 { 397 sb.delete( sb.length() - search[i].length(), sb.length()); 398 found = true; 399 setStrings(); 400 break; 401 } 402 } 403 } 404 } 405 return found; 406 } 407 408 417 private boolean deleteFromIfTestVowelBeforeIn( String source, String [] search, boolean vowel, String from ) { 418 boolean found = false; 419 if (source!=null && from!=null) 420 { 421 for (int i = 0; i < search.length; i++) { 422 if ( source.endsWith( search[i] )) 423 { 424 if ((search[i].length() + 1) <= from.length()) 425 { 426 boolean test = isVowel(sb.charAt(sb.length()-(search[i].length()+1))); 427 if (test == vowel) 428 { 429 sb.delete( sb.length() - search[i].length(), sb.length()); 430 modified = true; 431 found = true; 432 setStrings(); 433 break; 434 } 435 } 436 } 437 } 438 } 439 return found; 440 } 441 442 450 private void deleteButSuffixFrom( String source, String [] search, String prefix, boolean without ) { 451 if (source!=null) 452 { 453 for (int i = 0; i < search.length; i++) { 454 if ( source.endsWith( prefix + search[i] )) 455 { 456 sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() ); 457 modified = true; 458 setStrings(); 459 break; 460 } 461 else if ( without && source.endsWith( search[i] )) 462 { 463 sb.delete( sb.length() - search[i].length(), sb.length() ); 464 modified = true; 465 setStrings(); 466 break; 467 } 468 } 469 } 470 } 471 472 482 private void deleteButSuffixFromElseReplace( String source, String [] search, String prefix, boolean without, String from, String replace ) { 483 if (source!=null) 484 { 485 for (int i = 0; i < search.length; i++) { 486 if ( source.endsWith( prefix + search[i] )) 487 { 488 sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() ); 489 modified = true; 490 setStrings(); 491 break; 492 } 493 else if ( from!=null && from.endsWith( prefix + search[i] )) 494 { 495 sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace ); 496 modified = true; 497 setStrings(); 498 break; 499 } 500 else if ( without && source.endsWith( search[i] )) 501 { 502 sb.delete( sb.length() - search[i].length(), sb.length() ); 503 modified = true; 504 setStrings(); 505 break; 506 } 507 } 508 } 509 } 510 511 518 private boolean replaceFrom( String source, String [] search, String replace ) { 519 boolean found = false; 520 if (source!=null) 521 { 522 for (int i = 0; i < search.length; i++) { 523 if ( source.endsWith( search[i] )) 524 { 525 sb.replace( sb.length() - search[i].length(), sb.length(), replace ); 526 modified = true; 527 found = true; 528 setStrings(); 529 break; 530 } 531 } 532 } 533 return found; 534 } 535 536 542 private void deleteFrom(String source, String [] suffix ) { 543 if (source!=null) 544 { 545 for (int i = 0; i < suffix.length; i++) { 546 if (source.endsWith( suffix[i] )) 547 { 548 sb.delete( sb.length() - suffix[i].length(), sb.length()); 549 modified = true; 550 setStrings(); 551 break; 552 } 553 } 554 } 555 } 556 557 563 private boolean isVowel(char ch) { 564 switch (ch) 565 { 566 case 'a': 567 case 'e': 568 case 'i': 569 case 'o': 570 case 'u': 571 case 'y': 572 case 'â': 573 case 'à': 574 case 'ç': 575 case 'ë': 576 case 'é': 577 case 'ê': 578 case 'è': 579 case 'ï': 580 case 'î': 581 case 'ô': 582 case 'û': 583 case 'ù': 584 return true; 585 default: 586 return false; 587 } 588 } 589 590 597 private String retrieveR( StringBuffer buffer ) { 598 int len = buffer.length(); 599 int pos = -1; 600 for (int c = 0; c < len; c++) { 601 if (isVowel( buffer.charAt( c ))) 602 { 603 pos = c; 604 break; 605 } 606 } 607 if (pos > -1) 608 { 609 int consonne = -1; 610 for (int c = pos; c < len; c++) { 611 if (!isVowel(buffer.charAt( c ))) 612 { 613 consonne = c; 614 break; 615 } 616 } 617 if (consonne > -1 && (consonne+1) < len) 618 return buffer.substring( consonne+1, len ); 619 else 620 return null; 621 } 622 else 623 return null; 624 } 625 626 634 private String retrieveRV( StringBuffer buffer ) { 635 int len = buffer.length(); 636 if ( buffer.length() > 3) 637 { 638 if ( isVowel(buffer.charAt( 0 )) && isVowel(buffer.charAt( 1 ))) { 639 return buffer.substring(3,len); 640 } 641 else 642 { 643 int pos = 0; 644 for (int c = 1; c < len; c++) { 645 if (isVowel( buffer.charAt( c ))) 646 { 647 pos = c; 648 break; 649 } 650 } 651 if ( pos+1 < len ) 652 return buffer.substring( pos+1, len ); 653 else 654 return null; 655 } 656 } 657 else 658 return null; 659 } 660 661 662 663 671 private StringBuffer treatVowels( StringBuffer buffer ) { 672 for ( int c = 0; c < buffer.length(); c++ ) { 673 char ch = buffer.charAt( c ); 674 675 if (c == 0) { 677 if (buffer.length()>1) 678 { 679 if (ch == 'y' && isVowel(buffer.charAt( c + 1 ))) 680 buffer.setCharAt( c, 'Y' ); 681 } 682 } 683 else if (c == buffer.length()-1) { 685 if (ch == 'u' && buffer.charAt( c - 1 ) == 'q') 686 buffer.setCharAt( c, 'U' ); 687 if (ch == 'y' && isVowel(buffer.charAt( c - 1 ))) 688 buffer.setCharAt( c, 'Y' ); 689 } 690 else { 692 if (ch == 'u') 693 { 694 if (buffer.charAt( c - 1) == 'q') 695 buffer.setCharAt( c, 'U' ); 696 else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 ))) 697 buffer.setCharAt( c, 'U' ); 698 } 699 if (ch == 'i') 700 { 701 if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 ))) 702 buffer.setCharAt( c, 'I' ); 703 } 704 if (ch == 'y') 705 { 706 if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 ))) 707 buffer.setCharAt( c, 'Y' ); 708 } 709 } 710 } 711 712 return buffer; 713 } 714 715 720 private boolean isStemmable( String term ) { 721 boolean upper = false; 722 int first = -1; 723 for ( int c = 0; c < term.length(); c++ ) { 724 if ( !Character.isLetter( term.charAt( c ) ) ) { 726 return false; 727 } 728 if ( Character.isUpperCase( term.charAt( c ) ) ) { 730 if ( upper ) { 731 return false; 732 } 733 else { 736 first = c; 737 upper = true; 738 } 739 } 740 } 741 if ( first > 0 ) { 744 return false; 745 } 746 return true; 747 } 748 } | Popular Tags |