KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > contineo > core > text > analyze > fr > FrenchStemmer


1 package org.contineo.core.text.analyze.fr;
2
3 import org.contineo.core.text.analyze.Stemmer;
4
5 /* ====================================================================
6  * The Apache Software License, Version 1.1
7  *
8  * Copyright (c) 2001 The Apache Software Foundation. All rights
9  * reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  *
15  * 1. Redistributions of source code must retain the above copyright
16  * notice, this list of conditions and the following disclaimer.
17  *
18  * 2. Redistributions in binary form must reproduce the above copyright
19  * notice, this list of conditions and the following disclaimer in
20  * the documentation and/or other materials provided with the
21  * distribution.
22  *
23  * 3. The end-user documentation included with the redistribution,
24  * if any, must include the following acknowledgment:
25  * "This product includes software developed by the
26  * Apache Software Foundation (http://www.apache.org/)."
27  * Alternately, this acknowledgment may appear in the software itself,
28  * if and wherever such third-party acknowledgments normally appear.
29  *
30  * 4. The names "Apache" and "Apache Software Foundation" and
31  * "Apache Lucene" must not be used to endorse or promote products
32  * derived from this software without prior written permission. For
33  * written permission, please contact apache@apache.org.
34  *
35  * 5. Products derived from this software may not be called "Apache",
36  * "Apache Lucene", nor may "Apache" appear in their name, without
37  * prior written permission of the Apache Software Foundation.
38  *
39  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
40  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
41  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
42  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
43  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
46  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
47  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
48  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
49  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  * ====================================================================
52  *
53  * This software consists of voluntary contributions made by many
54  * individuals on behalf of the Apache Software Foundation. For more
55  * information on the Apache Software Foundation, please see
56  * <http://www.apache.org/>.
57  */

58
59 /**
60  * A stemmer for French words. The algorithm is based on the work of
61  * Dr Martin Porter on his snowball project<br>
62  * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
63  * (French stemming algorithm) for details
64  *
65  * @author Patrick Talbot
66  */

67
68 public class FrenchStemmer implements Stemmer {
69
70     /**
71      * Buffer for the terms while stemming them.
72      */

73     private StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
74
75     /**
76      * A temporary buffer, used to reconstruct R2
77      */

78      private StringBuffer JavaDoc tb = new StringBuffer JavaDoc();
79
80     /**
81      * Region R0 is equal to the whole buffer
82      */

83     private String JavaDoc R0;
84
85     /**
86      * Region RV
87      * "If the word begins with two vowels, RV is the region after the third letter,
88      * otherwise the region after the first vowel not at the beginning of the word,
89      * or the end of the word if these positions cannot be found."
90      */

91     private String JavaDoc RV;
92
93     /**
94      * Region R1
95      * "R1 is the region after the first non-vowel following a vowel
96      * or is the null region at the end of the word if there is no such non-vowel"
97      */

98     private String JavaDoc R1;
99
100     /**
101      * Region R2
102      * "R2 is the region after the first non-vowel in R1 following a vowel
103      * or is the null region at the end of the word if there is no such non-vowel"
104      */

105     private String JavaDoc R2;
106
107
108     /**
109      * Set to true if we need to perform step 2
110      */

111     private boolean suite;
112
113     /**
114      * Set to true if the buffer was modified
115      */

116     private boolean modified;
117
118
119     /**
120      * Stemms the given term to a unique <tt>discriminator</tt>.
121      *
122      * @param term java.langString The term that should be stemmed
123      * @return java.lang.String Discriminator for <tt>term</tt>
124      */

125     public String JavaDoc stem( String JavaDoc term ) {
126         if ( !isStemmable( term ) ) {
127             return term;
128         }
129
130         // Use lowercase for medium stemming.
131
term = term.toLowerCase();
132
133         // Reset the StringBuffer.
134
sb.delete( 0, sb.length() );
135         sb.insert( 0, term );
136
137         // reset the booleans
138
modified = false;
139         suite = false;
140
141         sb = treatVowels( sb );
142
143         setStrings();
144
145         step1();
146
147         if (!modified || suite)
148         {
149             if (RV != null)
150             {
151                 suite = step2a();
152                 if (!suite)
153                     step2b();
154             }
155         }
156
157         if (modified || suite)
158             step3();
159         else
160             step4();
161
162         step5();
163
164         step6();
165
166         return sb.toString();
167     }
168
169     /**
170      * Sets the search region Strings<br>
171      * it needs to be done each time the buffer was modified
172      */

173     private void setStrings() {
174         // set the strings
175
R0 = sb.toString();
176         RV = retrieveRV( sb );
177         R1 = retrieveR( sb );
178         if ( R1 != null )
179         {
180             tb.delete( 0, tb.length() );
181             tb.insert( 0, R1 );
182             R2 = retrieveR( tb );
183         }
184         else
185             R2 = null;
186     }
187
188     /**
189      * First step of the Porter Algorithmn<br>
190      * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
191      */

192     private void step1( ) {
193         String JavaDoc[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
194         deleteFrom( R2, suffix );
195
196         replaceFrom( R2, new String JavaDoc[] { "logies", "logie" }, "log" );
197         replaceFrom( R2, new String JavaDoc[] { "usions", "utions", "usion", "ution" }, "u" );
198         replaceFrom( R2, new String JavaDoc[] { "ences", "ence" }, "ent" );
199
200         String JavaDoc[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"};
201         deleteButSuffixFromElseReplace( R2, search, "ic", true, R0, "iqU" );
202
203         deleteButSuffixFromElseReplace( R2, new String JavaDoc[] { "ements", "ement" }, "eus", false, R0, "eux" );
204         deleteButSuffixFrom( R2, new String JavaDoc[] { "ements", "ement" }, "ativ", false );
205         deleteButSuffixFrom( R2, new String JavaDoc[] { "ements", "ement" }, "iv", false );
206         deleteButSuffixFrom( R2, new String JavaDoc[] { "ements", "ement" }, "abl", false );
207         deleteButSuffixFrom( R2, new String JavaDoc[] { "ements", "ement" }, "iqU", false );
208
209         deleteFromIfTestVowelBeforeIn( R1, new String JavaDoc[] { "issements", "issement" }, false, R0 );
210         deleteFrom( RV, new String JavaDoc[] { "ements", "ement" } );
211
212         deleteButSuffixFromElseReplace( R2, new String JavaDoc[] { "itës", "itë" }, "abil", false, R0, "abl" );
213         deleteButSuffixFromElseReplace( R2, new String JavaDoc[] { "itës", "itë" }, "ic", false, R0, "iqU" );
214         deleteButSuffixFrom( R2, new String JavaDoc[] { "itës", "itë" }, "iv", true );
215
216         String JavaDoc[] autre = { "ifs", "ives", "if", "ive" };
217         deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" );
218         deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" );
219
220         replaceFrom( R0, new String JavaDoc[] { "eaux" }, "eau" );
221
222         replaceFrom( R1, new String JavaDoc[] { "aux" }, "al" );
223
224         deleteButSuffixFromElseReplace( R2, new String JavaDoc[] { "euses", "euse" }, "", true, R1, "eux" );
225
226         deleteFrom( R2, new String JavaDoc[] { "eux" } );
227
228         // if one of the next steps is performed, we will need to perform step2a
229
boolean temp = false;
230         temp = replaceFrom( RV, new String JavaDoc[] { "amment" }, "ant" );
231         if (temp == true)
232             suite = true;
233         temp = replaceFrom( RV, new String JavaDoc[] { "emment" }, "ent" );
234         if (temp == true)
235             suite = true;
236         temp = deleteFromIfTestVowelBeforeIn( RV, new String JavaDoc[] { "ments", "ment" }, true, RV );
237         if (temp == true)
238             suite = true;
239
240     }
241
242     /**
243      * Second step (A) of the Porter Algorithmn<br>
244      * Will be performed if nothing changed from the first step
245      * or changed were done in the amment, emment, ments or ment suffixes<br>
246      * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
247      *
248      * @return boolean - true if something changed in the StringBuffer
249      */

250     private boolean step2a() {
251         String JavaDoc[] search = { "ïmes", "ïtes", "iraIent", "irait", "irais", "irai", "iras", "ira",
252                             "irent", "iriez", "irez", "irions", "irons", "iront",
253                             "issaIent", "issais", "issantes", "issante", "issants", "issant",
254                             "issait", "issais", "issions", "issons", "issiez", "issez", "issent",
255                             "isses", "isse", "ir", "is", "ït", "it", "ies", "ie", "i" };
256         return deleteFromIfTestVowelBeforeIn( RV, search, false, RV );
257     }
258
259     /**
260      * Second step (B) of the Porter Algorithmn<br>
261      * Will be performed if step 2 A was performed unsuccessfully<br>
262      * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
263      */

264     private void step2b() {
265         String JavaDoc[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
266                             "erons", "eront","erez", "êrent", "era", "ëes", "iez",
267                             "ëe", "ës", "er", "ez", "ë" };
268         deleteFrom( RV, suffix );
269
270         String JavaDoc[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent",
271                             "antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant",
272                             "ait", "aït", "ais", "Ait", "Aït", "Ais", "ât", "as", "ai", "Ai", "a" };
273         deleteButSuffixFrom( RV, search, "e", true );
274
275         deleteFrom( R2, new String JavaDoc[] { "ions" } );
276     }
277
278     /**
279      * Third step of the Porter Algorithmn<br>
280      * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
281      */

282     private void step3() {
283         if (sb.length()>0)
284         {
285             char ch = sb.charAt( sb.length()-1 );
286             if (ch == 'Y')
287             {
288                 sb.setCharAt( sb.length()-1, 'i' );
289                 setStrings();
290             }
291             else if (ch == '§')
292             {
293                 sb.setCharAt( sb.length()-1, 'c' );
294                 setStrings();
295             }
296         }
297     }
298
299     /**
300      * Fourth step of the Porter Algorithmn<br>
301      * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
302      */

303     private void step4() {
304         if (sb.length() > 1)
305         {
306             char ch = sb.charAt( sb.length()-1 );
307             if (ch == 's')
308             {
309                 char b = sb.charAt( sb.length()-2 );
310                 if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'ê' && b != 's')
311                 {
312                     sb.delete( sb.length() - 1, sb.length());
313                     setStrings();
314                 }
315             }
316         }
317         boolean found = deleteFromIfPrecededIn( R2, new String JavaDoc[] { "ion" }, RV, "s" );
318         if (!found)
319         found = deleteFromIfPrecededIn( R2, new String JavaDoc[] { "ion" }, RV, "t" );
320
321         replaceFrom( RV, new String JavaDoc[] { "Iêre", "iêre", "Ier", "ier" }, "i" );
322         deleteFrom( RV, new String JavaDoc[] { "e" } );
323         deleteFromIfPrecededIn( RV, new String JavaDoc[] { "ç" }, R0, "gu" );
324     }
325
326     /**
327      * Fifth step of the Porter Algorithmn<br>
328      * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
329      */

330     private void step5() {
331         if (R0 != null)
332         {
333             if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill"))
334             {
335                 sb.delete( sb.length() - 1, sb.length() );
336                 setStrings();
337             }
338         }
339     }
340
341     /**
342      * Sixth (and last!) step of the Porter Algorithmn<br>
343      * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
344      */

345     private void step6() {
346         if (R0!=null && R0.length()>0)
347         {
348             boolean seenVowel = false;
349             boolean seenConson = false;
350             int pos = -1;
351             for (int i = R0.length()-1; i > -1; i--)
352             {
353                 char ch = R0.charAt(i);
354                 if (isVowel(ch))
355                 {
356                     if (!seenVowel)
357                     {
358                         if (ch == 'ë' || ch == 'ê')
359                         {
360                             pos = i;
361                             break;
362                         }
363                     }
364                     seenVowel = true;
365                 }
366                 else
367                 {
368                     if (seenVowel)
369                         break;
370                     else
371                         seenConson = true;
372                 }
373             }
374             if (pos > -1 && seenConson && !seenVowel)
375                 sb.setCharAt(pos, 'e');
376         }
377     }
378
379     /**
380      * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
381      *
382      * @param source java.lang.String - the primary source zone for search
383      * @param search java.lang.String[] - the strings to search for suppression
384      * @param from java.lang.String - the secondary source zone for search
385      * @param prefix java.lang.String - the prefix to add to the search string to test
386      * @return boolean - true if modified
387      */

388     private boolean deleteFromIfPrecededIn( String JavaDoc source, String JavaDoc[] search, String JavaDoc from, String JavaDoc prefix ) {
389         boolean found = false;
390         if (source!=null )
391         {
392             for (int i = 0; i < search.length; i++) {
393                 if ( source.endsWith( search[i] ))
394                 {
395                     if (from!=null && from.endsWith( prefix + search[i] ))
396                     {
397                         sb.delete( sb.length() - search[i].length(), sb.length());
398                         found = true;
399                         setStrings();
400                         break;
401                     }
402                 }
403             }
404         }
405         return found;
406     }
407
408     /**
409      * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
410      *
411      * @param source java.lang.String - the primary source zone for search
412      * @param search java.lang.String[] - the strings to search for suppression
413      * @param vowel boolean - true if we need a vowel before the search string
414      * @param from java.lang.String - the secondary source zone for search (where vowel could be)
415      * @return boolean - true if modified
416      */

417     private boolean deleteFromIfTestVowelBeforeIn( String JavaDoc source, String JavaDoc[] search, boolean vowel, String JavaDoc from ) {
418         boolean found = false;
419         if (source!=null && from!=null)
420         {
421             for (int i = 0; i < search.length; i++) {
422                 if ( source.endsWith( search[i] ))
423                 {
424                     if ((search[i].length() + 1) <= from.length())
425                     {
426                         boolean test = isVowel(sb.charAt(sb.length()-(search[i].length()+1)));
427                         if (test == vowel)
428                         {
429                             sb.delete( sb.length() - search[i].length(), sb.length());
430                             modified = true;
431                             found = true;
432                             setStrings();
433                             break;
434                         }
435                     }
436                 }
437             }
438         }
439         return found;
440     }
441
442     /**
443      * Delete a suffix searched in zone "source" if preceded by the prefix
444      *
445      * @param source java.lang.String - the primary source zone for search
446      * @param search java.lang.String[] - the strings to search for suppression
447      * @param prefix java.lang.String - the prefix to add to the search string to test
448      * @param without boolean - true if it will be deleted even without prefix found
449      */

450     private void deleteButSuffixFrom( String JavaDoc source, String JavaDoc[] search, String JavaDoc prefix, boolean without ) {
451         if (source!=null)
452         {
453             for (int i = 0; i < search.length; i++) {
454                 if ( source.endsWith( prefix + search[i] ))
455                 {
456                     sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
457                     modified = true;
458                     setStrings();
459                     break;
460                 }
461                 else if ( without && source.endsWith( search[i] ))
462                 {
463                     sb.delete( sb.length() - search[i].length(), sb.length() );
464                     modified = true;
465                     setStrings();
466                     break;
467                 }
468             }
469         }
470     }
471
472     /**
473      * Delete a suffix searched in zone "source" if preceded by prefix<br>
474      * or replace it with the replace string if preceded by the prefix in the zone "from"<br>
475      * or delete the suffix if specified
476      *
477      * @param source java.lang.String - the primary source zone for search
478      * @param search java.lang.String[] - the strings to search for suppression
479      * @param prefix java.lang.String - the prefix to add to the search string to test
480      * @param without boolean - true if it will be deleted even without prefix found
481      */

482     private void deleteButSuffixFromElseReplace( String JavaDoc source, String JavaDoc[] search, String JavaDoc prefix, boolean without, String JavaDoc from, String JavaDoc replace ) {
483         if (source!=null)
484         {
485             for (int i = 0; i < search.length; i++) {
486                 if ( source.endsWith( prefix + search[i] ))
487                 {
488                     sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
489                     modified = true;
490                     setStrings();
491                     break;
492                 }
493                 else if ( from!=null && from.endsWith( prefix + search[i] ))
494                 {
495                     sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace );
496                     modified = true;
497                     setStrings();
498                     break;
499                 }
500                 else if ( without && source.endsWith( search[i] ))
501                 {
502                     sb.delete( sb.length() - search[i].length(), sb.length() );
503                     modified = true;
504                     setStrings();
505                     break;
506                 }
507             }
508         }
509     }
510
511     /**
512      * Replace a search string with another within the source zone
513      *
514      * @param source java.lang.String - the source zone for search
515      * @param search java.lang.String[] - the strings to search for replacement
516      * @param replace java.lang.String - the replacement string
517      */

518     private boolean replaceFrom( String JavaDoc source, String JavaDoc[] search, String JavaDoc replace ) {
519         boolean found = false;
520         if (source!=null)
521         {
522             for (int i = 0; i < search.length; i++) {
523                 if ( source.endsWith( search[i] ))
524                 {
525                     sb.replace( sb.length() - search[i].length(), sb.length(), replace );
526                     modified = true;
527                     found = true;
528                     setStrings();
529                     break;
530                 }
531             }
532         }
533         return found;
534     }
535
536     /**
537      * Delete a search string within the source zone
538      *
539      * @param source the source zone for search
540      * @param suffix the strings to search for suppression
541      */

542     private void deleteFrom(String JavaDoc source, String JavaDoc[] suffix ) {
543         if (source!=null)
544         {
545             for (int i = 0; i < suffix.length; i++) {
546                 if (source.endsWith( suffix[i] ))
547                 {
548                     sb.delete( sb.length() - suffix[i].length(), sb.length());
549                     modified = true;
550                     setStrings();
551                     break;
552                 }
553             }
554         }
555     }
556
557     /**
558      * Test if a char is a french vowel, including accentuated ones
559      *
560      * @param ch the char to test
561      * @return boolean - true if the char is a vowel
562      */

563     private boolean isVowel(char ch) {
564         switch (ch)
565         {
566             case 'a':
567             case 'e':
568             case 'i':
569             case 'o':
570             case 'u':
571             case 'y':
572             case 'â':
573             case 'à':
574             case 'ç':
575             case 'ë':
576             case 'é':
577             case 'ê':
578             case 'è':
579             case 'ï':
580             case 'î':
581             case 'ô':
582             case 'û':
583             case 'ù':
584                 return true;
585             default:
586                 return false;
587         }
588     }
589
590     /**
591      * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br>
592      * "R is the region after the first non-vowel following a vowel
593      * or is the null region at the end of the word if there is no such non-vowel"<br>
594      * @param buffer java.lang.StringBuffer - the in buffer
595      * @return java.lang.String - the resulting string
596      */

597     private String JavaDoc retrieveR( StringBuffer JavaDoc buffer ) {
598         int len = buffer.length();
599         int pos = -1;
600         for (int c = 0; c < len; c++) {
601             if (isVowel( buffer.charAt( c )))
602             {
603                 pos = c;
604                 break;
605             }
606         }
607         if (pos > -1)
608         {
609             int consonne = -1;
610             for (int c = pos; c < len; c++) {
611                 if (!isVowel(buffer.charAt( c )))
612                 {
613                     consonne = c;
614                     break;
615                 }
616             }
617             if (consonne > -1 && (consonne+1) < len)
618                 return buffer.substring( consonne+1, len );
619             else
620                 return null;
621         }
622         else
623             return null;
624     }
625
626     /**
627      * Retrieve the "RV zone" from a buffer an return the corresponding string<br>
628      * "If the word begins with two vowels, RV is the region after the third letter,
629      * otherwise the region after the first vowel not at the beginning of the word,
630      * or the end of the word if these positions cannot be found."<br>
631      * @param buffer java.lang.StringBuffer - the in buffer
632      * @return java.lang.String - the resulting string
633      */

634     private String JavaDoc retrieveRV( StringBuffer JavaDoc buffer ) {
635         int len = buffer.length();
636         if ( buffer.length() > 3)
637         {
638             if ( isVowel(buffer.charAt( 0 )) && isVowel(buffer.charAt( 1 ))) {
639                 return buffer.substring(3,len);
640             }
641             else
642             {
643                 int pos = 0;
644                 for (int c = 1; c < len; c++) {
645                     if (isVowel( buffer.charAt( c )))
646                     {
647                         pos = c;
648                         break;
649                     }
650                 }
651                 if ( pos+1 < len )
652                     return buffer.substring( pos+1, len );
653                 else
654                     return null;
655             }
656         }
657         else
658             return null;
659     }
660
661
662
663     /**
664      * Turns u and i preceded AND followed by a vowel to UpperCase<br>
665      * Turns y preceded OR followed by a vowel to UpperCase<br>
666      * Turns u preceded by q to UpperCase<br>
667      *
668      * @param buffer java.util.StringBuffer - the buffer to treat
669      * @return java.util.StringBuffer - the treated buffer
670      */

671     private StringBuffer JavaDoc treatVowels( StringBuffer JavaDoc buffer ) {
672         for ( int c = 0; c < buffer.length(); c++ ) {
673             char ch = buffer.charAt( c );
674
675             if (c == 0) // first char
676
{
677                 if (buffer.length()>1)
678                 {
679                     if (ch == 'y' && isVowel(buffer.charAt( c + 1 )))
680                         buffer.setCharAt( c, 'Y' );
681                 }
682             }
683             else if (c == buffer.length()-1) // last char
684
{
685                 if (ch == 'u' && buffer.charAt( c - 1 ) == 'q')
686                     buffer.setCharAt( c, 'U' );
687                 if (ch == 'y' && isVowel(buffer.charAt( c - 1 )))
688                     buffer.setCharAt( c, 'Y' );
689             }
690             else // other cases
691
{
692                 if (ch == 'u')
693                 {
694                     if (buffer.charAt( c - 1) == 'q')
695                         buffer.setCharAt( c, 'U' );
696                     else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
697                         buffer.setCharAt( c, 'U' );
698                 }
699                 if (ch == 'i')
700                 {
701                     if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
702                         buffer.setCharAt( c, 'I' );
703                 }
704                 if (ch == 'y')
705                 {
706                     if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 )))
707                         buffer.setCharAt( c, 'Y' );
708                 }
709             }
710         }
711
712         return buffer;
713     }
714
715     /**
716      * Checks a term if it can be processed correctly.
717      *
718      * @return boolean - true if, and only if, the given term consists in letters.
719      */

720     private boolean isStemmable( String JavaDoc term ) {
721         boolean upper = false;
722         int first = -1;
723         for ( int c = 0; c < term.length(); c++ ) {
724             // Discard terms that contain non-letter characters.
725
if ( !Character.isLetter( term.charAt( c ) ) ) {
726                 return false;
727             }
728             // Discard terms that contain multiple uppercase letters.
729
if ( Character.isUpperCase( term.charAt( c ) ) ) {
730                 if ( upper ) {
731                     return false;
732                 }
733             // First encountered uppercase letter, set flag and save
734
// position.
735
else {
736                     first = c;
737                     upper = true;
738                 }
739             }
740         }
741         // Discard the term if it contains a single uppercase letter that
742
// is not starting the term.
743
if ( first > 0 ) {
744             return false;
745         }
746         return true;
747     }
748 }
Popular Tags