FrenchStemmer


1   package org.contineo.core.text.analyze.fr;
2   
3   import org.contineo.core.text.analyze.Stemmer;
4   
5   /* ====================================================================
6    * The Apache Software License, Version 1.1
7    *
8    * Copyright (c) 2001 The Apache Software Foundation.  All rights
9    * reserved.
10   *
11   * Redistribution and use in source and binary forms, with or without
12   * modification, are permitted provided that the following conditions
13   * are met:
14   *
15   * 1. Redistributions of source code must retain the above copyright
16   *    notice, this list of conditions and the following disclaimer.
17   *
18   * 2. Redistributions in binary form must reproduce the above copyright
19   *    notice, this list of conditions and the following disclaimer in
20   *    the documentation and/or other materials provided with the
21   *    distribution.
22   *
23   * 3. The end-user documentation included with the redistribution,
24   *    if any, must include the following acknowledgment:
25   *       "This product includes software developed by the
26   *        Apache Software Foundation (http://www.apache.org/)."
27   *    Alternately, this acknowledgment may appear in the software itself,
28   *    if and wherever such third-party acknowledgments normally appear.
29   *
30   * 4. The names "Apache" and "Apache Software Foundation" and
31   *    "Apache Lucene" must not be used to endorse or promote products
32   *    derived from this software without prior written permission. For
33   *    written permission, please contact apache@apache.org.
34   *
35   * 5. Products derived from this software may not be called "Apache",
36   *    "Apache Lucene", nor may "Apache" appear in their name, without
37   *    prior written permission of the Apache Software Foundation.
38   *
39   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
40   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
41   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
42   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
43   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
46   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
47   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
48   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
49   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50   * SUCH DAMAGE.
51   * ====================================================================
52   *
53   * This software consists of voluntary contributions made by many
54   * individuals on behalf of the Apache Software Foundation.  For more
55   * information on the Apache Software Foundation, please see
56   * <http://www.apache.org/>.
57   */
58  
59  /**
60   * A stemmer for French words. The algorithm is based on the work of
61   * Dr Martin Porter on his snowball project<br>
62   * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
63   * (French stemming algorithm) for details
64   *
65   * @author    Patrick Talbot
66   */
67  
68  public class FrenchStemmer implements Stemmer {
69  
70      /**
71       * Buffer for the terms while stemming them.
72       */
73      private StringBuffer   sb = new StringBuffer  ();
74  
75      /**
76       * A temporary buffer, used to reconstruct R2
77       */
78       private StringBuffer   tb = new StringBuffer  ();
79  
80      /**
81       * Region R0 is equal to the whole buffer
82       */
83      private String   R0;
84  
85      /**
86       * Region RV
87       * "If the word begins with two vowels, RV is the region after the third letter,
88       * otherwise the region after the first vowel not at the beginning of the word,
89       * or the end of the word if these positions cannot be found."
90       */
91      private String   RV;
92  
93      /**
94       * Region R1
95       * "R1 is the region after the first non-vowel following a vowel
96       * or is the null region at the end of the word if there is no such non-vowel"
97       */
98      private String   R1;
99  
100     /**
101      * Region R2
102      * "R2 is the region after the first non-vowel in R1 following a vowel
103      * or is the null region at the end of the word if there is no such non-vowel"
104      */
105     private String   R2;
106 
107 
108     /**
109      * Set to true if we need to perform step 2
110      */
111     private boolean suite;
112 
113     /**
114      * Set to true if the buffer was modified
115      */
116     private boolean modified;
117 
118 
119     /**
120      * Stemms the given term to a unique <tt>discriminator</tt>.
121      *
122      * @param term  java.langString The term that should be stemmed
123      * @return java.lang.String  Discriminator for <tt>term</tt>
124      */
125     public String   stem( String   term ) {
126         if ( !isStemmable( term ) ) {
127             return term;
128         }
129 
130         // Use lowercase for medium stemming.
131         term = term.toLowerCase();
132 
133         // Reset the StringBuffer.
134         sb.delete( 0, sb.length() );
135         sb.insert( 0, term );
136 
137         // reset the booleans
138         modified = false;
139         suite = false;
140 
141         sb = treatVowels( sb );
142 
143         setStrings();
144 
145         step1();
146 
147         if (!modified || suite)
148         {
149             if (RV != null)
150             {
151                 suite = step2a();
152                 if (!suite)
153                     step2b();
154             }
155         }
156 
157         if (modified || suite)
158             step3();
159         else
160             step4();
161 
162         step5();
163 
164         step6();
165 
166         return sb.toString();
167     }
168 
169     /**
170      * Sets the search region Strings<br>
171      * it needs to be done each time the buffer was modified
172      */
173     private void setStrings() {
174         // set the strings
175         R0 = sb.toString();
176         RV = retrieveRV( sb );
177         R1 = retrieveR( sb );
178         if ( R1 != null )
179         {
180             tb.delete( 0, tb.length() );
181             tb.insert( 0, R1 );
182             R2 = retrieveR( tb );
183         }
184         else
185             R2 = null;
186     }
187 
188     /**
189      * First step of the Porter Algorithmn<br>
190      * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
191      */
192     private void step1( ) {
193         String  [] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
194         deleteFrom( R2, suffix );
195 
196         replaceFrom( R2, new String  [] { "logies", "logie" }, "log" );
197         replaceFrom( R2, new String  [] { "usions", "utions", "usion", "ution" }, "u" );
198         replaceFrom( R2, new String  [] { "ences", "ence" }, "ent" );
199 
200         String  [] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"};
201         deleteButSuffixFromElseReplace( R2, search, "ic",  true, R0, "iqU" );
202 
203         deleteButSuffixFromElseReplace( R2, new String  [] { "ements", "ement" }, "eus", false, R0, "eux" );
204         deleteButSuffixFrom( R2, new String  [] { "ements", "ement" }, "ativ", false );
205         deleteButSuffixFrom( R2, new String  [] { "ements", "ement" }, "iv", false );
206         deleteButSuffixFrom( R2, new String  [] { "ements", "ement" }, "abl", false );
207         deleteButSuffixFrom( R2, new String  [] { "ements", "ement" }, "iqU", false );
208 
209         deleteFromIfTestVowelBeforeIn( R1, new String  [] { "issements", "issement" }, false, R0 );
210         deleteFrom( RV, new String  [] { "ements", "ement" } );
211 
212         deleteButSuffixFromElseReplace( R2, new String  [] { "it�s", "it�" }, "abil", false, R0, "abl" );
213         deleteButSuffixFromElseReplace( R2, new String  [] { "it�s", "it�" }, "ic", false, R0, "iqU" );
214         deleteButSuffixFrom( R2, new String  [] { "it�s", "it�" }, "iv", true );
215 
216         String  [] autre = { "ifs", "ives", "if", "ive" };
217         deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" );
218         deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" );
219 
220         replaceFrom( R0, new String  [] { "eaux" }, "eau" );
221 
222         replaceFrom( R1, new String  [] { "aux" }, "al" );
223 
224         deleteButSuffixFromElseReplace( R2, new String  [] { "euses", "euse" }, "", true, R1, "eux" );
225 
226         deleteFrom( R2, new String  [] { "eux" } );
227 
228         // if one of the next steps is performed, we will need to perform step2a
229         boolean temp = false;
230         temp = replaceFrom( RV, new String  [] { "amment" }, "ant" );
231         if (temp == true)
232             suite = true;
233         temp = replaceFrom( RV, new String  [] { "emment" }, "ent" );
234         if (temp == true)
235             suite = true;
236         temp = deleteFromIfTestVowelBeforeIn( RV, new String  [] { "ments", "ment" }, true, RV );
237         if (temp == true)
238             suite = true;
239 
240     }
241 
242     /**
243      * Second step (A) of the Porter Algorithmn<br>
244      * Will be performed if nothing changed from the first step
245      * or changed were done in the amment, emment, ments or ment suffixes<br>
246      * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
247      *
248      * @return boolean - true if something changed in the StringBuffer
249      */
250     private boolean step2a() {
251         String  [] search = { "�mes", "�tes", "iraIent", "irait", "irais", "irai", "iras", "ira",
252                             "irent", "iriez", "irez", "irions", "irons", "iront",
253                             "issaIent", "issais", "issantes", "issante", "issants", "issant",
254                             "issait", "issais", "issions", "issons", "issiez", "issez", "issent",
255                             "isses", "isse", "ir", "is", "�t", "it", "ies", "ie", "i" };
256         return deleteFromIfTestVowelBeforeIn( RV, search, false, RV );
257     }
258 
259     /**
260      * Second step (B) of the Porter Algorithmn<br>
261      * Will be performed if step 2 A was performed unsuccessfully<br>
262      * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
263      */
264     private void step2b() {
265         String  [] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
266                             "erons", "eront","erez", "�rent", "era", "�es", "iez",
267                             "�e", "�s", "er", "ez", "�" };
268         deleteFrom( RV, suffix );
269 
270         String  [] search = { "assions", "assiez", "assent", "asses", "asse", "aIent",
271                             "antes", "aIent", "Aient", "ante", "�mes", "�tes", "ants", "ant",
272                             "ait", "a�t", "ais", "Ait", "A�t", "Ais", "�t", "as", "ai", "Ai", "a" };
273         deleteButSuffixFrom( RV, search, "e", true );
274 
275         deleteFrom( R2, new String  [] { "ions" } );
276     }
277 
278     /**
279      * Third step of the Porter Algorithmn<br>
280      * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
281      */
282     private void step3() {
283         if (sb.length()>0)
284         {
285             char ch = sb.charAt( sb.length()-1 );
286             if (ch == 'Y')
287             {
288                 sb.setCharAt( sb.length()-1, 'i' );
289                 setStrings();
290             }
291             else if (ch == '�')
292             {
293                 sb.setCharAt( sb.length()-1, 'c' );
294                 setStrings();
295             }
296         }
297     }
298 
299     /**
300      * Fourth step of the Porter Algorithmn<br>
301      * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
302      */
303     private void step4() {
304         if (sb.length() > 1)
305         {
306             char ch = sb.charAt( sb.length()-1 );
307             if (ch == 's')
308             {
309                 char b = sb.charAt( sb.length()-2 );
310                 if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != '�' && b != 's')
311                 {
312                     sb.delete( sb.length() - 1, sb.length());
313                     setStrings();
314                 }
315             }
316         }
317         boolean found = deleteFromIfPrecededIn( R2, new String  [] { "ion" }, RV, "s" );
318         if (!found)
319         found = deleteFromIfPrecededIn( R2, new String  [] { "ion" }, RV, "t" );
320 
321         replaceFrom( RV, new String  [] { "I�re", "i�re", "Ier", "ier" }, "i" );
322         deleteFrom( RV, new String  [] { "e" } );
323         deleteFromIfPrecededIn( RV, new String  [] { "�" }, R0, "gu" );
324     }
325 
326     /**
327      * Fifth step of the Porter Algorithmn<br>
328      * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
329      */
330     private void step5() {
331         if (R0 != null)
332         {
333             if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill"))
334             {
335                 sb.delete( sb.length() - 1, sb.length() );
336                 setStrings();
337             }
338         }
339     }
340 
341     /**
342      * Sixth (and last!) step of the Porter Algorithmn<br>
343      * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
344      */
345     private void step6() {
346         if (R0!=null && R0.length()>0)
347         {
348             boolean seenVowel = false;
349             boolean seenConson = false;
350             int pos = -1;
351             for (int i = R0.length()-1; i > -1; i--)
352             {
353                 char ch = R0.charAt(i);
354                 if (isVowel(ch))
355                 {
356                     if (!seenVowel)
357                     {
358                         if (ch == '�' || ch == '�')
359                         {
360                             pos = i;
361                             break;
362                         }
363                     }
364                     seenVowel = true;
365                 }
366                 else
367                 {
368                     if (seenVowel)
369                         break;
370                     else
371                         seenConson = true;
372                 }
373             }
374             if (pos > -1 && seenConson && !seenVowel)
375                 sb.setCharAt(pos, 'e');
376         }
377     }
378 
379     /**
380      * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
381      *
382      * @param source java.lang.String - the primary source zone for search
383      * @param search java.lang.String[] - the strings to search for suppression
384      * @param from java.lang.String - the secondary source zone for search
385      * @param prefix java.lang.String - the prefix to add to the search string to test
386      * @return boolean - true if modified
387      */
388     private boolean deleteFromIfPrecededIn( String   source, String  [] search, String   from, String   prefix ) {
389         boolean found = false;
390         if (source!=null )
391         {
392             for (int i = 0; i < search.length; i++) {
393                 if ( source.endsWith( search[i] ))
394                 {
395                     if (from!=null && from.endsWith( prefix + search[i] ))
396                     {
397                         sb.delete( sb.length() - search[i].length(), sb.length());
398                         found = true;
399                         setStrings();
400                         break;
401                     }
402                 }
403             }
404         }
405         return found;
406     }
407 
408     /**
409      * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
410      *
411      * @param source java.lang.String - the primary source zone for search
412      * @param search java.lang.String[] - the strings to search for suppression
413      * @param vowel boolean - true if we need a vowel before the search string
414      * @param from java.lang.String - the secondary source zone for search (where vowel could be)
415      * @return boolean - true if modified
416      */
417     private boolean deleteFromIfTestVowelBeforeIn( String   source, String  [] search, boolean vowel, String   from ) {
418         boolean found = false;
419         if (source!=null && from!=null)
420         {
421             for (int i = 0; i < search.length; i++) {
422                 if ( source.endsWith( search[i] ))
423                 {
424                     if ((search[i].length() + 1) <= from.length())
425                     {
426                         boolean test = isVowel(sb.charAt(sb.length()-(search[i].length()+1)));
427                         if (test == vowel)
428                         {
429                             sb.delete( sb.length() - search[i].length(), sb.length());
430                             modified = true;
431                             found = true;
432                             setStrings();
433                             break;
434                         }
435                     }
436                 }
437             }
438         }
439         return found;
440     }
441 
442     /**
443      * Delete a suffix searched in zone "source" if preceded by the prefix
444      *
445      * @param source java.lang.String - the primary source zone for search
446      * @param search java.lang.String[] - the strings to search for suppression
447      * @param prefix java.lang.String - the prefix to add to the search string to test
448      * @param without boolean - true if it will be deleted even without prefix found
449      */
450     private void deleteButSuffixFrom( String   source, String  [] search, String   prefix, boolean without ) {
451         if (source!=null)
452         {
453             for (int i = 0; i < search.length; i++) {
454                 if ( source.endsWith( prefix + search[i] ))
455                 {
456                     sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
457                     modified = true;
458                     setStrings();
459                     break;
460                 }
461                 else if ( without && source.endsWith( search[i] ))
462                 {
463                     sb.delete( sb.length() - search[i].length(), sb.length() );
464                     modified = true;
465                     setStrings();
466                     break;
467                 }
468             }
469         }
470     }
471 
472     /**
473      * Delete a suffix searched in zone "source" if preceded by prefix<br>
474      * or replace it with the replace string if preceded by the prefix in the zone "from"<br>
475      * or delete the suffix if specified
476      *
477      * @param source java.lang.String - the primary source zone for search
478      * @param search java.lang.String[] - the strings to search for suppression
479      * @param prefix java.lang.String - the prefix to add to the search string to test
480      * @param without boolean - true if it will be deleted even without prefix found
481      */
482     private void deleteButSuffixFromElseReplace( String   source, String  [] search, String   prefix, boolean without, String   from, String   replace ) {
483         if (source!=null)
484         {
485             for (int i = 0; i < search.length; i++) {
486                 if ( source.endsWith( prefix + search[i] ))
487                 {
488                     sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
489                     modified = true;
490                     setStrings();
491                     break;
492                 }
493                 else if ( from!=null && from.endsWith( prefix + search[i] ))
494                 {
495                     sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace );
496                     modified = true;
497                     setStrings();
498                     break;
499                 }
500                 else if ( without && source.endsWith( search[i] ))
501                 {
502                     sb.delete( sb.length() - search[i].length(), sb.length() );
503                     modified = true;
504                     setStrings();
505                     break;
506                 }
507             }
508         }
509     }
510 
511     /**
512      * Replace a search string with another within the source zone
513      *
514      * @param source java.lang.String - the source zone for search
515      * @param search java.lang.String[] - the strings to search for replacement
516      * @param replace java.lang.String - the replacement string
517      */
518     private boolean replaceFrom( String   source, String  [] search, String   replace ) {
519         boolean found = false;
520         if (source!=null)
521         {
522             for (int i = 0; i < search.length; i++) {
523                 if ( source.endsWith( search[i] ))
524                 {
525                     sb.replace( sb.length() - search[i].length(), sb.length(), replace );
526                     modified = true;
527                     found = true;
528                     setStrings();
529                     break;
530                 }
531             }
532         }
533         return found;
534     }
535 
536     /**
537      * Delete a search string within the source zone
538      *
539      * @param source the source zone for search
540      * @param suffix the strings to search for suppression
541      */
542     private void deleteFrom(String   source, String  [] suffix ) {
543         if (source!=null)
544         {
545             for (int i = 0; i < suffix.length; i++) {
546                 if (source.endsWith( suffix[i] ))
547                 {
548                     sb.delete( sb.length() - suffix[i].length(), sb.length());
549                     modified = true;
550                     setStrings();
551                     break;
552                 }
553             }
554         }
555     }
556 
557     /**
558      * Test if a char is a french vowel, including accentuated ones
559      *
560      * @param ch the char to test
561      * @return boolean - true if the char is a vowel
562      */
563     private boolean isVowel(char ch) {
564         switch (ch)
565         {
566             case 'a':
567             case 'e':
568             case 'i':
569             case 'o':
570             case 'u':
571             case 'y':
572             case '�':
573             case '�':
574             case '�':
575             case '�':
576             case '�':
577             case '�':
578             case '�':
579             case '�':
580             case '�':
581             case '�':
582             case '�':
583             case '�':
584                 return true;
585             default:
586                 return false;
587         }
588     }
589 
590     /**
591      * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br>
592      * "R is the region after the first non-vowel following a vowel
593      * or is the null region at the end of the word if there is no such non-vowel"<br>
594      * @param buffer java.lang.StringBuffer - the in buffer
595      * @return java.lang.String - the resulting string
596      */
597     private String   retrieveR( StringBuffer   buffer ) {
598         int len = buffer.length();
599         int pos = -1;
600         for (int c = 0; c < len; c++) {
601             if (isVowel( buffer.charAt( c )))
602             {
603                 pos = c;
604                 break;
605             }
606         }
607         if (pos > -1)
608         {
609             int consonne = -1;
610             for (int c = pos; c < len; c++) {
611                 if (!isVowel(buffer.charAt( c )))
612                 {
613                     consonne = c;
614                     break;
615                 }
616             }
617             if (consonne > -1 && (consonne+1) < len)
618                 return buffer.substring( consonne+1, len );
619             else
620                 return null;
621         }
622         else
623             return null;
624     }
625 
626     /**
627      * Retrieve the "RV zone" from a buffer an return the corresponding string<br>
628      * "If the word begins with two vowels, RV is the region after the third letter,
629      * otherwise the region after the first vowel not at the beginning of the word,
630      * or the end of the word if these positions cannot be found."<br>
631      * @param buffer java.lang.StringBuffer - the in buffer
632      * @return java.lang.String - the resulting string
633      */
634     private String   retrieveRV( StringBuffer   buffer ) {
635         int len = buffer.length();
636         if ( buffer.length() > 3)
637         {
638             if ( isVowel(buffer.charAt( 0 )) && isVowel(buffer.charAt( 1 ))) {
639                 return buffer.substring(3,len);
640             }
641             else
642             {
643                 int pos = 0;
644                 for (int c = 1; c < len; c++) {
645                     if (isVowel( buffer.charAt( c )))
646                     {
647                         pos = c;
648                         break;
649                     }
650                 }
651                 if ( pos+1 < len )
652                     return buffer.substring( pos+1, len );
653                 else
654                     return null;
655             }
656         }
657         else
658             return null;
659     }
660 
661 
662 
663     /**
664      * Turns u and i preceded AND followed by a vowel to UpperCase<br>
665      * Turns y preceded OR followed by a vowel to UpperCase<br>
666      * Turns u preceded by q to UpperCase<br>
667      *
668      * @param buffer java.util.StringBuffer - the buffer to treat
669      * @return java.util.StringBuffer - the treated buffer
670      */
671     private StringBuffer   treatVowels( StringBuffer   buffer ) {
672         for ( int c = 0; c < buffer.length(); c++ ) {
673             char ch = buffer.charAt( c );
674 
675             if (c == 0) // first char
676             {
677                 if (buffer.length()>1)
678                 {
679                     if (ch == 'y' && isVowel(buffer.charAt( c + 1 )))
680                         buffer.setCharAt( c, 'Y' );
681                 }
682             }
683             else if (c == buffer.length()-1) // last char
684             {
685                 if (ch == 'u' && buffer.charAt( c - 1 ) == 'q')
686                     buffer.setCharAt( c, 'U' );
687                 if (ch == 'y' && isVowel(buffer.charAt( c - 1 )))
688                     buffer.setCharAt( c, 'Y' );
689             }
690             else // other cases
691             {
692                 if (ch == 'u')
693                 {
694                     if (buffer.charAt( c - 1) == 'q')
695                         buffer.setCharAt( c, 'U' );
696                     else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
697                         buffer.setCharAt( c, 'U' );
698                 }
699                 if (ch == 'i')
700                 {
701                     if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
702                         buffer.setCharAt( c, 'I' );
703                 }
704                 if (ch == 'y')
705                 {
706                     if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 )))
707                         buffer.setCharAt( c, 'Y' );
708                 }
709             }
710         }
711 
712         return buffer;
713     }
714 
715     /**
716      * Checks a term if it can be processed correctly.
717      *
718      * @return boolean - true if, and only if, the given term consists in letters.
719      */
720     private boolean isStemmable( String   term ) {
721         boolean upper = false;
722         int first = -1;
723         for ( int c = 0; c < term.length(); c++ ) {
724             // Discard terms that contain non-letter characters.
725             if ( !Character.isLetter( term.charAt( c ) ) ) {
726                 return false;
727             }
728             // Discard terms that contain multiple uppercase letters.
729             if ( Character.isUpperCase( term.charAt( c ) ) ) {
730                 if ( upper ) {
731                     return false;
732                 }
733             // First encountered uppercase letter, set flag and save
734             // position.
735                 else {
736                     first = c;
737                     upper = true;
738                 }
739             }
740         }
741         // Discard the term if it contains a single uppercase letter that
742         // is not starting the term.
743         if ( first > 0 ) {
744             return false;
745         }
746         return true;
747     }
748 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags