EnglishStemmer


1   package org.contineo.core.text.analyze.en;
2   
3   /**
4    * Copyright 2004 The Apache Software Foundation
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  /*
20  
21   Porter stemmer in Java. The original paper is in
22  
23   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
24   no. 3, pp 130-137,
25  
26   See also http://www.tartarus.org/~martin/PorterStemmer/index.html
27  
28   */
29  
30  import org.contineo.core.text.analyze.Stemmer;
31  
32  /**
33   * 
34   * Stemmer, implementing the Porter Stemming Algorithm
35   * 
36   * The Stemmer class transforms a word into its root form. The input word can be
37   * provided a character at time (by calling add()), or at once by calling one of
38   * the various stem(something) methods.
39   */
40  
41  class EnglishStemmer implements Stemmer {
42      private char[] b;
43  
44      private int i;
45      /* offset into b */ private int j;
46      private int k;
47      private int k0;
48  
49      private boolean dirty = false;
50  
51      private static final int INC = 50; /* unit of size whereby b is increased */
52  
53      private static final int EXTRA = 1;
54  
55      public EnglishStemmer() {
56          b = new char[INC];
57          i = 0;
58      }
59  
60      /**
61       * reset() resets the stemmer so it can stem another word. If you invoke the
62       * stemmer by calling add(char) and then stem(), you must call reset()
63       * before starting another word.
64       */
65      public void reset() {
66          i = 0;
67          dirty = false;
68      }
69  
70      /**
71       * Add a character to the word being stemmed. When you are finished adding
72       * characters, you can call stem(void) to process the word.
73       */
74      public void add(char ch) {
75          if (b.length <= i + EXTRA) {
76              char[] new_b = new char[b.length + INC];
77              for (int c = 0; c < b.length; c++)
78                  new_b[c] = b[c];
79              b = new_b;
80          }
81          b[i++] = ch;
82      }
83  
84      /**
85       * After a word has been stemmed, it can be retrieved by toString(), or a
86       * reference to the internal buffer can be retrieved by getResultBuffer and
87       * getResultLength (which is generally more efficient.)
88       */
89      public String   toString() {
90          return new String  (b, 0, i);
91      }
92  
93      /**
94       * Returns the length of the word resulting from the stemming process.
95       */
96      public int getResultLength() {
97          return i;
98      }
99  
100     /**
101      * Returns a reference to a character buffer containing the results of the
102      * stemming process. You also need to consult getResultLength() to determine
103      * the length of the result.
104      */
105     public char[] getResultBuffer() {
106         return b;
107     }
108 
109     /* cons(i) is true <=> b[i] is a consonant. */
110 
111     private final boolean cons(int i) {
112         switch (b[i]) {
113         case 'a':
114         case 'e':
115         case 'i':
116         case 'o':
117         case 'u':
118             return false;
119         case 'y':
120             return (i == k0) ? true : !cons(i - 1);
121         default:
122             return true;
123         }
124     }
125 
126     /*
127      * m() measures the number of consonant sequences between k0 and j. if c is
128      * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
129      * presence,
130      * 
131      * <c> <v> gives 0 <c>vc <v> gives 1 <c>vcvc <v> gives 2 <c>vcvcvc <v> gives
132      * 3 ....
133      */
134 
135     private final int m() {
136         int n = 0;
137         int i = k0;
138         while (true) {
139             if (i > j)
140                 return n;
141             if (!cons(i))
142                 break;
143             i++;
144         }
145         i++;
146         while (true) {
147             while (true) {
148                 if (i > j)
149                     return n;
150                 if (cons(i))
151                     break;
152                 i++;
153             }
154             i++;
155             n++;
156             while (true) {
157                 if (i > j)
158                     return n;
159                 if (!cons(i))
160                     break;
161                 i++;
162             }
163             i++;
164         }
165     }
166 
167     /* vowelinstem() is true <=> k0,...j contains a vowel */
168 
169     private final boolean vowelinstem() {
170         int i;
171         for (i = k0; i <= j; i++)
172             if (!cons(i))
173                 return true;
174         return false;
175     }
176 
177     /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
178 
179     private final boolean doublec(int j) {
180         if (j < k0 + 1)
181             return false;
182         if (b[j] != b[j - 1])
183             return false;
184         return cons(j);
185     }
186 
187     /*
188      * cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
189      * and also if the second c is not w,x or y. this is used when trying to
190      * restore an e at the end of a short word. e.g.
191      * 
192      * cav(e), lov(e), hop(e), crim(e), but snow, box, tray.
193      *  
194      */
195 
196     private final boolean cvc(int i) {
197         if (i < k0 + 2 || !cons(i) || cons(i - 1) || !cons(i - 2))
198             return false;
199         else {
200             int ch = b[i];
201             if (ch == 'w' || ch == 'x' || ch == 'y')
202                 return false;
203         }
204         return true;
205     }
206 
207     private final boolean ends(String   s) {
208         int l = s.length();
209         int o = k - l + 1;
210         if (o < k0)
211             return false;
212         for (int i = 0; i < l; i++)
213             if (b[o + i] != s.charAt(i))
214                 return false;
215         j = k - l;
216         return true;
217     }
218 
219     /*
220      * setto(s) sets (j+1),...k to the characters in the string s, readjusting
221      * k.
222      */
223 
224     void setto(String   s) {
225         int l = s.length();
226         int o = j + 1;
227         for (int i = 0; i < l; i++)
228             b[o + i] = s.charAt(i);
229         k = j + l;
230         dirty = true;
231     }
232 
233     /* r(s) is used further down. */
234 
235     void r(String   s) {
236         if (m() > 0)
237             setto(s);
238     }
239 
240     /*
241      * step1() gets rid of plurals and -ed or -ing. e.g.
242      * 
243      * caresses -> caress ponies -> poni ties -> ti caress -> caress cats -> cat
244      * 
245      * feed -> feed agreed -> agree disabled -> disable
246      * 
247      * matting -> mat mating -> mate meeting -> meet milling -> mill messing ->
248      * mess
249      * 
250      * meetings -> meet
251      *  
252      */
253 
254     private final void step1() {
255         if (b[k] == 's') {
256             if (ends("sses"))
257                 k -= 2;
258             else if (ends("ies"))
259                 setto("i");
260             else if (b[k - 1] != 's')
261                 k--;
262         }
263         if (ends("eed")) {
264             if (m() > 0)
265                 k--;
266         } else if ((ends("ed") || ends("ing")) && vowelinstem()) {
267             k = j;
268             if (ends("at"))
269                 setto("ate");
270             else if (ends("bl"))
271                 setto("ble");
272             else if (ends("iz"))
273                 setto("ize");
274             else if (doublec(k)) {
275                 int ch = b[k--];
276                 if (ch == 'l' || ch == 's' || ch == 'z')
277                     k++;
278             } else if (m() == 1 && cvc(k))
279                 setto("e");
280         }
281     }
282 
283     /* step2() turns terminal y to i when there is another vowel in the stem. */
284 
285     private final void step2() {
286         if (ends("y") && vowelinstem()) {
287             b[k] = 'i';
288             dirty = true;
289         }
290     }
291 
292     /*
293      * step3() maps double suffices to single ones. so -ization ( = -ize plus
294      * -ation) maps to -ize etc. note that the string before the suffix must
295      * give m() > 0.
296      */
297 
298     private final void step3() {
299         if (k == k0)
300             return; /* For Bug 1 */
301         switch (b[k - 1]) {
302         case 'a':
303             if (ends("ational")) {
304                 r("ate");
305                 break;
306             }
307             if (ends("tional")) {
308                 r("tion");
309                 break;
310             }
311             break;
312         case 'c':
313             if (ends("enci")) {
314                 r("ence");
315                 break;
316             }
317             if (ends("anci")) {
318                 r("ance");
319                 break;
320             }
321             break;
322         case 'e':
323             if (ends("izer")) {
324                 r("ize");
325                 break;
326             }
327             break;
328         case 'l':
329             if (ends("bli")) {
330                 r("ble");
331                 break;
332             }
333             if (ends("alli")) {
334                 r("al");
335                 break;
336             }
337             if (ends("entli")) {
338                 r("ent");
339                 break;
340             }
341             if (ends("eli")) {
342                 r("e");
343                 break;
344             }
345             if (ends("ousli")) {
346                 r("ous");
347                 break;
348             }
349             break;
350         case 'o':
351             if (ends("ization")) {
352                 r("ize");
353                 break;
354             }
355             if (ends("ation")) {
356                 r("ate");
357                 break;
358             }
359             if (ends("ator")) {
360                 r("ate");
361                 break;
362             }
363             break;
364         case 's':
365             if (ends("alism")) {
366                 r("al");
367                 break;
368             }
369             if (ends("iveness")) {
370                 r("ive");
371                 break;
372             }
373             if (ends("fulness")) {
374                 r("ful");
375                 break;
376             }
377             if (ends("ousness")) {
378                 r("ous");
379                 break;
380             }
381             break;
382         case 't':
383             if (ends("aliti")) {
384                 r("al");
385                 break;
386             }
387             if (ends("iviti")) {
388                 r("ive");
389                 break;
390             }
391             if (ends("biliti")) {
392                 r("ble");
393                 break;
394             }
395             break;
396         case 'g':
397             if (ends("logi")) {
398                 r("log");
399                 break;
400             }
401         }
402     }
403 
404     /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
405 
406     private final void step4() {
407         switch (b[k]) {
408         case 'e':
409             if (ends("icate")) {
410                 r("ic");
411                 break;
412             }
413             if (ends("ative")) {
414                 r("");
415                 break;
416             }
417             if (ends("alize")) {
418                 r("al");
419                 break;
420             }
421             break;
422         case 'i':
423             if (ends("iciti")) {
424                 r("ic");
425                 break;
426             }
427             break;
428         case 'l':
429             if (ends("ical")) {
430                 r("ic");
431                 break;
432             }
433             if (ends("ful")) {
434                 r("");
435                 break;
436             }
437             break;
438         case 's':
439             if (ends("ness")) {
440                 r("");
441                 break;
442             }
443             break;
444         }
445     }
446 
447     /* step5() takes off -ant, -ence etc., in context <c>vcvc <v>. */
448 
449     private final void step5() {
450         if (k == k0)
451             return; /* for Bug 1 */
452         switch (b[k - 1]) {
453         case 'a':
454             if (ends("al"))
455                 break;
456             return;
457         case 'c':
458             if (ends("ance"))
459                 break;
460             if (ends("ence"))
461                 break;
462             return;
463         case 'e':
464             if (ends("er"))
465                 break;
466             return;
467         case 'i':
468             if (ends("ic"))
469                 break;
470             return;
471         case 'l':
472             if (ends("able"))
473                 break;
474             if (ends("ible"))
475                 break;
476             return;
477         case 'n':
478             if (ends("ant"))
479                 break;
480             if (ends("ement"))
481                 break;
482             if (ends("ment"))
483                 break;
484             /* element etc. not stripped before the m */
485             if (ends("ent"))
486                 break;
487             return;
488         case 'o':
489             if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't'))
490                 break;
491             /* j >= 0 fixes Bug 2 */
492             if (ends("ou"))
493                 break;
494             return;
495         /* takes care of -ous */
496         case 's':
497             if (ends("ism"))
498                 break;
499             return;
500         case 't':
501             if (ends("ate"))
502                 break;
503             if (ends("iti"))
504                 break;
505             return;
506         case 'u':
507             if (ends("ous"))
508                 break;
509             return;
510         case 'v':
511             if (ends("ive"))
512                 break;
513             return;
514         case 'z':
515             if (ends("ize"))
516                 break;
517             return;
518         default:
519             return;
520         }
521         if (m() > 1)
522             k = j;
523     }
524 
525     /* step6() removes a final -e if m() > 1. */
526 
527     private final void step6() {
528         j = k;
529         if (b[k] == 'e') {
530             int a = m();
531             if (a > 1 || a == 1 && !cvc(k - 1))
532                 k--;
533         }
534         if (b[k] == 'l' && doublec(k) && m() > 1)
535             k--;
536     }
537 
538     /**
539      * Stem a word provided as a String. Returns the result as a String.
540      */
541     public String   stem(String   s) {
542         if (stem(s.toCharArray(), s.length()))
543             return toString();
544         else
545             return s;
546     }
547 
548     /**
549      * Stem a word contained in a char[]. Returns true if the stemming process
550      * resulted in a word different from the input. You can retrieve the result
551      * with getResultLength()/getResultBuffer() or toString().
552      */
553     public boolean stem(char[] word) {
554         return stem(word, word.length);
555     }
556 
557     /**
558      * Stem a word contained in a portion of a char[] array. Returns true if the
559      * stemming process resulted in a word different from the input. You can
560      * retrieve the result with getResultLength()/getResultBuffer() or
561      * toString().
562      */
563     public boolean stem(char[] wordBuffer, int offset, int wordLen) {
564         reset();
565         if (b.length < wordLen) {
566             char[] new_b = new char[wordLen + EXTRA];
567             b = new_b;
568         }
569         for (int j = 0; j < wordLen; j++)
570             b[j] = wordBuffer[offset + j];
571         i = wordLen;
572         return stem(0);
573     }
574 
575     /**
576      * Stem a word contained in a leading portion of a char[] array. Returns
577      * true if the stemming process resulted in a word different from the input.
578      * You can retrieve the result with getResultLength()/getResultBuffer() or
579      * toString().
580      */
581     public boolean stem(char[] word, int wordLen) {
582         return stem(word, 0, wordLen);
583     }
584 
585     /**
586      * Stem the word placed into the Stemmer buffer through calls to add().
587      * Returns true if the stemming process resulted in a word different from
588      * the input. You can retrieve the result with
589      * getResultLength()/getResultBuffer() or toString().
590      */
591     public boolean stem() {
592         return stem(0);
593     }
594 
595     public boolean stem(int i0) {
596         k = i - 1;
597         k0 = i0;
598         if (k > k0 + 1) {
599             step1();
600             step2();
601             step3();
602             step4();
603             step5();
604             step6();
605         }
606         // Also, a word is considered dirty if we lopped off letters
607         // Thanks to Ifigenia Vairelles for pointing this out.
608         if (i != k + 1)
609             dirty = true;
610         i = k + 1;
611         return dirty;
612     }
613 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags