KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > analysis > ru > RussianStemmer


1 package org.apache.lucene.analysis.ru;
2
3 /**
4  * Copyright 2004 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
/**
 * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
 *
 * <p>Letters are never compared as literal characters. Every ending is stored as a
 * sequence of alphabet indices (the constants below, 0 = first letter of the Russian
 * alphabet, 31 = last) and is translated through the {@code charset} table supplied by
 * the caller, so the same stemmer works with any encoding of the Russian alphabet.
 *
 * <p>An instance keeps the region offsets of the word currently being stemmed in its
 * fields, so a single instance must not be used by several threads at once.
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id: RussianStemmer.java 150998 2004-08-16 20:30:46Z dnaber $
 */
class RussianStemmer
{
    /** Maps an alphabet index to the concrete character; must be set before stemming. */
    private char[] charset;

    // start offsets of the RV, R1 and R2 regions in the word being stemmed
    private int RV, R1, R2;

    // Alphabet indices of the letters used by the ending tables.
    // Indices not listed here (1, 4, 6, 7, 10, 15, 16, 20, 22, 23, 26) belong to
    // letters that no ending currently needs.
    private static final char A = 0;
    private static final char V = 2;
    private static final char G = 3;
    private static final char E = 5;
    private static final char I = 8;
    private static final char I_ = 9;
    private static final char L = 11;
    private static final char M = 12;
    private static final char N = 13;
    private static final char O = 14;
    private static final char S = 17;
    private static final char T = 18;
    private static final char U = 19;
    private static final char X = 21;
    private static final char SH = 24;
    private static final char SHCH = 25;
    private static final char Y = 27;
    private static final char SOFT = 28;
    private static final char AE = 29;
    private static final char IU = 30;
    private static final char IA = 31;

    // stem definitions
    // NOTE: findEnding() scans every table from its LAST entry to its first and returns
    // the first hit, so longer endings are listed after the shorter ones they contain.

    private static final char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };

    private static final char[][] perfectiveGerundEndings1 = {
        { V },
        { V, SH, I },
        { V, SH, I, S, SOFT }
    };

    private static final char[][] perfectiveGerund1Predecessors = {
        { A },
        { IA }
    };

    private static final char[][] perfectiveGerundEndings2 = {
        { I, V },
        { Y, V },
        { I, V, SH, I },
        { Y, V, SH, I },
        { I, V, SH, I, S, SOFT },
        { Y, V, SH, I, S, SOFT }
    };

    private static final char[][] adjectiveEndings = {
        { E, E },
        { I, E },
        { Y, E },
        { O, E },
        { E, I_ },
        { I, I_ },
        { Y, I_ },
        { O, I_ },
        { E, M },
        { I, M },
        { Y, M },
        { O, M },
        { I, X },
        { Y, X },
        { U, IU },
        { IU, IU },
        { A, IA },
        { IA, IA },
        { O, IU },
        { E, IU },
        { I, M, I },
        { Y, M, I },
        { E, G, O },
        { O, G, O },
        { E, M, U },
        { O, M, U }
    };

    private static final char[][] participleEndings1 = {
        { SHCH },
        { E, M },
        { N, N },
        { V, SH },
        { IU, SHCH }
    };

    private static final char[][] participleEndings2 = {
        { I, V, SH },
        { Y, V, SH },
        { U, IU, SHCH }
    };

    private static final char[][] participle1Predecessors = {
        { A },
        { IA }
    };

    private static final char[][] reflexiveEndings = {
        { S, IA },
        { S, SOFT }
    };

    private static final char[][] verbEndings1 = {
        { I_ },
        { L },
        { N },
        { L, O },
        { N, O },
        { E, T },
        { IU, T },
        { L, A },
        { N, A },
        { L, I },
        { E, M },
        { N, Y },
        { E, T, E },
        { I_, T, E },
        { T, SOFT },
        { E, SH, SOFT },
        { N, N, O }
    };

    private static final char[][] verbEndings2 = {
        { IU },
        { U, IU },
        { E, N },
        { E, I_ },
        { IA, T },
        { U, I_ },
        { I, L },
        { Y, L },
        { I, M },
        { Y, M },
        { I, T },
        { Y, T },
        { I, L, A },
        { Y, L, A },
        { E, N, A },
        { I, T, E },
        { I, L, I },
        { Y, L, I },
        { I, L, O },
        { Y, L, O },
        { E, N, O },
        { U, E, T },
        { U, IU, T },
        { E, N, Y },
        { I, T, SOFT },
        { Y, T, SOFT },
        { I, SH, SOFT },
        { E, I_, T, E },
        { U, I_, T, E }
    };

    private static final char[][] verb1Predecessors = {
        { A },
        { IA }
    };

    private static final char[][] nounEndings = {
        { A },
        { U },
        { I_ },
        { O },
        { E },
        { Y },
        { I },
        { SOFT },
        { IA },
        { E, V },
        { O, V },
        { I, E },
        { SOFT, E },
        { IA, X },
        { I, IU },
        { E, I },
        { I, I },
        { E, I_ },
        { O, I_ },
        { E, M },
        { A, M },
        { O, M },
        { A, X },
        { SOFT, IU },
        { I, IA },
        { SOFT, IA },
        { I, I_ },
        { IA, M },
        { IA, M, I },
        { A, M, I },
        { I, E, I_ },
        { I, IA, M },
        { I, E, M },
        { I, IA, X },
        { I, IA, M, I }
    };

    private static final char[][] superlativeEndings = {
        { E, I_, SH },
        { E, I_, SH, E }
    };

    private static final char[][] derivationalEndings = {
        { O, S, T },
        { O, S, T, SOFT }
    };

    private static final char[][] doubleN = {
        { N, N }
    };

    /**
     * Creates a stemmer without a charset. {@link #setCharset(char[])} must be called
     * before {@link #stem(String)}; stemming a non-empty word with no charset set
     * fails with a NullPointerException.
     */
    public RussianStemmer()
    {
    }

    /**
     * Creates a stemmer that uses the given charset.
     *
     * @param charset table mapping alphabet indices (0..31) to characters; the array
     *                is stored as is, not copied
     */
    public RussianStemmer(char[] charset)
    {
        this.charset = charset;
    }

    /**
     * Adjectival ending is an adjective ending,
     * optionally preceded by participle ending.
     *
     * @param stemmingZone the RV part of the word; shortened in place on success
     * @return true if an adjective ending was removed
     */
    private boolean adjectival(StringBuffer stemmingZone)
    {
        if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
        {
            return false;
        }
        // A participle ending in front of the adjective ending is optional: strip it
        // when present, but the overall result is a success either way.
        if (!findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predecessors))
        {
            findAndRemoveEnding(stemmingZone, participleEndings2);
        }
        return true;
    }

    /**
     * Removes a derivational ending, but only when the ending lies entirely in R2.
     *
     * @param stemmingZone the RV part of the word; shortened in place on success
     * @return true if an ending was removed
     */
    private boolean derivational(StringBuffer stemmingZone)
    {
        int endingLength = findEnding(stemmingZone, derivationalEndings);
        if (endingLength == 0)
        {
            return false;
        }
        // Zone offsets are relative to RV, so R2 starts at (R2 - RV) inside the zone.
        int endingStart = stemmingZone.length() - endingLength;
        if (R2 - RV > endingStart)
        {
            return false;
        }
        stemmingZone.setLength(endingStart);
        return true;
    }

    /**
     * Looks for an ending of the given class that finishes exactly at
     * {@code startIndex} in the stemming zone.
     *
     * @param stemmingZone   text to search
     * @param startIndex     index of the last character that may belong to the ending;
     *                       may be negative, in which case nothing matches
     * @param theEndingClass candidate endings, scanned from the last entry to the first
     * @return length of the first matching ending, or 0 if none matches
     */
    private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
    {
        for (int i = theEndingClass.length - 1; i >= 0; i--)
        {
            char[] theEnding = theEndingClass[i];
            // skip endings that are longer than the text available before startIndex
            if (startIndex < theEnding.length - 1)
            {
                continue;
            }
            if (matchesBackwards(stemmingZone, startIndex, theEnding))
            {
                return theEnding.length;
            }
        }
        return 0;
    }

    /**
     * Compares an ending with the zone text, right to left, with the last letter of
     * the ending aligned at {@code lastIndex}. The caller guarantees that the ending fits.
     */
    private boolean matchesBackwards(StringBuffer stemmingZone, int lastIndex, char[] theEnding)
    {
        int zoneIndex = lastIndex;
        for (int j = theEnding.length - 1; j >= 0; j--)
        {
            if (stemmingZone.charAt(zoneIndex--) != charset[theEnding[j]])
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Looks for an ending of the given class at the very end of the stemming zone.
     *
     * @return length of the matching ending, or 0 if none matches
     */
    private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
    {
        return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
    }

    /**
     * Finds the ending among the given class of endings and removes it from stemming zone.
     *
     * @return true if an ending was found and cut off
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
        {
            return false;
        }
        stemmingZone.setLength(stemmingZone.length() - endingLength);
        return true;
    }

    /**
     * Finds the ending among the given class of endings, then checks if this ending was
     * preceded by any of given predecessors, and if so, removes it from stemming zone.
     * The predecessor itself is kept. Only the first ending found is tested against
     * the predecessors.
     *
     * @return true if an ending with a valid predecessor was found and cut off
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone,
        char[][] theEndingClass, char[][] thePredecessors)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
        {
            return false;
        }
        int endingStart = stemmingZone.length() - endingLength;
        // the predecessor must end on the character just before the ending
        if (findEnding(stemmingZone, endingStart - 1, thePredecessors) == 0)
        {
            return false;
        }
        stemmingZone.setLength(endingStart);
        return true;
    }

    /**
     * Marks positions of RV, R1 and R2 in a given word.
     *
     * <p>RV stays 0 when the word has no RV region (a real RV offset is always at
     * least 1, so 0 is an unambiguous "not found" marker). R1 and R2 default to the
     * word length, i.e. an empty region at the end of the word, so that an ending can
     * never be considered to lie inside a region that does not exist.
     *
     * @param word the word to analyse
     */
    private void markPositions(String word)
    {
        int length = word.length();
        RV = 0;
        R1 = length;
        R2 = length;
        int i = 0;

        // RV: the part after the first vowel
        while (i < length && !isVowel(word.charAt(i)))
        {
            i++;
        }
        if (++i >= length)
        {
            return; // RV zone is empty
        }
        RV = i;

        // R1: the part after the first non-vowel that follows a vowel
        while (i < length && isVowel(word.charAt(i)))
        {
            i++;
        }
        if (++i >= length)
        {
            return; // R1 zone is empty
        }
        R1 = i;

        // R2: the same rule applied once more, starting inside R1
        while (i < length && !isVowel(word.charAt(i)))
        {
            i++;
        }
        if (++i >= length)
        {
            return; // R2 zone is empty
        }
        while (i < length && isVowel(word.charAt(i)))
        {
            i++;
        }
        if (++i >= length)
        {
            return; // R2 zone is empty
        }
        R2 = i;
    }

    /**
     * Checks if character is a vowel.
     *
     * @param letter character in the encoding described by the charset
     * @return true if the charset maps one of the vowel indices to this character
     */
    private boolean isVowel(char letter)
    {
        for (int i = 0; i < vowels.length; i++)
        {
            if (letter == charset[vowels[i]])
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Noun endings.
     *
     * @param stemmingZone the RV part of the word; shortened in place on success
     * @return true if a noun ending was removed
     */
    private boolean noun(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, nounEndings);
    }

    /**
     * Perfective gerund endings: group 1 requires a predecessor, group 2 does not.
     *
     * @param stemmingZone the RV part of the word; shortened in place on success
     * @return true if a perfective gerund ending was removed
     */
    private boolean perfectiveGerund(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(
            stemmingZone,
            perfectiveGerundEndings1,
            perfectiveGerund1Predecessors)
            || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
    }

    /**
     * Reflexive endings.
     *
     * @param stemmingZone the RV part of the word; shortened in place on success
     * @return true if a reflexive ending was removed
     */
    private boolean reflexive(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, reflexiveEndings);
    }

    /**
     * Removes the last character of the zone if it is the given letter.
     *
     * @param stemmingZone the RV part of the word; shortened in place on success
     * @param letter       alphabet index of the letter to remove
     * @return true if the letter was removed
     */
    private boolean removeTrailingLetter(StringBuffer stemmingZone, char letter)
    {
        int last = stemmingZone.length() - 1;
        if (last >= 0 && stemmingZone.charAt(last) == charset[letter])
        {
            stemmingZone.setLength(last);
            return true;
        }
        return false;
    }

    /**
     * Removes a trailing letter I from the stemming zone (step 2 of the algorithm).
     *
     * @param stemmingZone the RV part of the word; shortened in place on success
     * @return true if the letter was removed
     */
    private boolean removeI(StringBuffer stemmingZone)
    {
        return removeTrailingLetter(stemmingZone, I);
    }

    /**
     * Removes a trailing soft sign from the stemming zone (last part of step 4).
     *
     * @param stemmingZone the RV part of the word; shortened in place on success
     * @return true if the soft sign was removed
     */
    private boolean removeSoft(StringBuffer stemmingZone)
    {
        return removeTrailingLetter(stemmingZone, SOFT);
    }

    /**
     * Replaces the charset used to translate alphabet indices into characters.
     *
     * @param newCharset table mapping alphabet indices (0..31) to characters; the
     *                   array is stored as is, not copied
     */
    public void setCharset(char[] newCharset)
    {
        charset = newCharset;
    }

    /**
     * Finds the stem for given Russian word.
     *
     * @param input the word to stem, written with the characters of the current charset
     * @return the stem; the input itself when the word has no RV region
     */
    public String stem(String input)
    {
        markPositions(input);
        if (RV == 0)
        {
            return input; // RV wasn't detected, nothing to stem
        }
        // all stemming happens inside RV; the part before RV is copied unchanged
        StringBuffer stemmingZone = new StringBuffer(input.substring(RV));

        // Step 1: perfective gerund, otherwise an optional reflexive ending followed
        // by the first applicable one of adjectival, verb, noun
        if (!perfectiveGerund(stemmingZone))
        {
            reflexive(stemmingZone);
            if (!adjectival(stemmingZone) && !verb(stemmingZone))
            {
                noun(stemmingZone);
            }
        }
        // Step 2
        removeI(stemmingZone);
        // Step 3
        derivational(stemmingZone);
        // Step 4
        superlative(stemmingZone);
        undoubleN(stemmingZone);
        removeSoft(stemmingZone);

        return input.substring(0, RV) + stemmingZone.toString();
    }

    /**
     * Superlative endings.
     *
     * @param stemmingZone the RV part of the word; shortened in place on success
     * @return true if a superlative ending was removed
     */
    private boolean superlative(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, superlativeEndings);
    }

    /**
     * Undoubles N: a zone ending in a double N loses its last character.
     *
     * @param stemmingZone the RV part of the word; shortened in place on success
     * @return true if a character was removed
     */
    private boolean undoubleN(StringBuffer stemmingZone)
    {
        if (findEnding(stemmingZone, doubleN) == 0)
        {
            return false;
        }
        stemmingZone.setLength(stemmingZone.length() - 1);
        return true;
    }

    /**
     * Verb endings: group 1 requires a predecessor, group 2 does not.
     *
     * @param stemmingZone the RV part of the word; shortened in place on success
     * @return true if a verb ending was removed
     */
    private boolean verb(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(
            stemmingZone,
            verbEndings1,
            verb1Predecessors)
            || findAndRemoveEnding(stemmingZone, verbEndings2);
    }

    /**
     * Static method for stemming with different charsets.
     *
     * @param theWord the word to stem
     * @param charset table mapping alphabet indices (0..31) to characters
     * @return the stem of the word
     */
    public static String stem(String theWord, char[] charset)
    {
        RussianStemmer stemmer = new RussianStemmer();
        stemmer.setCharset(charset);
        return stemmer.stem(theWord);
    }
}
630
Popular Tags