KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > daffodilwoods > daffodildb > server > sql99 > fulltext > common > Stemmer


1 package com.daffodilwoods.daffodildb.server.sql99.fulltext.common;
2 import com.daffodilwoods.database.utility.P;
3
4 /**
5  * Stemmer --basic functionality of stemmer is to convert the token into
6  * its base form like started is coverted to start and agreed is converted to agree
7  *
8  */

9
10 public class Stemmer {
11   private int i, /* offset into b */
12       j, k, k0,pos=0;
13   private boolean dirty = false;
14   byte[] tokenToBeStemming;
15
16   /**
17    * This method takes token Which is byte[] and returned token that is being
18    * stemmed means coverted to its base form
19    * eg agreed -------agree
20    * helping------help
21    * @param token
22    * @return
23    */

24   public byte[] stemmingToken(byte[] token) {
25     dirty = false;
26     tokenToBeStemming = token;
27     i = token.length;
28     stem(0);
29     byte[] tokenAfterStemming;
30     if (pos==0)
31       tokenAfterStemming = new byte[i];
32     else
33       tokenAfterStemming = new byte[i-1];
34     System.arraycopy(tokenToBeStemming, pos, tokenAfterStemming, 0,
35                      tokenAfterStemming.length);
36     return tokenAfterStemming;
37   }
38
39   public boolean stem(int i0) {
40     k=i-1; // k = i ;
41
k0 = i0;
42     if (k > k0 + 1) {
43       step0();
44       step1();
45       step2();
46       step3();
47       step4();
48       step5();
49       step6();
50     }
51     if (i != k + 1)
52       dirty = true;
53     i = k + 1;
54     return dirty;
55   }
56
57   /**
58    * step0() is used to handle following case:
59    * if token='abcd' then it will remove quotes
60    * Now token will be abcd. If token is "daffodil's"
61    * it will become "daffodil".
62 */

63   private final void step0() {
64     if (ends(new byte[] {39})){
65       k--;
66       i--;
67     }
68     if (tokenToBeStemming[0] == 39)
69       pos = 1;
70         else
71             pos=0;
72   }
73
74   private final void step1() {
75     if (tokenToBeStemming[k] == 115) {//"s"
76
if (ends(new byte[] {115, 115, 101, 115}))//"sses"
77
k -= 2;
78       else if (ends(new byte[] {105, 101, 115}))//"ies"
79
setto(new byte[] {105});
80       else if (tokenToBeStemming[k-1] == 39)//"'s"
81
k=k-2; //'s
82
else if (tokenToBeStemming[k - 1] != 115)//"*ss"
83
k--;
84     }else{
85       if (tokenToBeStemming[k] == 116) {
86         if (tokenToBeStemming[k - 1] == 39)
87           k = k - 2; // for 't
88
}
89     }
90
91     if (ends(new byte[] {101, 101, 100})) { //"eed"
92
if (m() > 0)
93         k--;
94     }
95     else if ( (ends(new byte[] {101, 100}) || ends(new byte[] {105, 110, 103})) &&
96              vowelinstem()) {
97       k = j;
98       if (ends(new byte[] {97, 116})) //"at"
99
setto(new byte[] {97, 116, 101}); //"ate"
100
else if (ends(new byte[] {98, 108})) //"bl"
101
setto(new byte[] {98, 108, 101}); //"ble"
102
else if (ends(new byte[] {105, 122})) //"iz"
103
setto(new byte[] {105, 122, 101}); //"ize"
104
else if (doublec(k)) {
105         int ch = tokenToBeStemming[k--];
106         if (ch == 108 || ch == 115 || ch == 122) //l,s,z
107
k++;
108       }
109       else if (m() == 1 && cvc(k))
110         setto(new byte[] {101}); //"e"
111
}
112   }
113
114   /* step2() turns terminal y to i when there is another vowel in the stem. */
115
116   private final void step2() {
117     if (ends(new byte[] {121}) && vowelinstem()) { //"y"
118
tokenToBeStemming[k] = 105; //'i'
119
dirty = true;
120     }
121   }
122
123   /* step3() maps double suffices to single ones. so -ization ( = -ize plus
124      -ation) maps to -ize etc. note that the string before the suffix must give
125      m() > 0. */

126
127   private final void step3() {
128     if (k == k0)
129       return; /* For Bug 1 */
130     switch (tokenToBeStemming[k - 1]) {
131       case 97: //a
132
if (ends(new byte[] {97, 116, 105, 111, 110, 97, 108})) { //"ational"
133
r(new byte[] {97, 116, 101}); //"ate"
134
break;
135         }
136         if (ends(new byte[] {116, 105, 111, 110, 97, 108})) { //"tional"
137
r(new byte[] {116, 105, 111, 110}); //"tion"
138
break;
139         }
140         break;
141       case 99: //'c'
142
if (ends(new byte[] {101, 110, 99, 105})) { //"enci"
143
r(new byte[] {101, 110, 99, 101}); //"ence"
144
break;
145         }
146         if (ends(new byte[] {97, 110, 99, 105})) { //"anci"
147
r(new byte[] {97, 110, 99, 101}); //"ance"
148
break;
149         }
150         break;
151       case 101: //e
152
if (ends(new byte[] {101, 122, 101, 114})) { //"izer"
153
r(new byte[] {101, 122, 101}); //"ize"
154
break;
155         }
156         break;
157       case 108: //'l'
158
if (ends(new byte[] {98, 108, 105})) { //"bli"
159
r(new byte[] {98, 108, 101}); //"ble"
160
break;
161         }
162         if (ends(new byte[] {97, 108, 108, 105})) { //"alli"
163
r(new byte[] {97, 108}); //"al"
164
break;
165         }
166         if (ends(new byte[] {101, 110, 116, 108, 105})) { //"entli"
167
r(new byte[] {101, 110, 116}); //"ent"
168
break;
169         }
170         if ((ends(new byte[] {108, 108, 105})) || (ends(new byte[] {108, 108}))) { //"ll"
171
r(new byte[] {108}); //"l"
172
break;
173         }
174         if (ends(new byte[] {101, 108, 105})) { //"eli"
175
r(new byte[] {101}); //"e"
176
break;
177         }
178         if (ends(new byte[] {111, 117, 115, 108, 105})) { //"ousli"
179
r(new byte[] {111, 117, 115}); //"ous"
180
break;
181         }
182         break;
183       case 111: //'o'
184
if (ends(new byte[] {105, 122, 97, 116, 105, 111, 110})) { //"ization"
185
r(new byte[] {105, 122, 101}); //"ize"
186
break;
187         }
188         if (ends(new byte[] {97, 116, 105, 111, 110})) { //"ation"
189
r(new byte[] {97, 116, 101}); //"ate"
190
break;
191         }
192         if (ends(new byte[] {97, 116, 111, 114})) { //"ator"
193
r(new byte[] {97, 116, 101}); //"ate"
194
break;
195         }
196         break;
197       case 115: //'s'
198
if (ends(new byte[] {97, 108, 105, 115, 109})) { //"alism"
199
r(new byte[] {97, 108}); //"al"
200
break;
201         }
202         if (ends(new byte[] {105, 118, 101, 110, 101, 115, 115})) { //"iveness"
203
r(new byte[] {105, 118, 101}); //"ive"
204
break;
205         }
206         if (ends(new byte[] {102, 117, 108, 110, 101, 115, 115})) { //"fulness"
207
r(new byte[] {102, 117, 108}); //"ful"
208
break;
209         }
210         if (ends(new byte[] {111, 117, 115, 110, 101, 115, 115})) { //"ousness"
211
r(new byte[] {111, 117, 115}); //"ous"
212
break;
213         }
214         break;
215       case 116: //'t'
216
if (ends(new byte[] {97, 108, 105, 116, 105})) { //"aliti"
217
r(new byte[] {97, 108}); //"al"
218
break;
219         }
220         if (ends(new byte[] {105, 118, 105, 116, 105})) { //"iviti"
221
r(new byte[] {105, 118, 101}); //"ive"
222
break;
223         }
224         if (ends(new byte[] {98, 105, 108, 105, 116, 105})) { //"biliti"
225
r(new byte[] {98, 105, 101}); //"ble"
226
break;
227         }
228         break;
229       case 103: //'g'
230
if (ends(new byte[] {108, 111, 103, 105})) { //"logi"
231
r(new byte[] {108, 111, 103}); //"log"
232
break;
233         }
234     }
235   }
236
237   /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
238
239   private final void step4() {
240     switch (tokenToBeStemming[k]) {
241       case 101: //'e'
242
if (ends(new byte[] {105, 99, 97, 116, 101})) { //"icate"
243
r(new byte[] {105, 99}); //"ic"
244
break;
245         }
246         if (ends(new byte[] {97, 116, 105, 118, 101})) { //"ative"
247
r(new byte[] {32}); //""
248
k--;
249           break;
250         }
251         if (ends(new byte[] {97, 108, 105, 122, 101})) { //"alize"
252
r(new byte[] {97, 108}); //"al"
253
break;
254         }
255         break;
256       case 105: //'i'
257
if (ends(new byte[] {105, 99, 105, 116, 105})) { //"iciti"
258
r(new byte[] {105, 99}); //"ic"
259
break;
260         }
261         break;
262       case 108: //'l'
263
if (ends(new byte[] {105, 99, 97, 108})) { //"ical"
264
r(new byte[] {105, 99}); //"ic"
265
break;
266         }
267         if (ends(new byte[] {102, 117, 108})) { //"ful" // for successfull
268
r(new byte[] {32}); //""
269
k--;
270           if (ends(new byte[] {121})) //"y"//for playful, play
271
setto(new byte[] {105}); //"i"
272
break;
273         }
274         break;
275       case 115: //'s'
276
if (ends(new byte[] {110, 101, 115, 115})) { //"ness"
277
r(new byte[] {32}); //""
278
k--;
279           break;
280         }
281         break;
282     }
283   }
284
285   /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
286
287   private final void step5() {
288     if (k == k0)
289       return; /* for Bug 1 */
290     switch (tokenToBeStemming[k - 1]) {
291       case 97: //'a'
292
if (ends(new byte[] {97, 108})) //"al"
293
break;
294         return;
295       case 99: //'c'
296
if (ends(new byte[] {97, 110, 99, 101})) //"ance"
297
break;
298         if (ends(new byte[] {101, 110, 99, 101})) //"ence"
299
break;
300         return;
301       case 101: //'e'
302
if (ends(new byte[] {101, 114})) //"er"
303
break;
304         return;
305       case 105: //'i'
306
if (ends(new byte[] {105, 99})) //"ic"
307
break;
308         return;
309       case 108: //'l'
310
if (ends(new byte[] {97, 98, 108, 101})) //"able"
311
break;
312         if (ends(new byte[] {105, 98, 108, 101})) //"ible"
313
break;
314         return;
315       case 110: //'n'
316
if (ends(new byte[] {97, 110, 116})) //"ant"
317
break;
318         if (ends(new byte[] {101, 109, 101, 110, 116})) //"ement"
319
break;
320         if (ends(new byte[] {109, 101, 110, 116})) //"ment"
321
break;
322
323         /* element etc. not stripped before the m */
324         if (ends(new byte[] {101, 110, 116}))
325           break;
326         return;
327       case 111: //'o'
328
if (ends(new byte[] {105, 111, 110}) && j >= 0 &&
329             (tokenToBeStemming[j] == 115 || tokenToBeStemming[j] == 116)) //"ion", 's' ,'t'
330
break;
331
332         /* j >= 0 fixes Bug 2 */
333         if (ends(new byte[] {111, 117})) //"ou"
334
break;
335         return;
336       /* takes care of -ous */
337       case 115: //'s'
338
if (ends(new byte[] {105, 115, 109})) //"ism"
339
break;
340         return;
341       case 116: //'t'
342
if (ends(new byte[] {97, 116, 101})) //"ate"
343
break;
344         if (ends(new byte[] {105, 116, 105})) //"iti"
345
break;
346         return;
347       case 117: //'u'
348
if (ends(new byte[] {111, 117, 115})) //"ous"
349
break;
350         return;
351       case 118: //'v'
352
if (ends(new byte[] {105, 118, 101})) //"ive"
353
break;
354         return;
355       case 122: //'z'
356
if (ends(new byte[] {105, 122, 101})) //"ize"
357
break;
358         return;
359       default:
360         return;
361     }
362     if (m() > 1)
363       k = j;
364   }
365
366   /* step6() removes a final -e if m() > 1. */
367
368   private final void step6() {
369     j = k;
370     if (tokenToBeStemming[k] == 101) { //'e'
371
int a = m();
372       if (a > 1 || a == 1 && !cvc(k - 1))
373         k--;
374     }
375     if (tokenToBeStemming[k] == 108 && doublec(k) && m() > 1) //'l'
376
k--;
377   }
378
379   /* cons(i) is true <=> b[i] is a consonant. */
380
381   private final boolean cons(int i) {
382     switch (tokenToBeStemming[i]) {
383       case 97:
384       case 101:
385       case 105:
386       case 111:
387       case 117:
388         return false;
389       case 121:
390         return (i == k0) ? true : !cons(i - 1);
391       default:
392         return true;
393     }
394   }
395
396   /* m() measures the number of consonant sequences between k0 and j. if c is
397      a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
398      presence,
399           <c><v> gives 0
400           <c>vc<v> gives 1
401           <c>vcvc<v> gives 2
402           <c>vcvcvc<v> gives 3
403           ....
404    */

405
406   private final int m() {
407     int n = 0;
408     int i = k0;
409     while (true) {
410       if (i > j)
411         return n;
412       if (!cons(i))
413         break;
414       i++;
415     }
416     i++;
417     while (true) {
418       while (true) {
419         if (i > j)
420           return n;
421         if (cons(i))
422           break;
423         i++;
424       }
425       i++;
426       n++;
427       while (true) {
428         if (i > j)
429           return n;
430         if (!cons(i))
431           break;
432         i++;
433       }
434       i++;
435     }
436   }
437
438   /* vowelinstem() is true <=> k0,...j contains a vowel */
439
440   private final boolean vowelinstem() {
441     int i;
442     for (i = k0; i <= j; i++)
443       if (!cons(i))
444         return true;
445     return false;
446   }
447
448   /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
449
450   private final boolean doublec(int j) {
451     if (j < k0 + 1)
452       return false;
453     if (tokenToBeStemming[j] != tokenToBeStemming[j - 1])
454       return false;
455     return cons(j);
456   }
457
458   /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
459      and also if the second c is not w,x or y. this is used when trying to
460      restore an e at the end of a short word. e.g.
461           cav(e), lov(e), hop(e), crim(e), but
462           snow, box, tray.
463    */

464
465   private final boolean cvc(int i) {
466     if (i < k0 + 2 || !cons(i) || cons(i - 1) || !cons(i - 2))
467       return false;
468     else {
469       int ch = tokenToBeStemming[i];
470       if (ch == 'w' || ch == 'x' || ch == 'y')
471         return false;
472     }
473     return true;
474   }
475
476   private final boolean ends(byte[] s) {
477     int l = s.length;
478     int o = k - l + 1;
479       if (o < k0)
480       return false;
481     for (int i = 0; i < l; i++)
482       if (tokenToBeStemming[o + i] != s[i])
483         return false;
484     j = k - l;
485     return true;
486   }
487
488   /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
489      k. */

490
491   void setto(byte[] s) {
492     int l = s.length;
493     int o = j + 1;
494     for (int i = 0; i < l; i++)
495       tokenToBeStemming[o + i] = s[ (byte) i];
496     k = j + l;
497     dirty = true;
498   }
499
500   /* r(s) is used further down. */
501
502   void r(byte[] s) {
503     if (m() > 0)
504       setto(s);
505   }
506
507 /* public static void main(String[] args) {
508    byte[] test = {
509     (byte)'w',(byte)'a',(byte)'k',(byte)'e',(byte)'f',(byte)'u',(byte)'l'};//,(byte)'f',(byte)'u',(byte)'l',(byte)'l',(byte)'y',(byte)'f',(byte)'u',(byte)'l',(byte)'l',(byte)'y'
510
511     for(int j=0;j<test.length;j++)
512          ;//// Removed By Program ** System.out.println("test: "+(char)test[j]);
513     Stemmer st = new Stemmer();
514   }*/

515
516 }
517
Popular Tags