KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > contineo > core > text > analyze > de > GermanStemmer


1 package org.contineo.core.text.analyze.de;
2
3 import org.contineo.core.text.analyze.Stemmer;
4
5 /**
6  * Copyright 2004 The Apache Software Foundation
7  *
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  * http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  */

20
21 /**
22  * A stemmer for German words. The algorithm is based on the report "A Fast and
23  * Simple Stemming Algorithm for German Words" by Jörg Caumanns
24  * (joerg.caumanns@isst.fhg.de).
25  *
26  * @author Gerhard Schwarz
27  * @version $Id: GermanStemmer.java,v 1.4 2005/01/27 16:56:06 micha_info Exp $
28  */

29 public class GermanStemmer implements Stemmer {
30     /**
31      * Buffer for the terms while stemming them.
32      */

33     private StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
34
35     /**
36      * Amount of characters that are removed with <tt>substitute()</tt> while
37      * stemming.
38      */

39     private int substCount = 0;
40
41     /**
42      * Stemms the given term to an unique <tt>discriminator</tt>.
43      *
44      * @param term
45      * The term that should be stemmed.
46      * @return Discriminator for <tt>term</tt>
47      */

48     public String JavaDoc stem(String JavaDoc term) {
49         // Use lowercase for medium stemming.
50
term = term.toLowerCase();
51         if (!isStemmable(term))
52             return term;
53         // Reset the StringBuffer.
54
sb.delete(0, sb.length());
55         sb.insert(0, term);
56         // Stemming starts here...
57
substitute(sb);
58         strip(sb);
59         optimize(sb);
60         resubstitute(sb);
61         removeParticleDenotion(sb);
62         return sb.toString();
63     }
64
65     /**
66      * Checks if a term could be stemmed.
67      *
68      * @return true if, and only if, the given term consists in letters.
69      */

70     private boolean isStemmable(String JavaDoc term) {
71         for (int c = 0; c < term.length(); c++) {
72             if (!Character.isLetter(term.charAt(c)))
73                 return false;
74         }
75         return true;
76     }
77
78     /**
79      * suffix stripping (stemming) on the current term. The stripping is reduced
80      * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd",
81      * from which all regular suffixes are build of. The simplification causes
82      * some overstemming, and way more irregular stems, but still provides
83      * unique. discriminators in the most of those cases. The algorithm is
84      * context free, except of the length restrictions.
85      */

86     private void strip(StringBuffer JavaDoc buffer) {
87         boolean doMore = true;
88         while (doMore && buffer.length() > 3) {
89             if ((buffer.length() + substCount > 5)
90                     && buffer.substring(buffer.length() - 2, buffer.length())
91                             .equals("nd")) {
92                 buffer.delete(buffer.length() - 2, buffer.length());
93             } else if ((buffer.length() + substCount > 4)
94                     && buffer.substring(buffer.length() - 2, buffer.length())
95                             .equals("em")) {
96                 buffer.delete(buffer.length() - 2, buffer.length());
97             } else if ((buffer.length() + substCount > 4)
98                     && buffer.substring(buffer.length() - 2, buffer.length())
99                             .equals("er")) {
100                 buffer.delete(buffer.length() - 2, buffer.length());
101             } else if (buffer.charAt(buffer.length() - 1) == 'e') {
102                 buffer.deleteCharAt(buffer.length() - 1);
103             } else if (buffer.charAt(buffer.length() - 1) == 's') {
104                 buffer.deleteCharAt(buffer.length() - 1);
105             } else if (buffer.charAt(buffer.length() - 1) == 'n') {
106                 buffer.deleteCharAt(buffer.length() - 1);
107             }
108             // "t" occurs only as suffix of verbs.
109
else if (buffer.charAt(buffer.length() - 1) == 't') {
110                 buffer.deleteCharAt(buffer.length() - 1);
111             } else {
112                 doMore = false;
113             }
114         }
115     }
116
117     /**
118      * Does some optimizations on the term. This optimisations are contextual.
119      */

120     private void optimize(StringBuffer JavaDoc buffer) {
121         // Additional step for female plurals of professions and inhabitants.
122
if (buffer.length() > 5
123                 && buffer.substring(buffer.length() - 5, buffer.length())
124                         .equals("erin*")) {
125             buffer.deleteCharAt(buffer.length() - 1);
126             strip(buffer);
127         }
128         // Additional step for irregular plural nouns like "Matrizen -> Matrix".
129
if (buffer.charAt(buffer.length() - 1) == ('z')) {
130             buffer.setCharAt(buffer.length() - 1, 'x');
131         }
132     }
133
134     /**
135      * Removes a particle denotion ("ge") from a term.
136      */

137     private void removeParticleDenotion(StringBuffer JavaDoc buffer) {
138         if (buffer.length() > 4) {
139             for (int c = 0; c < buffer.length() - 3; c++) {
140                 if (buffer.substring(c, c + 4).equals("gege")) {
141                     buffer.delete(c, c + 2);
142                     return;
143                 }
144             }
145         }
146     }
147
148     /**
149      * Do some substitutions for the term to reduce overstemming:
150      * - Substitute Umlauts with their corresponding vowel: äöü -> aou, "ß"
151      * is substituted by "ss" - Substitute a second char of a pair of equal
152      * characters with an asterisk: ?? -> ?* - Substitute some common character
153      * combinations with a token: sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
154      */

155     private void substitute( StringBuffer JavaDoc buffer ) {
156       substCount = 0;
157       for ( int c = 0; c < buffer.length(); c++ ) {
158         // Replace the second char of a pair of the equal characters with an
159
// asterisk
160
if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) {
161           buffer.setCharAt( c, '*' );
162         }
163         // Substitute Umlauts.
164
else if ( buffer.charAt( c ) == 'ä' ) {
165           buffer.setCharAt( c, 'a' );
166         }
167         else if ( buffer.charAt( c ) == 'ö' ) {
168           buffer.setCharAt( c, 'o' );
169         }
170         else if ( buffer.charAt( c ) == 'ü' ) {
171           buffer.setCharAt( c, 'u' );
172         }
173         // Fix bug so that 'ß' at the end of a word is replaced.
174
else if ( buffer.charAt( c ) == 'ß' ) {
175             buffer.setCharAt( c, 's' );
176             buffer.insert( c + 1, 's' );
177             substCount++;
178         }
179         // Take care that at least one character is left left side from the
180
// current one
181
if ( c < buffer.length() - 1 ) {
182           // Masking several common character combinations with an token
183
if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
184             buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
185           {
186             buffer.setCharAt( c, '$' );
187             buffer.delete( c + 1, c + 3 );
188             substCount =+ 2;
189           }
190           else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
191             buffer.setCharAt( c, '§' );
192             buffer.deleteCharAt( c + 1 );
193             substCount++;
194           }
195           else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
196             buffer.setCharAt( c, '%' );
197             buffer.deleteCharAt( c + 1 );
198             substCount++;
199           }
200           else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
201             buffer.setCharAt( c, '&' );
202             buffer.deleteCharAt( c + 1 );
203             substCount++;
204           }
205           else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
206             buffer.setCharAt( c, '#' );
207             buffer.deleteCharAt( c + 1 );
208             substCount++;
209           }
210           else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
211             buffer.setCharAt( c, '!' );
212             buffer.deleteCharAt( c + 1 );
213             substCount++;
214           }
215         }
216       }
217     }
218     /**
219      * Undoes the changes made by substitute(). That are character pairs and
220      * character combinations. Umlauts will remain as their corresponding vowel,
221      * as "ß" remains as "ss".
222      */

223     private void resubstitute( StringBuffer JavaDoc buffer ) {
224       for ( int c = 0; c < buffer.length(); c++ ) {
225         if ( buffer.charAt( c ) == '*' ) {
226           char x = buffer.charAt( c - 1 );
227           buffer.setCharAt( c, x );
228         }
229         else if ( buffer.charAt( c ) == '$' ) {
230           buffer.setCharAt( c, 's' );
231           buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
232         }
233         else if ( buffer.charAt( c ) == '§' ) {
234           buffer.setCharAt( c, 'c' );
235           buffer.insert( c + 1, 'h' );
236         }
237         else if ( buffer.charAt( c ) == '%' ) {
238           buffer.setCharAt( c, 'e' );
239           buffer.insert( c + 1, 'i' );
240         }
241         else if ( buffer.charAt( c ) == '&' ) {
242           buffer.setCharAt( c, 'i' );
243           buffer.insert( c + 1, 'e' );
244         }
245         else if ( buffer.charAt( c ) == '#' ) {
246           buffer.setCharAt( c, 'i' );
247           buffer.insert( c + 1, 'g' );
248         }
249         else if ( buffer.charAt( c ) == '!' ) {
250           buffer.setCharAt( c, 's' );
251           buffer.insert( c + 1, 't' );
252         }
253       }
254     }
255 }
Popular Tags