GermanStemmer


1   package org.contineo.core.text.analyze.de;
2   
3   import org.contineo.core.text.analyze.Stemmer;
4   
/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20  
21  /**
22   * A stemmer for German words. The algorithm is based on the report "A Fast and
23   * Simple Stemming Algorithm for German Words" by Jörg Caumanns
24   * (joerg.caumanns@isst.fhg.de).
25   * 
26   * @author Gerhard Schwarz
27   * @version $Id: GermanStemmer.java,v 1.4 2005/01/27 16:56:06 micha_info Exp $
28   */
29  public class GermanStemmer implements Stemmer {
30      /**
31       * Buffer for the terms while stemming them.
32       */
33      private StringBuffer   sb = new StringBuffer  ();
34  
35      /**
36       * Amount of characters that are removed with <tt>substitute()</tt> while
37       * stemming.
38       */
39      private int substCount = 0;
40  
41      /**
42       * Stemms the given term to an unique <tt>discriminator</tt>.
43       * 
44       * @param term
45       *            The term that should be stemmed.
46       * @return Discriminator for <tt>term</tt>
47       */
48      public String   stem(String   term) {
49          // Use lowercase for medium stemming.
50          term = term.toLowerCase();
51          if (!isStemmable(term))
52              return term;
53          // Reset the StringBuffer.
54          sb.delete(0, sb.length());
55          sb.insert(0, term);
56          // Stemming starts here...
57          substitute(sb);
58          strip(sb);
59          optimize(sb);
60          resubstitute(sb);
61          removeParticleDenotion(sb);
62          return sb.toString();
63      }
64  
65      /**
66       * Checks if a term could be stemmed.
67       * 
68       * @return true if, and only if, the given term consists in letters.
69       */
70      private boolean isStemmable(String   term) {
71          for (int c = 0; c < term.length(); c++) {
72              if (!Character.isLetter(term.charAt(c)))
73                  return false;
74          }
75          return true;
76      }
77  
78      /**
79       * suffix stripping (stemming) on the current term. The stripping is reduced
80       * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd",
81       * from which all regular suffixes are build of. The simplification causes
82       * some overstemming, and way more irregular stems, but still provides
83       * unique. discriminators in the most of those cases. The algorithm is
84       * context free, except of the length restrictions.
85       */
86      private void strip(StringBuffer   buffer) {
87          boolean doMore = true;
88          while (doMore && buffer.length() > 3) {
89              if ((buffer.length() + substCount > 5)
90                      && buffer.substring(buffer.length() - 2, buffer.length())
91                              .equals("nd")) {
92                  buffer.delete(buffer.length() - 2, buffer.length());
93              } else if ((buffer.length() + substCount > 4)
94                      && buffer.substring(buffer.length() - 2, buffer.length())
95                              .equals("em")) {
96                  buffer.delete(buffer.length() - 2, buffer.length());
97              } else if ((buffer.length() + substCount > 4)
98                      && buffer.substring(buffer.length() - 2, buffer.length())
99                              .equals("er")) {
100                 buffer.delete(buffer.length() - 2, buffer.length());
101             } else if (buffer.charAt(buffer.length() - 1) == 'e') {
102                 buffer.deleteCharAt(buffer.length() - 1);
103             } else if (buffer.charAt(buffer.length() - 1) == 's') {
104                 buffer.deleteCharAt(buffer.length() - 1);
105             } else if (buffer.charAt(buffer.length() - 1) == 'n') {
106                 buffer.deleteCharAt(buffer.length() - 1);
107             }
108             // "t" occurs only as suffix of verbs.
109             else if (buffer.charAt(buffer.length() - 1) == 't') {
110                 buffer.deleteCharAt(buffer.length() - 1);
111             } else {
112                 doMore = false;
113             }
114         }
115     }
116 
117     /**
118      * Does some optimizations on the term. This optimisations are contextual.
119      */
120     private void optimize(StringBuffer   buffer) {
121         // Additional step for female plurals of professions and inhabitants.
122         if (buffer.length() > 5
123                 && buffer.substring(buffer.length() - 5, buffer.length())
124                         .equals("erin*")) {
125             buffer.deleteCharAt(buffer.length() - 1);
126             strip(buffer);
127         }
128         // Additional step for irregular plural nouns like "Matrizen -> Matrix".
129         if (buffer.charAt(buffer.length() - 1) == ('z')) {
130             buffer.setCharAt(buffer.length() - 1, 'x');
131         }
132     }
133 
134     /**
135      * Removes a particle denotion ("ge") from a term.
136      */
137     private void removeParticleDenotion(StringBuffer   buffer) {
138         if (buffer.length() > 4) {
139             for (int c = 0; c < buffer.length() - 3; c++) {
140                 if (buffer.substring(c, c + 4).equals("gege")) {
141                     buffer.delete(c, c + 2);
142                     return;
143                 }
144             }
145         }
146     }
147 
148     /**
149      * Do some substitutions for the term to reduce overstemming:
150      *  - Substitute Umlauts with their corresponding vowel: äöü -> aou, "ß"
151      * is substituted by "ss" - Substitute a second char of a pair of equal
152      * characters with an asterisk: ?? -> ?* - Substitute some common character
153      * combinations with a token: sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
154      */
155     private void substitute( StringBuffer   buffer ) {
156       substCount = 0;
157       for ( int c = 0; c < buffer.length(); c++ ) {
158         // Replace the second char of a pair of the equal characters with an
159         // asterisk
160         if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 )  ) {
161           buffer.setCharAt( c, '*' );
162         }
163         // Substitute Umlauts.
164         else if ( buffer.charAt( c ) == '�' ) {
165           buffer.setCharAt( c, 'a' );
166         }
167         else if ( buffer.charAt( c ) == '�' ) {
168           buffer.setCharAt( c, 'o' );
169         }
170         else if ( buffer.charAt( c ) == '�' ) {
171           buffer.setCharAt( c, 'u' );
172         }
173         // Fix bug so that 'ß' at the end of a word is replaced.
174         else if ( buffer.charAt( c ) == '�' ) {
175             buffer.setCharAt( c, 's' );
176             buffer.insert( c + 1, 's' );
177             substCount++;
178         }
179         // Take care that at least one character is left left side from the
180         // current one
181         if ( c < buffer.length() - 1 ) {
182           // Masking several common character combinations with an token
183           if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
184             buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
185           {
186             buffer.setCharAt( c, '$' );
187             buffer.delete( c + 1, c + 3 );
188             substCount =+ 2;
189           }
190           else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
191             buffer.setCharAt( c, '�' );
192             buffer.deleteCharAt( c + 1 );
193             substCount++;
194           }
195           else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
196             buffer.setCharAt( c, '%' );
197             buffer.deleteCharAt( c + 1 );
198             substCount++;
199           }
200           else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
201             buffer.setCharAt( c, '&' );
202             buffer.deleteCharAt( c + 1 );
203             substCount++;
204           }
205           else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
206             buffer.setCharAt( c, '#' );
207             buffer.deleteCharAt( c + 1 );
208             substCount++;
209           }
210           else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
211             buffer.setCharAt( c, '!' );
212             buffer.deleteCharAt( c + 1 );
213             substCount++;
214           }
215         }
216       }
217     }
218     /**
219      * Undoes the changes made by substitute(). That are character pairs and
220      * character combinations. Umlauts will remain as their corresponding vowel,
221      * as "ß" remains as "ss".
222      */
223     private void resubstitute( StringBuffer   buffer ) {
224       for ( int c = 0; c < buffer.length(); c++ ) {
225         if ( buffer.charAt( c ) == '*' ) {
226           char x = buffer.charAt( c - 1 );
227           buffer.setCharAt( c, x );
228         }
229         else if ( buffer.charAt( c ) == '$' ) {
230           buffer.setCharAt( c, 's' );
231           buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
232         }
233         else if ( buffer.charAt( c ) == '�' ) {
234           buffer.setCharAt( c, 'c' );
235           buffer.insert( c + 1, 'h' );
236         }
237         else if ( buffer.charAt( c ) == '%' ) {
238           buffer.setCharAt( c, 'e' );
239           buffer.insert( c + 1, 'i' );
240         }
241         else if ( buffer.charAt( c ) == '&' ) {
242           buffer.setCharAt( c, 'i' );
243           buffer.insert( c + 1, 'e' );
244         }
245         else if ( buffer.charAt( c ) == '#' ) {
246           buffer.setCharAt( c, 'i' );
247           buffer.insert( c + 1, 'g' );
248         }
249         else if ( buffer.charAt( c ) == '!' ) {
250           buffer.setCharAt( c, 's' );
251           buffer.insert( c + 1, 't' );
252         }
253       }
254     }
255 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags