| 1 package org.contineo.core.text.analyze.de; 2 3 import org.contineo.core.text.analyze.Stemmer; 4 5 20 21 29 public class GermanStemmer implements Stemmer { 30 33 private StringBuffer sb = new StringBuffer (); 34 35 39 private int substCount = 0; 40 41 48 public String stem(String term) { 49 term = term.toLowerCase(); 51 if (!isStemmable(term)) 52 return term; 53 sb.delete(0, sb.length()); 55 sb.insert(0, term); 56 substitute(sb); 58 strip(sb); 59 optimize(sb); 60 resubstitute(sb); 61 removeParticleDenotion(sb); 62 return sb.toString(); 63 } 64 65 70 private boolean isStemmable(String term) { 71 for (int c = 0; c < term.length(); c++) { 72 if (!Character.isLetter(term.charAt(c))) 73 return false; 74 } 75 return true; 76 } 77 78 86 private void strip(StringBuffer buffer) { 87 boolean doMore = true; 88 while (doMore && buffer.length() > 3) { 89 if ((buffer.length() + substCount > 5) 90 && buffer.substring(buffer.length() - 2, buffer.length()) 91 .equals("nd")) { 92 buffer.delete(buffer.length() - 2, buffer.length()); 93 } else if ((buffer.length() + substCount > 4) 94 && buffer.substring(buffer.length() - 2, buffer.length()) 95 .equals("em")) { 96 buffer.delete(buffer.length() - 2, buffer.length()); 97 } else if ((buffer.length() + substCount > 4) 98 && buffer.substring(buffer.length() - 2, buffer.length()) 99 .equals("er")) { 100 buffer.delete(buffer.length() - 2, buffer.length()); 101 } else if (buffer.charAt(buffer.length() - 1) == 'e') { 102 buffer.deleteCharAt(buffer.length() - 1); 103 } else if (buffer.charAt(buffer.length() - 1) == 's') { 104 buffer.deleteCharAt(buffer.length() - 1); 105 } else if (buffer.charAt(buffer.length() - 1) == 'n') { 106 buffer.deleteCharAt(buffer.length() - 1); 107 } 108 else if (buffer.charAt(buffer.length() - 1) == 't') { 110 buffer.deleteCharAt(buffer.length() - 1); 111 } else { 112 doMore = false; 113 } 114 } 115 } 116 117 120 private void optimize(StringBuffer buffer) { 121 if (buffer.length() > 5 123 && buffer.substring(buffer.length() - 5, buffer.length()) 124 .equals("erin*")) { 125 buffer.deleteCharAt(buffer.length() - 1); 126 strip(buffer); 127 } 128 if (buffer.charAt(buffer.length() - 1) == ('z')) { 130 buffer.setCharAt(buffer.length() - 1, 'x'); 131 } 132 } 133 134 137 private void removeParticleDenotion(StringBuffer buffer) { 138 if (buffer.length() > 4) { 139 for (int c = 0; c < buffer.length() - 3; c++) { 140 if (buffer.substring(c, c + 4).equals("gege")) { 141 buffer.delete(c, c + 2); 142 return; 143 } 144 } 145 } 146 } 147 148 155 private void substitute( StringBuffer buffer ) { 156 substCount = 0; 157 for ( int c = 0; c < buffer.length(); c++ ) { 158 if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) { 161 buffer.setCharAt( c, '*' ); 162 } 163 else if ( buffer.charAt( c ) == 'ä' ) { 165 buffer.setCharAt( c, 'a' ); 166 } 167 else if ( buffer.charAt( c ) == 'ö' ) { 168 buffer.setCharAt( c, 'o' ); 169 } 170 else if ( buffer.charAt( c ) == 'ü' ) { 171 buffer.setCharAt( c, 'u' ); 172 } 173 else if ( buffer.charAt( c ) == 'ß' ) { 175 buffer.setCharAt( c, 's' ); 176 buffer.insert( c + 1, 's' ); 177 substCount++; 178 } 179 if ( c < buffer.length() - 1 ) { 182 if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' && 184 buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' ) 185 { 186 buffer.setCharAt( c, '$' ); 187 buffer.delete( c + 1, c + 3 ); 188 substCount =+ 2; 189 } 190 else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) { 191 buffer.setCharAt( c, '§' ); 192 buffer.deleteCharAt( c + 1 ); 193 substCount++; 194 } 195 else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) { 196 buffer.setCharAt( c, '%' ); 197 buffer.deleteCharAt( c + 1 ); 198 substCount++; 199 } 200 else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) { 201 buffer.setCharAt( c, '&' ); 202 buffer.deleteCharAt( c + 1 ); 203 substCount++; 204 } 205 else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) { 206 buffer.setCharAt( c, '#' ); 207 buffer.deleteCharAt( c + 1 ); 208 substCount++; 209 } 210 else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) { 211 buffer.setCharAt( c, '!' ); 212 buffer.deleteCharAt( c + 1 ); 213 substCount++; 214 } 215 } 216 } 217 } 218 223 private void resubstitute( StringBuffer buffer ) { 224 for ( int c = 0; c < buffer.length(); c++ ) { 225 if ( buffer.charAt( c ) == '*' ) { 226 char x = buffer.charAt( c - 1 ); 227 buffer.setCharAt( c, x ); 228 } 229 else if ( buffer.charAt( c ) == '$' ) { 230 buffer.setCharAt( c, 's' ); 231 buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 ); 232 } 233 else if ( buffer.charAt( c ) == '§' ) { 234 buffer.setCharAt( c, 'c' ); 235 buffer.insert( c + 1, 'h' ); 236 } 237 else if ( buffer.charAt( c ) == '%' ) { 238 buffer.setCharAt( c, 'e' ); 239 buffer.insert( c + 1, 'i' ); 240 } 241 else if ( buffer.charAt( c ) == '&' ) { 242 buffer.setCharAt( c, 'i' ); 243 buffer.insert( c + 1, 'e' ); 244 } 245 else if ( buffer.charAt( c ) == '#' ) { 246 buffer.setCharAt( c, 'i' ); 247 buffer.insert( c + 1, 'g' ); 248 } 249 else if ( buffer.charAt( c ) == '!' ) { 250 buffer.setCharAt( c, 's' ); 251 buffer.insert( c + 1, 't' ); 252 } 253 } 254 } 255 } | Popular Tags |