KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > commons > codec > language > Metaphone


1 /*
2  * Copyright 2001-2004 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16
17 package org.apache.commons.codec.language;
18
19 import org.apache.commons.codec.EncoderException;
20 import org.apache.commons.codec.StringEncoder;
21
22 /**
23  * Encodes a string into a metaphone value.
24  * <p>
25  * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
26  * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
27  * </p>
28  * <p>
29  * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
30  * 39.</CITE>
31  * </p>
32  *
33  * @author Apache Software Foundation
34  * @version $Id: Metaphone.java,v 1.20 2004/06/05 18:32:04 ggregory Exp $
35  */

36 public class Metaphone implements StringEncoder {
37
38     /**
39      * Five values in the English language
40      */

41     private String JavaDoc vowels = "AEIOU" ;
42
43     /**
44      * Variable used in Metaphone algorithm
45      */

46     private String JavaDoc frontv = "EIY" ;
47
48     /**
49      * Variable used in Metaphone algorithm
50      */

51     private String JavaDoc varson = "CSPTG" ;
52
53     /**
54      * The max code length for metaphone is 4
55      */

56     private int maxCodeLen = 4 ;
57
58     /**
59      * Creates an instance of the Metaphone encoder
60      */

61     public Metaphone() {
62         super();
63     }
64
65     /**
66      * Find the metaphone value of a String. This is similar to the
67      * soundex algorithm, but better at finding similar sounding words.
68      * All input is converted to upper case.
69      * Limitations: Input format is expected to be a single ASCII word
70      * with only characters in the A - Z range, no punctuation or numbers.
71      *
72      * @param txt String to find the metaphone code for
73      * @return A metaphone code corresponding to the String supplied
74      */

75     public String JavaDoc metaphone(String JavaDoc txt) {
76         boolean hard = false ;
77         if ((txt == null) || (txt.length() == 0)) {
78             return "" ;
79         }
80         // single character is itself
81
if (txt.length() == 1) {
82             return txt.toUpperCase() ;
83         }
84       
85         char[] inwd = txt.toUpperCase().toCharArray() ;
86       
87         StringBuffer JavaDoc local = new StringBuffer JavaDoc(40); // manipulate
88
StringBuffer JavaDoc code = new StringBuffer JavaDoc(10) ; // output
89
// handle initial 2 characters exceptions
90
switch(inwd[0]) {
91         case 'K' :
92         case 'G' :
93         case 'P' : /* looking for KN, etc*/
94             if (inwd[1] == 'N') {
95                 local.append(inwd, 1, inwd.length - 1);
96             } else {
97                 local.append(inwd);
98             }
99             break;
100         case 'A': /* looking for AE */
101             if (inwd[1] == 'E') {
102                 local.append(inwd, 1, inwd.length - 1);
103             } else {
104                 local.append(inwd);
105             }
106             break;
107         case 'W' : /* looking for WR or WH */
108             if (inwd[1] == 'R') { // WR -> R
109
local.append(inwd, 1, inwd.length - 1);
110                 break ;
111             }
112             if (inwd[1] == 'H') {
113                 local.append(inwd, 1, inwd.length - 1);
114                 local.setCharAt(0, 'W'); // WH -> W
115
} else {
116                 local.append(inwd);
117             }
118             break;
119         case 'X' : /* initial X becomes S */
120             inwd[0] = 'S';
121             local.append(inwd);
122             break ;
123         default :
124             local.append(inwd);
125         } // now local has working string with initials fixed
126

127         int wdsz = local.length();
128         int n = 0 ;
129
130         while ((code.length() < this.getMaxCodeLen()) &&
131                (n < wdsz) ) { // max code size of 4 works well
132
char symb = local.charAt(n) ;
133             // remove duplicate letters except C
134
if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) {
135                 n++ ;
136             } else { // not dup
137
switch(symb) {
138                 case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
139                     if (n == 0) {
140                         code.append(symb);
141                     }
142                     break ; // only use vowel if leading char
143
case 'B' :
144                     if ( isPreviousChar(local, n, 'M') &&
145                          isLastChar(wdsz, n) ) { // B is silent if word ends in MB
146
break;
147                     }
148                     code.append(symb);
149                     break;
150                 case 'C' : // lots of C special cases
151
/* discard if SCI, SCE or SCY */
152                     if ( isPreviousChar(local, n, 'S') &&
153                          !isLastChar(wdsz, n) &&
154                          (this.frontv.indexOf(local.charAt(n + 1)) >= 0) ) {
155                         break;
156                     }
157                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
158
code.append('X');
159                         break;
160                     }
161                     if (!isLastChar(wdsz, n) &&
162                         (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) {
163                         code.append('S');
164                         break; // CI,CE,CY -> S
165
}
166                     if (isPreviousChar(local, n, 'S') &&
167                         isNextChar(local, n, 'H') ) { // SCH->sk
168
code.append('K') ;
169                         break ;
170                     }
171                     if (isNextChar(local, n, 'H')) { // detect CH
172
if ((n == 0) &&
173                             (wdsz >= 3) &&
174                             isVowel(local,2) ) { // CH consonant -> K consonant
175
code.append('K');
176                         } else {
177                             code.append('X'); // CHvowel -> X
178
}
179                     } else {
180                         code.append('K');
181                     }
182                     break ;
183                 case 'D' :
184                     if (!isLastChar(wdsz, n + 1) &&
185                         isNextChar(local, n, 'G') &&
186                         (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J
187
code.append('J'); n += 2 ;
188                     } else {
189                         code.append('T');
190                     }
191                     break ;
192                 case 'G' : // GH silent at end or before consonant
193
if (isLastChar(wdsz, n + 1) &&
194                         isNextChar(local, n, 'H')) {
195                         break;
196                     }
197                     if (!isLastChar(wdsz, n + 1) &&
198                         isNextChar(local,n,'H') &&
199                         !isVowel(local,n+2)) {
200                         break;
201                     }
202                     if ((n > 0) &&
203                         ( regionMatch(local, n, "GN") ||
204                           regionMatch(local, n, "GNED") ) ) {
205                         break; // silent G
206
}
207                     if (isPreviousChar(local, n, 'G')) {
208                         hard = true ;
209                     } else {
210                         hard = false ;
211                     }
212                     if (!isLastChar(wdsz, n) &&
213                         (this.frontv.indexOf(local.charAt(n + 1)) >= 0) &&
214                         (!hard)) {
215                         code.append('J');
216                     } else {
217                         code.append('K');
218                     }
219                     break ;
220                 case 'H':
221                     if (isLastChar(wdsz, n)) {
222                         break ; // terminal H
223
}
224                     if ((n > 0) &&
225                         (this.varson.indexOf(local.charAt(n - 1)) >= 0)) {
226                         break;
227                     }
228                     if (isVowel(local,n+1)) {
229                         code.append('H'); // Hvowel
230
}
231                     break;
232                 case 'F':
233                 case 'J' :
234                 case 'L' :
235                 case 'M':
236                 case 'N' :
237                 case 'R' :
238                     code.append(symb);
239                     break;
240                 case 'K' :
241                     if (n > 0) { // not initial
242
if (!isPreviousChar(local, n, 'C')) {
243                             code.append(symb);
244                         }
245                     } else {
246                         code.append(symb); // initial K
247
}
248                     break ;
249                 case 'P' :
250                     if (isNextChar(local,n,'H')) {
251                         // PH -> F
252
code.append('F');
253                     } else {
254                         code.append(symb);
255                     }
256                     break ;
257                 case 'Q' :
258                     code.append('K');
259                     break;
260                 case 'S' :
261                     if (regionMatch(local,n,"SH") ||
262                         regionMatch(local,n,"SIO") ||
263                         regionMatch(local,n,"SIA")) {
264                         code.append('X');
265                     } else {
266                         code.append('S');
267                     }
268                     break;
269                 case 'T' :
270                     if (regionMatch(local,n,"TIA") ||
271                         regionMatch(local,n,"TIO")) {
272                         code.append('X');
273                         break;
274                     }
275                     if (regionMatch(local,n,"TCH")) {
276                         // Silent if in "TCH"
277
break;
278                     }
279                     // substitute numeral 0 for TH (resembles theta after all)
280
if (regionMatch(local,n,"TH")) {
281                         code.append('0');
282                     } else {
283                         code.append('T');
284                     }
285                     break ;
286                 case 'V' :
287                     code.append('F'); break ;
288                 case 'W' : case 'Y' : // silent if not followed by vowel
289
if (!isLastChar(wdsz,n) &&
290                         isVowel(local,n+1)) {
291                         code.append(symb);
292                     }
293                     break ;
294                 case 'X' :
295                     code.append('K'); code.append('S');
296                     break ;
297                 case 'Z' :
298                     code.append('S'); break ;
299                 } // end switch
300
n++ ;
301             } // end else from symb != 'C'
302
if (code.length() > this.getMaxCodeLen()) {
303                 code.setLength(this.getMaxCodeLen());
304             }
305         }
306         return code.toString();
307     }
308
309     private boolean isVowel(StringBuffer JavaDoc string, int index) {
310         return (this.vowels.indexOf(string.charAt(index)) >= 0);
311     }
312
313     private boolean isPreviousChar(StringBuffer JavaDoc string, int index, char c) {
314         boolean matches = false;
315         if( index > 0 &&
316             index < string.length() ) {
317             matches = string.charAt(index - 1) == c;
318         }
319         return matches;
320     }
321
322     private boolean isNextChar(StringBuffer JavaDoc string, int index, char c) {
323         boolean matches = false;
324         if( index >= 0 &&
325             index < string.length() - 1 ) {
326             matches = string.charAt(index + 1) == c;
327         }
328         return matches;
329     }
330
331     private boolean regionMatch(StringBuffer JavaDoc string, int index, String JavaDoc test) {
332         boolean matches = false;
333         if( index >= 0 &&
334             (index + test.length() - 1) < string.length() ) {
335             String JavaDoc substring = string.substring( index, index + test.length());
336             matches = substring.equals( test );
337         }
338         return matches;
339     }
340
341     private boolean isLastChar(int wdsz, int n) {
342         return n + 1 == wdsz;
343     }
344     
345     
346     /**
347      * Encodes an Object using the metaphone algorithm. This method
348      * is provided in order to satisfy the requirements of the
349      * Encoder interface, and will throw an EncoderException if the
350      * supplied object is not of type java.lang.String.
351      *
352      * @param pObject Object to encode
353      * @return An object (or type java.lang.String) containing the
354      * metaphone code which corresponds to the String supplied.
355      * @throws EncoderException if the parameter supplied is not
356      * of type java.lang.String
357      */

358     public Object JavaDoc encode(Object JavaDoc pObject) throws EncoderException {
359         if (!(pObject instanceof java.lang.String JavaDoc)) {
360             throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
361         }
362         return metaphone((String JavaDoc) pObject);
363     }
364
365     /**
366      * Encodes a String using the Metaphone algorithm.
367      *
368      * @param pString String object to encode
369      * @return The metaphone code corresponding to the String supplied
370      */

371     public String JavaDoc encode(String JavaDoc pString) {
372         return metaphone(pString);
373     }
374
375     /**
376      * Tests is the metaphones of two strings are identical.
377      *
378      * @param str1 First of two strings to compare
379      * @param str2 Second of two strings to compare
380      * @return true if the metaphones of these strings are identical,
381      * false otherwise.
382      */

383     public boolean isMetaphoneEqual(String JavaDoc str1, String JavaDoc str2) {
384         return metaphone(str1).equals(metaphone(str2));
385     }
386
387     /**
388      * Returns the maxCodeLen.
389      * @return int
390      */

391     public int getMaxCodeLen() { return this.maxCodeLen; }
392
393     /**
394      * Sets the maxCodeLen.
395      * @param maxCodeLen The maxCodeLen to set
396      */

397     public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
398
399 }
400
Popular Tags