KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > commons > codec > language > DoubleMetaphone


1 /*
2  * Copyright 2001-2004 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16
17 package org.apache.commons.codec.language;
18
19 import org.apache.commons.codec.EncoderException;
20 import org.apache.commons.codec.StringEncoder;
21
22 /**
23  * Encodes a string into a double metaphone value.
24  * This Implementation is based on the algorithm by <CITE>Lawrence Philips</CITE>.
25  * <ul>
26  * <li>Original Article: <a
27  * HREF="http://www.cuj.com/documents/s=8038/cuj0006philips/">
28  * http://www.cuj.com/documents/s=8038/cuj0006philips/</a></li>
29  * <li>Original Source Code: <a HREF="ftp://ftp.cuj.com/pub/2000/1806/philips.zip">
30  * ftp://ftp.cuj.com/pub/2000/1806/philips.zip</a></li>
31  * </ul>
32  *
33  * @author Apache Software Foundation
34  * @version $Id: DoubleMetaphone.java,v 1.24 2004/06/05 18:32:04 ggregory Exp $
35  */

36 public class DoubleMetaphone implements StringEncoder {
37
38     /**
39      * "Vowels" to test for
40      */

41     private static final String JavaDoc VOWELS = "AEIOUY";
42
43     /**
44      * Prefixes when present which are not pronounced
45      */

46     private static final String JavaDoc[] SILENT_START =
47     { "GN", "KN", "PN", "WR", "PS" };
48     private static final String JavaDoc[] L_R_N_M_B_H_F_V_W_SPACE =
49     { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
50     private static final String JavaDoc[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER =
51     { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };
52     private static final String JavaDoc[] L_T_K_S_N_M_B_Z =
53     { "L", "T", "K", "S", "N", "M", "B", "Z" };
54
55     /**
56      * Maximum length of an encoding, default is 4
57      */

58     protected int maxCodeLen = 4;
59
60     /**
61      * Creates an instance of this DoubleMetaphone encoder
62      */

63     public DoubleMetaphone() {
64         super();
65     }
66     
67     /**
68      * Encode a value with Double Metaphone
69      *
70      * @param value String to encode
71      * @return an encoded string
72      */

73     public String JavaDoc doubleMetaphone(String JavaDoc value) {
74         return doubleMetaphone(value, false);
75     }
76     
77     /**
78      * Encode a value with Double Metaphone, optionally using the alternate
79      * encoding.
80      *
81      * @param value String to encode
82      * @param alternate use alternate encode
83      * @return an encoded string
84      */

85     public String JavaDoc doubleMetaphone(String JavaDoc value, boolean alternate) {
86         value = cleanInput(value);
87         if (value == null) {
88             return null;
89         }
90         
91         boolean slavoGermanic = isSlavoGermanic(value);
92         int index = isSilentStart(value) ? 1 : 0;
93         
94         DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen());
95         
96         while (!result.isComplete() && index <= value.length() - 1) {
97             switch (value.charAt(index)) {
98             case 'A':
99             case 'E':
100             case 'I':
101             case 'O':
102             case 'U':
103             case 'Y':
104                 index = handleAEIOUY(value, result, index);
105                 break;
106             case 'B':
107                 result.append('P');
108                 index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1;
109                 break;
110             case '\u00C7':
111                 // A C with a Cedilla
112
result.append('S');
113                 index++;
114                 break;
115             case 'C':
116                 index = handleC(value, result, index);
117                 break;
118             case 'D':
119                 index = handleD(value, result, index);
120                 break;
121             case 'F':
122                 result.append('F');
123                 index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1;
124                 break;
125             case 'G':
126                 index = handleG(value, result, index, slavoGermanic);
127                 break;
128             case 'H':
129                 index = handleH(value, result, index);
130                 break;
131             case 'J':
132                 index = handleJ(value, result, index, slavoGermanic);
133                 break;
134             case 'K':
135                 result.append('K');
136                 index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1;
137                 break;
138             case 'L':
139                 index = handleL(value, result, index);
140                 break;
141             case 'M':
142                 result.append('M');
143                 index = conditionM0(value, index) ? index + 2 : index + 1;
144                 break;
145             case 'N':
146                 result.append('N');
147                 index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1;
148                 break;
149             case '\u00D1':
150                 // N with a tilde (spanish ene)
151
result.append('N');
152                 index++;
153                 break;
154             case 'P':
155                 index = handleP(value, result, index);
156                 break;
157             case 'Q':
158                 result.append('K');
159                 index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1;
160                 break;
161             case 'R':
162                 index = handleR(value, result, index, slavoGermanic);
163                 break;
164             case 'S':
165                 index = handleS(value, result, index, slavoGermanic);
166                 break;
167             case 'T':
168                 index = handleT(value, result, index);
169                 break;
170             case 'V':
171                 result.append('F');
172                 index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1;
173                 break;
174             case 'W':
175                 index = handleW(value, result, index);
176                 break;
177             case 'X':
178                 index = handleX(value, result, index);
179                 break;
180             case 'Z':
181                 index = handleZ(value, result, index, slavoGermanic);
182                 break;
183             default:
184                 index++;
185                 break;
186             }
187         }
188
189         return alternate ? result.getAlternate() : result.getPrimary();
190     }
191     
192     /**
193      * Encode the value using DoubleMetaphone. It will only work if
194      * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>).
195      *
196      * @param obj Object to encode (should be of type String)
197      * @return An encoded Object (will be of type String)
198      * @throws EncoderException encode parameter is not of type String
199      */

200     public Object JavaDoc encode(Object JavaDoc obj) throws EncoderException {
201         if (!(obj instanceof String JavaDoc)) {
202             throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
203         }
204         return doubleMetaphone((String JavaDoc) obj);
205     }
206
207     /**
208      * Encode the value using DoubleMetaphone.
209      *
210      * @param value String to encode
211      * @return An encoded String
212      */

213     public String JavaDoc encode(String JavaDoc value) {
214         return doubleMetaphone(value);
215     }
216
217     /**
218      * Check if the Double Metaphone values of two <code>String</code> values
219      * are equal.
220      *
221      * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
222      * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
223      * @return <code>true</code> if the encoded <code>String</code>s are equal;
224      * <code>false</code> otherwise.
225      * @see #isDoubleMetaphoneEqual(String,String,boolean)
226      */

227     public boolean isDoubleMetaphoneEqual(String JavaDoc value1, String JavaDoc value2) {
228         return isDoubleMetaphoneEqual(value1, value2, false);
229     }
230     
231     /**
232      * Check if the Double Metaphone values of two <code>String</code> values
233      * are equal, optionally using the alternate value.
234      *
235      * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
236      * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
237      * @param alternate use the alternate value if <code>true</code>.
238      * @return <code>true</code> if the encoded <code>String</code>s are equal;
239      * <code>false</code> otherwise.
240      */

241     public boolean isDoubleMetaphoneEqual(String JavaDoc value1,
242                                           String JavaDoc value2,
243                                           boolean alternate) {
244         return doubleMetaphone(value1, alternate).equals(doubleMetaphone
245                                                          (value2, alternate));
246     }
247     
248     /**
249      * Returns the maxCodeLen.
250      * @return int
251      */

252     public int getMaxCodeLen() {
253         return this.maxCodeLen;
254     }
255
256     /**
257      * Sets the maxCodeLen.
258      * @param maxCodeLen The maxCodeLen to set
259      */

260     public void setMaxCodeLen(int maxCodeLen) {
261         this.maxCodeLen = maxCodeLen;
262     }
263
264     //-- BEGIN HANDLERS --//
265

266     /**
267      * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases
268      */

269     private int handleAEIOUY(String JavaDoc value, DoubleMetaphoneResult result, int
270                              index) {
271         if (index == 0) {
272             result.append('A');
273         }
274         return index + 1;
275     }
276     
277     /**
278      * Handles 'C' cases
279      */

280     private int handleC(String JavaDoc value,
281                         DoubleMetaphoneResult result,
282                         int index) {
283         if (conditionC0(value, index)) { // very confusing, moved out
284
result.append('K');
285             index += 2;
286         } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
287             result.append('S');
288             index += 2;
289         } else if (contains(value, index, 2, "CH")) {
290             index = handleCH(value, result, index);
291         } else if (contains(value, index, 2, "CZ") &&
292                    !contains(value, index - 2, 4, "WICZ")) {
293             //-- "Czerny" --//
294
result.append('S', 'X');
295             index += 2;
296         } else if (contains(value, index + 1, 3, "CIA")) {
297             //-- "focaccia" --//
298
result.append('X');
299             index += 3;
300         } else if (contains(value, index, 2, "CC") &&
301                    !(index == 1 && charAt(value, 0) == 'M')) {
302             //-- double "cc" but not "McClelland" --//
303
return handleCC(value, result, index);
304         } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
305             result.append('K');
306             index += 2;
307         } else if (contains(value, index, 2, "CI", "CE", "CY")) {
308             //-- Italian vs. English --//
309
if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
310                 result.append('S', 'X');
311             } else {
312                 result.append('S');
313             }
314             index += 2;
315         } else {
316             result.append('K');
317             if (contains(value, index + 1, 2, " C", " Q", " G")) {
318                 //-- Mac Caffrey, Mac Gregor --//
319
index += 3;
320             } else if (contains(value, index + 1, 1, "C", "K", "Q") &&
321                        !contains(value, index + 1, 2, "CE", "CI")) {
322                 index += 2;
323             } else {
324                 index++;
325             }
326         }
327         
328         return index;
329     }
330
331     /**
332      * Handles 'CC' cases
333      */

334     private int handleCC(String JavaDoc value,
335                          DoubleMetaphoneResult result,
336                          int index) {
337         if (contains(value, index + 2, 1, "I", "E", "H") &&
338             !contains(value, index + 2, 2, "HU")) {
339             //-- "bellocchio" but not "bacchus" --//
340
if ((index == 1 && charAt(value, index - 1) == 'A') ||
341                 contains(value, index - 1, 5, "UCCEE", "UCCES")) {
342                 //-- "accident", "accede", "succeed" --//
343
result.append("KS");
344             } else {
345                 //-- "bacci", "bertucci", other Italian --//
346
result.append('X');
347             }
348             index += 3;
349         } else { // Pierce's rule
350
result.append('K');
351             index += 2;
352         }
353         
354         return index;
355     }
356     
357     /**
358      * Handles 'CH' cases
359      */

360     private int handleCH(String JavaDoc value,
361                          DoubleMetaphoneResult result,
362                          int index) {
363         if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael
364
result.append('K', 'X');
365             return index + 2;
366         } else if (conditionCH0(value, index)) {
367             //-- Greek roots ("chemistry", "chorus", etc.) --//
368
result.append('K');
369             return index + 2;
370         } else if (conditionCH1(value, index)) {
371             //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
372
result.append('K');
373             return index + 2;
374         } else {
375             if (index > 0) {
376                 if (contains(value, 0, 2, "MC")) {
377                     result.append('K');
378                 } else {
379                     result.append('X', 'K');
380                 }
381             } else {
382                 result.append('X');
383             }
384             return index + 2;
385         }
386     }
387
388     /**
389      * Handles 'D' cases
390      */

391     private int handleD(String JavaDoc value,
392                         DoubleMetaphoneResult result,
393                         int index) {
394         if (contains(value, index, 2, "DG")) {
395             //-- "Edge" --//
396
if (contains(value, index + 2, 1, "I", "E", "Y")) {
397                 result.append('J');
398                 index += 3;
399                 //-- "Edgar" --//
400
} else {
401                 result.append("TK");
402                 index += 2;
403             }
404         } else if (contains(value, index, 2, "DT", "DD")) {
405             result.append('T');
406             index += 2;
407         } else {
408             result.append('T');
409             index++;
410         }
411         return index;
412     }
413
414     /**
415      * Handles 'G' cases
416      */

417     private int handleG(String JavaDoc value,
418                         DoubleMetaphoneResult result,
419                         int index,
420                         boolean slavoGermanic) {
421         if (charAt(value, index + 1) == 'H') {
422             index = handleGH(value, result, index);
423         } else if (charAt(value, index + 1) == 'N') {
424             if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
425                 result.append("KN", "N");
426             } else if (!contains(value, index + 2, 2, "EY") &&
427                        charAt(value, index + 1) != 'Y' && !slavoGermanic) {
428                 result.append("N", "KN");
429             } else {
430                 result.append("KN");
431             }
432             index = index + 2;
433         } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
434             result.append("KL", "L");
435             index += 2;
436         } else if (index == 0 && (charAt(value, index + 1) == 'Y' || contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
437             //-- -ges-, -gep-, -gel-, -gie- at beginning --//
438
result.append('K', 'J');
439             index += 2;
440         } else if ((contains(value, index + 1, 2, "ER") ||
441                     charAt(value, index + 1) == 'Y') &&
442                    !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
443                    !contains(value, index - 1, 1, "E", "I") &&
444                    !contains(value, index - 1, 3, "RGY", "OGY")) {
445             //-- -ger-, -gy- --//
446
result.append('K', 'J');
447             index += 2;
448         } else if (contains(value, index + 1, 1, "E", "I", "Y") ||
449                    contains(value, index - 1, 4, "AGGI", "OGGI")) {
450             //-- Italian "biaggi" --//
451
if ((contains(value, 0 ,4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || contains(value, index + 1, 2, "ET")) {
452                 //-- obvious germanic --//
453
result.append('K');
454             } else if (contains(value, index + 1, 4, "IER")) {
455                 result.append('J');
456             } else {
457                 result.append('J', 'K');
458             }
459             index += 2;
460         } else if (charAt(value, index + 1) == 'G') {
461             index += 2;
462             result.append('K');
463         } else {
464             index++;
465             result.append('K');
466         }
467         return index;
468     }
469     
470     /**
471      * Handles 'GH' cases
472      */

473     private int handleGH(String JavaDoc value,
474                          DoubleMetaphoneResult result,
475                          int index) {
476         if (index > 0 && !isVowel(charAt(value, index - 1))) {
477             result.append('K');
478             index += 2;
479         } else if (index == 0) {
480             if (charAt(value, index + 2) == 'I') {
481                 result.append('J');
482             } else {
483                 result.append('K');
484             }
485             index += 2;
486         } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) ||
487                    (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) ||
488                    (index > 3 && contains(value, index - 4, 1, "B", "H"))) {
489             //-- Parker's rule (with some further refinements) - "hugh"
490
index += 2;
491         } else {
492             if (index > 2 && charAt(value, index - 1) == 'U' &&
493                 contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
494                 //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
495
result.append('F');
496             } else if (index > 0 && charAt(value, index - 1) != 'I') {
497                 result.append('K');
498             }
499             index += 2;
500         }
501         return index;
502     }
503
504     /**
505      * Handles 'H' cases
506      */

507     private int handleH(String JavaDoc value,
508                         DoubleMetaphoneResult result,
509                         int index) {
510         //-- only keep if first & before vowel or between 2 vowels --//
511
if ((index == 0 || isVowel(charAt(value, index - 1))) &&
512             isVowel(charAt(value, index + 1))) {
513             result.append('H');
514             index += 2;
515             //-- also takes car of "HH" --//
516
} else {
517             index++;
518         }
519         return index;
520     }
521     
522     /**
523      * Handles 'J' cases
524      */

525     private int handleJ(String JavaDoc value, DoubleMetaphoneResult result, int index,
526                         boolean slavoGermanic) {
527         if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
528                 //-- obvious Spanish, "Jose", "San Jacinto" --//
529
if ((index == 0 && (charAt(value, index + 4) == ' ') ||
530                      value.length() == 4) || contains(value, 0, 4, "SAN ")) {
531                     result.append('H');
532                 } else {
533                     result.append('J', 'H');
534                 }
535                 index++;
536             } else {
537                 if (index == 0 && !contains(value, index, 4, "JOSE")) {
538                     result.append('J', 'A');
539                 } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic &&
540                               (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) {
541                     result.append('J', 'H');
542                 } else if (index == value.length() - 1) {
543                     result.append('J', ' ');
544                 } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && !contains(value, index - 1, 1, "S", "K", "L")) {
545                     result.append('J');
546                 }
547
548                 if (charAt(value, index + 1) == 'J') {
549                     index += 2;
550                 } else {
551                     index++;
552                 }
553             }
554         return index;
555     }
556     
557     /**
558      * Handles 'L' cases
559      */

560     private int handleL(String JavaDoc value,
561                         DoubleMetaphoneResult result,
562                         int index) {
563         result.append('L');
564         if (charAt(value, index + 1) == 'L') {
565             if (conditionL0(value, index)) {
566                 result.appendAlternate(' ');
567             }
568             index += 2;
569         } else {
570             index++;
571         }
572         return index;
573     }
574
575     /**
576      * Handles 'P' cases
577      */

578     private int handleP(String JavaDoc value,
579                         DoubleMetaphoneResult result,
580                         int index) {
581         if (charAt(value, index + 1) == 'H') {
582             result.append('F');
583             index += 2;
584         } else {
585             result.append('P');
586             index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1;
587         }
588         return index;
589     }
590
591     /**
592      * Handles 'R' cases
593      */

594     private int handleR(String JavaDoc value,
595                         DoubleMetaphoneResult result,
596                         int index,
597                         boolean slavoGermanic) {
598         if (index == value.length() - 1 && !slavoGermanic &&
599             contains(value, index - 2, 2, "IE") &&
600             !contains(value, index - 4, 2, "ME", "MA")) {
601             result.appendAlternate('R');
602         } else {
603             result.append('R');
604         }
605         return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
606     }
607
608     /**
609      * Handles 'S' cases
610      */

611     private int handleS(String JavaDoc value,
612                         DoubleMetaphoneResult result,
613                         int index,
614                         boolean slavoGermanic) {
615         if (contains(value, index - 1, 3, "ISL", "YSL")) {
616             //-- special cases "island", "isle", "carlisle", "carlysle" --//
617
index++;
618         } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
619             //-- special case "sugar-" --//
620
result.append('X', 'S');
621             index++;
622         } else if (contains(value, index, 2, "SH")) {
623             if (contains(value, index + 1, 4,
624                          "HEIM", "HOEK", "HOLM", "HOLZ")) {
625                 //-- germanic --//
626
result.append('S');
627             } else {
628                 result.append('X');
629             }
630             index += 2;
631         } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) {
632             //-- Italian and Armenian --//
633
if (slavoGermanic) {
634                 result.append('S');
635             } else {
636                 result.append('S', 'X');
637             }
638             index += 3;
639         } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || contains(value, index + 1, 1, "Z")) {
640             //-- german & anglicisations, e.g. "smith" match "schmidt" //
641
// "snider" match "schneider" --//
642
//-- also, -sz- in slavic language altho in hungarian it //
643
// is pronounced "s" --//
644
result.append('S', 'X');
645             index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
646         } else if (contains(value, index, 2, "SC")) {
647             index = handleSC(value, result, index);
648         } else {
649             if (index == value.length() - 1 && contains(value, index - 2,
650                                                         2, "AI", "OI")){
651                 //-- french e.g. "resnais", "artois" --//
652
result.appendAlternate('S');
653             } else {
654                 result.append('S');
655             }
656             index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
657         }
658         return index;
659     }
660
661     /**
662      * Handles 'SC' cases
663      */

664     private int handleSC(String JavaDoc value,
665                          DoubleMetaphoneResult result,
666                          int index) {
667         if (charAt(value, index + 2) == 'H') {
668             //-- Schlesinger's rule --//
669
if (contains(value, index + 3,
670                          2, "OO", "ER", "EN", "UY", "ED", "EM")) {
671                 //-- Dutch origin, e.g. "school", "schooner" --//
672
if (contains(value, index + 3, 2, "ER", "EN")) {
673                     //-- "schermerhorn", "schenker" --//
674
result.append("X", "SK");
675                 } else {
676                     result.append("SK");
677                 }
678             } else {
679                 if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') {
680                     result.append('X', 'S');
681                 } else {
682                     result.append('X');
683                 }
684             }
685         } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
686             result.append('S');
687         } else {
688             result.append("SK");
689         }
690         return index + 3;
691     }
692
693     /**
694      * Handles 'T' cases
695      */

696     private int handleT(String JavaDoc value,
697                         DoubleMetaphoneResult result,
698                         int index) {
699         if (contains(value, index, 4, "TION")) {
700             result.append('X');
701             index += 3;
702         } else if (contains(value, index, 3, "TIA", "TCH")) {
703             result.append('X');
704             index += 3;
705         } else if (contains(value, index, 2, "TH") || contains(value, index,
706                                                                3, "TTH")) {
707             if (contains(value, index + 2, 2, "OM", "AM") ||
708                 //-- special case "thomas", "thames" or germanic --//
709
contains(value, 0, 4, "VAN ", "VON ") ||
710                 contains(value, 0, 3, "SCH")) {
711                 result.append('T');
712             } else {
713                 result.append('0', 'T');
714             }
715             index += 2;
716         } else {
717             result.append('T');
718             index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
719         }
720         return index;
721     }
722
723     /**
724      * Handles 'W' cases
725      */

726     private int handleW(String JavaDoc value,
727                         DoubleMetaphoneResult result,
728                         int index) {
729         if (contains(value, index, 2, "WR")) {
730             //-- can also be in middle of word --//
731
result.append('R');
732             index += 2;
733         } else {
734             if (index == 0 && (isVowel(charAt(value, index + 1)) ||
735                                contains(value, index, 2, "WH"))) {
736                 if (isVowel(charAt(value, index + 1))) {
737                     //-- Wasserman should match Vasserman --//
738
result.append('A', 'F');
739                 } else {
740                     //-- need Uomo to match Womo --//
741
result.append('A');
742                 }
743                 index++;
744             } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) ||
745                        contains(value, index - 1,
746                                 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
747                        contains(value, 0, 3, "SCH")) {
748                 //-- Arnow should match Arnoff --//
749
result.appendAlternate('F');
750                 index++;
751             } else if (contains(value, index, 4, "WICZ", "WITZ")) {
752                 //-- Polish e.g. "filipowicz" --//
753
result.append("TS", "FX");
754                 index += 4;
755             } else {
756                 index++;
757             }
758         }
759         return index;
760     }
761     
762     /**
763      * Handles 'X' cases
764      */

765     private int handleX(String JavaDoc value,
766                         DoubleMetaphoneResult result,
767                         int index) {
768         if (index == 0) {
769             result.append('S');
770             index++;
771         } else {
772             if (!((index == value.length() - 1) &&
773                   (contains(value, index - 3, 3, "IAU", "EAU") ||
774                    contains(value, index - 2, 2, "AU", "OU")))) {
775                 //-- French e.g. breaux --//
776
result.append("KS");
777             }
778             index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
779         }
780         return index;
781     }
782
783     /**
784      * Handles 'Z' cases
785      */

786     private int handleZ(String JavaDoc value, DoubleMetaphoneResult result, int index,
787                         boolean slavoGermanic) {
788         if (charAt(value, index + 1) == 'H') {
789             //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
790
result.append('J');
791             index += 2;
792         } else {
793             if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) {
794                 result.append("S", "TS");
795             } else {
796                 result.append('S');
797             }
798             index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
799         }
800         return index;
801     }
802
803     //-- BEGIN CONDITIONS --//
804

805     /**
806      * Complex condition 0 for 'C'
807      */

808     private boolean conditionC0(String JavaDoc value, int index) {
809         if (contains(value, index, 4, "CHIA")) {
810             return true;
811         } else if (index <= 1) {
812             return false;
813         } else if (isVowel(charAt(value, index - 2))) {
814             return false;
815         } else if (!contains(value, index - 1, 3, "ACH")) {
816             return false;
817         } else {
818             char c = charAt(value, index + 2);
819             return (c != 'I' && c != 'E')
820                     || contains(value, index - 2, 6, "BACHER", "MACHER");
821         }
822     }
823     
824     /**
825      * Complex condition 0 for 'CH'
826      */

827     private boolean conditionCH0(String JavaDoc value, int index) {
828         if (index != 0) {
829             return false;
830         } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") &&
831                    !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) {
832             return false;
833         } else if (contains(value, 0, 5, "CHORE")) {
834             return false;
835         } else {
836             return true;
837         }
838     }
839     
840     /**
841      * Complex condition 1 for 'CH'
842      */

843     private boolean conditionCH1(String JavaDoc value, int index) {
844         return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0,
845                                                                    3, "SCH")) ||
846                 contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
847                 contains(value, index + 2, 1, "T", "S") ||
848                 ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
849                  (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1)));
850     }
851     
852     /**
853      * Complex condition 0 for 'L'
854      */

855     private boolean conditionL0(String JavaDoc value, int index) {
856         if (index == value.length() - 3 &&
857             contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
858             return true;
859         } else if ((contains(value, index - 1, 2, "AS", "OS") ||
860                     contains(value, value.length() - 1, 1, "A", "O")) &&
861                    contains(value, index - 1, 4, "ALLE")) {
862             return true;
863         } else {
864             return false;
865         }
866     }
867     
868     /**
869      * Complex condition 0 for 'M'
870      */

871     private boolean conditionM0(String JavaDoc value, int index) {
872         if (charAt(value, index + 1) == 'M') {
873             return true;
874         }
875         return contains(value, index - 1, 3, "UMB")
876                 && ((index + 1) == value.length() - 1 || contains(value,
877                         index + 2, 2, "ER"));
878     }
879     
880     //-- BEGIN HELPER FUNCTIONS --//
881

882     /**
883      * Determines whether or not a value is of slavo-germanic orgin. A value is
884      * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'.
885      */

886     private boolean isSlavoGermanic(String JavaDoc value) {
887         return value.indexOf('W') > -1 || value.indexOf('K') > -1 ||
888             value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1;
889     }
890
891     /**
892      * Determines whether or not a character is a vowel or not
893      */

894     private boolean isVowel(char ch) {
895         return VOWELS.indexOf(ch) != -1;
896     }
897
898     /**
899      * Determines whether or not the value starts with a silent letter. It will
900      * return <code>true</code> if the value starts with any of 'GN', 'KN',
901      * 'PN', 'WR' or 'PS'.
902      */

903     private boolean isSilentStart(String JavaDoc value) {
904         boolean result = false;
905         for (int i = 0; i < SILENT_START.length; i++) {
906             if (value.startsWith(SILENT_START[i])) {
907                 result = true;
908                 break;
909             }
910         }
911         return result;
912     }
913
914     /**
915      * Cleans the input
916      */

917     private String JavaDoc cleanInput(String JavaDoc input) {
918         if (input == null) {
919             return null;
920         }
921         input = input.trim();
922         if (input.length() == 0) {
923             return null;
924         }
925         return input.toUpperCase();
926     }
927
928     /**
929      * Gets the character at index <code>index</code> if available, otherwise
930      * it returns <code>Character.MIN_VALUE</code> so that there is some sort
931      * of a default
932      */

933     protected char charAt(String JavaDoc value, int index) {
934         if (index < 0 || index >= value.length()) {
935             return Character.MIN_VALUE;
936         }
937         return value.charAt(index);
938     }
939
940     /**
941      * Shortcut method with 1 criteria
942      */

943     private static boolean contains(String JavaDoc value, int start, int length,
944                                     String JavaDoc criteria) {
945         return contains(value, start, length,
946                         new String JavaDoc[] { criteria });
947     }
948
949     /**
950      * Shortcut method with 2 criteria
951      */

952     private static boolean contains(String JavaDoc value, int start, int length,
953                                     String JavaDoc criteria1, String JavaDoc criteria2) {
954         return contains(value, start, length,
955                         new String JavaDoc[] { criteria1, criteria2 });
956     }
957
958     /**
959      * Shortcut method with 3 criteria
960      */

961     private static boolean contains(String JavaDoc value, int start, int length,
962                                     String JavaDoc criteria1, String JavaDoc criteria2,
963                                     String JavaDoc criteria3) {
964         return contains(value, start, length,
965                         new String JavaDoc[] { criteria1, criteria2, criteria3 });
966     }
967
968     /**
969      * Shortcut method with 4 criteria
970      */

971     private static boolean contains(String JavaDoc value, int start, int length,
972                                     String JavaDoc criteria1, String JavaDoc criteria2,
973                                     String JavaDoc criteria3, String JavaDoc criteria4) {
974         return contains(value, start, length,
975                         new String JavaDoc[] { criteria1, criteria2, criteria3,
976                                        criteria4 });
977     }
978
979     /**
980      * Shortcut method with 5 criteria
981      */

982     private static boolean contains(String JavaDoc value, int start, int length,
983                                     String JavaDoc criteria1, String JavaDoc criteria2,
984                                     String JavaDoc criteria3, String JavaDoc criteria4,
985                                     String JavaDoc criteria5) {
986         return contains(value, start, length,
987                         new String JavaDoc[] { criteria1, criteria2, criteria3,
988                                        criteria4, criteria5 });
989     }
990
991     /**
992      * Shortcut method with 6 criteria
993      */

994     private static boolean contains(String JavaDoc value, int start, int length,
995                                     String JavaDoc criteria1, String JavaDoc criteria2,
996                                     String JavaDoc criteria3, String JavaDoc criteria4,
997                                     String JavaDoc criteria5, String JavaDoc criteria6) {
998         return contains(value, start, length,
999                         new String JavaDoc[] { criteria1, criteria2, criteria3,
1000                                       criteria4, criteria5, criteria6 });
1001    }
1002    
1003    /**
1004     * Determines whether <code>value</code> contains any of the criteria
1005     starting
1006     * at index <code>start</code> and matching up to length <code>length</code>
1007     */

1008    protected static boolean contains(String JavaDoc value, int start, int length,
1009                                      String JavaDoc[] criteria) {
1010        boolean result = false;
1011        if (start >= 0 && start + length <= value.length()) {
1012            String JavaDoc target = value.substring(start, start + length);
1013
1014            for (int i = 0; i < criteria.length; i++) {
1015                if (target.equals(criteria[i])) {
1016                    result = true;
1017                    break;
1018                }
1019            }
1020        }
1021        return result;
1022    }
1023    
1024    //-- BEGIN INNER CLASSES --//
1025

1026    /**
1027     * Inner class for storing results, since there is the optional alternate
1028     * encoding.
1029     */

1030    public class DoubleMetaphoneResult {
1031
1032        private StringBuffer JavaDoc primary = new StringBuffer JavaDoc(getMaxCodeLen());
1033        private StringBuffer JavaDoc alternate = new StringBuffer JavaDoc(getMaxCodeLen());
1034        private int maxLength;
1035
1036        public DoubleMetaphoneResult(int maxLength) {
1037            this.maxLength = maxLength;
1038        }
1039
1040        public void append(char value) {
1041            appendPrimary(value);
1042            appendAlternate(value);
1043        }
1044
1045        public void append(char primary, char alternate) {
1046            appendPrimary(primary);
1047            appendAlternate(alternate);
1048        }
1049
1050        public void appendPrimary(char value) {
1051            if (this.primary.length() < this.maxLength) {
1052                this.primary.append(value);
1053            }
1054        }
1055
1056        public void appendAlternate(char value) {
1057            if (this.alternate.length() < this.maxLength) {
1058                this.alternate.append(value);
1059            }
1060        }
1061
1062        public void append(String JavaDoc value) {
1063            appendPrimary(value);
1064            appendAlternate(value);
1065        }
1066
1067        public void append(String JavaDoc primary, String JavaDoc alternate) {
1068            appendPrimary(primary);
1069            appendAlternate(alternate);
1070        }
1071
1072        public void appendPrimary(String JavaDoc value) {
1073            int addChars = this.maxLength - this.primary.length();
1074            if (value.length() <= addChars) {
1075                this.primary.append(value);
1076            } else {
1077                this.primary.append(value.substring(0, addChars));
1078            }
1079        }
1080
1081        public void appendAlternate(String JavaDoc value) {
1082            int addChars = this.maxLength - this.alternate.length();
1083            if (value.length() <= addChars) {
1084                this.alternate.append(value);
1085            } else {
1086                this.alternate.append(value.substring(0, addChars));
1087            }
1088        }
1089
1090        public String JavaDoc getPrimary() {
1091            return this.primary.toString();
1092        }
1093
1094        public String JavaDoc getAlternate() {
1095            return this.alternate.toString();
1096        }
1097
1098        public boolean isComplete() {
1099            return this.primary.length() >= this.maxLength &&
1100                this.alternate.length() >= this.maxLength;
1101        }
1102    }
1103}
1104
Popular Tags