KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > commons > codec > language > Soundex


/*
 * Copyright 2001-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

16
17 package org.apache.commons.codec.language;
18
19 import org.apache.commons.codec.EncoderException;
20 import org.apache.commons.codec.StringEncoder;
21
/**
 * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
 * general purpose scheme to find words with similar phonemes.
 *
 * @author Apache Software Foundation
 * @version $Id: Soundex.java,v 1.26 2004/07/07 23:15:24 ggregory Exp $
 */

29 public class Soundex implements StringEncoder {
30
31     /**
32      * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
33      *
34      * @see #US_ENGLISH_MAPPING
35      */

36     public static final Soundex US_ENGLISH = new Soundex();
37
38     /**
39      * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
40      * means do not encode.
41      * <p>
42      * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
43      * up the value for the constant values page.)
44      * </p>
45      *
46      * @see #US_ENGLISH_MAPPING
47      */

48     public static final String JavaDoc US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
49
50     /**
51      * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
52      * means do not encode.
53      *
54      * @see Soundex#Soundex(char[])
55      */

56     public static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
57
58     /**
59      * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
60      * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
61      * identical values.
62      *
63      * @param s1
64      * A String that will be encoded and compared.
65      * @param s2
66      * A String that will be encoded and compared.
67      * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
68      *
69      * @see SoundexUtils#difference(StringEncoder,String,String)
70      * @see <a HREF="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
71      * T-SQL DIFFERENCE </a>
72      *
73      * @throws EncoderException
74      * if an error occurs encoding one of the strings
75      * @since 1.3
76      */

77     public int difference(String JavaDoc s1, String JavaDoc s2) throws EncoderException {
78         return SoundexUtils.difference(this, s1, s2);
79     }
80
81     /**
82      * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
83      *
84      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
85      */

86     private int maxLength = 4;
87
88     /**
89      * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
90      * letter is mapped. This implementation contains a default map for US_ENGLISH
91      */

92     private char[] soundexMapping;
93
94     /**
95      * Creates an instance using US_ENGLISH_MAPPING
96      *
97      * @see Soundex#Soundex(char[])
98      * @see Soundex#US_ENGLISH_MAPPING
99      */

100     public Soundex() {
101         this(US_ENGLISH_MAPPING);
102     }
103
104     /**
105      * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized
106      * mapping for a non-Western character set.
107      *
108      * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
109      * letter is mapped. This implementation contains a default map for US_ENGLISH
110      *
111      * @param mapping
112      * Mapping array to use when finding the corresponding code for a given character
113      */

114     public Soundex(char[] mapping) {
115         this.setSoundexMapping(mapping);
116     }
117
118     /**
119      * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
120      * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.
121      *
122      * @param pObject
123      * Object to encode
124      * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String
125      * supplied.
126      * @throws EncoderException
127      * if the parameter supplied is not of type java.lang.String
128      * @throws IllegalArgumentException
129      * if a character is not mapped
130      */

131     public Object JavaDoc encode(Object JavaDoc pObject) throws EncoderException {
132         if (!(pObject instanceof String JavaDoc)) {
133             throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
134         }
135         return soundex((String JavaDoc) pObject);
136     }
137
138     /**
139      * Encodes a String using the soundex algorithm.
140      *
141      * @param pString
142      * A String object to encode
143      * @return A Soundex code corresponding to the String supplied
144      * @throws IllegalArgumentException
145      * if a character is not mapped
146      */

147     public String JavaDoc encode(String JavaDoc pString) {
148         return soundex(pString);
149     }
150
151     /**
152      * Used internally by the SoundEx algorithm.
153      *
154      * Consonants from the same code group separated by W or H are treated as one.
155      *
156      * @param str
157      * the cleaned working string to encode (in upper case).
158      * @param index
159      * the character position to encode
160      * @return Mapping code for a particular character
161      * @throws IllegalArgumentException
162      * if the character is not mapped
163      */

164     private char getMappingCode(String JavaDoc str, int index) {
165         char mappedChar = this.map(str.charAt(index));
166         // HW rule check
167
if (index > 1 && mappedChar != '0') {
168             char hwChar = str.charAt(index - 1);
169             if ('H' == hwChar || 'W' == hwChar) {
170                 char preHWChar = str.charAt(index - 2);
171                 char firstCode = this.map(preHWChar);
172                 if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar) {
173                     return 0;
174                 }
175             }
176         }
177         return mappedChar;
178     }
179
180     /**
181      * Returns the maxLength. Standard Soundex
182      *
183      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
184      * @return int
185      */

186     public int getMaxLength() {
187         return this.maxLength;
188     }
189
190     /**
191      * Returns the soundex mapping.
192      *
193      * @return soundexMapping.
194      */

195     private char[] getSoundexMapping() {
196         return this.soundexMapping;
197     }
198
199     /**
200      * Maps the given upper-case character to it's Soudex code.
201      *
202      * @param ch
203      * An upper-case character.
204      * @return A Soundex code.
205      * @throws IllegalArgumentException
206      * Thrown if <code>ch</code> is not mapped.
207      */

208     private char map(char ch) {
209         int index = ch - 'A';
210         if (index < 0 || index >= this.getSoundexMapping().length) {
211             throw new IllegalArgumentException JavaDoc("The character is not mapped: " + ch);
212         }
213         return this.getSoundexMapping()[index];
214     }
215
216     /**
217      * Sets the maxLength.
218      *
219      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
220      * @param maxLength
221      * The maxLength to set
222      */

223     public void setMaxLength(int maxLength) {
224         this.maxLength = maxLength;
225     }
226
227     /**
228      * Sets the soundexMapping.
229      *
230      * @param soundexMapping
231      * The soundexMapping to set.
232      */

233     private void setSoundexMapping(char[] soundexMapping) {
234         this.soundexMapping = soundexMapping;
235     }
236
237     /**
238      * Retreives the Soundex code for a given String object.
239      *
240      * @param str
241      * String to encode using the Soundex algorithm
242      * @return A soundex code for the String supplied
243      * @throws IllegalArgumentException
244      * if a character is not mapped
245      */

246     public String JavaDoc soundex(String JavaDoc str) {
247         if (str == null) {
248             return null;
249         }
250         str = SoundexUtils.clean(str);
251         if (str.length() == 0) {
252             return str;
253         }
254         char out[] = {'0', '0', '0', '0'};
255         char last, mapped;
256         int incount = 1, count = 1;
257         out[0] = str.charAt(0);
258         last = getMappingCode(str, 0);
259         while ((incount < str.length()) && (count < out.length)) {
260             mapped = getMappingCode(str, incount++);
261             if (mapped != 0) {
262                 if ((mapped != '0') && (mapped != last)) {
263                     out[count++] = mapped;
264                 }
265                 last = mapped;
266             }
267         }
268         return new String JavaDoc(out);
269     }
270
271 }
Popular Tags