KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > mmbase > applications > xmlimporter > FuzzyStringMatcher


1 /*
2
3 This software is OSI Certified Open Source Software.
4 OSI Certified is a certification mark of the Open Source Initiative.
5
6 The license (Mozilla version 1.0) can be read at the MMBase site.
7 See http://www.MMBase.org/license
8
9 */

10
11 package org.mmbase.applications.xmlimporter;
12
13 /**
14  * Utility class, providing methods for a fuzzy comparison between strings.
15  * @author Rob van Maris (Finalist IT Group)
16  * @since MMBase-1.5
17  * @version $Id: FuzzyStringMatcher.java,v 1.3 2003/03/07 08:50:02 pierre Exp $
18  */

19 public class FuzzyStringMatcher {
20
21     /** Replacements for diacritical characters. */
22     private final static String JavaDoc[] NORMALIZE_REPLACEMENTS = {
23         "a", "a", "a", "a", "a", "a", "ae", "c", //c0-c7
24
"e", "e", "e", "e", "i", "i", "i", "i", //c8-cf
25
"d", "n", "o", "o", "o", "o", "o", "x", //d0-d7
26
"o", "u", "u", "u", "u", "y", "b", "s", //d8-df
27
"a", "a", "a", "a", "a", "a", "ae", "c", //e0-e7
28
"e", "e", "e", "e", "i", "i", "i", "i", //e8-ef
29
"o", "n", "o", "o", "o", "o", "o", " ", //f0-f7
30
"o", "u", "u", "u", "u", "y", "b", "y" //f8-ff
31
};
32
33     /* Strings to be matched, converted to char arrays. */
34     private char[] chars1, chars2;
35
36     /* Length of the strings. */
37     private int length1, length2;
38
39     /* Cache of getMismatch() results,
40      * i.e. resultsCache[i1, 12] caches the result of calculateMismatch(i1, i2).
41      */

42     private Integer JavaDoc[][] resultsCache;
43
44     /* Result of the matching, this is the minimum number of typo's
45      * necessary to account for the differences between the two strings,
46      * if they were meant to be identical.
47      */

48     private int mismatch;
49
50     /**
51      * Calculates the mismatch between two strings.
52      * @param string1 first string
53      * @param string2 second string
54      * @return The number of mismatches,this is the minimum number of typo's
55      * necessary to account for the differences between the two strings,
56      * if they were meant to be identical.
57      */

58     public static int getMismatch(String JavaDoc string1, String JavaDoc string2) {
59         return new FuzzyStringMatcher(string1, string2).getMismatch();
60     }
61
62     /**
63      * Calculates the match rate, a value between 0 and 1, proportional
64      * to the rate the two strings match (1 is exact match).
65      * This is calculated as
66      * 1 - (mismatch/max(string1.length(), string2.length())).
67      * @param string1 first string
68      * @param string2 second string
69      * @return The match rate.
70      */

71     public static float getMatchRate(String JavaDoc string1, String JavaDoc string2) {
72         return new FuzzyStringMatcher(string1, string2).getMatchRate();
73     }
74
75     /**
76      * Creates normalized title.
77      * e.g. all non-alphanumeric characters replaced by white space, all characters converted
78      * to lowercase non-diacritical characters, and all white space
79      * sequences contracted to a single white space character.
80      * This is a convenience method, provided to make string comparison
81      * easier by removing (more or less) arbitrary differences.
82      *
83      * @param str The original title.
84      * @return The normalized title.
85      */

86     public static String JavaDoc normalizeString(String JavaDoc str) {
87         StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
88         char[] chars = str.toCharArray();
89         boolean whiteSpace = false;
90
91         for (int i = 0; i < chars.length; i++) {
92             char ch = chars[i];
93             if (ch == '&') { // Ampersand
94
sb.append(ch);
95             } else if (ch < 0x0030) {
96                 if (!whiteSpace) {
97                     sb.append(" ");
98                     whiteSpace = true;
99                 }
100             } else if (ch < 0x003a) { // Digits.
101
sb.append(ch);
102                 whiteSpace = false;
103             } else if (ch < 0x0041) {
104                 if (!whiteSpace) {
105                     sb.append(" ");
106                     whiteSpace = true;
107                 }
108             } else if (ch < 0x005b) { // Upper case letters.
109
sb.append((char) (ch + 32));
110                 whiteSpace = false;
111             } else if (ch < 0x061) {
112                 if (!whiteSpace) {
113                     sb.append(" ");
114                     whiteSpace = true;
115                 }
116             } else if (ch < 0x007b) { // Lower case letters.
117
sb.append(ch);
118                 whiteSpace = false;
119             } else if (ch < 0x0c0) {
120                 if (!whiteSpace) {
121                     sb.append(" ");
122                     whiteSpace = true;
123                 }
124             } else if (ch == 0xf7) {
125                 if (!whiteSpace) {
126                     sb.append(" ");
127                     whiteSpace = true;
128                 }
129             } else {
130                 sb.append(NORMALIZE_REPLACEMENTS[ch-0x0c0]);
131                 whiteSpace = false;
132             }
133         }
134         return sb.toString();
135     }
136
137     /**
138      * Creates new FuzzyStringMatcher and executes the matching routine.
139      * @param string1 first string
140      * @param string2 second string
141      */

142     private FuzzyStringMatcher(String JavaDoc string1, String JavaDoc string2) {
143
144         // Convert strings to char arrays in order to speed up access
145
// to the individual characters.
146
// Tests confirm a slight performance gain over using plain
147
// String objects.
148
chars1 = string1.toCharArray();
149         chars2 = string2.toCharArray();
150
151         // Store the string lengths, instead calling length() repeatedly.
152
length1 = string1.length();
153         length2 = string2.length();
154
155         // Initialize the cache. It is dimensioned to match the range
156
// of input parameters of the calculateMismatch method.
157
resultsCache = new Integer JavaDoc[length1 + 1][length2 + 1];
158
159         // Execute the matching routine.
160
mismatch = calculateMismatch(0, 0);
161
162         // Make the resultsCache eligible for garbage collection.
163
resultsCache = null;
164     }
165
166
167     /**
168      * Get the result of the matching. This is calculated on creation of
169      * this instance, so it is not calculated again when this method
170      * is called.
171      * @return The number of mismatches,this is the minimum number of typo's
172      * necessary to account for the differences between the two strings,
173      * if they were meant to be identical.
174      */

175     private int getMismatch() {
176         return mismatch;
177     }
178
179     /**
180      * Get the match rate, a value between 0 and 1, proportional
181      * to the rate the two strings match.
182      * This is calculated as
183      * 1 - (mismatch/max(string1.length(), string2.length())).
184      * @return the match rate.
185      */

186     private float getMatchRate() {
187         return 1 - (mismatch / (float) Math.max(length1, length2));
188     }
189
190     /**
191      * Calculate the mismatch between a substring of string1
192      * and a substring of string2. This method calls itself
193      * recursively.
194      * @param i1Start start index in substring of string1
195      * @param i2Start start index in substring of string2
196      * @return the mismatch between the substrings.
197      */

198     private int calculateMismatch(int i1Start, int i2Start) {
199
200         // Retreive the result from the cache if possible.
201
if (resultsCache[i1Start][i2Start] != null) {
202             return resultsCache[i1Start][i2Start].intValue();
203         }
204
205         int mismatch = 0;
206
207         int i1 = i1Start;
208         int i2 = i2Start;
209         while (i1 < length1 && i2 < length2) {
210             if (chars1[i1] == chars2[i2]) {
211
212                 // Characters at current position match,
213
// so proceed to next position.
214
i1++;
215                 i2++;
216             } else {
217
218                 // Characters at current position don't match.
219
mismatch++;
220
221                 // Calculate the mismatch in the remaining substrings,
222
// based on 3 different assumptions on how the mismatch
223
// at the current position was produced:
224
// 1) Assume a character was ommitted in string1.
225
// (Or, equivalently: a character was added in string2.)
226
int m1 = calculateMismatch(i1, i2 + 1);
227
228                 // 2) Assume a wrong character was substituted
229
// in one of the strings.
230
int m2 = calculateMismatch(i1 + 1, i2);
231
232                 // 3) Assume a character was ommitted in string2.
233
// (Or, equivalently: a character was added in string1.)
234
int m3 = calculateMismatch(i1 + 1, i2 + 1);
235
236                 // Of these three, adopt the one that produces the
237
// smallest mismatch; adjust the mismatch accordingly.
238
if (m1 < m2) {
239                     if (m1 < m3) {
240                         mismatch += m1;
241                     } else {
242                         mismatch += m3;
243                     }
244                 } else {
245                     if (m2 < m3) {
246                         mismatch += m2;
247                     } else {
248                         mismatch += m3;
249                     }
250                 }
251
252                 // Store the result in the cache and return it.
253
resultsCache[i1Start][i2Start] = new Integer JavaDoc(mismatch);
254                 return mismatch;
255             }
256         }
257
258         // The end of one of the strings has been reached.
259
// Each remaining character in the other string is a mismatch.
260
if (length1 == i1) {
261
262             // The end of string1 is reached, add the remaining
263
// number of characters in string2 to the mismatch.
264
mismatch += (length2 - i2);
265         } else {
266
267             // The end of string2 is reached, add the remaining
268
// number of characters in string1 to the mismatch.
269
mismatch += (length1 - i1);
270         }
271
272         // Store the result in the cacht and return it.
273
resultsCache[i1Start][i2Start] = new Integer JavaDoc(mismatch);
274         return mismatch;
275     }
276
277 // // for testing only
278
// private static void test(String s1, String s2) {
279
// float result = 0;
280
// FuzzyStringMatcher sm = null;
281
// long startTime = System.currentTimeMillis();
282
// for (int i = 0; i < 10000; i++) {
283
// result = getMismatch(s1, s2);
284
// }
285
// long timeNeeded = System.currentTimeMillis() - startTime;
286
// System.out.println("mismatch: " + result
287
// + ", in " + timeNeeded/10000f + "ms.");
288
// }
289
//
290
// // for testing only
291
// public static void main(String[] args) {
292
// test(args[0], args[1]);
293
// }
294

295 }
296
Popular Tags