1 10 11 package org.mmbase.applications.xmlimporter; 12 13 19 public class FuzzyStringMatcher { 20 21 22 private final static String [] NORMALIZE_REPLACEMENTS = { 23 "a", "a", "a", "a", "a", "a", "ae", "c", "e", "e", "e", "e", "i", "i", "i", "i", "d", "n", "o", "o", "o", "o", "o", "x", "o", "u", "u", "u", "u", "y", "b", "s", "a", "a", "a", "a", "a", "a", "ae", "c", "e", "e", "e", "e", "i", "i", "i", "i", "o", "n", "o", "o", "o", "o", "o", " ", "o", "u", "u", "u", "u", "y", "b", "y" }; 32 33 34 private char[] chars1, chars2; 35 36 37 private int length1, length2; 38 39 42 private Integer [][] resultsCache; 43 44 48 private int mismatch; 49 50 58 public static int getMismatch(String string1, String string2) { 59 return new FuzzyStringMatcher(string1, string2).getMismatch(); 60 } 61 62 71 public static float getMatchRate(String string1, String string2) { 72 return new FuzzyStringMatcher(string1, string2).getMatchRate(); 73 } 74 75 86 public static String normalizeString(String str) { 87 StringBuffer sb = new StringBuffer (); 88 char[] chars = str.toCharArray(); 89 boolean whiteSpace = false; 90 91 for (int i = 0; i < chars.length; i++) { 92 char ch = chars[i]; 93 if (ch == '&') { sb.append(ch); 95 } else if (ch < 0x0030) { 96 if (!whiteSpace) { 97 sb.append(" "); 98 whiteSpace = true; 99 } 100 } else if (ch < 0x003a) { sb.append(ch); 102 whiteSpace = false; 103 } else if (ch < 0x0041) { 104 if (!whiteSpace) { 105 sb.append(" "); 106 whiteSpace = true; 107 } 108 } else if (ch < 0x005b) { sb.append((char) (ch + 32)); 110 whiteSpace = false; 111 } else if (ch < 0x061) { 112 if (!whiteSpace) { 113 sb.append(" "); 114 whiteSpace = true; 115 } 116 } else if (ch < 0x007b) { sb.append(ch); 118 whiteSpace = false; 119 } else if (ch < 0x0c0) { 120 if (!whiteSpace) { 121 sb.append(" "); 122 whiteSpace = true; 123 } 124 } else if (ch == 0xf7) { 125 if (!whiteSpace) { 126 sb.append(" "); 127 whiteSpace = true; 128 } 129 } else { 130 sb.append(NORMALIZE_REPLACEMENTS[ch-0x0c0]); 131 whiteSpace = false; 132 } 133 } 134 return sb.toString(); 135 } 136 137 142 private FuzzyStringMatcher(String string1, String string2) { 143 144 chars1 = string1.toCharArray(); 149 chars2 = string2.toCharArray(); 150 151 length1 = string1.length(); 153 length2 = string2.length(); 154 155 resultsCache = new Integer [length1 + 1][length2 + 1]; 158 159 mismatch = calculateMismatch(0, 0); 161 162 resultsCache = null; 164 } 165 166 167 175 private int getMismatch() { 176 return mismatch; 177 } 178 179 186 private float getMatchRate() { 187 return 1 - (mismatch / (float) Math.max(length1, length2)); 188 } 189 190 198 private int calculateMismatch(int i1Start, int i2Start) { 199 200 if (resultsCache[i1Start][i2Start] != null) { 202 return resultsCache[i1Start][i2Start].intValue(); 203 } 204 205 int mismatch = 0; 206 207 int i1 = i1Start; 208 int i2 = i2Start; 209 while (i1 < length1 && i2 < length2) { 210 if (chars1[i1] == chars2[i2]) { 211 212 i1++; 215 i2++; 216 } else { 217 218 mismatch++; 220 221 int m1 = calculateMismatch(i1, i2 + 1); 227 228 int m2 = calculateMismatch(i1 + 1, i2); 231 232 int m3 = calculateMismatch(i1 + 1, i2 + 1); 235 236 if (m1 < m2) { 239 if (m1 < m3) { 240 mismatch += m1; 241 } else { 242 mismatch += m3; 243 } 244 } else { 245 if (m2 < m3) { 246 mismatch += m2; 247 } else { 248 mismatch += m3; 249 } 250 } 251 252 resultsCache[i1Start][i2Start] = new Integer (mismatch); 254 return mismatch; 255 } 256 } 257 258 if (length1 == i1) { 261 262 mismatch += (length2 - i2); 265 } else { 266 267 mismatch += (length1 - i1); 270 } 271 272 resultsCache[i1Start][i2Start] = new Integer (mismatch); 274 return mismatch; 275 } 276 277 295 } 296 | Popular Tags |