KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > SnowMailClient > utils > TextSearch > EditDistance


1 package SnowMailClient.utils.TextSearch;
2
3 import java.util.*;
4
5
6 /** Character edit-distance: find the edit distance (also called Levenstein distance)
7     between two strings.
8     contain the edit distance and the editex variant (combine edit and soundex phonix)
9 */

10 public final class EditDistance
11 {
12   private EditDistance()
13   {
14   }
15
16
17   /** zero if a and b are identical 1 otherwise
18   *
19   static int redit(int a, int b)
20   {
21      if(a==b) return 0;
22      return 1;
23   }*/

24
25   /** same as r(a,b) zero if a and b are identical 1 if a and b in the same group, 2 otherwise
26   */

27   static int reditexEnglish(int a, int b)
28   {
29      if(a==b) return 0;
30      if((editexGroupENGLISH((char)a) & editexGroupENGLISH((char)b))!=0) return 1;
31      return 2;
32   }
33
34   static int d(int a, int b)
35   {
36      if(a=='H' && b =='W' && a!=b) return 1;
37      if(a=='W' && b =='H' && a!=b) return 1;
38      return reditexEnglish(a,b);
39   }
40
41   /** @return the editex group.
42     Caution: some letters appears twice.
43     use bitwise operator and to determine matchings
44     group1 & group2 is not zero if class match
45   */

46   private static int editexGroupENGLISH(char c)
47   {
48      int rep=0;
49      if("AEIOUY".indexOf(c)!=-1) rep += 1;
50      if("BP".indexOf(c)!=-1) rep += 2;
51      if("CKQ".indexOf(c)!=-1) rep += 4;
52      if("DT".indexOf(c)!=-1) rep += 8;
53      if("LR".indexOf(c)!=-1) rep += 16;
54      if("MN".indexOf(c)!=-1) rep += 32;
55      if("GJ".indexOf(c)!=-1) rep += 64;
56      if("FPV".indexOf(c)!=-1) rep += 128;
57      if("SXZ".indexOf(c)!=-1) rep += 256;
58      if("CSZ".indexOf(c)!=-1) rep += 512;
59      return rep;
60   }
61
62
63   /** combine edit and soundex
64       w1 must be already uppercased
65   */

66   public static int editexDistanceEnglish(String JavaDoc w1,String JavaDoc w2, int tolerance)
67   {
68     // boost??
69
if(Math.abs(w1.length()-w2.length())>tolerance) return tolerance+1; // refused
70

71     return editexEnglish(w1.length(), w2.length(),w1,w2.toUpperCase());
72   }
73
74
75   /** combine edit and soundex for English
76   */

77   static int editexEnglish(int i, int j, final String JavaDoc w1, final String JavaDoc w2)
78   {
79      if(i==0 && j==0) return 0;
80      if(i==0) return editexEnglish(0,j-1,w1,w2)+(j<w2.length()?d(w2.charAt(j-1),w2.charAt(j)):0);
81      if(j==0) return editexEnglish(i-1,0,w1,w2)+(i<w1.length()?d(w1.charAt(i-1),w1.charAt(i)):0);
82      int p1 = editexEnglish(i-1,j-1,w1,w2) + reditexEnglish(w1.charAt(i-1), w2.charAt(j-1));
83      if(p1==0) return 0;
84      int p2 = editexEnglish(i-1,j,w1,w2)+(i<w1.length()?d(w1.charAt(i-1),w1.charAt(i)):0);
85      int p3 = editexEnglish(i,j-1,w1,w2)+(j<w2.length()?d(w2.charAt(j-1),w2.charAt(j)):0);
86
87      return Math.min(
88         Math.min(p2, p3),
89                  p1);
90   }
91
92   private static int min (int a, int b, int c)
93   {
94     int mi = a;
95     if (b < mi)
96     {
97       mi = b;
98     }
99     if (c < mi)
100     {
101       mi = c;
102     }
103     return mi;
104   }
105
106   /** @return the edit distance, also called Levenshtein distance.
107      is the number of inserts, deletes to transform string s into string t
108   */

109   public static int editDistance(String JavaDoc s, String JavaDoc t, int maxDistance)
110   {
111     int n = s.length();
112     int m = t.length();
113     if (n == 0) return m;
114     if (m == 0) return n;
115     if(n-m>maxDistance) return n-m; // Boost for great unequal length
116
if(m-n>maxDistance) return m-n;
117
118     int d[][] = new int[n+1][m+1];
119
120     for (int i = 0; i <= n; i++) d[i][0] = i;
121     for (int j = 0; j <= m; j++) d[0][j] = j;
122
123     for (int i = 1; i <= n; i++)
124     {
125       char s_i = s.charAt(i-1);
126       for (int j = 1; j <= m; j++)
127       {
128          d[i][j] = min (d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1] + (s_i==t.charAt(j-1)?0:1));
129       }
130     }
131
132     return d[n][m];
133   }
134
135
136   public static String JavaDoc getRandomString(int len)
137   {
138      StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
139      for(int i=0; i<len; i++)
140      {
141         sb.append((char) ('A'+((int)(Math.random()*25))));
142      }
143      return sb.toString();
144   }
145
146   public static void main(String JavaDoc[] aaa)
147   {
148      System.out.println( editDistance("JOERG","JEORG",6));
149      System.out.println( editDistance("WATER","WINE",6));
150      System.out.println( editDistance("WIINE","WINE",6));
151      System.out.println(getRandomString(12));
152      System.out.println(getRandomString(6));
153
154
155      // 10 sec for strings gen and 10 sec for distance comp for 2 milion iterations on a P1000/NT4/Jdk1.3.1
156
long t0 = new Date().getTime();
157      int n = 2000000;
158      for(int i=0; i<n; i++)
159      {
160         int d;
161         String JavaDoc s1 = getRandomString( (int) (2+Math.random()*8) );
162         String JavaDoc s2 = getRandomString( (int) (2+Math.random()*8) );
163         d = editDistance(s1,s2,2);
164      }
165      long t1 = new Date().getTime();
166      System.out.println("time for " + n +" edit distances = "+(t1-t0)/1000.0+ " s");
167
168 /* System.out.println( reditex('A','E'));
169      System.out.println( reditex('S','X'));
170      System.out.println( reditex('Z','S'));
171      System.out.println( reditex('S','A'));
172      System.out.println( reditex('A','A'));*/

173
174   }
175 }
Popular Tags