KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > python > modules > ucnhash


1 // Copyright 1998 Finn Bock.
2

3
4 package org.python.modules;
5
6 import java.io.*;
7 import org.python.core.*;
8
9 public class ucnhash implements ucnhashAPI {
10
11     // Parameters for the word hash.
12
private static int n;
13     private static int m;
14     private static int minchar;
15     private static int maxchar;
16     private static int alphasz;
17     private static int maxlen;
18     private static int maxidx;
19     private static int maxklen;
20
21     private static short[] G;
22     private static short[] T0;
23     private static short[] T1;
24     private static short[] T2;
25
26     // Map the hashed values into the text (as bytes).
27
private static byte[] worddata;
28     private static short[] wordoffs;
29
30     // wordindex greate then cutoff is stored as into two bytes.
31
private static short wordstart;
32     private static short wordcutoff;
33
34     // The raw data and indexes into start of each name
35
// The rawindex is sorted based on the wordindexes.
36
private static byte[] rawdata;
37     private static char[] rawindex;
38
39     // The mapping from raw data index to unicode code points.
40
private static char[] codepoint;
41
42
43
44
45     public static String JavaDoc[] __depends__ = new String JavaDoc[] {
46         "/org/python/modules/ucnhash.dat",
47     };
48
49
50     public static void loadTables() throws Exception JavaDoc {
51         InputStream instream = ucnhash.class.
52                                     getResourceAsStream("ucnhash.dat");
53         if (instream == null)
54             throw new IOException("Unicode name database not found: " +
55                                   "ucnhash.dat");
56
57         DataInputStream in = new DataInputStream(
58                                  new BufferedInputStream(instream));
59
60         n = in.readShort();
61         m = in.readShort();
62         minchar= in.readShort();
63         maxchar = in.readShort();
64         alphasz = in.readShort();
65         maxlen = in.readShort();
66         maxidx = maxlen*alphasz-minchar;
67
68         G = readShortTable(in);
69         if (in.readShort() != 3)
70             throw new IOException("UnicodeNameMap file corrupt, " +
71                                   "unknown dimension");
72
73         T0 = readShortTable(in);
74         T1 = readShortTable(in);
75         T2 = readShortTable(in);
76
77         wordoffs = readShortTable(in);
78         worddata = readByteTable(in);
79
80         wordstart = in.readShort();
81         wordcutoff = in.readShort();
82         maxklen = in.readShort();
83
84         rawdata = readByteTable(in);
85         rawindex = readCharTable(in);
86         codepoint = readCharTable(in);
87     }
88
89
90     private static short[] readShortTable(DataInputStream in)
91         throws IOException
92     {
93         if (in.read() != 't')
94             throw new IOException("UnicodeNameMap file corrupt, shorttable");
95
96         int n = in.readUnsignedShort() / 2;
97         short[] table = new short[n];
98         for (int i = 0; i < n; i++) {
99             table[i] = in.readShort();
100         }
101         return table;
102     }
103
104     private static char[] readCharTable(DataInputStream in)
105         throws IOException
106     {
107         if (in.read() != 't')
108             throw new IOException("UnicodeNameMap file corrupt, chartable");
109
110         int n = in.readUnsignedShort() / 2;
111         char[] table = new char[n];
112         for (int i = 0; i < n; i++) {
113             table[i] = in.readChar();
114         }
115         return table;
116     }
117
118     private static byte[] readByteTable(DataInputStream in)
119         throws IOException
120     {
121         if (in.read() != 't')
122             throw new IOException("UnicodeNameMap file corrupt, byte table");
123         int n = in.readUnsignedShort();
124         byte[] table = new byte[n];
125         in.readFully(table);
126         return table;
127     }
128
129
130     public static int hash(String JavaDoc key) {
131         return hash(key, 0, key.length());
132     }
133
134     public static int hash(String JavaDoc key, int start, int end) {
135         int i, j;
136         int f0, f1, f2;
137
138         for (j = start, i=-minchar, f0=f1=f2=0; j < end; j++) {
139             char ch = key.charAt(j);
140             if (ch >= 'a' && ch <= 'z')
141                 ch = (char) (ch - 'a' + 'A');
142             f0 += T0[i + ch];
143             f1 += T1[i + ch];
144             f2 += T2[i + ch];
145             i += alphasz;
146             if (i >= maxidx)
147                 i = -minchar;
148         }
149
150         f0 %= n;
151         f1 %= n;
152         f2 %= n;
153
154         return (G[f0] + G[f1] + G[f2]) % m;
155     }
156
157
158     private static final char[] charmap =
159            " ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-".toCharArray();
160
161     private static String JavaDoc getWord(int idx) {
162        int offset = wordoffs[idx];
163        int end = worddata.length;
164        if (idx < wordoffs.length-1)
165            end = wordoffs[idx+1];
166        StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
167        for (int i = offset; i < end; i++)
168            buf.append(charmap[worddata[i]]);
169        return buf.toString();
170     }
171
172
173     private static boolean match(int idx, byte[] raw, int begin, int end) {
174        int woff = wordoffs[idx];
175        int wend = worddata.length;
176        if (idx < wordoffs.length-1)
177            wend = wordoffs[idx+1];
178
179        if (end-begin != wend - woff)
180            return false;
181        int l = end-begin;
182        for (int i = 0; i < l; i++) {
183            if (worddata[woff + i] != raw[begin + i])
184                return false;
185        }
186        return true;
187     }
188
189
190
191     private static int compare(byte[] a1, int off1, int len1,
192                                byte[] a2, int off2, int len2)
193     {
194         for (int i = 0; i < len1 && i < len2; i++) {
195             int d = (a1[off1 + i] & 0xFF) - (a2[off2 + i] & 0xFF);
196             if (d != 0)
197                 return d;
198         }
199         return len1 - len2;
200     }
201
202
203     private static int binarysearch(byte[] rawlist, int start, int end) {
204         int floor = 0;
205         int ceiling = (rawindex.length) / 5;
206
207         while (floor < ceiling - 1) {
208            int middle = (floor + ceiling) / 2;
209            if (debug)
210                System.out.println("floor:" + floor + " ceiling:" +
211                                   ceiling +" => " + middle);
212
213            int off = rawindex[middle*5];
214            int len = rawindex[middle*5+4] & 0x1F;
215            int d = compare(rawlist, start, end - start, rawdata, off, len);
216            if (d < 0)
217                ceiling = middle;
218            else if (d > 0)
219                floor = middle;
220            else
221                return middle * 12;
222         }
223
224         int tmp = floor*5;
225
226         int off = rawindex[tmp++];
227         long lengths = ((long) rawindex[tmp++] << 48) |
228                        ((long) rawindex[tmp++] << 32) |
229                        ((long) rawindex[tmp++] << 16) |
230                        ((long) rawindex[tmp++]);
231
232         floor *= 12;
233         for (int i = 0; i < 12; i++) {
234             int len = (int) (lengths >> (i * 5)) & 0x1F;
235             if (compare(rawlist, start, end, rawdata, off, len) == 0)
236                 return floor;
237             off += len;
238             floor++;
239         }
240         return -1;
241     }
242
243     public static int lookup(String JavaDoc name) {
244         return lookup(name, 0, name.length());
245     }
246
247
248     private static int lookup(String JavaDoc name, int start, int end) {
249
250         byte[] rawlist = new byte[32];
251         int ridx = 0;
252         int rbegin = 0;
253         int rstart = 0;
254
255         int i;
256         while (true) {
257             rbegin = ridx;
258             int begin = start;
259             for (i = start; i < end; i++) {
260                 char ch = name.charAt(i);
261                 if (ch == ' ') {
262                     start = i+1;
263                     break;
264                 }
265                 int v;
266                 if (ch >= 'a' && ch <= 'z')
267                     ch = (char) (ch - 'a' + 'A');
268                 if (ch >= 'A' && ch <= 'Z')
269                     v = ch - 'A' + 1;
270                 else if (ch >= '0' && ch <= '9')
271                     v = ch - '0' + 27;
272                 else if (ch == '-')
273                     v = 37;
274                 else
275                     return -1;
276
277                 rawlist[ridx++] = (byte) v;
278                 if (ch == '-' && start != i) {
279                     start = ++i;
280                     break;
281                 }
282             }
283
284             int hash = hash(name, begin, i);
285
286             if (debug)
287                 System.out.println(name.substring(begin, i) + " " + hash);
288
289             boolean isWord = hash >= 0 &&
290                              ridx - rbegin > 1 &&
291                              match(hash, rawlist, rbegin, ridx);
292
293             if (isWord) {
294                 if (debug)
295                     System.out.println("match " + getWord(hash));
296                 hash += wordstart;
297                 ridx = rstart;
298                 if (hash > wordcutoff) {
299                     rawlist[ridx++] = (byte) ((hash >> 8) + wordcutoff);
300                     rawlist[ridx++] = (byte) (hash & 0xFF);
301                 } else
302                    rawlist[ridx++] = (byte) hash;
303             }
304             rstart = ridx;
305
306             if (i >= end)
307                break;
308
309             if (!isWord) {
310                 rawlist[ridx++] = 0;
311             }
312
313         }
314
315         if (debug) {
316             System.out.print("rawdata: ");
317             for (int k = 0; k < ridx; k++)
318                 System.out.print((rawlist[k] & 0xFF) + " ");
319             System.out.println();
320         }
321
322         int idx = binarysearch(rawlist, 0, ridx);
323         if (idx < 0)
324             return idx;
325         if (debug) {
326             System.out.println("idx:" + idx);
327             System.out.println("codepoint:" + codepoint[idx] + " " +
328                                Integer.toHexString((int)codepoint[idx]));
329         }
330         return codepoint[idx];
331     }
332
333
334     // From the ucnhashAPI interface
335
public int getCchMax() {
336         if (!initialized())
337            return -1;
338         return maxklen;
339     }
340
341
342
343     private static String JavaDoc cjkPrefix = "CJK COMPATIBILITY IDEOGRAPH-";
344     private static int cjkPrefixLen = cjkPrefix.length();
345
346     // From the ucnhashAPI interface
347
public int getValue(String JavaDoc s, int start, int end) {
348         if (!initialized())
349             return -1;
350
351         if (s.regionMatches(start, cjkPrefix, 0, cjkPrefixLen)) {
352             try {
353                String JavaDoc hex = s.substring(start + cjkPrefixLen, end);
354                int v = Integer.parseInt(hex, 16);
355                return v;
356             } catch (NumberFormatException JavaDoc exc) {
357                return -1; // Maybe fallthrough to the main algorithme.
358
}
359         }
360
361         return lookup(s, start, end);
362     }
363
364
365     private static boolean initialized = false;
366     private static boolean loaded = false;
367
368
369     private synchronized boolean initialized() {
370         if (initialized && loaded)
371             return true;
372         if (initialized)
373             return false;
374         try {
375             loadTables();
376             loaded = true;
377         } catch (Exception JavaDoc exc) {
378             return false;
379         }
380         initialized = true;
381         return true;
382     }
383
384     private static boolean debug = false;
385
386
387     public static void main(String JavaDoc[] args) throws Exception JavaDoc {
388        loadTables();
389
390        debug = true;
391
392 /*
393        System.out.println(getWord(hash("ARABIC")));
394        System.out.println(getWord(hash("SMALL")));
395        System.out.println(getWord(hash("YI")));
396        System.out.println(getWord(hash("SYLLABLE")));
397        System.out.println(getWord(hash("WITH")));
398        System.out.println(getWord(hash("LETTER")));
399
400        System.out.println(lookup("NULL"));
401        System.out.println(lookup("LATIN CAPITAL LETTER AFRICAN D"));
402        System.out.println(lookup("GURMUKHI TIPPI"));
403        System.out.println(lookup("TIBETAN MARK GTER YIG MGO -UM " +
404                                  "RNAM BCAD MA"));
405        System.out.println(lookup("HANGUL CHOSEONG PIEUP"));
406        System.out.println(lookup("SINGLE LOW-9 QUOTATION MARK"));
407 */

408
409        System.out.println(lookup("BACKSPACE"));
410 // System.out.println(lookup("ACTIVATE SYMMETRIC SWAPPING"));
411

412 /*
413        System.out.println(lookup("LATIN CAPITAL LETTER A"));
414        System.out.println(lookup("GREATER-THAN SIGN"));
415        System.out.println(lookup("EURO-CURRENCY SIGN"));
416 */

417     }
418 }
419
Popular Tags