1 3 4 package org.python.modules; 5 6 import java.io.*; 7 import org.python.core.*; 8 9 public class ucnhash implements ucnhashAPI { 10 11 private static int n; 13 private static int m; 14 private static int minchar; 15 private static int maxchar; 16 private static int alphasz; 17 private static int maxlen; 18 private static int maxidx; 19 private static int maxklen; 20 21 private static short[] G; 22 private static short[] T0; 23 private static short[] T1; 24 private static short[] T2; 25 26 private static byte[] worddata; 28 private static short[] wordoffs; 29 30 private static short wordstart; 32 private static short wordcutoff; 33 34 private static byte[] rawdata; 37 private static char[] rawindex; 38 39 private static char[] codepoint; 41 42 43 44 45 public static String [] __depends__ = new String [] { 46 "/org/python/modules/ucnhash.dat", 47 }; 48 49 50 public static void loadTables() throws Exception { 51 InputStream instream = ucnhash.class. 52 getResourceAsStream("ucnhash.dat"); 53 if (instream == null) 54 throw new IOException("Unicode name database not found: " + 55 "ucnhash.dat"); 56 57 DataInputStream in = new DataInputStream( 58 new BufferedInputStream(instream)); 59 60 n = in.readShort(); 61 m = in.readShort(); 62 minchar= in.readShort(); 63 maxchar = in.readShort(); 64 alphasz = in.readShort(); 65 maxlen = in.readShort(); 66 maxidx = maxlen*alphasz-minchar; 67 68 G = readShortTable(in); 69 if (in.readShort() != 3) 70 throw new IOException("UnicodeNameMap file corrupt, " + 71 "unknown dimension"); 72 73 T0 = readShortTable(in); 74 T1 = readShortTable(in); 75 T2 = readShortTable(in); 76 77 wordoffs = readShortTable(in); 78 worddata = readByteTable(in); 79 80 wordstart = in.readShort(); 81 wordcutoff = in.readShort(); 82 maxklen = in.readShort(); 83 84 rawdata = readByteTable(in); 85 rawindex = readCharTable(in); 86 codepoint = readCharTable(in); 87 } 88 89 90 private static short[] readShortTable(DataInputStream in) 91 throws IOException 92 { 93 if (in.read() != 't') 94 throw new IOException("UnicodeNameMap file corrupt, shorttable"); 95 96 int n = in.readUnsignedShort() / 2; 97 short[] table = new short[n]; 98 for (int i = 0; i < n; i++) { 99 table[i] = in.readShort(); 100 } 101 return table; 102 } 103 104 private static char[] readCharTable(DataInputStream in) 105 throws IOException 106 { 107 if (in.read() != 't') 108 throw new IOException("UnicodeNameMap file corrupt, chartable"); 109 110 int n = in.readUnsignedShort() / 2; 111 char[] table = new char[n]; 112 for (int i = 0; i < n; i++) { 113 table[i] = in.readChar(); 114 } 115 return table; 116 } 117 118 private static byte[] readByteTable(DataInputStream in) 119 throws IOException 120 { 121 if (in.read() != 't') 122 throw new IOException("UnicodeNameMap file corrupt, byte table"); 123 int n = in.readUnsignedShort(); 124 byte[] table = new byte[n]; 125 in.readFully(table); 126 return table; 127 } 128 129 130 public static int hash(String key) { 131 return hash(key, 0, key.length()); 132 } 133 134 public static int hash(String key, int start, int end) { 135 int i, j; 136 int f0, f1, f2; 137 138 for (j = start, i=-minchar, f0=f1=f2=0; j < end; j++) { 139 char ch = key.charAt(j); 140 if (ch >= 'a' && ch <= 'z') 141 ch = (char) (ch - 'a' + 'A'); 142 f0 += T0[i + ch]; 143 f1 += T1[i + ch]; 144 f2 += T2[i + ch]; 145 i += alphasz; 146 if (i >= maxidx) 147 i = -minchar; 148 } 149 150 f0 %= n; 151 f1 %= n; 152 f2 %= n; 153 154 return (G[f0] + G[f1] + G[f2]) % m; 155 } 156 157 158 private static final char[] charmap = 159 " ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-".toCharArray(); 160 161 private static String getWord(int idx) { 162 int offset = wordoffs[idx]; 163 int end = worddata.length; 164 if (idx < wordoffs.length-1) 165 end = wordoffs[idx+1]; 166 StringBuffer buf = new StringBuffer (); 167 for (int i = offset; i < end; i++) 168 buf.append(charmap[worddata[i]]); 169 return buf.toString(); 170 } 171 172 173 private static boolean match(int idx, byte[] raw, int begin, int end) { 174 int woff = wordoffs[idx]; 175 int wend = worddata.length; 176 if (idx < wordoffs.length-1) 177 wend = wordoffs[idx+1]; 178 179 if (end-begin != wend - woff) 180 return false; 181 int l = end-begin; 182 for (int i = 0; i < l; i++) { 183 if (worddata[woff + i] != raw[begin + i]) 184 return false; 185 } 186 return true; 187 } 188 189 190 191 private static int compare(byte[] a1, int off1, int len1, 192 byte[] a2, int off2, int len2) 193 { 194 for (int i = 0; i < len1 && i < len2; i++) { 195 int d = (a1[off1 + i] & 0xFF) - (a2[off2 + i] & 0xFF); 196 if (d != 0) 197 return d; 198 } 199 return len1 - len2; 200 } 201 202 203 private static int binarysearch(byte[] rawlist, int start, int end) { 204 int floor = 0; 205 int ceiling = (rawindex.length) / 5; 206 207 while (floor < ceiling - 1) { 208 int middle = (floor + ceiling) / 2; 209 if (debug) 210 System.out.println("floor:" + floor + " ceiling:" + 211 ceiling +" => " + middle); 212 213 int off = rawindex[middle*5]; 214 int len = rawindex[middle*5+4] & 0x1F; 215 int d = compare(rawlist, start, end - start, rawdata, off, len); 216 if (d < 0) 217 ceiling = middle; 218 else if (d > 0) 219 floor = middle; 220 else 221 return middle * 12; 222 } 223 224 int tmp = floor*5; 225 226 int off = rawindex[tmp++]; 227 long lengths = ((long) rawindex[tmp++] << 48) | 228 ((long) rawindex[tmp++] << 32) | 229 ((long) rawindex[tmp++] << 16) | 230 ((long) rawindex[tmp++]); 231 232 floor *= 12; 233 for (int i = 0; i < 12; i++) { 234 int len = (int) (lengths >> (i * 5)) & 0x1F; 235 if (compare(rawlist, start, end, rawdata, off, len) == 0) 236 return floor; 237 off += len; 238 floor++; 239 } 240 return -1; 241 } 242 243 public static int lookup(String name) { 244 return lookup(name, 0, name.length()); 245 } 246 247 248 private static int lookup(String name, int start, int end) { 249 250 byte[] rawlist = new byte[32]; 251 int ridx = 0; 252 int rbegin = 0; 253 int rstart = 0; 254 255 int i; 256 while (true) { 257 rbegin = ridx; 258 int begin = start; 259 for (i = start; i < end; i++) { 260 char ch = name.charAt(i); 261 if (ch == ' ') { 262 start = i+1; 263 break; 264 } 265 int v; 266 if (ch >= 'a' && ch <= 'z') 267 ch = (char) (ch - 'a' + 'A'); 268 if (ch >= 'A' && ch <= 'Z') 269 v = ch - 'A' + 1; 270 else if (ch >= '0' && ch <= '9') 271 v = ch - '0' + 27; 272 else if (ch == '-') 273 v = 37; 274 else 275 return -1; 276 277 rawlist[ridx++] = (byte) v; 278 if (ch == '-' && start != i) { 279 start = ++i; 280 break; 281 } 282 } 283 284 int hash = hash(name, begin, i); 285 286 if (debug) 287 System.out.println(name.substring(begin, i) + " " + hash); 288 289 boolean isWord = hash >= 0 && 290 ridx - rbegin > 1 && 291 match(hash, rawlist, rbegin, ridx); 292 293 if (isWord) { 294 if (debug) 295 System.out.println("match " + getWord(hash)); 296 hash += wordstart; 297 ridx = rstart; 298 if (hash > wordcutoff) { 299 rawlist[ridx++] = (byte) ((hash >> 8) + wordcutoff); 300 rawlist[ridx++] = (byte) (hash & 0xFF); 301 } else 302 rawlist[ridx++] = (byte) hash; 303 } 304 rstart = ridx; 305 306 if (i >= end) 307 break; 308 309 if (!isWord) { 310 rawlist[ridx++] = 0; 311 } 312 313 } 314 315 if (debug) { 316 System.out.print("rawdata: "); 317 for (int k = 0; k < ridx; k++) 318 System.out.print((rawlist[k] & 0xFF) + " "); 319 System.out.println(); 320 } 321 322 int idx = binarysearch(rawlist, 0, ridx); 323 if (idx < 0) 324 return idx; 325 if (debug) { 326 System.out.println("idx:" + idx); 327 System.out.println("codepoint:" + codepoint[idx] + " " + 328 Integer.toHexString((int)codepoint[idx])); 329 } 330 return codepoint[idx]; 331 } 332 333 334 public int getCchMax() { 336 if (!initialized()) 337 return -1; 338 return maxklen; 339 } 340 341 342 343 private static String cjkPrefix = "CJK COMPATIBILITY IDEOGRAPH-"; 344 private static int cjkPrefixLen = cjkPrefix.length(); 345 346 public int getValue(String s, int start, int end) { 348 if (!initialized()) 349 return -1; 350 351 if (s.regionMatches(start, cjkPrefix, 0, cjkPrefixLen)) { 352 try { 353 String hex = s.substring(start + cjkPrefixLen, end); 354 int v = Integer.parseInt(hex, 16); 355 return v; 356 } catch (NumberFormatException exc) { 357 return -1; } 359 } 360 361 return lookup(s, start, end); 362 } 363 364 365 private static boolean initialized = false; 366 private static boolean loaded = false; 367 368 369 private synchronized boolean initialized() { 370 if (initialized && loaded) 371 return true; 372 if (initialized) 373 return false; 374 try { 375 loadTables(); 376 loaded = true; 377 } catch (Exception exc) { 378 return false; 379 } 380 initialized = true; 381 return true; 382 } 383 384 private static boolean debug = false; 385 386 387 public static void main(String [] args) throws Exception { 388 loadTables(); 389 390 debug = true; 391 392 408 409 System.out.println(lookup("BACKSPACE")); 410 412 417 } 418 } 419 | Popular Tags |