1 8 9 package org.python.core; 10 11 15 16 public class codecs { 17 private static char Py_UNICODE_REPLACEMENT_CHARACTER = 0xFFFD; 18 19 private static PyList searchPath = new PyList(); 20 private static PyStringMap searchCache = new PyStringMap(); 21 22 private static String default_encoding = "ascii"; 23 24 public static String getDefaultEncoding() { 25 return default_encoding; 26 } 27 28 public static void setDefaultEncoding(String encoding) { 29 lookup(encoding); 30 default_encoding = encoding; 31 } 32 33 public static void register(PyObject search_function) { 34 if (!search_function.isCallable()) 35 throw Py.TypeError("argument must be callable"); 36 searchPath.append(search_function); 37 } 38 39 40 public static PyTuple lookup(String encoding) { 41 import_encodings(); 42 PyString v = new PyString(normalizestring(encoding)); 43 PyObject result = searchCache.__finditem__(v); 44 if (result != null) 45 return (PyTuple)result; 46 47 if (searchPath.__len__() == 0) 48 throw new PyException(Py.LookupError, 49 "no codec search functions registered: " + 50 "can't find encoding"); 51 52 PyObject iter = searchPath.__iter__(); 53 PyObject func = null; 54 while ((func = iter.__iternext__()) != null) { 55 result = func.__call__(v); 56 if (result == Py.None) 57 continue; 58 if (!(result instanceof PyTuple) || result.__len__() != 4) 59 throw Py.TypeError("codec search functions must "+ 60 "return 4-tuples"); 61 break; 62 } 63 if (func == null) 64 throw new PyException(Py.LookupError, "unknown encoding " + 65 encoding); 66 searchCache.__setitem__(v, result); 67 return (PyTuple)result; 68 } 69 70 private static String normalizestring(String string) { 71 return string.toLowerCase().replace(' ', '-'); 72 } 73 74 75 private static boolean import_encodings_called = false; 76 77 private static void import_encodings() { 78 if (!import_encodings_called) { 79 import_encodings_called = true; 80 try { 81 __builtin__.__import__("encodings"); 82 } catch (PyException exc) { 83 if (exc.type != Py.ImportError) 84 throw exc; 85 } 86 } 87 } 88 89 90 91 public static String decode(PyString v, String encoding, 92 String errors) 93 { 94 if (encoding == null) 95 encoding = getDefaultEncoding(); 96 else 97 encoding = normalizestring(encoding); 98 if (errors != null) 99 errors = errors.intern(); 100 101 102 110 if (encoding.equals("ascii")) 111 return PyUnicode_DecodeASCII(v.toString(), 112 v.__len__(), errors); 113 114 115 PyObject decoder = getDecoder(encoding); 116 PyObject result = null; 117 if (errors != null) { 118 result = decoder.__call__(v, new PyString(errors)); 119 } else { 120 result = decoder.__call__(v); 121 } 122 123 if (!(result instanceof PyTuple) || result.__len__() != 2) 124 throw Py.TypeError("decoder must return a tuple " + 125 "(object,integer)"); 126 return result.__getitem__(0).toString(); 127 } 128 129 130 private static PyObject getDecoder(String encoding) { 131 PyObject codecs = lookup(encoding); 132 return codecs.__getitem__(1); 133 } 134 135 136 137 public static String encode(PyString v, String encoding, 138 String errors) 139 { 140 if (encoding == null) 141 encoding = getDefaultEncoding(); 142 else 143 encoding = normalizestring(encoding); 144 if (errors != null) 145 errors = errors.intern(); 146 147 148 155 156 if (encoding.equals("ascii")) 157 return PyUnicode_EncodeASCII(v.toString(), 158 v.__len__(), errors); 159 160 161 PyObject encoder = getEncoder(encoding); 162 PyObject result = null; 163 if (errors != null) { 164 result = encoder.__call__(v, new PyString(errors)); 165 } else { 166 result = encoder.__call__(v); 167 } 168 169 if (!(result instanceof PyTuple) || result.__len__() != 2) 170 throw Py.TypeError("encoder must return a tuple " + 171 "(object,integer)"); 172 return result.__getitem__(0).toString(); 173 } 174 175 private static PyObject getEncoder(String encoding) { 176 PyObject codecs = lookup(encoding); 177 return codecs.__getitem__(0); 178 } 179 180 181 182 private static byte utf8_code_length[] = { 183 185 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 186 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 187 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 188 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 189 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 190 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 191 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 192 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 195 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 196 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 197 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 198 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 199 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 200 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 201 }; 202 203 204 public static String PyUnicode_DecodeUTF8(String str, String errors) { 205 int size = str.length(); 206 StringBuffer unicode = new StringBuffer (size); 207 208 209 for (int i = 0; i < size; ) { 210 int ch = str.charAt(i); 211 if (ch > 0xFF) { 212 codecs.decoding_error("utf-8", unicode, errors, 213 "ordinal not in range(255)"); 214 i++; 215 continue; 216 } 217 218 if (ch < 0x80) { 219 unicode.append((char) ch); 220 i++; 221 continue; 222 } 223 224 int n = utf8_code_length[ch]; 225 226 if (i + n > size) { 227 codecs.decoding_error("utf-8", unicode, errors, 228 "unexpected end of data"); 229 i++; 230 continue; 231 } 232 233 234 switch (n) { 235 case 0: 236 codecs.decoding_error("utf-8", unicode, errors, 237 "unexpected code byte"); 238 i++; 239 continue; 240 case 1: 241 codecs.decoding_error("utf-8", unicode, errors, 242 "internal error"); 243 i++; 244 continue; 245 case 2: 246 char ch1 = str.charAt(i+1); 247 if ((ch1 & 0xc0) != 0x80) { 248 codecs.decoding_error("utf-8", unicode, errors, 249 "invalid data"); 250 i++; 251 continue; 252 } 253 ch = ((ch & 0x1f) << 6) + (ch1 & 0x3f); 254 if (ch < 0x80) { 255 codecs.decoding_error("utf-8", unicode, errors, 256 "illegal encoding"); 257 i++; 258 continue; 259 } else 260 unicode.append((char) ch); 261 break; 262 263 case 3: 264 ch1 = str.charAt(i+1); 265 char ch2 = str.charAt(i+2); 266 if ((ch1 & 0xc0) != 0x80 || (ch2 & 0xc0) != 0x80) { 267 codecs.decoding_error("utf-8", unicode, errors, 268 "invalid data"); 269 i++; 270 continue; 271 } 272 ch = ((ch & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f); 273 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) { 274 codecs.decoding_error("utf-8", unicode, errors, 275 "illegal encoding"); 276 i++; 277 continue; 278 } else 279 unicode.append((char) ch); 280 break; 281 282 case 4: 283 ch1 = str.charAt(i+1); 284 ch2 = str.charAt(i+2); 285 char ch3 = str.charAt(i+3); 286 if ((ch1 & 0xc0) != 0x80 || 287 (ch2 & 0xc0) != 0x80 || 288 (ch3 & 0xc0) != 0x80) { 289 codecs.decoding_error("utf-8", unicode, errors, 290 "invalid data"); 291 i++; 292 continue; 293 } 294 ch = ((ch & 0x7) << 18) + ((ch1 & 0x3f) << 12) + 295 ((ch2 & 0x3f) << 6) + (ch3 & 0x3f); 296 297 if ((ch < 0x10000) || 299 (ch > 0x10ffff)) { 301 codecs.decoding_error("utf-8", unicode, errors, 302 "illegal encoding"); 303 i++; 304 continue; 305 } 306 307 308 309 ch -= 0x10000; 310 311 312 unicode.append((char) (0xD800 + (ch >> 10))); 313 314 315 unicode.append((char) (0xDC00 + (ch & ~0xFC00))); 316 break; 317 318 default: 319 320 codecs.decoding_error("utf-8", unicode, errors, 321 "unsupported Unicode code range"); 322 i++; 323 } 324 i += n; 325 } 326 327 return unicode.toString(); 328 } 329 330 331 public static String PyUnicode_EncodeUTF8(String str, String errors) { 332 int size = str.length(); 333 StringBuffer v = new StringBuffer (size * 3); 334 335 for (int i = 0; i < size; ) { 336 int ch = str.charAt(i++); 337 if (ch < 0x80) 338 v.append((char) ch); 339 else if (ch < 0x0800) { 340 v.append((char) (0xc0 | (ch >> 6))); 341 v.append((char) (0x80 | (ch & 0x3f))); 342 } else { 343 if (0xD800 <= ch && ch <= 0xDFFF) { 344 if (i != size) { 345 int ch2 = str.charAt(i); 346 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 347 348 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000; 349 350 v.append((char)((ch >> 18) | 0xf0)); 351 v.append((char)(0x80 | ((ch >> 12) & 0x3f))); 352 i++; 353 } 354 } 355 } else { 356 v.append((char)(0xe0 | (ch >> 12))); 357 } 358 v.append((char) (0x80 | ((ch >> 6) & 0x3f))); 359 v.append((char) (0x80 | (ch & 0x3f))); 360 } 361 } 362 return v.toString(); 363 } 364 365 366 367 368 369 public static String PyUnicode_DecodeASCII(String str, int size, 370 String errors) 371 { 372 StringBuffer v = new StringBuffer (size); 373 374 for (int i = 0; i < size; i++) { 375 char ch = str.charAt(i); 376 if (ch < 128) { 377 v.append(ch); 378 } else { 379 decoding_error("ascii", v, errors, 380 "ordinal not in range(128)"); 381 continue; 382 } 383 } 384 385 return v.toString(); 386 } 387 388 389 public static String PyUnicode_EncodeASCII(String str, int size, 390 String errors) 391 { 392 StringBuffer v = new StringBuffer (size); 393 394 for (int i = 0; i < size; i++) { 395 char ch = str.charAt(i); 396 if (ch >= 128) { 397 encoding_error("ascii", v, errors, 398 "ordinal not in range(128)"); 399 } else 400 v.append(ch); 401 } 402 return v.toString(); 403 } 404 405 406 407 408 409 private static char[] hexdigit = "0123456789ABCDEF".toCharArray(); 410 411 public static String PyUnicode_EncodeRawUnicodeEscape(String str, 413 String errors, 414 boolean modifed) 415 { 416 417 int size = str.length(); 418 StringBuffer v = new StringBuffer (str.length()); 419 420 for (int i = 0; i < size; i++) { 421 char ch = str.charAt(i); 422 if (ch >= 256 || (modifed && (ch == '\n' || ch == '\\'))) { 423 v.append("\\u"); 424 v.append(hexdigit[(ch >>> 12) & 0xF]); 425 v.append(hexdigit[(ch >>> 8) & 0xF]); 426 v.append(hexdigit[(ch >>> 4) & 0xF]); 427 v.append(hexdigit[ch & 0xF]); 428 } else 429 v.append(ch); 430 } 431 432 return v.toString(); 433 } 434 435 436 public static String PyUnicode_DecodeRawUnicodeEscape(String str, 437 String errors) 438 { 439 int size = str.length(); 440 StringBuffer v = new StringBuffer (size); 441 442 for (int i = 0; i < size; ) { 443 char ch = str.charAt(i); 444 445 446 if (ch != '\\') { 447 v.append(ch); 448 i++; 449 continue; 450 } 451 452 454 int bs = i; 455 while (i < size) { 456 ch = str.charAt(i); 457 if (ch != '\\') 458 break; 459 v.append(ch); 460 i++; 461 } 462 if (((i - bs) & 1) == 0 || i >= size || ch != 'u') { 463 continue; 464 } 465 v.setLength(v.length() - 1); 466 i++; 467 468 469 int x = 0; 470 for (int j = 0; j < 4; j++) { 471 ch = str.charAt(i+j); 472 int d = Character.digit(ch, 16); 473 if (d == -1) { 474 codecs.decoding_error("unicode escape", v, errors, 475 "truncated \\uXXXX"); 476 break; 477 } 478 x = ((x<<4) & ~0xF) + d; 479 } 480 i += 4; 481 v.append((char) x); 482 } 483 return v.toString(); 484 } 485 486 487 488 489 public static void encoding_error(String type, StringBuffer dest, 490 String errors, String details) 491 { 492 if (errors == null || errors == "strict") 493 throw Py.UnicodeError(type + " encoding error: " + details); 494 else if (errors == "ignore") { } 495 else if (errors == "replace") 496 dest.append('?'); 497 else 498 throw Py.ValueError(type + " encoding error; "+ 499 "unknown error handling code: " + errors); 500 } 501 502 503 public static void decoding_error(String type, StringBuffer dest, 504 String errors, String details) 505 { 506 if (errors == null || errors == "strict") 507 throw Py.UnicodeError(type + " decoding error: " + details); 508 else if (errors == "ignore") { } 509 else if (errors == "replace") { 510 if (dest != null) 511 dest.append(Py_UNICODE_REPLACEMENT_CHARACTER); 512 } else 513 throw Py.ValueError(type + " decoding error; "+ 514 "unknown error handling code: " + errors); 515 } 516 } 517 | Popular Tags |