KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > python > core > codecs


/*
 * Copyright 2000 Finn Bock
 *
 * This program contains material copyrighted by:
 * Copyright (c) Corporation for National Research Initiatives.
 * Originally written by Marc-Andre Lemburg (mal@lemburg.com).
 */

8
9 package org.python.core;
10
/**
 * Contains the implementation of the builtin codecs.
 * @since Jython 2.0
 */

15
16 public class codecs {
17     private static char Py_UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;
18
19     private static PyList searchPath = new PyList();
20     private static PyStringMap searchCache = new PyStringMap();
21
22     private static String JavaDoc default_encoding = "ascii";
23
24     public static String JavaDoc getDefaultEncoding() {
25         return default_encoding;
26     }
27
28     public static void setDefaultEncoding(String JavaDoc encoding) {
29         lookup(encoding);
30         default_encoding = encoding;
31     }
32
33     public static void register(PyObject search_function) {
34         if (!search_function.isCallable())
35            throw Py.TypeError("argument must be callable");
36         searchPath.append(search_function);
37     }
38
39
40     public static PyTuple lookup(String JavaDoc encoding) {
41         import_encodings();
42         PyString v = new PyString(normalizestring(encoding));
43         PyObject result = searchCache.__finditem__(v);
44         if (result != null)
45             return (PyTuple)result;
46
47         if (searchPath.__len__() == 0)
48              throw new PyException(Py.LookupError,
49                    "no codec search functions registered: " +
50                    "can't find encoding");
51
52         PyObject iter = searchPath.__iter__();
53         PyObject func = null;
54         while ((func = iter.__iternext__()) != null) {
55             result = func.__call__(v);
56             if (result == Py.None)
57                 continue;
58             if (!(result instanceof PyTuple) || result.__len__() != 4)
59                 throw Py.TypeError("codec search functions must "+
60                                    "return 4-tuples");
61             break;
62         }
63         if (func == null)
64             throw new PyException(Py.LookupError, "unknown encoding " +
65                                   encoding);
66         searchCache.__setitem__(v, result);
67         return (PyTuple)result;
68     }
69
70     private static String JavaDoc normalizestring(String JavaDoc string) {
71         return string.toLowerCase().replace(' ', '-');
72     }
73
74
75     private static boolean import_encodings_called = false;
76
77     private static void import_encodings() {
78         if (!import_encodings_called) {
79             import_encodings_called = true;
80             try {
81                 __builtin__.__import__("encodings");
82             } catch (PyException exc) {
83                 if (exc.type != Py.ImportError)
84                     throw exc;
85             }
86         }
87     }
88
89
90
91     public static String JavaDoc decode(PyString v, String JavaDoc encoding,
92                                   String JavaDoc errors)
93     {
94         if (encoding == null)
95             encoding = getDefaultEncoding();
96         else
97             encoding = normalizestring(encoding);
98         if (errors != null)
99             errors = errors.intern();
100
101         /* Shortcuts for common default encodings */
102 /*
103         if (encoding.equals("utf-8"))
104             return utf_8_decode(v, errors).__getitem__(0).__str__();
105         else if (encoding.equals("latin-1"))
106             ; //return PyUnicode_DecodeLatin1(s, size, errors);
107         else if (encoding.equals("ascii"))
108             ; //return PyUnicode_DecodeASCII(s, size, errors);
109 */

110         if (encoding.equals("ascii"))
111             return PyUnicode_DecodeASCII(v.toString(),
112                                                       v.__len__(), errors);
113
114         /* Decode via the codec registry */
115         PyObject decoder = getDecoder(encoding);
116         PyObject result = null;
117         if (errors != null) {
118             result = decoder.__call__(v, new PyString(errors));
119         } else {
120             result = decoder.__call__(v);
121         }
122
123         if (!(result instanceof PyTuple) || result.__len__() != 2)
124             throw Py.TypeError("decoder must return a tuple " +
125                                "(object,integer)");
126         return result.__getitem__(0).toString();
127     }
128
129
130     private static PyObject getDecoder(String JavaDoc encoding) {
131         PyObject codecs = lookup(encoding);
132         return codecs.__getitem__(1);
133     }
134
135
136
137     public static String JavaDoc encode(PyString v, String JavaDoc encoding,
138                                   String JavaDoc errors)
139     {
140         if (encoding == null)
141             encoding = getDefaultEncoding();
142         else
143             encoding = normalizestring(encoding);
144         if (errors != null)
145             errors = errors.intern();
146
147         /* Shortcuts for common default encodings */
148 /*
149         if (encoding.equals("utf-8"))
150             return PyUnicode_DecodeUTF8(v.toString(), v.__len__(), errors);
151         else if (encoding.equals("latin-1"))
152             return PyUnicode_DecodeLatin1(v.toString(), v.__len__(), errors);
153         else
154 */

155
156         if (encoding.equals("ascii"))
157             return PyUnicode_EncodeASCII(v.toString(),
158                                                       v.__len__(), errors);
159
160         /* Decode via the codec registry */
161         PyObject encoder = getEncoder(encoding);
162         PyObject result = null;
163         if (errors != null) {
164             result = encoder.__call__(v, new PyString(errors));
165         } else {
166             result = encoder.__call__(v);
167         }
168
169         if (!(result instanceof PyTuple) || result.__len__() != 2)
170             throw Py.TypeError("encoder must return a tuple " +
171                                "(object,integer)");
172         return result.__getitem__(0).toString();
173     }
174
175     private static PyObject getEncoder(String JavaDoc encoding) {
176         PyObject codecs = lookup(encoding);
177         return codecs.__getitem__(0);
178     }
179
180
/* --- UTF-8 Codec ---------------------------------------------------- */
182     private static byte utf8_code_length[] = {
183        /* Map UTF-8 encoded prefix byte to sequence length. zero means
184            illegal prefix. see RFC 2279 for details */

185         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
186         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
187         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
188         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
189         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
190         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
191         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
192         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
193         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
195         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
196         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
197         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
198         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
199         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
200         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
201     };
202
203
204     public static String JavaDoc PyUnicode_DecodeUTF8(String JavaDoc str, String JavaDoc errors) {
205         int size = str.length();
206         StringBuffer JavaDoc unicode = new StringBuffer JavaDoc(size);
207
208         /* Unpack UTF-8 encoded data */
209         for (int i = 0; i < size; ) {
210             int ch = str.charAt(i);
211             if (ch > 0xFF) {
212                 codecs.decoding_error("utf-8", unicode, errors,
213                                       "ordinal not in range(255)");
214                 i++;
215                 continue;
216             }
217
218             if (ch < 0x80) {
219                 unicode.append((char) ch);
220                 i++;
221                 continue;
222             }
223
224             int n = utf8_code_length[ch];
225
226             if (i + n > size) {
227                 codecs.decoding_error("utf-8", unicode, errors,
228                                       "unexpected end of data");
229                 i++;
230                 continue;
231             }
232
233
234             switch (n) {
235             case 0:
236                 codecs.decoding_error("utf-8", unicode, errors,
237                                       "unexpected code byte");
238                 i++;
239                 continue;
240             case 1:
241                 codecs.decoding_error("utf-8", unicode, errors,
242                                       "internal error");
243                 i++;
244                 continue;
245             case 2:
246                 char ch1 = str.charAt(i+1);
247                 if ((ch1 & 0xc0) != 0x80) {
248                     codecs.decoding_error("utf-8", unicode, errors,
249                                           "invalid data");
250                     i++;
251                     continue;
252                 }
253                 ch = ((ch & 0x1f) << 6) + (ch1 & 0x3f);
254                 if (ch < 0x80) {
255                     codecs.decoding_error("utf-8", unicode, errors,
256                                           "illegal encoding");
257                     i++;
258                     continue;
259                 } else
260                     unicode.append((char) ch);
261                 break;
262
263             case 3:
264                 ch1 = str.charAt(i+1);
265                 char ch2 = str.charAt(i+2);
266                 if ((ch1 & 0xc0) != 0x80 || (ch2 & 0xc0) != 0x80) {
267                     codecs.decoding_error("utf-8", unicode, errors,
268                                           "invalid data");
269                     i++;
270                     continue;
271                 }
272                 ch = ((ch & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f);
273                 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
274                     codecs.decoding_error("utf-8", unicode, errors,
275                                           "illegal encoding");
276                     i++;
277                     continue;
278                 } else
279                    unicode.append((char) ch);
280                 break;
281
282             case 4:
283                 ch1 = str.charAt(i+1);
284                 ch2 = str.charAt(i+2);
285                 char ch3 = str.charAt(i+3);
286                 if ((ch1 & 0xc0) != 0x80 ||
287                     (ch2 & 0xc0) != 0x80 ||
288                     (ch3 & 0xc0) != 0x80) {
289                     codecs.decoding_error("utf-8", unicode, errors,
290                                           "invalid data");
291                     i++;
292                     continue;
293                 }
294                 ch = ((ch & 0x7) << 18) + ((ch1 & 0x3f) << 12) +
295                      ((ch2 & 0x3f) << 6) + (ch3 & 0x3f);
296                 /* validate and convert to UTF-16 */
297                 if ((ch < 0x10000) || /* minimum value allowed for 4
298                                            byte encoding */

299                     (ch > 0x10ffff)) { /* maximum value allowed for
300                                            UTF-16 */

301                     codecs.decoding_error("utf-8", unicode, errors,
302                                           "illegal encoding");
303                     i++;
304                     continue;
305                 }
306                 /* compute and append the two surrogates: */
307
308                 /* translate from 10000..10FFFF to 0..FFFF */
309                 ch -= 0x10000;
310
311                 /* high surrogate = top 10 bits added to D800 */
312                 unicode.append((char) (0xD800 + (ch >> 10)));
313
314                 /* low surrogate = bottom 10 bits added to DC00 */
315                 unicode.append((char) (0xDC00 + (ch & ~0xFC00)));
316                 break;
317
318             default:
319                 /* Other sizes are only needed for UCS-4 */
320                 codecs.decoding_error("utf-8", unicode, errors,
321                                       "unsupported Unicode code range");
322                 i++;
323             }
324             i += n;
325         }
326
327         return unicode.toString();
328     }
329
330
331     public static String JavaDoc PyUnicode_EncodeUTF8(String JavaDoc str, String JavaDoc errors) {
332         int size = str.length();
333         StringBuffer JavaDoc v = new StringBuffer JavaDoc(size * 3);
334
335         for (int i = 0; i < size; ) {
336             int ch = str.charAt(i++);
337             if (ch < 0x80)
338                 v.append((char) ch);
339             else if (ch < 0x0800) {
340                 v.append((char) (0xc0 | (ch >> 6)));
341                 v.append((char) (0x80 | (ch & 0x3f)));
342             } else {
343                 if (0xD800 <= ch && ch <= 0xDFFF) {
344                     if (i != size) {
345                         int ch2 = str.charAt(i);
346                         if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
347                             /* combine the two values */
348                             ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
349
350                             v.append((char)((ch >> 18) | 0xf0));
351                             v.append((char)(0x80 | ((ch >> 12) & 0x3f)));
352                             i++;
353                         }
354                     }
355                 } else {
356                     v.append((char)(0xe0 | (ch >> 12)));
357                 }
358                 v.append((char) (0x80 | ((ch >> 6) & 0x3f)));
359                 v.append((char) (0x80 | (ch & 0x3f)));
360             }
361         }
362         return v.toString();
363     }
364
365
366
/* --- 7-bit ASCII Codec -------------------------------------------- */
368
369     public static String JavaDoc PyUnicode_DecodeASCII(String JavaDoc str, int size,
370                                                String JavaDoc errors)
371     {
372         StringBuffer JavaDoc v = new StringBuffer JavaDoc(size);
373
374         for (int i = 0; i < size; i++) {
375             char ch = str.charAt(i);
376             if (ch < 128) {
377                 v.append(ch);
378             } else {
379                 decoding_error("ascii", v, errors,
380                                "ordinal not in range(128)");
381                 continue;
382             }
383         }
384
385         return v.toString();
386     }
387
388
389     public static String JavaDoc PyUnicode_EncodeASCII(String JavaDoc str, int size,
390                                                String JavaDoc errors)
391     {
392         StringBuffer JavaDoc v = new StringBuffer JavaDoc(size);
393
394         for (int i = 0; i < size; i++) {
395             char ch = str.charAt(i);
396             if (ch >= 128) {
397                 encoding_error("ascii", v, errors,
398                                "ordinal not in range(128)");
399             } else
400                 v.append(ch);
401         }
402         return v.toString();
403     }
404
405
406
/* --- RawUnicodeEscape Codec ---------------------------------------- */
408
409     private static char[] hexdigit = "0123456789ABCDEF".toCharArray();
410
411     // The modified flag is used by cPickle.
412
public static String JavaDoc PyUnicode_EncodeRawUnicodeEscape(String JavaDoc str,
413                                                           String JavaDoc errors,
414                                                           boolean modifed)
415     {
416
417         int size = str.length();
418         StringBuffer JavaDoc v = new StringBuffer JavaDoc(str.length());
419
420         for (int i = 0; i < size; i++) {
421             char ch = str.charAt(i);
422             if (ch >= 256 || (modifed && (ch == '\n' || ch == '\\'))) {
423                 v.append("\\u");
424                 v.append(hexdigit[(ch >>> 12) & 0xF]);
425                 v.append(hexdigit[(ch >>> 8) & 0xF]);
426                 v.append(hexdigit[(ch >>> 4) & 0xF]);
427                 v.append(hexdigit[ch & 0xF]);
428             } else
429                 v.append(ch);
430         }
431
432         return v.toString();
433     }
434
435
436     public static String JavaDoc PyUnicode_DecodeRawUnicodeEscape(String JavaDoc str,
437                                                           String JavaDoc errors)
438     {
439         int size = str.length();
440         StringBuffer JavaDoc v = new StringBuffer JavaDoc(size);
441
442         for (int i = 0; i < size; ) {
443             char ch = str.charAt(i);
444
445             /* Non-escape characters are interpreted as Unicode ordinals */
446             if (ch != '\\') {
447                 v.append(ch);
448                 i++;
449                 continue;
450             }
451
452             /* \\u-escapes are only interpreted iff the number of leading
453                backslashes is odd */

454             int bs = i;
455             while (i < size) {
456                 ch = str.charAt(i);
457                 if (ch != '\\')
458                     break;
459                 v.append(ch);
460                 i++;
461             }
462             if (((i - bs) & 1) == 0 || i >= size || ch != 'u') {
463                 continue;
464             }
465             v.setLength(v.length() - 1);
466             i++;
467
468             /* \\uXXXX with 4 hex digits */
469             int x = 0;
470             for (int j = 0; j < 4; j++) {
471                 ch = str.charAt(i+j);
472                 int d = Character.digit(ch, 16);
473                 if (d == -1) {
474                     codecs.decoding_error("unicode escape", v, errors,
475                                           "truncated \\uXXXX");
476                     break;
477                 }
478                 x = ((x<<4) & ~0xF) + d;
479             }
480             i += 4;
481             v.append((char) x);
482        }
483        return v.toString();
484     }
485
486
/* --- Utility methods -------------------------------------------- */
488
489     public static void encoding_error(String JavaDoc type, StringBuffer JavaDoc dest,
490                                       String JavaDoc errors, String JavaDoc details)
491     {
492         if (errors == null || errors == "strict")
493             throw Py.UnicodeError(type + " encoding error: " + details);
494         else if (errors == "ignore") { }
495         else if (errors == "replace")
496             dest.append('?');
497         else
498             throw Py.ValueError(type + " encoding error; "+
499                                 "unknown error handling code: " + errors);
500     }
501
502
503     public static void decoding_error(String JavaDoc type, StringBuffer JavaDoc dest,
504                                       String JavaDoc errors, String JavaDoc details)
505     {
506         if (errors == null || errors == "strict")
507             throw Py.UnicodeError(type + " decoding error: " + details);
508         else if (errors == "ignore") { }
509         else if (errors == "replace") {
510             if (dest != null)
511                 dest.append(Py_UNICODE_REPLACEMENT_CHARACTER);
512         } else
513             throw Py.ValueError(type + " decoding error; "+
514                                 "unknown error handling code: " + errors);
515     }
516 }
517
Popular Tags