codecs


/*
 * Copyright 2000 Finn Bock
 *
 * This program contains material copyrighted by:
 * Copyright (c) Corporation for National Research Initiatives.
 * Originally written by Marc-Andre Lemburg (mal@lemburg.com).
 */

package org.python.core;

11  /**
12   * Contains the implementation of the builtin codecs.
13   * @since Jython 2.0
14   */
15  
16  public class codecs {
17      private static char Py_UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;
18  
19      private static PyList searchPath = new PyList();
20      private static PyStringMap searchCache = new PyStringMap();
21  
22      private static String   default_encoding = "ascii";
23  
24      public static String   getDefaultEncoding() {
25          return default_encoding;
26      }
27  
28      public static void setDefaultEncoding(String   encoding) {
29          lookup(encoding);
30          default_encoding = encoding;
31      }
32  
33      public static void register(PyObject search_function) {
34          if (!search_function.isCallable())
35             throw Py.TypeError("argument must be callable");
36          searchPath.append(search_function);
37      }
38  
39  
40      public static PyTuple lookup(String   encoding) {
41          import_encodings();
42          PyString v = new PyString(normalizestring(encoding));
43          PyObject result = searchCache.__finditem__(v);
44          if (result != null)
45              return (PyTuple)result;
46  
47          if (searchPath.__len__() == 0)
48               throw new PyException(Py.LookupError,
49                     "no codec search functions registered: " +
50                     "can't find encoding");
51  
52          PyObject iter = searchPath.__iter__();
53          PyObject func = null;
54          while ((func = iter.__iternext__()) != null) {
55              result = func.__call__(v);
56              if (result == Py.None)
57                  continue;
58              if (!(result instanceof PyTuple) || result.__len__() != 4)
59                  throw Py.TypeError("codec search functions must "+
60                                     "return 4-tuples");
61              break;
62          }
63          if (func == null)
64              throw new PyException(Py.LookupError, "unknown encoding " +
65                                    encoding);
66          searchCache.__setitem__(v, result);
67          return (PyTuple)result;
68      }
69  
70      private static String   normalizestring(String   string) {
71          return string.toLowerCase().replace(' ', '-');
72      }
73  
74  
75      private static boolean import_encodings_called = false;
76  
77      private static void import_encodings() {
78          if (!import_encodings_called) {
79              import_encodings_called = true;
80              try {
81                  __builtin__.__import__("encodings");
82              } catch (PyException exc) {
83                  if (exc.type != Py.ImportError)
84                      throw exc;
85              }
86          }
87      }
88  
89  
90  
91      public static String   decode(PyString v, String   encoding,
92                                    String   errors)
93      {
94          if (encoding == null)
95              encoding = getDefaultEncoding();
96          else
97              encoding = normalizestring(encoding);
98          if (errors != null)
99              errors = errors.intern();
100 
101         /* Shortcuts for common default encodings */
102 /*
103         if (encoding.equals("utf-8"))
104             return utf_8_decode(v, errors).__getitem__(0).__str__();
105         else if (encoding.equals("latin-1"))
106             ; //return PyUnicode_DecodeLatin1(s, size, errors);
107         else if (encoding.equals("ascii"))
108             ; //return PyUnicode_DecodeASCII(s, size, errors);
109 */
110         if (encoding.equals("ascii"))
111             return PyUnicode_DecodeASCII(v.toString(),
112                                                       v.__len__(), errors);
113 
114         /* Decode via the codec registry */
115         PyObject decoder = getDecoder(encoding);
116         PyObject result = null;
117         if (errors != null) {
118             result = decoder.__call__(v, new PyString(errors));
119         } else {
120             result = decoder.__call__(v);
121         }
122 
123         if (!(result instanceof PyTuple) || result.__len__() != 2)
124             throw Py.TypeError("decoder must return a tuple " +
125                                "(object,integer)");
126         return result.__getitem__(0).toString();
127     }
128 
129 
130     private static PyObject getDecoder(String   encoding) {
131         PyObject codecs = lookup(encoding);
132         return codecs.__getitem__(1);
133     }
134 
135 
136 
137     public static String   encode(PyString v, String   encoding,
138                                   String   errors)
139     {
140         if (encoding == null)
141             encoding = getDefaultEncoding();
142         else
143             encoding = normalizestring(encoding);
144         if (errors != null)
145             errors = errors.intern();
146 
147         /* Shortcuts for common default encodings */
148 /*
149         if (encoding.equals("utf-8"))
150             return PyUnicode_DecodeUTF8(v.toString(), v.__len__(), errors);
151         else if (encoding.equals("latin-1"))
152             return PyUnicode_DecodeLatin1(v.toString(), v.__len__(), errors);
153         else
154 */
155 
156         if (encoding.equals("ascii"))
157             return PyUnicode_EncodeASCII(v.toString(),
158                                                       v.__len__(), errors);
159 
160         /* Decode via the codec registry */
161         PyObject encoder = getEncoder(encoding);
162         PyObject result = null;
163         if (errors != null) {
164             result = encoder.__call__(v, new PyString(errors));
165         } else {
166             result = encoder.__call__(v);
167         }
168 
169         if (!(result instanceof PyTuple) || result.__len__() != 2)
170             throw Py.TypeError("encoder must return a tuple " +
171                                "(object,integer)");
172         return result.__getitem__(0).toString();
173     }
174 
175     private static PyObject getEncoder(String   encoding) {
176         PyObject codecs = lookup(encoding);
177         return codecs.__getitem__(0);
178     }
179 
180 
181     /* --- UTF-8 Codec ---------------------------------------------------- */
182     private static byte utf8_code_length[] = {
183        /* Map UTF-8 encoded prefix byte to sequence length.  zero means
184            illegal prefix.  see RFC 2279 for details */
185         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
186         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
187         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
188         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
189         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
190         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
191         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
192         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
193         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
195         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
196         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
197         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
198         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
199         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
200         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
201     };
202 
203 
204     public static String   PyUnicode_DecodeUTF8(String   str, String   errors) {
205         int size = str.length();
206         StringBuffer   unicode = new StringBuffer  (size);
207 
208         /* Unpack UTF-8 encoded data */
209         for (int i = 0; i < size; ) {
210             int ch = str.charAt(i);
211             if (ch > 0xFF) {
212                 codecs.decoding_error("utf-8", unicode, errors,
213                                       "ordinal not in range(255)");
214                 i++;
215                 continue;
216             }
217 
218             if (ch < 0x80) {
219                 unicode.append((char) ch);
220                 i++;
221                 continue;
222             }
223 
224             int n = utf8_code_length[ch];
225 
226             if (i + n > size) {
227                 codecs.decoding_error("utf-8", unicode, errors,
228                                       "unexpected end of data");
229                 i++;
230                 continue;
231             }
232 
233 
234             switch (n) {
235             case 0:
236                 codecs.decoding_error("utf-8", unicode, errors,
237                                       "unexpected code byte");
238                 i++;
239                 continue;
240             case 1:
241                 codecs.decoding_error("utf-8", unicode, errors,
242                                       "internal error");
243                 i++;
244                 continue;
245             case 2:
246                 char ch1 = str.charAt(i+1);
247                 if ((ch1 & 0xc0) != 0x80) {
248                     codecs.decoding_error("utf-8", unicode, errors,
249                                           "invalid data");
250                     i++;
251                     continue;
252                 }
253                 ch = ((ch & 0x1f) << 6) + (ch1 & 0x3f);
254                 if (ch < 0x80) {
255                     codecs.decoding_error("utf-8", unicode, errors,
256                                           "illegal encoding");
257                     i++;
258                     continue;
259                 } else
260                     unicode.append((char) ch);
261                 break;
262 
263             case 3:
264                 ch1 = str.charAt(i+1);
265                 char ch2 = str.charAt(i+2);
266                 if ((ch1 & 0xc0) != 0x80 || (ch2 & 0xc0) != 0x80) {
267                     codecs.decoding_error("utf-8", unicode, errors,
268                                           "invalid data");
269                     i++;
270                     continue;
271                 }
272                 ch = ((ch & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f);
273                 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
274                     codecs.decoding_error("utf-8", unicode, errors,
275                                           "illegal encoding");
276                     i++;
277                     continue;
278                 } else
279                    unicode.append((char) ch);
280                 break;
281 
282             case 4:
283                 ch1 = str.charAt(i+1);
284                 ch2 = str.charAt(i+2);
285                 char ch3 = str.charAt(i+3);
286                 if ((ch1 & 0xc0) != 0x80 ||
287                     (ch2 & 0xc0) != 0x80 ||
288                     (ch3 & 0xc0) != 0x80) {
289                     codecs.decoding_error("utf-8", unicode, errors,
290                                           "invalid data");
291                     i++;
292                     continue;
293                 }
294                 ch = ((ch & 0x7) << 18) + ((ch1 & 0x3f) << 12) +
295                      ((ch2 & 0x3f) << 6) + (ch3 & 0x3f);
296                 /* validate and convert to UTF-16 */
297                 if ((ch < 0x10000) ||   /* minimum value allowed for 4
298                                            byte encoding */
299                     (ch > 0x10ffff)) {  /* maximum value allowed for
300                                            UTF-16 */
301                     codecs.decoding_error("utf-8", unicode, errors,
302                                           "illegal encoding");
303                     i++;
304                     continue;
305                 }
306                 /*  compute and append the two surrogates: */
307 
308                 /*  translate from 10000..10FFFF to 0..FFFF */
309                 ch -= 0x10000;
310 
311                 /*  high surrogate = top 10 bits added to D800 */
312                 unicode.append((char) (0xD800 + (ch >> 10)));
313 
314                 /*  low surrogate = bottom 10 bits added to DC00 */
315                 unicode.append((char) (0xDC00 + (ch & ~0xFC00)));
316                 break;
317 
318             default:
319                 /* Other sizes are only needed for UCS-4 */
320                 codecs.decoding_error("utf-8", unicode, errors,
321                                       "unsupported Unicode code range");
322                 i++;
323             }
324             i += n;
325         }
326 
327         return unicode.toString();
328     }
329 
330 
331     public static String   PyUnicode_EncodeUTF8(String   str, String   errors) {
332         int size = str.length();
333         StringBuffer   v = new StringBuffer  (size * 3);
334 
335         for (int i = 0; i < size; ) {
336             int ch = str.charAt(i++);
337             if (ch < 0x80)
338                 v.append((char) ch);
339             else if (ch < 0x0800) {
340                 v.append((char) (0xc0 | (ch >> 6)));
341                 v.append((char) (0x80 | (ch & 0x3f)));
342             } else {
343                 if (0xD800 <= ch && ch <= 0xDFFF) {
344                     if (i != size) {
345                         int ch2 = str.charAt(i);
346                         if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
347                             /* combine the two values */
348                             ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
349 
350                             v.append((char)((ch >> 18) | 0xf0));
351                             v.append((char)(0x80 | ((ch >> 12) & 0x3f)));
352                             i++;
353                         }
354                     }
355                 } else {
356                     v.append((char)(0xe0 | (ch >> 12)));
357                 }
358                 v.append((char) (0x80 | ((ch >> 6) & 0x3f)));
359                 v.append((char) (0x80 | (ch & 0x3f)));
360             }
361         }
362         return v.toString();
363     }
364 
365 
366 
367     /* --- 7-bit ASCII Codec -------------------------------------------- */
368 
369     public static String   PyUnicode_DecodeASCII(String   str, int size,
370                                                String   errors)
371     {
372         StringBuffer   v = new StringBuffer  (size);
373 
374         for (int i = 0; i < size; i++) {
375             char ch = str.charAt(i);
376             if (ch < 128) {
377                 v.append(ch);
378             } else {
379                 decoding_error("ascii", v, errors,
380                                "ordinal not in range(128)");
381                 continue;
382             }
383         }
384 
385         return v.toString();
386     }
387 
388 
389     public static String   PyUnicode_EncodeASCII(String   str, int size,
390                                                String   errors)
391     {
392         StringBuffer   v = new StringBuffer  (size);
393 
394         for (int i = 0; i < size; i++) {
395             char ch = str.charAt(i);
396             if (ch >= 128) {
397                 encoding_error("ascii", v, errors,
398                                "ordinal not in range(128)");
399             } else
400                 v.append(ch);
401         }
402         return v.toString();
403     }
404 
405 
406 
407     /* --- RawUnicodeEscape Codec ---------------------------------------- */
408 
409     private static char[] hexdigit = "0123456789ABCDEF".toCharArray();
410 
411     // The modified flag is used by cPickle.
412     public static String   PyUnicode_EncodeRawUnicodeEscape(String   str,
413                                                           String   errors,
414                                                           boolean modifed)
415     {
416 
417         int size = str.length();
418         StringBuffer   v = new StringBuffer  (str.length());
419 
420         for (int i = 0; i < size; i++) {
421             char ch = str.charAt(i);
422             if (ch >= 256 || (modifed && (ch == '\n' || ch == '\\'))) {
423                 v.append("\\u");
424                 v.append(hexdigit[(ch >>> 12) & 0xF]);
425                 v.append(hexdigit[(ch >>> 8) & 0xF]);
426                 v.append(hexdigit[(ch >>> 4) & 0xF]);
427                 v.append(hexdigit[ch & 0xF]);
428             } else
429                 v.append(ch);
430         }
431 
432         return v.toString();
433     }
434 
435 
436     public static String   PyUnicode_DecodeRawUnicodeEscape(String   str,
437                                                           String   errors)
438     {
439         int size = str.length();
440         StringBuffer   v = new StringBuffer  (size);
441 
442         for (int i = 0; i < size; ) {
443             char ch = str.charAt(i);
444 
445             /* Non-escape characters are interpreted as Unicode ordinals */
446             if (ch != '\\') {
447                 v.append(ch);
448                 i++;
449                 continue;
450             }
451 
452             /* \\u-escapes are only interpreted iff the number of leading
453                backslashes is odd */
454             int bs = i;
455             while (i < size) {
456                 ch = str.charAt(i);
457                 if (ch != '\\')
458                     break;
459                 v.append(ch);
460                 i++;
461             }
462             if (((i - bs) & 1) == 0 || i >= size || ch != 'u') {
463                 continue;
464             }
465             v.setLength(v.length() - 1);
466             i++;
467 
468             /* \\uXXXX with 4 hex digits */
469             int x = 0;
470             for (int j = 0; j < 4; j++) {
471                 ch = str.charAt(i+j);
472                 int d  = Character.digit(ch, 16);
473                 if (d == -1) {
474                     codecs.decoding_error("unicode escape", v, errors,
475                                           "truncated \\uXXXX");
476                     break;
477                 }
478                 x = ((x<<4) & ~0xF) + d;
479             }
480             i += 4;
481             v.append((char) x);
482        }
483        return v.toString();
484     }
485 
486 
487     /* --- Utility methods -------------------------------------------- */
488 
489     public static void encoding_error(String   type, StringBuffer   dest,
490                                       String   errors, String   details)
491     {
492         if (errors == null || errors == "strict")
493             throw Py.UnicodeError(type + " encoding error: " + details);
494         else if (errors == "ignore") { }
495         else if (errors == "replace")
496             dest.append('?');
497         else
498             throw Py.ValueError(type + " encoding error; "+
499                                 "unknown error handling code: " + errors);
500     }
501 
502 
503     public static void decoding_error(String   type, StringBuffer   dest,
504                                       String   errors, String   details)
505     {
506         if (errors == null || errors == "strict")
507             throw Py.UnicodeError(type + " decoding error: " + details);
508         else if (errors == "ignore") { }
509         else if (errors == "replace") {
510             if (dest != null)
511                 dest.append(Py_UNICODE_REPLACEMENT_CHARACTER);
512         } else
513             throw Py.ValueError(type + " decoding error; "+
514                                 "unknown error handling code: " + errors);
515     }
516 }
517
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags