UnicodeDecompressor


1   /*
2    *******************************************************************************
3    * Copyright (C) 1996-2004, International Business Machines Corporation and    *
4    * others. All Rights Reserved.                                                *
5    *******************************************************************************
6    */
7   
8   package com.ibm.icu.text;
9   
10  /**
11  * A decompression engine implementing the Standard Compression Scheme
12  * for Unicode (SCSU) as outlined in <A
13  * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
14  * Report #6</A>.
15  *
16  * <P><STRONG>USAGE</STRONG></P>
17  *
18  * <P>The static methods on <TT>UnicodeDecompressor</TT> may be used in a
19  * straightforward manner to decompress simple strings:</P>
20  *
21  * <PRE>
22  *  byte [] compressed = ... ; // get compressed bytes from somewhere
23  *  String result = UnicodeDecompressor.decompress(compressed);
24  * </PRE>
25  *
26  * <P>The static methods have a fairly large memory footprint.
27  * For finer-grained control over memory usage, 
28  * <TT>UnicodeDecompressor</TT> offers more powerful APIs allowing
29  * iterative decompression:</P>
30  *
31  * <PRE>
32  *  // Decompress an array "bytes" of length "len" using a buffer of 512 chars
33  *  // to the Writer "out"
34  *
35  *  UnicodeDecompressor myDecompressor         = new UnicodeDecompressor();
36  *  final static int    BUFSIZE                = 512;
37  *  char []             charBuffer             = new char [ BUFSIZE ];
38  *  int                 charsWritten           = 0;
39  *  int []              bytesRead              = new int [1];
40  *  int                 totalBytesDecompressed = 0;
41  *  int                 totalCharsWritten      = 0;
42  *
43  *  do {
44  *    // do the decompression
45  *    charsWritten = myDecompressor.decompress(bytes, totalBytesDecompressed, 
46  *                                             len, bytesRead,
47  *                                             charBuffer, 0, BUFSIZE);
48  *
49  *    // do something with the current set of chars
50  *    out.write(charBuffer, 0, charsWritten);
51  *
52  *    // update the no. of bytes decompressed
53  *    totalBytesDecompressed += bytesRead[0];
54  *
55  *    // update the no. of chars written
56  *    totalCharsWritten += charsWritten;
57  *
58  *  } while(totalBytesDecompressed < len);
59  *
60  *  myDecompressor.reset(); // reuse decompressor
61  * </PRE>
62  *
63  * <P>Decompression is performed according to the standard set forth in 
64  * <A HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical 
65  * Report #6</A></P>
66  *
67  * @see UnicodeCompressor
68  *
69  * @author Stephen F. Booth
70  * @stable ICU 2.4
71  */
72  public final class UnicodeDecompressor implements SCSU
73  {
74      //==========================
75      // Instance variables
76      //==========================
77      
78      /** Alias to current dynamic window */
79      private int       fCurrentWindow   = 0;
80  
81      /** Dynamic compression window offsets */
82      private int []    fOffsets         = new int [ NUMWINDOWS ];
83  
84      /** Current compression mode */
85      private int       fMode            = SINGLEBYTEMODE;
86  
87      /** Size of our internal buffer */
88      private final static int BUFSIZE   = 3;
89  
90      /** Internal buffer for saving state */
91      private byte []   fBuffer          = new byte [BUFSIZE];
92  
93      /** Number of characters in our internal buffer */
94      private int       fBufferLength    = 0;
95      
96  
97      /**
98       * Create a UnicodeDecompressor.
99       * Sets all windows to their default values.
100      * @see #reset
101      * @stable ICU 2.4
102      */
103     public UnicodeDecompressor()
104     {
105     reset();              // initialize to defaults
106     }
107 
108     /**
109      * Decompress a byte array into a String.
110      * @param buffer The byte array to decompress.
111      * @return A String containing the decompressed characters.
112      * @see #decompress(byte [], int, int)
113      * @stable ICU 2.4
114      */
115     public static String   decompress(byte [] buffer)
116     {
117     char [] buf = decompress(buffer, 0, buffer.length);
118     return new String  (buf);
119     }
120 
121     /**
122      * Decompress a byte array into a Unicode character array.
123      * @param buffer The byte array to decompress.
124      * @param start The start of the byte run to decompress.
125      * @param limit The limit of the byte run to decompress.
126      * @return A character array containing the decompressed bytes.
127      * @see #decompress(byte [])
128      * @stable ICU 2.4
129      */
130     public static char [] decompress(byte [] buffer,
131                      int start,
132                      int limit)
133     {
134     UnicodeDecompressor comp = new UnicodeDecompressor();
135 
136     // use a buffer we know will never overflow
137     // in the worst case, each byte will decompress
138     // to a surrogate pair (buffer must be at least 2 chars)
139     int len = Math.max(2, 2 * (limit - start));
140     char [] temp = new char [len];
141 
142     int charCount = comp.decompress(buffer, start, limit, null, 
143                     temp, 0, len);
144 
145     char [] result = new char [charCount];
146     System.arraycopy(temp, 0, result, 0, charCount);
147     return result;
148     }
149     
150     /**
151      * Decompress a byte array into a Unicode character array.
152      *
153      * This function will either completely fill the output buffer, 
154      * or consume the entire input.  
155      *
156      * @param byteBuffer The byte buffer to decompress.
157      * @param byteBufferStart The start of the byte run to decompress.
158      * @param byteBufferLimit The limit of the byte run to decompress.
159      * @param bytesRead A one-element array.  If not null, on return
160      * the number of bytes read from byteBuffer.
161      * @param charBuffer A buffer to receive the decompressed data. 
162      * This buffer must be at minimum two characters in size.
163      * @param charBufferStart The starting offset to which to write 
164      * decompressed data.
165      * @param charBufferLimit The limiting offset for writing 
166      * decompressed data.
167      * @return The number of Unicode characters written to charBuffer.
168      * @stable ICU 2.4
169      */
170     public int decompress(byte []    byteBuffer,
171               int        byteBufferStart,
172               int        byteBufferLimit,
173               int []     bytesRead,
174               char []    charBuffer,
175               int        charBufferStart,
176               int        charBufferLimit)
177     {
178     // the current position in the source byte buffer
179     int bytePos      = byteBufferStart;
180     
181     // the current position in the target char buffer
182     int ucPos        = charBufferStart;
183         
184         // the current byte from the source buffer
185     int aByte        = 0x00;
186 
187 
188     // charBuffer must be at least 2 chars in size
189     if(charBuffer.length < 2 || (charBufferLimit - charBufferStart) < 2)
190         throw new IllegalArgumentException  ("charBuffer.length < 2");
191     
192     // if our internal buffer isn't empty, flush its contents
193     // to the output buffer before doing any more decompression
194     if(fBufferLength > 0) {
195 
196         int newBytes = 0;
197 
198         // fill the buffer completely, to guarantee one full character
199         if(fBufferLength != BUFSIZE) {
200         newBytes = fBuffer.length - fBufferLength;
201 
202         // verify there are newBytes bytes in byteBuffer
203         if(byteBufferLimit - byteBufferStart < newBytes)
204             newBytes = byteBufferLimit - byteBufferStart;
205 
206         System.arraycopy(byteBuffer, byteBufferStart, 
207                  fBuffer, fBufferLength, newBytes);
208         }
209 
210         // reset buffer length to 0 before recursive call
211         fBufferLength = 0;
212 
213         // call self recursively to decompress the buffer
214         int count = decompress(fBuffer, 0, fBuffer.length, null,
215                    charBuffer, charBufferStart, 
216                    charBufferLimit);
217 
218         // update the positions into the arrays
219         ucPos += count;
220         bytePos += newBytes;
221     }
222 
223         // the main decompression loop
224     mainLoop:
225     while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
226         switch(fMode) {  
227         case SINGLEBYTEMODE:
228         // single-byte mode decompression loop
229         singleByteModeLoop:
230         while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
231         aByte = byteBuffer[bytePos++] & 0xFF;
232         switch(aByte) {
233             // All bytes from 0x80 through 0xFF are remapped
234             // to chars or surrogate pairs according to the
235             // currently active window
236         case 0x80: case 0x81: case 0x82: case 0x83: case 0x84: 
237         case 0x85: case 0x86: case 0x87: case 0x88: case 0x89:
238         case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E:
239         case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93:
240         case 0x94: case 0x95: case 0x96: case 0x97: case 0x98:
241         case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D:
242         case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2:
243         case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7:
244         case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC:
245         case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1:
246         case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6:
247         case 0xB7: case 0xB8: case 0xB9: case 0xBA: case 0xBB:
248         case 0xBC: case 0xBD: case 0xBE: case 0xBF: case 0xC0:
249         case 0xC1: case 0xC2: case 0xC3: case 0xC4: case 0xC5:
250         case 0xC6: case 0xC7: case 0xC8: case 0xC9: case 0xCA:
251         case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF:
252         case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4:
253         case 0xD5: case 0xD6: case 0xD7: case 0xD8: case 0xD9:
254         case 0xDA: case 0xDB: case 0xDC: case 0xDD: case 0xDE:
255         case 0xDF: case 0xE0: case 0xE1: case 0xE2: case 0xE3:
256         case 0xE4: case 0xE5: case 0xE6: case 0xE7: case 0xE8:
257         case 0xE9: case 0xEA: case 0xEB: case 0xEC: case 0xED:
258         case 0xEE: case 0xEF: case 0xF0: case 0xF1: case 0xF2:
259         case 0xF3: case 0xF4: case 0xF5: case 0xF6: case 0xF7:
260         case 0xF8: case 0xF9: case 0xFA: case 0xFB: case 0xFC:
261         case 0xFD: case 0xFE: case 0xFF: 
262             // For offsets <= 0xFFFF, convert to a single char
263             // by adding the window's offset and subtracting
264             // the generic compression offset
265             if(fOffsets[ fCurrentWindow ] <= 0xFFFF) {
266             charBuffer[ucPos++] = (char) 
267                 (aByte + fOffsets[ fCurrentWindow ] 
268                  - COMPRESSIONOFFSET);
269             }
270             // For offsets > 0x10000, convert to a surrogate pair by 
271             // normBase = window's offset - 0x10000
272             // high surr. = 0xD800 + (normBase >> 10)
273             // low  surr. = 0xDC00 + (normBase & 0x3FF) + (byte & 0x7F)
274             else {
275             // make sure there is enough room to write
276             // both characters 
277             // if not, save state and break out
278             if((ucPos + 1) >= charBufferLimit) {
279                 --bytePos;
280                 System.arraycopy(byteBuffer, bytePos,
281                          fBuffer, 0, 
282                          byteBufferLimit - bytePos);
283                 fBufferLength = byteBufferLimit - bytePos;
284                 bytePos += fBufferLength;
285                 break mainLoop; 
286             }
287             
288             int normalizedBase = fOffsets[ fCurrentWindow ] 
289                 - 0x10000;
290             charBuffer[ucPos++] = (char) 
291                 (0xD800 + (normalizedBase >> 10));
292             charBuffer[ucPos++] = (char) 
293                 (0xDC00 + (normalizedBase & 0x3FF)+(aByte & 0x7F));
294             }
295             break;
296 
297             // bytes from 0x20 through 0x7F are treated as ASCII and
298             // are remapped to chars by padding the high byte
299             // (this is the same as quoting from static window 0)
300             // NUL (0x00), HT (0x09), CR (0x0A), LF (0x0D) 
301             // are treated as ASCII as well
302         case 0x00: case 0x09: case 0x0A: case 0x0D:
303         case 0x20: case 0x21: case 0x22: case 0x23: case 0x24:
304         case 0x25: case 0x26: case 0x27: case 0x28: case 0x29:
305         case 0x2A: case 0x2B: case 0x2C: case 0x2D: case 0x2E:
306         case 0x2F: case 0x30: case 0x31: case 0x32: case 0x33:
307         case 0x34: case 0x35: case 0x36: case 0x37: case 0x38:
308         case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D:
309         case 0x3E: case 0x3F: case 0x40: case 0x41: case 0x42:
310         case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
311         case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C:
312         case 0x4D: case 0x4E: case 0x4F: case 0x50: case 0x51:
313         case 0x52: case 0x53: case 0x54: case 0x55: case 0x56:
314         case 0x57: case 0x58: case 0x59: case 0x5A: case 0x5B:
315         case 0x5C: case 0x5D: case 0x5E: case 0x5F: case 0x60:
316         case 0x61: case 0x62: case 0x63: case 0x64: case 0x65:
317         case 0x66: case 0x67: case 0x68: case 0x69: case 0x6A:
318         case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F:
319         case 0x70: case 0x71: case 0x72: case 0x73: case 0x74:
320         case 0x75: case 0x76: case 0x77: case 0x78: case 0x79:
321         case 0x7A: case 0x7B: case 0x7C: case 0x7D: case 0x7E:
322         case 0x7F: 
323             charBuffer[ucPos++] = (char) aByte;
324             break;
325 
326             // quote unicode
327         case SQUOTEU:
328             // verify we have two bytes following tag
329             // if not, save state and break out
330             if( (bytePos + 1) >= byteBufferLimit ) {
331             --bytePos;
332             System.arraycopy(byteBuffer, bytePos,
333                      fBuffer, 0, 
334                      byteBufferLimit - bytePos);
335             fBufferLength = byteBufferLimit - bytePos;
336             bytePos += fBufferLength;
337             break mainLoop; 
338             }
339                 
340             aByte = byteBuffer[bytePos++];
341             charBuffer[ucPos++] = (char)
342             (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
343             break;
344 
345             // switch to Unicode mode
346         case SCHANGEU:
347             fMode = UNICODEMODE;
348             break singleByteModeLoop;
349             //break;
350 
351             // handle all quote tags
352         case SQUOTE0: case SQUOTE1: case SQUOTE2: case SQUOTE3:
353         case SQUOTE4: case SQUOTE5: case SQUOTE6: case SQUOTE7:
354             // verify there is a byte following the tag
355             // if not, save state and break out
356             if(bytePos >= byteBufferLimit) {
357             --bytePos;
358             System.arraycopy(byteBuffer, bytePos,
359                      fBuffer, 0, 
360                      byteBufferLimit - bytePos);
361             fBufferLength = byteBufferLimit - bytePos;
362             bytePos += fBufferLength;
363             break mainLoop; 
364             }
365                 
366             // if the byte is in the range 0x00 - 0x7F, use
367             // static window n otherwise, use dynamic window n
368             int dByte = byteBuffer[bytePos++] & 0xFF;
369             charBuffer[ucPos++] = (char) 
370             (dByte+ (dByte >= 0x00 && dByte < 0x80 
371                  ? sOffsets[aByte - SQUOTE0] 
372                  : (fOffsets[aByte - SQUOTE0] 
373                     - COMPRESSIONOFFSET))); 
374             break;
375 
376             // handle all change tags
377         case SCHANGE0: case SCHANGE1: case SCHANGE2: case SCHANGE3:
378         case SCHANGE4: case SCHANGE5: case SCHANGE6: case SCHANGE7:
379             fCurrentWindow = aByte - SCHANGE0;
380             break;
381 
382             // handle all define tags
383         case SDEFINE0: case SDEFINE1: case SDEFINE2: case SDEFINE3:
384         case SDEFINE4: case SDEFINE5: case SDEFINE6: case SDEFINE7:
385             // verify there is a byte following the tag
386             // if not, save state and break out
387             if(bytePos >= byteBufferLimit) {
388             --bytePos;
389             System.arraycopy(byteBuffer, bytePos,
390                      fBuffer, 0, 
391                      byteBufferLimit - bytePos);
392             fBufferLength = byteBufferLimit - bytePos;
393             bytePos += fBufferLength;
394             break mainLoop; 
395             }
396 
397             fCurrentWindow = aByte - SDEFINE0;
398             fOffsets[fCurrentWindow] = 
399             sOffsetTable[byteBuffer[bytePos++] & 0xFF];
400             break;
401 
402             // handle define extended tag
403         case SDEFINEX:
404             // verify we have two bytes following tag
405             // if not, save state and break out
406             if((bytePos + 1) >= byteBufferLimit ) {
407             --bytePos;
408             System.arraycopy(byteBuffer, bytePos,
409                      fBuffer, 0, 
410                      byteBufferLimit - bytePos);
411             fBufferLength = byteBufferLimit - bytePos;
412             bytePos += fBufferLength;
413             break mainLoop; 
414             }
415                 
416             aByte = byteBuffer[bytePos++] & 0xFF;
417             fCurrentWindow = (aByte & 0xE0) >> 5;
418             fOffsets[fCurrentWindow] = 0x10000 + 
419             (0x80 * (((aByte & 0x1F) << 8) 
420                  | (byteBuffer[bytePos++] & 0xFF)));
421             break;
422                             
423             // reserved, shouldn't happen
424         case SRESERVED:
425             break;
426 
427         } // end switch
428         } // end while
429         break;
430 
431         case UNICODEMODE:
432         // unicode mode decompression loop
433         unicodeModeLoop:
434         while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
435         aByte = byteBuffer[bytePos++] & 0xFF;
436         switch(aByte) {
437             // handle all define tags
438         case UDEFINE0: case UDEFINE1: case UDEFINE2: case UDEFINE3:
439         case UDEFINE4: case UDEFINE5: case UDEFINE6: case UDEFINE7:
440             // verify there is a byte following tag
441             // if not, save state and break out
442             if(bytePos >= byteBufferLimit ) {
443             --bytePos;
444             System.arraycopy(byteBuffer, bytePos,
445                      fBuffer, 0, 
446                      byteBufferLimit - bytePos);
447             fBufferLength = byteBufferLimit - bytePos;
448             bytePos += fBufferLength;
449             break mainLoop; 
450             }
451                 
452             fCurrentWindow = aByte - UDEFINE0;
453             fOffsets[fCurrentWindow] = 
454             sOffsetTable[byteBuffer[bytePos++] & 0xFF];
455             fMode = SINGLEBYTEMODE;
456             break unicodeModeLoop;
457             //break;
458 
459             // handle define extended tag
460         case UDEFINEX:
461             // verify we have two bytes following tag
462             // if not, save state and break out
463             if((bytePos + 1) >= byteBufferLimit ) {
464             --bytePos;
465             System.arraycopy(byteBuffer, bytePos,
466                      fBuffer, 0, 
467                      byteBufferLimit - bytePos);
468             fBufferLength = byteBufferLimit - bytePos;
469             bytePos += fBufferLength;
470             break mainLoop; 
471             }
472             
473             aByte = byteBuffer[bytePos++] & 0xFF;
474             fCurrentWindow = (aByte & 0xE0) >> 5;
475             fOffsets[fCurrentWindow] = 0x10000 + 
476             (0x80 * (((aByte & 0x1F) << 8) 
477                  | (byteBuffer[bytePos++] & 0xFF)));
478             fMode = SINGLEBYTEMODE;
479             break unicodeModeLoop;
480             //break;
481 
482             // handle all change tags
483         case UCHANGE0: case UCHANGE1: case UCHANGE2: case UCHANGE3:
484         case UCHANGE4: case UCHANGE5: case UCHANGE6: case UCHANGE7:
485             fCurrentWindow = aByte - UCHANGE0;
486             fMode = SINGLEBYTEMODE;
487             break unicodeModeLoop;
488             //break;
489 
490             // quote unicode
491         case UQUOTEU:
492             // verify we have two bytes following tag
493             // if not, save state and break out
494             if(bytePos >= byteBufferLimit  - 1) {
495             --bytePos;
496             System.arraycopy(byteBuffer, bytePos,
497                      fBuffer, 0, 
498                      byteBufferLimit - bytePos);
499             fBufferLength = byteBufferLimit - bytePos;
500             bytePos += fBufferLength;
501             break mainLoop; 
502             }
503                 
504             aByte = byteBuffer[bytePos++];
505             charBuffer[ucPos++] = (char) 
506             (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
507             break;
508 
509         default:
510             // verify there is a byte following tag
511             // if not, save state and break out
512             if(bytePos >= byteBufferLimit ) {
513             --bytePos;
514             System.arraycopy(byteBuffer, bytePos,
515                      fBuffer, 0, 
516                      byteBufferLimit - bytePos);
517             fBufferLength = byteBufferLimit - bytePos;
518             bytePos += fBufferLength;
519             break mainLoop; 
520             }
521 
522             charBuffer[ucPos++] = (char) 
523             (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
524             break;
525 
526         } // end switch
527         } // end while
528         break;
529         
530         } // end switch( fMode )
531     } // end while
532 
533         // fill in output parameter
534     if(bytesRead != null)
535         bytesRead [0] = (bytePos - byteBufferStart);
536 
537         // return # of chars written
538     return (ucPos - charBufferStart);
539     }
540 
541     /** 
542      * Reset the decompressor to its initial state. 
543      * @stable ICU 2.4
544      */
545     public void reset()
546     {
547         // reset dynamic windows
548         fOffsets[0] = 0x0080;    // Latin-1
549         fOffsets[1] = 0x00C0;    // Latin-1 Supplement + Latin Extended-A
550         fOffsets[2] = 0x0400;    // Cyrillic
551         fOffsets[3] = 0x0600;    // Arabic
552         fOffsets[4] = 0x0900;    // Devanagari
553         fOffsets[5] = 0x3040;    // Hiragana
554         fOffsets[6] = 0x30A0;    // Katakana
555         fOffsets[7] = 0xFF00;    // Fullwidth ASCII
556 
557 
558         fCurrentWindow  = 0;                // Make current window Latin-1
559         fMode           = SINGLEBYTEMODE;   // Always start in single-byte mode
560     fBufferLength   = 0;                // Empty buffer
561     }
562 };
563
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags