UnicodeCompressor


1   /*
2    *******************************************************************************
3    * Copyright (C) 1996-2004, International Business Machines Corporation and    *
4    * others. All Rights Reserved.                                                *
5    *******************************************************************************
6    */
7   package com.ibm.icu.text;
8   
9   /**
10  * A compression engine implementing the Standard Compression Scheme
11  * for Unicode (SCSU) as outlined in <A
12  * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
13  * Report #6</A>.
14  *
15  * <P>The SCSU works by using dynamically positioned <EM>windows</EM>
16  * consisting of 128 consecutive characters in Unicode.  During compression, 
17  * characters within a window are encoded in the compressed stream as the bytes 
18  * <TT>0x80 - 0xFF</TT>. The SCSU provides transparency for the characters 
19  * (bytes) between <TT>U+0000 - U+00FF</TT>.  The SCSU approximates the 
20  * storage size of traditional character sets, for example 1 byte per
21  * character for ASCII or Latin-1 text, and 2 bytes per character for CJK
22  * ideographs.</P>
23  *
24  * <P><STRONG>USAGE</STRONG></P>
25  *
26  * <P>The static methods on <TT>UnicodeCompressor</TT> may be used in a
27  * straightforward manner to compress simple strings:</P>
28  *
29  * <PRE>
30  *  String s = ... ; // get string from somewhere
31  *  byte [] compressed = UnicodeCompressor.compress(s);
32  * </PRE>
33  *
34  * <P>The static methods have a fairly large memory footprint.
35  * For finer-grained control over memory usage, 
36  * <TT>UnicodeCompressor</TT> offers more powerful APIs allowing
37  * iterative compression:</P>
38  *
39  * <PRE>
40  *  // Compress an array "chars" of length "len" using a buffer of 512 bytes
41  *  // to the OutputStream "out"
42  *
43  *  UnicodeCompressor myCompressor         = new UnicodeCompressor();
44  *  final static int  BUFSIZE              = 512;
45  *  byte []           byteBuffer           = new byte [ BUFSIZE ];
46  *  int               bytesWritten         = 0;
47  *  int []            unicharsRead         = new int [1];
48  *  int               totalCharsCompressed = 0;
49  *  int               totalBytesWritten    = 0;
50  *
51  *  do {
52  *    // do the compression
53  *    bytesWritten = myCompressor.compress(chars, totalCharsCompressed, 
54  *                                         len, unicharsRead,
55  *                                         byteBuffer, 0, BUFSIZE);
56  *
57  *    // do something with the current set of bytes
58  *    out.write(byteBuffer, 0, bytesWritten);
59  *
60  *    // update the no. of characters compressed
61  *    totalCharsCompressed += unicharsRead[0];
62  *
63  *    // update the no. of bytes written
64  *    totalBytesWritten += bytesWritten;
65  *
66  *  } while(totalCharsCompressed < len);
67  *
68  *  myCompressor.reset(); // reuse compressor
69  * </PRE>
70  *
71  * @see UnicodeDecompressor
72  *
73  * @author Stephen F. Booth
74  * @stable ICU 2.4
75  */
76  
77  /*
78  *
79  * COMPRESSION STRATEGY
80  *
81  * Single Byte Mode
82  *
83  * If the character is in the current window or is non-tag ASCII (U+0000,
84  * U+0009, U+000A, U+000D, U+0020 - U+007F), the character is placed
85  * directly in the stream as a single byte.  Otherwise, there are three
86  * relevant cases:
87  *
88  *  1. Current character is in defined, inactive window.
89  *  2. Current character is in undefined window.
90  *  3. Current character is uncompressible Unicode (U+3400 - U+DFFF).
91  * 
92  *  1. Current character is in defined, inactive window
93  *    A. Look ahead two characters
94  *    B. If both following characters in same window as current character, 
95  *       switch to defined window
96  *    C. If only next character is in same window as current character, 
97  *       quote defined window
98  *    D. If neither of following characters is in same window as current, 
99  *       quote defined window
100 *   
101 *  2. Current character is in undefined window
102 *    A. Look ahead two characters
103 *    B. If both following characters in same window as current character, 
104 *       define new window
105 *    C. If only next character in same window as current character, 
106 *       switch to Unicode mode
107 *       NOTE: This costs us one extra byte.  However, 
108 *        since we have a limited number of windows to work with, it is 
109 *        assumed the cost will pay off later in savings from a window with
110 *        more characters in it.
111 *    D. If neither of following characters in same window as current, 
112 *       switch to Unicode mode.  Alternative to above: just quote 
113 *       Unicode (same byte cost)
114 *   
115 *  3. Current character is uncompressible Unicode (U+3400 - U+DFFF)
116 *    A. Look ahead one character
117 *    B. If next character in non-compressible region, switch to 
118 *       Unicode mode
119 *    C. If next character not in non-compressible region, quote Unicode
120 *   
121 *
122 * The following chart illustrates the bytes required for encoding characters
123 * in each possible way
124 *
125 * 
126 *                                   SINGLE BYTE MODE
127 *                                       Characters in a row with same index
128 *               tag encountered             1       2       3       4
129 *               ---------------------------------------------------------------
130 *               none (in current window)    1       2       3       4
131 *
132 *               quote Unicode               3       6       9       12
133 *
134 *   window not  switch to Unicode           3       5       7       9     byte
135 *   defined     define window               3       4       5       6     cost
136 *      
137 *   window      switch to window            2       3       4       5
138 *   defined     quote window                2       4       6       8
139 *
140 *  Unicode Mode
141 *
142 * If the character is in the non-compressible region
143 * (U+3400 - U+DFFF), the character is simply written to the
144 * stream as a pair of bytes.  Otherwise, there are two relevant
145 * cases:
146 *
147 * 1. Current character is in defined, inactive window.
148 * 2. Current character is in undefined window.
149 *
150 *  1.Current character is in defined, inactive window
151 *    A. Look ahead one character
152 *    B. If next character has same index as current character, 
153 *       switch to defined window (and switch to single-byte mode)
154 *    C. If not, just put bytes in stream
155 *   
156 *  
157 *  2. Current character is in undefined window
158 *    A. Look ahead two characters
159 *    B. If both in same window as current character, define window 
160 *       (and switch to single-byte mode)
161 *    C. If only next character in same window, just put bytes in stream
162 *        NOTE: This costs us one extra byte.  However, 
163 *        since we have a limited number of windows to work with, it is 
164 *        assumed the cost will pay off later in savings from a window with 
165 *        more characters in it.
166 *    D. If neither in same window, put bytes in stream
167 *   
168 *
169 * The following chart illustrates the bytes required for encoding characters
170 * in each possible way
171 *
172 * 
173 *                                   UNICODE MODE
174 *                                       Characters in a row with same index
175 *               tag encountered             1       2       3       4
176 *               ---------------------------------------------------------------
177 *               none                        2       4       6       8
178 *
179 *               quote Unicode               3       6       9       12
180 *
181 *   window not  define window               3       4       5       6     byte
182 *   defined                                                               cost
183 *   window      switch to window            2       3       4       5
184 *   defined
185 */
186 public final class UnicodeCompressor implements SCSU
187 {
188     //==========================
189     // Class variables
190     //==========================
191 
192     /** For quick identification of a byte as a single-byte mode tag */
193     private static boolean [] sSingleTagTable = {
194         // table generated by CompressionTableGenerator
195         false, true, true, true, true, true, true, true, true, false,
196     false, true, true, false, true, true, true, true, true, true,
197     true, true, true, true, true, true, true, true, true, true,
198     true, true, false, false, false, false, false, false,false,
199     false, false, false, false, false, false, false, false, false,
200     false, false, false, false, false, false, false, false, false,
201     false, false, false, false, false, false, false, false, false,
202     false, false, false, false, false, false, false, false, false,
203     false, false, false, false, false, false, false, false, false,
204     false, false, false, false, false, false, false, false, false,
205     false, false, false, false, false, false, false, false, false,
206     false, false, false, false, false, false, false, false, false,
207     false, false, false, false, false, false, false, false, false,
208     false, false, false, false, false, false, false, false, false,
209     false, false, false, false, false, false, false, false, false,
210     false, false, false, false, false, false, false, false, false,
211     false, false, false, false, false, false, false, false, false,
212     false, false, false, false, false, false, false, false, false,
213     false, false, false, false, false, false, false, false, false,
214     false, false, false, false, false, false, false, false, false,
215     false, false, false, false, false, false, false, false, false,
216     false, false, false, false, false, false, false, false, false,
217     false, false, false, false, false, false, false, false, false,
218     false, false, false, false, false, false, false, false, false,
219     false, false, false, false, false, false, false, false, false,
220     false, false, false, false, false, false, false, false, false,
221     false, false, false, false, false, false, false, false, false,
222     false, false, false, false, false, false, false, false, false,
223     false   
224     };
225 
226     /** For quick identification of a byte as a unicode mode tag */
227     private static boolean [] sUnicodeTagTable = {
228         // table generated by CompressionTableGenerator
229         false, false, false, false, false, false, false, false, false,
230     false, false, false, false, false, false, false, false, false,
231     false, false, false, false, false, false, false, false, false,
232     false, false, false, false, false, false, false, false, false,
233     false, false, false, false, false, false, false, false, false,
234     false, false, false, false, false, false, false, false, false,
235     false, false, false, false, false, false, false, false, false,
236     false, false, false, false, false, false, false, false, false,
237     false, false, false, false, false, false, false, false, false,
238     false, false, false, false, false, false, false, false, false,
239     false, false, false, false, false, false, false, false, false,
240     false, false, false, false, false, false, false, false, false,
241     false, false, false, false, false, false, false, false, false,
242     false, false, false, false, false, false, false, false, false,
243     false, false, false, false, false, false, false, false, false,
244     false, false, false, false, false, false, false, false, false,
245     false, false, false, false, false, false, false, false, false,
246     false, false, false, false, false, false, false, false, false,
247     false, false, false, false, false, false, false, false, false,
248     false, false, false, false, false, false, false, false, false,
249     false, false, false, false, false, false, false, false, false,
250     false, false, false, false, false, false, false, false, false,
251     false, false, false, false, false, false, false, false, false,
252     false, false, false, false, false, false, false, false, false,
253     false, false, false, false, false, false, false, false, true,
254     true, true, true, true, true, true, true, true, true, true,
255     true, true, true, true, true, true, true, true, false, false,
256     false, false, false, false, false, false, false, false, false,
257     false, false 
258     };
259 
260     //==========================
261     // Instance variables
262     //==========================
263     
264     /** Alias to current dynamic window */
265     private int       fCurrentWindow   = 0;
266 
267     /** Dynamic compression window offsets */
268     private int []    fOffsets         = new int [ NUMWINDOWS ];
269 
270     /** Current compression mode */
271     private int       fMode            = SINGLEBYTEMODE;
272 
273     /** Keeps count of times character indices are encountered */
274     private int []    fIndexCount      = new int [ MAXINDEX + 1 ];
275 
276     /** The time stamps indicate when a window was last defined */
277     private int []    fTimeStamps      = new int [ NUMWINDOWS ];
278     
279     /** The current time stamp */
280     private int       fTimeStamp       = 0;
281     
282 
283     /**
284      * Create a UnicodeCompressor.
285      * Sets all windows to their default values.
286      * @see #reset
287      * @stable ICU 2.4
288      */
289     public UnicodeCompressor()
290     {
291     reset();              // initialize to defaults
292     }
293 
294     /**
295      * Compress a string into a byte array.
296      * @param buffer The string to compress.
297      * @return A byte array containing the compressed characters.
298      * @see #compress(char [], int, int)
299      * @stable ICU 2.4
300      */
301     public static byte [] compress(String   buffer)
302     {
303     return compress(buffer.toCharArray(), 0, buffer.length());
304     }
305 
306     /**
307      * Compress a Unicode character array into a byte array.
308      * @param buffer The character buffer to compress.
309      * @param start The start of the character run to compress.
310      * @param limit The limit of the character run to compress.
311      * @return A byte array containing the compressed characters.
312      * @see #compress(String)
313      * @stable ICU 2.4
314      */
315     public static byte [] compress(char [] buffer,
316                    int start,
317                    int limit)
318     {
319     UnicodeCompressor comp = new UnicodeCompressor();
320 
321     // use a buffer that we know will never overflow
322     // in the worst case, each character will take 3 bytes
323     // to encode: UQU, hibyte, lobyte.  In this case, the
324     // compressed data will look like: SCU, UQU, hibyte, lobyte, ...
325     // buffer must be at least 4 bytes in size
326     int len = Math.max(4, 3 * (limit - start) + 1);
327     byte [] temp = new byte [len];
328 
329     int byteCount = comp.compress(buffer, start, limit, null, 
330                       temp, 0, len);
331 
332     byte [] result = new byte [byteCount];
333     System.arraycopy(temp, 0, result, 0, byteCount);
334     return result;
335     }
336 
337     /**
338      * Compress a Unicode character array into a byte array.
339      *
340      * This function will only consume input that can be completely
341      * output.
342      *
343      * @param charBuffer The character buffer to compress.
344      * @param charBufferStart The start of the character run to compress.
345      * @param charBufferLimit The limit of the character run to compress.
346      * @param charsRead A one-element array.  If not null, on return 
347      * the number of characters read from charBuffer.
348      * @param byteBuffer A buffer to receive the compressed data.  This 
349      * buffer must be at minimum four bytes in size.
350      * @param byteBufferStart The starting offset to which to write 
351      * compressed data.
352      * @param byteBufferLimit The limiting offset for writing compressed data.
353      * @return The number of bytes written to byteBuffer.
354      * @stable ICU 2.4
355      */
356     public int compress(char []     charBuffer,
357             int         charBufferStart,
358             int         charBufferLimit,
359             int []      charsRead,
360             byte []     byteBuffer,
361             int         byteBufferStart,
362             int         byteBufferLimit)
363     {
364         // the current position in the target byte buffer
365     int     bytePos       = byteBufferStart;
366     
367     // the current position in the source unicode character buffer
368     int     ucPos         = charBufferStart;
369     
370     // the current unicode character from the source buffer
371     int     curUC         = INVALIDCHAR;
372     
373     // the index for the current character
374         int     curIndex      = -1;
375         
376     // look ahead
377     int     nextUC        = INVALIDCHAR;
378     int     forwardUC     = INVALIDCHAR;
379     
380         // temporary for window searching
381     int     whichWindow   = 0;
382     
383     // high and low bytes of the current unicode character
384     int     hiByte        = 0;
385     int     loByte        = 0;
386 
387 
388     // byteBuffer must be at least 4 bytes in size
389     if(byteBuffer.length < 4 || (byteBufferLimit - byteBufferStart) < 4)
390         throw new IllegalArgumentException  ("byteBuffer.length < 4");
391 
392     mainLoop:
393     while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
394         switch(fMode) {
395         // main single byte mode compression loop
396         case SINGLEBYTEMODE:
397         singleByteModeLoop:
398         while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
399         // get current char
400         curUC = charBuffer[ucPos++];
401 
402         // get next char
403         if(ucPos < charBufferLimit) 
404             nextUC = charBuffer[ucPos];
405         else
406             nextUC = INVALIDCHAR;
407         
408         // chars less than 0x0080 (excluding tags) go straight
409         // in stream
410         if(curUC < 0x0080) {
411             loByte = curUC & 0xFF;
412 
413             // we need to check and make sure we don't
414             // accidentally write a single byte mode tag to
415             // the stream unless it's quoted
416             if(sSingleTagTable[loByte]) {
417                                 // make sure there is enough room to
418                                 // write both bytes if not, rewind the
419                                 // source stream and break out
420             if( (bytePos + 1) >= byteBufferLimit) 
421                 { --ucPos; break mainLoop; }
422 
423             // since we know the byte is less than 0x80, SQUOTE0
424             // will use static window 0, or ASCII
425             byteBuffer[bytePos++] = (byte) SQUOTE0;
426             }
427 
428             byteBuffer[bytePos++] = (byte) loByte;
429         }
430 
431         // if the char belongs to current window, convert it
432         // to a byte by adding the generic compression offset
433         // and subtracting the window's offset
434         else if(inDynamicWindow(curUC, fCurrentWindow) ) {
435             byteBuffer[bytePos++] = (byte) 
436             (curUC - fOffsets[ fCurrentWindow ] 
437              + COMPRESSIONOFFSET);
438         }
439         
440         // if char is not in compressible range, either switch to or
441         // quote from unicode
442         else if( ! isCompressible(curUC) ) {
443             // only check next character if it is valid
444             if(nextUC != INVALIDCHAR && isCompressible(nextUC)) {
445                                 // make sure there is enough room to
446                                 // write all three bytes if not,
447                                 // rewind the source stream and break
448                                 // out
449             if( (bytePos + 2) >= byteBufferLimit) 
450                 { --ucPos; break mainLoop; }
451 
452             byteBuffer[bytePos++] = (byte) SQUOTEU;
453             byteBuffer[bytePos++] = (byte) (curUC >>> 8);
454             byteBuffer[bytePos++] = (byte) (curUC & 0xFF);
455             }
456             else {
457                                 // make sure there is enough room to
458                                 // write all four bytes if not, rewind
459                                 // the source stream and break out
460             if((bytePos + 3) >= byteBufferLimit) 
461                 { --ucPos; break mainLoop; }
462 
463             byteBuffer[bytePos++] = (byte) SCHANGEU;
464 
465             hiByte = curUC >>> 8;
466             loByte = curUC & 0xFF;
467 
468             if(sUnicodeTagTable[hiByte])
469                 // add quote Unicode tag
470                 byteBuffer[bytePos++]   = (byte) UQUOTEU;    
471 
472             byteBuffer[bytePos++] = (byte) hiByte;
473             byteBuffer[bytePos++] = (byte) loByte;
474                 
475             fMode = UNICODEMODE;
476             break singleByteModeLoop;
477             }
478         }
479 
480         // if the char is in a currently defined dynamic
481         // window, figure out which one, and either switch to
482         // it or quote from it
483         else if((whichWindow = findDynamicWindow(curUC)) 
484             != INVALIDWINDOW ) {
485             // look ahead
486             if( (ucPos + 1) < charBufferLimit )
487             forwardUC = charBuffer[ucPos + 1];
488             else
489             forwardUC = INVALIDCHAR;
490             
491             // all three chars in same window, switch to that
492             // window inDynamicWindow will return false for
493             // INVALIDCHAR
494             if(inDynamicWindow(nextUC, whichWindow) 
495                && inDynamicWindow(forwardUC, whichWindow)) {
496                                 // make sure there is enough room to
497                                 // write both bytes if not, rewind the
498                                 // source stream and break out
499             if( (bytePos + 1) >= byteBufferLimit) 
500                 { --ucPos; break mainLoop; }
501 
502             byteBuffer[bytePos++] = (byte)(SCHANGE0 + whichWindow);
503             byteBuffer[bytePos++] = (byte) 
504                 (curUC - fOffsets[whichWindow] 
505                  + COMPRESSIONOFFSET);
506             fTimeStamps [ whichWindow ] = ++fTimeStamp;
507             fCurrentWindow = whichWindow;
508             }
509             
510             // either only next char or neither in same
511             // window, so quote
512             else {
513                                 // make sure there is enough room to
514                                 // write both bytes if not, rewind the
515                                 // source stream and break out
516             if((bytePos + 1) >= byteBufferLimit) 
517                 { --ucPos; break mainLoop; }
518 
519             byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow);
520             byteBuffer[bytePos++] = (byte) 
521                 (curUC - fOffsets[whichWindow] 
522                  + COMPRESSIONOFFSET);
523             }
524         }
525 
526         // if a static window is defined, and the following
527         // character is not in that static window, quote from
528         // the static window Note: to quote from a static
529         // window, don't add 0x80
530         else if((whichWindow = findStaticWindow(curUC)) 
531             != INVALIDWINDOW 
532             && ! inStaticWindow(nextUC, whichWindow) ) {
533             // make sure there is enough room to write both
534             // bytes if not, rewind the source stream and
535             // break out
536             if((bytePos + 1) >= byteBufferLimit) 
537             { --ucPos; break mainLoop; }
538 
539             byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow);
540             byteBuffer[bytePos++] = (byte) 
541             (curUC - sOffsets[whichWindow]);
542         }
543         
544         // if a window is not defined, decide if we want to
545         // define a new one or switch to unicode mode
546         else {
547             // determine index for current char (char is compressible)
548             curIndex = makeIndex(curUC);
549             fIndexCount[curIndex]++;
550 
551             // look ahead
552             if((ucPos + 1) < charBufferLimit)
553             forwardUC = charBuffer[ucPos + 1];
554             else
555             forwardUC = INVALIDCHAR;
556 
557             // if we have encountered this index at least once
558             // before, define a new window
559             // OR
560             // three chars in a row with same index, define a
561             // new window (makeIndex will return RESERVEDINDEX
562             // for INVALIDCHAR)
563             if((fIndexCount[curIndex] > 1) ||
564                (curIndex == makeIndex(nextUC) 
565             && curIndex == makeIndex(forwardUC))) {
566             // make sure there is enough room to write all
567             // three bytes if not, rewind the source
568             // stream and break out
569             if( (bytePos + 2) >= byteBufferLimit) 
570                 { --ucPos; break mainLoop; }
571 
572             // get least recently defined window
573             whichWindow = getLRDefinedWindow();
574 
575             byteBuffer[bytePos++] = (byte)(SDEFINE0 + whichWindow);
576             byteBuffer[bytePos++] = (byte) curIndex;
577             byteBuffer[bytePos++] = (byte) 
578                 (curUC - sOffsetTable[curIndex] 
579                  + COMPRESSIONOFFSET);
580 
581             fOffsets[whichWindow] = sOffsetTable[curIndex];
582             fCurrentWindow = whichWindow;
583             fTimeStamps [whichWindow] = ++fTimeStamp;
584             }
585 
586             // only two chars in a row with same index, so
587             // switch to unicode mode (makeIndex will return
588             // RESERVEDINDEX for INVALIDCHAR)
589             // OR
590             // three chars have different indices, so switch
591             // to unicode mode
592             else {
593             // make sure there is enough room to write all
594             // four bytes if not, rewind the source stream
595             // and break out
596             if((bytePos + 3) >= byteBufferLimit) 
597                 { --ucPos; break mainLoop; }
598 
599             byteBuffer[bytePos++] = (byte) SCHANGEU;
600 
601             hiByte = curUC >>> 8;
602             loByte = curUC & 0xFF;
603 
604             if(sUnicodeTagTable[hiByte])
605                 // add quote Unicode tag
606                 byteBuffer[bytePos++] = (byte) UQUOTEU; 
607 
608             byteBuffer[bytePos++] = (byte) hiByte;
609             byteBuffer[bytePos++] = (byte) loByte;
610 
611             fMode = UNICODEMODE;
612             break singleByteModeLoop;
613             }
614         }
615         }
616         break;
617 
618         case UNICODEMODE:
619         // main unicode mode compression loop
620         unicodeModeLoop:
621         while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
622         // get current char
623         curUC = charBuffer[ucPos++];    
624 
625         // get next char
626         if( ucPos < charBufferLimit )
627             nextUC = charBuffer[ucPos];
628         else
629             nextUC = INVALIDCHAR;
630 
631         // if we have two uncompressible chars in a row,
632         // put the current char's bytes in the stream
633         if( ! isCompressible(curUC) 
634             || (nextUC != INVALIDCHAR && ! isCompressible(nextUC))) {
635             // make sure there is enough room to write all three bytes
636             // if not, rewind the source stream and break out
637             if( (bytePos + 2) >= byteBufferLimit) 
638             { --ucPos; break mainLoop; }
639 
640             hiByte = curUC >>> 8;
641             loByte = curUC & 0xFF;
642 
643             if(sUnicodeTagTable[ hiByte ])
644             // add quote Unicode tag
645             byteBuffer[bytePos++] = (byte) UQUOTEU;
646                 
647             byteBuffer[bytePos++] = (byte) hiByte;
648             byteBuffer[bytePos++] = (byte) loByte;
649         }
650         
651         // bytes less than 0x80 can go straight in the stream,
652         // but in single-byte mode
653         else if(curUC < 0x0080) {
654             loByte = curUC & 0xFF;
655 
656             // if two chars in a row below 0x80 and the
657             // current char is not a single-byte mode tag,
658             // switch to single-byte mode
659             if(nextUC != INVALIDCHAR 
660                && nextUC < 0x0080 && ! sSingleTagTable[ loByte ] ) {
661                                 // make sure there is enough room to
662                                 // write both bytes if not, rewind the
663                                 // source stream and break out
664             if( (bytePos + 1) >= byteBufferLimit) 
665                 { --ucPos; break mainLoop; }
666 
667             // use the last-active window
668             whichWindow = fCurrentWindow;
669             byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow);
670             byteBuffer[bytePos++] = (byte) loByte;
671 
672             //fCurrentWindow = 0;
673             fTimeStamps [whichWindow] = ++fTimeStamp;
674             fMode = SINGLEBYTEMODE;
675             break unicodeModeLoop;
676             }
677 
678             // otherwise, just write the bytes to the stream
679             // (this will cover the case of only 1 char less than 0x80
680             // and single-byte mode tags)
681             else {
682                                 // make sure there is enough room to
683                                 // write both bytes if not, rewind the
684                                 // source stream and break out
685             if((bytePos + 1) >= byteBufferLimit) 
686                 { --ucPos; break mainLoop; }
687 
688             // since the character is less than 0x80, the
689             // high byte is always 0x00 - no need for
690             // (curUC >>> 8)
691             byteBuffer[bytePos++] = (byte) 0x00;
692             byteBuffer[bytePos++] = (byte) loByte;
693             }
694         }
695 
696         // figure out if the current char is in a defined window
697         else if((whichWindow = findDynamicWindow(curUC)) 
698             != INVALIDWINDOW ) {
699             // if two chars in a row in the same window,
700             // switch to that window and go to single-byte mode
701             // inDynamicWindow will return false for INVALIDCHAR
702             if(inDynamicWindow(nextUC, whichWindow)) {
703                                 // make sure there is enough room to
704                                 // write both bytes if not, rewind the
705                                 // source stream and break out
706             if((bytePos + 1) >= byteBufferLimit) 
707                 { --ucPos; break mainLoop; }
708 
709             byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow);
710             byteBuffer[bytePos++] = (byte) 
711                 (curUC - fOffsets[whichWindow] 
712                  + COMPRESSIONOFFSET);
713 
714             fTimeStamps [ whichWindow ] = ++fTimeStamp;
715             fCurrentWindow = whichWindow;
716             fMode = SINGLEBYTEMODE;
717             break unicodeModeLoop;
718             }
719 
720             // otherwise, just quote the unicode for the char
721             else {
722                                 // make sure there is enough room to
723                                 // write all three bytes if not,
724                                 // rewind the source stream and break
725                                 // out
726             if((bytePos + 2) >= byteBufferLimit) 
727                 { --ucPos; break mainLoop; }
728 
729             hiByte = curUC >>> 8;
730             loByte = curUC & 0xFF;
731 
732             if(sUnicodeTagTable[ hiByte ])
733                 // add quote Unicode tag
734                 byteBuffer[bytePos++] = (byte) UQUOTEU;
735 
736             byteBuffer[bytePos++] = (byte) hiByte;
737             byteBuffer[bytePos++] = (byte) loByte;
738             }
739         }
740         
741         // char is not in a defined window
742         else {
743             // determine index for current char (char is compressible)
744             curIndex = makeIndex(curUC);
745             fIndexCount[curIndex]++;
746             
747             // look ahead
748             if( (ucPos + 1) < charBufferLimit )
749             forwardUC = charBuffer[ucPos + 1];
750             else
751             forwardUC = INVALIDCHAR;
752             
753             // if we have encountered this index at least once
754             // before, define a new window for it that hasn't
755             // previously been redefined
756             // OR
757             // if three chars in a row with the same index,
758             // define a new window (makeIndex will return
759             // RESERVEDINDEX for INVALIDCHAR)
760             if((fIndexCount[curIndex] > 1) ||
761                (curIndex == makeIndex(nextUC) 
762             && curIndex == makeIndex(forwardUC))) {
763                                 // make sure there is enough room to
764                                 // write all three bytes if not,
765                                 // rewind the source stream and break
766                                 // out
767             if((bytePos + 2) >= byteBufferLimit) 
768                 { --ucPos; break mainLoop; }
769 
770             // get least recently defined window
771             whichWindow = getLRDefinedWindow();
772 
773             byteBuffer[bytePos++] = (byte)(UDEFINE0 + whichWindow);
774             byteBuffer[bytePos++] = (byte) curIndex;
775             byteBuffer[bytePos++] = (byte) 
776                 (curUC - sOffsetTable[curIndex] 
777                  + COMPRESSIONOFFSET);
778             
779             fOffsets[whichWindow] = sOffsetTable[curIndex];
780             fCurrentWindow = whichWindow;
781             fTimeStamps [whichWindow] = ++fTimeStamp;
782             fMode = SINGLEBYTEMODE;
783             break unicodeModeLoop;
784             }
785             
786             // otherwise just quote the unicode, and save our
787             // windows for longer runs
788             else {
789                                 // make sure there is enough room to
790                                 // write all three bytes if not,
791                                 // rewind the source stream and break
792                                 // out
793             if((bytePos + 2) >= byteBufferLimit) 
794                 { --ucPos; break mainLoop; }
795 
796             hiByte = curUC >>> 8;
797             loByte = curUC & 0xFF;
798 
799             if(sUnicodeTagTable[ hiByte ])
800                 // add quote Unicode tag
801                 byteBuffer[bytePos++] = (byte) UQUOTEU;  
802             
803             byteBuffer[bytePos++] = (byte) hiByte;
804             byteBuffer[bytePos++] = (byte) loByte;
805             }
806         }
807         }
808         }  // end switch
809     }
810     
811         // fill in output parameter
812     if(charsRead != null)
813         charsRead [0] = (ucPos - charBufferStart);
814         
815         // return # of bytes written
816         return (bytePos - byteBufferStart);
817     }
818 
819     /** 
820      * Reset the compressor to its initial state.
821      * @stable ICU 2.4
822      */
823     public void reset()
824     {
825     int i;
826 
827         // reset dynamic windows
828         fOffsets[0] = 0x0080;    // Latin-1
829         fOffsets[1] = 0x00C0;    // Latin-1 Supplement + Latin Extended-A
830         fOffsets[2] = 0x0400;    // Cyrillic
831         fOffsets[3] = 0x0600;    // Arabic
832         fOffsets[4] = 0x0900;    // Devanagari
833         fOffsets[5] = 0x3040;    // Hiragana
834         fOffsets[6] = 0x30A0;    // Katakana
835         fOffsets[7] = 0xFF00;    // Fullwidth ASCII
836 
837 
838         // reset time stamps
839         for(i = 0; i < NUMWINDOWS; i++) {
840             fTimeStamps[i]          = 0;
841         }
842 
843         // reset count of seen indices
844         for(i = 0; i <= MAXINDEX; i++ ) {
845             fIndexCount[i] = 0;
846         }
847 
848         fTimeStamp      = 0;                // Reset current time stamp
849         fCurrentWindow  = 0;                // Make current window Latin-1
850         fMode           = SINGLEBYTEMODE;   // Always start in single-byte mode
851     }
852 
853     //==========================
854     // Determine the index for a character
855     //==========================
856 
857     /**
858      * Create the index value for a character.
859      * For more information on this function, refer to table X-3
860      * <A HREF="http://www.unicode.org/unicode/reports/tr6">UTR6</A>.
861      * @param c The character in question.
862      * @return An index for c
863      */
864     private static int makeIndex(int c)
865     {
866         // check the predefined indices
867         if(c >= 0x00C0 && c < 0x0140)
868             return LATININDEX;
869         else if(c >= 0x0250 && c < 0x02D0)
870             return IPAEXTENSIONINDEX;
871         else if(c >= 0x0370 && c < 0x03F0)
872             return GREEKINDEX;
873         else if(c >= 0x0530 && c < 0x0590)
874             return ARMENIANINDEX;
875         else if(c >= 0x3040 && c < 0x30A0)
876             return HIRAGANAINDEX;
877         else if(c >= 0x30A0 && c < 0x3120)
878             return KATAKANAINDEX;
879         else if(c >= 0xFF60 && c < 0xFF9F)
880             return HALFWIDTHKATAKANAINDEX;
881 
882         // calculate index
883         else if(c >= 0x0080 && c < 0x3400)
884             return (c / 0x80) & 0xFF;
885         else if(c >= 0xE000 && c <= 0xFFFF)
886             return ((c - 0xAC00) / 0x80) & 0xFF;
887             
888         // should never happen
889         else {
890             return RESERVEDINDEX;
891         }
892     }
893 
894     //==========================
895     // Check if a given character fits in a window
896     //==========================
897 
898     /**
899     * Determine if a character is in a dynamic window.
900     * @param c The character to test
901     * @param whichWindow The dynamic window the test
902     * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>, 
903     * false otherwise.
904     */
905     private boolean inDynamicWindow(int c, 
906                     int whichWindow)
907     {
908         return (c >= fOffsets[whichWindow] 
909         && c < (fOffsets[whichWindow] + 0x80));
910     }
911 
912     /**
913      * Determine if a character is in a static window.
914     * @param c The character to test
915     * @param whichWindow The static window the test
916     * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>, 
917     * false otherwise.
918     */
919     private static boolean inStaticWindow(int c, 
920                       int whichWindow)
921     {
922         return (c >= sOffsets[whichWindow]
923         && c < (sOffsets[whichWindow] + 0x80));
924     }
925 
926     //==========================
927     // Check if a given character is compressible
928     //==========================
929 
930     /**
931     * Determine if a character is compressible.
932     * @param c The character to test.
933     * @return true if the <TT>c</TT> is compressible, false otherwise.
934     */
935     private static boolean isCompressible(int c)
936     {
937         return (c < 0x3400 || c >= 0xE000);
938     }
939 
940     //==========================
941     // Check if a window is defined for a given character
942     //==========================
943 
944     /**
945      * Determine if a dynamic window for a certain character is defined
946      * @param c The character in question
947      * @return The dynamic window containing <TT>c</TT>, or 
948      * INVALIDWINDOW if not defined.
949      */
950     private int findDynamicWindow(int c)
951     {
952     // supposedly faster to count down
953         //for(int i = 0; i < NUMWINDOWS; i++) {
954     for(int i = NUMWINDOWS - 1; i >= 0; --i) {
955         if(inDynamicWindow(c, i)) {
956         ++fTimeStamps[i];
957                 return i;
958         }
959     }
960         
961         return INVALIDWINDOW;
962     }
963 
964     /**
965      * Determine if a static window for a certain character is defined
966      * @param c The character in question
967      * @return The static window containing <TT>c</TT>, or 
968      * INVALIDWINDOW if not defined.
969      */
970     private static int findStaticWindow(int c)
971     {
972     // supposedly faster to count down
973         //for(int i = 0; i < NUMSTATICWINDOWS; i++) {
974     for(int i = NUMSTATICWINDOWS - 1; i >= 0; --i) {
975         if(inStaticWindow(c, i)) {
976                 return i;
977         }
978     }
979     
980         return INVALIDWINDOW;
981     }
982     
983     //==========================
984     // Find the least-recently used window
985     //==========================
986 
987     /** Find the least-recently defined window */
988     private int getLRDefinedWindow()
989     {
990         int leastRU         = Integer.MAX_VALUE;
991         int whichWindow     = INVALIDWINDOW;
992 
993         // find least recently used window
994         // supposedly faster to count down
995         //for( int i = 0; i < NUMWINDOWS; i++ ) {
996         for(int i = NUMWINDOWS - 1; i >= 0; --i ) {
997             if( fTimeStamps[i] < leastRU ) {
998                 leastRU   = fTimeStamps[i];
999                 whichWindow  = i;
1000            }
1001        }
1002
1003        return whichWindow;
1004    }
1005    
1006};
1007
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags