KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > UnicodeCompressor


1 /*
2  *******************************************************************************
3  * Copyright (C) 1996-2004, International Business Machines Corporation and *
4  * others. All Rights Reserved. *
5  *******************************************************************************
6  */

7 package com.ibm.icu.text;
8
9 /**
10 * A compression engine implementing the Standard Compression Scheme
11 * for Unicode (SCSU) as outlined in <A
12 * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
13 * Report #6</A>.
14 *
15 * <P>The SCSU works by using dynamically positioned <EM>windows</EM>
16 * consisting of 128 consecutive characters in Unicode. During compression,
17 * characters within a window are encoded in the compressed stream as the bytes
18 * <TT>0x7F - 0xFF</TT>. The SCSU provides transparency for the characters
19 * (bytes) between <TT>U+0000 - U+00FF</TT>. The SCSU approximates the
20 * storage size of traditional character sets, for example 1 byte per
21 * character for ASCII or Latin-1 text, and 2 bytes per character for CJK
22 * ideographs.</P>
23 *
24 * <P><STRONG>USAGE</STRONG></P>
25 *
26 * <P>The static methods on <TT>UnicodeCompressor</TT> may be used in a
27 * straightforward manner to compress simple strings:</P>
28 *
29 * <PRE>
30 * String s = ... ; // get string from somewhere
31 * byte [] compressed = UnicodeCompressor.compress(s);
32 * </PRE>
33 *
34 * <P>The static methods have a fairly large memory footprint.
35 * For finer-grained control over memory usage,
36 * <TT>UnicodeCompressor</TT> offers more powerful APIs allowing
37 * iterative compression:</P>
38 *
39 * <PRE>
40 * // Compress an array "chars" of length "len" using a buffer of 512 bytes
41 * // to the OutputStream "out"
42 *
43 * UnicodeCompressor myCompressor = new UnicodeCompressor();
44 * final static int BUFSIZE = 512;
45 * byte [] byteBuffer = new byte [ BUFSIZE ];
46 * int bytesWritten = 0;
47 * int [] unicharsRead = new int [1];
48 * int totalCharsCompressed = 0;
49 * int totalBytesWritten = 0;
50 *
51 * do {
52 * // do the compression
53 * bytesWritten = myCompressor.compress(chars, totalCharsCompressed,
54 * len, unicharsRead,
55 * byteBuffer, 0, BUFSIZE);
56 *
57 * // do something with the current set of bytes
58 * out.write(byteBuffer, 0, bytesWritten);
59 *
60 * // update the no. of characters compressed
61 * totalCharsCompressed += unicharsRead[0];
62 *
63 * // update the no. of bytes written
64 * totalBytesWritten += bytesWritten;
65 *
66 * } while(totalCharsCompressed < len);
67 *
68 * myCompressor.reset(); // reuse compressor
69 * </PRE>
70 *
71 * @see UnicodeDecompressor
72 *
73 * @author Stephen F. Booth
74 * @stable ICU 2.4
75 */

76
77 /*
78 *
79 * COMPRESSION STRATEGY
80 *
81 * Single Byte Mode
82 *
83 * There are three relevant cases.
84 * If the character is in the current window or is Latin-1 (U+0000,
85 * U+0009, U+000A, U+000D, U+0020 - U+007F), the character is placed
86 * directly in the stream as a single byte.
87 *
88 * 1. Current character is in defined, inactive window.
89 * 2. Current character is in undefined window.
90 * 3. Current character is uncompressible Unicode (U+3400 - U+DFFF).
91 *
92 * 1. Current character is in defined, inactive window
93 * A. Look ahead two characters
94 * B. If both following characters in same window as current character,
95 * switch to defined window
96 * C. If only next character is in same window as current character,
97 * quote defined window
98 * D. If neither of following characters is in same window as current,
99 * quote defined window
100 *
101 * 2. Current character is in undefined window
102 * A. Look ahead two characters
103 * B. If both following characters in same window as current character,
104 * define new window
105 * C. If only next character in same window as current character,
106 * switch to Unicode mode
107 * NOTE: This costs us one extra byte. However,
108 * since we have a limited number of windows to work with, it is
109 * assumed the cost will pay off later in savings from a window with
110 * more characters in it.
111 * D. If neither of following characters in same window as current,
112 * switch to Unicode mode. Alternative to above: just quote
113 * Unicode (same byte cost)
114 *
115 * 3. Current character is uncompressible Unicode (U+3400 - U+DFFF)
116 * A. Look ahead one character
117 * B. If next character in non-compressible region, switch to
118 * Unicode mode
119 * C. If next character not in non-compressible region, quote Unicode
120 *
121 *
122 * The following chart illustrates the bytes required for encoding characters
123 * in each possible way
124 *
125 *
126 * SINGLE BYTE MODE
127 * Characters in a row with same index
128 * tag encountered 1 2 3 4
129 * ---------------------------------------------------------------
130 * none (in current window) 1 2 3 4
131 *
132 * quote Unicode 3 6 9 12
133 *
134 * window not switch to Unicode 3 5 7 9 byte
135 * defined define window 3 4 5 6 cost
136 *
137 * window switch to window 2 3 4 5
138 * defined quote window 2 4 6 8
139 *
140 * Unicode Mode
141 *
142 * There are two relevant cases.
143 * If the character is in the non-compressible region
144 * (U+3400 - U+DFFF), the character is simply written to the
145 * stream as a pair of bytes.
146 *
147 * 1. Current character is in defined, inactive window.
148 * 2. Current character is in undefined window.
149 *
150 * 1.Current character is in defined, inactive window
151 * A. Look ahead one character
152 * B. If next character has same index as current character,
153 * switch to defined window (and switch to single-byte mode)
154 * C. If not, just put bytes in stream
155 *
156 *
157 * 2. Current character is in undefined window
158 * A. Look ahead two characters
159 * B. If both in same window as current character, define window
160 * (and switch to single-byte mode)
161 * C. If only next character in same window, just put bytes in stream
162 * NOTE: This costs us one extra byte. However,
163 * since we have a limited number of windows to work with, it is
164 * assumed the cost will pay off later in savings from a window with
165 * more characters in it.
166 * D. If neither in same window, put bytes in stream
167 *
168 *
169 * The following chart illustrates the bytes required for encoding characters
170 * in each possible way
171 *
172 *
173 * UNICODE MODE
174 * Characters in a row with same index
175 * tag encountered 1 2 3 4
176 * ---------------------------------------------------------------
177 * none 2 4 6 8
178 *
179 * quote Unicode 3 6 9 12
180 *
181 * window not define window 3 4 5 6 byte
182 * defined cost
183 * window switch to window 2 3 4 5
184 * defined
185 */

186 public final class UnicodeCompressor implements SCSU
187 {
188     //==========================
189
// Class variables
190
//==========================
191

192     /** For quick identification of a byte as a single-byte mode tag */
193     private static boolean [] sSingleTagTable = {
194         // table generated by CompressionTableGenerator
195
false, true, true, true, true, true, true, true, true, false,
196     false, true, true, false, true, true, true, true, true, true,
197     true, true, true, true, true, true, true, true, true, true,
198     true, true, false, false, false, false, false, false,false,
199     false, false, false, false, false, false, false, false, false,
200     false, false, false, false, false, false, false, false, false,
201     false, false, false, false, false, false, false, false, false,
202     false, false, false, false, false, false, false, false, false,
203     false, false, false, false, false, false, false, false, false,
204     false, false, false, false, false, false, false, false, false,
205     false, false, false, false, false, false, false, false, false,
206     false, false, false, false, false, false, false, false, false,
207     false, false, false, false, false, false, false, false, false,
208     false, false, false, false, false, false, false, false, false,
209     false, false, false, false, false, false, false, false, false,
210     false, false, false, false, false, false, false, false, false,
211     false, false, false, false, false, false, false, false, false,
212     false, false, false, false, false, false, false, false, false,
213     false, false, false, false, false, false, false, false, false,
214     false, false, false, false, false, false, false, false, false,
215     false, false, false, false, false, false, false, false, false,
216     false, false, false, false, false, false, false, false, false,
217     false, false, false, false, false, false, false, false, false,
218     false, false, false, false, false, false, false, false, false,
219     false, false, false, false, false, false, false, false, false,
220     false, false, false, false, false, false, false, false, false,
221     false, false, false, false, false, false, false, false, false,
222     false, false, false, false, false, false, false, false, false,
223     false
224     };
225
226     /** For quick identification of a byte as a unicode mode tag */
227     private static boolean [] sUnicodeTagTable = {
228         // table generated by CompressionTableGenerator
229
false, false, false, false, false, false, false, false, false,
230     false, false, false, false, false, false, false, false, false,
231     false, false, false, false, false, false, false, false, false,
232     false, false, false, false, false, false, false, false, false,
233     false, false, false, false, false, false, false, false, false,
234     false, false, false, false, false, false, false, false, false,
235     false, false, false, false, false, false, false, false, false,
236     false, false, false, false, false, false, false, false, false,
237     false, false, false, false, false, false, false, false, false,
238     false, false, false, false, false, false, false, false, false,
239     false, false, false, false, false, false, false, false, false,
240     false, false, false, false, false, false, false, false, false,
241     false, false, false, false, false, false, false, false, false,
242     false, false, false, false, false, false, false, false, false,
243     false, false, false, false, false, false, false, false, false,
244     false, false, false, false, false, false, false, false, false,
245     false, false, false, false, false, false, false, false, false,
246     false, false, false, false, false, false, false, false, false,
247     false, false, false, false, false, false, false, false, false,
248     false, false, false, false, false, false, false, false, false,
249     false, false, false, false, false, false, false, false, false,
250     false, false, false, false, false, false, false, false, false,
251     false, false, false, false, false, false, false, false, false,
252     false, false, false, false, false, false, false, false, false,
253     false, false, false, false, false, false, false, false, true,
254     true, true, true, true, true, true, true, true, true, true,
255     true, true, true, true, true, true, true, true, false, false,
256     false, false, false, false, false, false, false, false, false,
257     false, false
258     };
259
260     //==========================
261
// Instance variables
262
//==========================
263

264     /** Alias to current dynamic window */
265     private int fCurrentWindow = 0;
266
267     /** Dynamic compression window offsets */
268     private int [] fOffsets = new int [ NUMWINDOWS ];
269
270     /** Current compression mode */
271     private int fMode = SINGLEBYTEMODE;
272
273     /** Keeps count of times character indices are encountered */
274     private int [] fIndexCount = new int [ MAXINDEX + 1 ];
275
276     /** The time stamps indicate when a window was last defined */
277     private int [] fTimeStamps = new int [ NUMWINDOWS ];
278     
279     /** The current time stamp */
280     private int fTimeStamp = 0;
281     
282
283     /**
284      * Create a UnicodeCompressor.
285      * Sets all windows to their default values.
286      * @see #reset
287      * @stable ICU 2.4
288      */

289     public UnicodeCompressor()
290     {
291     reset(); // initialize to defaults
292
}
293
294     /**
295      * Compress a string into a byte array.
296      * @param buffer The string to compress.
297      * @return A byte array containing the compressed characters.
298      * @see #compress(char [], int, int)
299      * @stable ICU 2.4
300      */

301     public static byte [] compress(String JavaDoc buffer)
302     {
303     return compress(buffer.toCharArray(), 0, buffer.length());
304     }
305
306     /**
307      * Compress a Unicode character array into a byte array.
308      * @param buffer The character buffer to compress.
309      * @param start The start of the character run to compress.
310      * @param limit The limit of the character run to compress.
311      * @return A byte array containing the compressed characters.
312      * @see #compress(String)
313      * @stable ICU 2.4
314      */

315     public static byte [] compress(char [] buffer,
316                    int start,
317                    int limit)
318     {
319     UnicodeCompressor comp = new UnicodeCompressor();
320
321     // use a buffer that we know will never overflow
322
// in the worst case, each character will take 3 bytes
323
// to encode: UQU, hibyte, lobyte. In this case, the
324
// compressed data will look like: SCU, UQU, hibyte, lobyte, ...
325
// buffer must be at least 4 bytes in size
326
int len = Math.max(4, 3 * (limit - start) + 1);
327     byte [] temp = new byte [len];
328
329     int byteCount = comp.compress(buffer, start, limit, null,
330                       temp, 0, len);
331
332     byte [] result = new byte [byteCount];
333     System.arraycopy(temp, 0, result, 0, byteCount);
334     return result;
335     }
336
337     /**
338      * Compress a Unicode character array into a byte array.
339      *
340      * This function will only consume input that can be completely
341      * output.
342      *
343      * @param charBuffer The character buffer to compress.
344      * @param charBufferStart The start of the character run to compress.
345      * @param charBufferLimit The limit of the character run to compress.
346      * @param charsRead A one-element array. If not null, on return
347      * the number of characters read from charBuffer.
348      * @param byteBuffer A buffer to receive the compressed data. This
349      * buffer must be at minimum four bytes in size.
350      * @param byteBufferStart The starting offset to which to write
351      * compressed data.
352      * @param byteBufferLimit The limiting offset for writing compressed data.
353      * @return The number of bytes written to byteBuffer.
354      * @stable ICU 2.4
355      */

356     public int compress(char [] charBuffer,
357             int charBufferStart,
358             int charBufferLimit,
359             int [] charsRead,
360             byte [] byteBuffer,
361             int byteBufferStart,
362             int byteBufferLimit)
363     {
364         // the current position in the target byte buffer
365
int bytePos = byteBufferStart;
366     
367     // the current position in the source unicode character buffer
368
int ucPos = charBufferStart;
369     
370     // the current unicode character from the source buffer
371
int curUC = INVALIDCHAR;
372     
373     // the index for the current character
374
int curIndex = -1;
375         
376     // look ahead
377
int nextUC = INVALIDCHAR;
378     int forwardUC = INVALIDCHAR;
379     
380         // temporary for window searching
381
int whichWindow = 0;
382     
383     // high and low bytes of the current unicode character
384
int hiByte = 0;
385     int loByte = 0;
386
387
388     // byteBuffer must be at least 4 bytes in size
389
if(byteBuffer.length < 4 || (byteBufferLimit - byteBufferStart) < 4)
390         throw new IllegalArgumentException JavaDoc("byteBuffer.length < 4");
391
392     mainLoop:
393     while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
394         switch(fMode) {
395         // main single byte mode compression loop
396
case SINGLEBYTEMODE:
397         singleByteModeLoop:
398         while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
399         // get current char
400
curUC = charBuffer[ucPos++];
401
402         // get next char
403
if(ucPos < charBufferLimit)
404             nextUC = charBuffer[ucPos];
405         else
406             nextUC = INVALIDCHAR;
407         
408         // chars less than 0x0080 (excluding tags) go straight
409
// in stream
410
if(curUC < 0x0080) {
411             loByte = curUC & 0xFF;
412
413             // we need to check and make sure we don't
414
// accidentally write a single byte mode tag to
415
// the stream unless it's quoted
416
if(sSingleTagTable[loByte]) {
417                                 // make sure there is enough room to
418
// write both bytes if not, rewind the
419
// source stream and break out
420
if( (bytePos + 1) >= byteBufferLimit)
421                 { --ucPos; break mainLoop; }
422
423             // since we know the byte is less than 0x80, SQUOTE0
424
// will use static window 0, or ASCII
425
byteBuffer[bytePos++] = (byte) SQUOTE0;
426             }
427
428             byteBuffer[bytePos++] = (byte) loByte;
429         }
430
431         // if the char belongs to current window, convert it
432
// to a byte by adding the generic compression offset
433
// and subtracting the window's offset
434
else if(inDynamicWindow(curUC, fCurrentWindow) ) {
435             byteBuffer[bytePos++] = (byte)
436             (curUC - fOffsets[ fCurrentWindow ]
437              + COMPRESSIONOFFSET);
438         }
439         
440         // if char is not in compressible range, either switch to or
441
// quote from unicode
442
else if( ! isCompressible(curUC) ) {
443             // only check next character if it is valid
444
if(nextUC != INVALIDCHAR && isCompressible(nextUC)) {
445                                 // make sure there is enough room to
446
// write all three bytes if not,
447
// rewind the source stream and break
448
// out
449
if( (bytePos + 2) >= byteBufferLimit)
450                 { --ucPos; break mainLoop; }
451
452             byteBuffer[bytePos++] = (byte) SQUOTEU;
453             byteBuffer[bytePos++] = (byte) (curUC >>> 8);
454             byteBuffer[bytePos++] = (byte) (curUC & 0xFF);
455             }
456             else {
457                                 // make sure there is enough room to
458
// write all four bytes if not, rewind
459
// the source stream and break out
460
if((bytePos + 3) >= byteBufferLimit)
461                 { --ucPos; break mainLoop; }
462
463             byteBuffer[bytePos++] = (byte) SCHANGEU;
464
465             hiByte = curUC >>> 8;
466             loByte = curUC & 0xFF;
467
468             if(sUnicodeTagTable[hiByte])
469                 // add quote Unicode tag
470
byteBuffer[bytePos++] = (byte) UQUOTEU;
471
472             byteBuffer[bytePos++] = (byte) hiByte;
473             byteBuffer[bytePos++] = (byte) loByte;
474                 
475             fMode = UNICODEMODE;
476             break singleByteModeLoop;
477             }
478         }
479
480         // if the char is in a currently defined dynamic
481
// window, figure out which one, and either switch to
482
// it or quote from it
483
else if((whichWindow = findDynamicWindow(curUC))
484             != INVALIDWINDOW ) {
485             // look ahead
486
if( (ucPos + 1) < charBufferLimit )
487             forwardUC = charBuffer[ucPos + 1];
488             else
489             forwardUC = INVALIDCHAR;
490             
491             // all three chars in same window, switch to that
492
// window inDynamicWindow will return false for
493
// INVALIDCHAR
494
if(inDynamicWindow(nextUC, whichWindow)
495                && inDynamicWindow(forwardUC, whichWindow)) {
496                                 // make sure there is enough room to
497
// write both bytes if not, rewind the
498
// source stream and break out
499
if( (bytePos + 1) >= byteBufferLimit)
500                 { --ucPos; break mainLoop; }
501
502             byteBuffer[bytePos++] = (byte)(SCHANGE0 + whichWindow);
503             byteBuffer[bytePos++] = (byte)
504                 (curUC - fOffsets[whichWindow]
505                  + COMPRESSIONOFFSET);
506             fTimeStamps [ whichWindow ] = ++fTimeStamp;
507             fCurrentWindow = whichWindow;
508             }
509             
510             // either only next char or neither in same
511
// window, so quote
512
else {
513                                 // make sure there is enough room to
514
// write both bytes if not, rewind the
515
// source stream and break out
516
if((bytePos + 1) >= byteBufferLimit)
517                 { --ucPos; break mainLoop; }
518
519             byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow);
520             byteBuffer[bytePos++] = (byte)
521                 (curUC - fOffsets[whichWindow]
522                  + COMPRESSIONOFFSET);
523             }
524         }
525
526         // if a static window is defined, and the following
527
// character is not in that static window, quote from
528
// the static window Note: to quote from a static
529
// window, don't add 0x80
530
else if((whichWindow = findStaticWindow(curUC))
531             != INVALIDWINDOW
532             && ! inStaticWindow(nextUC, whichWindow) ) {
533             // make sure there is enough room to write both
534
// bytes if not, rewind the source stream and
535
// break out
536
if((bytePos + 1) >= byteBufferLimit)
537             { --ucPos; break mainLoop; }
538
539             byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow);
540             byteBuffer[bytePos++] = (byte)
541             (curUC - sOffsets[whichWindow]);
542         }
543         
544         // if a window is not defined, decide if we want to
545
// define a new one or switch to unicode mode
546
else {
547             // determine index for current char (char is compressible)
548
curIndex = makeIndex(curUC);
549             fIndexCount[curIndex]++;
550
551             // look ahead
552
if((ucPos + 1) < charBufferLimit)
553             forwardUC = charBuffer[ucPos + 1];
554             else
555             forwardUC = INVALIDCHAR;
556
557             // if we have encountered this index at least once
558
// before, define a new window
559
// OR
560
// three chars in a row with same index, define a
561
// new window (makeIndex will return RESERVEDINDEX
562
// for INVALIDCHAR)
563
if((fIndexCount[curIndex] > 1) ||
564                (curIndex == makeIndex(nextUC)
565             && curIndex == makeIndex(forwardUC))) {
566             // make sure there is enough room to write all
567
// three bytes if not, rewind the source
568
// stream and break out
569
if( (bytePos + 2) >= byteBufferLimit)
570                 { --ucPos; break mainLoop; }
571
572             // get least recently defined window
573
whichWindow = getLRDefinedWindow();
574
575             byteBuffer[bytePos++] = (byte)(SDEFINE0 + whichWindow);
576             byteBuffer[bytePos++] = (byte) curIndex;
577             byteBuffer[bytePos++] = (byte)
578                 (curUC - sOffsetTable[curIndex]
579                  + COMPRESSIONOFFSET);
580
581             fOffsets[whichWindow] = sOffsetTable[curIndex];
582             fCurrentWindow = whichWindow;
583             fTimeStamps [whichWindow] = ++fTimeStamp;
584             }
585
586             // only two chars in a row with same index, so
587
// switch to unicode mode (makeIndex will return
588
// RESERVEDINDEX for INVALIDCHAR)
589
// OR
590
// three chars have different indices, so switch
591
// to unicode mode
592
else {
593             // make sure there is enough room to write all
594
// four bytes if not, rewind the source stream
595
// and break out
596
if((bytePos + 3) >= byteBufferLimit)
597                 { --ucPos; break mainLoop; }
598
599             byteBuffer[bytePos++] = (byte) SCHANGEU;
600
601             hiByte = curUC >>> 8;
602             loByte = curUC & 0xFF;
603
604             if(sUnicodeTagTable[hiByte])
605                 // add quote Unicode tag
606
byteBuffer[bytePos++] = (byte) UQUOTEU;
607
608             byteBuffer[bytePos++] = (byte) hiByte;
609             byteBuffer[bytePos++] = (byte) loByte;
610
611             fMode = UNICODEMODE;
612             break singleByteModeLoop;
613             }
614         }
615         }
616         break;
617
618         case UNICODEMODE:
619         // main unicode mode compression loop
620
unicodeModeLoop:
621         while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
622         // get current char
623
curUC = charBuffer[ucPos++];
624
625         // get next char
626
if( ucPos < charBufferLimit )
627             nextUC = charBuffer[ucPos];
628         else
629             nextUC = INVALIDCHAR;
630
631         // if we have two uncompressible chars in a row,
632
// put the current char's bytes in the stream
633
if( ! isCompressible(curUC)
634             || (nextUC != INVALIDCHAR && ! isCompressible(nextUC))) {
635             // make sure there is enough room to write all three bytes
636
// if not, rewind the source stream and break out
637
if( (bytePos + 2) >= byteBufferLimit)
638             { --ucPos; break mainLoop; }
639
640             hiByte = curUC >>> 8;
641             loByte = curUC & 0xFF;
642
643             if(sUnicodeTagTable[ hiByte ])
644             // add quote Unicode tag
645
byteBuffer[bytePos++] = (byte) UQUOTEU;
646                 
647             byteBuffer[bytePos++] = (byte) hiByte;
648             byteBuffer[bytePos++] = (byte) loByte;
649         }
650         
651         // bytes less than 0x80 can go straight in the stream,
652
// but in single-byte mode
653
else if(curUC < 0x0080) {
654             loByte = curUC & 0xFF;
655
656             // if two chars in a row below 0x80 and the
657
// current char is not a single-byte mode tag,
658
// switch to single-byte mode
659
if(nextUC != INVALIDCHAR
660                && nextUC < 0x0080 && ! sSingleTagTable[ loByte ] ) {
661                                 // make sure there is enough room to
662
// write both bytes if not, rewind the
663
// source stream and break out
664
if( (bytePos + 1) >= byteBufferLimit)
665                 { --ucPos; break mainLoop; }
666
667             // use the last-active window
668
whichWindow = fCurrentWindow;
669             byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow);
670             byteBuffer[bytePos++] = (byte) loByte;
671
672             //fCurrentWindow = 0;
673
fTimeStamps [whichWindow] = ++fTimeStamp;
674             fMode = SINGLEBYTEMODE;
675             break unicodeModeLoop;
676             }
677
678             // otherwise, just write the bytes to the stream
679
// (this will cover the case of only 1 char less than 0x80
680
// and single-byte mode tags)
681
else {
682                                 // make sure there is enough room to
683
// write both bytes if not, rewind the
684
// source stream and break out
685
if((bytePos + 1) >= byteBufferLimit)
686                 { --ucPos; break mainLoop; }
687
688             // since the character is less than 0x80, the
689
// high byte is always 0x00 - no need for
690
// (curUC >>> 8)
691
byteBuffer[bytePos++] = (byte) 0x00;
692             byteBuffer[bytePos++] = (byte) loByte;
693             }
694         }
695
696         // figure out if the current char is in a defined window
697
else if((whichWindow = findDynamicWindow(curUC))
698             != INVALIDWINDOW ) {
699             // if two chars in a row in the same window,
700
// switch to that window and go to single-byte mode
701
// inDynamicWindow will return false for INVALIDCHAR
702
if(inDynamicWindow(nextUC, whichWindow)) {
703                                 // make sure there is enough room to
704
// write both bytes if not, rewind the
705
// source stream and break out
706
if((bytePos + 1) >= byteBufferLimit)
707                 { --ucPos; break mainLoop; }
708
709             byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow);
710             byteBuffer[bytePos++] = (byte)
711                 (curUC - fOffsets[whichWindow]
712                  + COMPRESSIONOFFSET);
713
714             fTimeStamps [ whichWindow ] = ++fTimeStamp;
715             fCurrentWindow = whichWindow;
716             fMode = SINGLEBYTEMODE;
717             break unicodeModeLoop;
718             }
719
720             // otherwise, just quote the unicode for the char
721
else {
722                                 // make sure there is enough room to
723
// write all three bytes if not,
724
// rewind the source stream and break
725
// out
726
if((bytePos + 2) >= byteBufferLimit)
727                 { --ucPos; break mainLoop; }
728
729             hiByte = curUC >>> 8;
730             loByte = curUC & 0xFF;
731
732             if(sUnicodeTagTable[ hiByte ])
733                 // add quote Unicode tag
734
byteBuffer[bytePos++] = (byte) UQUOTEU;
735
736             byteBuffer[bytePos++] = (byte) hiByte;
737             byteBuffer[bytePos++] = (byte) loByte;
738             }
739         }
740         
741         // char is not in a defined window
742
else {
743             // determine index for current char (char is compressible)
744
curIndex = makeIndex(curUC);
745             fIndexCount[curIndex]++;
746             
747             // look ahead
748
if( (ucPos + 1) < charBufferLimit )
749             forwardUC = charBuffer[ucPos + 1];
750             else
751             forwardUC = INVALIDCHAR;
752             
753             // if we have encountered this index at least once
754
// before, define a new window for it that hasn't
755
// previously been redefined
756
// OR
757
// if three chars in a row with the same index,
758
// define a new window (makeIndex will return
759
// RESERVEDINDEX for INVALIDCHAR)
760
if((fIndexCount[curIndex] > 1) ||
761                (curIndex == makeIndex(nextUC)
762             && curIndex == makeIndex(forwardUC))) {
763                                 // make sure there is enough room to
764
// write all three bytes if not,
765
// rewind the source stream and break
766
// out
767
if((bytePos + 2) >= byteBufferLimit)
768                 { --ucPos; break mainLoop; }
769
770             // get least recently defined window
771
whichWindow = getLRDefinedWindow();
772
773             byteBuffer[bytePos++] = (byte)(UDEFINE0 + whichWindow);
774             byteBuffer[bytePos++] = (byte) curIndex;
775             byteBuffer[bytePos++] = (byte)
776                 (curUC - sOffsetTable[curIndex]
777                  + COMPRESSIONOFFSET);
778             
779             fOffsets[whichWindow] = sOffsetTable[curIndex];
780             fCurrentWindow = whichWindow;
781             fTimeStamps [whichWindow] = ++fTimeStamp;
782             fMode = SINGLEBYTEMODE;
783             break unicodeModeLoop;
784             }
785             
786             // otherwise just quote the unicode, and save our
787
// windows for longer runs
788
else {
789                                 // make sure there is enough room to
790
// write all three bytes if not,
791
// rewind the source stream and break
792
// out
793
if((bytePos + 2) >= byteBufferLimit)
794                 { --ucPos; break mainLoop; }
795
796             hiByte = curUC >>> 8;
797             loByte = curUC & 0xFF;
798
799             if(sUnicodeTagTable[ hiByte ])
800                 // add quote Unicode tag
801
byteBuffer[bytePos++] = (byte) UQUOTEU;
802             
803             byteBuffer[bytePos++] = (byte) hiByte;
804             byteBuffer[bytePos++] = (byte) loByte;
805             }
806         }
807         }
808         } // end switch
809
}
810     
811         // fill in output parameter
812
if(charsRead != null)
813         charsRead [0] = (ucPos - charBufferStart);
814         
815         // return # of bytes written
816
return (bytePos - byteBufferStart);
817     }
818
819     /**
820      * Reset the compressor to its initial state.
821      * @stable ICU 2.4
822      */

823     public void reset()
824     {
825     int i;
826
827         // reset dynamic windows
828
fOffsets[0] = 0x0080; // Latin-1
829
fOffsets[1] = 0x00C0; // Latin-1 Supplement + Latin Extended-A
830
fOffsets[2] = 0x0400; // Cyrillic
831
fOffsets[3] = 0x0600; // Arabic
832
fOffsets[4] = 0x0900; // Devanagari
833
fOffsets[5] = 0x3040; // Hiragana
834
fOffsets[6] = 0x30A0; // Katakana
835
fOffsets[7] = 0xFF00; // Fullwidth ASCII
836

837
838         // reset time stamps
839
for(i = 0; i < NUMWINDOWS; i++) {
840             fTimeStamps[i] = 0;
841         }
842
843         // reset count of seen indices
844
for(i = 0; i <= MAXINDEX; i++ ) {
845             fIndexCount[i] = 0;
846         }
847
848         fTimeStamp = 0; // Reset current time stamp
849
fCurrentWindow = 0; // Make current window Latin-1
850
fMode = SINGLEBYTEMODE; // Always start in single-byte mode
851
}
852
853     //==========================
854
// Determine the index for a character
855
//==========================
856

857     /**
858      * Create the index value for a character.
859      * For more information on this function, refer to table X-3
860      * <A HREF="http://www.unicode.org/unicode/reports/tr6">UTR6</A>.
861      * @param c The character in question.
862      * @return An index for c
863      */

864     private static int makeIndex(int c)
865     {
866         // check the predefined indices
867
if(c >= 0x00C0 && c < 0x0140)
868             return LATININDEX;
869         else if(c >= 0x0250 && c < 0x02D0)
870             return IPAEXTENSIONINDEX;
871         else if(c >= 0x0370 && c < 0x03F0)
872             return GREEKINDEX;
873         else if(c >= 0x0530 && c < 0x0590)
874             return ARMENIANINDEX;
875         else if(c >= 0x3040 && c < 0x30A0)
876             return HIRAGANAINDEX;
877         else if(c >= 0x30A0 && c < 0x3120)
878             return KATAKANAINDEX;
879         else if(c >= 0xFF60 && c < 0xFF9F)
880             return HALFWIDTHKATAKANAINDEX;
881
882         // calculate index
883
else if(c >= 0x0080 && c < 0x3400)
884             return (c / 0x80) & 0xFF;
885         else if(c >= 0xE000 && c <= 0xFFFF)
886             return ((c - 0xAC00) / 0x80) & 0xFF;
887             
888         // should never happen
889
else {
890             return RESERVEDINDEX;
891         }
892     }
893
894     //==========================
895
// Check if a given character fits in a window
896
//==========================
897

898     /**
899     * Determine if a character is in a dynamic window.
900     * @param c The character to test
901     * @param whichWindow The dynamic window the test
902     * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>,
903     * false otherwise.
904     */

905     private boolean inDynamicWindow(int c,
906                     int whichWindow)
907     {
908         return (c >= fOffsets[whichWindow]
909         && c < (fOffsets[whichWindow] + 0x80));
910     }
911
912     /**
913      * Determine if a character is in a static window.
914     * @param c The character to test
915     * @param whichWindow The static window the test
916     * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>,
917     * false otherwise.
918     */

919     private static boolean inStaticWindow(int c,
920                       int whichWindow)
921     {
922         return (c >= sOffsets[whichWindow]
923         && c < (sOffsets[whichWindow] + 0x80));
924     }
925
926     //==========================
927
// Check if a given character is compressible
928
//==========================
929

930     /**
931     * Determine if a character is compressible.
932     * @param c The character to test.
933     * @return true if the <TT>c</TT> is compressible, false otherwise.
934     */

935     private static boolean isCompressible(int c)
936     {
937         return (c < 0x3400 || c >= 0xE000);
938     }
939
940     //==========================
941
// Check if a window is defined for a given character
942
//==========================
943

944     /**
945      * Determine if a dynamic window for a certain character is defined
946      * @param c The character in question
947      * @return The dynamic window containing <TT>c</TT>, or
948      * INVALIDWINDOW if not defined.
949      */

950     private int findDynamicWindow(int c)
951     {
952     // supposedly faster to count down
953
//for(int i = 0; i < NUMWINDOWS; i++) {
954
for(int i = NUMWINDOWS - 1; i >= 0; --i) {
955         if(inDynamicWindow(c, i)) {
956         ++fTimeStamps[i];
957                 return i;
958         }
959     }
960         
961         return INVALIDWINDOW;
962     }
963
964     /**
965      * Determine if a static window for a certain character is defined
966      * @param c The character in question
967      * @return The static window containing <TT>c</TT>, or
968      * INVALIDWINDOW if not defined.
969      */

970     private static int findStaticWindow(int c)
971     {
972     // supposedly faster to count down
973
//for(int i = 0; i < NUMSTATICWINDOWS; i++) {
974
for(int i = NUMSTATICWINDOWS - 1; i >= 0; --i) {
975         if(inStaticWindow(c, i)) {
976                 return i;
977         }
978     }
979     
980         return INVALIDWINDOW;
981     }
982     
983     //==========================
984
// Find the least-recently used window
985
//==========================
986

987     /** Find the least-recently defined window */
988     private int getLRDefinedWindow()
989     {
990         int leastRU = Integer.MAX_VALUE;
991         int whichWindow = INVALIDWINDOW;
992
993         // find least recently used window
994
// supposedly faster to count down
995
//for( int i = 0; i < NUMWINDOWS; i++ ) {
996
for(int i = NUMWINDOWS - 1; i >= 0; --i ) {
997             if( fTimeStamps[i] < leastRU ) {
998                 leastRU = fTimeStamps[i];
999                 whichWindow = i;
1000            }
1001        }
1002
1003        return whichWindow;
1004    }
1005    
1006};
1007
Popular Tags