BOCU


1   /**
2   *******************************************************************************
3   * Copyright (C) 1996-2005, International Business Machines Corporation and    *
4   * others. All Rights Reserved.                                                *
5   *******************************************************************************
6   */
7   package com.ibm.icu.impl;
8   
9   import com.ibm.icu.text.UCharacterIterator;
10  
11  /**
12   * <p>Binary Ordered Compression for Unicode</p>
13   * 
14   * <p>Users are strongly encouraged to read the ICU paper on 
15   * <a HREF="http://icu.sourceforge.net/docs/papers/binary_ordered_compression_for_unicode.html">
16   * BOCU</a> before attempting to use this class.</p>
17   * 
18   * <p>BOCU is used to compress unicode text into a stream of unsigned
19   * bytes.  For many kinds of text the compression compares favorably
20   * to UTF-8, and for some kinds of text (such as CJK) it does better.
21   * The resulting bytes will compare in the same order as the original
22   * code points.  The byte stream does not contain the values 0, 1, or
23   * 2.</p>
24   * 
25   * <p>One example of a use of BOCU is in {@link 
26   * com.ibm.icu.text.Collator#getCollationKey(String)} for a RuleBasedCollator object with 
27   * collation strength IDENTICAL. The result CollationKey will consist of the 
28   * collation order of the source string followed by the BOCU result of the 
29   * source string. 
30   * </p> 
31   *
32   * <p>Unlike a UTF encoding, BOCU-compressed text is not suitable for
33   * random access.</p>
34   * 
35   * <p>Method: Slope Detection<br> Remember the previous code point
36   * (initial 0).  For each code point in the string, encode the
37   * difference with the previous one.  Similar to a UTF, the length of
38   * the byte sequence is encoded in the lead bytes.  Unlike a UTF, the
39   * trail byte values may overlap with lead/single byte values.  The
40   * signedness of the difference must be encoded as the most
41   * significant part.</p>
42   *
43   * <p>We encode differences with few bytes if their absolute values
44   * are small.  For correct ordering, we must treat the entire value
45   * range -10ffff..+10ffff in ascending order, which forbids encoding
46   * the sign and the absolute value separately. Instead, we split the
47   * lead byte range in the middle and encode non-negative values going
48   * up and negative values going down.</p>
49   *
50   * <p>For very small absolute values, the difference is added to a
51   * middle byte value for single-byte encoded differences.  For
52   * somewhat larger absolute values, the difference is divided by the
53   * number of byte values available, the modulo is used for one trail
54   * byte, and the remainder is added to a lead byte avoiding the
55   * single-byte range.  For large absolute values, the difference is
56   * similarly encoded in three bytes. (Syn Wee, I need examples
57   * here.)</p>
58   *
59   * <p>BOCU does not use byte values 0, 1, or 2, but uses all other
60   * byte values for lead and single bytes, so that the middle range of
61   * single bytes is as large as possible.</p>
62   *
63   * <p>Note that the lead byte ranges overlap some, but that the
64   * sequences as a whole are well ordered. I.e., even if the lead byte
65   * is the same for sequences of different lengths, the trail bytes
66   * establish correct order.  It would be possible to encode slightly
67   * larger ranges for each length (>1) by subtracting the lower bound
68   * of the range. However, that would also slow down the calculation.
69   * (Syn Wee, need an example).</p>
70   *
71   * <p>For the actual string encoding, an optimization moves the
72   * previous code point value to the middle of its Unicode script block
73   * to minimize the differences in same-script text runs.  (Syn Wee,
74   * need an example.)</p>
75   *
76   * @author Syn Wee Quek
77   * @since release 2.2, May 3rd 2002
78   * @draft 2.2 
79   */
80  public class BOCU 
81  {      
82      // public constructors --------------------------------------------------
83      
84      // public methods -------------------------------------------------------
85          
86      /**
87       * <p>Encode the code points of a string as a sequence of bytes,
88       * preserving lexical order.</p>
89       * <p>The minimum size of buffer required for the compression can be 
90       * preflighted by getCompressionLength(String).</p>
91       * @param source text source
92       * @param buffer output buffer
93       * @param offset to start writing to
94       * @return end offset where the writing stopped
95       * @see #getCompressionLength(String)
96       * @exception ArrayIndexOutOfBoundsException thrown if size of buffer is 
97       *            too small for the output.
98       */
99      public static int compress(String   source, byte buffer[], int offset) 
100     {
101         int prev = 0;
102         UCharacterIterator iterator = UCharacterIterator.getInstance(source);
103         int codepoint = iterator.nextCodePoint();
104         while (codepoint != UCharacterIterator.DONE) {
105             if (prev < 0x4e00 || prev >= 0xa000) {
106                 prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
107             } 
108             else {
109                 // Unihan U+4e00..U+9fa5:
110                 // double-bytes down from the upper end
111                 prev = 0x9fff - SLOPE_REACH_POS_2_;
112             }
113         
114             offset = writeDiff(codepoint - prev, buffer, offset);
115             prev = codepoint;
116             codepoint = iterator.nextCodePoint();
117         }
118         return offset;
119     }
120         
121     /** 
122      * Return the number of  bytes that compress() would write.
123      * @param source text source string
124      * @return the length of the BOCU result 
125      * @see #compress(String, byte[], int)
126      */
127     public static int getCompressionLength(String   source) 
128     {
129         int prev = 0;
130         int result = 0;
131         UCharacterIterator iterator =  UCharacterIterator.getInstance(source);
132         int codepoint = iterator.nextCodePoint();
133         while (codepoint != UCharacterIterator.DONE) {
134             if (prev < 0x4e00 || prev >= 0xa000) {
135                 prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
136             } 
137             else {
138                 // Unihan U+4e00..U+9fa5:
139                 // double-bytes down from the upper end
140                 prev = 0x9fff - SLOPE_REACH_POS_2_;
141             }
142         
143             codepoint = iterator.nextCodePoint();
144             result += lengthOfDiff(codepoint - prev);
145             prev = codepoint;
146         }
147         return result;
148     }
149 
150     // public setter methods -------------------------------------------------
151         
152     // public getter methods ------------------------------------------------
153             
154     // public other methods -------------------------------------------------
155     
156     // protected constructor ------------------------------------------------
157       
158     // protected data members ------------------------------------------------
159     
160     // protected methods -----------------------------------------------------
161  
162     // private data members --------------------------------------------------
163     
164     /** 
165      * Do not use byte values 0, 1, 2 because they are separators in sort keys.
166      */
167     private static final int SLOPE_MIN_ = 3;
168     private static final int SLOPE_MAX_ = 0xff;
169     private static final int SLOPE_MIDDLE_ = 0x81;
170     private static final int SLOPE_TAIL_COUNT_ = SLOPE_MAX_ - SLOPE_MIN_ + 1;
171     private static final int SLOPE_MAX_BYTES_ = 4;
172 
173     /**
174      * Number of lead bytes:
175      * 1        middle byte for 0
176      * 2*80=160 single bytes for !=0
177      * 2*42=84  for double-byte values
178      * 2*3=6    for 3-byte values
179      * 2*1=2    for 4-byte values
180      *
181      * The sum must be <=SLOPE_TAIL_COUNT.
182      *
183      * Why these numbers?
184      * - There should be >=128 single-byte values to cover 128-blocks
185      *   with small scripts.
186      * - There should be >=20902 single/double-byte values to cover Unihan.
187      * - It helps CJK Extension B some if there are 3-byte values that cover
188      *   the distance between them and Unihan.
189      *   This also helps to jump among distant places in the BMP.
190      * - Four-byte values are necessary to cover the rest of Unicode.
191      *
192      * Symmetrical lead byte counts are for convenience.
193      * With an equal distribution of even and odd differences there is also
194      * no advantage to asymmetrical lead byte counts.
195      */
196     private static final int SLOPE_SINGLE_ = 80;
197     private static final int SLOPE_LEAD_2_ = 42;
198     private static final int SLOPE_LEAD_3_ = 3;
199     private static final int SLOPE_LEAD_4_ = 1;
200 
201     /** 
202      * The difference value range for single-byters.
203      */
204     private static final int SLOPE_REACH_POS_1_ = SLOPE_SINGLE_;
205     private static final int SLOPE_REACH_NEG_1_ = (-SLOPE_SINGLE_);
206 
207     /** 
208      * The difference value range for double-byters.
209      */
210     private static final int SLOPE_REACH_POS_2_ = 
211         SLOPE_LEAD_2_ * SLOPE_TAIL_COUNT_ + SLOPE_LEAD_2_ - 1;
212     private static final int SLOPE_REACH_NEG_2_ = (-SLOPE_REACH_POS_2_ - 1);
213 
214     /** 
215      * The difference value range for 3-byters.
216      */
217     private static final int SLOPE_REACH_POS_3_ = SLOPE_LEAD_3_ 
218         * SLOPE_TAIL_COUNT_ 
219         * SLOPE_TAIL_COUNT_ 
220         + (SLOPE_LEAD_3_ - 1)
221         * SLOPE_TAIL_COUNT_ +
222         (SLOPE_TAIL_COUNT_ - 1);
223     private static final int SLOPE_REACH_NEG_3_ = (-SLOPE_REACH_POS_3_ - 1);
224 
225     /** 
226      * The lead byte start values.
227      */
228     private static final int SLOPE_START_POS_2_ = SLOPE_MIDDLE_ 
229         + SLOPE_SINGLE_ + 1;
230     private static final int SLOPE_START_POS_3_ = SLOPE_START_POS_2_ 
231         + SLOPE_LEAD_2_;
232     private static final int SLOPE_START_NEG_2_ = SLOPE_MIDDLE_ + 
233         SLOPE_REACH_NEG_1_;
234     private static final int SLOPE_START_NEG_3_ = SLOPE_START_NEG_2_
235         - SLOPE_LEAD_2_;
236                                                                                                         
237     // private constructor ---------------------------------------------------
238         
239     /**
240      * Constructor private to prevent initialization
241      */
242     ///CLOVER:OFF
243     private BOCU()
244     {
245     }            
246     ///CLOVER:ON                                                                                       
247     
248     // private methods -------------------------------------------------------
249     
250     /**
251      * Integer division and modulo with negative numerators
252      * yields negative modulo results and quotients that are one more than
253      * what we need here.
254      * @param number which operations are to be performed on
255      * @param factor the factor to use for division
256      * @return (result of division) << 32 | modulo 
257      */
258     private static final long getNegDivMod(int number, int factor) 
259     {
260         int modulo = number % factor; 
261         long result = number / factor;
262         if (modulo < 0) { 
263             -- result; 
264             modulo += factor; 
265         } 
266         return (result << 32) | modulo;
267     }
268         
269     /**
270      * Encode one difference value -0x10ffff..+0x10ffff in 1..3 bytes,
271      * preserving lexical order
272      * @param diff
273      * @param buffer byte buffer to append to
274      * @param offset to the byte buffer to start appending
275      * @return end offset where the appending stops
276      */
277     private static final int writeDiff(int diff, byte buffer[], int offset) 
278     {
279         if (diff >= SLOPE_REACH_NEG_1_) {
280             if (diff <= SLOPE_REACH_POS_1_) {
281                 buffer[offset ++] = (byte)(SLOPE_MIDDLE_ + diff);
282             } 
283             else if (diff <= SLOPE_REACH_POS_2_) {
284                 buffer[offset ++] = (byte)(SLOPE_START_POS_2_ 
285                                            + (diff / SLOPE_TAIL_COUNT_));
286                 buffer[offset ++] = (byte)(SLOPE_MIN_ + 
287                                            (diff % SLOPE_TAIL_COUNT_));
288             } 
289             else if (diff <= SLOPE_REACH_POS_3_) {
290                 buffer[offset + 2] = (byte)(SLOPE_MIN_ 
291                                             + (diff % SLOPE_TAIL_COUNT_));
292                 diff /= SLOPE_TAIL_COUNT_;
293                 buffer[offset + 1] = (byte)(SLOPE_MIN_ 
294                                             + (diff % SLOPE_TAIL_COUNT_));
295                 buffer[offset] = (byte)(SLOPE_START_POS_3_ 
296                                         + (diff / SLOPE_TAIL_COUNT_));
297                 offset += 3;
298             } 
299             else {
300                 buffer[offset + 3] = (byte)(SLOPE_MIN_ 
301                                             + diff % SLOPE_TAIL_COUNT_);
302                 diff /= SLOPE_TAIL_COUNT_;
303                 buffer[offset] = (byte)(SLOPE_MIN_ 
304                                         + diff % SLOPE_TAIL_COUNT_);
305                 diff /= SLOPE_TAIL_COUNT_;
306                 buffer[offset + 1] = (byte)(SLOPE_MIN_ 
307                                             + diff % SLOPE_TAIL_COUNT_);
308                 buffer[offset] = (byte)SLOPE_MAX_;
309                 offset += 4;
310             }
311         } 
312         else {
313             long division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
314             int modulo = (int)division;
315             if (diff >= SLOPE_REACH_NEG_2_) {
316                 diff = (int)(division >> 32);
317                 buffer[offset ++] = (byte)(SLOPE_START_NEG_2_ + diff);
318                 buffer[offset ++] = (byte)(SLOPE_MIN_ + modulo);
319             } 
320             else if (diff >= SLOPE_REACH_NEG_3_) {
321                 buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo);
322                 diff = (int)(division >> 32);
323                 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
324                 modulo = (int)division;
325                 diff = (int)(division >> 32);
326                 buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo);
327                 buffer[offset] = (byte)(SLOPE_START_NEG_3_ + diff);
328                 offset += 3;
329             } 
330             else {
331                 buffer[offset + 3] = (byte)(SLOPE_MIN_ + modulo);
332                 diff = (int)(division >> 32);
333                 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
334                 modulo = (int)division;
335                 diff = (int)(division >> 32);
336                 buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo);
337                 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
338                 modulo = (int)division;
339                 buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo);
340                 buffer[offset] = SLOPE_MIN_;
341                 offset += 4;
342             }
343         }
344         return offset;
345     }
346         
347     /**
348      * How many bytes would writeDiff() write? 
349      * @param diff
350      */
351     private static final int lengthOfDiff(int diff) 
352     {
353         if (diff >= SLOPE_REACH_NEG_1_) {
354             if (diff <= SLOPE_REACH_POS_1_) {
355                 return 1;
356             } 
357             else if (diff <= SLOPE_REACH_POS_2_) {
358                 return 2;
359             } 
360             else if(diff <= SLOPE_REACH_POS_3_) {
361                 return 3;
362             } 
363             else {
364                 return 4;
365             }
366         } 
367         else {
368             if (diff >= SLOPE_REACH_NEG_2_) {
369                 return 2;
370             } 
371             else if (diff >= SLOPE_REACH_NEG_3_) {
372                 return 3;
373             } 
374             else {
375                 return 4;
376             }
377         }
378     }
379 }
380
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags