KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > impl > Trie


1 /*
2 ******************************************************************************
3 * Copyright (C) 1996-2006, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ******************************************************************************
6 */

7
8 package com.ibm.icu.impl;
9
10 import java.io.InputStream JavaDoc;
11 import java.io.DataInputStream JavaDoc;
12 import java.io.IOException JavaDoc;
13 import java.util.Arrays JavaDoc;
14 import com.ibm.icu.text.UTF16;
15 import com.ibm.icu.lang.UCharacter;
16
17 /**
18  * <p>A trie is a kind of compressed, serializable table of values
19  * associated with Unicode code points (0..0x10ffff).</p>
20  * <p>This class defines the basic structure of a trie and provides methods
21  * to <b>retrieve the offsets to the actual data</b>.</p>
22  * <p>Data will be the form of an array of basic types, char or int.</p>
23  * <p>The actual data format will have to be specified by the user in the
24  * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
25  * <p>This trie implementation is optimized for getting offset while walking
26  * forward through a UTF-16 string.
27  * Therefore, the simplest and fastest access macros are the
28  * fromLead() and fromOffsetTrail() methods.
29  * The fromBMP() method are a little more complicated; they get offsets even
30  * for lead surrogate codepoints, while the fromLead() method get special
31  * "folded" offsets for lead surrogate code units if there is relevant data
32  * associated with them.
33  * From such a folded offsets, an offset needs to be extracted to supply
34  * to the fromOffsetTrail() methods.
35  * To handle such supplementary codepoints, some offset information are kept
36  * in the data.</p>
37  * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
38  * that offset from the folded value for the lead surrogate unit.</p>
39  * <p>For examples of use, see com.ibm.icu.impl.CharTrie or
40  * com.ibm.icu.impl.IntTrie.</p>
41  * @author synwee
42  * @see com.ibm.icu.impl.CharTrie
43  * @see com.ibm.icu.impl.IntTrie
44  * @since release 2.1, Jan 01 2002
45  */

46 public abstract class Trie
47 {
48     // public class declaration ----------------------------------------
49

50     /**
51     * Character data in com.ibm.impl.Trie have different user-specified format
52     * for different purposes.
53     * This interface specifies methods to be implemented in order for
54     * com.ibm.impl.Trie, to surrogate offset information encapsulated within
55     * the data.
56     * @draft 2.1
57     */

58     public static interface DataManipulate
59     {
60         /**
61         * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
62         * data
63         * the index array offset of the indexes for that lead surrogate.
64         * @param value data value for a surrogate from the trie, including the
65         * folding offset
66         * @return data offset or 0 if there is no data for the lead surrogate
67         * @draft 2.1
68         */

69         public int getFoldingOffset(int value);
70     }
71
72     // default implementation
73
private static class DefaultGetFoldingOffset implements DataManipulate {
74         public int getFoldingOffset(int value) {
75             return value;
76         }
77     }
78
79     // public methods --------------------------------------------------
80

81     /**
82      * Determines if this trie has a linear latin 1 array
83      * @return true if this trie has a linear latin 1 array, false otherwise
84      */

85     public final boolean isLatin1Linear()
86     {
87         return m_isLatin1Linear_;
88     }
89     
90     /**
91      * Checks if the argument Trie has the same data as this Trie.
92      * Attributes are checked but not the index data.
93      * @param other Trie to check
94      * @return true if the argument Trie has the same data as this Trie, false
95      * otherwise
96      */

97     ///CLOVER:OFF
98
public boolean equals(Object JavaDoc other)
99     {
100         if (other == this) {
101             return true;
102         }
103         if (!(other instanceof Trie)) {
104             return false;
105         }
106         Trie othertrie = (Trie)other;
107         return m_isLatin1Linear_ == othertrie.m_isLatin1Linear_
108                && m_options_ == othertrie.m_options_
109                && m_dataLength_ == othertrie.m_dataLength_
110                && Arrays.equals(m_index_, othertrie.m_index_);
111     }
112     ///CLOVER:ON
113

114     /**
115      * Gets the serialized data file size of the Trie. This is used during
116      * trie data reading for size checking purposes.
117      * @return size size of serialized trie data file in terms of the number
118      * of bytes
119      */

120     public int getSerializedDataSize()
121     {
122         // includes signature, option, dataoffset and datalength output
123
int result = (4 << 2);
124         result += (m_dataOffset_ << 1);
125         if (isCharTrie()) {
126             result += (m_dataLength_ << 1);
127         }
128         else if (isIntTrie()) {
129             result += (m_dataLength_ << 2);
130         }
131         return result;
132     }
133
134     // protected constructor -------------------------------------------
135

136     /**
137     * Trie constructor for CharTrie use.
138     * @param inputStream ICU data file input stream which contains the
139     * trie
140     * @param dataManipulate object containing the information to parse the
141     * trie data
142     * @throws IOException thrown when input stream does not have the
143     * right header.
144     * @draft 2.1
145     */

146     protected Trie(InputStream JavaDoc inputStream,
147                    DataManipulate dataManipulate) throws IOException JavaDoc
148     {
149         DataInputStream JavaDoc input = new DataInputStream JavaDoc(inputStream);
150         // Magic number to authenticate the data.
151
int signature = input.readInt();
152         m_options_ = input.readInt();
153         
154         if (!checkHeader(signature)) {
155             throw new IllegalArgumentException JavaDoc("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
156         }
157
158         if(dataManipulate != null) {
159             m_dataManipulate_ = dataManipulate;
160         } else {
161             m_dataManipulate_ = new DefaultGetFoldingOffset();
162         }
163         m_isLatin1Linear_ = (m_options_ &
164                              HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
165         m_dataOffset_ = input.readInt();
166         m_dataLength_ = input.readInt();
167         unserialize(inputStream);
168     }
169     
170     /**
171     * Trie constructor
172     * @param index array to be used for index
173     * @param options used by the trie
174     * @param dataManipulate object containing the information to parse the
175     * trie data
176     * @draft 2.2
177     */

178     protected Trie(char index[], int options, DataManipulate dataManipulate)
179     {
180         m_options_ = options;
181         if(dataManipulate != null) {
182             m_dataManipulate_ = dataManipulate;
183         } else {
184             m_dataManipulate_ = new DefaultGetFoldingOffset();
185         }
186         m_isLatin1Linear_ = (m_options_ &
187                              HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
188         m_index_ = index;
189         m_dataOffset_ = m_index_.length;
190     }
191
192
193     // protected data members ------------------------------------------
194

195     /**
196     * Lead surrogate code points' index displacement in the index array.
197     * 0x10000-0xd800=0x2800
198     * 0x2800 >> INDEX_STAGE_1_SHIFT_
199     */

200     protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
201     /**
202     * Shift size for shifting right the input index. 1..9
203     * @draft 2.1
204     */

205     protected static final int INDEX_STAGE_1_SHIFT_ = 5;
206     /**
207     * Shift size for shifting left the index array values.
208     * Increases possible data size with 16-bit index values at the cost
209     * of compactability.
210     * This requires blocks of stage 2 data to be aligned by
211     * DATA_GRANULARITY.
212     * 0..INDEX_STAGE_1_SHIFT
213     * @draft 2.1
214     */

215     protected static final int INDEX_STAGE_2_SHIFT_ = 2;
216     /**
217      * Number of data values in a stage 2 (data array) block.
218      */

219     protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
220     /**
221     * Mask for getting the lower bits from the input index.
222     * DATA_BLOCK_LENGTH - 1.
223     * @draft 2.1
224     */

225     protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
226     /** Number of bits of a trail surrogate that are used in index table lookups. */
227     protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;
228     /**
229      * Number of index (stage 1) entries per lead surrogate.
230      * Same as number of index entries for 1024 trail surrogates,
231      * ==0x400>>INDEX_STAGE_1_SHIFT_
232      */

233     protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);
234     /** Length of the BMP portion of the index (stage 1) array. */
235     protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;
236     /**
237     * Surrogate mask to use when shifting offset to retrieve supplementary
238     * values
239     * @draft 2.1
240     */

241     protected static final int SURROGATE_MASK_ = 0x3FF;
242     /**
243     * Index or UTF16 characters
244     * @draft 2.1
245     */

246     protected char m_index_[];
247     /**
248     * Internal TrieValue which handles the parsing of the data value.
249     * This class is to be implemented by the user
250     * @draft 2.1
251     */

252     protected DataManipulate m_dataManipulate_;
253     /**
254     * Start index of the data portion of the trie. CharTrie combines
255     * index and data into a char array, so this is used to indicate the
256     * initial offset to the data portion.
257     * Note this index always points to the initial value.
258     * @draft 2.1
259     */

260     protected int m_dataOffset_;
261     /**
262     * Length of the data array
263     */

264     protected int m_dataLength_;
265      
266     // protected methods -----------------------------------------------
267

268     /**
269     * Gets the offset to the data which the surrogate pair points to.
270     * @param lead lead surrogate
271     * @param trail trailing surrogate
272     * @return offset to data
273     * @draft 2.1
274     */

275     protected abstract int getSurrogateOffset(char lead, char trail);
276     
277     /**
278     * Gets the value at the argument index
279     * @param index value at index will be retrieved
280     * @return 32 bit value
281     * @draft 2.1
282     */

283     protected abstract int getValue(int index);
284
285     /**
286     * Gets the default initial value
287     * @return 32 bit value
288     * @draft 2.1
289     */

290     protected abstract int getInitialValue();
291     
292     /**
293     * Gets the offset to the data which the index ch after variable offset
294     * points to.
295     * Note for locating a non-supplementary character data offset, calling
296     * <p>
297     * getRawOffset(0, ch);
298     * </p>
299     * will do. Otherwise if it is a supplementary character formed by
300     * surrogates lead and trail. Then we would have to call getRawOffset()
301     * with getFoldingIndexOffset(). See getSurrogateOffset().
302     * @param offset index offset which ch is to start from
303     * @param ch index to be used after offset
304     * @return offset to the data
305     * @draft 2.1
306     */

307     protected final int getRawOffset(int offset, char ch)
308     {
309         return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
310                 << INDEX_STAGE_2_SHIFT_)
311                 + (ch & INDEX_STAGE_3_MASK_);
312     }
313     
314     /**
315     * Gets the offset to data which the BMP character points to
316     * Treats a lead surrogate as a normal code point.
317     * @param ch BMP character
318     * @return offset to data
319     * @draft 2.1
320     */

321     protected final int getBMPOffset(char ch)
322     {
323         return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
324                 && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
325                 ? getRawOffset(LEAD_INDEX_OFFSET_, ch)
326                 : getRawOffset(0, ch);
327                 // using a getRawOffset(ch) makes no diff
328
}
329
330     /**
331     * Gets the offset to the data which this lead surrogate character points
332     * to.
333     * Data at the returned offset may contain folding offset information for
334     * the next trailing surrogate character.
335     * @param ch lead surrogate character
336     * @return offset to data
337     * @draft 2.1
338     */

339     protected final int getLeadOffset(char ch)
340     {
341        return getRawOffset(0, ch);
342     }
343
344     /**
345     * Internal trie getter from a code point.
346     * Could be faster(?) but longer with
347     * if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }
348     * Gets the offset to data which the codepoint points to
349     * @param ch codepoint
350     * @return offset to data
351     * @draft 2.1
352     */

353     protected final int getCodePointOffset(int ch)
354     {
355         // if ((ch >> 16) == 0) slower
356
if (ch < 0) {
357             return -1;
358         } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
359             // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
360
return getRawOffset(0, (char)ch);
361         } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
362             // BMP codepoint
363
return getBMPOffset((char)ch);
364         } else if (ch <= UCharacter.MAX_VALUE) {
365             // look at the construction of supplementary characters
366
// trail forms the ends of it.
367
return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
368                                       (char)(ch & SURROGATE_MASK_));
369         } else {
370             // return -1 if there is an error, in this case we return
371
return -1;
372         }
373     }
374
375     /**
376     * <p>Parses the inputstream and creates the trie index with it.</p>
377     * <p>This is overwritten by the child classes.
378     * @param inputStream input stream containing the trie information
379     * @exception IOException thrown when data reading fails.
380     * @draft 2.1
381     */

382     protected void unserialize(InputStream JavaDoc inputStream) throws IOException JavaDoc
383     {
384         //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
385
m_index_ = new char[m_dataOffset_];
386         DataInputStream JavaDoc input = new DataInputStream JavaDoc(inputStream);
387         for (int i = 0; i < m_dataOffset_; i ++) {
388              m_index_[i] = input.readChar();
389         }
390     }
391
392     /**
393     * Determines if this is a 32 bit trie
394     * @return true if options specifies this is a 32 bit trie
395     * @draft 2.1
396     */

397     protected final boolean isIntTrie()
398     {
399         return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
400     }
401
402     /**
403     * Determines if this is a 16 bit trie
404     * @return true if this is a 16 bit trie
405     * @draft 2.1
406     */

407     protected final boolean isCharTrie()
408     {
409         return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
410     }
411
412     // private data members --------------------------------------------
413

414     // struct UTrieHeader {
415
// int32_t signature;
416
// int32_t options (a bit field)
417
// int32_t indexLength
418
// int32_t dataLength
419

420     /**
421     * Size of Trie header in bytes
422     */

423     protected static final int HEADER_LENGTH_ = 4 * 4;
424     /**
425     * Latin 1 option mask
426     */

427     protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
428     /**
429     * Constant number to authenticate the byte block
430     */

431     protected static final int HEADER_SIGNATURE_ = 0x54726965;
432     /**
433     * Header option formatting
434     */

435     private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
436     protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
437     protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
438     
439     /**
440     * Flag indicator for Latin quick access data block
441     */

442     private boolean m_isLatin1Linear_;
443     
444     /**
445     * <p>Trie options field.</p>
446     * <p>options bit field:<br>
447     * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
448     * 8 0 = 16-bit data, 1=32-bit data<br>
449     * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br>
450     * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br>
451     */

452     private int m_options_;
453     
454     // private methods ---------------------------------------------------
455

456     /**
457     * Authenticates raw data header.
458     * Checking the header information, signature and options.
459     * @param signature This contains the options and type of a Trie
460     * @return true if the header is authenticated valid
461     * @draft 2.1
462     */

463     private final boolean checkHeader(int signature)
464     {
465         // check the signature
466
// Trie in big-endian US-ASCII (0x54726965).
467
// Magic number to authenticate the data.
468
if (signature != HEADER_SIGNATURE_) {
469             return false;
470         }
471
472         if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
473                                                     INDEX_STAGE_1_SHIFT_ ||
474             ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
475                                                 HEADER_OPTIONS_SHIFT_MASK_)
476                                                  != INDEX_STAGE_2_SHIFT_) {
477             return false;
478         }
479         return true;
480     }
481 }
482
Popular Tags