KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > impl > TrieIterator


1 /*
2 ******************************************************************************
3 * Copyright (C) 1996-2004, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ******************************************************************************
6 */

7
8 package com.ibm.icu.impl;
9
10 import com.ibm.icu.lang.UCharacter;
11 import com.ibm.icu.text.UTF16;
12 import com.ibm.icu.util.RangeValueIterator;
13
14 /**
15  * <p>Class enabling iteration of the values in a Trie.</p>
16  * <p>Result of each iteration contains the interval of codepoints that have
17  * the same value type and the value type itself.</p>
18  * <p>The comparison of each codepoint value is done via extract(), which the
19  * default implementation is to return the value as it is.</p>
20  * <p>Method extract() can be overwritten to perform manipulations on
21  * codepoint values in order to perform specialized comparison.</p>
22  * <p>TrieIterator is designed to be a generic iterator for the CharTrie
23  * and the IntTrie, hence to accommodate both types of data, the return
24  * result will be in terms of int (32 bit) values.</p>
25  * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p>
26  * <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
27  * Internally, icu4c's utrie_enum performs all iterations in its body. In Java
28  * sense, the caller will have to pass a object with a callback function
29  * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
30  * uint32_t value) into utrie_enum. utrie_enum will then find ranges of
31  * codepoints with the same value as determined by
32  * UTrieEnumValue(const void *context, uint32_t value). for each range,
33  * utrie_enum calls the callback function to perform a task. In this way,
34  * icu4c performs the iteration within utrie_enum.
35  * To follow the JDK model, icu4j is slightly different from icu4c.
36  * Instead of requesting the caller to implement an object for a callback.
37  * The caller will have to implement a subclass of TrieIterator, fleshing out
38  * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
39  * the caller will have to code his own iteration and flesh out the task
40  * (equivalent to UTrieEnumRange) to be performed in the iteration loop.
41  * </p>
42  * <p>There are basically 3 usage scenarios for porting:</p>
43  * <p>1) UTrieEnumValue is the only implemented callback then just implement a
44  * subclass of TrieIterator and override the extract(int) method. The
45  * extract(int) method is analogus to UTrieEnumValue callback.
46  * </p>
47  * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
48  * a subclass of TrieIterator, override the extract method and iterate, e.g
49  * </p>
50  * <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
51  * set);<br>
52  * In Java :<br>
53  * <pre>
54  * class TrieIteratorImpl extends TrieIterator{
55  * public TrieIteratorImpl(Trie data){
56  * super(data);
57  * }
58  * public int extract(int value){
59  * // port the implementation of _enumPropertyStartsValue here
60  * }
61  * }
62  * ....
63  * TrieIterator fcdIter = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
64  * while(fcdIter.next(result)) {
65  * // port the implementation of _enumPropertyStartsRange
66  * }
67  * </pre>
68  * </p>
69  * <p>3) UTrieEnumRange is the only implemented callback then just implement
70  * the while loop, when utrie_enum is called
71  * <pre>
72  * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
73  * TrieIterator fcdIter = new TrieIterator(fcdTrieImpl.fcdTrie);
74  * while(fcdIter.next(result)){
75  * set.add(result.start);
76  * }
77  * </p>
78  * @author synwee
79  * @see com.ibm.icu.impl.Trie
80  * @see com.ibm.icu.lang.UCharacterTypeIterator
81  * @since release 2.1, Jan 17 2002
82  */

83 public class TrieIterator implements RangeValueIterator
84
85 {
86     // public constructor ---------------------------------------------
87

88     /**
89     * TrieEnumeration constructor
90     * @param trie to be used
91     * @exception IllegalArgumentException throw when argument is null.
92     * @draft 2.1
93     */

94     public TrieIterator(Trie trie)
95     {
96         if (trie == null) {
97             throw new IllegalArgumentException JavaDoc(
98                                           "Argument trie cannot be null");
99         }
100         m_trie_ = trie;
101         // synwee: check that extract belongs to the child class
102
m_initialValue_ = extract(m_trie_.getInitialValue());
103         reset();
104     }
105     
106     // public methods -------------------------------------------------
107

108     /**
109     * <p>Returns true if we are not at the end of the iteration, false
110     * otherwise.</p>
111     * <p>The next set of codepoints with the same value type will be
112     * calculated during this call and returned in the arguement element.</p>
113     * @param element return result
114     * @return true if we are not at the end of the iteration, false otherwise.
115     * @exception NoSuchElementException - if no more elements exist.
116     * @see com.ibm.icu.util.RangeValueIterator.Element
117     * @draft 2.1
118     */

119     public final boolean next(Element element)
120     {
121         if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
122             return false;
123         }
124         if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE &&
125             calculateNextBMPElement(element)) {
126             return true;
127         }
128         calculateNextSupplementaryElement(element);
129         return true;
130     }
131      
132     /**
133     * Resets the iterator to the beginning of the iteration
134     * @draft 2.1
135     */

136     public final void reset()
137     {
138         m_currentCodepoint_ = 0;
139         m_nextCodepoint_ = 0;
140         m_nextIndex_ = 0;
141         m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
142         if (m_nextBlock_ == 0) {
143             m_nextValue_ = m_initialValue_;
144         }
145         else {
146             m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
147         }
148         m_nextBlockIndex_ = 0;
149         m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
150     }
151     
152     // protected methods ----------------------------------------------
153

154     /**
155     * Called by next() to extracts a 32 bit value from a trie value
156     * used for comparison.
157     * This method is to be overwritten if special manipulation is to be done
158     * to retrieve a relevant comparison.
159     * The default function is to return the value as it is.
160     * @param value a value from the trie
161     * @return extracted value
162     * @draft 2.1
163     */

164     protected int extract(int value)
165     {
166         return value;
167     }
168     
169     // private methods ------------------------------------------------
170

171     /**
172     * Set the result values
173     * @param element return result object
174     * @param start codepoint of range
175     * @param limit (end + 1) codepoint of range
176     * @param value common value of range
177     */

178     private final void setResult(Element element, int start, int limit,
179                                  int value)
180     {
181         element.start = start;
182         element.limit = limit;
183         element.value = value;
184     }
185     
186     /**
187     * Finding the next element.
188     * This method is called just before returning the result of
189     * next().
190     * We always store the next element before it is requested.
191     * In the case that we have to continue calculations into the
192     * supplementary planes, a false will be returned.
193     * @param element return result object
194     * @return true if the next range is found, false if we have to proceed to
195     * the supplementary range.
196     */

197     private final boolean calculateNextBMPElement(Element element)
198     {
199         int currentBlock = m_nextBlock_;
200         int currentValue = m_nextValue_;
201         m_currentCodepoint_ = m_nextCodepoint_;
202         m_nextCodepoint_ ++;
203         m_nextBlockIndex_ ++;
204         if (!checkBlockDetail(currentValue)) {
205             setResult(element, m_currentCodepoint_, m_nextCodepoint_,
206                       currentValue);
207             return true;
208         }
209         // synwee check that next block index == 0 here
210
// enumerate BMP - the main loop enumerates data blocks
211
while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
212             m_nextIndex_ ++;
213             // because of the way the character is split to form the index
214
// the lead surrogate and trail surrogate can not be in the
215
// mid of a block
216
if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
217                 // skip lead surrogate code units,
218
// go to lead surrogate codepoints
219
m_nextIndex_ = BMP_INDEX_LENGTH_;
220             }
221             else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
222                 // go back to regular BMP code points
223
m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
224             }
225             
226             m_nextBlockIndex_ = 0;
227             if (!checkBlock(currentBlock, currentValue)) {
228                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
229                           currentValue);
230                 return true;
231             }
232         }
233         m_nextCodepoint_ --; // step one back since this value has not been
234
m_nextBlockIndex_ --; // retrieved yet.
235
return false;
236     }
237
238     /**
239     * Finds the next supplementary element.
240     * For each entry in the trie, the value to be delivered is passed through
241     * extract().
242     * We always store the next element before it is requested.
243     * Called after calculateNextBMP() completes its round of BMP characters.
244     * There is a slight difference in the usage of m_currentCodepoint_
245     * here as compared to calculateNextBMP(). Though both represents the
246     * lower bound of the next element, in calculateNextBMP() it gets set
247     * at the start of any loop, where-else, in calculateNextSupplementary()
248     * since m_currentCodepoint_ already contains the lower bound of the
249     * next element (passed down from calculateNextBMP()), we keep it till
250     * the end before resetting it to the new value.
251     * Note, if there are no more iterations, it will never get to here.
252     * Blocked out by next().
253     * @param element return result object
254     * @draft 2.1
255     */

256     private final void calculateNextSupplementaryElement(Element element)
257     {
258         int currentValue = m_nextValue_;
259         int currentBlock = m_nextBlock_;
260         m_nextCodepoint_ ++;
261         m_nextBlockIndex_ ++;
262         
263         if (UTF16.getTrailSurrogate(m_nextCodepoint_)
264                                         != UTF16.TRAIL_SURROGATE_MIN_VALUE) {
265             // this piece is only called when we are in the middle of a lead
266
// surrogate block
267
if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
268                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
269                           currentValue);
270                 m_currentCodepoint_ = m_nextCodepoint_;
271                 return;
272             }
273             // we have cleared one block
274
m_nextIndex_ ++;
275             m_nextTrailIndexOffset_ ++;
276             if (!checkTrailBlock(currentBlock, currentValue)) {
277                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
278                           currentValue);
279                 m_currentCodepoint_ = m_nextCodepoint_;
280                 return;
281             }
282         }
283         int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
284         // enumerate supplementary code points
285
while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
286             // lead surrogate access
287
int leadBlock =
288                    m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
289                                                    Trie.INDEX_STAGE_2_SHIFT_;
290             if (leadBlock == m_trie_.m_dataOffset_) {
291                 // no entries for a whole block of lead surrogates
292
if (currentValue != m_initialValue_) {
293                     m_nextValue_ = m_initialValue_;
294                     m_nextBlock_ = 0;
295                     m_nextBlockIndex_ = 0;
296                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
297                               currentValue);
298                     m_currentCodepoint_ = m_nextCodepoint_;
299                     return;
300                 }
301
302                 nextLead += DATA_BLOCK_LENGTH_;
303                 // number of total affected supplementary codepoints in one
304
// block
305
// this is not a simple addition of
306
// DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
307
// that we might have moved some of the codepoints
308
m_nextCodepoint_ = UCharacterProperty.getRawSupplementary(
309                                      (char)nextLead,
310                                      (char)UTF16.TRAIL_SURROGATE_MIN_VALUE);
311                 continue;
312             }
313             if (m_trie_.m_dataManipulate_ == null) {
314                 throw new NullPointerException JavaDoc(
315                             "The field DataManipulate in this Trie is null");
316             }
317             // enumerate trail surrogates for this lead surrogate
318
m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
319                                m_trie_.getValue(leadBlock +
320                                    (nextLead & Trie.INDEX_STAGE_3_MASK_)));
321             if (m_nextIndex_ <= 0) {
322                 // no data for this lead surrogate
323
if (currentValue != m_initialValue_) {
324                     m_nextValue_ = m_initialValue_;
325                     m_nextBlock_ = 0;
326                     m_nextBlockIndex_ = 0;
327                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
328                               currentValue);
329                     m_currentCodepoint_ = m_nextCodepoint_;
330                     return;
331                 }
332                 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
333             } else {
334                 m_nextTrailIndexOffset_ = 0;
335                 if (!checkTrailBlock(currentBlock, currentValue)) {
336                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
337                               currentValue);
338                     m_currentCodepoint_ = m_nextCodepoint_;
339                     return;
340                 }
341             }
342             nextLead ++;
343          }
344
345          // deliver last range
346
setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1,
347                    currentValue);
348     }
349     
350     /**
351     * Internal block value calculations
352     * Performs calculations on a data block to find codepoints in m_nextBlock_
353     * after the index m_nextBlockIndex_ that has the same value.
354     * Note m_*_ variables at this point is the next codepoint whose value
355     * has not been calculated.
356     * But when returned with false, it will be the last codepoint whose
357     * value has been calculated.
358     * @param currentValue the value which other codepoints are tested against
359     * @return true if the whole block has the same value as currentValue or if
360     * the whole block has been calculated, false otherwise.
361     */

362     private final boolean checkBlockDetail(int currentValue)
363     {
364         while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
365             m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ +
366                                                     m_nextBlockIndex_));
367             if (m_nextValue_ != currentValue) {
368                 return false;
369             }
370             ++ m_nextBlockIndex_;
371             ++ m_nextCodepoint_;
372         }
373         return true;
374     }
375     
376     /**
377     * Internal block value calculations
378     * Performs calculations on a data block to find codepoints in m_nextBlock_
379     * that has the same value.
380     * Will call checkBlockDetail() if highlevel check fails.
381     * Note m_*_ variables at this point is the next codepoint whose value
382     * has not been calculated.
383     * @param currentBlock the initial block containing all currentValue
384     * @param currentValue the value which other codepoints are tested against
385     * @return true if the whole block has the same value as currentValue or if
386     * the whole block has been calculated, false otherwise.
387     */

388     private final boolean checkBlock(int currentBlock, int currentValue)
389     {
390         m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] <<
391                                                   Trie.INDEX_STAGE_2_SHIFT_;
392         if (m_nextBlock_ == currentBlock &&
393             (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
394             // the block is the same as the previous one, filled with
395
// currentValue
396
m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
397         }
398         else if (m_nextBlock_ == 0) {
399             // this is the all-initial-value block
400
if (currentValue != m_initialValue_) {
401                 m_nextValue_ = m_initialValue_;
402                 m_nextBlockIndex_ = 0;
403                 return false;
404             }
405             m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
406         }
407         else {
408             if (!checkBlockDetail(currentValue)) {
409                 return false;
410             }
411         }
412         return true;
413     }
414     
415     /**
416     * Internal block value calculations
417     * Performs calculations on multiple data blocks for a set of trail
418     * surrogates to find codepoints in m_nextBlock_ that has the same value.
419     * Will call checkBlock() for internal block checks.
420     * Note m_*_ variables at this point is the next codepoint whose value
421     * has not been calculated.
422     * @param currentBlock the initial block containing all currentValue
423     * @param currentValue the value which other codepoints are tested against
424     * @return true if the whole block has the same value as currentValue or if
425     * the whole block has been calculated, false otherwise.
426     */

427     private final boolean checkTrailBlock(int currentBlock,
428                                           int currentValue)
429     {
430         // enumerate code points for this lead surrogate
431
while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_)
432         {
433             // if we ever reach here, we are at the start of a new block
434
m_nextBlockIndex_ = 0;
435             // copy of most of the body of the BMP loop
436
if (!checkBlock(currentBlock, currentValue)) {
437                 return false;
438             }
439             m_nextTrailIndexOffset_ ++;
440             m_nextIndex_ ++;
441         }
442         return true;
443     }
444     
445     /**
446     * Checks if we are beginning at the start of a initial block.
447     * If we are then the rest of the codepoints in this initial block
448     * has the same values.
449     * We increment m_nextCodepoint_ and relevant data members if so.
450     * This is used only in for the supplementary codepoints because
451     * the offset to the trail indexes could be 0.
452     * @return true if we are at the start of a initial block.
453     */

454     private final boolean checkNullNextTrailIndex()
455     {
456         if (m_nextIndex_ <= 0) {
457             m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
458             int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
459             int leadBlock =
460                    m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
461                                                    Trie.INDEX_STAGE_2_SHIFT_;
462             if (m_trie_.m_dataManipulate_ == null) {
463                 throw new NullPointerException JavaDoc(
464                             "The field DataManipulate in this Trie is null");
465             }
466             m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
467                                m_trie_.getValue(leadBlock +
468                                    (nextLead & Trie.INDEX_STAGE_3_MASK_)));
469             m_nextIndex_ --;
470             m_nextBlockIndex_ = DATA_BLOCK_LENGTH_;
471             return true;
472         }
473         return false;
474     }
475
476     // private data members --------------------------------------------
477

478     /**
479     * Size of the stage 1 BMP indexes
480     */

481     private static final int BMP_INDEX_LENGTH_ =
482                                         0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
483     /**
484     * Lead surrogate minimum value
485     */

486     private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
487     /**
488     * Trail surrogate minimum value
489     */

490     private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
491     /**
492     * Trail surrogate maximum value
493     */

494     private static final int TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF;
495     /**
496     * Number of trail surrogate
497     */

498     private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
499     /**
500     * Number of stage 1 indexes for supplementary calculations that maps to
501     * each lead surrogate character.
502     * See second pass into getRawOffset for the trail surrogate character.
503     * 10 for significant number of bits for trail surrogates, 5 for what we
504     * discard during shifting.
505     */

506     private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ =
507                                     1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
508     /**
509     * Number of data values in a stage 2 (data array) block.
510     */

511     private static final int DATA_BLOCK_LENGTH_ =
512                                               1 << Trie.INDEX_STAGE_1_SHIFT_;
513     /**
514     * Number of codepoints in a stage 2 block
515     */

516     private static final int DATA_BLOCK_SUPPLEMENTARY_LENGTH_ =
517                                                      DATA_BLOCK_LENGTH_ << 10;
518     /**
519     * Trie instance
520     */

521     private Trie m_trie_;
522     /**
523     * Initial value for trie values
524     */

525     private int m_initialValue_;
526     /**
527     * Next element results and data.
528     */

529     private int m_currentCodepoint_;
530     private int m_nextCodepoint_;
531     private int m_nextValue_;
532     private int m_nextIndex_;
533     private int m_nextBlock_;
534     private int m_nextBlockIndex_;
535     private int m_nextTrailIndexOffset_;
536     /**
537     * This is the return result element
538     */

539     private int m_start_;
540     private int m_limit_;
541     private int m_value_;
542 }
543
Popular Tags