CollationElementIterator


1   /**
2   *******************************************************************************
3   * Copyright (C) 1996-2005, International Business Machines Corporation and    *
4   * others. All Rights Reserved.                                                *
5   *******************************************************************************
6   *
7   *
8   *******************************************************************************
9   */
10  package com.ibm.icu.text;
11  
12  /***
13   * import java.text.StringCharacterIterator;
14   * import java.text.CharacterIterator;
15   */
16  import com.ibm.icu.impl.NormalizerImpl;
17  import com.ibm.icu.impl.UCharacterProperty;
18  import com.ibm.icu.impl.StringUCharacterIterator;
19  import com.ibm.icu.impl.CharacterIteratorWrapper;
20  import com.ibm.icu.impl.ICUDebug;
21  import com.ibm.icu.lang.UCharacter;
22  import java.text.CharacterIterator  ;
23  import java.util.MissingResourceException  ;
24  
25  /**
26   * <p><code>CollationElementIterator</code> is an iterator created by
27   * a RuleBasedCollator to walk through a string. The return result of
28   * each iteration is a 32-bit collation element that defines the
29   * ordering priority of the next character or sequence of characters
30   * in the source string.</p>
31   *
32   * <p>For illustration, consider the following in Spanish:
33   * <blockquote>
34   * <pre>
35   * "ca" -> the first collation element is collation_element('c') and second
36   *         collation element is collation_element('a').
37   *
38   * Since "ch" in Spanish sorts as one entity, the below example returns one
39   * collation element for the two characters 'c' and 'h'
40   *
41   * "cha" -> the first collation element is collation_element('ch') and second
42   *          collation element is collation_element('a').
43   * </pre>
44   * </blockquote>
45   * And in German,
46   * <blockquote>
47   * <pre>
48   * Since the character '&#230;' is a composed character of 'a' and 'e', the
49   * iterator returns two collation elements for the single character '&#230;'
50   *
51   * "&#230;b" -> the first collation element is collation_element('a'), the
52   *              second collation element is collation_element('e'), and the
53   *              third collation element is collation_element('b').
54   * </pre>
55   * </blockquote>
56   * </p>
57   *
58   * <p>For collation ordering comparison, the collation element results
59   * cannot be compared simply by using basic arithmetic operators,
60   * e.g. &lt;, == or &gt;; further processing has to be done. Details
61   * can be found in the ICU
62   * <a HREF="http://icu.sourceforge.net/userguide/Collate_ServiceArchitecture.html">
63   * user guide</a>. An example of using the CollationElementIterator
64   * for collation ordering comparison is the class
65   * <a HREF="StringSearch.html">com.ibm.icu.text.StringSearch</a>.</p>
66   *
67   * <p>To construct a CollationElementIterator object, users
68   * call the method getCollationElementIterator() on a
69   * RuleBasedCollator that defines the desired sorting order.</p>
70   *
71   * <p> Example:
72   * <blockquote>
73   * <pre>
74   *  String testString = "This is a test";
75   *  RuleBasedCollator rbc = new RuleBasedCollator("&amp;a&lt;b");
76   *  CollationElementIterator iterator = rbc.getCollationElementIterator(testString);
77   *  int order = CollationElementIterator.IGNORABLE;
78   *  while (order != CollationElementIterator.NULLORDER) {
79   *      order = iterator.next();
80   *      if (order != CollationElementIterator.IGNORABLE &&
81   *          order != CollationElementIterator.NULLORDER) {
82   *          // order is valid, not ignorable and we have not passed the end
83   *          // of the iteration, we do something
84   *          int primaryOrder = CollationElementIterator.primaryOrder(order);
85   *          System.out.println("Next primary order 0x" +
86   *                             Integer.toHexString(primaryOrder));
87   *      }
88   *  }
89   * </pre>
90   * </blockquote>
91   * </p>
92   * <p>
93   * This class is not subclassable
94   * </p>
95   * @see Collator
96   * @see RuleBasedCollator
97   * @see StringSearch
98   * @author Syn Wee Quek
99   * @stable ICU 2.8
100  */
101 public final class CollationElementIterator
102 {
103   
104     
105     // public data members --------------------------------------------------
106 
107     /**
108      * <p>This constant is returned by the iterator in the methods
109      * next() and previous() when the end or the beginning of the
110      * source string has been reached, and there are no more valid
111      * collation elements to return.</p>
112      *
113      * <p>See class documentation for an example of use.</p>
114      * @stable ICU 2.8
115      * @see #next
116      * @see #previous */
117     public final static int NULLORDER = 0xffffffff;
118 
119     /**
120      * <p>This constant is returned by the iterator in the methods
121      * next() and previous() when a collation element result is to be
122      * ignored.</p>
123      *
124      * <p>See class documentation for an example of use.</p>
125      * @stable ICU 2.8
126      * @see #next
127      * @see #previous */
128     public static final int IGNORABLE = 0;
129 
130     // public methods -------------------------------------------------------
131 
132     // public getters -------------------------------------------------------
133 
134     /**
135      * <p>Returns the character offset in the source string
136      * corresponding to the next collation element. I.e., getOffset()
137      * returns the position in the source string corresponding to the
138      * collation element that will be returned by the next call to
139      * next(). This value could be any of:
140      * <ul>
141      * <li> The index of the <b>first</b> character corresponding to
142      * the next collation element. (This means that if
143      * <code>setOffset(offset)</code> sets the index in the middle of
144      * a contraction, <code>getOffset()</code> returns the index of
145      * the first character in the contraction, which may not be equal
146      * to the original offset that was set. Hence calling getOffset()
147      * immediately after setOffset(offset) does not guarantee that the
148      * original offset set will be returned.)
149      * <li> If normalization is on, the index of the <b>immediate</b>
150      * subsequent character, or composite character with the first
151      * character, having a combining class of 0.
152      * <li> The length of the source string, if iteration has reached
153      * the end.
154      *</ul>
155      * </p>
156      * @return The character offset in the source string corresponding to the
157      *         collation element that will be returned by the next call to
158      *         next().
159      * @stable ICU 2.8
160      */
161     public int getOffset()
162     {
163         if (m_bufferOffset_ != -1) {
164             if (m_isForwards_) {
165                 return m_FCDLimit_;
166             }
167             return m_FCDStart_;
168         }
169         return m_source_.getIndex();
170     }
171 
172 
173     /**
174      * <p> Returns the maximum length of any expansion sequence that ends with
175      * the specified collation element. If there is no expansion with this
176      * collation element as the last element, returns 1.
177      * </p>
178      * @param ce a collation element returned by previous() or next().
179      * @return the maximum length of any expansion sequence ending
180      *         with the specified collation element.
181      * @stable ICU 2.8
182      */
183     public int getMaxExpansion(int ce)
184     {
185         int start = 0;
186         int limit = m_collator_.m_expansionEndCE_.length;
187         long unsignedce = ce & 0xFFFFFFFFl;
188         while (start < limit - 1) {
189             int mid = start + ((limit - start) >> 1);
190             long midce = m_collator_.m_expansionEndCE_[mid] & 0xFFFFFFFFl;
191             if (unsignedce <= midce) {
192                 limit = mid;
193             }
194             else {
195                 start = mid;
196             }
197         }
198         int result = 1;
199         if (m_collator_.m_expansionEndCE_[start] == ce) {
200             result = m_collator_.m_expansionEndCEMaxSize_[start];
201         }
202         else if (limit < m_collator_.m_expansionEndCE_.length &&
203                  m_collator_.m_expansionEndCE_[limit] == ce) {
204             result = m_collator_.m_expansionEndCEMaxSize_[limit];
205         }
206         else if ((ce & 0xFFFF) == 0x00C0) {
207             result = 2;
208         }
209         return result;
210     }
211 
212     // public other methods -------------------------------------------------
213 
214     /**
215      * <p> Resets the cursor to the beginning of the string. The next
216      * call to next() or previous() will return the first and last
217      * collation element in the string, respectively.</p>
218      *
219      * <p>If the RuleBasedCollator used by this iterator has had its
220      * attributes changed, calling reset() will reinitialize the
221      * iterator to use the new attributes.</p>
222      *
223      * @stable ICU 2.8
224      */
225     public void reset()
226     {
227         m_source_.setToStart();
228         updateInternalState();
229     }
230 
231     /**
232      * <p>Get the next collation element in the source string.</p>
233      *
234      * <p>This iterator iterates over a sequence of collation elements
235      * that were built from the string. Because there isn't
236      * necessarily a one-to-one mapping from characters to collation
237      * elements, this doesn't mean the same thing as "return the
238      * collation element [or ordering priority] of the next character
239      * in the string".</p>
240      *
241      * <p>This function returns the collation element that the
242      * iterator is currently pointing to, and then updates the
243      * internal pointer to point to the next element.  Previous()
244      * updates the pointer first, and then returns the element. This
245      * means that when you change direction while iterating (i.e.,
246      * call next() and then call previous(), or call previous() and
247      * then call next()), you'll get back the same element twice.</p>
248      *
249      * @return the next collation element or NULLORDER if the end of the
250      *         iteration has been reached.
251      * @stable ICU 2.8
252      */
253     public int next()
254     {
255         m_isForwards_ = true;
256         if (m_CEBufferSize_ > 0) {
257             if (m_CEBufferOffset_ < m_CEBufferSize_) {
258                 // if there are expansions left in the buffer, we return it
259                 return m_CEBuffer_[m_CEBufferOffset_ ++];
260             }
261             m_CEBufferSize_ = 0;
262             m_CEBufferOffset_ = 0;
263         }
264  
265         int ch_int = nextChar();
266         
267         if (ch_int == UCharacterIterator.DONE) {
268             return NULLORDER;
269         }
270         char ch = (char)ch_int;
271         if (m_collator_.m_isHiragana4_) {
272             m_isCodePointHiragana_ = (ch >= 0x3040 && ch <= 0x309e)
273                                      && !(ch > 0x3094 && ch < 0x309d);
274         }
275 
276         int result = NULLORDER;
277         if (ch <= 0xFF) {
278             // For latin-1 characters we never need to fall back to the UCA
279             // table because all of the UCA data is replicated in the
280             // latinOneMapping array
281             result = m_collator_.m_trie_.getLatin1LinearValue(ch);
282             if (RuleBasedCollator.isSpecial(result)) {
283                 result = nextSpecial(m_collator_, result, ch);
284             }
285         }
286         else {
287             result = m_collator_.m_trie_.getLeadValue(ch);
288             //System.out.println(Integer.toHexString(result));
289             if (RuleBasedCollator.isSpecial(result)) {
290                 // surrogate leads are handled as special ces
291                 result = nextSpecial(m_collator_, result, ch);
292             }
293             if (result == CE_NOT_FOUND_ && RuleBasedCollator.UCA_ != null) {
294                 // couldn't find a good CE in the tailoring
295                 // if we got here, the codepoint MUST be over 0xFF - so we look
296                 // directly in the UCA
297                 result = RuleBasedCollator.UCA_.m_trie_.getLeadValue(ch);
298                 if (RuleBasedCollator.isSpecial(result)) {
299                     // UCA also gives us a special CE
300                     result = nextSpecial(RuleBasedCollator.UCA_, result, ch);
301                 }
302             }
303         }
304         if(result == CE_NOT_FOUND_) { 
305             // maybe there is no UCA, unlikely in Java, but ported for consistency
306             result = nextImplicit(ch); 
307         }
308         return result;
309     }
310 
311     /**
312      * <p>Get the previous collation element in the source string.</p>
313      *
314      * <p>This iterator iterates over a sequence of collation elements
315      * that were built from the string. Because there isn't
316      * necessarily a one-to-one mapping from characters to collation
317      * elements, this doesn't mean the same thing as "return the
318      * collation element [or ordering priority] of the previous
319      * character in the string".</p>
320      *
321      * <p>This function updates the iterator's internal pointer to
322      * point to the collation element preceding the one it's currently
323      * pointing to and then returns that element, while next() returns
324      * the current element and then updates the pointer. This means
325      * that when you change direction while iterating (i.e., call
326      * next() and then call previous(), or call previous() and then
327      * call next()), you'll get back the same element twice.</p>
328      *
329      * @return the previous collation element, or NULLORDER when the start of
330      *             the iteration has been reached.
331      * @stable ICU 2.8
332      */
333     public int previous()
334     {
335         if (m_source_.getIndex() <= 0 && m_isForwards_) {
336             // if iterator is new or reset, we can immediate perform  backwards
337             // iteration even when the offset is not right.
338             m_source_.setToLimit();
339             updateInternalState();
340         }
341         m_isForwards_ = false;
342         int result = NULLORDER;
343         if (m_CEBufferSize_ > 0) {
344             if (m_CEBufferOffset_ > 0) {
345                 return m_CEBuffer_[-- m_CEBufferOffset_];
346             }
347             m_CEBufferSize_ = 0;
348             m_CEBufferOffset_ = 0;
349         }
350         int ch_int = previousChar();
351         if (ch_int == UCharacterIterator.DONE) {
352             return NULLORDER;
353         }
354         char ch = (char)ch_int;
355         if (m_collator_.m_isHiragana4_) {
356             m_isCodePointHiragana_ = (ch >= 0x3040 && ch <= 0x309f);
357         }
358         if (m_collator_.isContractionEnd(ch) && !isBackwardsStart()) {
359             result = previousSpecial(m_collator_, CE_CONTRACTION_, ch);
360         }
361         else {
362             if (ch <= 0xFF) {
363                 result = m_collator_.m_trie_.getLatin1LinearValue(ch);
364             }
365             else {
366                 result = m_collator_.m_trie_.getLeadValue(ch);
367             }
368             if (RuleBasedCollator.isSpecial(result)) {
369                 result = previousSpecial(m_collator_, result, ch);
370             }
371             if (result == CE_NOT_FOUND_) {
372                 if (!isBackwardsStart()
373                     && m_collator_.isContractionEnd(ch)) {
374                     result = CE_CONTRACTION_;
375                 }
376                 else {
377                     if(RuleBasedCollator.UCA_ != null) {
378                         result = RuleBasedCollator.UCA_.m_trie_.getLeadValue(ch);
379                     }
380                 }
381 
382                 if (RuleBasedCollator.isSpecial(result)) {
383                     if(RuleBasedCollator.UCA_ != null) {                    
384                         result = previousSpecial(RuleBasedCollator.UCA_, result, ch);
385                     }
386                 }
387             }
388         }
389         if(result == CE_NOT_FOUND_) {
390             result = previousImplicit(ch);
391         }
392         return result;
393     }
394 
395     /**
396      * Return the primary order of the specified collation element,
397      * i.e. the first 16 bits.  This value is unsigned.
398      * @param ce the collation element
399      * @return the element's 16 bits primary order.
400      * @stable ICU 2.8
401      */
402     public final static int primaryOrder(int ce)
403     {
404         return (ce & RuleBasedCollator.CE_PRIMARY_MASK_)
405             >>> RuleBasedCollator.CE_PRIMARY_SHIFT_;
406     }
407     /**
408      * Return the secondary order of the specified collation element,
409      * i.e. the 16th to 23th bits, inclusive.  This value is unsigned.
410      * @param ce the collation element
411      * @return the element's 8 bits secondary order
412      * @stable ICU 2.8
413      */
414     public final static int secondaryOrder(int ce)
415     {
416         return (ce & RuleBasedCollator.CE_SECONDARY_MASK_)
417             >> RuleBasedCollator.CE_SECONDARY_SHIFT_;
418     }
419 
420     /**
421      * Return the tertiary order of the specified collation element, i.e. the last
422      * 8 bits.  This value is unsigned.
423      * @param ce the collation element
424      * @return the element's 8 bits tertiary order
425      * @stable ICU 2.8
426      */
427     public final static int tertiaryOrder(int ce)
428     {
429         return ce & RuleBasedCollator.CE_TERTIARY_MASK_;
430     }
431 
432     /**
433      * <p> Sets the iterator to point to the collation element
434      * corresponding to the character at the specified offset. The
435      * value returned by the next call to next() will be the collation
436      * element corresponding to the characters at offset.</p>
437      *
438      * <p>If offset is in the middle of a contracting character
439      * sequence, the iterator is adjusted to the start of the
440      * contracting sequence. This means that getOffset() is not
441      * guaranteed to return the same value set by this method.</p>
442      *
443      * <p>If the decomposition mode is on, and offset is in the middle
444      * of a decomposible range of source text, the iterator may not
445      * return a correct result for the next forwards or backwards
446      * iteration.  The user must ensure that the offset is not in the
447      * middle of a decomposible range.</p>
448      *
449      * @param offset the character offset into the original source string to
450      *        set. Note that this is not an offset into the corresponding
451      *        sequence of collation elements.
452      * @stable ICU 2.8
453      */
454     public void setOffset(int offset)
455     {
456         m_source_.setIndex(offset);
457         int ch_int = m_source_.current();
458         char ch = (char)ch_int;
459         if (ch_int != UCharacterIterator.DONE && m_collator_.isUnsafe(ch)) {
460             // if it is unsafe we need to check if it is part of a contraction
461             // or a surrogate character
462             if (UTF16.isTrailSurrogate(ch)) {
463                 // if it is a surrogate pair we move up one character
464                 char prevch = (char)m_source_.previous();
465                 if (!UTF16.isLeadSurrogate(prevch)) {
466                     m_source_.setIndex(offset); // go back to the same index
467                 }
468             }
469             else {
470                 // could be part of a contraction
471                 // backup to a safe point and iterate till we pass offset
472                 while (m_source_.getIndex() > 0) {
473                     if (!m_collator_.isUnsafe(ch)) {
474                         break;
475                     }
476                     ch = (char)m_source_.previous();
477                 }
478                 updateInternalState();
479                 int prevoffset = 0;
480                 while (m_source_.getIndex() <= offset) {
481                     prevoffset = m_source_.getIndex();
482                     next();
483                 }
484                 m_source_.setIndex(prevoffset);
485             }
486         }
487         updateInternalState();
488         // direction code to prevent next and previous from returning a 
489         // character if we are already at the ends
490         offset = m_source_.getIndex();
491         if (offset == 0/* m_source_.getBeginIndex() */) {
492             // preventing previous() from returning characters from the end of 
493             // the string again if we are at the beginning
494             m_isForwards_ = false; 
495         }
496         else if (offset == m_source_.getLength()) {
497             // preventing next() from returning characters from the start of 
498             // the string again if we are at the end
499             m_isForwards_ = true;
500         }
501     }
502 
503     /**
504      * <p>Set a new source string for iteration, and reset the offset
505      * to the beginning of the text.</p>
506      *
507      * @param source the new source string for iteration.
508      * @stable ICU 2.8
509      */
510     public void setText(String   source)
511     {
512         m_srcUtilIter_.setText(source);
513         m_source_ = m_srcUtilIter_;
514         updateInternalState();
515     }
516     
517     /**
518      * <p>Set a new source string iterator for iteration, and reset the
519      * offset to the beginning of the text.
520      * </p>
521      * <p>The source iterator's integrity will be preserved since a new copy
522      * will be created for use.</p>
523      * @param source the new source string iterator for iteration.
524      * @stable ICU 2.8
525      */
526     public void setText(UCharacterIterator source)
527     {
528         m_srcUtilIter_.setText(source.getText());
529         m_source_ = m_srcUtilIter_;
530         updateInternalState(); 
531     }
532 
533     /**
534      * <p>Set a new source string iterator for iteration, and reset the
535      * offset to the beginning of the text.
536      * </p>
537      * @param source the new source string iterator for iteration.
538      * @stable ICU 2.8
539      */
540     public void setText(CharacterIterator source)
541     {
542         m_source_ = new CharacterIteratorWrapper(source);
543         m_source_.setToStart();
544         updateInternalState();
545     }
546 
547     // public miscellaneous methods -----------------------------------------
548 
549     /**
550      * Tests that argument object is equals to this CollationElementIterator.
551      * Iterators are equal if the objects uses the same RuleBasedCollator,
552      * the same source text and have the same current position in iteration.
553      * @param that object to test if it is equals to this
554      *             CollationElementIterator
555      * @stable ICU 2.8
556      */
557     public boolean equals(Object   that)
558     {
559         if (that == this) {
560             return true;
561         }
562         if (that instanceof CollationElementIterator) {
563             CollationElementIterator thatceiter
564                                               = (CollationElementIterator)that;
565             if (!m_collator_.equals(thatceiter.m_collator_)) {
566                 return false;
567             }
568             // checks the text 
569             return m_source_.getIndex() == thatceiter.m_source_.getIndex()
570                    && m_source_.getText().equals(
571                                             thatceiter.m_source_.getText());
572         }
573         return false;
574     }
575 
576     // package private constructors ------------------------------------------
577 
578     /**
579      * <p>CollationElementIterator constructor. This takes a source
580      * string and a RuleBasedCollator. The iterator will walk through
581      * the source string based on the rules defined by the
582      * collator. If the source string is empty, NULLORDER will be
583      * returned on the first call to next().</p>
584      *
585      * @param source the source string.
586      * @param collator the RuleBasedCollator
587      * @stable ICU 2.8
588      */
589     CollationElementIterator(String   source, RuleBasedCollator collator)
590     {
591         m_srcUtilIter_ = new StringUCharacterIterator(source);
592         m_utilStringBuffer_ = new StringBuffer  ();
593         m_source_ = m_srcUtilIter_;
594         m_collator_ = collator;
595         m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
596         m_buffer_ = new StringBuffer  ();
597         m_utilSpecialBackUp_ = new Backup();
598         updateInternalState();
599     }
600 
601     /**
602      * <p>CollationElementIterator constructor. This takes a source
603      * character iterator and a RuleBasedCollator. The iterator will
604      * walk through the source string based on the rules defined by
605      * the collator. If the source string is empty, NULLORDER will be
606      * returned on the first call to next().</p>
607      *
608      * @param source the source string iterator.
609      * @param collator the RuleBasedCollator
610      * @stable ICU 2.8
611      */
612     CollationElementIterator(CharacterIterator source,
613                              RuleBasedCollator collator)
614     {
615         m_srcUtilIter_ = new StringUCharacterIterator();
616         m_utilStringBuffer_ = new StringBuffer  ();
617         m_source_ = new CharacterIteratorWrapper(source);
618         m_collator_ = collator;
619         m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
620         m_buffer_ = new StringBuffer  ();
621         m_utilSpecialBackUp_ = new Backup();
622         updateInternalState();
623     }
624     
625     /**
626      * <p>CollationElementIterator constructor. This takes a source
627      * character iterator and a RuleBasedCollator. The iterator will
628      * walk through the source string based on the rules defined by
629      * the collator. If the source string is empty, NULLORDER will be
630      * returned on the first call to next().</p>
631      *
632      * @param source the source string iterator.
633      * @param collator the RuleBasedCollator
634      * @stable ICU 2.8
635      */
636     CollationElementIterator(UCharacterIterator source,
637                              RuleBasedCollator collator)
638     {
639         m_srcUtilIter_ = new StringUCharacterIterator();
640         m_utilStringBuffer_ = new StringBuffer  ();
641         m_srcUtilIter_.setText(source.getText());
642         m_source_ = m_srcUtilIter_;
643         m_collator_ = collator;
644         m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
645         m_buffer_ = new StringBuffer  ();
646         m_utilSpecialBackUp_ = new Backup();
647         updateInternalState();
648     }
649 
650     // package private data members -----------------------------------------
651 
652     /**
653      * true if current codepoint was Hiragana
654      */
655     boolean m_isCodePointHiragana_;
656     /**
657      * Position in the original string that starts with a non-FCD sequence
658      */
659     int m_FCDStart_;
660     /**
661      * This is the CE from CEs buffer that should be returned.
662      * Initial value is 0.
663      * Forwards iteration will end with m_CEBufferOffset_ == m_CEBufferSize_,
664      * backwards will end with m_CEBufferOffset_ == 0.
665      * The next/previous after we reach the end/beginning of the m_CEBuffer_
666      * will cause this value to be reset to 0.
667      */
668     int m_CEBufferOffset_;
669 
670     /**
671      * This is the position to which we have stored processed CEs.
672      * Initial value is 0.
673      * The next/previous after we reach the end/beginning of the m_CEBuffer_
674      * will cause this value to be reset to 0.
675      */
676     int m_CEBufferSize_;
677     static final int CE_NOT_FOUND_ = 0xF0000000;
678     static final int CE_EXPANSION_TAG_ = 1;
679     static final int CE_CONTRACTION_TAG_ = 2;
680     /** 
681      * Collate Digits As Numbers (CODAN) implementation
682      */
683     static final int CE_DIGIT_TAG_ = 13;
684 
685     // package private methods ----------------------------------------------
686 
687     /**
688      * Sets the collator used.
689      * Internal use, all data members will be reset to the default values
690      * @param collator to set
691      */
692     void setCollator(RuleBasedCollator collator)
693     {
694         m_collator_ = collator;
695         updateInternalState();
696     }
697 
698     /**
699      * <p>Sets the iterator to point to the collation element corresponding to
700      * the specified character (the parameter is a CHARACTER offset in the
701      * original string, not an offset into its corresponding sequence of
702      * collation elements). The value returned by the next call to next()
703      * will be the collation element corresponding to the specified position
704      * in the text. Unlike the public method setOffset(int), this method does
705      * not try to readjust the offset to the start of a contracting sequence.
706      * getOffset() is guaranteed to return the same value as was passed to a
707      * preceding call to setOffset().</p>
708      * @param offset new character offset into the original text to set.
709      */
710     void setExactOffset(int offset)
711     {
712         m_source_.setIndex(offset);
713         updateInternalState();
714     }
715 
716     /**
717      * Checks if iterator is in the buffer zone
718      * @return true if iterator is in buffer zone, false otherwise
719      */
720     boolean isInBuffer()
721     {
722         return m_bufferOffset_ > 0;
723     }
724 
725    
726     /**
727      * <p>Sets the iterator to point to the collation element corresponding to
728      * the specified character (the parameter is a CHARACTER offset in the
729      * original string, not an offset into its corresponding sequence of
730      * collation elements). The value returned by the next call to next()
731      * will be the collation element corresponding to the specified position
732      * in the text. Unlike the public method setOffset(int), this method does
733      * not try to readjust the offset to the start of a contracting sequence.
734      * getOffset() is guaranteed to return the same value as was passed to a
735      * preceding call to setOffset().</p>
736      * </p>
737      * @param source the new source string iterator for iteration.
738      * @param offset to the source
739      */
740     void setText(UCharacterIterator source, int offset)
741     {
742         m_srcUtilIter_.setText(source.getText());
743         m_source_ = m_srcUtilIter_;
744         m_source_.setIndex(offset);
745         updateInternalState();
746     }
747 
748     // private inner class --------------------------------------------------
749 
750     /**
751      * Backup data class
752      */
753     private static final class Backup
754     {
755         // protected data members -------------------------------------------
756 
757         /**
758          * Backup non FCD sequence limit
759          */
760         protected int m_FCDLimit_;
761         /**
762          * Backup non FCD sequence start
763          */
764         protected int m_FCDStart_;
765         /**
766          * Backup if previous Codepoint is Hiragana quatenary
767          */
768         protected boolean m_isCodePointHiragana_;
769         /**
770          * Backup buffer position
771          */
772         protected int m_bufferOffset_;
773         /**
774          * Backup source iterator offset
775          */
776         protected int m_offset_;
777         /**
778          * Backup buffer contents
779          */
780         protected StringBuffer   m_buffer_;
781 
782         // protected constructor --------------------------------------------
783 
784         /**
785          * Empty constructor
786          */
787         protected Backup()
788         {
789             m_buffer_ = new StringBuffer  ();
790         }
791     }
792     // end inner class ------------------------------------------------------
793 
794     /**
795      * Direction of travel
796      */
797     private boolean m_isForwards_;
798     /**
799      * Source string iterator
800      */
801     private UCharacterIterator m_source_;
802     /**
803      * This is position to the m_buffer_, -1 if iterator is not in m_buffer_
804      */
805     private int m_bufferOffset_;
806     /**
807      * Buffer for temporary storage of normalized characters, discontiguous
808      * characters and Thai characters
809      */
810     private StringBuffer   m_buffer_;
811     /**
812      * Position in the original string to continue forward FCD check from.
813      */
814     private int m_FCDLimit_;
815     /**
816      * The collator this iterator is based on
817      */
818     private RuleBasedCollator m_collator_;
819     /**
820      * true if Hiragana quatenary is on
821      */
822     private boolean m_isHiragana4_;
823     /**
824      * CE buffer
825      */
826     private int m_CEBuffer_[];
827     /**
828      * In reality we should not have to deal with expansion sequences longer
829      * then 16. However this value can be change if a bigger buffer is needed.
830      * Note, if the size is change to too small a number, BIG trouble.
831      * Reasonable small value is around 10, if there's no Arabic or other
832      * funky collations that have long expansion sequence. This is the longest
833      * expansion sequence this can handle without bombing out.
834      */
835     private static final int CE_BUFFER_INIT_SIZE_ = 512;
836     /**
837      * Backup storage for special processing inner cases
838      */
839     private Backup m_utilSpecialBackUp_;
840     /**
841      * Backup storage in special processing entry state
842      */
843     private Backup m_utilSpecialEntryBackUp_;
844     /**
845      * Backup storage in special processing discontiguous state
846      */
847     private Backup m_utilSpecialDiscontiguousBackUp_;
848     /**
849      * Utility
850      */
851     private StringUCharacterIterator m_srcUtilIter_;
852     private StringBuffer   m_utilStringBuffer_;
853     private StringBuffer   m_utilSkippedBuffer_;
854     private CollationElementIterator m_utilColEIter_;
855     /**
856      * One character before the first non-zero combining class character
857      */
858     private static final int FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0xC0;
859     /**
860      * One character before the first character with leading non-zero combining
861      * class
862      */
863     private static final int LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0x300;
864     /**
865      * Mask for the last byte
866      */
867     private static final int LAST_BYTE_MASK_ = 0xFF;
868     /**
869      * Shift value for the second last byte
870      */
871     private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
872 
873     // special ce values and tags -------------------------------------------
874     
875     private static final int CE_EXPANSION_ = 0xF1000000;
876     private static final int CE_CONTRACTION_ = 0xF2000000;
877     /**
878      * Indicates the last ce has been consumed. Compare with NULLORDER.
879      * NULLORDER is returned if error occurs.
880      */
881     private static final int CE_NO_MORE_CES_ = 0x00010101;
882     private static final int CE_NO_MORE_CES_PRIMARY_ = 0x00010000;
883     private static final int CE_NO_MORE_CES_SECONDARY_ = 0x00000100;
884     private static final int CE_NO_MORE_CES_TERTIARY_ = 0x00000001;
885 
886     private static final int CE_NOT_FOUND_TAG_ = 0;
887     /**
888      * Charset processing, not yet implemented
889      */
890     private static final int CE_CHARSET_TAG_ = 4;
891     /**
892      * AC00-D7AF
893      */
894     private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;
895     /**
896      * D800-DBFF
897      */
898     private static final int CE_LEAD_SURROGATE_TAG_ = 7;
899     /**
900      * DC00-DFFF
901      */
902     private static final int CE_TRAIL_SURROGATE_TAG_ = 8;
903     /**
904      * 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
905      */
906     private static final int CE_CJK_IMPLICIT_TAG_ = 9;
907     private static final int CE_IMPLICIT_TAG_ = 10;
908     static final int CE_SPEC_PROC_TAG_ = 11;
909     /**
910      * This is a 3 byte primary with starting secondaries and tertiaries.
911      * It fits in a single 32 bit CE and is used instead of expansion to save
912      * space without affecting the performance (hopefully).
913      */
914     private static final int CE_LONG_PRIMARY_TAG_ = 12;
915                         
916     private static final int CE_CE_TAGS_COUNT = 14;
917     private static final int CE_BYTE_COMMON_ = 0x05;
918 
919     // end special ce values and tags ---------------------------------------
920 
921     private static final int HANGUL_SBASE_ = 0xAC00;
922     private static final int HANGUL_LBASE_ = 0x1100;
923     private static final int HANGUL_VBASE_ = 0x1161;
924     private static final int HANGUL_TBASE_ = 0x11A7;
925     private static final int HANGUL_VCOUNT_ = 21;
926     private static final int HANGUL_TCOUNT_ = 28;
927 
928     // CJK stuff ------------------------------------------------------------
929 
930     private static final int CJK_BASE_ = 0x4E00;
931     private static final int CJK_LIMIT_ = 0x9FFF+1;
932     private static final int CJK_COMPAT_USED_BASE_ = 0xFA0E;
933     private static final int CJK_COMPAT_USED_LIMIT_ = 0xFA2F + 1;
934     private static final int CJK_A_BASE_ = 0x3400;
935     private static final int CJK_A_LIMIT_ = 0x4DBF + 1;
936     private static final int CJK_B_BASE_ = 0x20000;
937     private static final int CJK_B_LIMIT_ = 0x2A6DF + 1;
938     private static final int NON_CJK_OFFSET_ = 0x110000;
939 
940     private static final boolean DEBUG  =  ICUDebug.enabled("collator");
941     
942     // private methods ------------------------------------------------------
943 
944     /**
945      * Reset the iterator internally
946      */
947     private void updateInternalState()
948     {
949         m_isCodePointHiragana_ = false;
950         m_buffer_.setLength(0);
951         m_bufferOffset_ = -1;
952         m_CEBufferOffset_ = 0;
953         m_CEBufferSize_ = 0;
954         m_FCDLimit_ = -1;
955         m_FCDStart_ = m_source_.getLength();
956         m_isHiragana4_ = m_collator_.m_isHiragana4_;
957         m_isForwards_ = true;
958     }
959 
960     /**
961      * Backup the current internal state
962      * @param backup object to store the data
963      */
964     private void backupInternalState(Backup backup)
965     {
966         backup.m_offset_ = m_source_.getIndex();
967         backup.m_FCDLimit_ = m_FCDLimit_;
968         backup.m_FCDStart_ = m_FCDStart_;
969         backup.m_isCodePointHiragana_ = m_isCodePointHiragana_;
970         backup.m_bufferOffset_ = m_bufferOffset_;
971         backup.m_buffer_.setLength(0);
972         if (m_bufferOffset_ >= 0) {
973             // jdk 1.3.1 does not have append(StringBuffer) yet
974             if(ICUDebug.isJDK14OrHigher){
975                 backup.m_buffer_.append(m_buffer_);
976             }else{
977                 backup.m_buffer_.append(m_buffer_.toString());
978             }
979         }
980     }
981 
982     /**
983      * Update the iterator internally with backed-up state
984      * @param backup object that stored the data
985      */
986     private void updateInternalState(Backup backup)
987     {
988         m_source_.setIndex(backup.m_offset_);
989         m_isCodePointHiragana_ = backup.m_isCodePointHiragana_;
990         m_bufferOffset_ = backup.m_bufferOffset_;
991         m_FCDLimit_ = backup.m_FCDLimit_;
992         m_FCDStart_ = backup.m_FCDStart_;
993         m_buffer_.setLength(0);
994         if (m_bufferOffset_ >= 0) {
995             // jdk 1.3.1 does not have append(StringBuffer) yet
996             m_buffer_.append(backup.m_buffer_.toString());
997         }
998     }
999 
1000    /**
1001     * A fast combining class retrieval system.
1002     * @param ch UTF16 character
1003     * @return combining class of ch
1004     */
1005    private int getCombiningClass(int ch)
1006    {
1007        if (ch >= LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ &&
1008            m_collator_.isUnsafe((char)ch) || ch > 0xFFFF) {
1009            return NormalizerImpl.getCombiningClass(ch);
1010        }
1011        return 0;
1012    }
1013
1014    /**
1015     * <p>Incremental normalization, this is an essential optimization.
1016     * Assuming FCD checks has been done, normalize the non-FCD characters into
1017     * the buffer.
1018     * Source offsets points to the current processing character.
1019     * </p>
1020     */
1021    private void normalize()
1022    {
1023        int size = m_FCDLimit_ - m_FCDStart_;
1024        m_buffer_.setLength(0);
1025        m_source_.setIndex(m_FCDStart_);
1026        for (int i = 0; i < size; i ++) {
1027            m_buffer_.append((char)m_source_.next());
1028        }
1029        String   decomp = Normalizer.decompose(m_buffer_.toString(), false);
1030        m_buffer_.setLength(0);
1031        m_buffer_.append(decomp);
1032        m_bufferOffset_ = 0;
1033    }
1034
1035    /**
1036     * <p>Incremental FCD check and normalization. Gets the next base character
1037     * position and determines if the in-between characters needs normalization.
1038     * </p>
1039     * <p>When entering, the state is known to be this:
1040     * <ul>
1041     * <li>We are working on source string, not the buffer.
1042     * <li>The leading combining class from the current character is 0 or the
1043     *     trailing combining class of the previous char was zero.
1044     * </ul>
1045     * Incoming source offsets points to the current processing character.
1046     * Return source offsets points to the current processing character.
1047     * </p>
1048     * @param ch current character
1049     * @param offset current character offset
1050     * @return true if FCDCheck passes, false otherwise
1051     */
1052    private boolean FCDCheck(char ch, int offset)
1053    {
1054        boolean result = true;
1055
1056        // Get the trailing combining class of the current character.
1057        // If it's zero, we are OK.
1058        m_FCDStart_ = offset;
1059        m_source_.setIndex(offset);
1060        // trie access
1061        char fcd = NormalizerImpl.getFCD16(ch);
1062        if (fcd != 0 && UTF16.isLeadSurrogate(ch)) {
1063            m_source_.next();
1064            ch = (char)m_source_.current(); 
1065            // UCharacterIterator.DONE has 0 fcd
1066            if (UTF16.isTrailSurrogate(ch)) {
1067                fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, ch);
1068            } else {
1069                fcd = 0;
1070            }
1071        }
1072
1073        int prevTrailCC = fcd & LAST_BYTE_MASK_;
1074
1075        if (prevTrailCC != 0) {
1076            // The current char has a non-zero trailing CC. Scan forward until
1077            // we find a char with a leading cc of zero.
1078            while (true) {
1079                m_source_.next();
1080                int ch_int = m_source_.current();
1081                if (ch_int == UCharacterIterator.DONE) {
1082                    break;
1083                }
1084                ch = (char)ch_int;
1085                // trie access
1086                fcd = NormalizerImpl.getFCD16(ch);
1087                if (fcd != 0 && UTF16.isLeadSurrogate(ch)) {
1088                    m_source_.next();
1089                    ch = (char)m_source_.current();
1090                    if (UTF16.isTrailSurrogate(ch)) {
1091                        fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, ch);
1092                    } else {
1093                        fcd = 0;
1094                    }
1095                }
1096                int leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
1097                if (leadCC == 0) {
1098                    // this is a base character, we stop the FCD checks
1099                    break;
1100                }
1101
1102                if (leadCC < prevTrailCC) {
1103                    result = false;
1104                }
1105
1106                prevTrailCC = fcd & LAST_BYTE_MASK_;
1107            }
1108        }
1109        m_FCDLimit_ = m_source_.getIndex();
1110        m_source_.setIndex(m_FCDStart_);
1111        m_source_.next();
1112        return result;
1113    }
1114
1115    /**
1116     * <p>Method tries to fetch the next character that is in fcd form.</p>
1117     * <p>Normalization is done if required.</p>
1118     * <p>Offsets are returned at the next character.</p>
1119     * @return next fcd character
1120     */
1121    private int nextChar()
1122    {
1123        int result;
1124
1125        // loop handles the next character whether it is in the buffer or not.
1126        if (m_bufferOffset_ < 0) {
1127            // we're working on the source and not normalizing. fast path.
1128            // note Thai pre-vowel reordering uses buffer too
1129            result = m_source_.current();
1130        }
1131        else {
1132            // we are in the buffer, buffer offset will never be 0 here
1133            if (m_bufferOffset_ >= m_buffer_.length()) {
1134                // Null marked end of buffer, revert to the source string and
1135                // loop back to top to try again to get a character.
1136                m_source_.setIndex(m_FCDLimit_);
1137                m_bufferOffset_ = -1;
1138                m_buffer_.setLength(0);
1139                return nextChar();
1140            }
1141            return m_buffer_.charAt(m_bufferOffset_ ++);
1142        }
1143        int startoffset = m_source_.getIndex();
1144        if (result < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_
1145            // Fast fcd safe path. trail combining class == 0.
1146            || m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
1147            || m_bufferOffset_ >= 0 || m_FCDLimit_ > startoffset) {
1148            // skip the fcd checks
1149            m_source_.next();
1150            return result;
1151        }
1152
1153        if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1154            // We need to peek at the next character in order to tell if we are
1155            // FCD
1156            m_source_.next();
1157            int next = m_source_.current();
1158            if (next == UCharacterIterator.DONE
1159                || next < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1160                return result; // end of source string and if next character
1161                // starts with a base character is always fcd.
1162            }
1163        }
1164
1165        // Need a more complete FCD check and possible normalization.
1166        if (!FCDCheck((char)result, startoffset)) {
1167            normalize();
1168            result = m_buffer_.charAt(0);
1169            m_bufferOffset_ = 1;
1170        }
1171        return result;
1172    }
1173
1174    /**
1175     * <p>Incremental normalization, this is an essential optimization.
1176     * Assuming FCD checks has been done, normalize the non-FCD characters into
1177     * the buffer.
1178     * Source offsets points to the current processing character.</p>
1179     */
1180    private void normalizeBackwards()
1181    {
1182        normalize();
1183        m_bufferOffset_ = m_buffer_.length();
1184    }
1185
1186    /**
1187     * <p>Incremental backwards FCD check and normalization. Gets the previous
1188     * base character position and determines if the in-between characters
1189     * needs normalization.
1190     * </p>
1191     * <p>When entering, the state is known to be this:
1192     * <ul>
1193     * <li>We are working on source string, not the buffer.
1194     * <li>The trailing combining class from the current character is 0 or the
1195     *     leading combining class of the next char was zero.
1196     * </ul>
1197     * Input source offsets points to the previous character.
1198     * Return source offsets points to the current processing character.
1199     * </p>
1200     * @param ch current character
1201     * @param offset current character offset
1202     * @return true if FCDCheck passes, false otherwise
1203     */
1204    private boolean FCDCheckBackwards(char ch, int offset)
1205    {
1206        boolean result = true;
1207        char fcd = 0;
1208        m_FCDLimit_ = offset + 1;
1209        m_source_.setIndex(offset);
1210        if (!UTF16.isSurrogate(ch)) {
1211            fcd = NormalizerImpl.getFCD16(ch);
1212        }
1213        else if (UTF16.isTrailSurrogate(ch) && m_FCDLimit_ > 0) {
1214            // note trail surrogate characters gets 0 fcd
1215            char trailch = ch;
1216            ch = (char)m_source_.previous();
1217            if (UTF16.isLeadSurrogate(ch)) {
1218                fcd = NormalizerImpl.getFCD16(ch);
1219                if (fcd != 0) {
1220                    fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd,
1221                                                                   trailch);
1222                }
1223            }
1224            else {
1225                fcd = 0; // unpaired surrogate
1226            }
1227        }
1228
1229        int leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
1230        // The current char has a non-zero leading combining class.
1231        // Scan backward until we find a char with a trailing cc of zero.
1232
1233        while (leadCC != 0) {
1234            offset = m_source_.getIndex();
1235            if (offset == 0) {
1236                break;
1237            }
1238            ch = (char)m_source_.previous();
1239            if (!UTF16.isSurrogate(ch)) {
1240                fcd = NormalizerImpl.getFCD16(ch);
1241            }
1242            else if (UTF16.isTrailSurrogate(ch) && m_source_.getIndex() > 0) {
1243                char trail = ch;
1244                ch = (char)m_source_.previous();
1245                if (UTF16.isLeadSurrogate(ch)) {
1246                    fcd = NormalizerImpl.getFCD16(ch);
1247                }
1248                if (fcd != 0) {
1249                    fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, trail);
1250                }
1251            }
1252            else {
1253                fcd = 0; // unpaired surrogate
1254            }
1255            int prevTrailCC = fcd & LAST_BYTE_MASK_;
1256            if (leadCC < prevTrailCC) {
1257                result = false;
1258            }
1259            leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
1260        }
1261
1262        // storing character with 0 lead fcd or the 1st accent with a base
1263        // character before it
1264        if (fcd == 0) {
1265            m_FCDStart_ = offset;
1266        }
1267        else {
1268            m_FCDStart_ = m_source_.getIndex();
1269        }
1270        m_source_.setIndex(m_FCDLimit_);
1271        return result;
1272    }
1273
1274    /**
1275     * <p>Method tries to fetch the previous character that is in fcd form.</p>
1276     * <p>Normalization is done if required.</p>
1277     * <p>Offsets are returned at the current character.</p>
1278     * @return previous fcd character
1279     */
1280    private int previousChar()
1281    {
1282        if (m_bufferOffset_ >= 0) {
1283            m_bufferOffset_ --;
1284            if (m_bufferOffset_ >= 0) {
1285                return m_buffer_.charAt(m_bufferOffset_);
1286            }
1287            else {
1288                // At the start of buffer, route back to string.
1289                m_buffer_.setLength(0);
1290                if (m_FCDStart_ == 0) {
1291                    m_FCDStart_ = -1;
1292                    m_source_.setIndex(0);
1293                    return UCharacterIterator.DONE;
1294                }
1295                else {
1296                    m_FCDLimit_ = m_FCDStart_;
1297                    m_source_.setIndex(m_FCDStart_);
1298                    return previousChar();
1299                }
1300            }
1301        }
1302        int result = m_source_.previous();
1303        int startoffset = m_source_.getIndex();
1304        if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_
1305            || m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
1306            || m_FCDStart_ <= startoffset || m_source_.getIndex() == 0) {
1307            return result;
1308        }
1309        int ch = m_source_.previous();
1310        if (ch < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1311            // if previous character is FCD
1312            m_source_.next();
1313            return result;
1314        }
1315        // Need a more complete FCD check and possible normalization.
1316        if (!FCDCheckBackwards((char)result, startoffset)) {
1317            normalizeBackwards();
1318            m_bufferOffset_ --;
1319            result = m_buffer_.charAt(m_bufferOffset_);
1320        }
1321        else {
1322            // fcd checks alway reset m_source_ to the limit of the FCD
1323            m_source_.setIndex(startoffset);
1324        }
1325        return result;
1326    }
1327
1328    /**
1329     * Determines if it is at the start of source iteration
1330     * @return true if iterator at the start, false otherwise
1331     */
1332    private final boolean isBackwardsStart()
1333    {
1334        return (m_bufferOffset_ < 0 && m_source_.getIndex() == 0)
1335            || (m_bufferOffset_ == 0 && m_FCDStart_ <= 0);
1336    }
1337
1338    /**
1339     * Checks if iterator is at the end of its source string.
1340     * @return true if it is at the end, false otherwise
1341     */
1342    private final boolean isEnd()
1343    {
1344        if (m_bufferOffset_ >= 0) {
1345            if (m_bufferOffset_ != m_buffer_.length()) {
1346                return false;
1347            }
1348            else {
1349                // at end of buffer. check if fcd is at the end
1350                return m_FCDLimit_ == m_source_.getLength();
1351            }
1352        }
1353        return m_source_.getLength() == m_source_.getIndex();
1354    }
1355
1356    /**
1357     * <p>Special CE management for surrogates</p>
1358     * <p>Lead surrogate is encountered. CE to be retrieved by using the
1359     * following code unit. If next character is a trail surrogate, both
1360     * characters will be combined to retrieve the CE, otherwise completely
1361     * ignorable (UCA specification) is returned.</p>
1362     * @param collator collator to use
1363     * @param ce current CE
1364     * @param trail character
1365     * @return next CE for the surrogate characters
1366     */
1367    private final int nextSurrogate(RuleBasedCollator collator, int ce,
1368                                    char trail)
1369    {
1370        if (!UTF16.isTrailSurrogate(trail)) {
1371            updateInternalState(m_utilSpecialBackUp_);
1372            return IGNORABLE;
1373        }
1374        // TODO: CE contain the data from the previous CE + the mask.
1375        // It should at least be unmasked
1376        int result = collator.m_trie_.getTrailValue(ce, trail);
1377        if (result == CE_NOT_FOUND_) {
1378            updateInternalState(m_utilSpecialBackUp_);
1379        }
1380        return result;
1381    }
1382
1383    /**
1384     * Gets the CE expansion offset
1385     * @param collator current collator
1386     * @param ce ce to test
1387     * @return expansion offset
1388     */
1389    private int getExpansionOffset(RuleBasedCollator collator, int ce)
1390    {
1391        return ((ce & 0xFFFFF0) >> 4) - collator.m_expansionOffset_;
1392    }
1393
1394
1395    /**
1396     * Gets the contraction ce offset
1397     * @param collator current collator
1398     * @param ce current ce
1399     * @return contraction offset
1400     */
1401    private int getContractionOffset(RuleBasedCollator collator, int ce)
1402    {
1403        return (ce & 0xFFFFFF) - collator.m_contractionOffset_;
1404    }
1405
1406    /**
1407     * Checks if CE is a special tag CE
1408     * @param ce to check
1409     * @return true if CE is a special tag CE, false otherwise
1410     */
1411    private boolean isSpecialPrefixTag(int ce)
1412    {
1413        return RuleBasedCollator.isSpecial(ce) &&
1414            RuleBasedCollator.getTag(ce) == CE_SPEC_PROC_TAG_;
1415    }
1416
1417    /**
1418     * <p>Special processing getting a CE that is preceded by a certain
1419     * prefix.</p>
1420     * <p>Used for optimizing Japanese length and iteration marks. When a
1421     * special processing tag is encountered, iterate backwards to see if
1422     * there's a match.</p>
1423     * <p>Contraction tables are used, prefix data is stored backwards in the
1424     * table.</p>
1425     * @param collator collator to use
1426     * @param ce current ce
1427     * @param entrybackup entry backup iterator status
1428     * @return next collation element
1429     */
1430    private int nextSpecialPrefix(RuleBasedCollator collator, int ce,
1431                                  Backup entrybackup)
1432    {
1433        backupInternalState(m_utilSpecialBackUp_);
1434        updateInternalState(entrybackup);
1435        previousChar();
1436        // We want to look at the character where we entered
1437
1438        while (true) {
1439            // This loop will run once per source string character, for as
1440            // long as we are matching a potential contraction sequence
1441            // First we position ourselves at the begining of contraction
1442            // sequence
1443            int entryoffset = getContractionOffset(collator, ce);
1444            int offset = entryoffset;
1445            if (isBackwardsStart()) {
1446                ce = collator.m_contractionCE_[offset];
1447                break;
1448            }
1449            char previous = (char)previousChar();
1450            while (previous > collator.m_contractionIndex_[offset]) {
1451                // contraction characters are ordered, skip smaller characters
1452                offset ++;
1453            }
1454
1455            if (previous == collator.m_contractionIndex_[offset]) {
1456                // Found the source string char in the table.
1457                // Pick up the corresponding CE from the table.
1458                ce = collator.m_contractionCE_[offset];
1459            }
1460            else {
1461                // Source string char was not in the table, prefix not found
1462                ce = collator.m_contractionCE_[entryoffset];
1463            }
1464
1465            if (!isSpecialPrefixTag(ce)) {
1466                // The source string char was in the contraction table, and
1467                // the corresponding CE is not a prefix CE. We found the
1468                // prefix, break out of loop, this CE will end up being
1469                // returned. This is the normal way out of prefix handling
1470                // when the source actually contained the prefix.
1471                break;
1472            }
1473        }
1474        if (ce != CE_NOT_FOUND_) {
1475            // we found something and we can merilly continue
1476            updateInternalState(m_utilSpecialBackUp_);
1477        }
1478        else { // prefix search was a failure, we have to backup all the way to
1479            // the start
1480            updateInternalState(entrybackup);
1481        }
1482        return ce;
1483    }
1484
1485    /**
1486     * Checks if the ce is a contraction tag
1487     * @param ce ce to check
1488     * @return true if ce is a contraction tag, false otherwise
1489     */
1490    private boolean isContractionTag(int ce)
1491    {
1492        return RuleBasedCollator.isSpecial(ce) &&
1493            RuleBasedCollator.getTag(ce) == CE_CONTRACTION_TAG_;
1494    }
1495
1496    /**
1497     * Method to copy skipped characters into the buffer and sets the fcd
1498     * position. To ensure that the skipped characters are considered later,
1499     * we need to place it in the appropriate position in the buffer and
1500     * reassign the source index. simple case if index reside in string,
1501     * simply copy to buffer and fcdposition = pos, pos = start of buffer.
1502     * if pos in normalization buffer, we'll insert the copy infront of pos
1503     * and point pos to the start of the buffer. why am i doing these copies?
1504     * well, so that the whole chunk of codes in the getNextCE,
1505     * ucol_prv_getSpecialCE does not require any changes, which will be
1506     * really painful.
1507     * @param skipped character buffer
1508     */
1509    private void setDiscontiguous(StringBuffer   skipped)
1510    {
1511        if (m_bufferOffset_ >= 0) {
1512            m_buffer_.replace(0, m_bufferOffset_, skipped.toString());
1513        }
1514        else {
1515            m_FCDLimit_ = m_source_.getIndex();
1516            m_buffer_.setLength(0);
1517            m_buffer_.append(skipped.toString());
1518        }
1519
1520        m_bufferOffset_ = 0;
1521    }
1522
1523    /**
1524     * Returns the current character for forward iteration
1525     * @return current character
1526     */
1527    private int currentChar()
1528    {
1529        if (m_bufferOffset_ < 0) {
1530            m_source_.previous();
1531            return m_source_.next();
1532        }
1533
1534        // m_bufferOffset_ is never 0 in normal circumstances except after a
1535        // discontiguous contraction since it is always returned and moved
1536        // by 1 when we do nextChar()
1537        return m_buffer_.charAt(m_bufferOffset_ - 1);
1538    }
1539
1540    /**
1541     * Method to get the discontiguous collation element within the source.
1542     * Note this function will set the position to the appropriate places.
1543     * Passed in character offset points to the second combining character
1544     * after the start character.
1545     * @param collator current collator used
1546     * @param entryoffset index to the start character in the contraction table
1547     * @return discontiguous collation element offset
1548     */
1549    private int nextDiscontiguous(RuleBasedCollator collator, int entryoffset)
1550    {
1551        int offset = entryoffset;
1552        boolean multicontraction = false;
1553        // since it will be stuffed into this iterator and ran over again
1554        if (m_utilSkippedBuffer_ == null) {
1555            m_utilSkippedBuffer_ = new StringBuffer  ();
1556        }
1557        else {
1558            m_utilSkippedBuffer_.setLength(0);
1559        }
1560        char ch = (char)currentChar();
1561        m_utilSkippedBuffer_.append((char)currentChar());
1562        // accent after the first character
1563        if (m_utilSpecialDiscontiguousBackUp_ == null) {
1564            m_utilSpecialDiscontiguousBackUp_ = new Backup();
1565        }
1566        backupInternalState(m_utilSpecialDiscontiguousBackUp_);
1567        char nextch = ch;
1568        while (true) {
1569            ch = nextch;
1570            int ch_int = nextChar();
1571            nextch = (char)ch_int;
1572            if (ch_int == UCharacterIterator.DONE
1573                || getCombiningClass(nextch) == 0) {
1574                // if there are no more accents to move around
1575                // we don't have to shift previousChar, since we are resetting
1576                // the offset later
1577                if (multicontraction) {
1578                    if (ch_int != UCharacterIterator.DONE) {
1579                        previousChar(); // backtrack
1580                    }
1581                    setDiscontiguous(m_utilSkippedBuffer_);
1582                    return collator.m_contractionCE_[offset];
1583                }
1584                break;
1585            }
1586
1587            offset ++; // skip the combining class offset
1588            while (nextch > collator.m_contractionIndex_[offset]) {
1589                offset ++;
1590            }
1591
1592            int ce = CE_NOT_FOUND_;
1593            if (nextch != collator.m_contractionIndex_[offset]
1594                    || getCombiningClass(nextch) == getCombiningClass(ch)) {
1595                    // unmatched or blocked character
1596                m_utilSkippedBuffer_.append(nextch);
1597                continue;
1598            }
1599            else {
1600                ce = collator.m_contractionCE_[offset];
1601            }
1602
1603            if (ce == CE_NOT_FOUND_) {
1604                break;
1605            }
1606            else if (isContractionTag(ce)) {
1607                // this is a multi-contraction
1608                offset = getContractionOffset(collator, ce);
1609                if (collator.m_contractionCE_[offset] != CE_NOT_FOUND_) {
1610                    multicontraction = true;
1611                    backupInternalState(m_utilSpecialDiscontiguousBackUp_);
1612                }
1613            }
1614            else {
1615                setDiscontiguous(m_utilSkippedBuffer_);
1616                return ce;
1617            }
1618        }
1619
1620        updateInternalState(m_utilSpecialDiscontiguousBackUp_);
1621        // backup is one forward of the base character, we need to move back
1622        // one more
1623        previousChar();
1624        return collator.m_contractionCE_[entryoffset];
1625    }
1626
1627    /**
1628     * Gets the next contraction ce
1629     * @param collator collator to use
1630     * @param ce current ce
1631     * @param entrybackup entry backup iterator status
1632     * @return ce of the next contraction
1633     */
1634    private int nextContraction(RuleBasedCollator collator, int ce)
1635    {
1636        backupInternalState(m_utilSpecialBackUp_);
1637        int entryce = collator.m_contractionCE_[getContractionOffset(collator, ce)]; //CE_NOT_FOUND_;
1638        while (true) {
1639            int entryoffset = getContractionOffset(collator, ce);
1640            int offset = entryoffset;
1641
1642            if (isEnd()) {
1643                ce = collator.m_contractionCE_[offset];
1644                if (ce == CE_NOT_FOUND_) {
1645                    // back up the source over all the chars we scanned going
1646                    // into this contraction.
1647                    ce = entryce;
1648                    updateInternalState(m_utilSpecialBackUp_);
1649                }
1650                break;
1651            }
1652
1653            // get the discontiguos maximum combining class
1654            int maxCC = (collator.m_contractionIndex_[offset] & 0xFF);
1655            // checks if all characters have the same combining class
1656            byte allSame = (byte)(collator.m_contractionIndex_[offset] >> 8);
1657            char ch = (char)nextChar();
1658            offset ++;
1659            while (ch > collator.m_contractionIndex_[offset]) {
1660                // contraction characters are ordered, skip all smaller
1661                offset ++;
1662            }
1663
1664            if (ch == collator.m_contractionIndex_[offset]) {
1665                // Found the source string char in the contraction table.
1666                //  Pick up the corresponding CE from the table.
1667                ce = collator.m_contractionCE_[offset];
1668            }
1669            else {
1670                // Source string char was not in contraction table.
1671                // Unless it is a discontiguous contraction, we are done
1672                int miss = ch;
1673                if(UTF16.isLeadSurrogate(ch)) { // in order to do the proper detection, we
1674                    // need to see if we're dealing with a supplementary
1675                    miss = UCharacterProperty.getRawSupplementary(ch, (char) nextChar());
1676                  }
1677                int sCC;
1678                if (maxCC == 0 || (sCC = getCombiningClass(miss)) == 0
1679                    || sCC > maxCC || (allSame != 0 && sCC == maxCC) ||
1680                    isEnd()) {
1681                    // Contraction can not be discontiguous, back up by one
1682                    previousChar();
1683                    if(miss > 0xFFFF) {
1684                        previousChar();
1685                    }
1686                    ce = collator.m_contractionCE_[entryoffset];
1687                }
1688                else {
1689                    // Contraction is possibly discontiguous.
1690                    // find the next character if ch is not a base character
1691                    int ch_int = nextChar();
1692                    if (ch_int != UCharacterIterator.DONE) {
1693                        previousChar();
1694                    }
1695                    char nextch = (char)ch_int;
1696                    if (getCombiningClass(nextch) == 0) {
1697                        previousChar();
1698                        if(miss > 0xFFFF) {
1699                            previousChar();
1700                        }    
1701                        // base character not part of discontiguous contraction
1702                        ce = collator.m_contractionCE_[entryoffset];
1703                    }
1704                    else {
1705                        ce = nextDiscontiguous(collator, entryoffset);
1706                    }
1707                }
1708            }
1709
1710            if (ce == CE_NOT_FOUND_) {
1711                // source did not match the contraction, revert back original
1712                updateInternalState(m_utilSpecialBackUp_);
1713                ce = entryce;
1714                break;
1715            }
1716
1717            // source was a contraction
1718            if (!isContractionTag(ce)) {
1719                break;
1720            }
1721
1722            // ccontinue looping to check for the remaining contraction.
1723            if (collator.m_contractionCE_[entryoffset] != CE_NOT_FOUND_) {
1724                // there are further contractions to be performed, so we store
1725                // the so-far completed ce, so that if we fail in the next
1726                // round we just return this one.
1727                entryce = collator.m_contractionCE_[entryoffset];
1728                backupInternalState(m_utilSpecialBackUp_);
1729                if (m_utilSpecialBackUp_.m_bufferOffset_ >= 0) {
1730                    m_utilSpecialBackUp_.m_bufferOffset_ --;
1731                }
1732                else {
1733                    m_utilSpecialBackUp_.m_offset_ --;
1734                }
1735            }
1736        }
1737        return ce;
1738    }
1739
1740    /**
1741     * Gets the next ce for long primaries, stuffs the rest of the collation
1742     * elements into the ce buffer
1743     * @param ce current ce
1744     * @return next ce
1745     */
1746    private int nextLongPrimary(int ce)
1747    {
1748        m_CEBuffer_[1] = ((ce & 0xFF) << 24)
1749            | RuleBasedCollator.CE_CONTINUATION_MARKER_;
1750        m_CEBufferOffset_ = 1;
1751        m_CEBufferSize_ = 2;
1752        m_CEBuffer_[0] = ((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8) |
1753            CE_BYTE_COMMON_;
1754        return m_CEBuffer_[0];
1755    }
1756
1757    /**
1758     * Gets the number of expansion
1759     * @param ce current ce
1760     * @return number of expansion
1761     */
1762    private int getExpansionCount(int ce)
1763    {
1764        return ce & 0xF;
1765    }
1766
1767    /**
1768     * Gets the next expansion ce and stuffs the rest of the collation elements
1769     * into the ce buffer
1770     * @param collator current collator
1771     * @param ce current ce
1772     * @return next expansion ce
1773     */
1774    private int nextExpansion(RuleBasedCollator collator, int ce)
1775    {
1776        // NOTE: we can encounter both continuations and expansions in an
1777        // expansion!
1778        // I have to decide where continuations are going to be dealt with
1779        int offset = getExpansionOffset(collator, ce);
1780        m_CEBufferSize_ = getExpansionCount(ce);
1781        m_CEBufferOffset_ = 1;
1782        m_CEBuffer_[0] = collator.m_expansion_[offset];
1783        if (m_CEBufferSize_ != 0) {
1784            // if there are less than 16 elements in expansion
1785            for (int i = 1; i < m_CEBufferSize_; i ++) {
1786                m_CEBuffer_[i] = collator.m_expansion_[offset + i];
1787            }
1788        }
1789        else {
1790            // ce are terminated
1791            m_CEBufferSize_ = 1;
1792            while (collator.m_expansion_[offset] != 0) {
1793                m_CEBuffer_[m_CEBufferSize_ ++] =
1794                    collator.m_expansion_[++ offset];
1795            }
1796        }
1797        // in case of one element expansion, we 
1798        // want to immediately return CEpos
1799        if (m_CEBufferSize_ == 1) {
1800            m_CEBufferSize_ = 0;
1801            m_CEBufferOffset_ = 0;
1802        }
1803        return m_CEBuffer_[0];
1804    }
1805    
1806    /**
1807     * Gets the next digit ce
1808     * @param collator current collator
1809     * @param ce current collation element
1810     * @param cp current codepoint
1811     * @return next digit ce
1812     */
1813    private int nextDigit(RuleBasedCollator collator, int ce, int cp)
1814    {
1815        // We do a check to see if we want to collate digits as numbers; 
1816        // if so we generate a custom collation key. Otherwise we pull out 
1817        // the value stored in the expansion table.
1818
1819        if (m_collator_.m_isNumericCollation_){
1820            int collateVal = 0;
1821            int trailingZeroIndex = 0;
1822            boolean nonZeroValReached = false;
1823
1824            // I just need a temporary place to store my generated CEs.
1825            // icu4c uses a unsigned byte array, i'll use a stringbuffer here
1826            // to avoid dealing with the sign problems and array allocation
1827            // clear and set initial string buffer length
1828            m_utilStringBuffer_.setLength(3);
1829        
1830            // We parse the source string until we hit a char that's NOT a 
1831            // digit.
1832            // Use this u_charDigitValue. This might be slow because we have 
1833            // to handle surrogates...
1834            int digVal = UCharacter.digit(cp); 
1835            // if we have arrived here, we have already processed possible 
1836            // supplementaries that trigered the digit tag -
1837            // all supplementaries are marked in the UCA.
1838            // We  pad a zero in front of the first element anyways. 
1839            // This takes care of the (probably) most common case where 
1840            // people are sorting things followed by a single digit
1841            int digIndx = 1;
1842            for (;;) {
1843                // Make sure we have enough space.
1844                if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
1845                    m_utilStringBuffer_.setLength(m_utilStringBuffer_.length() 
1846                                                  << 1);
1847                }
1848                // Skipping over leading zeroes.        
1849                if (digVal != 0 || nonZeroValReached) {
1850                    if (digVal != 0 && !nonZeroValReached) {
1851                        nonZeroValReached = true;
1852                    }    
1853                    // We parse the digit string into base 100 numbers 
1854                    // (this fits into a byte).
1855                    // We only add to the buffer in twos, thus if we are 
1856                    // parsing an odd character, that serves as the 
1857                    // 'tens' digit while the if we are parsing an even 
1858                    // one, that is the 'ones' digit. We dumped the 
1859                    // parsed base 100 value (collateVal) into a buffer. 
1860                    // We multiply each collateVal by 2 (to give us room) 
1861                    // and add 5 (to avoid overlapping magic CE byte 
1862                    // values). The last byte we subtract 1 to ensure it is 
1863                    // less than all the other bytes.
1864                    if (digIndx % 2 == 1) {
1865                        collateVal += digVal;  
1866                        // This removes trailing zeroes.
1867                        if (collateVal == 0 && trailingZeroIndex == 0) {
1868                            trailingZeroIndex = ((digIndx - 1) >>> 1) + 2;
1869                        }
1870                        else if (trailingZeroIndex != 0) {
1871                            trailingZeroIndex = 0;
1872                        }
1873                        m_utilStringBuffer_.setCharAt(
1874                                            ((digIndx - 1) >>> 1) + 2,
1875                                            (char)((collateVal << 1) + 6));
1876                        collateVal = 0;
1877                    }
1878                    else {
1879                        // We drop the collation value into the buffer so if 
1880                        // we need to do a "front patch" we don't have to 
1881                        // check to see if we're hitting the last element.
1882                        collateVal = digVal * 10;
1883                        m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2, 
1884                                                (char)((collateVal << 1) + 6));
1885                    }
1886                    digIndx ++;
1887                }
1888            
1889                // Get next character.
1890                if (!isEnd()){
1891                    backupInternalState(m_utilSpecialBackUp_);
1892                    int char32 = nextChar();
1893                    char ch = (char)char32;
1894                    if (UTF16.isLeadSurrogate(ch)){
1895                        if (!isEnd()) {
1896                            char trail = (char)nextChar();
1897                            if (UTF16.isTrailSurrogate(trail)) {
1898                               char32 = UCharacterProperty.getRawSupplementary(
1899                                                                   ch, trail);
1900                            } 
1901                            else {
1902                                goBackOne();
1903                            }
1904                        }
1905                    }
1906                    
1907                    digVal = UCharacter.digit(char32);
1908                    if (digVal == -1) {
1909                        // Resetting position to point to the next unprocessed 
1910                        // char. We overshot it when doing our test/set for 
1911                        // numbers.
1912                        updateInternalState(m_utilSpecialBackUp_);
1913                        break;
1914                    }
1915                } 
1916                else {
1917                    break;
1918                }
1919            }
1920        
1921            if (nonZeroValReached == false){
1922                digIndx = 2;
1923                m_utilStringBuffer_.setCharAt(2, (char)6);
1924            }
1925        
1926            int endIndex = trailingZeroIndex != 0 ? trailingZeroIndex 
1927                                             : (digIndx >>> 1) + 2;              
1928            if (digIndx % 2 != 0){
1929                // We missed a value. Since digIndx isn't even, stuck too many 
1930                // values into the buffer (this is what we get for padding the 
1931                // first byte with a zero). "Front-patch" now by pushing all 
1932                // nybbles forward.
1933                // Doing it this way ensures that at least 50% of the time 
1934                // (statistically speaking) we'll only be doing a single pass 
1935                // and optimizes for strings with single digits. I'm just 
1936                // assuming that's the more common case.
1937                for (int i = 2; i < endIndex; i ++){
1938                    m_utilStringBuffer_.setCharAt(i, 
1939                        (char)((((((m_utilStringBuffer_.charAt(i) - 6) >>> 1) 
1940                                  % 10) * 10) 
1941                                 + (((m_utilStringBuffer_.charAt(i + 1) - 6) 
1942                                      >>> 1) / 10) << 1) + 6));
1943                }
1944                -- digIndx;
1945            }
1946        
1947            // Subtract one off of the last byte. 
1948            m_utilStringBuffer_.setCharAt(endIndex - 1, 
1949                         (char)(m_utilStringBuffer_.charAt(endIndex - 1) - 1));            
1950                
1951            // We want to skip over the first two slots in the buffer. 
1952            // The first slot is reserved for the header byte CODAN_PLACEHOLDER. 
1953            // The second slot is for the sign/exponent byte: 
1954            // 0x80 + (decimalPos/2) & 7f.
1955            m_utilStringBuffer_.setCharAt(0, (char)RuleBasedCollator.CODAN_PLACEHOLDER);
1956            m_utilStringBuffer_.setCharAt(1, 
1957                                     (char)(0x80 + ((digIndx >>> 1) & 0x7F)));
1958        
1959            // Now transfer the collation key to our collIterate struct.
1960            // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
1961            ce = (((m_utilStringBuffer_.charAt(0) << 8)
1962                       // Primary weight 
1963                       | m_utilStringBuffer_.charAt(1)) 
1964                                    << RuleBasedCollator.CE_PRIMARY_SHIFT_)
1965                       //  Secondary weight 
1966                       | (RuleBasedCollator.BYTE_COMMON_ 
1967                          << RuleBasedCollator.CE_SECONDARY_SHIFT_) 
1968                       | RuleBasedCollator.BYTE_COMMON_; // Tertiary weight.
1969            int i = 2; // Reset the index into the buffer.
1970            
1971            m_CEBuffer_[0] = ce;
1972            m_CEBufferSize_ = 1;
1973            m_CEBufferOffset_ = 1;
1974            while (i < endIndex)
1975            {
1976                int primWeight = m_utilStringBuffer_.charAt(i ++) << 8;
1977                if (i < endIndex) {
1978                    primWeight |= m_utilStringBuffer_.charAt(i ++);
1979                }
1980                m_CEBuffer_[m_CEBufferSize_ ++] 
1981                    = (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_) 
1982                      | RuleBasedCollator.CE_CONTINUATION_MARKER_;
1983            }
1984            return ce;
1985        } 
1986        
1987        // no numeric mode, we'll just switch to whatever we stashed and 
1988        // continue
1989        // find the offset to expansion table
1990        return collator.m_expansion_[getExpansionOffset(collator, ce)];
1991    }
1992
1993    /**
1994     * Gets the next implicit ce for codepoints
1995     * @param codepoint current codepoint
1996     * @return implicit ce
1997     */
1998    private int nextImplicit(int codepoint)
1999    {
2000        if (!UCharacter.isLegal(codepoint)) {
2001            // synwee to check with vladimir on the range of isNonChar()
2002            // illegal code value, use completely ignoreable!
2003            return IGNORABLE;
2004        }
2005        int result = RuleBasedCollator.impCEGen_.getImplicitFromCodePoint(codepoint);
2006        m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_)
2007                         | 0x00000505;
2008        m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0;
2009        m_CEBufferOffset_ = 1;
2010        m_CEBufferSize_ = 2;
2011        return m_CEBuffer_[0];
2012    }
2013
2014    /**
2015     * Returns the next ce associated with the following surrogate characters
2016     * @param ch current character
2017     * @return ce
2018     */
2019    private int nextSurrogate(char ch)
2020    {
2021        int ch_int = nextChar();
2022        char nextch = (char)ch_int;
2023        if (ch_int != CharacterIterator.DONE &&
2024            UTF16.isTrailSurrogate(nextch)) {
2025            int codepoint = UCharacterProperty.getRawSupplementary(ch, nextch);
2026            return nextImplicit(codepoint);
2027        }
2028        if (nextch != CharacterIterator.DONE) {
2029            previousChar(); // reverts back to the original position
2030        }
2031        return IGNORABLE; // completely ignorable
2032    }
2033
2034    /**
2035     * Returns the next ce for a hangul character, this is an implicit
2036     * calculation
2037     * @param collator current collator
2038     * @param ch current character
2039     * @return hangul ce
2040     */
2041    private int nextHangul(RuleBasedCollator collator, char ch)
2042    {
2043        char L = (char)(ch - HANGUL_SBASE_);
2044
2045        // divide into pieces
2046        // do it in this order since some compilers can do % and / in one
2047        // operation
2048        char T = (char)(L % HANGUL_TCOUNT_);
2049        L /= HANGUL_TCOUNT_;
2050        char V = (char)(L % HANGUL_VCOUNT_);
2051        L /= HANGUL_VCOUNT_;
2052
2053        // offset them
2054        L += HANGUL_LBASE_;
2055        V += HANGUL_VBASE_;
2056        T += HANGUL_TBASE_;
2057
2058        // return the first CE, but first put the rest into the expansion
2059        // buffer
2060        m_CEBufferSize_ = 0;
2061        if (!collator.m_isJamoSpecial_) { // FAST PATH
2062            m_CEBuffer_[m_CEBufferSize_ ++] =
2063                collator.m_trie_.getLeadValue(L);
2064            m_CEBuffer_[m_CEBufferSize_ ++] =
2065                collator.m_trie_.getLeadValue(V);
2066
2067            if (T != HANGUL_TBASE_) {
2068                m_CEBuffer_[m_CEBufferSize_ ++] =
2069                    collator.m_trie_.getLeadValue(T);
2070            }
2071            m_CEBufferOffset_ = 1;
2072            return m_CEBuffer_[0];
2073        }
2074        else {
2075            // Jamo is Special
2076            // Since Hanguls pass the FCD check, it is guaranteed that we
2077            // won't be in the normalization buffer if something like this
2078            // happens
2079            // Move Jamos into normalization buffer
2080            m_buffer_.append((char)L);
2081            m_buffer_.append((char)V);
2082            if (T != HANGUL_TBASE_) {
2083                m_buffer_.append((char)T);
2084            }
2085            m_FCDLimit_ = m_source_.getIndex();
2086            m_FCDStart_ = m_FCDLimit_ - 1;
2087            // Indicate where to continue in main input string after
2088            // exhausting the buffer
2089            return IGNORABLE;
2090        }
2091    }
2092
2093    /**
2094     * <p>Special CE management. Expansions, contractions etc...</p>
2095     * @param collator can be plain UCA
2096     * @param ce current ce
2097     * @param ch current character
2098     * @return next special ce
2099     */
2100    private int nextSpecial(RuleBasedCollator collator, int ce, char ch)
2101    {
2102        int codepoint = ch;
2103        Backup entrybackup = m_utilSpecialEntryBackUp_;
2104        // this is to handle recursive looping
2105        if (entrybackup != null) {
2106            m_utilSpecialEntryBackUp_ = null;
2107        }
2108        else {
2109            entrybackup = new Backup();
2110        }
2111        backupInternalState(entrybackup);
2112        try { // forces it to assign m_utilSpecialEntryBackup_
2113            while (true) {
2114                // This loop will repeat only in the case of contractions,
2115                // surrogate
2116                switch(RuleBasedCollator.getTag(ce)) {
2117                case CE_NOT_FOUND_TAG_:
2118                    // impossible case for icu4j
2119                    return ce;
2120                case RuleBasedCollator.CE_SURROGATE_TAG_:
2121                    if (isEnd()) {
2122                        return IGNORABLE;
2123                    }
2124                    backupInternalState(m_utilSpecialBackUp_);
2125                    char trail = (char)nextChar();
2126                    ce = nextSurrogate(collator, ce, trail);
2127                    // calculate the supplementary code point value,
2128                    // if surrogate was not tailored we go one more round
2129                    codepoint =
2130                        UCharacterProperty.getRawSupplementary(ch, trail);
2131                    break;
2132                case CE_SPEC_PROC_TAG_:
2133                    ce = nextSpecialPrefix(collator, ce, entrybackup);
2134                    break;
2135                case CE_CONTRACTION_TAG_:
2136                    ce = nextContraction(collator, ce);
2137                    break;
2138                case CE_LONG_PRIMARY_TAG_:
2139                    return nextLongPrimary(ce);
2140                case CE_EXPANSION_TAG_:
2141                    return nextExpansion(collator, ce);
2142                case CE_DIGIT_TAG_:
2143                    ce = nextDigit(collator, ce, codepoint);
2144                    break;
2145                    // various implicits optimization
2146                case CE_CJK_IMPLICIT_TAG_:
2147                    // 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
2148                    return nextImplicit(codepoint);
2149                case CE_IMPLICIT_TAG_: // everything that is not defined
2150                    return nextImplicit(codepoint);
2151                case CE_TRAIL_SURROGATE_TAG_:
2152                    return IGNORABLE; // DC00-DFFF broken surrogate
2153                case CE_LEAD_SURROGATE_TAG_:  // D800-DBFF
2154                    return nextSurrogate(ch);
2155                case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
2156                    return nextHangul(collator, ch);
2157                case CE_CHARSET_TAG_:
2158                                    // not yet implemented probably after 1.8
2159                    return CE_NOT_FOUND_;
2160                default:
2161                    ce = IGNORABLE;
2162                    // synwee todo, throw exception or something here.
2163                }
2164                if (!RuleBasedCollator.isSpecial(ce)) {
2165                    break;
2166                }
2167            }
2168        } 
2169        finally {
2170            m_utilSpecialEntryBackUp_ = entrybackup;
2171        }
2172        return ce;
2173    }
2174
2175    /**
2176     * Special processing is getting a CE that is preceded by a certain prefix.
2177     * Currently this is only needed for optimizing Japanese length and
2178     * iteration marks. When we encouter a special processing tag, we go
2179     * backwards and try to see if we have a match. Contraction tables are used
2180     * - so the whole process is not unlike contraction. prefix data is stored
2181     * backwards in the table.
2182     * @param collator current collator
2183     * @param ce current ce
2184     * @return previous ce
2185     */
2186    private int previousSpecialPrefix(RuleBasedCollator collator, int ce)
2187    {
2188        backupInternalState(m_utilSpecialBackUp_);
2189        while (true) {
2190            // position ourselves at the begining of contraction sequence
2191            int offset = getContractionOffset(collator, ce);
2192            int entryoffset = offset;
2193            if (isBackwardsStart()) {
2194                ce = collator.m_contractionCE_[offset];
2195                break;
2196            }
2197            char prevch = (char)previousChar();
2198            while (prevch > collator.m_contractionIndex_[offset]) {
2199                // since contraction codepoints are ordered, we skip all that
2200                // are smaller
2201                offset ++;
2202            }
2203            if (prevch == collator.m_contractionIndex_[offset]) {
2204                ce = collator.m_contractionCE_[offset];
2205            }
2206            else {
2207                // if there is a completely ignorable code point in the middle
2208                // of a prefix, we need to act as if it's not there assumption:
2209                // 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to
2210                // zero)
2211                // lone surrogates cannot be set to zero as it would break
2212                // other processing
2213                int isZeroCE = collator.m_trie_.getLeadValue(prevch);
2214                // it's easy for BMP code points
2215                if (isZeroCE == 0) {
2216                    continue;
2217                }
2218                else if (UTF16.isTrailSurrogate(prevch)
2219                         || UTF16.isLeadSurrogate(prevch)) {
2220                    // for supplementary code points, we have to check the next one
2221                    // situations where we are going to ignore
2222                    // 1. beginning of the string: schar is a lone surrogate
2223                    // 2. schar is a lone surrogate
2224                    // 3. schar is a trail surrogate in a valid surrogate
2225                    //    sequence that is explicitly set to zero.
2226                    if (!isBackwardsStart()) {
2227                        char lead = (char)previousChar();
2228                        if (UTF16.isLeadSurrogate(lead)) {
2229                            isZeroCE = collator.m_trie_.getLeadValue(lead);
2230                            if (RuleBasedCollator.getTag(isZeroCE)
2231                                == RuleBasedCollator.CE_SURROGATE_TAG_) {
2232                                int finalCE = collator.m_trie_.getTrailValue(
2233                                                                      isZeroCE,
2234                                                                      prevch);
2235                                if (finalCE == 0) {
2236                                    // this is a real, assigned completely
2237                                    // ignorable code point
2238                                    continue;
2239                                }
2240                            }
2241                        }
2242                        else {
2243                            nextChar(); // revert to original offset
2244                            // lone surrogate, completely ignorable
2245                            continue;
2246                        }
2247                        nextChar(); // revert to original offset
2248                    }
2249                    else {
2250                         // lone surrogate at the beggining, completely ignorable
2251                         continue;
2252                    }
2253                }
2254
2255                // char was not in the table. prefix not found
2256                ce = collator.m_contractionCE_[entryoffset];
2257            }
2258
2259            if (!isSpecialPrefixTag(ce)) {
2260                // char was in the contraction table, and the corresponding ce
2261                // is not a prefix ce.  We found the prefix, break out of loop,
2262                // this ce will end up being returned.
2263                break;
2264            }
2265        }
2266        updateInternalState(m_utilSpecialBackUp_);
2267        return ce;
2268    }
2269
2270    /**
2271     * Retrieves the previous contraction ce. To ensure that the backwards and
2272     * forwards iteration match, the text is walked backwards for as long as
2273     * the current character is reported unsafe by the collator, the segment
2274     * collected this way is passed through a forward iteration, and the
2275     * resulting ces are stored in m_CEBuffer_. This ensures that the
2276     * obstinate problem of overlapping contractions will not occur.
2277     * <p>
2278     * While the segment is processed the collator is switched to
2279     * Collator.NO_DECOMPOSITION. The original decomposition mode is restored
2280     * in a finally block, so the collator is not left modified if the forward
2281     * iteration fails.
2282     * @param collator current collator
2283     * @param ce current ce; the incoming value is not read, the parameter is
2284     *           only reused as a local variable
2285     * @param ch current character
2286     * @return last ce of the segment, which is the previous contraction ce
2287     */
2288    private int previousContraction(RuleBasedCollator collator, int ce, char ch)
2289    {
2290        m_utilStringBuffer_.setLength(0);
2291        // since we might encounter normalized characters (from the thai
2292        // processing) we can't use peekCharacter() here.
2293        char prevch = (char)previousChar();
2294        boolean atStart = false;
2295        // TODO: address the comment above - maybe now we *can* use peekCharacter
2296        //while (collator.isUnsafe(ch) || isThaiPreVowel(prevch)) {
2297        while (collator.isUnsafe(ch)) {
2298            m_utilStringBuffer_.insert(0, ch);
2299            ch = prevch;
2300            if (isBackwardsStart()) {
2301                atStart = true;
2302                break;
2303            }
2304            prevch = (char)previousChar();
2305        }
2306        if (!atStart) {
2307            // undo the previousChar() if we didn't reach the beginning
2308            nextChar();
2309        }
2310        // adds the initial base character to the string
2311        m_utilStringBuffer_.insert(0, ch);
2312
2313        // a separate collation element iterator is used to simplify things,
2314        // since reusing this iterator would make the forward and backwards
2315        // iteration share and change the same buffers.
2316        int originaldecomp = collator.getDecomposition();
2317        // for faster access, since string would have been normalized above
2318        collator.setDecomposition(Collator.NO_DECOMPOSITION);
2319        try {
2320            if (m_utilColEIter_ == null) {
2321                m_utilColEIter_ = new CollationElementIterator(
2322                                                m_utilStringBuffer_.toString(),
2323                                                collator);
2324            }
2325            else {
2326                m_utilColEIter_.m_collator_ = collator;
2327                m_utilColEIter_.setText(m_utilStringBuffer_.toString());
2328            }
2329            ce = m_utilColEIter_.next();
2330            m_CEBufferSize_ = 0;
2331            while (ce != NULLORDER) {
2332                if (m_CEBufferSize_ == m_CEBuffer_.length) {
2333                    // buffer is full, grow it by 50 slots
2334                    int tempbuffer[] = new int[m_CEBuffer_.length + 50];
2335                    System.arraycopy(m_CEBuffer_, 0, tempbuffer, 0,
2336                                     m_CEBuffer_.length);
2337                    m_CEBuffer_ = tempbuffer;
2338                }
2339                m_CEBuffer_[m_CEBufferSize_ ++] = ce;
2340                ce = m_utilColEIter_.next();
2341            }
2342        }
2343        finally {
2344            // always hand the collator back in its original mode
2345            collator.setDecomposition(originaldecomp);
2346        }
2347        m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2348        return m_CEBuffer_[m_CEBufferOffset_];
2349    }
2350
2351    /**
2352     * Splits a long primary ce into a first ce and a continuation ce.
2353     * @param ce long primary ce
2354     * @return the continuation ce, stored last in m_CEBuffer_
2355     */
2356    private int previousLongPrimary(int ce)
2357    {
2358        m_CEBuffer_[0] = ((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8)
2359                         | CE_BYTE_COMMON_;
2360        m_CEBuffer_[1] = ((ce & 0xFF) << 24)
2361                         | RuleBasedCollator.CE_CONTINUATION_MARKER_;
2362        m_CEBufferSize_ = 2;
2363        m_CEBufferOffset_ = 1;
2364        return m_CEBuffer_[1];
2365    }
2366
2367    /**
2368     * Returns the previous expansion ces
2369     * @param collator current collator
2370     * @param ce current ce
2371     * @return previous expansion ce
2372     */
2373    private int previousExpansion(RuleBasedCollator collator, int ce)
2374    {
2375        // find the offset to expansion table
2376        int offset = getExpansionOffset(collator, ce);
2377        m_CEBufferSize_ = getExpansionCount(ce);
2378        if (m_CEBufferSize_ != 0) {
2379            // less than 16 elements in expansion
2380            for (int i = 0; i < m_CEBufferSize_; i ++) {
2381                m_CEBuffer_[i] = collator.m_expansion_[offset + i];
2382            }
2383
2384        }
2385        else {
2386            // null terminated ces
2387            while (collator.m_expansion_[offset + m_CEBufferSize_] != 0) {
2388                m_CEBuffer_[m_CEBufferSize_] =
2389                    collator.m_expansion_[offset + m_CEBufferSize_];
2390                m_CEBufferSize_ ++;
2391            }
2392        }
2393        m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2394        return m_CEBuffer_[m_CEBufferOffset_];
2395    }
2396    
2397    /**
2398     * Gets the digit collation elements when iterating backwards.
2399     * If numeric collation is switched on, the run of digits that ends at
2400     * ch is parsed backwards and the generated ces are stored in
2401     * m_CEBuffer_. Otherwise the ce is read from the expansion table.
2402     * @param collator current collator
2403     * @param ce current collation element
2404     * @param ch current code point
2405     * @return digit collation element
2406     */
2407    private int previousDigit(RuleBasedCollator collator, int ce, char ch)
2408    {
2409        // We do a check to see if we want to collate digits as numbers; if so we generate
2410        //  a custom collation key. Otherwise we pull out the value stored in the expansion table.
2411        if (m_collator_.m_isNumericCollation_){
2412            int leadingZeroIndex = 0;
2413            int collateVal = 0;
2414            boolean nonZeroValReached = false;
2415
2416            // clear and set initial string buffer length
2417            m_utilStringBuffer_.setLength(3);
2418
2419            // We parse the source string until we hit a char that's NOT a digit
2420            // Use this u_charDigitValue. This might be slow because we have to
2421            // handle surrogates...
2422            int char32 = ch;
2423            if (UTF16.isTrailSurrogate(ch)) {
2424                if (!isBackwardsStart()){
2425                    char lead = (char)previousChar();
2426                    if (UTF16.isLeadSurrogate(lead)) {
2427                        char32 = UCharacterProperty.getRawSupplementary(lead,
2428                                                                        ch);
2429                    }
2430                    else {
2431                        goForwardOne();
2432                    }
2433                }
2434            }
2435            int digVal = UCharacter.digit(char32);
2436            int digIndx = 0;
2437            for (;;) {
2438                // Make sure we have enough space.
2439                if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
2440                    m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
2441                                                  << 1);
2442                }
2443                // Skipping over "trailing" zeroes but we still add to digIndx.
2444                if (digVal != 0 || nonZeroValReached) {
2445                    if (digVal != 0 && !nonZeroValReached) {
2446                        nonZeroValReached = true;
2447                    }
2448                    // We parse the digit string into base 100 numbers (this
2449                    // fits into a byte).
2450                    // We only add to the buffer in twos, thus if we are
2451                    // parsing an odd character, that serves as the 'tens'
2452                    // digit while if we are parsing an even one, that is
2453                    // the 'ones' digit. We dumped the parsed base 100 value
2454                    // (collateVal) into a buffer. We multiply each collateVal
2455                    // by 2 (to give us room) and add 5 (to avoid overlapping
2456                    // magic CE byte values). The last byte we subtract 1 to
2457                    // ensure it is less than all the other bytes.
2458                    // Since we're doing in this reverse we want to put the
2459                    // first digit encountered into the ones place and the
2460                    // second digit encountered into the tens place.
2461                    if (digIndx % 2 == 1){
2462                        collateVal += digVal * 10;
2463                        // This removes leading zeroes.
2464                        if (collateVal == 0 && leadingZeroIndex == 0) {
2465                            leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
2466                        }
2467                        else if (leadingZeroIndex != 0) {
2468                            leadingZeroIndex = 0;
2469                        }
2470                        m_utilStringBuffer_.setCharAt(((digIndx - 1) >>> 1) + 2,
2471                                                (char)((collateVal << 1) + 6));
2472                        collateVal = 0;
2473                    }
2474                    else {
2475                        collateVal = digVal;
2476                    }
2477                }
2478                digIndx ++;
2479
2480                if (!isBackwardsStart()){
2481                    backupInternalState(m_utilSpecialBackUp_);
2482                    char32 = previousChar();
2483                    // keep ch in step with char32 for the surrogate check below
2484                    ch = (char)char32;
2485                    if (UTF16.isTrailSurrogate(ch)){
2486                        if (!isBackwardsStart()) {
2487                            char lead = (char)previousChar();
2488                            if (UTF16.isLeadSurrogate(lead)) {
2489                                char32
2490                                    = UCharacterProperty.getRawSupplementary(
2491                                                                    lead, ch);
2492                            }
2493                            else {
2494                                updateInternalState(m_utilSpecialBackUp_);
2495                            }
2496                        }
2497                    }
2498
2499                    digVal = UCharacter.digit(char32);
2500                    if (digVal == -1) {
2501                        updateInternalState(m_utilSpecialBackUp_);
2502                        break;
2503                    }
2504                }
2505                else {
2506                    break;
2507                }
2508            }
2509
2510            if (nonZeroValReached == false) {
2511                digIndx = 2;
2512                m_utilStringBuffer_.setCharAt(2, (char)6);
2513            }
2514
2515            if (digIndx % 2 != 0) {
2516                if (collateVal == 0 && leadingZeroIndex == 0) {
2517                    // This removes the leading 0 in a odd number sequence of
2518                    // numbers e.g. avery001
2519                    leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
2520                }
2521                else {
2522                    // this is not a leading 0, we add it in
2523                    m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
2524                                                (char)((collateVal << 1) + 6));
2525                    digIndx ++;
2526                }
2527            }
2528
2529            int endIndex = leadingZeroIndex != 0 ? leadingZeroIndex
2530                                               : ((digIndx >>> 1) + 2) ;
2531            digIndx = ((endIndex - 2) << 1) + 1; // removing initial zeros
2532            // Subtract one off of the last byte.
2533            // Really the first byte here, but it's reversed...
2534            m_utilStringBuffer_.setCharAt(2,
2535                                    (char)(m_utilStringBuffer_.charAt(2) - 1));
2536            // We want to skip over the first two slots in the buffer.
2537            // The first slot is reserved for the header byte CODAN_PLACEHOLDER.
2538            // The second slot is for the sign/exponent byte:
2539            // 0x80 + (decimalPos/2) & 7f.
2540            m_utilStringBuffer_.setCharAt(0, (char)RuleBasedCollator.CODAN_PLACEHOLDER);
2541            m_utilStringBuffer_.setCharAt(1,
2542                                    (char)(0x80 + ((digIndx >>> 1) & 0x7F)));
2543
2544            // Now transfer the collation key to our collIterate struct.
2545            // The total size for our collation key is endIndx bumped up to the
2546            // next largest even value divided by two.
2547            m_CEBufferSize_ = 0;
2548            m_CEBuffer_[m_CEBufferSize_ ++]
2549                        = (((m_utilStringBuffer_.charAt(0) << 8)
2550                            // Primary weight
2551                            | m_utilStringBuffer_.charAt(1))
2552                              << RuleBasedCollator.CE_PRIMARY_SHIFT_)
2553                            // Secondary weight
2554                            | (RuleBasedCollator.BYTE_COMMON_
2555                               << RuleBasedCollator.CE_SECONDARY_SHIFT_)
2556                            // Tertiary weight.
2557                            | RuleBasedCollator.BYTE_COMMON_;
2558            int i = endIndex - 1; // Reset the index into the buffer.
2559            while (i >= 2) {
2560                int primWeight = m_utilStringBuffer_.charAt(i --) << 8;
2561                if (i >= 2) {
2562                    primWeight |= m_utilStringBuffer_.charAt(i --);
2563                }
2564                m_CEBuffer_[m_CEBufferSize_ ++]
2565                    = (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
2566                      | RuleBasedCollator.CE_CONTINUATION_MARKER_;
2567            }
2568            m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2569            return m_CEBuffer_[m_CEBufferOffset_];
2570        }
2571        else {
2572            return collator.m_expansion_[getExpansionOffset(collator, ce)];
2573        }
2574    }
2575
2576    /**
2577     * Decomposes a Hangul syllable into its jamos and produces their ces.
2578     * Unless the collator treats jamos specially, the jamo ces are looked up
2579     * and stored in m_CEBuffer_. Otherwise the jamos are appended to
2580     * m_buffer_ and IGNORABLE is returned.
2581     * @param collator current collator
2582     * @param ch current character
2583     * @return previous hangul ce
2584     */
2585    private int previousHangul(RuleBasedCollator collator, char ch)
2586    {
2587        // split the syllable index into lead, vowel and trail indexes
2588        char lead = (char)(ch - HANGUL_SBASE_);
2589        char trail = (char)(lead % HANGUL_TCOUNT_);
2590        lead /= HANGUL_TCOUNT_;
2591        char vowel = (char)(lead % HANGUL_VCOUNT_);
2592        lead /= HANGUL_VCOUNT_;
2593
2594        // offset the indexes to get the jamo characters
2595        lead += HANGUL_LBASE_;
2596        vowel += HANGUL_VBASE_;
2597        trail += HANGUL_TBASE_;
2598        // a trail that still equals HANGUL_TBASE_ is left out below
2599        boolean hasTrail = trail != HANGUL_TBASE_;
2600
2601        m_CEBufferSize_ = 0;
2602        if (collator.m_isJamoSpecial_) {
2603            // Since Hanguls pass the FCD check, it is guaranteed that we won't
2604            // be in the normalization buffer if something like this happens
2605            // Move Jamos into normalization buffer
2606            m_buffer_.append(lead);
2607            m_buffer_.append(vowel);
2608            if (hasTrail) {
2609                m_buffer_.append(trail);
2610            }
2611
2612            m_FCDStart_ = m_source_.getIndex();
2613            m_FCDLimit_ = m_FCDStart_ + 1;
2614            return IGNORABLE;
2615        }
2616
2617        m_CEBuffer_[m_CEBufferSize_ ++] = collator.m_trie_.getLeadValue(lead);
2618        m_CEBuffer_[m_CEBufferSize_ ++] = collator.m_trie_.getLeadValue(vowel);
2619        if (hasTrail) {
2620            m_CEBuffer_[m_CEBufferSize_ ++] = collator.m_trie_.getLeadValue(trail);
2621        }
2622        m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2623        return m_CEBuffer_[m_CEBufferOffset_];
2624    }
2625
2626    /**
2627     * Generates the two implicit ces of a code point into m_CEBuffer_.
2628     * @param codepoint current codepoint
2629     * @return IGNORABLE for an illegal code point, otherwise the second ce
2630     */
2631    private int previousImplicit(int codepoint)
2632    {
2633        if (UCharacter.isLegal(codepoint)) {
2634            int implicit = RuleBasedCollator.impCEGen_.getImplicitFromCodePoint(codepoint);
2635            m_CEBufferSize_ = 2;
2636            m_CEBufferOffset_ = 1;
2637            m_CEBuffer_[0] = (implicit & RuleBasedCollator.CE_PRIMARY_MASK_)
2638                             | 0x00000505;
2639            m_CEBuffer_[1] = ((implicit & 0x0000FFFF) << 16) | 0x000000C0;
2640            return m_CEBuffer_[1];
2641        }
2642        return IGNORABLE; // illegal code value, completely ignorable
2643    }
2644
2645    /**
2646     * Gets the previous surrogate ce, pairing ch with the preceding character
2647     * @param ch current character
2648     * @return result of previousImplicit for the pair, or IGNORABLE if unpaired
2649     */
2650    private int previousSurrogate(char ch)
2651    {
2652        if (!isBackwardsStart()) {
2653            char lead = (char)previousChar();
2654            if (UTF16.isLeadSurrogate(lead)) {
2655                // Handles Han and Supplementary characters here.
2656                int codepoint = UCharacterProperty.getRawSupplementary(lead, ch);
2657                return previousImplicit(codepoint);
2658            }
2659            if (lead != CharacterIterator.DONE) {
2660                // not a lead surrogate, step forward again
2661                nextChar();
2662            }
2663        }
2664        // start of the string or an unpaired surrogate, completely ignorable
2665        return IGNORABLE;
2666    }
2667
2668    /**
2669     * <p>Special CE management. Expansions, contractions etc...</p>
2670     * <p>Dispatches on the tag of the ce and keeps going for as long as the
2671     * outcome is itself a special ce.</p>
2672     * @param collator can be plain UCA
2673     * @param ce current ce
2674     * @param ch current character
2675     * @return previous special ce
2676     */
2677    private int previousSpecial(RuleBasedCollator collator, int ce, char ch)
2678    {
2679        // tags that do not return directly (special prefix, contraction at
2680        // the start of the text, digit, unknown tags) may need another round
2681        do {
2682            switch (RuleBasedCollator.getTag(ce)) {
2683            case CE_NOT_FOUND_TAG_:
2684                return ce;
2685            case RuleBasedCollator.CE_SURROGATE_TAG_:
2686                // essentially a disengaged lead surrogate. a broken
2687                // sequence was encountered and this is an error
2688            case CE_LEAD_SURROGATE_TAG_:  // D800-DBFF
2689                // broken surrogate sequence
2690                return IGNORABLE;
2691            case CE_TRAIL_SURROGATE_TAG_: // DC00-DFFF
2692                return previousSurrogate(ch);
2693            case CE_SPEC_PROC_TAG_:
2694                ce = previousSpecialPrefix(collator, ce);
2695                break;
2696            case CE_CONTRACTION_TAG_:
2697                // may loop for first character e.g. "0x0f71" for english
2698                if (!isBackwardsStart()) {
2699                    return previousContraction(collator, ce, ch);
2700                }
2701                // start of string or this is not the end of any contraction
2702                ce = collator.m_contractionCE_[
2703                                        getContractionOffset(collator, ce)];
2704                break;
2705            case CE_LONG_PRIMARY_TAG_:
2706                return previousLongPrimary(ce);
2707            case CE_EXPANSION_TAG_:
2708                return previousExpansion(collator, ce);
2709            case CE_DIGIT_TAG_:
2710                ce = previousDigit(collator, ce, ch);
2711                break;
2712            case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
2713                return previousHangul(collator, ch);
2714            case CE_CJK_IMPLICIT_TAG_:
2715                // 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
2716            case CE_IMPLICIT_TAG_:
2717                // everything that is not defined
2718                // UCA is filled with these. Tailorings are NOT_FOUND
2719                return previousImplicit(ch);
2720            case CE_CHARSET_TAG_:
2721                return CE_NOT_FOUND_;
2722            default:
2723                // any other tag is treated as ignorable
2724                ce = IGNORABLE;
2725            }
2726        } while (RuleBasedCollator.isSpecial(ce));
2727        return ce;
2728    }
2729
2730    /**
2731     * GET IMPLICIT PRIMARY WEIGHTS
2732     * @param cp codepoint
2733     * @return left justified primary key
2734     */
2735//    private static final int getImplicitPrimary(int cp)
2736//    {
2737//        cp = swapCJK(cp);
2738//
2739//        //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
2740//        // we now have a range of numbers from 0 to 21FFFF.
2741//        // we must skip all 00, 01, 02 bytes, so most bytes have 253 values
2742//        // we must leave a gap of 01 between all values of the last byte, so
2743//        // the last byte has 126 values (3 byte case)
2744//        // we shift so that HAN all has the same first primary, for
2745//        // compression.
2746//        // for the 4 byte case, we make the gap as large as we can fit.
2747//        // Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
2748//        // Four byte forms (most supplementaries) are EF xx xx xx (with a gap
2749//        // of LAST2_MULTIPLIER == 14)
2750//
2751//        int last0 = cp - RuleBasedCollator.IMPLICIT_4BYTE_BOUNDARY_;
2752//        if (last0 < 0) {
2753//            int last1 = cp / RuleBasedCollator.LAST_COUNT_;
2754//            last0 = cp % RuleBasedCollator.LAST_COUNT_;
2755//
2756//            int last2 = last1 / RuleBasedCollator.OTHER_COUNT_;
2757//            last1 %= RuleBasedCollator.OTHER_COUNT_;
2758//            return RuleBasedCollator.IMPLICIT_BASE_3BYTE_ + (last2 << 24)
2759//                   + (last1 << 16)
2760//                   + ((last0 * RuleBasedCollator.LAST_MULTIPLIER_) << 8);
2761//        }
2762//        else {
2763//            int last1 = last0 / RuleBasedCollator.LAST_COUNT2_;
2764//            last0 %= RuleBasedCollator.LAST_COUNT2_;
2765//
2766//            int last2 = last1 / RuleBasedCollator.OTHER_COUNT_;
2767//            last1 %= RuleBasedCollator.OTHER_COUNT_;
2768//
2769//            int last3 = last2 / RuleBasedCollator.OTHER_COUNT_;
2770//            last2 %= RuleBasedCollator.OTHER_COUNT_;
2771//            return RuleBasedCollator.IMPLICIT_BASE_4BYTE_ + (last3 << 24)
2772//                   + (last2 << 16) + (last1 << 8)
2773//                   + (last0 * RuleBasedCollator.LAST2_MULTIPLIER_);
2774//        }
2775//    }
2776
2777//    /**
2778//     * Swapping CJK characters for implicit ces
2779//     * @param cp codepoint CJK
2780//     * @return swapped result
2781//     */
2782//    private static final int swapCJK(int cp)
2783//    {
2784//        if (cp >= CJK_BASE_) {
2785//            if (cp < CJK_LIMIT_) {
2786//                return cp - CJK_BASE_;
2787//            }
2788//            if (cp < CJK_COMPAT_USED_BASE_) {
2789//                return cp + NON_CJK_OFFSET_;
2790//            }
2791//            if (cp < CJK_COMPAT_USED_LIMIT_) {
2792//                return cp - CJK_COMPAT_USED_BASE_ + (CJK_LIMIT_ - CJK_BASE_);
2793//            }
2794//            if (cp < CJK_B_BASE_) {
2795//                return cp + NON_CJK_OFFSET_;
2796//            }
2797//            if (cp < CJK_B_LIMIT_) {
2798//                return cp; // non-BMP-CJK
2799//            }
2800//            return cp + NON_CJK_OFFSET_; // non-CJK
2801//        }
2802//        if (cp < CJK_A_BASE_) {
2803//            return cp + NON_CJK_OFFSET_;
2804//        }
2805//        if (cp < CJK_A_LIMIT_) {
2806//            return cp - CJK_A_BASE_ + (CJK_LIMIT_ - CJK_BASE_)
2807//                   + (CJK_COMPAT_USED_LIMIT_ - CJK_COMPAT_USED_BASE_);
2808//        }
2809//        return cp + NON_CJK_OFFSET_; // non-CJK
2810//    }
2811    
2812    /**
2813     * Peeks at the character in the source string that is offset positions
2814     * away from the current position. The source index is left where it was.
2815     * Handles both normal and iterative cases.
2816     * No error checking and does not access the normalization buffer
2817     * - caller beware!
2818     * @param offset offset from current position which character is to be
2819     *               retrieved
2820     * @return character at current position + offset
2821     */
2822    private char peekCharacter(int offset)
2823    {
2824        if (offset == 0) {
2825            return (char)m_source_.current();
2826        }
2827        // move to the target, read it, then put the index back
2828        int home = m_source_.getIndex();
2829        m_source_.setIndex(home + offset);
2830        char peeked = (char)m_source_.current();
2831        m_source_.setIndex(home);
2832        return peeked;
2833    }
2834    
2835    /**
2836     * Steps back 1 position in the source string, or in the buffer when
2837     * m_bufferOffset_ is not negative. Unlike previousChar this does not
2838     * normalize while moving back, and no boundary checks are performed.
2839     * Use with caution and only when moving back one position cannot
2840     * exceed the source limits.
2841     * Use only with nextChar() and never call this API twice in a row without
2842     * nextChar() in the middle.
2843     */
2844    private void goBackOne()
2845    {
2846        if (m_bufferOffset_ < 0) {
2847            int here = m_source_.getIndex();
2848            m_source_.setIndex(here - 1);
2849            return;
2850        }
2851        m_bufferOffset_ --;
2852    }
2853    
2854    /**
2855     * Steps forward 1 position in the source string, or in the buffer when
2856     * m_bufferOffset_ is not negative. Unlike nextChar this does not
2857     * normalize while moving forward, and no boundary checks are performed.
2858     * Use with caution and only when moving forward one position cannot
2859     * exceed the source limits.
2860     * Use only with previousChar() and never call this API twice in a row
2861     * without previousChar() in the middle.
2862     */
2863    private void goForwardOne()
2864    {
2865        if (m_bufferOffset_ >= 0) {
2866            // we are in the buffer, buffer offset will never be 0 here
2867            m_bufferOffset_ ++;
2868            return;
2869        }
2870        // we're working on the source and not normalizing. fast path.
2871        // note Thai pre-vowel reordering uses buffer too
2872        int here = m_source_.getIndex();
2873        m_source_.setIndex(here + 1);
2874    }
2875}
2876
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags