RuleBasedCollator


1   //##header 1189099963000 FOUNDATION
2   /**
3   *******************************************************************************
4   * Copyright (C) 1996-2006, International Business Machines Corporation and    *
5   * others. All Rights Reserved.                                                *
6   *******************************************************************************
7   */
8   package com.ibm.icu.text;
9   
10  import java.io.IOException  ;
11  import java.text.CharacterIterator  ;
12  import java.text.ParseException  ;
13  import java.util.Arrays  ;
14  import java.util.MissingResourceException  ;
15  
16  //#ifndef FOUNDATION
17  //##import java.nio.ByteBuffer;
18  //#else
19  import com.ibm.icu.impl.ByteBuffer;
20  //#endif
21  
22  import com.ibm.icu.impl.BOCU;
23  import com.ibm.icu.impl.ICUDebug;
24  import com.ibm.icu.impl.ICUResourceBundle;
25  import com.ibm.icu.impl.ImplicitCEGenerator;
26  import com.ibm.icu.impl.IntTrie;
27  import com.ibm.icu.impl.StringUCharacterIterator;
28  import com.ibm.icu.impl.Trie;
29  import com.ibm.icu.impl.TrieIterator;
30  import com.ibm.icu.impl.Utility;
31  import com.ibm.icu.lang.UCharacter;
32  import com.ibm.icu.util.RangeValueIterator;
33  import com.ibm.icu.util.ULocale;
34  import com.ibm.icu.util.UResourceBundle;
35  import com.ibm.icu.util.VersionInfo;
36  
37  /**
38   * <p>RuleBasedCollator is a concrete subclass of Collator. It allows
39   * customization of the Collator via user-specified rule sets.
40   * RuleBasedCollator is designed to be fully compliant to the <a
41   * HREF="http://www.unicode.org/unicode/reports/tr10/"> Unicode
42   * Collation Algorithm (UCA)</a> and conforms to ISO 14651.</p>
43   *
44   * <p>Users are strongly encouraged to read <a
45   * HREF="http://icu.sourceforge.net/userguide/Collate_Intro.html">
46   * the users guide</a> for more information about the collation
47   * service before using this class.</p>
48   *
49   * <p>Create a RuleBasedCollator from a locale by calling the
50   * getInstance(Locale) factory method in the base class Collator.
51   * Collator.getInstance(Locale) creates a RuleBasedCollator object
52   * based on the collation rules defined by the argument locale.  If a
53   * customized collation ordering or attributes is required, use the
54   * RuleBasedCollator(String) constructor with the appropriate
55   * rules. The customized RuleBasedCollator will base its ordering on
56   * UCA, while re-adjusting the attributes and orders of the characters
57   * in the specified rule accordingly.</p>
58   *
59   * <p>RuleBasedCollator provides correct collation orders for most
60   * locales supported in ICU. If specific data for a locale is not
61   * available, the order eventually falls back to the <a
62   * HREF="http://www.unicode.org/unicode/reports/tr10/">UCA collation
63   * order </a>.</p>
64   *
65   * <p>For information about the collation rule syntax and details
66   * about customization, please refer to the
67   * <a HREF="http://icu.sourceforge.net/userguide/Collate_Customization.html">
68   * Collation customization</a> section of the user's guide.</p>
69   *
70   * <p><strong>Note</strong> that there are some differences between
71   * the Collation rule syntax used in Java and ICU4J:
72   *
73   * <ul>
74   * <li>According to the JDK documentation:
75   * <i>
76   * <p>
77   * Modifier '!' : Turns on Thai/Lao vowel-consonant swapping. If this rule
78   * is in force when a Thai vowel of the range &#92;U0E40-&#92;U0E44 precedes a
79   * Thai consonant of the range &#92;U0E01-&#92;U0E2E OR a Lao vowel of the
80   * range &#92;U0EC0-&#92;U0EC4 precedes a Lao consonant of the range
81   * &#92;U0E81-&#92;U0EAE then the
82   * vowel is placed after the consonant for collation purposes.
83   * </p>
84   * <p>
85   * If a rule is without the modifier '!', the Thai/Lao vowel-consonant
86   * swapping is not turned on.
87   * </p>
88   * </i>
89   * <p>
90   * ICU4J's RuleBasedCollator does not support turning off the Thai/Lao
91   * vowel-consonant swapping, since the UCA clearly states that it has to be
92   * supported to ensure a correct sorting order. If a '!' is encountered, it is
93   * ignored.
94   * </p>
95   * <li>As mentioned in the documentation of the base class Collator,
96   *     compatibility decomposition mode is not supported.
97   * </ul>
98   * <p>
99   * <strong>Examples</strong>
100  * </p>
101  * <p>
102  * Creating Customized RuleBasedCollators:
103  * <blockquote>
104  * <pre>
105  * String simple = "&amp; a &lt; b &lt; c &lt; d";
106  * RuleBasedCollator simpleCollator = new RuleBasedCollator(simple);
107  *
108  * String norwegian = "&amp; a , A &lt; b , B &lt; c , C &lt; d , D &lt; e , E "
109  *                    + "&lt; f , F &lt; g , G &lt; h , H &lt; i , I &lt; j , "
110  *                    + "J &lt; k , K &lt; l , L &lt; m , M &lt; n , N &lt; "
111  *                    + "o , O &lt; p , P &lt; q , Q &lt; r , R &lt; s , S &lt; "
112  *                    + "t , T &lt; u , U &lt; v , V &lt; w , W &lt; x , X "
113  *                    + "&lt; y , Y &lt; z , Z &lt; &#92;u00E5 = a&#92;u030A "
114  *                    + ", &#92;u00C5 = A&#92;u030A ; aa , AA &lt; &#92;u00E6 "
115  *                    + ", &#92;u00C6 &lt; &#92;u00F8 , &#92;u00D8";
116  * RuleBasedCollator norwegianCollator = new RuleBasedCollator(norwegian);
117  * </pre>
118  * </blockquote>
119  *
120  * Concatenating rules to combine <code>Collator</code>s:
121  * <blockquote>
122  * <pre>
123  * // Create an en_US Collator object
124  * RuleBasedCollator en_USCollator = (RuleBasedCollator)
125  *     Collator.getInstance(new Locale("en", "US", ""));
126  * // Create a da_DK Collator object
127  * RuleBasedCollator da_DKCollator = (RuleBasedCollator)
128  *     Collator.getInstance(new Locale("da", "DK", ""));
129  * // Combine the two
130  * // First, get the collation rules from en_USCollator
131  * String en_USRules = en_USCollator.getRules();
132  * // Second, get the collation rules from da_DKCollator
133  * String da_DKRules = da_DKCollator.getRules();
134  * RuleBasedCollator newCollator =
135  *                             new RuleBasedCollator(en_USRules + da_DKRules);
136  * // newCollator has the combined rules
137  * </pre>
138  * </blockquote>
139  *
140  * Making changes to an existing RuleBasedCollator to create a new
141  * <code>Collator</code> object, by appending changes to the existing rule:
142  * <blockquote>
143  * <pre>
144  * // Create a new Collator object with additional rules
145  * String addRules = "&amp; C &lt; ch, cH, Ch, CH";
146  * RuleBasedCollator myCollator =
147  *     new RuleBasedCollator(en_USCollator.getRules() + addRules);
148  * // myCollator contains the new rules
149  * </pre>
150  * </blockquote>
151  *
152  * How to change the order of non-spacing accents:
153  * <blockquote>
154  * <pre>
155  * // old rule with main accents
156  * String oldRules = "= &#92;u0301 ; &#92;u0300 ; &#92;u0302 ; &#92;u0308 "
157  *                 + "; &#92;u0327 ; &#92;u0303 ; &#92;u0304 ; &#92;u0305 "
158  *                 + "; &#92;u0306 ; &#92;u0307 ; &#92;u0309 ; &#92;u030A "
159  *                 + "; &#92;u030B ; &#92;u030C ; &#92;u030D ; &#92;u030E "
160  *                 + "; &#92;u030F ; &#92;u0310 ; &#92;u0311 ; &#92;u0312 "
161  *                 + "&lt; a , A ; ae, AE ; &#92;u00e6 , &#92;u00c6 "
162  *                 + "&lt; b , B &lt; c, C &lt; e, E &amp; C &lt; d , D";
163  * // change the order of accent characters
164  * String addOn = "&amp; &#92;u0300 ; &#92;u0308 ; &#92;u0302";
165  * RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn);
166  * </pre>
167  * </blockquote>
168  *
169  * Putting in a new primary ordering before the default setting,
170  * e.g. sort English characters before or after Japanese characters in the Japanese
171  * <code>Collator</code>:
172  * <blockquote>
173  * <pre>
174  * // get en_US Collator rules
175  * RuleBasedCollator en_USCollator
176  *                        = (RuleBasedCollator)Collator.getInstance(Locale.US);
177  * // add a few Japanese characters to sort before English characters
178  * // suppose the last character before the first base letter 'a' in
179  * // the English collation rule is &#92;u2212
180  * String jaString = "&amp; &#92;u2212 &lt; &#92;u3041, &#92;u3042 &lt; &#92;u3043, "
181  *                   + "&#92;u3044";
182  * RuleBasedCollator myJapaneseCollator
183  *              = new RuleBasedCollator(en_USCollator.getRules() + jaString);
184  * </pre>
185  * </blockquote>
186  * </p>
187  * <p>
188  * This class is not subclassable
189  * </p>
190  * @author Syn Wee Quek
191  * @stable ICU 2.8
192  */
193 public final class RuleBasedCollator extends Collator
194 {   
195     // public constructors ---------------------------------------------------
196 
197     /**
198      * <p>
199      * Constructor that takes the argument rules for
200      * customization. The collator will be based on UCA,
201      * with the attributes and re-ordering of the characters specified in the
202      * argument rules.
203      * </p>
204      * <p>See the user guide's section on
205      * <a HREF="http://icu.sourceforge.net/userguide/Collate_Customization.html">
206      * Collation Customization</a> for details on the rule syntax.
207      * </p>
208      * @param rules the collation rules to build the collation table from.
209      * @exception ParseException and IOException thrown. ParseException thrown
210      *            when argument rules have an invalid syntax. IOException
211      *            thrown when an error occured while reading internal data.
212      * @stable ICU 2.8
213      */
214     public RuleBasedCollator(String   rules) throws Exception  
215     {
216         checkUCA();
217         if (rules == null) {
218             throw new IllegalArgumentException  (
219                                             "Collation rules can not be null");
220         }
221         init(rules);
222     }
223 
224     // public methods --------------------------------------------------------
225 
226     /**
227      * Clones the RuleBasedCollator
228      * @return a new instance of this RuleBasedCollator object
229      * @stable ICU 2.8
230      */
231     public Object   clone() throws CloneNotSupportedException  
232     {
233         RuleBasedCollator result = (RuleBasedCollator)super.clone();
234         if (latinOneCEs_ != null) {
235             result.m_reallocLatinOneCEs_ = true;
236         }
237         // since all collation data in the RuleBasedCollator do not change
238         // we can safely assign the result.fields to this collator
239         result.initUtility(false);  // let the new clone have their own util
240                                     // iterators
241         return result;
242     }
243 
244     /**
245      * Return a CollationElementIterator for the given String.
246      * @see CollationElementIterator
247      * @stable ICU 2.8
248      */
249     public CollationElementIterator getCollationElementIterator(String   source)
250     {
251         return new CollationElementIterator(source, this);
252     }
253 
254     /**
255      * Return a CollationElementIterator for the given CharacterIterator.
256      * The source iterator's integrity will be preserved since a new copy
257      * will be created for use.
258      * @see CollationElementIterator
259      * @stable ICU 2.8
260      */
261     public CollationElementIterator getCollationElementIterator(
262                                                 CharacterIterator   source)
263     {
264         CharacterIterator   newsource = (CharacterIterator  )source.clone();
265         return new CollationElementIterator(newsource, this);
266     }
267     
268     /**
269      * Return a CollationElementIterator for the given UCharacterIterator.
270      * The source iterator's integrity will be preserved since a new copy
271      * will be created for use.
272      * @see CollationElementIterator
273      * @stable ICU 2.8
274      */
275     public CollationElementIterator getCollationElementIterator(
276                                                 UCharacterIterator source)
277     {
278         return new CollationElementIterator(source, this);
279     }
280 
281     // public setters --------------------------------------------------------
282 
283     /**
284      * Sets the Hiragana Quaternary mode to be on or off.
285      * When the Hiragana Quaternary mode is turned on, the collator
286      * positions Hiragana characters before all non-ignorable characters in
287      * QUATERNARY strength. This is to produce a correct JIS collation order,
288      * distinguishing between Katakana  and Hiragana characters.
289      * @param flag true if Hiragana Quaternary mode is to be on, false
290      *        otherwise
291      * @see #setHiraganaQuaternaryDefault
292      * @see #isHiraganaQuaternary
293      * @stable ICU 2.8
294      */
295     public void setHiraganaQuaternary(boolean flag)
296     {
297         m_isHiragana4_ = flag;
298         updateInternalState();        
299     }
300 
301     /**
302      * Sets the Hiragana Quaternary mode to the initial mode set during
303      * construction of the RuleBasedCollator.
304      * See setHiraganaQuaternary(boolean) for more details.
305      * @see #setHiraganaQuaternary(boolean)
306      * @see #isHiraganaQuaternary
307      * @stable ICU 2.8
308      */
309     public void setHiraganaQuaternaryDefault()
310     {
311         m_isHiragana4_ = m_defaultIsHiragana4_;
312         updateInternalState();
313     }
314 
315     /**
316      * Sets whether uppercase characters sort before lowercase
317      * characters or vice versa, in strength TERTIARY. The default
318      * mode is false, and so lowercase characters sort before uppercase
319      * characters.
320      * If true, sort upper case characters first.
321      * @param upperfirst true to sort uppercase characters before
322      *                   lowercase characters, false to sort lowercase
323      *                   characters before uppercase characters
324      * @see #isLowerCaseFirst
325      * @see #isUpperCaseFirst
326      * @see #setLowerCaseFirst
327      * @see #setCaseFirstDefault
328      * @stable ICU 2.8
329      */
330     public void setUpperCaseFirst(boolean upperfirst)
331     {
332         if (upperfirst) {
333             if(m_caseFirst_ != AttributeValue.UPPER_FIRST_) {
334                 latinOneRegenTable_ = true;
335             }
336             m_caseFirst_ = AttributeValue.UPPER_FIRST_;
337         }
338         else {
339             if(m_caseFirst_ != AttributeValue.OFF_) {
340                 latinOneRegenTable_ = true;
341             }
342             m_caseFirst_ = AttributeValue.OFF_;
343         }
344         updateInternalState();
345     }
346 
347     /**
348      * Sets the orders of lower cased characters to sort before upper cased
349      * characters, in strength TERTIARY. The default
350      * mode is false.
351      * If true is set, the RuleBasedCollator will sort lower cased characters
352      * before the upper cased ones.
353      * Otherwise, if false is set, the RuleBasedCollator will ignore case
354      * preferences.
355      * @param lowerfirst true for sorting lower cased characters before
356      *                   upper cased characters, false to ignore case
357      *                   preferences.
358      * @see #isLowerCaseFirst
359      * @see #isUpperCaseFirst
360      * @see #setUpperCaseFirst
361      * @see #setCaseFirstDefault
362      * @stable ICU 2.8
363      */
364     public void setLowerCaseFirst(boolean lowerfirst)
365     {
366         if (lowerfirst) {
367                 if(m_caseFirst_ != AttributeValue.LOWER_FIRST_) {
368                     latinOneRegenTable_ = true;
369                 }
370                 m_caseFirst_ = AttributeValue.LOWER_FIRST_;
371         }
372         else {
373                 if(m_caseFirst_ != AttributeValue.OFF_) {
374                     latinOneRegenTable_ = true;
375                 }
376             m_caseFirst_ = AttributeValue.OFF_;
377             }
378         updateInternalState();
379     }
380 
381     /**
382      * Sets the case first mode to the initial mode set during
383      * construction of the RuleBasedCollator.
384      * See setUpperCaseFirst(boolean) and setLowerCaseFirst(boolean) for more
385      * details.
386      * @see #isLowerCaseFirst
387      * @see #isUpperCaseFirst
388      * @see #setLowerCaseFirst(boolean)
389      * @see #setUpperCaseFirst(boolean)
390      * @stable ICU 2.8
391      */
392     public final void setCaseFirstDefault()
393     {
394         if(m_caseFirst_ != m_defaultCaseFirst_) {
395             latinOneRegenTable_ = true;
396         }
397         m_caseFirst_ = m_defaultCaseFirst_;
398         updateInternalState();
399     }
400 
401     /**
402      * Sets the alternate handling mode to the initial mode set during
403      * construction of the RuleBasedCollator.
404      * See setAlternateHandling(boolean) for more details.
405      * @see #setAlternateHandlingShifted(boolean)
406      * @see #isAlternateHandlingShifted()
407      * @stable ICU 2.8
408      */
409     public void setAlternateHandlingDefault()
410     {
411         m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
412         updateInternalState();
413     }
414 
415     /**
416      * Sets the case level mode to the initial mode set during
417      * construction of the RuleBasedCollator.
418      * See setCaseLevel(boolean) for more details.
419      * @see #setCaseLevel(boolean)
420      * @see #isCaseLevel
421      * @stable ICU 2.8
422      */
423     public void setCaseLevelDefault()
424     {
425         m_isCaseLevel_ = m_defaultIsCaseLevel_;
426         updateInternalState();
427     }
428 
429     /**
430      * Sets the decomposition mode to the initial mode set during construction
431      * of the RuleBasedCollator.
432      * See setDecomposition(int) for more details.
433      * @see #getDecomposition
434      * @see #setDecomposition(int)
435      * @stable ICU 2.8
436      */
437     public void setDecompositionDefault()
438     {
439         setDecomposition(m_defaultDecomposition_);
440         updateInternalState();        
441     }
442 
443     /**
444      * Sets the French collation mode to the initial mode set during
445      * construction of the RuleBasedCollator.
446      * See setFrenchCollation(boolean) for more details.
447      * @see #isFrenchCollation
448      * @see #setFrenchCollation(boolean)
449      * @stable ICU 2.8
450      */
451     public void setFrenchCollationDefault()
452     {
453         if(m_isFrenchCollation_ != m_defaultIsFrenchCollation_) {
454             latinOneRegenTable_ = true;
455         }
456         m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
457         updateInternalState();
458     }
459 
460     /**
461      * Sets the collation strength to the initial mode set during the
462      * construction of the RuleBasedCollator.
463      * See setStrength(int) for more details.
464      * @see #setStrength(int)
465      * @see #getStrength
466      * @stable ICU 2.8
467      */
468     public void setStrengthDefault()
469     {
470         setStrength(m_defaultStrength_);
471         updateInternalState();        
472     }
473     
474     /**
475      * Method to set numeric collation to its default value.
476      * When numeric collation is turned on, this Collator generates a collation 
477      * key for the numeric value of substrings of digits. This is a way to get 
478      * '100' to sort AFTER '2'
479      * @see #getNumericCollation
480      * @see #setNumericCollation
481      * @stable ICU 2.8
482      */
483     public void setNumericCollationDefault()
484     {
485         setNumericCollation(m_defaultIsNumericCollation_);
486         updateInternalState();        
487     }
488 
489     /**
490      * Sets the mode for the direction of SECONDARY weights to be used in
491      * French collation.
492      * The default value is false, which treats SECONDARY weights in the order
493      * they appear.
494      * If set to true, the SECONDARY weights will be sorted backwards.
495      * See the section on
496      * <a HREF="http://icu.sourceforge.net/userguide/Collate_ServiceArchitecture.html">
497      * French collation</a> for more information.
498      * @param flag true to set the French collation on, false to set it off
499      * @stable ICU 2.8
500      * @see #isFrenchCollation
501      * @see #setFrenchCollationDefault
502      */
503     public void setFrenchCollation(boolean flag)
504     {
505         if(m_isFrenchCollation_ != flag) {
506             latinOneRegenTable_ = true;
507         }
508         m_isFrenchCollation_ = flag;
509         updateInternalState();
510     }
511 
512     /**
513      * Sets the alternate handling for QUATERNARY strength to be either
514      * shifted or non-ignorable.
515      * See the UCA definition on
516      * <a HREF="http://www.unicode.org/unicode/reports/tr10/#Variable_Weighting">
517      * Alternate Weighting</a>.
518      * This attribute will only be effective when QUATERNARY strength is set.
519      * The default value for this mode is false, corresponding to the
520      * NON_IGNORABLE mode in UCA. In the NON-IGNORABLE mode, the
521      * RuleBasedCollator will treats all the codepoints with non-ignorable
522      * primary weights in the same way.
523      * If the mode is set to true, the behaviour corresponds to SHIFTED defined
524      * in UCA, this causes codepoints with PRIMARY orders that are equal or
525      * below the variable top value to be ignored in PRIMARY order and
526      * moved to the QUATERNARY order.
527      * @param shifted true if SHIFTED behaviour for alternate handling is
528      *        desired, false for the NON_IGNORABLE behaviour.
529      * @see #isAlternateHandlingShifted
530      * @see #setAlternateHandlingDefault
531      * @stable ICU 2.8
532      */
533     public void setAlternateHandlingShifted(boolean shifted)
534     {
535         m_isAlternateHandlingShifted_ = shifted;
536         updateInternalState();
537     }
538 
539     /**
540      * <p>
541      * When case level is set to true, an additional weight is formed
542      * between the SECONDARY and TERTIARY weight, known as the case level.
543      * The case level is used to distinguish large and small Japanese Kana
544      * characters. Case level could also be used in other situations.
545      * For example to distinguish certain Pinyin characters.
546      * The default value is false, which means the case level is not generated.
547      * The contents of the case level are affected by the case first
548      * mode. A simple way to ignore accent differences in a string is to set
549      * the strength to PRIMARY and enable case level.
550      * </p>
551      * <p>
552      * See the section on
553      * <a HREF="http://icu.sourceforge.net/userguide/Collate_ServiceArchitecture.html">
554      * case level</a> for more information.
555      * </p>
556      * @param flag true if case level sorting is required, false otherwise
557      * @stable ICU 2.8
558      * @see #setCaseLevelDefault
559      * @see #isCaseLevel
560      */
561     public void setCaseLevel(boolean flag)
562     {
563         m_isCaseLevel_ = flag;
564         updateInternalState();
565     }
566 
567     /**
568      * <p>
569      * Sets this Collator's strength property. The strength property
570      * determines the minimum level of difference considered significant
571      * during comparison.
572      * </p>
573      * <p>See the Collator class description for an example of use.</p>
574      * @param newStrength the new strength value.
575      * @see #getStrength
576      * @see #setStrengthDefault
577      * @see #PRIMARY
578      * @see #SECONDARY
579      * @see #TERTIARY
580      * @see #QUATERNARY
581      * @see #IDENTICAL
582      * @exception IllegalArgumentException If the new strength value is not one
583      *              of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.
584      * @stable ICU 2.8
585      */
586     public void setStrength(int newStrength)
587     {
588         super.setStrength(newStrength);
589         updateInternalState();
590     }
591     
592     /** 
593      * <p>
594      * Variable top is a two byte primary value which causes all the codepoints 
595      * with primary values that are less or equal than the variable top to be 
596      * shifted when alternate handling is set to SHIFTED.
597      * </p>
598      * <p>
599      * Sets the variable top to a collation element value of a string supplied.
600      * </p> 
601      * @param varTop one or more (if contraction) characters to which the 
602      *               variable top should be set
603      * @return a int value containing the value of the variable top in upper 16
604      *         bits. Lower 16 bits are undefined.
605      * @exception IllegalArgumentException is thrown if varTop argument is not 
606      *            a valid variable top element. A variable top element is 
607      *            invalid when 
608      *            <ul>
609      *            <li>it is a contraction that does not exist in the
610      *                Collation order
611      *            <li>when the PRIMARY strength collation element for the 
612      *                variable top has more than two bytes
613      *            <li>when the varTop argument is null or zero in length.
614      *            </ul>
615      * @see #getVariableTop
616      * @see RuleBasedCollator#setAlternateHandlingShifted
617      * @stable ICU 2.6
618      */
619     public int setVariableTop(String   varTop)
620     {
621         if (varTop == null || varTop.length() == 0) {
622             throw new IllegalArgumentException  (
623             "Variable top argument string can not be null or zero in length.");
624         }
625         if (m_srcUtilIter_ == null) {
626             initUtility(true);
627         }
628 
629         m_srcUtilColEIter_.setText(varTop);
630         int ce = m_srcUtilColEIter_.next();
631         
632         // here we check if we have consumed all characters 
633         // you can put in either one character or a contraction
634         // you shouldn't put more... 
635         if (m_srcUtilColEIter_.getOffset() != varTop.length() 
636             || ce == CollationElementIterator.NULLORDER) {
637             throw new IllegalArgumentException  (
638             "Variable top argument string is a contraction that does not exist "
639             + "in the Collation order");
640         }
641         
642         int nextCE = m_srcUtilColEIter_.next();
643         
644         if ((nextCE != CollationElementIterator.NULLORDER) 
645             && (!isContinuation(nextCE) || (nextCE & CE_PRIMARY_MASK_) != 0)) {
646                 throw new IllegalArgumentException  (
647                 "Variable top argument string can only have a single collation "
648                 + "element that has less than or equal to two PRIMARY strength "
649                 + "bytes");
650         }
651         
652         m_variableTopValue_ = (ce & CE_PRIMARY_MASK_) >> 16;
653         
654         return ce & CE_PRIMARY_MASK_;
655     }
656     
657     /** 
658      * Sets the variable top to a collation element value supplied.
659      * Variable top is set to the upper 16 bits. 
660      * Lower 16 bits are ignored.
661      * @param varTop Collation element value, as returned by setVariableTop or 
662      *               getVariableTop
663      * @see #getVariableTop
664      * @see #setVariableTop(String)
665      * @stable ICU 2.6
666      */
667     public void setVariableTop(int varTop)
668     {
669         m_variableTopValue_ = (varTop & CE_PRIMARY_MASK_) >> 16;
670     }
671     
672     /**
673      * When numeric collation is turned on, this Collator generates a collation 
674      * key for the numeric value of substrings of digits. This is a way to get 
675      * '100' to sort AFTER '2'
676      * @param flag true to turn numeric collation on and false to turn it off
677      * @see #getNumericCollation
678      * @see #setNumericCollationDefault
679      * @stable ICU 2.8
680      */
681     public void setNumericCollation(boolean flag)
682     {
683         // sort substrings of digits as numbers
684         m_isNumericCollation_ = flag;
685         updateInternalState();
686     }
687 
688     // public getters --------------------------------------------------------
689 
690     /**
691      * Gets the collation rules for this RuleBasedCollator.
692      * Equivalent to String getRules(RuleOption.FULL_RULES).
693      * @return returns the collation rules
694      * @see #getRules(boolean)
695      * @stable ICU 2.8
696      */
697     public String   getRules()
698     {
699         return m_rules_;
700     }
701     
702     /**
703      * Returns current rules. The argument defines whether full rules 
704      * (UCA + tailored) rules are returned or just the tailoring. 
705      * @param fullrules true if the rules that defines the full set of 
706      *        collation order is required, otherwise false for returning only 
707      *        the tailored rules
708      * @return the current rules that defines this Collator.
709      * @see #getRules()
710      * @stable ICU 2.6
711      */
712     public String   getRules(boolean fullrules)
713     {
714         if (!fullrules) {
715             return m_rules_;
716         }
717         // take the UCA rules and append real rules at the end 
718         return UCA_.m_rules_.concat(m_rules_);
719     }
720 
721     /**
722      * Get an UnicodeSet that contains all the characters and sequences
723      * tailored in this collator.
724      * @return a pointer to a UnicodeSet object containing all the
725      *         code points and sequences that may sort differently than
726      *         in the UCA.
727      * @exception ParseException thrown when argument rules have an
728      *            invalid syntax. IOException
729      * @stable ICU 2.4
730      */
731     public UnicodeSet getTailoredSet()
732     {
733         try {
734            CollationRuleParser src = new CollationRuleParser(getRules());
735            return src.getTailoredSet();
736         } catch(Exception   e) {
737             throw new IllegalStateException  ("A tailoring rule should not " +
738                 "have errors. Something is quite wrong!");
739         }
740     }
741 
742     private class contContext {
743         RuleBasedCollator coll;
744         UnicodeSet contractions;
745         UnicodeSet expansions;
746         UnicodeSet removedContractions;
747         boolean addPrefixes;       
748         contContext(RuleBasedCollator coll, UnicodeSet contractions, UnicodeSet expansions, 
749                 UnicodeSet removedContractions, boolean addPrefixes) {
750             this.coll = coll;
751             this.contractions = contractions;
752             this.expansions = expansions;
753             this.removedContractions = removedContractions;
754             this.addPrefixes = addPrefixes;
755         }
756     }
757     
758     private void
759     addSpecial(contContext c, StringBuffer   buffer, int CE)
760     {
761         StringBuffer   b = new StringBuffer  ();
762         int offset = (CE & 0xFFFFFF) - c.coll.m_contractionOffset_;
763         int newCE = c.coll.m_contractionCE_[offset];
764         // we might have a contraction that ends from previous level
765         if(newCE != CollationElementIterator.CE_NOT_FOUND_) {
766             if(isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_ 
767                     && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_ 
768                     && c.addPrefixes) {
769                 addSpecial(c, buffer, newCE);
770             }
771             if(buffer.length() > 1) {
772                 if(c.contractions != null) {
773                     c.contractions.add(buffer.toString());
774                 }
775                 if(c.expansions != null && isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
776                     c.expansions.add(buffer.toString());
777                 }
778             }
779         }    
780         
781         offset++;
782         // check whether we're doing contraction or prefix
783         if(getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) {
784             while(c.coll.m_contractionIndex_[offset] != 0xFFFF) {
785                 b.delete(0, b.length());
786                 b.append(buffer);
787                 newCE = c.coll.m_contractionCE_[offset];
788                 b.insert(0, c.coll.m_contractionIndex_[offset]);
789                 if(isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
790                     addSpecial(c, b, newCE);
791                 } else {
792                     if(c.contractions != null) {
793                         c.contractions.add(b.toString());
794                     }
795                     if(c.expansions != null && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
796                         c.expansions.add(b.toString());
797                     }
798                 }
799                 offset++;
800             }
801         } else if(getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_) {
802             while(c.coll.m_contractionIndex_[offset] != 0xFFFF) {
803                 b.delete(0, b.length());
804                 b.append(buffer);
805                 newCE = c.coll.m_contractionCE_[offset];
806                 b.append(c.coll.m_contractionIndex_[offset]);
807                 if(isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
808                     addSpecial(c, b, newCE);
809                 } else {
810                     if(c.contractions != null) {
811                         c.contractions.add(b.toString());
812                     }
813                     if(c.expansions != null && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
814                         c.expansions.add(b.toString());
815                     }
816                 }
817                 offset++;
818             }
819         }
820     }
821     
822     private
823     void processSpecials(contContext c) 
824     {
825         int internalBufferSize = 512;
826         TrieIterator trieiterator 
827         = new TrieIterator(c.coll.m_trie_);
828         RangeValueIterator.Element element = new RangeValueIterator.Element();
829         while (trieiterator.next(element)) {
830             int start = element.start;
831             int limit = element.limit;
832             int CE = element.value;
833             StringBuffer   contraction = new StringBuffer  (internalBufferSize);
834             
835             if(isSpecial(CE)) {
836                 if(((getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) || getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_)) {
837                     while(start < limit) {
838                         // if there are suppressed contractions, we don't 
839                         // want to add them.
840                         if(c.removedContractions != null && c.removedContractions.contains(start)) {
841                             start++;
842                             continue;
843                         }
844                         // we start our contraction from middle, since we don't know if it
845                         // will grow toward right or left
846                         contraction.append((char) start);
847                         addSpecial(c, contraction, CE);
848                         start++;
849                     }
850                 } else if(c.expansions != null && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
851                     while(start < limit) {
852                         c.expansions.add(start++);
853                     }
854                 }
855             }
856         }
857     }
858     
859     /**
860      * Gets unicode sets containing contractions and/or expansions of a collator
861      * @param contractions if not null, set to contain contractions
862      * @param expansions if not null, set to contain expansions
863      * @param addPrefixes add the prefix contextual elements to contractions
864      * @throws Exception 
865      * @draft ICU 3.4
866      * @provisional This API might change or be removed in a future release.
867      */
868     public void
869     getContractionsAndExpansions(UnicodeSet contractions, UnicodeSet expansions,
870             boolean addPrefixes) throws Exception   {
871         if(contractions != null) {
872             contractions.clear();
873         }
874         if(expansions != null) {
875             expansions.clear();
876         }
877         int rulesLen = 0;
878         String   rules = getRules();
879         try {
880             CollationRuleParser src = new CollationRuleParser(rules);
881             contContext c = new contContext(RuleBasedCollator.UCA_, 
882                     contractions, expansions, src.m_removeSet_, addPrefixes);
883             
884             // Add the UCA contractions
885             processSpecials(c);
886             // This is collator specific. Add contractions from a collator
887             c.coll = this;
888             c.removedContractions =  null;
889             processSpecials(c);
890         } catch (Exception   e) {
891             throw e;
892         }
893     }
894     
895     /**
896      * <p>
897      * Get a Collation key for the argument String source from this
898      * RuleBasedCollator.
899      * </p>
900      * <p>
901      * General recommendation: <br>
902      * If comparison are to be done to the same String multiple times, it would
903      * be more efficient to generate CollationKeys for the Strings and use
904      * CollationKey.compareTo(CollationKey) for the comparisons.
905      * If the each Strings are compared to only once, using the method
906      * RuleBasedCollator.compare(String, String) will have a better performance.
907      * </p>
908      * <p>
909      * See the class documentation for an explanation about CollationKeys.
910      * </p>
911      * @param source the text String to be transformed into a collation key.
912      * @return the CollationKey for the given String based on this
913      *         RuleBasedCollator's collation rules. If the source String is
914      *         null, a null CollationKey is returned.
915      * @see CollationKey
916      * @see #compare(String, String)
917      * @see #getRawCollationKey
918      * @stable ICU 2.8
919      */
920     public CollationKey getCollationKey(String   source) {
921         if (source == null) {
922             return null;
923         }
924         m_utilRawCollationKey_ = getRawCollationKey(source, 
925                                                     m_utilRawCollationKey_);
926         return new CollationKey(source, m_utilRawCollationKey_);
927     }
928     
929     /**
930      * Gets the simpler form of a CollationKey for the String source following
931      * the rules of this Collator and stores the result into the user provided 
932      * argument key. 
933      * If key has a internal byte array of length that's too small for the 
934      * result, the internal byte array will be grown to the exact required 
935      * size.
936      * @param source the text String to be transformed into a RawCollationKey  
937      * @param key output RawCollationKey to store results
938      * @return If key is null, a new instance of RawCollationKey will be 
939      *         created and returned, otherwise the user provided key will be 
940      *         returned.
941      * @see #getCollationKey 
942      * @see #compare(String, String)
943      * @see RawCollationKey
944      * @stable ICU 2.8
945      */
946     public RawCollationKey getRawCollationKey(String   source, 
947                                               RawCollationKey key)
948     {
949         if (source == null) {
950             return null;
951         }
952         int strength = getStrength();
953         m_utilCompare0_ = m_isCaseLevel_;
954         m_utilCompare1_ = true;
955         m_utilCompare2_ = strength >= SECONDARY;
956         m_utilCompare3_ = strength >= TERTIARY;
957         m_utilCompare4_ = strength >= QUATERNARY;
958         m_utilCompare5_ = strength == IDENTICAL;
959 
960         m_utilBytesCount0_ = 0;
961         m_utilBytesCount1_ = 0;
962         m_utilBytesCount2_ = 0;
963         m_utilBytesCount3_ = 0;
964         m_utilBytesCount4_ = 0;
965         m_utilBytesCount5_ = 0;
966         m_utilCount0_ = 0;
967         m_utilCount1_ = 0;
968         m_utilCount2_ = 0;
969         m_utilCount3_ = 0;
970         m_utilCount4_ = 0;
971         m_utilCount5_ = 0;
972         boolean doFrench = m_isFrenchCollation_ && m_utilCompare2_;
973         // TODO: UCOL_COMMON_BOT4 should be a function of qShifted.
974         // If we have no qShifted, we don't need to set UCOL_COMMON_BOT4 so
975         // high.
976         int commonBottom4 = ((m_variableTopValue_ >>> 8) + 1) & LAST_BYTE_MASK_;
977         byte hiragana4 = 0;
978         if (m_isHiragana4_ && m_utilCompare4_) {
979             // allocate one more space for hiragana, value for hiragana
980             hiragana4 = (byte)commonBottom4;
981             commonBottom4 ++;
982         }
983 
984         int bottomCount4 = 0xFF - commonBottom4;
985         // If we need to normalize, we'll do it all at once at the beginning!
986         if (m_utilCompare5_ && Normalizer.quickCheck(source, Normalizer.NFD,0)
987                                                     != Normalizer.YES) {
988             // if it is identical strength, we have to normalize the string to
989             // NFD so that it will be appended correctly to the end of the sort
990             // key
991             source = Normalizer.decompose(source, false);
992         }
993         else if (getDecomposition() != NO_DECOMPOSITION
994             && Normalizer.quickCheck(source, Normalizer.FCD,0)
995                                                     != Normalizer.YES) {
996             // for the rest of the strength, if decomposition is on, FCD is
997             // enough for us to work on.
998             source = Normalizer.normalize(source,Normalizer.FCD);
999         }
1000        getSortKeyBytes(source, doFrench, hiragana4, commonBottom4,
1001                        bottomCount4);
1002        if (key == null) {
1003            key = new RawCollationKey();
1004        }
1005        getSortKey(source, doFrench, commonBottom4, bottomCount4, key);
1006        return key;
1007    }
1008
1009    /**
1010     * Return true if an uppercase character is sorted before the corresponding lowercase character.
1011     * See setCaseFirst(boolean) for details.
1012     * @see #setUpperCaseFirst
1013     * @see #setLowerCaseFirst
1014     * @see #isLowerCaseFirst
1015     * @see #setCaseFirstDefault
1016     * @return true if upper cased characters are sorted before lower cased
1017     *         characters, false otherwise
1018     * @stable ICU 2.8
1019     */
1020     public boolean isUpperCaseFirst()
1021     {
1022        return (m_caseFirst_ == AttributeValue.UPPER_FIRST_);
1023     }
1024     
1025    /**
1026     * Return true if a lowercase character is sorted before the corresponding uppercase character.
1027     * See setCaseFirst(boolean) for details.
1028     * @see #setUpperCaseFirst
1029     * @see #setLowerCaseFirst
1030     * @see #isUpperCaseFirst
1031     * @see #setCaseFirstDefault
1032     * @return true lower cased characters are sorted before upper cased
1033     *         characters, false otherwise
1034     * @stable ICU 2.8
1035     */
1036    public boolean isLowerCaseFirst()
1037    {
1038        return (m_caseFirst_ == AttributeValue.LOWER_FIRST_);
1039    }
1040
1041    /**
1042     * Checks if the alternate handling behaviour is the UCA defined SHIFTED or
1043     * NON_IGNORABLE.
1044     * If return value is true, then the alternate handling attribute for the
1045     * Collator is SHIFTED. Otherwise if return value is false, then the
1046     * alternate handling attribute for the Collator is NON_IGNORABLE
1047     * See setAlternateHandlingShifted(boolean) for more details.
1048     * @return true or false
1049     * @see #setAlternateHandlingShifted(boolean)
1050     * @see #setAlternateHandlingDefault
1051     * @stable ICU 2.8
1052     */
1053    public boolean isAlternateHandlingShifted()
1054    {
1055        return m_isAlternateHandlingShifted_;
1056    }
1057
1058    /**
1059     * Checks if case level is set to true.
1060     * See setCaseLevel(boolean) for details.
1061     * @return the case level mode
1062     * @see #setCaseLevelDefault
1063     * @see #isCaseLevel
1064     * @see #setCaseLevel(boolean)
1065     * @stable ICU 2.8
1066     */
1067    public boolean isCaseLevel()
1068    {
1069        return m_isCaseLevel_;
1070    }
1071
1072    /**
1073     * Checks if French Collation is set to true.
1074     * See setFrenchCollation(boolean) for details.
1075     * @return true if French Collation is set to true, false otherwise
1076     * @see #setFrenchCollation(boolean)
1077     * @see #setFrenchCollationDefault
1078     * @stable ICU 2.8
1079     */
1080     public boolean isFrenchCollation()
1081     {
1082         return m_isFrenchCollation_;
1083     }
1084
1085    /**
1086     * Checks if the Hiragana Quaternary mode is set on.
1087     * See setHiraganaQuaternary(boolean) for more details.
1088     * @return flag true if Hiragana Quaternary mode is on, false otherwise
1089     * @see #setHiraganaQuaternaryDefault
1090     * @see #setHiraganaQuaternary(boolean)
1091     * @stable ICU 2.8
1092     */
1093    public boolean isHiraganaQuaternary()
1094    {
1095        return m_isHiragana4_;
1096    }
1097
1098    /** 
1099     * Gets the variable top value of a Collator. 
1100     * Lower 16 bits are undefined and should be ignored.
1101     * @return the variable top value of a Collator.
1102     * @see #setVariableTop
1103     * @stable ICU 2.6
1104     */
1105    public int getVariableTop()
1106    {
1107          return m_variableTopValue_ << 16;
1108    }
1109    
1110    /** 
1111     * Method to retrieve the numeric collation value.
1112     * When numeric collation is turned on, this Collator generates a collation 
1113     * key for the numeric value of substrings of digits. This is a way to get 
1114     * '100' to sort AFTER '2'
1115     * @see #setNumericCollation
1116     * @see #setNumericCollationDefault
1117     * @return true if numeric collation is turned on, false otherwise
1118     * @stable ICU 2.8
1119     */
1120    public boolean getNumericCollation()
1121    {
1122        return m_isNumericCollation_;
1123    }
1124    
1125    // public other methods -------------------------------------------------
1126
1127    /**
1128     * Compares the equality of two RuleBasedCollator objects.
1129     * RuleBasedCollator objects are equal if they have the same collation
1130     * rules and the same attributes.
1131     * @param obj the RuleBasedCollator to be compared to.
1132     * @return true if this RuleBasedCollator has exactly the same
1133     *         collation behaviour as obj, false otherwise.
1134     * @stable ICU 2.8
1135     */
1136    public boolean equals(Object   obj)
1137    {
1138        if (obj == null) {
1139            return false;  // super does class check
1140        }
1141        if (this == obj) {
1142            return true;
1143        }
1144        if (getClass() != obj.getClass()) {
1145            return false;
1146        }
1147        RuleBasedCollator other = (RuleBasedCollator)obj;
1148        // all other non-transient information is also contained in rules.
1149        if (getStrength() != other.getStrength()
1150               || getDecomposition() != other.getDecomposition()
1151               || other.m_caseFirst_ != m_caseFirst_
1152               || other.m_caseSwitch_ != m_caseSwitch_
1153               || other.m_isAlternateHandlingShifted_
1154                                             != m_isAlternateHandlingShifted_
1155               || other.m_isCaseLevel_ != m_isCaseLevel_
1156               || other.m_isFrenchCollation_ != m_isFrenchCollation_
1157               || other.m_isHiragana4_ != m_isHiragana4_) {
1158            return false;
1159        }
1160        boolean rules = m_rules_ == other.m_rules_;
1161        if (!rules && (m_rules_ != null && other.m_rules_ != null)) {
1162            rules = m_rules_.equals(other.m_rules_);
1163        }
1164        if (!rules || !ICUDebug.enabled("collation")) {
1165            return rules;
1166        }
1167        if (m_addition3_ != other.m_addition3_
1168                  || m_bottom3_ != other.m_bottom3_
1169                  || m_bottomCount3_ != other.m_bottomCount3_
1170                  || m_common3_ != other.m_common3_
1171                  || m_isSimple3_ != other.m_isSimple3_
1172                  || m_mask3_ != other.m_mask3_
1173                  || m_minContractionEnd_ != other.m_minContractionEnd_
1174                  || m_minUnsafe_ != other.m_minUnsafe_
1175                  || m_top3_ != other.m_top3_
1176                  || m_topCount3_ != other.m_topCount3_
1177                  || !Arrays.equals(m_unsafe_, other.m_unsafe_)) {
1178            return false;
1179        }
1180        if (!m_trie_.equals(other.m_trie_)) {
1181            // we should use the trie iterator here, but then this part is
1182            // only used in the test.
1183            for (int i = UCharacter.MAX_VALUE; i >= UCharacter.MIN_VALUE; i --)
1184            {
1185                int v = m_trie_.getCodePointValue(i);
1186                int otherv = other.m_trie_.getCodePointValue(i);
1187                if (v != otherv) {
1188                    int mask = v & (CE_TAG_MASK_ | CE_SPECIAL_FLAG_);
1189                    if (mask == (otherv & 0xff000000)) {
1190                        v &= 0xffffff;
1191                        otherv &= 0xffffff;
1192                        if (mask == 0xf1000000) {
1193                            v -= (m_expansionOffset_ << 4);
1194                            otherv -= (other.m_expansionOffset_ << 4);
1195                        }
1196                        else if (mask == 0xf2000000) {
1197                            v -= m_contractionOffset_;
1198                            otherv -= other.m_contractionOffset_;
1199                        }
1200                        if (v == otherv) {
1201                            continue;
1202                        }
1203                    }
1204                    return false;
1205                }
1206            }
1207        }
1208        if (Arrays.equals(m_contractionCE_, other.m_contractionCE_)
1209            && Arrays.equals(m_contractionEnd_, other.m_contractionEnd_)
1210            && Arrays.equals(m_contractionIndex_, other.m_contractionIndex_)
1211            && Arrays.equals(m_expansion_, other.m_expansion_)
1212            && Arrays.equals(m_expansionEndCE_, other.m_expansionEndCE_)) {
1213            // not comparing paddings
1214            for (int i = 0; i < m_expansionEndCE_.length; i ++) {
1215                 if (m_expansionEndCEMaxSize_[i]
1216                     != other.m_expansionEndCEMaxSize_[i]) {
1217                     return false;
1218                 }
1219                 return true;
1220            }
1221        }
1222        return false;
1223    }
1224
1225    /**
1226     * Generates a unique hash code for this RuleBasedCollator.
1227     * @return the unique hash code for this Collator
1228     * @stable ICU 2.8
1229     */
1230    public int hashCode()
1231    {
1232        String   rules = getRules();
1233        if (rules == null) {
1234            rules = "";
1235        }
1236        return rules.hashCode();
1237    }
1238
1239    /**
1240     * Compares the source text String to the target text String according to
1241     * the collation rules, strength and decomposition mode for this
1242     * RuleBasedCollator.
1243     * Returns an integer less than,
1244     * equal to or greater than zero depending on whether the source String is
1245     * less than, equal to or greater than the target String. See the Collator
1246     * class description for an example of use.
1247     * </p>
1248     * <p>
1249     * General recommendation: <br>
1250     * If comparison are to be done to the same String multiple times, it would
1251     * be more efficient to generate CollationKeys for the Strings and use
1252     * CollationKey.compareTo(CollationKey) for the comparisons.
1253     * If speed performance is critical and object instantiation is to be 
1254     * reduced, further optimization may be achieved by generating a simpler 
1255     * key of the form RawCollationKey and reusing this RawCollationKey 
1256     * object with the method RuleBasedCollator.getRawCollationKey. Internal 
1257     * byte representation can be directly accessed via RawCollationKey and
1258     * stored for future use. Like CollationKey, RawCollationKey provides a
1259     * method RawCollationKey.compareTo for key comparisons.
1260     * If the each Strings are compared to only once, using the method
1261     * RuleBasedCollator.compare(String, String) will have a better performance.
1262     * </p>
1263     * @param source the source text String.
1264     * @param target the target text String.
1265     * @return Returns an integer value. Value is less than zero if source is
1266     *         less than target, value is zero if source and target are equal,
1267     *         value is greater than zero if source is greater than target.
1268     * @see CollationKey
1269     * @see #getCollationKey
1270     * @stable ICU 2.8
1271     */
1272    public int compare(String   source, String   target)
1273    {
1274        if (source == target) {
1275            return 0;
1276        }
1277
1278        // Find the length of any leading portion that is equal
1279        int offset = getFirstUnmatchedOffset(source, target);
1280        //return compareRegular(source, target, offset);
1281        if(latinOneUse_) {
1282          if ((offset < source.length() 
1283               && source.charAt(offset) > ENDOFLATINONERANGE_) 
1284              || (offset < target.length() 
1285                  && target.charAt(offset) > ENDOFLATINONERANGE_)) { 
1286              // source or target start with non-latin-1
1287            return compareRegular(source, target, offset);
1288          } else {
1289            return compareUseLatin1(source, target, offset);
1290          }
1291        } else {
1292          return compareRegular(source, target, offset);
1293        }
1294    }
1295    
1296    // package private inner interfaces --------------------------------------
1297
1298    /**
1299     * Attribute values to be used when setting the Collator options
1300     */
1301    static interface AttributeValue
1302    {
1303        /**
1304         * Indicates that the default attribute value will be used.
1305         * See individual attribute for details on its default value.
1306         */
1307        static final int DEFAULT_ = -1;
1308        /**
1309         * Primary collation strength
1310         */
1311        static final int PRIMARY_ = Collator.PRIMARY;
1312        /**
1313         * Secondary collation strength
1314         */
1315        static final int SECONDARY_ = Collator.SECONDARY;
1316        /**
1317         * Tertiary collation strength
1318         */
1319        static final int TERTIARY_ = Collator.TERTIARY;
1320        /**
1321         * Default collation strength
1322         */
1323        static final int DEFAULT_STRENGTH_ = Collator.TERTIARY;
1324        /**
1325         * Internal use for strength checks in Collation elements
1326         */
1327        static final int CE_STRENGTH_LIMIT_ = Collator.TERTIARY + 1;
1328        /**
1329         * Quaternary collation strength
1330         */
1331        static final int QUATERNARY_ = 3;
1332        /**
1333         * Identical collation strength
1334         */
1335        static final int IDENTICAL_ = Collator.IDENTICAL;
1336        /**
1337         * Internal use for strength checks
1338         */
1339        static final int STRENGTH_LIMIT_ = Collator.IDENTICAL + 1;
1340        /**
1341         * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL,
1342         * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
1343         */
1344        static final int OFF_ = 16;
1345        /**
1346         * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL,
1347         * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
1348         */
1349        static final int ON_ = 17;
1350        /**
1351         * Valid for ALTERNATE_HANDLING. Alternate handling will be shifted
1352         */
1353        static final int SHIFTED_ = 20;
1354        /**
1355         * Valid for ALTERNATE_HANDLING. Alternate handling will be non
1356         * ignorable
1357         */
1358        static final int NON_IGNORABLE_ = 21;
1359        /**
1360         * Valid for CASE_FIRST - lower case sorts before upper case
1361         */
1362        static final int LOWER_FIRST_ = 24;
1363        /**
1364         * Upper case sorts before lower case
1365         */
1366        static final int UPPER_FIRST_ = 25;
1367        /**
1368         * Number of attribute values
1369         */
1370        static final int LIMIT_ = 29;
1371    }
1372
1373    /**
1374     * Attributes that collation service understands. All the attributes can
1375     * take DEFAULT value, as well as the values specific to each one.
1376     */
1377    static interface Attribute
1378    {
1379        /**
1380         * Attribute for direction of secondary weights - used in French.
1381         * Acceptable values are ON, which results in secondary weights being
1382         * considered backwards and OFF which treats secondary weights in the
1383         * order they appear.
1384         */
1385        static final int FRENCH_COLLATION_ = 0;
1386        /**
1387         * Attribute for handling variable elements. Acceptable values are
1388         * NON_IGNORABLE (default) which treats all the codepoints with
1389         * non-ignorable primary weights in the same way, and SHIFTED which
1390         * causes codepoints with primary weights that are equal or below the
1391         * variable top value to be ignored on primary level and moved to the
1392         * quaternary level.
1393         */
1394        static final int ALTERNATE_HANDLING_ = 1;
1395        /**
1396         * Controls the ordering of upper and lower case letters. Acceptable
1397         * values are OFF (default), which orders upper and lower case letters
1398         * in accordance to their tertiary weights, UPPER_FIRST which forces
1399         * upper case letters to sort before lower case letters, and
1400         * LOWER_FIRST which does the opposite.
1401         */
1402        static final int CASE_FIRST_ = 2;
1403        /**
1404         * Controls whether an extra case level (positioned before the third
1405         * level) is generated or not. Acceptable values are OFF (default),
1406         * when case level is not generated, and ON which causes the case
1407         * level to be generated. Contents of the case level are affected by
1408         * the value of CASE_FIRST attribute. A simple way to ignore accent
1409         * differences in a string is to set the strength to PRIMARY and
1410         * enable case level.
1411         */
1412        static final int CASE_LEVEL_ = 3;
1413        /**
1414         * Controls whether the normalization check and necessary
1415         * normalizations are performed. When set to OFF (default) no
1416         * normalization check is performed. The correctness of the result is
1417         * guaranteed only if the input data is in so-called FCD form (see
1418         * users manual for more info). When set to ON, an incremental check
1419         * is performed to see whether the input data is in the FCD form. If
1420         * the data is not in the FCD form, incremental NFD normalization is
1421         * performed.
1422         */
1423        static final int NORMALIZATION_MODE_ = 4;
1424        /**
1425         * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY,
1426         * QUATERNARY or IDENTICAL. The usual strength for most locales
1427         * (except Japanese) is tertiary. Quaternary strength is useful when
1428         * combined with shifted setting for alternate handling attribute and
1429         * for JIS x 4061 collation, when it is used to distinguish between
1430         * Katakana  and Hiragana (this is achieved by setting the
1431         * HIRAGANA_QUATERNARY mode to on. Otherwise, quaternary level is
1432         * affected only by the number of non ignorable code points in the
1433         * string. Identical strength is rarely useful, as it amounts to
1434         * codepoints of the NFD form of the string.
1435         */
1436        static final int STRENGTH_ = 5;
1437        /**
1438         * When turned on, this attribute positions Hiragana before all
1439         * non-ignorables on quaternary level. This is a sneaky way to produce
1440         * JIS sort order.
1441         */
1442        static final int HIRAGANA_QUATERNARY_MODE_ = 6;
1443        /**
1444         * Attribute count
1445         */
1446        static final int LIMIT_ = 7;
1447    }
1448
1449    /**
1450     * DataManipulate singleton
1451     */
1452    static class DataManipulate implements Trie.DataManipulate
1453    {
1454        // public methods ----------------------------------------------------
1455
1456        /**
1457         * Internal method called to parse a lead surrogate's ce for the offset
1458         * to the next trail surrogate data.
1459         * @param ce collation element of the lead surrogate
1460         * @return data offset or 0 for the next trail surrogate
1461         * @stable ICU 2.8
1462         */
1463        public final int getFoldingOffset(int ce)
1464        {
1465            if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) {
1466                return (ce & 0xFFFFFF);
1467            }
1468            return 0;
1469        }
1470
1471        /**
1472         * Get singleton object
1473         */
1474        public static final DataManipulate getInstance()
1475        {
1476            if (m_instance_ == null) {
1477                m_instance_ =  new DataManipulate();
1478            }
1479            return m_instance_;
1480        }
1481
1482        // private data member ----------------------------------------------
1483
1484        /**
1485         * Singleton instance
1486         */
1487        private static DataManipulate m_instance_;
1488
1489        // private constructor ----------------------------------------------
1490
1491        /**
1492         * private to prevent initialization
1493         */
1494        private DataManipulate()
1495        {
1496        }
1497    }
1498
1499    /**
1500     * UCAConstants
1501     */
1502    static final class UCAConstants
1503    {
1504         int FIRST_TERTIARY_IGNORABLE_[] = new int[2];       // 0x00000000
1505         int LAST_TERTIARY_IGNORABLE_[] = new int[2];        // 0x00000000
1506         int FIRST_PRIMARY_IGNORABLE_[] = new int[2];        // 0x00008705
1507         int FIRST_SECONDARY_IGNORABLE_[] = new int[2];      // 0x00000000
1508         int LAST_SECONDARY_IGNORABLE_[] = new int[2];       // 0x00000500
1509         int LAST_PRIMARY_IGNORABLE_[] = new int[2];         // 0x0000DD05
1510         int FIRST_VARIABLE_[] = new int[2];                 // 0x05070505
1511         int LAST_VARIABLE_[] = new int[2];                  // 0x13CF0505
1512         int FIRST_NON_VARIABLE_[] = new int[2];             // 0x16200505
1513         int LAST_NON_VARIABLE_[] = new int[2];              // 0x767C0505
1514         int RESET_TOP_VALUE_[] = new int[2];                // 0x9F000303
1515         int FIRST_IMPLICIT_[] = new int[2];
1516         int LAST_IMPLICIT_[] = new int[2];
1517         int FIRST_TRAILING_[] = new int[2];
1518         int LAST_TRAILING_[] = new int[2];
1519         int PRIMARY_TOP_MIN_;
1520         int PRIMARY_IMPLICIT_MIN_; // 0xE8000000
1521         int PRIMARY_IMPLICIT_MAX_; // 0xF0000000
1522         int PRIMARY_TRAILING_MIN_; // 0xE8000000
1523         int PRIMARY_TRAILING_MAX_; // 0xF0000000
1524         int PRIMARY_SPECIAL_MIN_; // 0xE8000000
1525         int PRIMARY_SPECIAL_MAX_; // 0xF0000000
1526    }
1527
1528    // package private data member -------------------------------------------
1529
1530    static final byte BYTE_FIRST_TAILORED_ = (byte)0x04;
1531    static final byte BYTE_COMMON_ = (byte)0x05;
1532    static final int COMMON_TOP_2_ = 0x86; // int for unsigness
1533    static final int COMMON_BOTTOM_2_ = BYTE_COMMON_;
1534    /**
1535     * Case strength mask
1536     */
1537    static final int CE_CASE_BIT_MASK_ = 0xC0;
1538    static final int CE_TAG_SHIFT_ = 24;
1539    static final int CE_TAG_MASK_ = 0x0F000000;
1540
1541    static final int CE_SPECIAL_FLAG_ = 0xF0000000;
1542    /**
1543     * Lead surrogate that is tailored and doesn't start a contraction
1544     */
1545    static final int CE_SURROGATE_TAG_ = 5;
1546    /**
1547     * Mask to get the primary strength of the collation element
1548     */
1549    static final int CE_PRIMARY_MASK_ = 0xFFFF0000;
1550    /**
1551     * Mask to get the secondary strength of the collation element
1552     */
1553    static final int CE_SECONDARY_MASK_ = 0xFF00;
1554    /**
1555     * Mask to get the tertiary strength of the collation element
1556     */
1557    static final int CE_TERTIARY_MASK_ = 0xFF;
1558    /**
1559     * Primary strength shift
1560     */
1561    static final int CE_PRIMARY_SHIFT_ = 16;
1562    /**
1563     * Secondary strength shift
1564     */
1565    static final int CE_SECONDARY_SHIFT_ = 8;
1566    /**
1567     * Continuation marker
1568     */
1569    static final int CE_CONTINUATION_MARKER_ = 0xC0;
1570
1571    /**
1572     * Size of collator raw data headers and options before the expansion
1573     * data. This is used when expansion ces are to be retrieved. ICU4C uses
1574     * the expansion offset starting from UCollator.UColHeader, hence ICU4J
1575     * will have to minus that off to get the right expansion ce offset. In
1576     * number of ints.
1577     */
1578    int m_expansionOffset_;
1579    /**
1580     * Size of collator raw data headers, options and expansions before
1581     * contraction data. This is used when contraction ces are to be retrieved.
1582     * ICU4C uses contraction offset starting from UCollator.UColHeader, hence
1583     * ICU4J will have to minus that off to get the right contraction ce
1584     * offset. In number of chars.
1585     */
1586    int m_contractionOffset_;
1587    /**
1588     * Flag indicator if Jamo is special
1589     */
1590    boolean m_isJamoSpecial_;
1591
1592    // Collator options ------------------------------------------------------
1593    
1594    int m_defaultVariableTopValue_;
1595    boolean m_defaultIsFrenchCollation_;
1596    boolean m_defaultIsAlternateHandlingShifted_;
1597    int m_defaultCaseFirst_;
1598    boolean m_defaultIsCaseLevel_;
1599    int m_defaultDecomposition_;
1600    int m_defaultStrength_;
1601    boolean m_defaultIsHiragana4_;
1602    boolean m_defaultIsNumericCollation_;
1603    
1604    /**
1605     * Value of the variable top
1606     */
1607    int m_variableTopValue_;
1608    /**
1609     * Attribute for special Hiragana
1610     */
1611    boolean m_isHiragana4_;
1612    /**
1613     * Case sorting customization
1614     */
1615    int m_caseFirst_;
1616    /**
1617     * Numeric collation option
1618     */
1619    boolean m_isNumericCollation_;
1620
1621    // end Collator options --------------------------------------------------
1622
1623    /**
1624     * Expansion table
1625     */
1626    int m_expansion_[];
1627    /**
1628     * Contraction index table
1629     */
1630    char m_contractionIndex_[];
1631    /**
1632     * Contraction CE table
1633     */
1634    int m_contractionCE_[];
1635    /**
1636     * Data trie
1637     */
1638    IntTrie m_trie_;
1639    /**
1640     * Table to store all collation elements that are the last element of an
1641     * expansion. This is for use in StringSearch.
1642     */
1643    int m_expansionEndCE_[];
1644    /**
1645     * Table to store the maximum size of any expansions that end with the
1646     * corresponding collation element in m_expansionEndCE_. For use in
1647     * StringSearch too
1648     */
1649    byte m_expansionEndCEMaxSize_[];
1650    /**
1651     * Heuristic table to store information on whether a char character is
1652     * considered "unsafe". "Unsafe" character are combining marks or those
1653     * belonging to some contraction sequence from the offset 1 onwards.
1654     * E.g. if "ABC" is the only contraction, then 'B' and 'C' are considered
1655     * unsafe. If we have another contraction "ZA" with the one above, then
1656     * 'A', 'B', 'C' are "unsafe" but 'Z' is not.
1657     */
1658    byte m_unsafe_[];
1659    /**
1660     * Table to store information on whether a codepoint can occur as the last
1661     * character in a contraction
1662     */
1663    byte m_contractionEnd_[];
1664    /**
1665     * Original collation rules
1666     */
1667    String   m_rules_;
1668    /**
1669     * The smallest "unsafe" codepoint
1670     */
1671    char m_minUnsafe_;
1672    /**
1673     * The smallest codepoint that could be the end of a contraction
1674     */
1675    char m_minContractionEnd_;
1676    /**
1677     * General version of the collator
1678     */
1679    VersionInfo m_version_;
1680    /**
1681     * UCA version
1682     */
1683    VersionInfo m_UCA_version_;
1684    /**
1685     * UCD version
1686     */
1687    VersionInfo m_UCD_version_;
1688
1689    /**
1690     * UnicodeData.txt property object
1691     */
1692    static final RuleBasedCollator UCA_;
1693    /**
1694     * UCA Constants
1695     */
1696    static final UCAConstants UCA_CONSTANTS_;
1697    /**
1698     * Table for UCA and builder use
1699     */
1700    static final char UCA_CONTRACTIONS_[];
1701
1702    private static boolean UCA_INIT_COMPLETE;
1703
1704    /**
1705     * Implicit generator
1706     */
1707    static final ImplicitCEGenerator impCEGen_;
1708//    /**
1709//     * Implicit constants
1710//     */
1711//    static final int IMPLICIT_BASE_BYTE_;
1712//    static final int IMPLICIT_LIMIT_BYTE_;
1713//    static final int IMPLICIT_4BYTE_BOUNDARY_;
1714//    static final int LAST_MULTIPLIER_;
1715//    static final int LAST2_MULTIPLIER_;
1716//    static final int IMPLICIT_BASE_3BYTE_;
1717//    static final int IMPLICIT_BASE_4BYTE_;
1718//    static final int BYTES_TO_AVOID_ = 3;
1719//    static final int OTHER_COUNT_ = 256 - BYTES_TO_AVOID_;
1720//    static final int LAST_COUNT_ = OTHER_COUNT_ / 2;
1721//    /**
1722//     * Room for intervening, without expanding to 5 bytes
1723//     */
1724//    static final int LAST_COUNT2_ = OTHER_COUNT_ / 21;
1725//    static final int IMPLICIT_3BYTE_COUNT_ = 1;
1726//    
1727    static final byte SORT_LEVEL_TERMINATOR_ = 1;
1728
1729//  These are values from UCA required for
1730//  implicit generation and supressing sort key compression
1731//  they should regularly be in the UCA, but if one
1732//  is running without UCA, it could be a problem
1733     static final int maxRegularPrimary  = 0xA0;
1734     static final int minImplicitPrimary = 0xE0;
1735     static final int maxImplicitPrimary = 0xE4;
1736
1737
1738    // block to initialise character property database
1739    static
1740    {
1741        // take pains to let static class init succeed, otherwise the class itself won't exist and
1742        // clients will get a NoClassDefFoundException.  Instead, make the constructors fail if
1743        // we can't load the UCA data.
1744
1745        RuleBasedCollator iUCA_ = null;
1746        UCAConstants iUCA_CONSTANTS_ = null;
1747        char iUCA_CONTRACTIONS_[] = null;
1748        ImplicitCEGenerator iimpCEGen_ = null;
1749        try
1750        {
1751            // !!! note what's going on here...
1752            // even though the static init of the class is not yet complete, we
1753            // instantiate an instance of the class.  So we'd better be sure that
1754            // instantiation doesn't rely on the static initialization that's
1755            // not complete yet!
1756            iUCA_ = new RuleBasedCollator();
1757            iUCA_CONSTANTS_ = new UCAConstants();
1758            iUCA_CONTRACTIONS_ = CollatorReader.read(iUCA_, iUCA_CONSTANTS_);
1759
1760            // called before doing canonical closure for the UCA.
1761            iimpCEGen_ = new ImplicitCEGenerator(minImplicitPrimary, maxImplicitPrimary);
1762            //iimpCEGen_ = new ImplicitCEGenerator(iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_, iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_);
1763            iUCA_.init();
1764            ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_COLLATION_BASE_NAME, ULocale.ENGLISH);
1765            iUCA_.m_rules_ = (String  )rb.getObject("UCARules");
1766        }
1767        catch (MissingResourceException   ex)
1768        {
1769//             throw ex;
1770        }
1771        catch (IOException   e)
1772        {
1773           // e.printStackTrace();
1774//             throw new MissingResourceException(e.getMessage(),"","");
1775        }
1776
1777        UCA_ = iUCA_;
1778        UCA_CONSTANTS_ = iUCA_CONSTANTS_;
1779        UCA_CONTRACTIONS_ = iUCA_CONTRACTIONS_;
1780        impCEGen_ = iimpCEGen_;
1781
1782        UCA_INIT_COMPLETE = true;
1783    }
1784
1785
1786    private static void checkUCA() throws MissingResourceException   {
1787        if (UCA_INIT_COMPLETE && UCA_ == null) {
1788            throw new MissingResourceException  ("Collator UCA data unavailable", "", "");
1789        }
1790    }
1791        
1792    // package private constructors ------------------------------------------
1793
1794    /**
1795    * <p>Private contructor for use by subclasses.
1796    * Public access to creating Collators is handled by the API
1797    * Collator.getInstance() or RuleBasedCollator(String rules).
1798    * </p>
1799    * <p>
1800    * This constructor constructs the UCA collator internally
1801    * </p>
1802    */
1803    RuleBasedCollator()
1804    {
1805        checkUCA();
1806        initUtility(false);
1807    }
1808
1809    /**
1810     * Constructors a RuleBasedCollator from the argument locale.
1811     * If no resource bundle is associated with the locale, UCA is used
1812     * instead.
1813     * @param locale
1814     */
1815    RuleBasedCollator(ULocale locale)
1816    {
1817        checkUCA();
1818        ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_COLLATION_BASE_NAME, locale);
1819        initUtility(false);
1820        if (rb != null) {
1821            try {
1822                // Use keywords, if supplied for lookup
1823                String   collkey = locale.getKeywordValue("collation");
1824                  if(collkey == null) {
1825                      collkey = rb.getStringWithFallback("collations/default");
1826                }
1827                       
1828                // collations/default will always give a string back
1829                // keyword for the real collation data
1830                // if "collations/collkey" will return null if collkey == null 
1831                ICUResourceBundle elements = rb.getWithFallback("collations/" + collkey);
1832                if (elements != null) {
1833                    // TODO: Determine actual & valid locale correctly
1834                    ULocale uloc = rb.getULocale();
1835                    setLocale(uloc, uloc);
1836
1837                    m_rules_ = elements.getString("Sequence");
1838                    ByteBuffer buf = elements.get("%%CollationBin").getBinary();
1839                    // %%CollationBin
1840                    if(buf!=null){
1841                    //     m_rules_ = (String)rules[1][1];
1842                        byte map[] = buf.array();
1843                        CollatorReader.initRBC(this, map);
1844                        /*
1845                        BufferedInputStream input =
1846                                                 new BufferedInputStream(
1847                                                    new ByteArrayInputStream(map));
1848                        /*
1849                        CollatorReader reader = new CollatorReader(input, false);
1850                        if (map.length > MIN_BINARY_DATA_SIZE_) {
1851                            reader.read(this, null);
1852                        }
1853                        else {
1854                            reader.readHeader(this);
1855                            reader.readOptions(this);
1856                            // duplicating UCA_'s data
1857                            setWithUCATables();
1858                        }
1859                        */
1860                        // at this point, we have read in the collator
1861                        // now we need to check whether the binary image has
1862                        // the right UCA and other versions
1863                        if(!m_UCA_version_.equals(UCA_.m_UCA_version_) ||
1864                        !m_UCD_version_.equals(UCA_.m_UCD_version_)) {
1865                            init(m_rules_);
1866                            return;
1867                        }
1868                        init();
1869                        return;
1870                    }
1871                    else {
1872                        // due to resource redirection ICUListResourceBundle does not
1873                        // raise missing resource error
1874                        //throw new MissingResourceException("Could not get resource for constructing RuleBasedCollator","com.ibm.icu.impl.data.LocaleElements_"+locale.toString(), "%%CollationBin");
1875                        
1876                        init(m_rules_);
1877                        return;
1878                    }
1879                }
1880            }
1881            catch (Exception   e) {
1882                // e.printStackTrace();
1883                // if failed use UCA.
1884            }
1885        }
1886        setWithUCAData();
1887    }
1888
1889    // package private methods -----------------------------------------------
1890
1891    /**
1892     * Sets this collator to use the tables in UCA. Note options not taken
1893     * care of here.
1894     */
1895    final void setWithUCATables()
1896    {
1897        m_contractionOffset_ = UCA_.m_contractionOffset_;
1898        m_expansionOffset_ = UCA_.m_expansionOffset_;
1899        m_expansion_ = UCA_.m_expansion_;
1900        m_contractionIndex_ = UCA_.m_contractionIndex_;
1901        m_contractionCE_ = UCA_.m_contractionCE_;
1902        m_trie_ = UCA_.m_trie_;
1903        m_expansionEndCE_ = UCA_.m_expansionEndCE_;
1904        m_expansionEndCEMaxSize_ = UCA_.m_expansionEndCEMaxSize_;
1905        m_unsafe_ = UCA_.m_unsafe_;
1906        m_contractionEnd_ = UCA_.m_contractionEnd_;
1907        m_minUnsafe_ = UCA_.m_minUnsafe_;
1908        m_minContractionEnd_ = UCA_.m_minContractionEnd_;
1909    }
1910
1911    /**
1912     * Sets this collator to use the all options and tables in UCA.
1913     */
1914    final void setWithUCAData()
1915    {
1916        latinOneFailed_ = true;
1917
1918        m_addition3_ = UCA_.m_addition3_;
1919        m_bottom3_ = UCA_.m_bottom3_;
1920        m_bottomCount3_ = UCA_.m_bottomCount3_;
1921        m_caseFirst_ = UCA_.m_caseFirst_;
1922        m_caseSwitch_ = UCA_.m_caseSwitch_;
1923        m_common3_ = UCA_.m_common3_;
1924        m_contractionOffset_ = UCA_.m_contractionOffset_;
1925        setDecomposition(UCA_.getDecomposition());
1926        m_defaultCaseFirst_ = UCA_.m_defaultCaseFirst_;
1927        m_defaultDecomposition_ = UCA_.m_defaultDecomposition_;
1928        m_defaultIsAlternateHandlingShifted_
1929                                   = UCA_.m_defaultIsAlternateHandlingShifted_;
1930        m_defaultIsCaseLevel_ = UCA_.m_defaultIsCaseLevel_;
1931        m_defaultIsFrenchCollation_ = UCA_.m_defaultIsFrenchCollation_;
1932        m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_;
1933        m_defaultStrength_ = UCA_.m_defaultStrength_;
1934        m_defaultVariableTopValue_ = UCA_.m_defaultVariableTopValue_;
1935        m_defaultIsNumericCollation_ = UCA_.m_defaultIsNumericCollation_;
1936        m_expansionOffset_ = UCA_.m_expansionOffset_;
1937        m_isAlternateHandlingShifted_ = UCA_.m_isAlternateHandlingShifted_;
1938        m_isCaseLevel_ = UCA_.m_isCaseLevel_;
1939        m_isFrenchCollation_ = UCA_.m_isFrenchCollation_;
1940        m_isHiragana4_ = UCA_.m_isHiragana4_;
1941        m_isJamoSpecial_ = UCA_.m_isJamoSpecial_;
1942        m_isSimple3_ = UCA_.m_isSimple3_;
1943        m_mask3_ = UCA_.m_mask3_;
1944        m_minContractionEnd_ = UCA_.m_minContractionEnd_;
1945        m_minUnsafe_ = UCA_.m_minUnsafe_;
1946        m_rules_ = UCA_.m_rules_;
1947        setStrength(UCA_.getStrength());
1948        m_top3_ = UCA_.m_top3_;
1949        m_topCount3_ = UCA_.m_topCount3_;
1950        m_variableTopValue_ = UCA_.m_variableTopValue_;
1951        m_isNumericCollation_ = UCA_.m_isNumericCollation_;
1952        setWithUCATables();
1953        latinOneFailed_ = false;
1954    }
1955
1956    /**
1957     * Test whether a char character is potentially "unsafe" for use as a
1958     * collation starting point. "Unsafe" characters are combining marks or
1959     * those belonging to some contraction sequence from the offset 1 onwards.
1960     * E.g. if "ABC" is the only contraction, then 'B' and
1961     * 'C' are considered unsafe. If we have another contraction "ZA" with
1962     * the one above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not.
1963     * @param ch character to determin
1964     * @return true if ch is unsafe, false otherwise
1965     */
1966    final boolean isUnsafe(char ch)
1967    {
1968        if (ch < m_minUnsafe_) {
1969            return false;
1970        }
1971        
1972        if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
1973            if (UTF16.isLeadSurrogate(ch) 
1974                || UTF16.isTrailSurrogate(ch)) {
1975                //  Trail surrogate are always considered unsafe.
1976                return true;
1977            }
1978            ch &= HEURISTIC_OVERFLOW_MASK_;
1979            ch += HEURISTIC_OVERFLOW_OFFSET_;
1980        }
1981        int value = m_unsafe_[ch >> HEURISTIC_SHIFT_];
1982        return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;
1983    }
1984
1985    /**
1986     * Approximate determination if a char character is at a contraction end.
1987     * Guaranteed to be true if a character is at the end of a contraction,
1988     * otherwise it is not deterministic.
1989     * @param ch character to be determined
1990     */
1991    final boolean isContractionEnd(char ch)
1992    {
1993        if (UTF16.isTrailSurrogate(ch)) {
1994            return true;
1995        }
1996
1997        if (ch < m_minContractionEnd_) {
1998            return false;
1999        }
2000
2001        if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
2002            ch &= HEURISTIC_OVERFLOW_MASK_;
2003            ch += HEURISTIC_OVERFLOW_OFFSET_;
2004        }
2005        int value = m_contractionEnd_[ch >> HEURISTIC_SHIFT_];
2006        return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;
2007    }
2008
2009    /**
2010     * Retrieve the tag of a special ce
2011     * @param ce ce to test
2012     * @return tag of ce
2013     */
2014    static int getTag(int ce)
2015    {
2016        return (ce & CE_TAG_MASK_) >> CE_TAG_SHIFT_;
2017    }
2018
2019    /**
2020     * Checking if ce is special
2021     * @param ce to check
2022     * @return true if ce is special
2023     */
2024    static boolean isSpecial(int ce)
2025    {
2026        return (ce & CE_SPECIAL_FLAG_) == CE_SPECIAL_FLAG_;
2027    }
2028
2029    /**
2030     * Checks if the argument ce is a continuation
2031     * @param ce collation element to test
2032     * @return true if ce is a continuation
2033     */
2034    static final boolean isContinuation(int ce)
2035    {
2036        return ce != CollationElementIterator.NULLORDER
2037                       && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_;
2038    }
2039
2040    // private inner classes ------------------------------------------------
2041
2042    // private variables -----------------------------------------------------
2043
2044    /**
2045     * The smallest natural unsafe or contraction end char character before
2046     * tailoring.
2047     * This is a combining mark.
2048     */
2049    private static final int DEFAULT_MIN_HEURISTIC_ = 0x300;
2050    /**
2051     * Heuristic table table size. Size is 32 bytes, 1 bit for each
2052     * latin 1 char, and some power of two for hashing the rest of the chars.
2053     * Size in bytes.
2054     */
2055    private static final char HEURISTIC_SIZE_ = 1056;
2056    /**
2057     * Mask value down to "some power of two" - 1,
2058     * number of bits, not num of bytes.
2059     */
2060    private static final char HEURISTIC_OVERFLOW_MASK_ = 0x1fff;
2061    /**
2062     * Unsafe character shift
2063     */
2064    private static final int HEURISTIC_SHIFT_ = 3;
2065    /**
2066     * Unsafe character addition for character too large, it has to be folded
2067     * then incremented.
2068     */
2069    private static final char HEURISTIC_OVERFLOW_OFFSET_ = 256;
2070    /**
2071     * Mask value to get offset in heuristic table.
2072     */
2073    private static final char HEURISTIC_MASK_ = 7;
2074
2075    private int m_caseSwitch_;
2076    private int m_common3_;
2077    private int m_mask3_;
2078    /**
2079     * When switching case, we need to add or subtract different values.
2080     */
2081    private int m_addition3_;
2082    /**
2083     * Upper range when compressing
2084     */
2085    private int m_top3_;
2086    /**
2087     * Upper range when compressing
2088     */
2089    private int m_bottom3_;
2090    private int m_topCount3_;
2091    private int m_bottomCount3_;
2092    /**
2093     * Case first constants
2094     */
2095    private static final int CASE_SWITCH_ = 0xC0;
2096    private static final int NO_CASE_SWITCH_ = 0;
2097    /**
2098     * Case level constants
2099     */
2100    private static final int CE_REMOVE_CASE_ = 0x3F;
2101    private static final int CE_KEEP_CASE_ = 0xFF;
2102    /**
2103     * Case strength mask
2104     */
2105    private static final int CE_CASE_MASK_3_ = 0xFF;
2106    /**
2107     * Sortkey size factor. Values can be changed.
2108     */
2109    private static final double PROPORTION_2_ = 0.5;
2110    private static final double PROPORTION_3_ = 0.667;
2111
2112    // These values come from the UCA ----------------------------------------
2113
2114    /**
2115     * This is an enum that lists magic special byte values from the
2116     * fractional UCA
2117     */
2118    private static final byte BYTE_ZERO_ = 0x0;
2119    private static final byte BYTE_LEVEL_SEPARATOR_ = (byte)0x01;
2120    private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02;
2121    private static final byte BYTE_SHIFT_PREFIX_ = (byte)0x03;
2122    /*private*/ static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_;
2123    private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_;
2124    static final byte CODAN_PLACEHOLDER = 0x24;
2125    private static final byte BYTE_LAST_LATIN_PRIMARY_ = (byte)0x4C;
2126    private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte)0x4D;
2127    private static final byte BYTE_UNSHIFTED_MAX_ = (byte)0xFF;
2128    private static final int TOTAL_2_ = COMMON_TOP_2_ - COMMON_BOTTOM_2_ - 1;
2129    private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80;
2130    private static final int FLAG_BIT_MASK_CASE_SWITCH_ON_ = 0x40;
2131    private static final int COMMON_TOP_CASE_SWITCH_OFF_3_ = 0x85;
2132    private static final int COMMON_TOP_CASE_SWITCH_LOWER_3_ = 0x45;
2133    private static final int COMMON_TOP_CASE_SWITCH_UPPER_3_ = 0xC5;
2134    private static final int COMMON_BOTTOM_3_ = 0x05;
2135    private static final int COMMON_BOTTOM_CASE_SWITCH_UPPER_3_ = 0x86;
2136    private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ =
2137                                                              COMMON_BOTTOM_3_;
2138    private static final int TOP_COUNT_2_ = (int)(PROPORTION_2_ * TOTAL_2_);
2139    private static final int BOTTOM_COUNT_2_ = TOTAL_2_ - TOP_COUNT_2_;
2140    private static final int COMMON_2_ = COMMON_BOTTOM_2_;
2141    private static final int COMMON_UPPER_FIRST_3_ = 0xC5;
2142    private static final int COMMON_NORMAL_3_ = COMMON_BOTTOM_3_;
2143    private static final int COMMON_4_ = (byte)0xFF;
2144
2145
2146
2147    /**
2148     * Minimum size required for the binary collation data in bytes.
2149     * Size of UCA header + size of options to 4 bytes
2150     */
2151    //private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;
2152
2153    /**
2154     * If this collator is to generate only simple tertiaries for fast path
2155     */
2156    private boolean m_isSimple3_;
2157
2158    /**
2159     * French collation sorting flag
2160     */
2161    private boolean m_isFrenchCollation_;
2162    /**
2163     * Flag indicating if shifted is requested for Quaternary alternate
2164     * handling. If this is not true, the default for alternate handling will
2165     * be non-ignorable.
2166     */
2167    private boolean m_isAlternateHandlingShifted_;
2168    /**
2169     * Extra case level for sorting
2170     */
2171    private boolean m_isCaseLevel_;
2172
2173    private static final int SORT_BUFFER_INIT_SIZE_ = 128;
2174    private static final int SORT_BUFFER_INIT_SIZE_1_ =
2175                                                    SORT_BUFFER_INIT_SIZE_ << 3;
2176    private static final int SORT_BUFFER_INIT_SIZE_2_ = SORT_BUFFER_INIT_SIZE_;
2177    private static final int SORT_BUFFER_INIT_SIZE_3_ = SORT_BUFFER_INIT_SIZE_;
2178    private static final int SORT_BUFFER_INIT_SIZE_CASE_ =
2179                                                SORT_BUFFER_INIT_SIZE_ >> 2;
2180    private static final int SORT_BUFFER_INIT_SIZE_4_ = SORT_BUFFER_INIT_SIZE_;
2181
2182    private static final int CE_CONTINUATION_TAG_ = 0xC0;
2183    private static final int CE_REMOVE_CONTINUATION_MASK_ = 0xFFFFFF3F;
2184
2185    private static final int LAST_BYTE_MASK_ = 0xFF;
2186
2187    private static final int CE_RESET_TOP_VALUE_ = 0x9F000303;
2188    private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303;
2189
2190    private static final byte SORT_CASE_BYTE_START_ = (byte)0x80;
2191    private static final byte SORT_CASE_SHIFT_START_ = (byte)7;
2192
2193    /**
2194     * CE buffer size
2195     */
2196    private static final int CE_BUFFER_SIZE_ = 512;
2197
2198    // variables for Latin-1 processing
2199    boolean latinOneUse_        = false;
2200    boolean latinOneRegenTable_ = false;
2201    boolean latinOneFailed_     = false;
2202
2203    int latinOneTableLen_ = 0;
2204    int latinOneCEs_[] = null;
2205    /**
2206     * Bunch of utility iterators
2207     */
2208    private StringUCharacterIterator m_srcUtilIter_;
2209    private CollationElementIterator m_srcUtilColEIter_;
2210    private StringUCharacterIterator m_tgtUtilIter_;
2211    private CollationElementIterator m_tgtUtilColEIter_;
2212    /**
2213     * Utility comparison flags
2214     */
2215    private boolean m_utilCompare0_;
2216    private boolean m_utilCompare1_;
2217    private boolean m_utilCompare2_;
2218    private boolean m_utilCompare3_;
2219    private boolean m_utilCompare4_;
2220    private boolean m_utilCompare5_;
2221    /**
2222     * Utility byte buffer
2223     */
2224    private byte m_utilBytes0_[];
2225    private byte m_utilBytes1_[];
2226    private byte m_utilBytes2_[];
2227    private byte m_utilBytes3_[];
2228    private byte m_utilBytes4_[];
2229    private byte m_utilBytes5_[];
2230    private RawCollationKey m_utilRawCollationKey_;
2231
2232    private int m_utilBytesCount0_;
2233    private int m_utilBytesCount1_;
2234    private int m_utilBytesCount2_;
2235    private int m_utilBytesCount3_;
2236    private int m_utilBytesCount4_;
2237    private int m_utilBytesCount5_;
2238    private int m_utilCount0_;
2239    private int m_utilCount1_;
2240    private int m_utilCount2_;
2241    private int m_utilCount3_;
2242    private int m_utilCount4_;
2243    private int m_utilCount5_;
2244
2245    private int m_utilFrenchStart_;
2246    private int m_utilFrenchEnd_;
2247
2248    /**
2249     * Preparing the CE buffers. will be filled during the primary phase
2250     */
2251    private int m_srcUtilCEBuffer_[];
2252    private int m_tgtUtilCEBuffer_[];
2253    private int m_srcUtilCEBufferSize_;
2254    private int m_tgtUtilCEBufferSize_;
2255
2256    private int m_srcUtilContOffset_;
2257    private int m_tgtUtilContOffset_;
2258
2259    private int m_srcUtilOffset_;
2260    private int m_tgtUtilOffset_;
2261
2262    // private methods -------------------------------------------------------
2263
2264    private void init(String   rules) throws Exception  
2265    {
2266        setWithUCAData();
2267        CollationParsedRuleBuilder builder
2268                                       = new CollationParsedRuleBuilder(rules);
2269        builder.setRules(this);
2270        m_rules_ = rules;
2271        init();
2272        initUtility(false);
2273    }
2274    
2275    private final int compareRegular(String   source, String   target, int offset) {
            // Compares source and target one level at a time: primary first,
            // then secondary, case, tertiary, quaternary and identical, each
            // only if enabled by the current strength and attributes.
            // Returns the first non-zero level result, or 0 when every enabled
            // level ties. offset is handed to the primary and identical passes
            // and is where both strings are cut for the sort key fallback.
2276        if (m_srcUtilIter_ == null) {
2277            initUtility(true);
2278        }
2279        int strength = getStrength();
2280        // setting up the collator parameters
2281        m_utilCompare0_ = m_isCaseLevel_;
2282        m_utilCompare1_ = true;
2283        m_utilCompare2_ = strength >= SECONDARY;
2284        m_utilCompare3_ = strength >= TERTIARY;
2285        m_utilCompare4_ = strength >= QUATERNARY;
2286        m_utilCompare5_ = strength == IDENTICAL;
            // French ordering only matters when secondaries are compared;
            // shifted and hiragana handling only when quaternaries are compared
2287        boolean doFrench = m_isFrenchCollation_ && m_utilCompare2_;
2288        boolean doShift4 = m_isAlternateHandlingShifted_ && m_utilCompare4_;
2289        boolean doHiragana4 = m_isHiragana4_ && m_utilCompare4_;
2290
            // with both hiragana quaternary and shifted handling active, the
            // remaining text (from offset on) is compared through sort keys
2291        if (doHiragana4 && doShift4) {
2292            String   sourcesub = source.substring(offset);
2293            String   targetsub = target.substring(offset);
2294            return compareBySortKeys(sourcesub, targetsub);
2295        }
2296
2297        // This is the lowest primary value that will not be ignored if shifted
2298        int lowestpvalue = m_isAlternateHandlingShifted_
2299                                            ? m_variableTopValue_ << 16 : 0;
2300        m_srcUtilCEBufferSize_ = 0;
2301        m_tgtUtilCEBufferSize_ = 0;
2302        int result = doPrimaryCompare(doHiragana4, lowestpvalue, source,
2303                                      target, offset);
2304        if (m_srcUtilCEBufferSize_ == -1
2305            && m_tgtUtilCEBufferSize_ == -1) {
2306            // since the cebuffer is cleared when we have determined that
2307            // either source is greater than target or vice versa, the return
2308            // result is the comparison result and not the hiragana result
2309            return result;
2310        }
2311
            // primaries tied: what doPrimaryCompare returned is kept as the
            // hiragana result and only used further down if doHiragana4 is set
2312        int hiraganaresult = result;
2313
2314        if (m_utilCompare2_) {
2315            result = doSecondaryCompare(doFrench);
2316            if (result != 0) {
2317                return result;
2318            }
2319        }
2320        // doing the case bit
2321        if (m_utilCompare0_) {
2322            result = doCaseCompare();
2323            if (result != 0) {
2324                return result;
2325            }
2326        }
2327        // Tertiary level
2328        if (m_utilCompare3_) {
2329            result = doTertiaryCompare();
2330            if (result != 0) {
2331                return result;
2332            }
2333        }
2334
2335        if (doShift4) {  // checkQuad
2336            result = doQuaternaryCompare(lowestpvalue);
2337            if (result != 0) {
2338                return result;
2339            }
2340        }
2341        else if (doHiragana4 && hiraganaresult != 0) {
2342            // If we're fine on quaternaries, we might be different
2343            // on Hiragana. This, however, might fail us in shifted.
2344            return hiraganaresult;
2345        }
2346
2347        // For IDENTICAL comparisons, we use a bitwise character comparison
2348        // as a tiebreaker if all else is equal.
2349        // Getting here  should be quite rare - strings are not identical -
2350        // that is checked first, but compared == through all other checks.
2351        if (m_utilCompare5_) {
2352            return doIdenticalCompare(source, target, offset, true);
2353        }
2354        return 0;
2355    }
2356
2357    /**
2358     * Gets the 2 bytes of primary order and adds it to the primary byte array
2359     * @param ce current ce
2360     * @param notIsContinuation flag indicating if the current bytes belong to
2361     *          a continuation ce
2362     * @param doShift flag indicating if ce is to be shifted
2363     * @param leadPrimary lead primary used for compression
2364     * @param commonBottom4 common byte value for Quaternary
2365     * @param bottomCount4 smallest byte value for Quaternary
2366     * @return the new lead primary for compression
2367     */
2368    private final int doPrimaryBytes(int ce, boolean notIsContinuation,
2369                                  boolean doShift, int leadPrimary,
2370                                  int commonBottom4, int bottomCount4)
2371    {
2372
2373        int p2 = (ce >>= 16) & LAST_BYTE_MASK_; // in ints for unsigned
2374        int p1 = ce >>> 8;  // comparison
2375        if (doShift) {
2376            if (m_utilCount4_ > 0) {
2377                while (m_utilCount4_ > bottomCount4) {
2378                    m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
2379                                         (byte)(commonBottom4 + bottomCount4));
2380                    m_utilBytesCount4_ ++;
2381                    m_utilCount4_ -= bottomCount4;
2382                }
2383                m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
2384                                       (byte)(commonBottom4
2385                                              + (m_utilCount4_ - 1)));
2386                m_utilBytesCount4_ ++;
2387                m_utilCount4_ = 0;
2388            }
2389            // dealing with a variable and we're treating them as shifted
2390            // This is a shifted ignorable
2391            if (p1 != 0) {
2392                // we need to check this since we could be in continuation
2393                m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
2394                                       (byte)p1);
2395                m_utilBytesCount4_ ++;
2396            }
2397            if (p2 != 0) {
2398                m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
2399                                       (byte)p2);
2400                m_utilBytesCount4_ ++;
2401            }
2402        }
2403        else {
2404            // Note: This code assumes that the table is well built
2405            // i.e. not having 0 bytes where they are not supposed to be.
2406            // Usually, we'll have non-zero primary1 & primary2, except
2407            // in cases of LatinOne and friends, when primary2 will be
2408            // regular and simple sortkey calc
2409            if (p1 != CollationElementIterator.IGNORABLE) {
2410                if (notIsContinuation) {
2411                    if (leadPrimary == p1) {
2412                        m_utilBytes1_ = append(m_utilBytes1_,
2413                                               m_utilBytesCount1_, (byte)p2);
2414                        m_utilBytesCount1_ ++;
2415                    }
2416                    else {
2417                        if (leadPrimary != 0) {
2418                            m_utilBytes1_ = append(m_utilBytes1_,
2419                                                   m_utilBytesCount1_,
2420                                    ((p1 > leadPrimary)
2421                                            ? BYTE_UNSHIFTED_MAX_
2422                                            : BYTE_UNSHIFTED_MIN_)); 
2423                            m_utilBytesCount1_ ++;
2424                        }
2425                        if (p2 == CollationElementIterator.IGNORABLE) {
2426                            // one byter, not compressed
2427                            m_utilBytes1_ = append(m_utilBytes1_,
2428                                                   m_utilBytesCount1_,
2429                                                   (byte)p1);
2430                            m_utilBytesCount1_ ++;
2431                            leadPrimary = 0;
2432                        }
2433                        else if (p1 < BYTE_FIRST_NON_LATIN_PRIMARY_
2434                              || (p1 > maxRegularPrimary
2435                    //> (RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_[0]
2436                    //                                              >>> 24)
2437                                && p1 < minImplicitPrimary
2438                    //< (RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_[0]
2439                    //                                              >>> 24)
2440                    )) {
2441                                // not compressible
2442                                leadPrimary = 0;
2443                                m_utilBytes1_ = append(m_utilBytes1_,
2444                                                       m_utilBytesCount1_,
2445                                                       (byte)p1);
2446                                m_utilBytesCount1_ ++;
2447                                m_utilBytes1_ = append(m_utilBytes1_,
2448                                                       m_utilBytesCount1_,
2449                                                       (byte)p2);
2450                                m_utilBytesCount1_ ++;
2451                        }
2452                        else { // compress
2453                            leadPrimary = p1;
2454                            m_utilBytes1_ = append(m_utilBytes1_,
2455                                                   m_utilBytesCount1_,
2456                                                   (byte)p1);
2457                            m_utilBytesCount1_ ++;
2458                            m_utilBytes1_ = append(m_utilBytes1_,
2459                                                  m_utilBytesCount1_, (byte)p2);
2460                            m_utilBytesCount1_ ++;
2461                        }
2462                    }
2463                }
2464                else {
2465                    // continuation, add primary to the key, no compression
2466                    m_utilBytes1_ = append(m_utilBytes1_,
2467                                           m_utilBytesCount1_, (byte)p1);
2468                    m_utilBytesCount1_ ++;
2469                    if (p2 != CollationElementIterator.IGNORABLE) {
2470                        m_utilBytes1_ = append(m_utilBytes1_,
2471                                           m_utilBytesCount1_, (byte)p2);
2472                        // second part
2473                        m_utilBytesCount1_ ++;
2474                    }
2475                }
2476            }
2477        }
2478        return leadPrimary;
2479    }
2480
2481    /**
2482     * Gets the secondary byte and adds it to the secondary byte array.
         * Without French ordering, runs of COMMON_2_ on non-continuation ces are
         * only counted in m_utilCount2_ and written as a single count byte once
         * a different secondary arrives. With French ordering every secondary is
         * stored as is and the span covered by continuation ces is tracked in
         * m_utilFrenchStart_ / m_utilFrenchEnd_ so it can be reversed.
2483     * @param ce current ce
2484     * @param notIsContinuation true if the current ce is not a continuation
2485     *          ce
2486     * @param doFrench flag indicator if french sort is to be performed
2487     */
2488    private final void doSecondaryBytes(int ce, boolean notIsContinuation,
2489                                        boolean doFrench)
2490    {
            // the signed shift is harmless here: only the low byte survives
            // the mask
2491        int s = (ce >>= 8) & LAST_BYTE_MASK_; // int for comparison
2492        if (s != 0) {
2493            if (!doFrench) {
2494                // This is compression code.
2495                if (s == COMMON_2_ && notIsContinuation) {
2496                   m_utilCount2_ ++;
2497                }
2498                else {
2499                    if (m_utilCount2_ > 0) {
                            // the count byte is taken from the top range when
                            // the next secondary is above COMMON_2_, otherwise
                            // from the bottom range
2500                        if (s > COMMON_2_) { // not necessary for 4th level.
2501                            while (m_utilCount2_ > TOP_COUNT_2_) {
2502                                m_utilBytes2_ = append(m_utilBytes2_,
2503                                        m_utilBytesCount2_,
2504                                        (byte)(COMMON_TOP_2_ - TOP_COUNT_2_));
2505                                m_utilBytesCount2_ ++;
2506                                m_utilCount2_ -= TOP_COUNT_2_;
2507                            }
2508                            m_utilBytes2_ = append(m_utilBytes2_,
2509                                                   m_utilBytesCount2_,
2510                                                   (byte)(COMMON_TOP_2_
2511                                                       - (m_utilCount2_ - 1)));
2512                            m_utilBytesCount2_ ++;
2513                        }
2514                        else {
2515                            while (m_utilCount2_ > BOTTOM_COUNT_2_) {
2516                                m_utilBytes2_ = append(m_utilBytes2_,
2517                                                       m_utilBytesCount2_,
2518                                    (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
2519                                m_utilBytesCount2_ ++;
2520                                m_utilCount2_ -= BOTTOM_COUNT_2_;
2521                            }
2522                            m_utilBytes2_ = append(m_utilBytes2_,
2523                                                   m_utilBytesCount2_,
2524                                                   (byte)(COMMON_BOTTOM_2_
2525                                                       + (m_utilCount2_ - 1)));
2526                            m_utilBytesCount2_ ++;
2527                        }
2528                        m_utilCount2_ = 0;
2529                    }
2530                    m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
2531                                           (byte)s);
2532                    m_utilBytesCount2_ ++;
2533                }
2534            }
2535            else {
2536                  m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
2537                                         (byte)s);
2538                  m_utilBytesCount2_ ++;
2539                  // Do the special handling for French secondaries
2540                  // We need to get continuation elements and do intermediate
2541                  // restore
2542                  // abc1c2c3de with french secondaries need to be edc1c2c3ba
2543                  // NOT edc3c2c1ba
2544                  if (notIsContinuation) {
2545                        if (m_utilFrenchStart_ != -1) {
2546                            // reverse secondaries from frenchStartPtr up to
2547                            // frenchEndPtr
2548                            reverseBuffer(m_utilBytes2_);
2549                            m_utilFrenchStart_ = -1;
2550                        }
2551                  }
2552                  else {
                            // first continuation of a sequence: the span starts
                            // at the byte written just before this one
2553                        if (m_utilFrenchStart_ == -1) {
2554                            m_utilFrenchStart_  = m_utilBytesCount2_ - 2;
2555                        }
2556                        m_utilFrenchEnd_ = m_utilBytesCount2_ - 1;
2557                  }
2558            }
2559        }
2560    }
2561
2562    /**
2563     * Reverses, in place, the bytes of the argument buffer between the
         * indexes m_utilFrenchStart_ and m_utilFrenchEnd_ (both inclusive).
         * The two markers themselves are left unchanged; callers reset them.
2564     * @param buffer to reverse
2565     */
2566    private void reverseBuffer(byte buffer[])
2567    {
2568        int start = m_utilFrenchStart_;
2569        int end = m_utilFrenchEnd_;
            // swap from both ends towards the middle
2570        while (start < end) {
2571            byte b = buffer[start];
2572            buffer[start ++] = buffer[end];
2573            buffer[end --] = b;
2574        }
2575    }
2576
2577    /**
2578     * Insert the case shifting byte if required: when caseshift is 0 a new
         * byte SORT_CASE_BYTE_START_ is appended to the case byte array and the
         * shift restarts at SORT_CASE_SHIFT_START_.
2579     * @param caseshift value
2580     * @return new caseshift value; the argument itself when it was not 0
2581     */
2582    private final int doCaseShift(int caseshift)
2583    {
2584        if (caseshift  == 0) {
2585            m_utilBytes0_ = append(m_utilBytes0_, m_utilBytesCount0_,
2586                                   SORT_CASE_BYTE_START_);
2587            m_utilBytesCount0_ ++;
2588            caseshift = SORT_CASE_SHIFT_START_;
2589        }
2590        return caseshift;
2591    }
2592
2593    /**
2594     * Performs the casing sort: packs the case bits (tertiary & 0xC0) of a
         * non-continuation ce into the last byte of the case byte array, one bit
         * position per decrement of caseshift. Nothing is packed for
         * continuation ces or when tertiary is 0, but a new case byte may still
         * be started by the initial doCaseShift call.
2595     * @param tertiary byte in ints for easy comparison
2596     * @param notIsContinuation true if the current ce is not a continuation
2597     *          ce
2598     * @param caseshift number of bit positions still free in the current
         *          case byte
2599     * @return the new value of case shift
2600     */
2601    private final int doCaseBytes(int tertiary, boolean notIsContinuation,
2602                                  int caseshift)
2603    {
2604        caseshift = doCaseShift(caseshift);
2605
2606        if (notIsContinuation && tertiary != 0) {
2607            byte casebits = (byte)(tertiary & 0xC0);
2608            if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
2609                if (casebits == 0) {
                        // no case bits: a single 1 bit
2610                    m_utilBytes0_[m_utilBytesCount0_ - 1]
2611                                                      |= (1 << (-- caseshift));
2612                }
2613                else {
                         // case bits set: one position is left 0, then bit 6
                         // of casebits goes into the next position (starting
                         // a new byte first if the current one is full)
2614                     // second bit
2615                     caseshift = doCaseShift(caseshift - 1);
2616                     m_utilBytes0_[m_utilBytesCount0_ - 1]
2617                                    |= ((casebits >> 6) & 1) << (-- caseshift);
2618                }
2619            }
2620            else {
2621                if (casebits != 0) {
                        // case bits set: a 1 bit followed by bit 7 of casebits
2622                    m_utilBytes0_[m_utilBytesCount0_ - 1]
2623                                                        |= 1 << (-- caseshift);
2624                    // second bit
2625                    caseshift = doCaseShift(caseshift);
2626                    m_utilBytes0_[m_utilBytesCount0_ - 1]
2627                                  |= ((casebits >> 7) & 1) << (-- caseshift);
2628                }
2629                else {
                        // no case bits: one position is left 0
2630                    caseshift --;
2631                }
2632            }
2633        }
2634
2635        return caseshift;
2636    }
2637
2638    /**
2639     * Gets the tertiary byte and adds it to the tertiary byte array.
         * A tertiary equal to m_common3_ on a non-continuation ce is only
         * counted in m_utilCount3_; the pending count is written as one or more
         * count bytes as soon as a different tertiary has to be stored.
2640     * @param tertiary byte in int for easy comparison
2641     * @param notIsContinuation true if the current ce is not a continuation
2642     *          ce
2643     */
2644    private final void doTertiaryBytes(int tertiary, boolean notIsContinuation)
2645    {
2646        if (tertiary != 0) {
2647            // This is compression code.
2648            // sequence size check is included in the if clause
2649            if (tertiary == m_common3_ && notIsContinuation) {
2650                 m_utilCount3_ ++;
2651            }
2652            else {
2653                int common3 = m_common3_ & LAST_BYTE_MASK_;
                    // move the tertiary by m_addition3_: upwards when it is
                    // above common in the COMMON_NORMAL_3_ set-up, downwards
                    // when it is not above common in the COMMON_UPPER_FIRST_3_
                    // set-up
2654                if (tertiary > common3 && m_common3_ == COMMON_NORMAL_3_) {
2655                    tertiary += m_addition3_;
2656                }
2657                else if (tertiary <= common3
2658                         && m_common3_ == COMMON_UPPER_FIRST_3_) {
2659                    tertiary -= m_addition3_;
2660                }
2661                if (m_utilCount3_ > 0) {
                        // count bytes come from the top range when the
                        // (adjusted) tertiary is above common, otherwise from
                        // the bottom range
2662                    if (tertiary > common3) {
2663                        while (m_utilCount3_ > m_topCount3_) {
2664                            m_utilBytes3_ = append(m_utilBytes3_,
2665                                                   m_utilBytesCount3_,
2666                                            (byte)(m_top3_ - m_topCount3_));
2667                            m_utilBytesCount3_ ++;
2668                            m_utilCount3_ -= m_topCount3_;
2669                        }
2670                        m_utilBytes3_ = append(m_utilBytes3_,
2671                                               m_utilBytesCount3_,
2672                                               (byte)(m_top3_
2673                                                      - (m_utilCount3_ - 1)));
2674                        m_utilBytesCount3_ ++;
2675                    }
2676                    else {
2677                        while (m_utilCount3_ > m_bottomCount3_) {
2678                            m_utilBytes3_ = append(m_utilBytes3_,
2679                                                   m_utilBytesCount3_,
2680                                         (byte)(m_bottom3_ + m_bottomCount3_));
2681                            m_utilBytesCount3_ ++;
2682                            m_utilCount3_ -= m_bottomCount3_;
2683                        }
2684                        m_utilBytes3_ = append(m_utilBytes3_,
2685                                               m_utilBytesCount3_,
2686                                               (byte)(m_bottom3_
2687                                                      + (m_utilCount3_ - 1)));
2688                        m_utilBytesCount3_ ++;
2689                    }
2690                    m_utilCount3_ = 0;
2691                }
2692                m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
2693                                       (byte)tertiary);
2694                m_utilBytesCount3_ ++;
2695            }
2696        }
2697    }
2698
2699    /**
2700     * Gets the Quaternary byte and adds it to the Quaternary byte array.
         * A non-Hiragana code point only increments the pending common count
         * m_utilCount4_; a Hiragana code point first writes out that pending
         * count and then appends hiragana4.
2701     * @param isCodePointHiragana flag indicator if the previous codepoint
2702     *          we dealt with was Hiragana
2703     * @param commonBottom4 smallest common Quaternary byte
2704     * @param bottomCount4 largest run of common quaternaries that is written
         *          as a single byte
2705     * @param hiragana4 hiragana Quaternary byte
2706     */
2707    private final void doQuaternaryBytes(boolean isCodePointHiragana,
2708                                      int commonBottom4, int bottomCount4,
2709                                      byte hiragana4)
2710    {
2711        if (isCodePointHiragana) { // This was Hiragana, need to note it
2712            if (m_utilCount4_ > 0) { // Close this part
                    // full runs of bottomCount4 first, then the remainder
2713                while (m_utilCount4_ > bottomCount4) {
2714                    m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
2715                                           (byte)(commonBottom4
2716                                                        + bottomCount4));
2717                    m_utilBytesCount4_ ++;
2718                    m_utilCount4_ -= bottomCount4;
2719                }
2720                m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
2721                                      (byte)(commonBottom4
2722                                             + (m_utilCount4_ - 1)));
2723                m_utilBytesCount4_ ++;
2724                m_utilCount4_ = 0;
2725            }
2726            m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
2727                                   hiragana4); // Add the Hiragana
2728            m_utilBytesCount4_ ++;
2729        }
2730        else { // This wasn't Hiragana, so we can continue adding stuff
2731            m_utilCount4_ ++;
2732        }
2733    }
2734
2735    /**
2736     * Iterates through the argument string for all ces.
2737     * Split the ces into their relevant primaries, secondaries etc.
         * The bytes of each level are collected in the per-level m_utilBytesN_
         * arrays. The decomposition mode is switched to NO_DECOMPOSITION while
         * iterating and set back to its previous value afterwards.
2738     * @param source normalized string
2739     * @param doFrench flag indicator if special handling of French has to be
2740     *                  done
2741     * @param hiragana4 offset for Hiragana quaternary
2742     * @param commonBottom4 smallest common quaternary byte
2743     * @param bottomCount4 largest run of common quaternaries that is written
         *          as a single byte
2744     */
2745    private final void getSortKeyBytes(String   source, boolean doFrench,
2746                                       byte hiragana4, int commonBottom4,
2747                                       int bottomCount4)
2748
2749    {
2750        if (m_srcUtilIter_ == null) {
2751            initUtility(true);
2752        }
            // NOTE(review): the mode is restored at the end of the method, but
            // not from a finally block; if the loop below throws, the collator
            // stays in NO_DECOMPOSITION - confirm whether that can happen
2753        int backupDecomposition = getDecomposition();
2754        setDecomposition(NO_DECOMPOSITION); // have to revert to backup later
2755        m_srcUtilIter_.setText(source);
2756        m_srcUtilColEIter_.setText(m_srcUtilIter_);
2757        m_utilFrenchStart_ = -1;
2758        m_utilFrenchEnd_ = -1;
2759
2760        // scriptorder not implemented yet
2761        // const uint8_t *scriptOrder = coll->scriptOrder;
2762
            // doShift is carried over between iterations: continuation ces and
            // primary ignorables inherit it from the preceding ce
2763        boolean doShift = false;
2764        boolean notIsContinuation = false;
2765
2766        int leadPrimary = 0; // int for easier comparison
2767        int caseShift = 0;
2768
2769        while (true) {
2770            int ce = m_srcUtilColEIter_.next();
2771            if (ce == CollationElementIterator.NULLORDER) {
2772                break;
2773            }
2774
2775            if (ce == CollationElementIterator.IGNORABLE) {
2776                continue;
2777            }
2778
2779            notIsContinuation = !isContinuation(ce);
2780
2781            /*
2782             * if (notIsContinuation) {
2783                    if (scriptOrder != NULL) {
2784                        primary1 = scriptOrder[primary1];
2785                    }
2786                }*/
2787            boolean isPrimaryByteIgnorable = (ce & CE_PRIMARY_MASK_) == 0;
2788            // actually we can just check that the first byte is 0
2789            // generation stuffs the order left first
2790            boolean isSmallerThanVariableTop = (ce >>> CE_PRIMARY_SHIFT_)
2791                                               <= m_variableTopValue_;
                // shift when either
                //  - shifted handling is on and this is a non-continuation ce
                //    with a non-zero primary not above variable top, or a
                //    continuation of a ce that was shifted, or
                //  - the previous ce was shifted and this one has a zero
                //    primary
2792            doShift = (m_isAlternateHandlingShifted_
2793                        && ((notIsContinuation && isSmallerThanVariableTop
2794                            && !isPrimaryByteIgnorable) // primary byte not 0
2795                        || (!notIsContinuation && doShift))
2796                        || (doShift && isPrimaryByteIgnorable));
2797            if (doShift && isPrimaryByteIgnorable) {
2798                // amendment to the UCA says that primary ignorables and other
2799                // ignorables should be removed if following a shifted code
2800                // point
2801                // if we were shifted and we got an ignorable code point
2802                // we should just completely ignore it
2803                continue;
2804            }
2805            leadPrimary = doPrimaryBytes(ce, notIsContinuation, doShift,
2806                                         leadPrimary, commonBottom4,
2807                                         bottomCount4);
                // a shifted ce contributes nothing to the levels below
2808            if (doShift) {
2809                continue;
2810            }
2811            if (m_utilCompare2_) {
2812                doSecondaryBytes(ce, notIsContinuation, doFrench);
2813            }
2814
                // tertiary: the low byte, with CE_REMOVE_CONTINUATION_MASK_
                // applied instead for a continuation ce
2815            int t = ce & LAST_BYTE_MASK_;
2816            if (!notIsContinuation) {
2817                t = ce & CE_REMOVE_CONTINUATION_MASK_;
2818            }
2819
2820            if (m_utilCompare0_ && (!isPrimaryByteIgnorable || m_utilCompare2_)) {
2821                // do the case level if we need to do it. We don't want to calculate
2822                // case level for primary ignorables if we have only primary strength and case level
2823                // otherwise we would break well formedness of CEs 
2824                caseShift = doCaseBytes(t, notIsContinuation, caseShift);
2825            }
2826            else if (notIsContinuation) {
2827                 t ^= m_caseSwitch_;
2828            }
2829
2830            t &= m_mask3_;
2831
2832            if (m_utilCompare3_) {
2833                doTertiaryBytes(t, notIsContinuation);
2834            }
2835
2836            if (m_utilCompare4_ && notIsContinuation) { // compare quad
2837                doQuaternaryBytes(m_srcUtilColEIter_.m_isCodePointHiragana_,
2838                                  commonBottom4, bottomCount4, hiragana4);
2839            }
2840        }
2841        setDecomposition(backupDecomposition); // reverts to original
2842        if (m_utilFrenchStart_ != -1) {
2843            // one last round of checks
2844            reverseBuffer(m_utilBytes2_);
2845        }
2846    }
2847
2848    /**
2849     * From the individual strength byte results the final compact sortkey
2850     * will be calculated.
         * The enabled levels are appended to the primary byte array in the
         * order secondary, case, tertiary, quaternary, identical, followed by a
         * terminating 0 byte, and the result is stored in key. As written, the
         * quaternary level is only added when the tertiary level is enabled,
         * and the identical level only when the quaternary level is enabled.
2851     * @param source text string
2852     * @param doFrench flag indicating that special handling of French has to
2853     *                  be done
2854     * @param commonBottom4 smallest common quaternary byte
2855     * @param bottomCount4 passed on to doQuaternary together with
         *                  commonBottom4
2856     * @param key output RawCollationKey to store results, key cannot be null
2857     */
2858    private final void getSortKey(String   source, boolean doFrench,
2859                                             int commonBottom4, 
2860                                             int bottomCount4,
2861                                             RawCollationKey key)
2862    {
2863        // we have done all the CE's, now let's put them together to form
2864        // a key
2865        if (m_utilCompare2_) {
2866            doSecondary(doFrench);
2867        }
2868        // adding case level should be independent of secondary level
2869        if (m_utilCompare0_) {
2870            doCase();
2871        }
2872        if (m_utilCompare3_) {
2873            doTertiary();
2874            if (m_utilCompare4_) {
2875                doQuaternary(commonBottom4, bottomCount4);
2876                if (m_utilCompare5_) {
2877                    doIdentical(source);
2878                }
2879
2880            }
2881        }
            // terminating 0 byte; it is counted in the length handed to key
2882        m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte)0);
2883        m_utilBytesCount1_ ++;
2884
2885        key.set(m_utilBytes1_, 0, m_utilBytesCount1_);
2886    }
2887
2888    /**
2889     * Packs the French bytes: walks the secondary byte array from its last
         * byte to its first and appends the bytes to the primary byte array,
         * replacing each run of COMMON_2_ by count bytes on the way.
2890     */
2891    private final void doFrench()
2892    {
2893        for (int i = 0; i < m_utilBytesCount2_; i ++) {
                // index counts down from the end of the secondary bytes
2894            byte s = m_utilBytes2_[m_utilBytesCount2_ - i - 1];
2895            // This is compression code.
2896            if (s == COMMON_2_) {
2897                ++ m_utilCount2_;
2898            }
2899            else {
2900                if (m_utilCount2_ > 0) {
2901                    // getting the unsigned value
2902                    if ((s & LAST_BYTE_MASK_) > COMMON_2_) {
2903                        // not necessary for 4th level.
2904                        while (m_utilCount2_ > TOP_COUNT_2_) {
2905                            m_utilBytes1_ = append(m_utilBytes1_,
2906                                                   m_utilBytesCount1_,
2907                                        (byte)(COMMON_TOP_2_ - TOP_COUNT_2_));
2908                            m_utilBytesCount1_ ++;
2909                            m_utilCount2_ -= TOP_COUNT_2_;
2910                        }
2911                        m_utilBytes1_ = append(m_utilBytes1_,
2912                                               m_utilBytesCount1_,
2913                                               (byte)(COMMON_TOP_2_
2914                                                      - (m_utilCount2_ - 1)));
2915                        m_utilBytesCount1_ ++;
2916                    }
2917                    else {
2918                        while (m_utilCount2_ > BOTTOM_COUNT_2_) {
2919                            m_utilBytes1_ = append(m_utilBytes1_,
2920                                                   m_utilBytesCount1_,
2921                                (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
2922                            m_utilBytesCount1_ ++;
2923                            m_utilCount2_ -= BOTTOM_COUNT_2_;
2924                        }
2925                        m_utilBytes1_ = append(m_utilBytes1_,
2926                                               m_utilBytesCount1_,
2927                                               (byte)(COMMON_BOTTOM_2_
2928                                                      + (m_utilCount2_ - 1)));
2929                        m_utilBytesCount1_ ++;
2930                    }
2931                    m_utilCount2_ = 0;
2932                }
2933                m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, s);
2934                m_utilBytesCount1_ ++;
2935            }
2936        }
            // a run of common secondaries still pending at the end is written
            // with bottom-range count bytes; m_utilCount2_ is not reset here
2937        if (m_utilCount2_ > 0) {
2938            while (m_utilCount2_ > BOTTOM_COUNT_2_) {
2939                m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
2940                                       (byte)(COMMON_BOTTOM_2_
2941                                                    + BOTTOM_COUNT_2_));
2942                m_utilBytesCount1_ ++;
2943                m_utilCount2_ -= BOTTOM_COUNT_2_;
2944            }
2945            m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
2946                                   (byte)(COMMON_BOTTOM_2_
2947                                                    + (m_utilCount2_ - 1)));
2948            m_utilBytesCount1_ ++;
2949        }
2950    }
2951
2952    /**
2953     * Compacts the secondary bytes and stores them into the primary array,
         * after a SORT_LEVEL_TERMINATOR_ byte that separates them from the bytes
         * already there.
2954     * @param doFrench flag indicator that French has to be handled specially
2955     */
2956    private final void doSecondary(boolean doFrench)
2957    {
            // write out a still pending run of common secondaries;
            // m_utilCount2_ is left as is and only BOTTOM_COUNT_2_ is
            // subtracted inside the loop
2958        if (m_utilCount2_ > 0) {
2959            while (m_utilCount2_ > BOTTOM_COUNT_2_) {
2960                m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
2961                                       (byte)(COMMON_BOTTOM_2_
2962                                                        + BOTTOM_COUNT_2_));
2963                m_utilBytesCount2_ ++;
2964                m_utilCount2_ -= BOTTOM_COUNT_2_;
2965            }
2966            m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
2967                                   (byte)(COMMON_BOTTOM_2_ +
2968                                                    (m_utilCount2_ - 1)));
2969            m_utilBytesCount2_ ++;
2970        }
2971
2972        m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
2973                               SORT_LEVEL_TERMINATOR_);
2974        m_utilBytesCount1_ ++;
2975
2976        if (doFrench) { // do the reverse copy
2977            doFrench();
2978        }
2979        else {
                // grow the primary array by the number of secondary bytes if
                // they would not fit, then copy them over in one go
2980            if (m_utilBytes1_.length <= m_utilBytesCount1_
2981                                        + m_utilBytesCount2_) {
2982                m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
2983                                         m_utilBytesCount2_);
2984            }
2985            System.arraycopy(m_utilBytes2_, 0, m_utilBytes1_,
2986                             m_utilBytesCount1_, m_utilBytesCount2_);
2987            m_utilBytesCount1_ += m_utilBytesCount2_;
2988        }
2989    }
2990
2991    /**
2992     * Increase buffer size
2993     * @param buffer array of bytes
2994     * @param size of the byte array
2995     * @param incrementsize size to increase
2996     * @return the new buffer
2997     */
2998    private static final byte[] increase(byte buffer[], int size,
2999                                         int incrementsize)
3000    {
3001        byte result[] = new byte[buffer.length + incrementsize];
3002        System.arraycopy(buffer, 0, result, 0, size);
3003        return result;
3004    }
3005
3006    /**
3007     * Increase buffer size
3008     * @param buffer array of ints
3009     * @param size of the byte array
3010     * @param incrementsize size to increase
3011     * @return the new buffer
3012     */
3013    private static final int[] increase(int buffer[], int size,
3014                                        int incrementsize)
3015    {
3016        int result[] = new int[buffer.length + incrementsize];
3017        System.arraycopy(buffer, 0, result, 0, size);
3018        return result;
3019    }
3020
3021    /**
3022     * Compacts the case bytes and stores them into the primary array
3023     */
3024    private final void doCase()
3025    {
3026        m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
3027                               SORT_LEVEL_TERMINATOR_);
3028        m_utilBytesCount1_ ++;
3029        if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount0_) {
3030            m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
3031                                     m_utilBytesCount0_);
3032        }
3033        System.arraycopy(m_utilBytes0_, 0, m_utilBytes1_, m_utilBytesCount1_,
3034                         m_utilBytesCount0_);
3035        m_utilBytesCount1_ += m_utilBytesCount0_;
3036    }
3037
3038    /**
3039     * Compacts the tertiary bytes and stores them into the primary array
3040     */
3041    private final void doTertiary()
3042    {
3043        if (m_utilCount3_ > 0) {
3044            if (m_common3_ != COMMON_BOTTOM_3_) {
3045                while (m_utilCount3_ >= m_topCount3_) {
3046                    m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
3047                                           (byte)(m_top3_ - m_topCount3_));
3048                    m_utilBytesCount3_ ++;
3049                    m_utilCount3_ -= m_topCount3_;
3050                }
3051                m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
3052                                       (byte)(m_top3_ - m_utilCount3_));
3053                m_utilBytesCount3_ ++;
3054            }
3055            else {
3056                while (m_utilCount3_ > m_bottomCount3_) {
3057                    m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
3058                                           (byte)(m_bottom3_
3059                                                        + m_bottomCount3_));
3060                    m_utilBytesCount3_ ++;
3061                    m_utilCount3_ -= m_bottomCount3_;
3062                }
3063                m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
3064                                       (byte)(m_bottom3_
3065                                              + (m_utilCount3_ - 1)));
3066                m_utilBytesCount3_ ++;
3067            }
3068        }
3069        m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
3070                               SORT_LEVEL_TERMINATOR_);
3071        m_utilBytesCount1_ ++;
3072        if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount3_) {
3073            m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
3074                                     m_utilBytesCount3_);
3075        }
3076        System.arraycopy(m_utilBytes3_, 0, m_utilBytes1_, m_utilBytesCount1_,
3077                         m_utilBytesCount3_);
3078        m_utilBytesCount1_ += m_utilBytesCount3_;
3079    }
3080
3081    /**
3082     * Compacts the quaternary bytes and stores them into the primary array
3083     */
3084    private final void doQuaternary(int commonbottom4, int bottomcount4)
3085    {
3086        if (m_utilCount4_ > 0) {
3087            while (m_utilCount4_ > bottomcount4) {
3088                m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
3089                                       (byte)(commonbottom4 + bottomcount4));
3090                m_utilBytesCount4_ ++;
3091                m_utilCount4_ -= bottomcount4;
3092            }
3093            m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
3094                                   (byte)(commonbottom4
3095                                                + (m_utilCount4_ - 1)));
3096            m_utilBytesCount4_ ++;
3097        }
3098        m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
3099                               SORT_LEVEL_TERMINATOR_);
3100        m_utilBytesCount1_ ++;
3101        if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount4_) {
3102            m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
3103                                     m_utilBytesCount4_);
3104        }
3105        System.arraycopy(m_utilBytes4_, 0, m_utilBytes1_, m_utilBytesCount1_,
3106                         m_utilBytesCount4_);
3107        m_utilBytesCount1_ += m_utilBytesCount4_;
3108    }
3109
3110    /**
3111     * Deals with the identical sort.
3112     * Appends the BOCSU version of the source string to the ends of the
3113     * byte buffer.
3114     * @param source text string
3115     */
3116    private final void doIdentical(String   source)
3117    {
3118        int isize = BOCU.getCompressionLength(source);
3119        m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
3120                               SORT_LEVEL_TERMINATOR_);
3121        m_utilBytesCount1_ ++;
3122        if (m_utilBytes1_.length <= m_utilBytesCount1_ + isize) {
3123            m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
3124                                     1 + isize);
3125        }
3126        m_utilBytesCount1_ = BOCU.compress(source, m_utilBytes1_,
3127                                           m_utilBytesCount1_);
3128    }
3129
3130    /**
3131     * Gets the offset of the first unmatched characters in source and target.
3132     * This method returns the offset of the start of a contraction or a
3133     * combining sequence, if the first difference is in the middle of such a
3134     * sequence.
3135     * @param source string
3136     * @param target string
3137     * @return offset of the first unmatched characters in source and target.
3138     */
3139    private final int getFirstUnmatchedOffset(String   source, String   target)
3140    {
3141        int result = 0;
3142        int slength = source.length();
3143        int tlength = target.length();
3144        int minlength = slength;
3145        if (minlength > tlength) {
3146            minlength = tlength;
3147        }
3148        while (result < minlength
3149                && source.charAt(result) == target.charAt(result)) {
3150            result ++;
3151        }
3152        if (result > 0) {
3153            // There is an identical portion at the beginning of the two
3154            // strings. If the identical portion ends within a contraction or a
3155            // combining character sequence, back up to the start of that
3156            // sequence.
3157            char schar = 0;
3158            char tchar = 0;
3159            if (result < minlength) {
3160                schar = source.charAt(result); // first differing chars
3161                tchar = target.charAt(result);
3162            }
3163            else {
3164                schar = source.charAt(minlength - 1);
3165                if (isUnsafe(schar)) {
3166                    tchar = schar;
3167                }
3168                else if (slength == tlength) {
3169                        return result;
3170                }
3171                else if (slength < tlength) {
3172                    tchar = target.charAt(result);
3173                }
3174                else {
3175                    schar = source.charAt(result);
3176                }
3177            }
3178            if (isUnsafe(schar) || isUnsafe(tchar))
3179            {
3180                // We are stopped in the middle of a contraction or combining
3181                // sequence.
3182                // Look backwards for the part of the string for the start of
3183                // the sequence
3184                // It doesn't matter which string we scan, since they are the
3185                // same in this region.
3186                do {
3187                    result --;
3188                }
3189                while (result > 0 && isUnsafe(source.charAt(result)));
3190            }
3191        }
3192        return result;
3193    }
3194
3195    /**
3196     * Appending an byte to an array of bytes and increases it if we run out of
3197     * space
3198     * @param array of byte arrays
3199     * @param appendindex index in the byte array to append
3200     * @param value to append
3201     * @return array if array size can accomodate the new value, otherwise
3202     *         a bigger array will be created and returned
3203     */
3204    private static final byte[] append(byte array[], int appendindex,
3205                                       byte value)
3206    {
3207        try {
3208            array[appendindex] = value;
3209        }
3210        catch (ArrayIndexOutOfBoundsException   e) {
3211            array = increase(array, appendindex, SORT_BUFFER_INIT_SIZE_);
3212            array[appendindex] = value;
3213        }
3214        return array;
3215    }
3216
3217    /**
3218     * This is a trick string compare function that goes in and uses sortkeys
3219     * to compare. It is used when compare gets in trouble and needs to bail
3220     * out.
3221     * @param source text string
3222     * @param target text string
3223     */
3224    private final int compareBySortKeys(String   source, String   target)
3225
3226    {
3227        m_utilRawCollationKey_ = getRawCollationKey(source, 
3228                                                    m_utilRawCollationKey_);
3229        // this method is very seldom called
3230        RawCollationKey targetkey = getRawCollationKey(target, null);
3231        return m_utilRawCollationKey_.compareTo(targetkey);
3232    }
3233
3234    /**
3235     * Performs the primary comparisons, and fills up the CE buffer at the
3236     * same time.
3237     * The return value toggles between the comparison result and the hiragana
3238     * result. If either the source is greater than target or vice versa, the
3239     * return result is the comparison result, ie 1 or -1, furthermore the
3240     * cebuffers will be cleared when that happens. If the primary comparisons
3241     * are equal, we'll have to continue with secondary comparison. In this case
3242     * the cebuffer will not be cleared and the return result will be the
3243     * hiragana result.
3244     * @param doHiragana4 flag indicator that Hiragana Quaternary has to be
3245     *                  observed
3246     * @param lowestpvalue the lowest primary value that will not be ignored if
3247     *                      alternate handling is shifted
3248     * @param source text string
3249     * @param target text string
3250     * @param textoffset offset in text to start the comparison
3251     * @return comparion result if a primary difference is found, otherwise
3252     *                      hiragana result
3253     */
3254    private final int doPrimaryCompare(boolean doHiragana4, int lowestpvalue,
3255                                        String   source, String   target,
3256                                        int textoffset)
3257
3258    {
3259        // Preparing the context objects for iterating over strings
3260        m_srcUtilIter_.setText(source);
3261        m_srcUtilColEIter_.setText(m_srcUtilIter_, textoffset);
3262        m_tgtUtilIter_.setText(target);
3263        m_tgtUtilColEIter_.setText(m_tgtUtilIter_, textoffset);
3264
3265        // Non shifted primary processing is quite simple
3266        if (!m_isAlternateHandlingShifted_) {
3267            int hiraganaresult = 0;
3268            while (true) {
3269                int sorder = 0;
3270                // We fetch CEs until we hit a non ignorable primary or end.
3271                do {
3272                    sorder = m_srcUtilColEIter_.next();
3273                    m_srcUtilCEBuffer_ = append(m_srcUtilCEBuffer_,
3274                                                m_srcUtilCEBufferSize_, sorder);
3275                    m_srcUtilCEBufferSize_ ++;
3276                    sorder &= CE_PRIMARY_MASK_;
3277                } while (sorder == CollationElementIterator.IGNORABLE);
3278
3279                int torder = 0;
3280                do {
3281                    torder = m_tgtUtilColEIter_.next();
3282                    m_tgtUtilCEBuffer_ = append(m_tgtUtilCEBuffer_,
3283                                                m_tgtUtilCEBufferSize_, torder);
3284                    m_tgtUtilCEBufferSize_ ++;
3285                    torder &= CE_PRIMARY_MASK_;
3286                } while (torder == CollationElementIterator.IGNORABLE);
3287
3288                // if both primaries are the same
3289                if (sorder == torder) {
3290                    // and there are no more CEs, we advance to the next level
3291                    // see if we are at the end of either string
3292                    if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1]
3293                                        == CollationElementIterator.NULLORDER) {
3294                        if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1] 
3295                            != CollationElementIterator.NULLORDER) {
3296                            return -1;
3297                        }
3298                        break;
3299                    }
3300                    else if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1]
3301                             == CollationElementIterator.NULLORDER) {
3302                        return 1;
3303                    }
3304                    if (doHiragana4 && hiraganaresult == 0
3305                        && m_srcUtilColEIter_.m_isCodePointHiragana_ !=
3306                                        m_tgtUtilColEIter_.m_isCodePointHiragana_) {
3307                        if (m_srcUtilColEIter_.m_isCodePointHiragana_) {
3308                            hiraganaresult = -1;
3309                        }
3310                        else {
3311                            hiraganaresult = 1;
3312                        }
3313                    }
3314                }
3315                else {
3316                    // if two primaries are different, we are done
3317                    return endPrimaryCompare(sorder, torder);
3318                }
3319            }
3320            // no primary difference... do the rest from the buffers
3321            return hiraganaresult;
3322        }
3323        else { // shifted - do a slightly more complicated processing :)
3324            while (true) {
3325                int sorder = getPrimaryShiftedCompareCE(m_srcUtilColEIter_,
3326                                                        lowestpvalue, true);
3327                int torder = getPrimaryShiftedCompareCE(m_tgtUtilColEIter_,
3328                                                        lowestpvalue, false);
3329                if (sorder == torder) {
3330                    if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1]
3331                            == CollationElementIterator.NULLORDER) {
3332                        break;
3333                    }
3334                    else {
3335                        continue;
3336                    }
3337                }
3338                else {
3339                    return endPrimaryCompare(sorder, torder);
3340                }
3341            } // no primary difference... do the rest from the buffers
3342        }
3343        return 0;
3344    }
3345
3346    /**
3347     * This is used only for primary strength when we know that sorder is
3348     * already different from torder.
3349     * Compares sorder and torder, returns -1 if sorder is less than torder.
3350     * Clears the cebuffer at the same time.
3351     * @param sorder source strength order
3352     * @param torder target strength order
3353     * @return the comparison result of sorder and torder
3354     */
3355    private final int endPrimaryCompare(int sorder, int torder)
3356    {
3357        // if we reach here, the ce offset accessed is the last ce
3358        // appended to the buffer
3359        boolean isSourceNullOrder = (m_srcUtilCEBuffer_[
3360                                                    m_srcUtilCEBufferSize_ - 1]
3361                                        == CollationElementIterator.NULLORDER);
3362        boolean isTargetNullOrder = (m_tgtUtilCEBuffer_[
3363                                                    m_tgtUtilCEBufferSize_ - 1]
3364                                        == CollationElementIterator.NULLORDER);
3365        m_srcUtilCEBufferSize_ = -1;
3366        m_tgtUtilCEBufferSize_ = -1;
3367        if (isSourceNullOrder) {
3368            return -1;
3369        }
3370        if (isTargetNullOrder) {
3371            return 1;
3372        }
3373        // getting rid of the sign
3374        sorder >>>= CE_PRIMARY_SHIFT_;
3375        torder >>>= CE_PRIMARY_SHIFT_;
3376        if (sorder < torder) {
3377            return -1;
3378        }
3379        return 1;
3380    }
3381
3382    /**
3383     * Calculates the next primary shifted value and fills up cebuffer with the
3384     * next non-ignorable ce.
3385     * @param coleiter collation element iterator
3386     * @param doHiragana4 flag indicator if hiragana quaternary is to be
3387     *                      handled
3388     * @param lowestpvalue lowest primary shifted value that will not be
3389     *                      ignored
3390     * @return result next modified ce
3391     */
3392    private final int getPrimaryShiftedCompareCE(
3393                                        CollationElementIterator coleiter,
3394                                        int lowestpvalue, boolean isSrc)
3395
3396    {
3397        boolean shifted = false;
3398        int result = CollationElementIterator.IGNORABLE;
3399        int cebuffer[] = m_srcUtilCEBuffer_;
3400        int cebuffersize = m_srcUtilCEBufferSize_;
3401        if (!isSrc) {
3402            cebuffer = m_tgtUtilCEBuffer_;
3403            cebuffersize = m_tgtUtilCEBufferSize_;
3404        }
3405        while (true) {
3406            result = coleiter.next();
3407            if (result == CollationElementIterator.NULLORDER) {
3408                cebuffer = append(cebuffer, cebuffersize, result);
3409                cebuffersize ++;
3410                break;
3411            }
3412            else if (result == CollationElementIterator.IGNORABLE
3413                     || (shifted
3414                         && (result & CE_PRIMARY_MASK_)
3415                                      == CollationElementIterator.IGNORABLE)) {
3416                // UCA amendment - ignore ignorables that follow shifted code
3417                // points
3418                continue;
3419            }
3420            else if (isContinuation(result)) {
3421                if ((result & CE_PRIMARY_MASK_)
3422                                    != CollationElementIterator.IGNORABLE) {
3423                    // There is primary value
3424                    if (shifted) {
3425                        result = (result & CE_PRIMARY_MASK_)
3426                                            | CE_CONTINUATION_MARKER_;
3427                        // preserve interesting continuation
3428                        cebuffer = append(cebuffer, cebuffersize, result);
3429                        cebuffersize ++;
3430                        continue;
3431                    }
3432                    else {
3433                        cebuffer = append(cebuffer, cebuffersize, result);
3434                        cebuffersize ++;
3435                        break;
3436                    }
3437                }
3438                else { // Just lower level values
3439                    if (!shifted) {
3440                        cebuffer = append(cebuffer, cebuffersize, result);
3441                        cebuffersize ++;
3442                    }
3443                }
3444            }
3445            else { // regular
3446                if (Utility.compareUnsigned(result & CE_PRIMARY_MASK_,
3447                                            lowestpvalue) > 0) {
3448                    cebuffer = append(cebuffer, cebuffersize, result);
3449                    cebuffersize ++;
3450                    break;
3451                }
3452                else {
3453                    if ((result & CE_PRIMARY_MASK_) != 0) {
3454                        shifted = true;
3455                        result &= CE_PRIMARY_MASK_;
3456                        cebuffer = append(cebuffer, cebuffersize, result);
3457                        cebuffersize ++;
3458                        continue;
3459                    }
3460                    else {
3461                        cebuffer = append(cebuffer, cebuffersize, result);
3462                        cebuffersize ++;
3463                        shifted = false;
3464                        continue;
3465                    }
3466                }
3467            }
3468        }
3469        if (isSrc) {
3470            m_srcUtilCEBuffer_ = cebuffer;
3471            m_srcUtilCEBufferSize_ = cebuffersize;
3472        }
3473        else {
3474            m_tgtUtilCEBuffer_ = cebuffer;
3475            m_tgtUtilCEBufferSize_ = cebuffersize;
3476        }
3477        result &= CE_PRIMARY_MASK_;
3478        return result;
3479    }
3480
3481    /**
3482     * Appending an int to an array of ints and increases it if we run out of
3483     * space
3484     * @param array of int arrays
3485     * @param appendindex index at which value will be appended
3486     * @param value to append
3487     * @return array if size is not increased, otherwise a new array will be
3488     *         returned
3489     */
3490    private static final int[] append(int array[], int appendindex, int value)
3491    {
3492        if (appendindex + 1 >= array.length) {
3493            array = increase(array, appendindex, CE_BUFFER_SIZE_);
3494        }
3495        array[appendindex] = value;
3496        return array;
3497    }
3498
3499    /**
3500     * Does secondary strength comparison based on the collected ces.
3501     * @param doFrench flag indicates if French ordering is to be done
3502     * @return the secondary strength comparison result
3503     */
3504    private final int doSecondaryCompare(boolean doFrench)
3505    {
3506        // now, we're gonna reexamine collected CEs
3507        if (!doFrench) { // normal
3508            int soffset = 0;
3509            int toffset = 0;
3510            while (true) {
3511                int sorder = CollationElementIterator.IGNORABLE;
3512                while (sorder == CollationElementIterator.IGNORABLE) {
3513                    sorder = m_srcUtilCEBuffer_[soffset ++]
3514                             & CE_SECONDARY_MASK_;
3515                }
3516                int torder = CollationElementIterator.IGNORABLE;
3517                while (torder == CollationElementIterator.IGNORABLE) {
3518                    torder = m_tgtUtilCEBuffer_[toffset ++]
3519                             & CE_SECONDARY_MASK_;
3520                }
3521
3522                if (sorder == torder) {
3523                    if (m_srcUtilCEBuffer_[soffset - 1]
3524                                    == CollationElementIterator.NULLORDER) {
3525                        if (m_tgtUtilCEBuffer_[toffset - 1] 
3526                            != CollationElementIterator.NULLORDER) {
3527                            return -1;
3528                        }
3529                        break;
3530                    }
3531                    else if (m_tgtUtilCEBuffer_[toffset - 1]
3532                             == CollationElementIterator.NULLORDER) {
3533                        return 1;
3534                    }
3535                }
3536                else {
3537                    if (m_srcUtilCEBuffer_[soffset - 1] ==
3538                            CollationElementIterator.NULLORDER) {
3539                        return -1;
3540                    }
3541                    if (m_tgtUtilCEBuffer_[toffset - 1] ==
3542                            CollationElementIterator.NULLORDER) {
3543                        return 1;
3544                    }
3545                    return (sorder < torder) ? -1 : 1;
3546                }
3547            }
3548        }
3549        else { // do the French
3550            m_srcUtilContOffset_ = 0;
3551            m_tgtUtilContOffset_ = 0;
3552            m_srcUtilOffset_ = m_srcUtilCEBufferSize_ - 2;
3553            m_tgtUtilOffset_ = m_tgtUtilCEBufferSize_ - 2;
3554            while (true) {
3555                int sorder = getSecondaryFrenchCE(true);
3556                int torder = getSecondaryFrenchCE(false);
3557                if (sorder == torder) {
3558                    if ((m_srcUtilOffset_ < 0 && m_tgtUtilOffset_ < 0)
3559                        || (m_srcUtilOffset_ >= 0 
3560                            && m_srcUtilCEBuffer_[m_srcUtilOffset_]
3561                                    == CollationElementIterator.NULLORDER)) {
3562                        break;
3563                    }
3564                }
3565                else {
3566                    return (sorder < torder) ? -1 : 1;
3567                }
3568            }
3569        }
3570        return 0;
3571    }
3572
3573    /**
3574     * Calculates the next secondary french CE.
3575     * @param isSrc flag indicator if we are calculating the src ces
3576     * @return result next modified ce
3577     */
3578    private final int getSecondaryFrenchCE(boolean isSrc)
3579    {
3580        int result = CollationElementIterator.IGNORABLE;
3581        int offset = m_srcUtilOffset_;
3582        int continuationoffset = m_srcUtilContOffset_;
3583        int cebuffer[] = m_srcUtilCEBuffer_;
3584        if (!isSrc) {
3585            offset = m_tgtUtilOffset_;
3586            continuationoffset = m_tgtUtilContOffset_;
3587            cebuffer = m_tgtUtilCEBuffer_;
3588        }
3589
3590        while (result == CollationElementIterator.IGNORABLE
3591                && offset >= 0) {
3592            if (continuationoffset == 0) {
3593                result = cebuffer[offset];
3594                while (isContinuation(cebuffer[offset --])){
3595                }
3596                // after this, sorder is at the start of continuation,
3597                // and offset points before that
3598                if (isContinuation(cebuffer[offset + 1])) {
3599                    // save offset for later
3600                    continuationoffset = offset;
3601                    offset += 2;
3602                }
3603            }
3604            else {
3605                result = cebuffer[offset ++];
3606                if (!isContinuation(result)) {
3607                    // we have finished with this continuation
3608                    offset = continuationoffset;
3609                    // reset the pointer to before continuation
3610                    continuationoffset = 0;
3611                    continue;
3612                }
3613            }
3614            result &= CE_SECONDARY_MASK_; // remove continuation bit
3615        }
3616        if (isSrc) {
3617            m_srcUtilOffset_ = offset;
3618            m_srcUtilContOffset_ = continuationoffset;
3619        }
3620        else {
3621            m_tgtUtilOffset_ = offset;
3622            m_tgtUtilContOffset_ = continuationoffset;
3623        }
3624        return result;
3625    }
3626
3627    /**
3628     * Does case strength comparison based on the collected ces.
3629     * @return the case strength comparison result
3630     */
3631    private final int doCaseCompare()
3632    {
3633        int soffset = 0;
3634        int toffset = 0;
3635        while (true) {
3636            int sorder = CollationElementIterator.IGNORABLE;
3637            int torder = CollationElementIterator.IGNORABLE;
3638            while ((sorder & CE_REMOVE_CASE_)
3639                                    == CollationElementIterator.IGNORABLE) {
3640                sorder = m_srcUtilCEBuffer_[soffset ++];
3641                if (!isContinuation(sorder) && ((sorder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) {
3642                    // primary ignorables should not be considered on the case level when the strength is primary
3643                    // otherwise, the CEs stop being well-formed
3644                    sorder &= CE_CASE_MASK_3_;
3645                    sorder ^= m_caseSwitch_;
3646                }
3647                else {
3648                    sorder = CollationElementIterator.IGNORABLE;
3649                }
3650            }
3651
3652            while ((torder & CE_REMOVE_CASE_)
3653                                    == CollationElementIterator.IGNORABLE) {
3654                torder = m_tgtUtilCEBuffer_[toffset ++];
3655                if (!isContinuation(torder) && ((torder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) {
3656                    // primary ignorables should not be considered on the case level when the strength is primary
3657                    // otherwise, the CEs stop being well-formed
3658                    torder &= CE_CASE_MASK_3_;
3659                    torder ^= m_caseSwitch_;
3660                }
3661                else {
3662                    torder = CollationElementIterator.IGNORABLE;
3663                }
3664            }
3665
3666            sorder &= CE_CASE_BIT_MASK_;
3667            torder &= CE_CASE_BIT_MASK_;
3668            if (sorder == torder) {
3669                // checking end of strings
3670                if (m_srcUtilCEBuffer_[soffset - 1]
3671                                        == CollationElementIterator.NULLORDER) {
3672                    if (m_tgtUtilCEBuffer_[toffset - 1] 
3673                        != CollationElementIterator.NULLORDER) {
3674                        return -1;
3675                    }
3676                    break;
3677                }
3678                else if (m_tgtUtilCEBuffer_[toffset - 1]
3679                            == CollationElementIterator.NULLORDER) {
3680                    return 1;
3681                }
3682            }
3683            else {
3684                if (m_srcUtilCEBuffer_[soffset - 1]
3685                                    == CollationElementIterator.NULLORDER) {
3686                    return -1;
3687                }
3688                if (m_tgtUtilCEBuffer_[soffset - 1]
3689                                    == CollationElementIterator.NULLORDER) {
3690                    return 1;
3691                }
3692                return (sorder < torder) ? -1 : 1;
3693            }
3694        }
3695        return 0;
3696    }
3697
3698    /**
3699     * Does tertiary strength comparison based on the collected ces.
3700     * @return the tertiary strength comparison result
3701     */
3702    private final int doTertiaryCompare()
3703    {
3704        int soffset = 0;
3705        int toffset = 0;
3706        while (true) {
3707            int sorder = CollationElementIterator.IGNORABLE;
3708            int torder = CollationElementIterator.IGNORABLE;
3709            while ((sorder & CE_REMOVE_CASE_)
3710                                == CollationElementIterator.IGNORABLE) {
3711                sorder = m_srcUtilCEBuffer_[soffset ++] & m_mask3_;
3712                if (!isContinuation(sorder)) {
3713                    sorder ^= m_caseSwitch_;
3714                }
3715                else {
3716                    sorder &= CE_REMOVE_CASE_;
3717                }
3718            }
3719
3720            while ((torder & CE_REMOVE_CASE_)
3721                                == CollationElementIterator.IGNORABLE) {
3722                torder = m_tgtUtilCEBuffer_[toffset ++] & m_mask3_;
3723                if (!isContinuation(torder)) {
3724                    torder ^= m_caseSwitch_;
3725                }
3726                else {
3727                    torder &= CE_REMOVE_CASE_;
3728                }
3729            }
3730
3731            if (sorder == torder) {
3732                if (m_srcUtilCEBuffer_[soffset - 1]
3733                                    == CollationElementIterator.NULLORDER) {
3734                    if (m_tgtUtilCEBuffer_[toffset - 1]
3735                        != CollationElementIterator.NULLORDER) {
3736                        return -1;
3737                    }
3738                    break;
3739                }
3740                else if (m_tgtUtilCEBuffer_[toffset - 1]
3741                            == CollationElementIterator.NULLORDER) {
3742                    return 1;
3743                }
3744            }
3745            else {
3746                if (m_srcUtilCEBuffer_[soffset - 1] ==
3747                                        CollationElementIterator.NULLORDER) {
3748                    return -1;
3749                }
3750                if (m_tgtUtilCEBuffer_[toffset - 1] ==
3751                            CollationElementIterator.NULLORDER) {
3752                    return 1;
3753                }
3754                return (sorder < torder) ? -1 : 1;
3755            }
3756        }
3757        return 0;
3758    }
3759
    /**
     * Does quaternary strength comparison based on the collected ces.
     * Both utility CE buffers are walked in parallel. Each selected element
     * is reduced to its bits above CE_PRIMARY_SHIFT_ before comparison.
     * @param lowestpvalue the lowest primary value that will not be ignored if
     *                      alternate handling is shifted
     * @return the quaternary strength comparison result: -1, 0 or 1
     */
    private final int doQuaternaryCompare(int lowestpvalue)
    {
        // "shifted" state of the most recent non-continuation element read
        // from each buffer; continuations are only kept while it is true
        boolean sShifted = true;
        boolean tShifted = true;
        int soffset = 0;
        int toffset = 0;
        while (true) {
            int sorder = CollationElementIterator.IGNORABLE;
            int torder = CollationElementIterator.IGNORABLE;
            // fetch the next source element, skipping ignorables and the
            // continuations that follow a non-shifted element
            while (sorder == CollationElementIterator.IGNORABLE
                    || (isContinuation(sorder) && !sShifted)) {
                sorder = m_srcUtilCEBuffer_[soffset ++];
                if (isContinuation(sorder)) {
                    if (!sShifted) {
                        // the loop condition rejects this element as well
                        continue;
                    }
                }
                else if (Utility.compareUnsigned(sorder, lowestpvalue) > 0
                            || (sorder & CE_PRIMARY_MASK_)
                                    == CollationElementIterator.IGNORABLE) {
                    // non continuation
                    // above lowestpvalue (unsigned) or without primary bits:
                    // not shifted, so every such element gets the same
                    // CE_PRIMARY_MASK_ value
                    sorder = CE_PRIMARY_MASK_;
                    sShifted = false;
                }
                else {
                    sShifted = true;
                }
            }
            sorder >>>= CE_PRIMARY_SHIFT_;
            // same selection for the target
            while (torder == CollationElementIterator.IGNORABLE
                    || (isContinuation(torder) && !tShifted)) {
                torder = m_tgtUtilCEBuffer_[toffset ++];
                if (isContinuation(torder)) {
                    if (!tShifted) {
                        continue;
                    }
                }
                else if (Utility.compareUnsigned(torder, lowestpvalue) > 0
                            || (torder & CE_PRIMARY_MASK_)
                                    == CollationElementIterator.IGNORABLE) {
                    // non continuation
                    torder = CE_PRIMARY_MASK_;
                    tShifted = false;
                }
                else {
                    tShifted = true;
                }
            }
            torder >>>= CE_PRIMARY_SHIFT_;

            // a NULLORDER in the slot just read marks the end of a buffer
            if (sorder == torder) {
                if (m_srcUtilCEBuffer_[soffset - 1]
                    == CollationElementIterator.NULLORDER) {
                    if (m_tgtUtilCEBuffer_[toffset - 1]
                        != CollationElementIterator.NULLORDER) {
                        // source ended first
                        return -1;
                    }
                    // both ended together: equal
                    break;
                }
                else if (m_tgtUtilCEBuffer_[toffset - 1]
                            == CollationElementIterator.NULLORDER) {
                    // target ended first
                    return 1;
                }
            }
            else {
                if (m_srcUtilCEBuffer_[soffset - 1] ==
                    CollationElementIterator.NULLORDER) {
                    return -1;
                }
                if (m_tgtUtilCEBuffer_[toffset - 1] ==
                    CollationElementIterator.NULLORDER) {
                    return 1;
                }
                return (sorder < torder) ? -1 : 1;
            }
        }
        return 0;
    }
3844
3845    /**
3846     * Internal function. Does byte level string compare. Used by strcoll if
3847     * strength == identical and strings are otherwise equal. This is a rare
3848     * case. Comparison must be done on NFD normalized strings. FCD is not good
3849     * enough.
3850     * @param source text
3851     * @param target text
3852     * @param offset of the first difference in the text strings
3853     * @param normalize flag indicating if we are to normalize the text before
3854     *              comparison
3855     * @return 1 if source is greater than target, -1 less than and 0 if equals
3856     */
3857    private static final int doIdenticalCompare(String   source, String   target,
3858                                                int offset, boolean normalize)
3859
3860    {
3861        if (normalize) {
3862            if (Normalizer.quickCheck(source, Normalizer.NFD,0)
3863                                                    != Normalizer.YES) {
3864                source = Normalizer.decompose(source, false);
3865            }
3866
3867            if (Normalizer.quickCheck(target, Normalizer.NFD,0)
3868                                                        != Normalizer.YES) {
3869                target = Normalizer.decompose(target, false);
3870            }
3871            offset = 0;
3872        }
3873
3874        return doStringCompare(source, target, offset);
3875    }
3876
3877    /**
3878     * Compares string for their codepoint order.
3879     * This comparison handles surrogate characters and place them after the
3880     * all non surrogate characters.
3881     * @param source text
3882     * @param target text
3883     * @param offset start offset for comparison
3884     * @return 1 if source is greater than target, -1 less than and 0 if equals
3885     */
3886    private static final int doStringCompare(String   source,
3887                                             String   target,
3888                                             int offset)
3889    {
3890        // compare identical prefixes - they do not need to be fixed up
3891        char schar = 0;
3892        char tchar = 0;
3893        int slength = source.length();
3894        int tlength = target.length();
3895        int minlength = Math.min(slength, tlength);
3896        while (offset < minlength) {
3897            schar = source.charAt(offset);
3898            tchar = target.charAt(offset ++);
3899            if (schar != tchar) {
3900                break;
3901            }
3902        }
3903
3904        if (schar == tchar && offset == minlength) {
3905            if (slength > minlength) {
3906                return 1;
3907            }
3908            if (tlength > minlength) {
3909                return -1;
3910            }
3911            return 0;
3912        }
3913
3914        //  if both values are in or above the surrogate range, Fix them up.
3915        if (schar >= UTF16.LEAD_SURROGATE_MIN_VALUE
3916            && tchar >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
3917            schar = fixupUTF16(schar);
3918            tchar = fixupUTF16(tchar);
3919        }
3920
3921        // now c1 and c2 are in UTF-32-compatible order
3922        return (schar < tchar) ? -1 : 1; // schar and tchar has to be different
3923    }
3924
3925    /**
3926     * Rotate surrogates to the top to get code point order
3927     */
3928    private static final char fixupUTF16(char ch)
3929    {
3930        if (ch >= 0xe000) {
3931            ch -= 0x800;
3932        }
3933        else {
3934            ch += 0x2000;
3935        }
3936        return ch;
3937    }
3938
3939    /**
3940     * Resets the internal case data members and compression values.
3941     */
3942    private void updateInternalState()
3943    {
3944        if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
3945            m_caseSwitch_ = CASE_SWITCH_;
3946        }
3947        else {
3948            m_caseSwitch_ = NO_CASE_SWITCH_;
3949        }
3950
3951        if (m_isCaseLevel_ || m_caseFirst_ == AttributeValue.OFF_) {
3952            m_mask3_ = CE_REMOVE_CASE_;
3953            m_common3_ = COMMON_NORMAL_3_;
3954            m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_;
3955            m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_;
3956            m_bottom3_ = COMMON_BOTTOM_3_;
3957        }
3958        else {
3959            m_mask3_ = CE_KEEP_CASE_;
3960            m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_;
3961            if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
3962                m_common3_ = COMMON_UPPER_FIRST_3_;
3963                m_top3_ = COMMON_TOP_CASE_SWITCH_UPPER_3_;
3964                m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_UPPER_3_;
3965            } else {
3966                m_common3_ = COMMON_NORMAL_3_;
3967                m_top3_ = COMMON_TOP_CASE_SWITCH_LOWER_3_;
3968                m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_LOWER_3_;
3969            }
3970        }
3971
3972        // Set the compression values
3973        int total3 = m_top3_ - COMMON_BOTTOM_3_ - 1;
3974        // we multilply double with int, but need only int
3975        m_topCount3_ = (int)(PROPORTION_3_ * total3);
3976        m_bottomCount3_ = total3 - m_topCount3_;
3977
3978        if (!m_isCaseLevel_ && getStrength() == AttributeValue.TERTIARY_
3979            && !m_isFrenchCollation_ && !m_isAlternateHandlingShifted_) {
3980            m_isSimple3_ = true;
3981        }
3982        else {
3983            m_isSimple3_ = false;
3984        }
3985        if(!m_isCaseLevel_ && getStrength() <= AttributeValue.TERTIARY_ && !m_isNumericCollation_
3986          && !m_isAlternateHandlingShifted_ && !latinOneFailed_) {
3987          if(latinOneCEs_ == null || latinOneRegenTable_) {
3988            if(setUpLatinOne()) { // if we succeed in building latin1 table, we'll use it
3989              latinOneUse_ = true;
3990            } else {
3991              latinOneUse_ = false;
3992              latinOneFailed_ = true;
3993            }
3994            latinOneRegenTable_ = false;
3995          } else { // latin1Table exists and it doesn't need to be regenerated, just use it
3996            latinOneUse_ = true;
3997          }
3998        } else {
3999          latinOneUse_ = false;
4000        }
4001
4002    }
4003
4004    /**
4005     * Initializes the RuleBasedCollator
4006     */
4007    private final void init()
4008    {
4009        for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_;
4010             m_minUnsafe_ ++) {
4011            // Find the smallest unsafe char.
4012            if (isUnsafe(m_minUnsafe_)) {
4013                break;
4014            }
4015        }
4016
4017        for (m_minContractionEnd_ = 0;
4018             m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_;
4019             m_minContractionEnd_ ++) {
4020            // Find the smallest contraction-ending char.
4021            if (isContractionEnd(m_minContractionEnd_)) {
4022                break;
4023            }
4024        }
4025        latinOneFailed_ = true;
4026        setStrength(m_defaultStrength_);
4027        setDecomposition(m_defaultDecomposition_);
4028        m_variableTopValue_ = m_defaultVariableTopValue_;
4029        m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
4030        m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
4031        m_isCaseLevel_ = m_defaultIsCaseLevel_;
4032        m_caseFirst_ = m_defaultCaseFirst_;
4033        m_isHiragana4_ = m_defaultIsHiragana4_;
4034        m_isNumericCollation_ = m_defaultIsNumericCollation_;
4035        latinOneFailed_ = false;
4036        updateInternalState();
4037    }
4038
4039    /**
4040     *  Initializes utility iterators and byte buffer used by compare
4041     */
4042    private final void initUtility(boolean allocate) {
4043        if (allocate) {
4044            if (m_srcUtilIter_ == null) {
4045                m_srcUtilIter_ = new StringUCharacterIterator();
4046                m_srcUtilColEIter_ = new CollationElementIterator(m_srcUtilIter_, this);
4047                m_tgtUtilIter_ = new StringUCharacterIterator();
4048                m_tgtUtilColEIter_ = new CollationElementIterator(m_tgtUtilIter_, this);
4049                m_utilBytes0_ = new byte[SORT_BUFFER_INIT_SIZE_CASE_]; // case
4050                m_utilBytes1_ = new byte[SORT_BUFFER_INIT_SIZE_1_]; // primary
4051                m_utilBytes2_ = new byte[SORT_BUFFER_INIT_SIZE_2_]; // secondary
4052                m_utilBytes3_ = new byte[SORT_BUFFER_INIT_SIZE_3_]; // tertiary
4053                m_utilBytes4_ = new byte[SORT_BUFFER_INIT_SIZE_4_];  // Quaternary
4054                m_srcUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
4055                m_tgtUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
4056            }
4057        } else {
4058            m_srcUtilIter_ = null;
4059            m_srcUtilColEIter_ = null;
4060            m_tgtUtilIter_ = null;
4061            m_tgtUtilColEIter_ = null;
4062            m_utilBytes0_ = null;
4063            m_utilBytes1_ = null;
4064            m_utilBytes2_ = null;
4065            m_utilBytes3_ = null;
4066            m_utilBytes4_ = null;
4067            m_srcUtilCEBuffer_ = null;
4068            m_tgtUtilCEBuffer_ = null;
4069        }
4070    }
4071
    // Constants for Latin-1 special processing
    // Highest char (inclusive) that gets its own slot in the Latin-1 table.
    private static final int ENDOFLATINONERANGE_ = 0xFF;
    // Initial length of one level of the Latin-1 table; the slots above
    // ENDOFLATINONERANGE_ are used by setUpLatinOne for contraction results.
    private static final int LATINONETABLELEN_   = (ENDOFLATINONERANGE_+50);
    // Value stored in a table slot whose weights could not be represented.
    private static final int BAIL_OUT_CE_        = 0xFF000000;
4076
    /**
     * Holder for the bit positions at which addLatinOneEntry places the next
     * primary, secondary and tertiary byte while the Latin-1 tables are
     * generated. Each value starts at 24 and is lowered by 8 for every byte
     * written, so a negative value means the 32-bit table entry of that
     * level is full.
     */
    private class shiftValues {
        int primShift = 24;
        int secShift = 24;
        int terShift = 24;
    }
4086
4087    private final void
4088    addLatinOneEntry(char ch, int CE, shiftValues sh) {
4089      int primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
4090      boolean reverseSecondary = false;
4091      if(!isContinuation(CE)) {
4092        tertiary = ((CE & m_mask3_));
4093        tertiary ^= m_caseSwitch_;
4094        reverseSecondary = true;
4095      } else {
4096        tertiary = (byte)((CE & CE_REMOVE_CONTINUATION_MASK_));
4097        tertiary &= CE_REMOVE_CASE_;
4098        reverseSecondary = false;
4099      }
4100
4101      secondary = ((CE >>>= 8) & LAST_BYTE_MASK_);
4102      primary2 =  ((CE >>>= 8) & LAST_BYTE_MASK_);
4103      primary1 =  (CE >>> 8);
4104
4105      if(primary1 != 0) {
4106        latinOneCEs_[ch] |= (primary1 << sh.primShift);
4107        sh.primShift -= 8;
4108      }
4109      if(primary2 != 0) {
4110        if(sh.primShift < 0) {
4111          latinOneCEs_[ch] = BAIL_OUT_CE_;
4112          latinOneCEs_[latinOneTableLen_+ch] = BAIL_OUT_CE_;
4113          latinOneCEs_[2*latinOneTableLen_+ch] = BAIL_OUT_CE_;
4114          return;
4115        }
4116        latinOneCEs_[ch] |= (primary2 << sh.primShift);
4117        sh.primShift -= 8;
4118      }
4119      if(secondary != 0) {
4120        if(reverseSecondary && m_isFrenchCollation_) { // reverse secondary
4121          latinOneCEs_[latinOneTableLen_+ch] >>>= 8; // make space for secondary
4122          latinOneCEs_[latinOneTableLen_+ch] |= (secondary << 24);
4123        } else { // normal case
4124          latinOneCEs_[latinOneTableLen_+ch] |= (secondary << sh.secShift);
4125        }
4126        sh.secShift -= 8;
4127      }
4128      if(tertiary != 0) {
4129        latinOneCEs_[2*latinOneTableLen_+ch] |= (tertiary << sh.terShift);
4130        sh.terShift -= 8;
4131      }
4132    }
4133
4134    private final void
4135    resizeLatinOneTable(int newSize) {
4136        int newTable[] = new int[3*newSize];
4137        int sizeToCopy = ((newSize<latinOneTableLen_)?newSize:latinOneTableLen_);
4138        //uprv_memset(newTable, 0, newSize*sizeof(uint32_t)*3); // automatically cleared.
4139        System.arraycopy(latinOneCEs_, 0, newTable, 0, sizeToCopy);
4140        System.arraycopy(latinOneCEs_, latinOneTableLen_, newTable, newSize, sizeToCopy);
4141        System.arraycopy(latinOneCEs_, 2*latinOneTableLen_, newTable, 2*newSize, sizeToCopy);
4142        latinOneTableLen_ = newSize;
4143        latinOneCEs_ = newTable;
4144    }
4145
    /**
     * Generates the Latin-1 tables used by the Latin-1 fast path.
     * For every char up to ENDOFLATINONERANGE_ the primary, secondary and
     * tertiary weights are packed into latinOneCEs_ (three consecutive
     * levels of latinOneTableLen_ entries each). Results of contractions are
     * stored in the slots above ENDOFLATINONERANGE_; the table is grown when
     * those run out and trimmed to the used size at the end.
     * @return true if the table was built; false (with latinOneFailed_ set)
     *         if a contraction element already has bits set where the
     *         Latin-1 offset must go, or a special element of a tag other
     *         than expansion, digit or contraction is found
     */
    private final boolean setUpLatinOne() {
      // start from an all-zero table, reusing the existing array if allowed
      if(latinOneCEs_ == null || m_reallocLatinOneCEs_) {
        latinOneCEs_ = new int[3*LATINONETABLELEN_];
        latinOneTableLen_ = LATINONETABLELEN_;
        m_reallocLatinOneCEs_ = false;
      } else {
        Arrays.fill(latinOneCEs_, 0);
      }
      if(m_ContInfo_ == null) {
        m_ContInfo_ = new ContractionInfo();
      }
      char ch = 0;
      // iterator used to expand expansion and digit elements below
      CollationElementIterator it = getCollationElementIterator("");

      shiftValues s = new shiftValues();
      int CE = 0;
      // next free slot for contraction results, just above the char slots
      char contractionOffset = ENDOFLATINONERANGE_+1;

      for(ch = 0; ch <= ENDOFLATINONERANGE_; ch++) {
        s.primShift = 24; s.secShift = 24; s.terShift = 24;
        if(ch < 0x100) {
          CE = m_trie_.getLatin1LinearValue(ch);
        } else {
          // not reachable with the current loop bound
          // (ENDOFLATINONERANGE_ is 0xFF)
          CE = m_trie_.getLeadValue(ch);
          if(CE == CollationElementIterator.CE_NOT_FOUND_) {
            CE = UCA_.m_trie_.getLeadValue(ch);
          }
        }
        if(!isSpecial(CE)) {
          addLatinOneEntry(ch, CE, s);
        } else {
          switch (RuleBasedCollator.getTag(CE)) {
          case CollationElementIterator.CE_EXPANSION_TAG_:
          case CollationElementIterator.CE_DIGIT_TAG_:
            // let the iterator produce the elements for this char and pack
            // them until one of the three levels has no room left
            it.setText(UCharacter.toString(ch));
            while((CE = it.next()) != CollationElementIterator.NULLORDER) {
              if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
                latinOneCEs_[ch] = BAIL_OUT_CE_;
                latinOneCEs_[latinOneTableLen_+ch] = BAIL_OUT_CE_;
                latinOneCEs_[2*latinOneTableLen_+ch] = BAIL_OUT_CE_;
                break;
              }
              addLatinOneEntry(ch, CE, s);
            }
            break;
          case CollationElementIterator.CE_CONTRACTION_TAG_:
            // here is the trick
            // F2 is contraction. We do something very similar to contractions
            // but have two indices, one in the real contraction table and the
            // other to where we stuffed things. This hopes that we don't have
            // many contractions (this should work for latin-1 tables).
            {
              // bits 12..23 must be free to hold the Latin-1 table offset
              if((CE & 0x00FFF000) != 0) {
                latinOneFailed_ = true;
                return false;
              }

              // index into the contraction tables
              int UCharOffset = (CE & 0xFFFFFF) - m_contractionOffset_;

              CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table

              // the char's own slots keep the tagged element on all levels
              latinOneCEs_[ch] = CE;
              latinOneCEs_[latinOneTableLen_+ch] = CE;
              latinOneCEs_[2*latinOneTableLen_+ch] = CE;

              // We're going to jump into contraction table, pick the elements
              // and use them; every contraction table entry gets one slot
              do {
                  CE = m_contractionCE_[UCharOffset];
                  if(isSpecial(CE) 
                     && getTag(CE) 
                               == CollationElementIterator.CE_EXPANSION_TAG_) {
                    int i;    /* general counter */
                    /* offset into the expansion table */
                    int offset = ((CE & 0xFFFFF0) >> 4) - m_expansionOffset_;
                    /* expansion count, 0 means zero-terminated */
                    int size = CE & 0xF;
                    if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
                      for(i = 0; i<size; i++) {
                        if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
                          latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
                          latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
                          latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
                          break;
                        }
                        addLatinOneEntry(contractionOffset, m_expansion_[offset+i], s);
                      }
                    } else { /* else, we do */
                      while(m_expansion_[offset] != 0) {
                        if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
                          latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
                          latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
                          latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
                          break;
                        }
                        addLatinOneEntry(contractionOffset, m_expansion_[offset++], s);
                      }
                    }
                    contractionOffset++;
                  } else if(!isSpecial(CE)) {
                    addLatinOneEntry(contractionOffset++, CE, s);
                  } else {
                      // any other special result cannot be represented
                      latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
                      latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
                      latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
                      contractionOffset++;
                  }
                  UCharOffset++;
                  // fresh shift positions for the next slot
                  s.primShift = 24; s.secShift = 24; s.terShift = 24;
                  if(contractionOffset == latinOneTableLen_) { // we need to reallocate
                   resizeLatinOneTable(2*latinOneTableLen_);
                  }
              } while(m_contractionIndex_[UCharOffset] != 0xFFFF); // 0xFFFF ends the entries of this contraction
            }
            break;
          default:
            // any other kind of special element is not supported here
            latinOneFailed_ = true;
            return false;
          }
        }
      }
      // compact table
      if(contractionOffset < latinOneTableLen_) {
        resizeLatinOneTable(contractionOffset);
      }
      return true;
    }
4279
    /**
     * Holder for a string position that getLatinOneContraction reads and
     * advances; compareUseLatin1 stores its current index here before the
     * call and reads the updated value back afterwards.
     */
    private class ContractionInfo {
        int index;
    }

    // Instance shared by the Latin-1 comparison code, created in
    // setUpLatinOne if it does not exist yet.
    ContractionInfo m_ContInfo_;
4285
    /**
     * Resolves a contraction element taken from the Latin-1 table.
     * The position of the next char to look at is taken from
     * m_ContInfo_.index, which is advanced past every char consumed here
     * (a matching contraction char, or skipped completely ignorable chars).
     * @param strength level of the Latin-1 table to read; used as a
     *                 multiplier of latinOneTableLen_
     * @param CE contraction element as stored by setUpLatinOne: the low 12
     *           bits locate the contraction in the contraction tables, bits
     *           12..23 hold the offset of its results in the Latin-1 table
     * @param s string being processed
     * @return the table entry for the matched contraction, the entry at the
     *         contraction's base offset if the following char does not
     *         continue it or the string has ended, or BAIL_OUT_CE_ if the
     *         following char is above ENDOFLATINONERANGE_
     */
    private int
    getLatinOneContraction(int strength, int CE, String   s) {
      int len = s.length();
      // index of this contraction in m_contractionIndex_
      int UCharOffset = (CE & 0xFFF) - m_contractionOffset_;
      // entry 0 is not compared, candidates start at 1; offset is not
      // reset when an ignorable char is skipped below
      int offset = 1;
      int latinOneOffset = (CE & 0x00FFF000) >>> 12;
      char schar = 0, tchar = 0;

      for(;;) {
          // the C original also handled zero-terminated strings (len == -1);
          // here the length is always known
          if(m_ContInfo_.index == len) {
            // end of string: nothing can continue the contraction
            return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset]);
          } else {
            schar = s.charAt(m_ContInfo_.index);
          }

        while(schar > (tchar = m_contractionIndex_[UCharOffset+offset])) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
          offset++;
        }

        if (schar == tchar) {
          // the char continues the contraction: consume it
          m_ContInfo_.index++;
          return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset+offset]);
        }
        else
        {
          if(schar  > ENDOFLATINONERANGE_ /*& 0xFF00*/) {
            return BAIL_OUT_CE_;
          }
          // skip completely ignorables
          int isZeroCE = m_trie_.getLeadValue(schar);
          if(isZeroCE == 0) { // we have to ignore completely ignorables
            m_ContInfo_.index++;
            continue;
          }

          return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset]);
        }
      }
    }
4337
4338
4339    /**
4340     * This is a fast strcoll, geared towards text in Latin-1.
4341     * It supports contractions of size two, French secondaries
4342     * and case switching. You can use it with strengths primary
4343     * to tertiary. It does not support shifted and case level.
     * It relies on the table built by setUpLatinOne. If it
4345     * doesn't understand something, it will go to the regular
4346     * strcoll.
4347     */
4348    private final int
4349    compareUseLatin1(String   source, String   target, int startOffset)
4350    {
4351        int sLen = source.length();
4352        int tLen = target.length();
4353
4354        int strength = getStrength();
4355
4356        int sIndex = startOffset, tIndex = startOffset;
4357        char sChar = 0, tChar = 0;
4358        int sOrder=0, tOrder=0;
4359
4360        boolean endOfSource = false;
4361
4362        //uint32_t *elements = coll->latinOneCEs;
4363
4364        boolean haveContractions = false; // if we have contractions in our string
4365                                        // we cannot do French secondary
4366
4367        int offset = latinOneTableLen_;
4368
4369        // Do the primary level
4370    primLoop:
4371        for(;;) {
4372          while(sOrder==0) { // this loop skips primary ignorables
4373            // sOrder=getNextlatinOneCE(source);
4374              if(sIndex==sLen) {
4375                endOfSource = true;
4376                break;
4377              }
4378              sChar=source.charAt(sIndex++); //[sIndex++];
4379            //}
4380            if(sChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
4381              //fprintf(stderr, "R");
4382              return compareRegular(source, target, startOffset);
4383            }
4384            sOrder = latinOneCEs_[sChar];
4385            if(isSpecial(sOrder)) { // if we got a special
4386              // specials can basically be either contractions or bail-out signs. If we get anything
4387              // else, we'll bail out anywasy
4388              if(getTag(sOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
4389                m_ContInfo_.index = sIndex;
4390                sOrder = getLatinOneContraction(0, sOrder, source);
4391                sIndex = m_ContInfo_.index;
4392                haveContractions = true; // if there are contractions, we cannot do French secondary
4393                // However, if there are contractions in the table, but we always use just one char,
4394                // we might be able to do French. This should be checked out.
4395              }
4396              if(isSpecial(sOrder) /*== UCOL_BAIL_OUT_CE*/) {
4397                //fprintf(stderr, "S");
4398                return compareRegular(source, target, startOffset);
4399              }
4400            }
4401          }
4402
4403          while(tOrder==0) {  // this loop skips primary ignorables
4404            // tOrder=getNextlatinOneCE(target);
4405            if(tIndex==tLen) {
4406              if(endOfSource) {
4407                break primLoop;
4408              } else {
4409                return 1;
4410              }
4411            }
4412            tChar=target.charAt(tIndex++); //[tIndex++];
4413            if(tChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
4414              //fprintf(stderr, "R");
4415              return compareRegular(source, target, startOffset);
4416            }
4417            tOrder = latinOneCEs_[tChar];
4418            if(isSpecial(tOrder)) {
4419              // Handling specials, see the comments for source
4420              if(getTag(tOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
4421                m_ContInfo_.index = tIndex;
4422                tOrder = getLatinOneContraction(0, tOrder, target);
4423                tIndex = m_ContInfo_.index;
4424                haveContractions = true;
4425              }
4426              if(isSpecial(tOrder)/*== UCOL_BAIL_OUT_CE*/) {
4427                //fprintf(stderr, "S");
4428                return compareRegular(source, target, startOffset);
4429              }
4430            }
4431          }
4432          if(endOfSource) { // source is finished, but target is not, say the result.
4433              return -1;
4434          }
4435
4436          if(sOrder == tOrder) { // if we have same CEs, we continue the loop
4437            sOrder = 0; tOrder = 0;
4438            continue;
4439          } else {
4440            // compare current top bytes
4441            if(((sOrder^tOrder)&0xFF000000)!=0) {
4442              // top bytes differ, return difference
4443              if(sOrder >>> 8 < tOrder >>> 8) {
4444                return -1;
4445              } else {
4446                return 1;
4447              }
4448              // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
4449              // since we must return enum value
4450            }
4451
4452            // top bytes match, continue with following bytes
4453            sOrder<<=8;
4454            tOrder<<=8;
4455          }
4456        }
4457
4458        // after primary loop, we definitely know the sizes of strings,
4459        // so we set it and use simpler loop for secondaries and tertiaries
4460        //sLen = sIndex; tLen = tIndex;
4461        if(strength >= SECONDARY) {
4462          // adjust the table beggining
4463          //latinOneCEs_ += coll->latinOneTableLen;
4464          endOfSource = false;
4465
4466          if(!m_isFrenchCollation_) { // non French
4467            // This loop is a simplified copy of primary loop
4468            // at this point we know that whole strings are latin-1, so we don't
4469            // check for that. We also know that we only have contractions as
4470            // specials.
4471            //sIndex = 0; tIndex = 0;
4472            sIndex = startOffset; tIndex = startOffset;
4473    secLoop:
4474            for(;;) {
4475              while(sOrder==0) {
4476                if(sIndex==sLen) {
4477                  endOfSource = true;
4478                  break;
4479                }
4480                sChar=source.charAt(sIndex++); //[sIndex++];
4481                sOrder = latinOneCEs_[offset+sChar];
4482                if(isSpecial(sOrder)) {
4483                    m_ContInfo_.index = sIndex;
4484                    sOrder = getLatinOneContraction(1, sOrder, source);
4485                    sIndex = m_ContInfo_.index;
4486                }
4487              }
4488
4489              while(tOrder==0) {
4490                if(tIndex==tLen) {
4491                  if(endOfSource) {
4492                    break secLoop;
4493                  } else {
4494                    return 1;
4495                  }
4496                }
4497                tChar=target.charAt(tIndex++); //[tIndex++];
4498                tOrder = latinOneCEs_[offset+tChar];
4499                if(isSpecial(tOrder)) {
4500                    m_ContInfo_.index = tIndex;
4501                    tOrder = getLatinOneContraction(1, tOrder, target);
4502                    tIndex = m_ContInfo_.index;
4503                }
4504              }
4505              if(endOfSource) {
4506                  return -1;
4507              }
4508
4509              if(sOrder == tOrder) {
4510                sOrder = 0; tOrder = 0;
4511                continue;
4512              } else {
4513                // see primary loop for comments on this
4514                if(((sOrder^tOrder)&0xFF000000)!=0) {
4515                  if(sOrder >>> 8 < tOrder >>> 8) {
4516                    return -1;
4517                  } else {
4518                    return 1;
4519                  }
4520                }
4521                sOrder<<=8;
4522                tOrder<<=8;
4523              }
4524            }
4525          } else { // French
4526            if(haveContractions) { // if we have contractions, we have to bail out
4527              // since we don't really know how to handle them here
4528              return compareRegular(source, target, startOffset);
4529            }
4530            // For French, we go backwards
4531            sIndex = sLen; tIndex = tLen;
4532    secFLoop:
4533            for(;;) {
4534              while(sOrder==0) {
4535                if(sIndex==startOffset) {
4536                  endOfSource = true;
4537                  break;
4538                }
4539                sChar=source.charAt(--sIndex); //[--sIndex];
4540                sOrder = latinOneCEs_[offset+sChar];
4541                // don't even look for contractions
4542              }
4543
4544              while(tOrder==0) {
4545                if(tIndex==startOffset) {
4546                  if(endOfSource) {
4547                    break secFLoop;
4548                  } else {
4549                    return 1;
4550                  }
4551                }
4552                tChar=target.charAt(--tIndex); //[--tIndex];
4553                tOrder = latinOneCEs_[offset+tChar];
4554                // don't even look for contractions
4555              }
4556              if(endOfSource) {
4557                  return -1;
4558              }
4559
4560              if(sOrder == tOrder) {
4561                sOrder = 0; tOrder = 0;
4562                continue;
4563              } else {
4564                // see the primary loop for comments
4565                if(((sOrder^tOrder)&0xFF000000)!=0) {
4566                  if(sOrder >>> 8 < tOrder >>> 8) {
4567                    return -1;
4568                  } else {
4569                    return 1;
4570                  }
4571                }
4572                sOrder<<=8;
4573                tOrder<<=8;
4574              }
4575            }
4576          }
4577        }
4578
4579        if(strength >= TERTIARY) {
4580          // tertiary loop is the same as secondary (except no French)
4581          offset += latinOneTableLen_;
4582          //sIndex = 0; tIndex = 0;
4583          sIndex = startOffset; tIndex = startOffset;
4584          endOfSource = false;
4585          for(;;) {
4586            while(sOrder==0) {
4587              if(sIndex==sLen) {
4588                endOfSource = true;
4589                break;
4590              }
4591              sChar=source.charAt(sIndex++); //[sIndex++];
4592              sOrder = latinOneCEs_[offset+sChar];
4593              if(isSpecial(sOrder)) {
4594                m_ContInfo_.index = sIndex;
4595                sOrder = getLatinOneContraction(2, sOrder, source);
4596                sIndex = m_ContInfo_.index;
4597              }
4598            }
4599            while(tOrder==0) {
4600              if(tIndex==tLen) {
4601                if(endOfSource) {
4602                  return 0; // if both strings are at the end, they are equal
4603                } else {
4604                  return 1;
4605                }
4606              }
4607              tChar=target.charAt(tIndex++); //[tIndex++];
4608              tOrder = latinOneCEs_[offset+tChar];
4609              if(isSpecial(tOrder)) {
4610                m_ContInfo_.index = tIndex;
4611                tOrder = getLatinOneContraction(2, tOrder, target);
4612                tIndex = m_ContInfo_.index;
4613              }
4614            }
4615            if(endOfSource) {
4616                return -1;
4617            }
4618            if(sOrder == tOrder) {
4619              sOrder = 0; tOrder = 0;
4620              continue;
4621            } else {
4622              if(((sOrder^tOrder)&0xff000000)!=0) {
4623                if(sOrder >>> 8 < tOrder >>> 8) {
4624                  return -1;
4625                } else {
4626                  return 1;
4627                }
4628              }
4629              sOrder<<=8;
4630              tOrder<<=8;
4631            }
4632          }
4633        }
4634        return 0;
4635    }
4636    /** 
4637     * Get the version of this collator object.
4638     * @return the version object associated with this collator
4639     * @stable ICU 2.8
4640     */
4641    public VersionInfo getVersion() {
4642        /* RunTime version  */
4643        int rtVersion = VersionInfo.UCOL_RUNTIME_VERSION.getMajor();
4644        /* Builder version*/
4645        int bdVersion = m_version_.getMajor();
4646
4647        /* Charset Version. Need to get the version from cnv files
4648         * makeconv should populate cnv files with version and
4649         * an api has to be provided in ucnv.h to obtain this version
4650         */
4651        int csVersion = 0;
4652
4653        /* combine the version info */
4654        int cmbVersion = ((rtVersion<<11) | (bdVersion<<6) | (csVersion)) & 0xFFFF;
4655        
4656        /* Tailoring rules */
4657        return VersionInfo.getInstance(cmbVersion>>8, 
4658                cmbVersion & 0xFF, 
4659                m_version_.getMinor(), 
4660                UCA_.m_UCA_version_.getMajor());
4661
4662//        versionInfo[0] = (uint8_t)(cmbVersion>>8);
4663//        versionInfo[1] = (uint8_t)cmbVersion;
4664//        versionInfo[2] = coll->image->version[1];
4665//        versionInfo[3] = coll->UCA->image->UCAVersion[0];
4666    }
4667    
4668    /** 
4669     * Get the UCA version of this collator object.
4670     * @return the version object associated with this collator
4671     * @stable ICU 2.8
4672     */
4673    public VersionInfo getUCAVersion() {
4674        return UCA_.m_UCA_version_;
4675    }
4676
4677    private transient boolean m_reallocLatinOneCEs_;
4678}
4679
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags