KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > CollationRuleParser


/**
*******************************************************************************
* Copyright (C) 1996-2006, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/

7 package com.ibm.icu.text;
8
9 import java.text.ParseException JavaDoc;
10 import java.util.Hashtable JavaDoc;
11 import java.util.Arrays JavaDoc;
12 import com.ibm.icu.lang.UCharacter;
13 import com.ibm.icu.impl.UCharacterProperty;
14
/**
 * Class for parsing collation rules, produces a list of tokens that will be
 * turned into collation elements
 * @author Syn Wee Quek
 * @since release 2.2, June 7 2002
 * @draft 2.2
 */

22 final class CollationRuleParser
23 {
24     // public data members ---------------------------------------------------
25

26     // package private constructors ------------------------------------------
27

28     /**
29      * <p>RuleBasedCollator constructor that takes the rules.
30      * Please see RuleBasedCollator class description for more details on the
31      * collation rule syntax.</p>
32      * @see java.util.Locale
33      * @param rules the collation rules to build the collation table from.
34      * @exception ParseException thrown when argument rules have an invalid
35      * syntax.
36      * @draft 2.2
37      */

38     CollationRuleParser(String JavaDoc rules) throws ParseException JavaDoc
39     {
40         extractSetsFromRules(rules);
41         m_source_ = new StringBuffer JavaDoc(Normalizer.decompose(rules, false).trim());
42         m_rules_ = m_source_.toString();
43         m_current_ = 0;
44         m_extraCurrent_ = m_source_.length();
45         m_variableTop_ = null;
46         m_parsedToken_ = new ParsedToken();
47         m_hashTable_ = new Hashtable JavaDoc();
48         m_options_ = new OptionSet(RuleBasedCollator.UCA_);
49         m_listHeader_ = new TokenListHeader[512];
50         m_resultLength_ = 0;
51         // call assembleTokenList() manually, so that we can
52
// init a parser and manually parse tokens
53
//assembleTokenList();
54
}
55
56     // package private inner classes -----------------------------------------
57

58     /**
59      * Collation options set
60      */

61     static class OptionSet
62     {
63         // package private constructor ---------------------------------------
64

65         /**
66          * Initializes the option set with the argument collators
67          * @param collator option to use
68          */

69         OptionSet(RuleBasedCollator collator)
70         {
71             m_variableTopValue_ = collator.m_variableTopValue_;
72             m_isFrenchCollation_ = collator.isFrenchCollation();
73             m_isAlternateHandlingShifted_
74                                    = collator.isAlternateHandlingShifted();
75             m_caseFirst_ = collator.m_caseFirst_;
76             m_isCaseLevel_ = collator.isCaseLevel();
77             m_decomposition_ = collator.getDecomposition();
78             m_strength_ = collator.getStrength();
79             m_isHiragana4_ = collator.m_isHiragana4_;
80         }
81
82         // package private data members --------------------------------------
83

84         int m_variableTopValue_;
85         boolean m_isFrenchCollation_;
86         /**
87          * Attribute for handling variable elements
88          */

89         boolean m_isAlternateHandlingShifted_;
90         /**
91          * who goes first, lower case or uppercase
92          */

93         int m_caseFirst_;
94         /**
95          * do we have an extra case level
96          */

97         boolean m_isCaseLevel_;
98         /**
99          * attribute for normalization
100          */

101         int m_decomposition_;
102         /**
103          * attribute for strength
104          */

105         int m_strength_;
106         /**
107          * attribute for special Hiragana
108          */

109         boolean m_isHiragana4_;
110     }
111
112     /**
113      * List of tokens used by the collation rules
114      */

115     static class TokenListHeader
116     {
117         Token m_first_;
118         Token m_last_;
119         Token m_reset_;
120         boolean m_indirect_;
121         int m_baseCE_;
122         int m_baseContCE_;
123         int m_nextCE_;
124         int m_nextContCE_;
125         int m_previousCE_;
126         int m_previousContCE_;
127         int m_pos_[] = new int[Collator.IDENTICAL + 1];
128         int m_gapsLo_[] = new int[3 * (Collator.TERTIARY + 1)];
129         int m_gapsHi_[] = new int[3 * (Collator.TERTIARY + 1)];
130         int m_numStr_[] = new int[3 * (Collator.TERTIARY + 1)];
131         Token m_fStrToken_[] = new Token[Collator.TERTIARY + 1];
132         Token m_lStrToken_[] = new Token[Collator.TERTIARY + 1];
133     }
134
135     /**
136      * Token wrapper for collation rules
137      */

138     static class Token
139     {
140        // package private data members ---------------------------------------
141

142        int m_CE_[];
143        int m_CELength_;
144        int m_expCE_[];
145        int m_expCELength_;
146        int m_source_;
147        int m_expansion_;
148        int m_prefix_;
149        int m_strength_;
150        int m_toInsert_;
151        int m_polarity_; // 1 for <, <<, <<<, , ; and 0 for >, >>, >>>
152
TokenListHeader m_listHeader_;
153        Token m_previous_;
154        Token m_next_;
155        StringBuffer JavaDoc m_rules_;
156        char m_flags_;
157
158        // package private constructors ---------------------------------------
159

160        Token()
161        {
162            m_CE_ = new int[128];
163            m_expCE_ = new int[128];
164            // TODO: this should also handle reverse
165
m_polarity_ = TOKEN_POLARITY_POSITIVE_;
166            m_next_ = null;
167            m_previous_ = null;
168            m_CELength_ = 0;
169            m_expCELength_ = 0;
170        }
171
172        // package private methods --------------------------------------------
173

174        /**
175         * Hashcode calculation for token
176         * @return the hashcode
177         */

178        public int hashCode()
179        {
180            int result = 0;
181            int len = (m_source_ & 0xFF000000) >>> 24;
182            int inc = ((len - 32) / 32) + 1;
183
184            int start = m_source_ & 0x00FFFFFF;
185            int limit = start + len;
186
187            while (start < limit) {
188                result = (result * 37) + m_rules_.charAt(start);
189                start += inc;
190            }
191            return result;
192        }
193
194        /**
195         * Equals calculation
196         * @param target object to compare
197         * @return true if target is the same as this object
198         */

199        public boolean equals(Object JavaDoc target)
200        {
201            if (target == this) {
202                return true;
203            }
204            if (target instanceof Token) {
205                Token t = (Token)target;
206                int sstart = m_source_ & 0x00FFFFFF;
207                int tstart = t.m_source_ & 0x00FFFFFF;
208                int slimit = (m_source_ & 0xFF000000) >> 24;
209                int tlimit = (m_source_ & 0xFF000000) >> 24;
210
211                int end = sstart + slimit - 1;
212
213                if (m_source_ == 0 || t.m_source_ == 0) {
214                    return false;
215                }
216                if (slimit != tlimit) {
217                    return false;
218                }
219                if (m_source_ == t.m_source_) {
220                    return true;
221                }
222
223                while (sstart < end
224                       && m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart))
225                {
226                    ++ sstart;
227                    ++ tstart;
228                }
229                if (m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart)) {
230                    return true;
231                }
232            }
233            return false;
234         }
235     }
236
237     // package private data member -------------------------------------------
238

239     /**
240      * Indicator that the token is resetted yet, ie & in the rules
241      */

242     static final int TOKEN_RESET_ = 0xDEADBEEF;
243
244     /**
245      * Size of the number of tokens
246      */

247     int m_resultLength_;
248     /**
249      * List of parsed tokens
250      */

251     TokenListHeader m_listHeader_[];
252     /**
253      * Variable top token
254      */

255     Token m_variableTop_;
256     /**
257      * Collation options
258      */

259     OptionSet m_options_;
260     /**
261      * Normalized collation rules with some extra characters
262      */

263     StringBuffer JavaDoc m_source_;
264     /**
265      * Hash table to keep all tokens
266      */

267     Hashtable JavaDoc m_hashTable_;
268
269     // package private method ------------------------------------------------
270

271     void setDefaultOptionsInCollator(RuleBasedCollator collator)
272     {
273         collator.m_defaultStrength_ = m_options_.m_strength_;
274         collator.m_defaultDecomposition_ = m_options_.m_decomposition_;
275         collator.m_defaultIsFrenchCollation_ = m_options_.m_isFrenchCollation_;
276         collator.m_defaultIsAlternateHandlingShifted_
277                                     = m_options_.m_isAlternateHandlingShifted_;
278         collator.m_defaultIsCaseLevel_ = m_options_.m_isCaseLevel_;
279         collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_;
280         collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_;
281         collator.m_defaultVariableTopValue_ = m_options_.m_variableTopValue_;
282     }
283
284     // private inner classes -------------------------------------------------
285

286     /**
287      * This is a token that has been parsed but not yet processed. Used to
288      * reduce the number of arguments in the parser
289      */

290     private static class ParsedToken
291     {
292         // private constructor ----------------------------------------------
293

294         /**
295          * Empty constructor
296          */

297         ParsedToken()
298         {
299             m_charsLen_ = 0;
300             m_charsOffset_ = 0;
301             m_extensionLen_ = 0;
302             m_extensionOffset_ = 0;
303             m_prefixLen_ = 0;
304             m_prefixOffset_ = 0;
305             m_flags_ = 0;
306             m_strength_ = TOKEN_UNSET_;
307         }
308
309         // private data members ---------------------------------------------
310

311         int m_strength_;
312         int m_charsOffset_;
313         int m_charsLen_;
314         int m_extensionOffset_;
315         int m_extensionLen_;
316         int m_prefixOffset_;
317         int m_prefixLen_;
318         char m_flags_;
319         char m_indirectIndex_;
320     }
321
322     /**
323      * Boundary wrappers
324      */

325     private static class IndirectBoundaries
326     {
327         // package private constructor ---------------------------------------
328

329         IndirectBoundaries(int startce[], int limitce[])
330         {
331             // Set values for the top - TODO: once we have values for all the
332
// indirects, we are going to initalize here.
333
m_startCE_ = startce[0];
334             m_startContCE_ = startce[1];
335             if (limitce != null) {
336                 m_limitCE_ = limitce[0];
337                 m_limitContCE_ = limitce[1];
338             }
339             else {
340                 m_limitCE_ = 0;
341                 m_limitContCE_ = 0;
342             }
343         }
344
345         // package private data members --------------------------------------
346

347         int m_startCE_;
348         int m_startContCE_;
349         int m_limitCE_;
350         int m_limitContCE_;
351     }
352
353     /**
354      * Collation option rule tag
355      */

356     private static class TokenOption
357     {
358         // package private constructor ---------------------------------------
359

360         TokenOption(String JavaDoc name, int attribute, String JavaDoc suboptions[],
361                     int suboptionattributevalue[])
362         {
363             m_name_ = name;
364             m_attribute_ = attribute;
365             m_subOptions_ = suboptions;
366             m_subOptionAttributeValues_ = suboptionattributevalue;
367         }
368
369         // package private data member ---------------------------------------
370

371         private String JavaDoc m_name_;
372         private int m_attribute_;
373         private String JavaDoc m_subOptions_[];
374         private int m_subOptionAttributeValues_[];
375     }
376
377     // private variables -----------------------------------------------------
378

379     /**
380      * Current parsed token
381      */

382     private ParsedToken m_parsedToken_;
383     /**
384      * Collation rule
385      */

386     private String JavaDoc m_rules_;
387     private int m_current_;
388     /**
389      * End of the option while reading.
390      * Need it for UnicodeSet reading support.
391      */

392     private int m_optionEnd_;
393     /**
394      * Current offset in m_source
395      */

396     private int m_sourceLimit_;
397     /**
398      * Offset to m_source_ ofr the extra expansion characters
399      */

400     private int m_extraCurrent_;
401
402     /**
403      * UnicodeSet that contains code points to be copied from the UCA
404      */

405     UnicodeSet m_copySet_;
406
407     /**
408      * UnicodeSet that contains code points for which we want to remove
409      * UCA contractions. It implies copying of these code points from
410      * the UCA.
411      */

412     UnicodeSet m_removeSet_;
413     /**
414      * This is space for the extra strings that need to be unquoted during the
415      * parsing of the rules
416      */

417     private static final int TOKEN_EXTRA_RULE_SPACE_SIZE_ = 2048;
418     /**
419      * Indicator that the token is not set yet
420      */

421     private static final int TOKEN_UNSET_ = 0xFFFFFFFF;
422     /**
423      * Indicator that the rule is in the > polarity, ie everything on the
424      * right of the rule is less than
425      */

426     private static final int TOKEN_POLARITY_NEGATIVE_ = 0;
427     /**
428      * Indicator that the rule is in the < polarity, ie everything on the
429      * right of the rule is greater than
430      */

431     private static final int TOKEN_POLARITY_POSITIVE_ = 1;
432     /**
433      * Flag mask to determine if top is set
434      */

435     private static final int TOKEN_TOP_MASK_ = 0x04;
436     /**
437      * Flag mask to determine if variable top is set
438      */

439     private static final int TOKEN_VARIABLE_TOP_MASK_ = 0x08;
440     /**
441      * Flag mask to determine if a before attribute is set
442      */

443     private static final int TOKEN_BEFORE_ = 0x03;
444     /**
445      * For use in parsing token options
446      */

447     private static final int TOKEN_SUCCESS_MASK_ = 0x10;
448
449     /**
450      * These values are used for finding CE values for indirect positioning.
451      * Indirect positioning is a mechanism for allowing resets on symbolic
452      * values. It only works for resets and you cannot tailor indirect names.
453      * An indirect name can define either an anchor point or a range. An anchor
454      * point behaves in exactly the same way as a code point in reset would,
455      * except that it cannot be tailored. A range (we currently only know for
456      * the [top] range will explicitly set the upper bound for generated CEs,
457      * thus allowing for better control over how many CEs can be squeezed
458      * between in the range without performance penalty. In that respect, we use
459      * [top] for tailoring of locales that use CJK characters. Other indirect
460      * values are currently a pure convenience, they can be used to assure that
461      * the CEs will be always positioned in the same place relative to a point
462      * with known properties (e.g. first primary ignorable).
463      */

464     private static final IndirectBoundaries INDIRECT_BOUNDARIES_[];
465
466     /**
467      * Inverse UCA constants
468      */

469     private static final int INVERSE_SIZE_MASK_ = 0xFFF00000;
470     private static final int INVERSE_OFFSET_MASK_ = 0x000FFFFF;
471     private static final int INVERSE_SHIFT_VALUE_ = 20;
472
473     /**
474      * Collation option tags
475      * [last variable] last variable value
476      * [last primary ignorable] largest CE for primary ignorable
477      * [last secondary ignorable] largest CE for secondary ignorable
478      * [last tertiary ignorable] largest CE for tertiary ignorable
479      * [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
480      */

481     private static final TokenOption RULES_OPTIONS_[];
482
483     static
484     {
485         INDIRECT_BOUNDARIES_ = new IndirectBoundaries[15];
486         // UCOL_RESET_TOP_VALUE
487
INDIRECT_BOUNDARIES_[0] = new IndirectBoundaries(
488                         RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
489                         RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
490         // UCOL_FIRST_PRIMARY_IGNORABLE
491
INDIRECT_BOUNDARIES_[1] = new IndirectBoundaries(
492                     RuleBasedCollator.UCA_CONSTANTS_.FIRST_PRIMARY_IGNORABLE_,
493                     null);
494         // UCOL_LAST_PRIMARY_IGNORABLE
495
INDIRECT_BOUNDARIES_[2] = new IndirectBoundaries(
496                     RuleBasedCollator.UCA_CONSTANTS_.LAST_PRIMARY_IGNORABLE_,
497                     null);
498
499         // UCOL_FIRST_SECONDARY_IGNORABLE
500
INDIRECT_BOUNDARIES_[3] = new IndirectBoundaries(
501                    RuleBasedCollator.UCA_CONSTANTS_.FIRST_SECONDARY_IGNORABLE_,
502                    null);
503         // UCOL_LAST_SECONDARY_IGNORABLE
504
INDIRECT_BOUNDARIES_[4] = new IndirectBoundaries(
505                    RuleBasedCollator.UCA_CONSTANTS_.LAST_SECONDARY_IGNORABLE_,
506                    null);
507         // UCOL_FIRST_TERTIARY_IGNORABLE
508
INDIRECT_BOUNDARIES_[5] = new IndirectBoundaries(
509                    RuleBasedCollator.UCA_CONSTANTS_.FIRST_TERTIARY_IGNORABLE_,
510                    null);
511         // UCOL_LAST_TERTIARY_IGNORABLE
512
INDIRECT_BOUNDARIES_[6] = new IndirectBoundaries(
513                    RuleBasedCollator.UCA_CONSTANTS_.LAST_TERTIARY_IGNORABLE_,
514                    null);
515         // UCOL_FIRST_VARIABLE;
516
INDIRECT_BOUNDARIES_[7] = new IndirectBoundaries(
517                    RuleBasedCollator.UCA_CONSTANTS_.FIRST_VARIABLE_,
518                    null);
519         // UCOL_LAST_VARIABLE
520
INDIRECT_BOUNDARIES_[8] = new IndirectBoundaries(
521                    RuleBasedCollator.UCA_CONSTANTS_.LAST_VARIABLE_,
522                    null);
523         // UCOL_FIRST_NON_VARIABLE
524
INDIRECT_BOUNDARIES_[9] = new IndirectBoundaries(
525                    RuleBasedCollator.UCA_CONSTANTS_.FIRST_NON_VARIABLE_,
526                    null);
527         // UCOL_LAST_NON_VARIABLE
528
INDIRECT_BOUNDARIES_[10] = new IndirectBoundaries(
529                    RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
530                    RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
531         // UCOL_FIRST_IMPLICIT
532
INDIRECT_BOUNDARIES_[11] = new IndirectBoundaries(
533                    RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_,
534                    null);
535         // UCOL_LAST_IMPLICIT
536
INDIRECT_BOUNDARIES_[12] = new IndirectBoundaries(
537                    RuleBasedCollator.UCA_CONSTANTS_.LAST_IMPLICIT_,
538                    RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_);
539         // UCOL_FIRST_TRAILING
540
INDIRECT_BOUNDARIES_[13] = new IndirectBoundaries(
541                    RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_,
542                    null);
543         // UCOL_LAST_TRAILING
544
INDIRECT_BOUNDARIES_[14] = new IndirectBoundaries(
545                    RuleBasedCollator.UCA_CONSTANTS_.LAST_TRAILING_,
546                    null);
547         INDIRECT_BOUNDARIES_[14].m_limitCE_
548                  = RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_SPECIAL_MIN_ << 24;
549
550         RULES_OPTIONS_ = new TokenOption[19];
551         String JavaDoc option[] = {"non-ignorable", "shifted"};
552         int value[] = {RuleBasedCollator.AttributeValue.NON_IGNORABLE_,
553                        RuleBasedCollator.AttributeValue.SHIFTED_};
554         RULES_OPTIONS_[0] = new TokenOption("alternate",
555                               RuleBasedCollator.Attribute.ALTERNATE_HANDLING_,
556                               option, value);
557         option = new String JavaDoc[1];
558         option[0] = "2";
559         value = new int[1];
560         value[0] = RuleBasedCollator.AttributeValue.ON_;
561         RULES_OPTIONS_[1] = new TokenOption("backwards",
562                                  RuleBasedCollator.Attribute.FRENCH_COLLATION_,
563                                  option, value);
564         String JavaDoc offonoption[] = new String JavaDoc[2];
565         offonoption[0] = "off";
566         offonoption[1] = "on";
567         int offonvalue[] = new int[2];
568         offonvalue[0] = RuleBasedCollator.AttributeValue.OFF_;
569         offonvalue[1] = RuleBasedCollator.AttributeValue.ON_;
570         RULES_OPTIONS_[2] = new TokenOption("caseLevel",
571                                        RuleBasedCollator.Attribute.CASE_LEVEL_,
572                                        offonoption, offonvalue);
573         option = new String JavaDoc[3];
574         option[0] = "lower";
575         option[1] = "upper";
576         option[2] = "off";
577         value = new int[3];
578         value[0] = RuleBasedCollator.AttributeValue.LOWER_FIRST_;
579         value[1] = RuleBasedCollator.AttributeValue.UPPER_FIRST_;
580         value[2] = RuleBasedCollator.AttributeValue.OFF_;
581         RULES_OPTIONS_[3] = new TokenOption("caseFirst",
582                                        RuleBasedCollator.Attribute.CASE_FIRST_,
583                                        option, value);
584         RULES_OPTIONS_[4] = new TokenOption("normalization",
585                                RuleBasedCollator.Attribute.NORMALIZATION_MODE_,
586                                offonoption, offonvalue);
587         RULES_OPTIONS_[5] = new TokenOption("hiraganaQ",
588                          RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_,
589                          offonoption, offonvalue);
590         option = new String JavaDoc[5];
591         option[0] = "1";
592         option[1] = "2";
593         option[2] = "3";
594         option[3] = "4";
595         option[4] = "I";
596         value = new int[5];
597         value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
598         value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
599         value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
600         value[3] = RuleBasedCollator.AttributeValue.QUATERNARY_;
601         value[4] = RuleBasedCollator.AttributeValue.IDENTICAL_;
602         RULES_OPTIONS_[6] = new TokenOption("strength",
603                                          RuleBasedCollator.Attribute.STRENGTH_,
604                                          option, value);
605         RULES_OPTIONS_[7] = new TokenOption("variable top",
606                                   RuleBasedCollator.Attribute.LIMIT_,
607                                   null, null);
608         RULES_OPTIONS_[8] = new TokenOption("rearrange",
609                                   RuleBasedCollator.Attribute.LIMIT_,
610                                   null, null);
611         option = new String JavaDoc[3];
612         option[0] = "1";
613         option[1] = "2";
614         option[2] = "3";
615         value = new int[3];
616         value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
617         value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
618         value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
619         RULES_OPTIONS_[9] = new TokenOption("before",
620                                   RuleBasedCollator.Attribute.LIMIT_,
621                                   option, value);
622         RULES_OPTIONS_[10] = new TokenOption("top",
623                                   RuleBasedCollator.Attribute.LIMIT_,
624                                   null, null);
625         String JavaDoc firstlastoption[] = new String JavaDoc[7];
626         firstlastoption[0] = "primary";
627         firstlastoption[1] = "secondary";
628         firstlastoption[2] = "tertiary";
629         firstlastoption[3] = "variable";
630         firstlastoption[4] = "regular";
631         firstlastoption[5] = "implicit";
632         firstlastoption[6] = "trailing";
633
634         int firstlastvalue[] = new int[7];
635         Arrays.fill(firstlastvalue, RuleBasedCollator.AttributeValue.PRIMARY_);
636
637         RULES_OPTIONS_[11] = new TokenOption("first",
638                                   RuleBasedCollator.Attribute.LIMIT_,
639                                   firstlastoption, firstlastvalue);
640         RULES_OPTIONS_[12] = new TokenOption("last",
641                                   RuleBasedCollator.Attribute.LIMIT_,
642                                   firstlastoption, firstlastvalue);
643         RULES_OPTIONS_[13] = new TokenOption("optimize",
644                                   RuleBasedCollator.Attribute.LIMIT_,
645                                   null, null);
646         RULES_OPTIONS_[14] = new TokenOption("suppressContractions",
647                                   RuleBasedCollator.Attribute.LIMIT_,
648                                   null, null);
649         RULES_OPTIONS_[15] = new TokenOption("undefined",
650                                   RuleBasedCollator.Attribute.LIMIT_,
651                                   null, null);
652         RULES_OPTIONS_[16] = new TokenOption("scriptOrder",
653                                   RuleBasedCollator.Attribute.LIMIT_,
654                                   null, null);
655         RULES_OPTIONS_[17] = new TokenOption("charsetname",
656                                   RuleBasedCollator.Attribute.LIMIT_,
657                                   null, null);
658         RULES_OPTIONS_[18] = new TokenOption("charset",
659                                   RuleBasedCollator.Attribute.LIMIT_,
660                                   null, null);
661     }
662
663     /**
664      * Utility data members
665      */

666     private Token m_utilToken_ = new Token();
667     private CollationElementIterator m_UCAColEIter_
668                       = RuleBasedCollator.UCA_.getCollationElementIterator("");
669     private int m_utilCEBuffer_[] = new int[2];
670
671     // private methods -------------------------------------------------------
672

673     /**
674      * Assembles the token list
675      * @exception ParseException thrown when rules syntax fails
676      */

677     int assembleTokenList() throws ParseException JavaDoc
678     {
679         Token lastToken = null;
680         m_parsedToken_.m_strength_ = TOKEN_UNSET_;
681         int sourcelimit = m_source_.length();
682         int expandNext = 0;
683
684         while (m_current_ < sourcelimit) {
685             m_parsedToken_.m_prefixOffset_ = 0;
686             if (parseNextToken(lastToken == null) < 0) {
687                 // we have reached the end
688
continue;
689             }
690             char specs = m_parsedToken_.m_flags_;
691             boolean variableTop = ((specs & TOKEN_VARIABLE_TOP_MASK_) != 0);
692             boolean top = ((specs & TOKEN_TOP_MASK_) != 0);
693             int lastStrength = TOKEN_UNSET_;
694             if (lastToken != null) {
695                 lastStrength = lastToken.m_strength_;
696             }
697             m_utilToken_.m_source_ = m_parsedToken_.m_charsLen_ << 24
698                                              | m_parsedToken_.m_charsOffset_;
699             m_utilToken_.m_rules_ = m_source_;
700             // 4 Lookup each source in the CharsToToken map, and find a
701
// sourcetoken
702
Token sourceToken = (Token)m_hashTable_.get(m_utilToken_);
703             if (m_parsedToken_.m_strength_ != TOKEN_RESET_) {
704                 if (lastToken == null) {
705                     // this means that rules haven't started properly
706
throwParseException(m_source_.toString(), 0);
707                 }
708                 // 6 Otherwise (when relation != reset)
709
if (sourceToken == null) {
710                     // If sourceToken is null, create new one
711
sourceToken = new Token();
712                      sourceToken.m_rules_ = m_source_;
713                     sourceToken.m_source_ = m_parsedToken_.m_charsLen_ << 24
714                                            | m_parsedToken_.m_charsOffset_;
715                     sourceToken.m_prefix_ = m_parsedToken_.m_prefixLen_ << 24
716                                            | m_parsedToken_.m_prefixOffset_;
717                     // TODO: this should also handle reverse
718
sourceToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
719                     sourceToken.m_next_ = null;
720                      sourceToken.m_previous_ = null;
721                     sourceToken.m_CELength_ = 0;
722                     sourceToken.m_expCELength_ = 0;
723                     m_hashTable_.put(sourceToken, sourceToken);
724                 }
725                 else {
726                     // we could have fished out a reset here
727
if (sourceToken.m_strength_ != TOKEN_RESET_
728                         && lastToken != sourceToken) {
729                         // otherwise remove sourceToken from where it was.
730
if (sourceToken.m_next_ != null) {
731                             if (sourceToken.m_next_.m_strength_
732                                                    > sourceToken.m_strength_) {
733                                 sourceToken.m_next_.m_strength_
734                                                    = sourceToken.m_strength_;
735                             }
736                             sourceToken.m_next_.m_previous_
737                                                     = sourceToken.m_previous_;
738                         }
739                         else {
740                             sourceToken.m_listHeader_.m_last_
741                                                     = sourceToken.m_previous_;
742                         }
743                         if (sourceToken.m_previous_ != null) {
744                             sourceToken.m_previous_.m_next_
745                                                         = sourceToken.m_next_;
746                         }
747                         else {
748                             sourceToken.m_listHeader_.m_first_
749                                                         = sourceToken.m_next_;
750                         }
751                         sourceToken.m_next_ = null;
752                         sourceToken.m_previous_ = null;
753                     }
754                 }
755                 sourceToken.m_strength_ = m_parsedToken_.m_strength_;
756                 sourceToken.m_listHeader_ = lastToken.m_listHeader_;
757
758                 // 1. Find the strongest strength in each list, and set
759
// strongestP and strongestN accordingly in the headers.
760
if (lastStrength == TOKEN_RESET_
761                     || sourceToken.m_listHeader_.m_first_ == null) {
762                     // If LAST is a reset insert sourceToken in the list.
763
if (sourceToken.m_listHeader_.m_first_ == null) {
764                         sourceToken.m_listHeader_.m_first_ = sourceToken;
765                         sourceToken.m_listHeader_.m_last_ = sourceToken;
766                     }
767                     else { // we need to find a place for us
768
// and we'll get in front of the same strength
769
if (sourceToken.m_listHeader_.m_first_.m_strength_
770                                                  <= sourceToken.m_strength_) {
771                             sourceToken.m_next_
772                                           = sourceToken.m_listHeader_.m_first_;
773                             sourceToken.m_next_.m_previous_ = sourceToken;
774                             sourceToken.m_listHeader_.m_first_ = sourceToken;
775                             sourceToken.m_previous_ = null;
776                         }
777                         else {
778                             lastToken = sourceToken.m_listHeader_.m_first_;
779                             while (lastToken.m_next_ != null
780                                    && lastToken.m_next_.m_strength_
781                                                  > sourceToken.m_strength_) {
782                                 lastToken = lastToken.m_next_;
783                             }
784                             if (lastToken.m_next_ != null) {
785                                 lastToken.m_next_.m_previous_ = sourceToken;
786                             }
787                             else {
788                                 sourceToken.m_listHeader_.m_last_
789                                                                = sourceToken;
790                             }
791                             sourceToken.m_previous_ = lastToken;
792                             sourceToken.m_next_ = lastToken.m_next_;
793                             lastToken.m_next_ = sourceToken;
794                         }
795                     }
796                 }
797                 else {
798                     // Otherwise (when LAST is not a reset)
799
// if polarity (LAST) == polarity(relation), insert
800
// sourceToken after LAST, otherwise insert before.
801
// when inserting after or before, search to the next
802
// position with the same strength in that direction.
803
// (This is called postpone insertion).
804
if (sourceToken != lastToken) {
805                         if (lastToken.m_polarity_ == sourceToken.m_polarity_) {
806                             while (lastToken.m_next_ != null
807                                    && lastToken.m_next_.m_strength_
808                                                    > sourceToken.m_strength_) {
809                                 lastToken = lastToken.m_next_;
810                             }
811                             sourceToken.m_previous_ = lastToken;
812                             if (lastToken.m_next_ != null) {
813                                 lastToken.m_next_.m_previous_ = sourceToken;
814                             }
815                             else {
816                                 sourceToken.m_listHeader_.m_last_ = sourceToken;
817                             }
818                             sourceToken.m_next_ = lastToken.m_next_;
819                             lastToken.m_next_ = sourceToken;
820                         }
821                         else {
822                             while (lastToken.m_previous_ != null
823                                    && lastToken.m_previous_.m_strength_
824                                                 > sourceToken.m_strength_) {
825                                 lastToken = lastToken.m_previous_;
826                             }
827                             sourceToken.m_next_ = lastToken;
828                             if (lastToken.m_previous_ != null) {
829                                 lastToken.m_previous_.m_next_ = sourceToken;
830                             }
831                             else {
832                                 sourceToken.m_listHeader_.m_first_
833                                                                  = sourceToken;
834                             }
835                             sourceToken.m_previous_ = lastToken.m_previous_;
836                             lastToken.m_previous_ = sourceToken;
837                         }
838                     }
839                     else { // repeated one thing twice in rules, stay with the
840
// stronger strength
841
if (lastStrength < sourceToken.m_strength_) {
842                             sourceToken.m_strength_ = lastStrength;
843                         }
844                     }
845                 }
846                 // if the token was a variable top, we're gonna put it in
847
if (variableTop == true && m_variableTop_ == null) {
848                     variableTop = false;
849                     m_variableTop_ = sourceToken;
850                 }
851                 // Treat the expansions.
852
// There are two types of expansions: explicit (x / y) and
853
// reset based propagating expansions
854
// (&abc * d * e <=> &ab * d / c * e / c)
855
// if both of them are in effect for a token, they are combined.
856
sourceToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24
857                                           | m_parsedToken_.m_extensionOffset_;
858                if (expandNext != 0) {
859                    if (sourceToken.m_strength_ == RuleBasedCollator.PRIMARY) {
860                        // primary strength kills off the implicit expansion
861
expandNext = 0;
862                    }
863                    else if (sourceToken.m_expansion_ == 0) {
864                        // if there is no expansion, implicit is just added to
865
// the token
866
sourceToken.m_expansion_ = expandNext;
867                    }
868                    else {
869                        // there is both explicit and implicit expansion.
870
// We need to make a combination
871
int start = expandNext & 0xFFFFFF;
872                        int size = expandNext >>> 24;
873                        if (size > 0) {
874                           m_source_.append(m_source_.substring(start,
875                                                                start + size));
876                        }
877                           start = m_parsedToken_.m_extensionOffset_;
878                        m_source_.append(m_source_.substring(start,
879                                       start + m_parsedToken_.m_extensionLen_));
880                        sourceToken.m_expansion_ = (size
881                                        + m_parsedToken_.m_extensionLen_) << 24
882                                        | m_extraCurrent_;
883                        m_extraCurrent_ += size + m_parsedToken_.m_extensionLen_;
884                    }
885                 }
886                // if the previous token was a reset before, the strength of this
887
// token must match the strength of before. Otherwise we have an
888
// undefined situation.
889
// In other words, we currently have a cludge which we use to
890
// represent &a >> x. This is written as &[before 2]a << x.
891
if((lastToken.m_flags_ & TOKEN_BEFORE_) != 0) {
892                    int beforeStrength = (lastToken.m_flags_ & TOKEN_BEFORE_) - 1;
893                    if(beforeStrength != sourceToken.m_strength_) {
894                           throwParseException(m_source_.toString(), m_current_);
895                    }
896                }
897
898             }
899             else {
900                 if (lastToken != null && lastStrength == TOKEN_RESET_) {
901                     // if the previous token was also a reset, this means that
902
// we have two consecutive resets and we want to remove the
903
// previous one if empty
904
if (m_resultLength_ > 0 && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {
905                         m_resultLength_ --;
906                     }
907                 }
908                 if (sourceToken == null) {
909                     // this is a reset, but it might still be somewhere in the
910
// tailoring, in shorter form
911
int searchCharsLen = m_parsedToken_.m_charsLen_;
912                     while (searchCharsLen > 1 && sourceToken == null) {
913                         searchCharsLen --;
914                         // key = searchCharsLen << 24 | charsOffset;
915
m_utilToken_.m_source_ = searchCharsLen << 24
916                                              | m_parsedToken_.m_charsOffset_;
917                         m_utilToken_.m_rules_ = m_source_;
918                         sourceToken = (Token)m_hashTable_.get(m_utilToken_);
919                     }
920                     if (sourceToken != null) {
921                         expandNext = (m_parsedToken_.m_charsLen_
922                                                       - searchCharsLen) << 24
923                                         | (m_parsedToken_.m_charsOffset_
924                                            + searchCharsLen);
925                     }
926                 }
927                 if ((specs & TOKEN_BEFORE_) != 0) {
928                     if (top == false) {
929                         // we're doing before & there is no indirection
930
int strength = (specs & TOKEN_BEFORE_) - 1;
931                         if (sourceToken != null
932                             && sourceToken.m_strength_ != TOKEN_RESET_) {
933                             // this is a before that is already ordered in the UCA
934
// - so we need to get the previous with good strength
935
while (sourceToken.m_strength_ > strength
936                                    && sourceToken.m_previous_ != null) {
937                                 sourceToken = sourceToken.m_previous_;
938                             }
939                             // here, either we hit the strength or NULL
940
if (sourceToken.m_strength_ == strength) {
941                                 if (sourceToken.m_previous_ != null) {
942                                     sourceToken = sourceToken.m_previous_;
943                                 }
944                                 else { // start of list
945
sourceToken
946                                          = sourceToken.m_listHeader_.m_reset_;
947                                 }
948                             }
949                             else { // we hit NULL, we should be doing the else part
950
sourceToken
951                                          = sourceToken.m_listHeader_.m_reset_;
952                                 sourceToken = getVirginBefore(sourceToken,
953                                                               strength);
954                             }
955                         }
956                         else {
957                             sourceToken
958                                       = getVirginBefore(sourceToken, strength);
959                         }
960                     }
961                     else {
962                         // this is both before and indirection
963
top = false;
964                         m_listHeader_[m_resultLength_] = new TokenListHeader();
965                         m_listHeader_[m_resultLength_].m_previousCE_ = 0;
966                         m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
967                         m_listHeader_[m_resultLength_].m_indirect_ = true;
968                         // we need to do slightly more work. we need to get the
969
// baseCE using the inverse UCA & getPrevious. The next
970
// bound is not set, and will be decided in ucol_bld
971
int strength = (specs & TOKEN_BEFORE_) - 1;
972                         int baseCE = INDIRECT_BOUNDARIES_[
973                                    m_parsedToken_.m_indirectIndex_].m_startCE_;
974                         int baseContCE = INDIRECT_BOUNDARIES_[
975                                m_parsedToken_.m_indirectIndex_].m_startContCE_;
976                         int ce[] = new int[2];
977                         if((baseCE >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)
978                         && (baseCE >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */
979                             int primary = baseCE & RuleBasedCollator.CE_PRIMARY_MASK_ | (baseContCE & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
980                             int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);
981                             int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1);
982                             ce[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
983                             ce[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;
984                         } else {
985                             CollationParsedRuleBuilder.InverseUCA invuca
986                                 = CollationParsedRuleBuilder.INVERSE_UCA_;
987                             invuca.getInversePrevCE(baseCE, baseContCE, strength,
988                                     ce);
989                         }
990                         m_listHeader_[m_resultLength_].m_baseCE_ = ce[0];
991                         m_listHeader_[m_resultLength_].m_baseContCE_ = ce[1];
992                         m_listHeader_[m_resultLength_].m_nextCE_ = 0;
993                         m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
994
995                         sourceToken = new Token();
996                         expandNext = initAReset(0, sourceToken);
997                     }
998                 }
999                 // 5 If the relation is a reset:
1000
// If sourceToken is null
1001
// Create new list, create new sourceToken, make the baseCE
1002
// from source, put the sourceToken in ListHeader of the new
1003
// list
1004
if (sourceToken == null) {
1005                    if (m_listHeader_[m_resultLength_] == null) {
1006                        m_listHeader_[m_resultLength_] = new TokenListHeader();
1007                    }
1008                    // 3 Consider each item: relation, source, and expansion:
1009
// e.g. ...< x / y ...
1010
// First convert all expansions into normal form.
1011
// Examples:
1012
// If "xy" doesn't occur earlier in the list or in the UCA,
1013
// convert &xy * c * d * ... into &x * c/y * d * ...
1014
// Note: reset values can never have expansions, although
1015
// they can cause the very next item to have one. They may
1016
// be contractions, if they are found earlier in the list.
1017
if (top == false) {
1018                        CollationElementIterator coleiter
1019                        = RuleBasedCollator.UCA_.getCollationElementIterator(
1020                            m_source_.substring(m_parsedToken_.m_charsOffset_,
1021                                                m_parsedToken_.m_charsOffset_
1022                                                + m_parsedToken_.m_charsLen_));
1023
1024                        int CE = coleiter.next();
1025                        // offset to the character in the full rule string
1026
int expand = coleiter.getOffset()
1027                                     + m_parsedToken_.m_charsOffset_;
1028                        int SecondCE = coleiter.next();
1029
1030                        m_listHeader_[m_resultLength_].m_baseCE_
1031                                                             = CE & 0xFFFFFF3F;
1032                        if (RuleBasedCollator.isContinuation(SecondCE)) {
1033                            m_listHeader_[m_resultLength_].m_baseContCE_
1034                                                                    = SecondCE;
1035                        }
1036                        else {
1037                            m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
1038                        }
1039                        m_listHeader_[m_resultLength_].m_nextCE_ = 0;
1040                        m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
1041                        m_listHeader_[m_resultLength_].m_previousCE_ = 0;
1042                        m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
1043                        m_listHeader_[m_resultLength_].m_indirect_ = false;
1044                        sourceToken = new Token();
1045                        expandNext = initAReset(expand, sourceToken);
1046                    }
1047                    else { // top == TRUE
1048
top = false;
1049                        m_listHeader_[m_resultLength_].m_previousCE_ = 0;
1050                        m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
1051                        m_listHeader_[m_resultLength_].m_indirect_ = true;
1052                        IndirectBoundaries ib = INDIRECT_BOUNDARIES_[
1053                                              m_parsedToken_.m_indirectIndex_];
1054                        m_listHeader_[m_resultLength_].m_baseCE_
1055                                                               = ib.m_startCE_;
1056                        m_listHeader_[m_resultLength_].m_baseContCE_
1057                                                           = ib.m_startContCE_;
1058                        m_listHeader_[m_resultLength_].m_nextCE_
1059                                                               = ib.m_limitCE_;
1060                        m_listHeader_[m_resultLength_].m_nextContCE_
1061                                                           = ib.m_limitContCE_;
1062                        sourceToken = new Token();
1063                        expandNext = initAReset(0, sourceToken);
1064                    }
1065                }
1066                else { // reset to something already in rules
1067
top = false;
1068                }
1069            }
1070            // 7 After all this, set LAST to point to sourceToken, and goto
1071
// step 3.
1072
lastToken = sourceToken;
1073        }
1074
1075        if (m_resultLength_ > 0
1076            && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {
1077            m_resultLength_ --;
1078        }
1079        return m_resultLength_;
1080    }
1081
1082    /**
1083     * Formats and throws a ParseException
1084     * @param rules collation rule that failed
1085     * @param offset failed offset in rules
1086     * @throws ParseException with failure information
1087     */

1088    private static final void throwParseException(String JavaDoc rules, int offset)
1089                                                          throws ParseException JavaDoc
1090    {
1091        // for pre-context
1092
String JavaDoc precontext = rules.substring(0, offset);
1093        String JavaDoc postcontext = rules.substring(offset, rules.length());
1094        StringBuffer JavaDoc error = new StringBuffer JavaDoc(
1095                                    "Parse error occurred in rule at offset ");
1096        error.append(offset);
1097        error.append("\n after the prefix \"");
1098        error.append(precontext);
1099        error.append("\" before the suffix \"");
1100        error.append(postcontext);
1101        throw new ParseException JavaDoc(error.toString(), offset);
1102    }
1103
1104    private final boolean doSetTop() {
1105        m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
1106        m_source_.append((char)0xFFFE);
1107        IndirectBoundaries ib =
1108                  INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_];
1109        m_source_.append((char)(ib.m_startCE_ >> 16));
1110        m_source_.append((char)(ib.m_startCE_ & 0xFFFF));
1111        m_extraCurrent_ += 3;
1112        if (INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_
1113                                                       ].m_startContCE_ == 0) {
1114            m_parsedToken_.m_charsLen_ = 3;
1115        }
1116        else {
1117            m_source_.append((char)(INDIRECT_BOUNDARIES_[
1118                                        m_parsedToken_.m_indirectIndex_
1119                                    ].m_startContCE_ >> 16));
1120            m_source_.append((char)(INDIRECT_BOUNDARIES_[
1121                                        m_parsedToken_.m_indirectIndex_
1122                                    ].m_startContCE_ & 0xFFFF));
1123            m_extraCurrent_ += 2;
1124            m_parsedToken_.m_charsLen_ = 5;
1125        }
1126        return true;
1127    }
1128
1129    private static boolean isCharNewLine(char c) {
1130        switch (c) {
1131        case 0x000A: /* LF */
1132        case 0x000D: /* CR */
1133        case 0x000C: /* FF */
1134        case 0x0085: /* NEL */
1135        case 0x2028: /* LS */
1136        case 0x2029: /* PS */
1137            return true;
1138        default:
1139            return false;
1140        }
1141    }
1142
1143    /**
1144     * Getting the next token
1145     *
1146     * @param startofrules
1147     * flag indicating if we are at the start of rules
1148     * @return the offset of the rules
1149     * @exception ParseException
1150     * thrown when rule parsing fails
1151     */

1152    private int parseNextToken(boolean startofrules) throws ParseException JavaDoc
1153    {
1154        // parsing part
1155
boolean variabletop = false;
1156        boolean top = false;
1157        boolean inchars = true;
1158        boolean inquote = false;
1159        boolean wasinquote = false;
1160        byte before = 0;
1161        boolean isescaped = false;
1162        int /*newcharslen = 0,*/ newextensionlen = 0;
1163        int /*charsoffset = 0,*/ extensionoffset = 0;
1164        int newstrength = TOKEN_UNSET_;
1165
1166        m_parsedToken_.m_charsLen_ = 0;
1167        m_parsedToken_.m_charsOffset_ = 0;
1168        m_parsedToken_.m_prefixOffset_ = 0;
1169        m_parsedToken_.m_prefixLen_ = 0;
1170        m_parsedToken_.m_indirectIndex_ = 0;
1171
1172        int limit = m_rules_.length();
1173        while (m_current_ < limit) {
1174            char ch = m_source_.charAt(m_current_);
1175            if (inquote) {
1176                if (ch == 0x0027) { // '\''
1177
inquote = false;
1178                }
1179                else {
1180                    if ((m_parsedToken_.m_charsLen_ == 0) || inchars) {
1181                         if (m_parsedToken_.m_charsLen_ == 0) {
1182                             m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
1183                         }
1184                         m_parsedToken_.m_charsLen_ ++;
1185                    }
1186                    else {
1187                        if (newextensionlen == 0) {
1188                            extensionoffset = m_extraCurrent_;
1189                        }
1190                        newextensionlen ++;
1191                    }
1192                }
1193            }
1194            else if (isescaped) {
1195                isescaped = false;
1196                if (newstrength == TOKEN_UNSET_) {
1197                    throwParseException(m_rules_, m_current_);
1198                }
1199                if (ch != 0 && m_current_ != limit) {
1200                    if (inchars) {
1201                        if (m_parsedToken_.m_charsLen_ == 0) {
1202                            m_parsedToken_.m_charsOffset_ = m_current_;
1203                        }
1204                        m_parsedToken_.m_charsLen_ ++;
1205                    }
1206                    else {
1207                        if (newextensionlen == 0) {
1208                            extensionoffset = m_current_;
1209                        }
1210                        newextensionlen ++;
1211                    }
1212                }
1213            }
1214            else {
1215                if (!UCharacterProperty.isRuleWhiteSpace(ch)) {
1216                    // Sets the strength for this entry
1217
switch (ch) {
1218                    case 0x003D : // '='
1219
if (newstrength != TOKEN_UNSET_) {
1220                            return doEndParseNextToken(newstrength,
1221                                                       top,
1222                                                       extensionoffset,
1223                                                       newextensionlen,
1224                                                       variabletop, before);
1225                        }
1226                        // if we start with strength, we'll reset to top
1227
if (startofrules == true) {
1228                            m_parsedToken_.m_indirectIndex_ = 5;
1229                            top = doSetTop();
1230                            return doEndParseNextToken(TOKEN_RESET_,
1231                                                       top,
1232                                                       extensionoffset,
1233                                                       newextensionlen,
1234                                                       variabletop, before);
1235                        }
1236                        newstrength = Collator.IDENTICAL;
1237                        break;
1238                    case 0x002C : // ','
1239
if (newstrength != TOKEN_UNSET_) {
1240                            return doEndParseNextToken(newstrength,
1241                                                       top,
1242                                                       extensionoffset,
1243                                                       newextensionlen,
1244                                                       variabletop, before);
1245                        }
1246                        // if we start with strength, we'll reset to top
1247
if (startofrules == true) {
1248                            m_parsedToken_.m_indirectIndex_ = 5;
1249                            top = doSetTop();
1250                            return doEndParseNextToken(TOKEN_RESET_,
1251                                                       top,
1252                                                       extensionoffset,
1253                                                       newextensionlen,
1254                                                       variabletop, before);
1255                        }
1256                        newstrength = Collator.TERTIARY;
1257                        break;
1258                    case 0x003B : // ';'
1259
if (newstrength != TOKEN_UNSET_) {
1260                            return doEndParseNextToken(newstrength,
1261                                                       top,
1262                                                       extensionoffset,
1263                                                       newextensionlen,
1264                                                       variabletop, before);
1265                        }
1266                        // if we start with strength, we'll reset to top
1267
if (startofrules == true) {
1268                            m_parsedToken_.m_indirectIndex_ = 5;
1269                            top = doSetTop();
1270                            return doEndParseNextToken(TOKEN_RESET_,
1271                                                       top,
1272                                                       extensionoffset,
1273                                                       newextensionlen,
1274                                                       variabletop, before);
1275                        }
1276                        newstrength = Collator.SECONDARY;
1277                        break;
1278                    case 0x003C : // '<'
1279
if (newstrength != TOKEN_UNSET_) {
1280                            return doEndParseNextToken(newstrength,
1281                                                       top,
1282                                                       extensionoffset,
1283                                                       newextensionlen,
1284                                                       variabletop, before);
1285                        }
1286                        // if we start with strength, we'll reset to top
1287
if (startofrules == true) {
1288                            m_parsedToken_.m_indirectIndex_ = 5;
1289                            top = doSetTop();
1290                            return doEndParseNextToken(TOKEN_RESET_,
1291                                                       top,
1292                                                       extensionoffset,
1293                                                       newextensionlen,
1294                                                       variabletop, before);
1295                        }
1296                        // before this, do a scan to verify whether this is
1297
// another strength
1298
if (m_source_.charAt(m_current_ + 1) == 0x003C) {
1299                            m_current_ ++;
1300                            if (m_source_.charAt(m_current_ + 1) == 0x003C) {
1301                                m_current_ ++; // three in a row!
1302
newstrength = Collator.TERTIARY;
1303                            }
1304                            else { // two in a row
1305
newstrength = Collator.SECONDARY;
1306                            }
1307                        }
1308                        else { // just one
1309
newstrength = Collator.PRIMARY;
1310                        }
1311                        break;
1312                    case 0x0026 : // '&'
1313
if (newstrength != TOKEN_UNSET_) {
1314                            return doEndParseNextToken(newstrength,
1315                                                       top,
1316                                                       extensionoffset,
1317                                                       newextensionlen,
1318                                                       variabletop, before);
1319                        }
1320                        newstrength = TOKEN_RESET_; // PatternEntry::RESET = 0
1321
break;
1322                    case 0x005b : // '['
1323
// options - read an option, analyze it
1324
m_optionEnd_ = m_rules_.indexOf(0x005d, m_current_);
1325                        if (m_optionEnd_ != -1) { // ']'
1326
byte result = readAndSetOption();
1327                            m_current_ = m_optionEnd_;
1328                            if ((result & TOKEN_TOP_MASK_) != 0) {
1329                                if (newstrength == TOKEN_RESET_) {
1330                                    top = doSetTop();
1331                                    if (before != 0) {
1332                                        // This is a combination of before and
1333
// indirection like
1334
// '&[before 2][first regular]<b'
1335
m_source_.append((char)0x002d);
1336                                        m_source_.append((char)before);
1337                                        m_extraCurrent_ += 2;
1338                                        m_parsedToken_.m_charsLen_ += 2;
1339                                    }
1340                                    m_current_ ++;
1341                                    return doEndParseNextToken(newstrength,
1342                                                       true,
1343                                                       extensionoffset,
1344                                                       newextensionlen,
1345                                                       variabletop, before);
1346                                }
1347                                else {
1348                                    throwParseException(m_rules_, m_current_);
1349                                }
1350                            }
1351                            else if ((result & TOKEN_VARIABLE_TOP_MASK_) != 0) {
1352                                if (newstrength != TOKEN_RESET_
1353                                    && newstrength != TOKEN_UNSET_) {
1354                                    variabletop = true;
1355                                    m_parsedToken_.m_charsOffset_
1356                                                             = m_extraCurrent_;
1357                                    m_source_.append((char)0xFFFF);
1358                                    m_extraCurrent_ ++;
1359                                    m_current_ ++;
1360                                    m_parsedToken_.m_charsLen_ = 1;
1361                                    return doEndParseNextToken(newstrength,
1362                                                       top,
1363                                                       extensionoffset,
1364                                                       newextensionlen,
1365                                                       variabletop, before);
1366                                }
1367                                else {
1368                                    throwParseException(m_rules_, m_current_);
1369                                }
1370                            }
1371                            else if ((result & TOKEN_BEFORE_) != 0){
1372                                if (newstrength == TOKEN_RESET_) {
1373                                    before = (byte)(result & TOKEN_BEFORE_);
1374                                }
1375                                else {
1376                                    throwParseException(m_rules_, m_current_);
1377                                }
1378                            }
1379                        }
1380                        break;
1381                    case 0x002F : // '/'
1382
wasinquote = false; // if we were copying source
1383
// characters, we want to stop now
1384
inchars = false; // we're now processing expansion
1385
break;
1386                    case 0x005C : // back slash for escaped chars
1387
isescaped = true;
1388                        break;
1389                    // found a quote, we're gonna start copying
1390
case 0x0027 : //'\''
1391
if (newstrength == TOKEN_UNSET_) {
1392                            // quote is illegal until we have a strength
1393
throwParseException(m_rules_, m_current_);
1394                        }
1395                        inquote = true;
1396                        if (inchars) { // we're doing characters
1397
if (wasinquote == false) {
1398                                m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
1399                            }
1400                            if (m_parsedToken_.m_charsLen_ != 0) {
1401                                m_source_.append(m_source_.substring(
1402                                       m_current_ - m_parsedToken_.m_charsLen_,
1403                                       m_current_));
1404                                m_extraCurrent_ += m_parsedToken_.m_charsLen_;
1405                            }
1406                            m_parsedToken_.m_charsLen_ ++;
1407                        }
1408                        else { // we're doing an expansion
1409
if (wasinquote == false) {
1410                                extensionoffset = m_extraCurrent_;
1411                            }
1412                            if (newextensionlen != 0) {
1413                                m_source_.append(m_source_.substring(
1414                                                   m_current_ - newextensionlen,
1415                                                   m_current_));
1416                                m_extraCurrent_ += newextensionlen;
1417                            }
1418                            newextensionlen ++;
1419                        }
1420                        wasinquote = true;
1421                        m_current_ ++;
1422                        ch = m_source_.charAt(m_current_);
1423                        if (ch == 0x0027) { // copy the double quote
1424
m_source_.append(ch);
1425                            m_extraCurrent_ ++;
1426                            inquote = false;
1427                        }
1428                        break;
1429                    // '@' is french only if the strength is not currently set
1430
// if it is, it's just a regular character in collation
1431
case 0x0040 : // '@'
1432
if (newstrength == TOKEN_UNSET_) {
1433                            m_options_.m_isFrenchCollation_ = true;
1434                        break;
1435                    }
1436                    case 0x007C : //|
1437
// this means we have actually been reading prefix part
1438
// we want to store read characters to the prefix part
1439
// and continue reading the characters (proper way
1440
// would be to restart reading the chars, but in that
1441
// case we would have to complicate the token hasher,
1442
// which I do not intend to play with. Instead, we will
1443
// do prefixes when prefixes are due (before adding the
1444
// elements).
1445
m_parsedToken_.m_prefixOffset_
1446                                                = m_parsedToken_.m_charsOffset_;
1447                        m_parsedToken_.m_prefixLen_
1448                                                = m_parsedToken_.m_charsLen_;
1449                        if (inchars) { // we're doing characters
1450
if (wasinquote == false) {
1451                                m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
1452                            }
1453                            if (m_parsedToken_.m_charsLen_ != 0) {
1454                                String JavaDoc prefix = m_source_.substring(
1455                                       m_current_ - m_parsedToken_.m_charsLen_,
1456                                       m_current_);
1457                                m_source_.append(prefix);
1458                                m_extraCurrent_ += m_parsedToken_.m_charsLen_;
1459                            }
1460                            m_parsedToken_.m_charsLen_ ++;
1461                        }
1462                        wasinquote = true;
1463                        do {
1464                            m_current_ ++;
1465                            ch = m_source_.charAt(m_current_);
1466                            // skip whitespace between '|' and the character
1467
} while (UCharacterProperty.isRuleWhiteSpace(ch));
1468                        break;
1469                    case 0x0023: // '#' // this is a comment, skip everything through the end of line
1470
do {
1471                            m_current_ ++;
1472                            ch = m_source_.charAt(m_current_);
1473                        } while (!isCharNewLine(ch));
1474                        break;
1475                    case 0x0021: // '!' // ignoring java set thai reordering
1476
break;
1477                    default :
1478                        if (newstrength == TOKEN_UNSET_) {
1479                            throwParseException(m_rules_, m_current_);
1480                        }
1481                        if (isSpecialChar(ch) && (inquote == false)) {
1482                            throwParseException(m_rules_, m_current_);
1483                        }
1484                        if (ch == 0x0000 && m_current_ + 1 == limit) {
1485                            break;
1486                        }
1487                        if (inchars) {
1488                            if (m_parsedToken_.m_charsLen_ == 0) {
1489                                m_parsedToken_.m_charsOffset_ = m_current_;
1490                            }
1491                            m_parsedToken_.m_charsLen_++;
1492                        }
1493                        else {
1494                            if (newextensionlen == 0) {
1495                                extensionoffset = m_current_;
1496                            }
1497                            newextensionlen ++;
1498                        }
1499                        break;
1500                    }
1501                }
1502            }
1503            if (wasinquote) {
1504                if (ch != 0x27) {
1505                      m_source_.append(ch);
1506                    m_extraCurrent_ ++;
1507                }
1508            }
1509            m_current_ ++;
1510        }
1511        return doEndParseNextToken(newstrength, top,
1512                                   extensionoffset, newextensionlen,
1513                                   variabletop, before);
1514    }
1515
1516    /**
1517     * End the next parse token
1518     * @param newstrength new strength
1519     * @return offset in rules, -1 for end of rules
1520     */

1521    private int doEndParseNextToken(int newstrength, /*int newcharslen,*/
1522                                    boolean top, /*int charsoffset,*/
1523                                    int extensionoffset, int newextensionlen,
1524                                    boolean variabletop, int before)
1525                                    throws ParseException JavaDoc
1526    {
1527        if (newstrength == TOKEN_UNSET_) {
1528            return -1;
1529        }
1530        if (m_parsedToken_.m_charsLen_ == 0 && top == false) {
1531            throwParseException(m_rules_, m_current_);
1532        }
1533
1534        m_parsedToken_.m_strength_ = newstrength;
1535        //m_parsedToken_.m_charsOffset_ = charsoffset;
1536
//m_parsedToken_.m_charsLen_ = newcharslen;
1537
m_parsedToken_.m_extensionOffset_ = extensionoffset;
1538        m_parsedToken_.m_extensionLen_ = newextensionlen;
1539        m_parsedToken_.m_flags_ = (char)
1540                                  ((variabletop ? TOKEN_VARIABLE_TOP_MASK_ : 0)
1541                                  | (top ? TOKEN_TOP_MASK_ : 0) | before);
1542        return m_current_;
1543    }
1544
1545    /**
1546     * Token before this element
1547     * @param sourcetoken
1548     * @param strength collation strength
1549     * @return the token before source token
1550     * @exception ParseException thrown when rules have the wrong syntax
1551     */

    /**
     * Returns the token that sorts immediately before the current anchor by
     * fishing the anchor's collation elements out of the UCA. Used for a
     * "virgin" [before] reset, i.e. when the anchor has not been tailored yet.
     * @param sourcetoken tailored token to take the anchor text from, or null
     *        to use the range recorded in m_parsedToken_
     * @param strength collation strength of the requested "before" difference
     * @return the token before the anchor (looked up or newly constructed)
     * @exception ParseException thrown when rules have the wrong syntax
     */
    private Token getVirginBefore(Token sourcetoken, int strength)
                                                          throws ParseException
    {
        // this is a virgin before - we need to fish the anchor from the UCA
        if (sourcetoken != null) {
            // low 24 bits of m_source_ hold the offset of the token's text
            int offset = sourcetoken.m_source_ & 0xFFFFFF;
            m_UCAColEIter_.setText(m_source_.substring(offset, offset + 1));
        }
        else {
            m_UCAColEIter_.setText(
                             m_source_.substring(m_parsedToken_.m_charsOffset_,
                             m_parsedToken_.m_charsOffset_ + 1));
        }

        // first CE (continuation/case bits masked out by 0xFFFFFF3F) and the
        // optional continuation CE of the anchor
        int basece = m_UCAColEIter_.next() & 0xFFFFFF3F;
        int basecontce = m_UCAColEIter_.next();
        if (basecontce == CollationElementIterator.NULLORDER) {
            basecontce = 0;
        }

        int ch = 0;

        if ((basece >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)
                && (basece >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */
            // implicit CEs (e.g. Han): compute the previous code point's
            // implicit CE directly rather than consulting the inverse UCA
            int primary = basece & RuleBasedCollator.CE_PRIMARY_MASK_ | (basecontce & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
            int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);
            ch = RuleBasedCollator.impCEGen_.getCodePointFromRaw(raw-1);
            int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1);
            m_utilCEBuffer_[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
            m_utilCEBuffer_[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;

            // synthesize a name for the anchor: U+FFFE followed by the
            // previous code point, appended to the scratch area of m_source_
            m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
            m_source_.append('\uFFFE');
            m_source_.append((char)ch);
            m_extraCurrent_ += 2;
            m_parsedToken_.m_charsLen_++;

            // look up an existing tailoring for the synthesized name
            m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
            | m_parsedToken_.m_charsOffset_;
            m_utilToken_.m_rules_ = m_source_;
            sourcetoken = (Token)m_hashTable_.get(m_utilToken_);

            if(sourcetoken == null) {
                // not tailored yet: start a fresh list anchored at the
                // computed CEs and create a reset token for it
                m_listHeader_[m_resultLength_] = new TokenListHeader();
                m_listHeader_[m_resultLength_].m_baseCE_
                    = m_utilCEBuffer_[0] & 0xFFFFFF3F;
                if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
                    m_listHeader_[m_resultLength_].m_baseContCE_
                    = m_utilCEBuffer_[1];
                }
                else {
                    m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
                }
                m_listHeader_[m_resultLength_].m_nextCE_ = 0;
                m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
                m_listHeader_[m_resultLength_].m_previousCE_ = 0;
                m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
                m_listHeader_[m_resultLength_].m_indirect_ = false;

                sourcetoken = new Token();
                initAReset(-1, sourcetoken);
            }

        } else {

            // regular CE: ask the inverse UCA for the previous CE; the call
            // fills m_utilCEBuffer_[0..1] (invpos itself is no longer used,
            // see note below)
            int invpos = CollationParsedRuleBuilder.INVERSE_UCA_.getInversePrevCE(
                                                         basece, basecontce,
                                                         strength, m_utilCEBuffer_);
            // we got the previous CE. Now we need to see if the difference
            // between the two CEs is really of the requested strength.
            // if it's a bigger difference (we asked for secondary and got
            // primary), we need to modify the CE.
            if(CollationParsedRuleBuilder.INVERSE_UCA_.getCEStrengthDifference(basece, basecontce, m_utilCEBuffer_[0], m_utilCEBuffer_[1]) < strength) {
                // adjust the strength: our baseCE is modified so that the
                // constructed CE lands in the right position
                if(strength == Collator.SECONDARY) {
                    m_utilCEBuffer_[0] = basece - 0x0200;
                } else { // strength == UCOL_TERTIARY
                    m_utilCEBuffer_[0] = basece - 0x02;
                }
                if(RuleBasedCollator.isContinuation(basecontce)) {
                    if(strength == Collator.SECONDARY) {
                        m_utilCEBuffer_[1] = basecontce - 0x0200;
                    } else { // strength == UCOL_TERTIARY
                        m_utilCEBuffer_[1] = basecontce - 0x02;
                    }
                }
            }

            // NOTE: an earlier revision looked up a code point for the
            // previous CE in the inverse table and reused any tailoring found
            // for it. That was removed: many code points share a CE and the
            // CE-to-codepoint continuation table was broken. Instead,
            // &[before 1]a < x is resolved exactly as if &a > x had been
            // written - the anchor keeps its name and a fresh list is started
            // on the constructed CEs.

            // grab before:
            // widen the token text to include the synthetic "before" name.
            // NOTE(review): the constant 10 presumably matches the number of
            // characters the caller appended for the indirect-before case -
            // confirm against the [before]/doSetTop handling in the parser.
            m_parsedToken_.m_charsOffset_ -= 10;
                m_parsedToken_.m_charsLen_ += 10;
                m_listHeader_[m_resultLength_] = new TokenListHeader();
                m_listHeader_[m_resultLength_].m_baseCE_
                                                 = m_utilCEBuffer_[0] & 0xFFFFFF3F;
                if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
                    m_listHeader_[m_resultLength_].m_baseContCE_
                                                              = m_utilCEBuffer_[1];
                }
                else {
                    m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
                }
                m_listHeader_[m_resultLength_].m_nextCE_ = 0;
                m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
                m_listHeader_[m_resultLength_].m_previousCE_ = 0;
                m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
                m_listHeader_[m_resultLength_].m_indirect_ = false;
                sourcetoken = new Token();
                initAReset(-1, sourcetoken);
        }
        return sourcetoken;
    }
1716
1717    /**
1718     * Processing Description.
1719     * 1. Build a m_listHeader_. Each list has a header, which contains two lists
1720     * (positive and negative), a reset token, a baseCE, nextCE, and
1721     * previousCE. The lists and reset may be null.
1722     * 2. As you process, you keep a LAST pointer that points to the last token
1723     * you handled.
1724     * @param expand string offset, -1 for null strings
1725     * @param targetToken token to update
1726     * @return expandnext offset
1727     * @throws ParseException thrown when rules syntax failed
1728     */

1729    private int initAReset(int expand, Token targetToken) throws ParseException JavaDoc
1730    {
1731        if (m_resultLength_ == m_listHeader_.length - 1) {
1732            // Unfortunately, this won't work, as we store addresses of lhs in
1733
// token
1734
TokenListHeader temp[] = new TokenListHeader[m_resultLength_ << 1];
1735            System.arraycopy(m_listHeader_, 0, temp, 0, m_resultLength_ + 1);
1736            m_listHeader_ = temp;
1737        }
1738        // do the reset thing
1739
targetToken.m_rules_ = m_source_;
1740        targetToken.m_source_ = m_parsedToken_.m_charsLen_ << 24
1741                                | m_parsedToken_.m_charsOffset_;
1742        targetToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24
1743                                   | m_parsedToken_.m_extensionOffset_;
1744        // keep the flags around so that we know about before
1745
targetToken.m_flags_ = m_parsedToken_.m_flags_;
1746
1747        if (m_parsedToken_.m_prefixOffset_ != 0) {
1748            throwParseException(m_rules_, m_parsedToken_.m_charsOffset_ - 1);
1749        }
1750
1751        targetToken.m_prefix_ = 0;
1752        // TODO: this should also handle reverse
1753
targetToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
1754        targetToken.m_strength_ = TOKEN_RESET_;
1755        targetToken.m_next_ = null;
1756        targetToken.m_previous_ = null;
1757        targetToken.m_CELength_ = 0;
1758        targetToken.m_expCELength_ = 0;
1759        targetToken.m_listHeader_ = m_listHeader_[m_resultLength_];
1760        m_listHeader_[m_resultLength_].m_first_ = null;
1761        m_listHeader_[m_resultLength_].m_last_ = null;
1762        m_listHeader_[m_resultLength_].m_first_ = null;
1763        m_listHeader_[m_resultLength_].m_last_ = null;
1764        m_listHeader_[m_resultLength_].m_reset_ = targetToken;
1765
1766        /* 3 Consider each item: relation, source, and expansion:
1767         * e.g. ...< x / y ...
1768         * First convert all expansions into normal form. Examples:
1769         * If "xy" doesn't occur earlier in the list or in the UCA, convert
1770         * &xy * c * d * ... into &x * c/y * d * ...
1771         * Note: reset values can never have expansions, although they can
1772         * cause the very next item to have one. They may be contractions, if
1773         * they are found earlier in the list.
1774         */

1775        int result = 0;
1776        if (expand > 0) {
1777            // check to see if there is an expansion
1778
if (m_parsedToken_.m_charsLen_ > 1) {
1779                targetToken.m_source_ = ((expand
1780                                          - m_parsedToken_.m_charsOffset_ )
1781                                          << 24)
1782                                          | m_parsedToken_.m_charsOffset_;
1783                result = ((m_parsedToken_.m_charsLen_
1784                               + m_parsedToken_.m_charsOffset_ - expand) << 24)
1785                               | expand;
1786            }
1787        }
1788
1789        m_resultLength_ ++;
1790        m_hashTable_.put(targetToken, targetToken);
1791        return result;
1792    }
1793
1794    /**
1795     * Checks if an character is special
1796     * @param ch character to test
1797     * @return true if the character is special
1798     */

1799    private static final boolean isSpecialChar(char ch)
1800    {
1801        return (ch <= 0x002F && ch >= 0x0020) || (ch <= 0x003F && ch >= 0x003A)
1802               || (ch <= 0x0060 && ch >= 0x005B)
1803               || (ch <= 0x007E && ch >= 0x007D) || ch == 0x007B;
1804    }
1805
1806    private
1807    UnicodeSet readAndSetUnicodeSet(String JavaDoc source, int start) throws ParseException JavaDoc
1808    {
1809      while(source.charAt(start) != '[') { /* advance while we find the first '[' */
1810        start++;
1811      }
1812      // now we need to get a balanced set of '[]'. The problem is that a set can have
1813
// many, and *end point to the first closing '['
1814
int noOpenBraces = 1;
1815      int current = 1; // skip the opening brace
1816
while(start+current < source.length() && noOpenBraces != 0) {
1817        if(source.charAt(start+current) == '[') {
1818          noOpenBraces++;
1819        } else if(source.charAt(start+current) == ']') { // closing brace
1820
noOpenBraces--;
1821        }
1822        current++;
1823      }
1824      //int nextBrace = -1;
1825

1826      if(noOpenBraces != 0 || (/*nextBrace =*/ source.indexOf("]", start+current) /*']'*/) == -1) {
1827        throwParseException(m_rules_, start);
1828      }
1829      return new UnicodeSet(source.substring(start, start+current)); //uset_openPattern(start, current);
1830
}
1831
1832
    /**
     * In C, optionarg is passed by reference to the function; this field
     * simulates that out-parameter: readOption(...) stores the offset of the
     * matched option's argument here (0 when the option has no argument).
     */
    private int m_optionarg_ = 0;
1837
1838    private int readOption(String JavaDoc rules, int start, int optionend)
1839    {
1840        m_optionarg_ = 0;
1841        int i = 0;
1842        while (i < RULES_OPTIONS_.length) {
1843            String JavaDoc option = RULES_OPTIONS_[i].m_name_;
1844            int optionlength = option.length();
1845            if (rules.length() > start + optionlength
1846                && option.equalsIgnoreCase(rules.substring(start,
1847                                                      start + optionlength))) {
1848                if (optionend - start > optionlength) {
1849                    m_optionarg_ = start + optionlength;
1850                    // start of the options, skip space
1851
while (m_optionarg_ < optionend && UCharacter.isWhitespace(rules.charAt(m_optionarg_)))
1852                    { // eat whitespace
1853
m_optionarg_ ++;
1854                    }
1855                }
1856                break;
1857            }
1858            i ++;
1859        }
1860        if(i == RULES_OPTIONS_.length) {
1861            i = -1;
1862        }
1863        return i;
1864    }
1865    /**
1866     * Reads and set collation options
1867     * @return TOKEN_SUCCESS if option is set correct, 0 otherwise
1868     * @exception ParseException thrown when options in rules are wrong
1869     */

    /**
     * Reads and sets the collation option starting at m_current_ (which
     * points at the opening '['). The branch taken is keyed on the option's
     * index in RULES_OPTIONS_: 0-6 attribute options, 7 variable top,
     * 8 rearrange (accepted but ignored), 9 before, 10 top, 11-12
     * first/last indirect anchors, 13-14 copy/remove (skipped here; they are
     * handled before normalization).
     * @return TOKEN_SUCCESS_MASK_ if the option is set correctly, possibly
     *         combined with TOKEN_VARIABLE_TOP_MASK_, TOKEN_TOP_MASK_ or the
     *         encoded before strength; 0 otherwise
     * @exception ParseException thrown when options in rules are wrong
     */
    private byte readAndSetOption() throws ParseException
    {
        int start = m_current_ + 1; // skip opening '['
        int i = readOption(m_rules_, start, m_optionEnd_);

        int optionarg = m_optionarg_; // set by readOption; 0 = no argument

        if (i < 0) {
            throwParseException(m_rules_, start);
        }

        if (i < 7) {
            // plain attribute option: match the argument against the
            // sub-option names and set the corresponding attribute value
            if (optionarg != 0) {
                for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length;
                                                                        j ++) {
                     String subname = RULES_OPTIONS_[i].m_subOptions_[j];
                     int size = optionarg + subname.length();
                     if (m_rules_.length() > size
                         && subname.equalsIgnoreCase(m_rules_.substring(
                                                           optionarg, size))) {
                         setOptions(m_options_, RULES_OPTIONS_[i].m_attribute_,
                             RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]);
                         return TOKEN_SUCCESS_MASK_;
                     }
                }
            }
            throwParseException(m_rules_, optionarg);
        }
        else if (i == 7) { // variable top
            return TOKEN_SUCCESS_MASK_ | TOKEN_VARIABLE_TOP_MASK_;
        }
        else if (i == 8) { // rearrange - accepted but has no effect here
            return TOKEN_SUCCESS_MASK_;
        }
        else if (i == 9) { // before
            // the before strength is encoded into the returned byte as
            // sub-option value + 1
            if (optionarg != 0) {
                for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length;
                                                                        j ++) {
                     String subname = RULES_OPTIONS_[i].m_subOptions_[j];
                     int size = optionarg + subname.length();
                     if (m_rules_.length() > size
                         && subname.equalsIgnoreCase(
                                               m_rules_.substring(optionarg,
                                              optionarg + subname.length()))) {
                         return (byte)(TOKEN_SUCCESS_MASK_
                            | RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]
                            + 1);
                     }
                }
            }
            throwParseException(m_rules_, optionarg);
        }
        else if (i == 10) { // top: we are going to have an array with
            // structures of limit CEs; index into this array is
            // src->parsedToken.indirectIndex, and [top] is index 0
            m_parsedToken_.m_indirectIndex_ = 0;
            return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
        }
        else if (i < 13) { // first, last
            // indirect index encodes option (i - 10) and sub-option (j)
            for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j ++) {
                String subname = RULES_OPTIONS_[i].m_subOptions_[j];
                int size = optionarg + subname.length();
                if (m_rules_.length() > size
                    && subname.equalsIgnoreCase(m_rules_.substring(optionarg,
                                                                   size))) {
                    m_parsedToken_.m_indirectIndex_ = (char)(i - 10 + (j << 1));
                    return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
                }
            }
            throwParseException(m_rules_, optionarg);
        }
        else if(i == 13 || i == 14) { // copy and remove are handled before
            // normalization; here we only skip the balanced brackets of the
            // embedded set and move m_optionEnd_ past it
            int noOpenBraces = 1;
            m_current_++; // skip opening brace
            while(m_current_ < m_source_.length() && noOpenBraces != 0) {
                if(m_source_.charAt(m_current_) == '[') {
                  noOpenBraces++;
                } else if(m_source_.charAt(m_current_) == ']') { // closing brace
                  noOpenBraces--;
                }
                m_current_++;
            }
            m_optionEnd_ = m_current_-1;
            return TOKEN_SUCCESS_MASK_;
        }
        else {
            throwParseException(m_rules_, optionarg);
        }
        return TOKEN_SUCCESS_MASK_; // we will never reach here.
    }
1961
1962    /**
1963     * Set collation option
1964     * @param optionset option set to set
1965     * @param attribute type to set
1966     * @param value attribute value
1967     */

1968    private void setOptions(OptionSet optionset, int attribute, int value)
1969    {
1970        switch (attribute) {
1971            case RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_ :
1972                optionset.m_isHiragana4_
1973                            = (value == RuleBasedCollator.AttributeValue.ON_);
1974                break;
1975            case RuleBasedCollator.Attribute.FRENCH_COLLATION_ :
1976                optionset.m_isFrenchCollation_
1977                             = (value == RuleBasedCollator.AttributeValue.ON_);
1978                break;
1979            case RuleBasedCollator.Attribute.ALTERNATE_HANDLING_ :
1980                optionset.m_isAlternateHandlingShifted_
1981                             = (value
1982                                == RuleBasedCollator.AttributeValue.SHIFTED_);
1983                break;
1984            case RuleBasedCollator.Attribute.CASE_FIRST_ :
1985                optionset.m_caseFirst_ = value;
1986                break;
1987            case RuleBasedCollator.Attribute.CASE_LEVEL_ :
1988                optionset.m_isCaseLevel_
1989                             = (value == RuleBasedCollator.AttributeValue.ON_);
1990                break;
1991            case RuleBasedCollator.Attribute.NORMALIZATION_MODE_ :
1992                if (value == RuleBasedCollator.AttributeValue.ON_) {
1993                    value = Collator.CANONICAL_DECOMPOSITION;
1994                }
1995                optionset.m_decomposition_ = value;
1996                break;
1997            case RuleBasedCollator.Attribute.STRENGTH_ :
1998                optionset.m_strength_ = value;
1999                break;
2000            default :
2001                break;
2002        }
2003      }
2004
2005    UnicodeSet getTailoredSet() throws ParseException JavaDoc
2006    {
2007        boolean startOfRules = true;
2008        UnicodeSet tailored = new UnicodeSet();
2009        String JavaDoc pattern;
2010        CanonicalIterator it = new CanonicalIterator("");
2011
2012        m_parsedToken_.m_strength_ = TOKEN_UNSET_;
2013        int sourcelimit = m_source_.length();
2014        //int expandNext = 0;
2015

2016        while (m_current_ < sourcelimit) {
2017        m_parsedToken_.m_prefixOffset_ = 0;
2018        if (parseNextToken(startOfRules) < 0) {
2019            // we have reached the end
2020
continue;
2021        }
2022        startOfRules = false;
2023        // The idea is to tokenize the rule set. For each non-reset token,
2024
// we add all the canonicaly equivalent FCD sequences
2025
if(m_parsedToken_.m_strength_ != TOKEN_RESET_) {
2026                it.setSource(m_source_.substring(
2027                      m_parsedToken_.m_charsOffset_,
2028                      m_parsedToken_.m_charsOffset_+m_parsedToken_.m_charsLen_));
2029                pattern = it.next();
2030                while(pattern != null) {
2031                      if(Normalizer.quickCheck(pattern, Normalizer.FCD,0) != Normalizer.NO) {
2032                        tailored.add(pattern);
2033                    }
2034                    pattern = it.next();
2035                }
2036            }
2037        }
2038        return tailored;
2039    }
2040
2041    final private void extractSetsFromRules(String JavaDoc rules) throws ParseException JavaDoc {
2042      int optionNumber = -1;
2043      int setStart = 0;
2044      int i = 0;
2045      while(i < rules.length()) {
2046        if(rules.charAt(i) == 0x005B) {
2047          optionNumber = readOption(rules, i+1, rules.length());
2048          setStart = m_optionarg_;
2049          if(optionNumber == 13) { /* copy - parts of UCA to tailoring */
2050            UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
2051              if(m_copySet_ == null) {
2052                m_copySet_ = newSet;
2053              } else {
2054                m_copySet_.addAll(newSet);
2055              }
2056          } else if(optionNumber == 14) {
2057            UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
2058              if(m_removeSet_ == null) {
2059                m_removeSet_ = newSet;
2060              } else {
2061                m_removeSet_.addAll(newSet);
2062              }
2063          }
2064        }
2065        i++;
2066      }
2067    }
2068}
2069
Popular Tags