UnicodeSet


1   //##header 1189099963000 FOUNDATION
2   /*
3    *******************************************************************************
4    * Copyright (C) 1996-2006, International Business Machines Corporation and    *
5    * others. All Rights Reserved.                                                *
6    *******************************************************************************
7    */
8   package com.ibm.icu.text;
9   
10  import java.text.*;
11  import com.ibm.icu.lang.*;
12  
13  import java.io.IOException  ;
14  
15  import com.ibm.icu.impl.CollectionUtilities;
16  import com.ibm.icu.impl.NormalizerImpl;
17  import com.ibm.icu.impl.Utility;
18  import com.ibm.icu.impl.UCharacterProperty;
19  import com.ibm.icu.impl.UBiDiProps;
20  import com.ibm.icu.impl.UCaseProps;
21  import com.ibm.icu.impl.UPropertyAliases;
22  import com.ibm.icu.impl.SortedSetRelation;
23  import com.ibm.icu.impl.RuleCharacterIterator;
24  
25  import com.ibm.icu.util.Freezable;
26  import com.ibm.icu.util.ULocale;
27  import com.ibm.icu.util.VersionInfo;
28  
29  import com.ibm.icu.text.BreakIterator;
30  
31  import java.util.Map  ;
32  import java.util.HashMap  ;
33  import java.util.MissingResourceException  ;
34  import java.util.TreeSet  ;
35  import java.util.Iterator  ;
36  import java.util.Collection  ;
37  
38  /**
39   * A mutable set of Unicode characters and multicharacter strings.  Objects of this class
40   * represent <em>character classes</em> used in regular expressions.
41   * A character class specifies a subset of Unicode code points.  Legal
42   * code points are U+0000 to U+10FFFF, inclusive.
43   *
44   * <p>The UnicodeSet class is not designed to be subclassed.
45   *
46   * <p><code>UnicodeSet</code> supports two APIs. The first is the
47   * <em>operand</em> API that allows the caller to modify the value of
48   * a <code>UnicodeSet</code> object. It conforms to Java 2's
49   * <code>java.util.Set</code> interface, although
50   * <code>UnicodeSet</code> does not actually implement that
51   * interface. All methods of <code>Set</code> are supported, with the
52   * modification that they take a character range or single character
53   * instead of an <code>Object</code>, and they take a
54   * <code>UnicodeSet</code> instead of a <code>Collection</code>.  The
55   * operand API may be thought of in terms of boolean logic: a boolean
56   * OR is implemented by <code>add</code>, a boolean AND is implemented
57   * by <code>retain</code>, a boolean XOR is implemented by
58   * <code>complement</code> taking an argument, and a boolean NOT is
59   * implemented by <code>complement</code> with no argument.  In terms
60   * of traditional set theory function names, <code>add</code> is a
61   * union, <code>retain</code> is an intersection, <code>remove</code>
62   * is an asymmetric difference, and <code>complement</code> with no
63   * argument is a set complement with respect to the superset range
64   * <code>MIN_VALUE-MAX_VALUE</code>.
65   *
66   * <p>The second API is the
67   * <code>applyPattern()</code>/<code>toPattern()</code> API from the
68   * <code>java.text.Format</code>-derived classes.  Unlike the
69   * methods that add characters, add categories, and control the logic
70   * of the set, the method <code>applyPattern()</code> sets all
71   * attributes of a <code>UnicodeSet</code> at once, based on a
72   * string pattern.
73   *
74   * <p><b>Pattern syntax</b></p>
75   *
76   * Patterns are accepted by the constructors and the
77   * <code>applyPattern()</code> methods and returned by the
78   * <code>toPattern()</code> method.  These patterns follow a syntax
79   * similar to that employed by version 8 regular expression character
80   * classes.  Here are some simple examples:
81   *
82   * <blockquote>
83   *   <table>
84   *     <tr align="top">
85   *       <td nowrap valign="top" align="left"><code>[]</code></td>
86   *       <td valign="top">No characters</td>
87   *     </tr><tr align="top">
88   *       <td nowrap valign="top" align="left"><code>[a]</code></td>
89   *       <td valign="top">The character 'a'</td>
90   *     </tr><tr align="top">
91   *       <td nowrap valign="top" align="left"><code>[ae]</code></td>
92   *       <td valign="top">The characters 'a' and 'e'</td>
93   *     </tr>
94   *     <tr>
95   *       <td nowrap valign="top" align="left"><code>[a-e]</code></td>
96   *       <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
97   *       point order</td>
98   *     </tr>
99   *     <tr>
100  *       <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
101  *       <td valign="top">The character U+4E01</td>
102  *     </tr>
103  *     <tr>
104  *       <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
105  *       <td valign="top">The character 'a' and the multicharacter strings &quot;ab&quot; and
106  *       &quot;ac&quot;</td>
107  *     </tr>
108  *     <tr>
109  *       <td nowrap valign="top" align="left"><code>[\p{Lu}]</code></td>
110  *       <td valign="top">All characters in the general category Uppercase Letter</td>
111  *     </tr>
112  *   </table>
113  * </blockquote>
114  *
115  * Any character may be preceded by a backslash in order to remove any special
116  * meaning.  White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are
117  * ignored, unless they are escaped.
118  *
119  * <p>Property patterns specify a set of characters having a certain
120  * property as defined by the Unicode standard.  Both the POSIX-like
121  * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized.  For a
122  * complete list of supported property patterns, see the User's Guide
123  * for UnicodeSet at
124  * <a HREF="http://icu.sourceforge.net/userguide/unicodeSet.html">
125  * http://icu.sourceforge.net/userguide/unicodeSet.html</a>.
126  * Actual determination of property data is defined by the underlying
127  * Unicode database as implemented by UCharacter.
128  *
129  * <p>Patterns specify individual characters, ranges of characters, and
130  * Unicode property sets.  When elements are concatenated, they
131  * specify their union.  To complement a set, place a '^' immediately
132  * after the opening '['.  Property patterns are inverted by modifying
133  * their delimiters; "[:^foo:]" and "\P{foo}".  In any other location,
134  * '^' has no special meaning.
135  *
136  * <p>Ranges are indicated by placing a '-' between two
137  * characters, as in "a-z".  This specifies the range of all
138  * characters from the left to the right, in Unicode order.  If the
139  * left character is greater than or equal to the
140  * right character it is a syntax error.  If a '-' occurs as the first
141  * character after the opening '[' or '[^', or if it occurs as the
142  * last character before the closing ']', then it is taken as a
143  * literal.  Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
144  * set of three characters, 'a', 'b', and '-'.
145  *
146  * <p>Sets may be intersected using the '&' operator or the asymmetric
147  * set difference may be taken using the '-' operator, for example,
148  * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
149  * with values less than 4096.  Operators ('&' and '-') have equal
150  * precedence and bind left-to-right.  Thus
151  * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
152  * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".  This only really matters for
153  * difference; intersection is commutative.
154  *
155  * <table>
156  * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
157  * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
158  * through 'z' and all letters in between, in Unicode order
159  * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
160  * all characters but 'a' through 'z',
161  * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
162  * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
163  * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
164  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
165  * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
166  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
167  * <td>The asymmetric difference of sets specified by <em>pat1</em> and
168  * <em>pat2</em>
169  * <tr valign=top><td nowrap><code>[:Lu:] or \p{Lu}</code>
170  * <td>The set of characters having the specified
171  * Unicode property; in
172  * this case, Unicode uppercase letters
173  * <tr valign=top><td nowrap><code>[:^Lu:] or \P{Lu}</code>
174  * <td>The set of characters <em>not</em> having the given
175  * Unicode property
176  * </table>
177  *
178  * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
179  *
180  * <p><b>Formal syntax</b></p>
181  *
182  * <blockquote>
183  *   <table>
184  *     <tr align="top">
185  *       <td nowrap valign="top" align="right"><code>pattern :=&nbsp; </code></td>
186  *       <td valign="top"><code>('[' '^'? item* ']') |
187  *       property</code></td>
188  *     </tr>
189  *     <tr align="top">
190  *       <td nowrap valign="top" align="right"><code>item :=&nbsp; </code></td>
191  *       <td valign="top"><code>char | (char '-' char) | pattern-expr<br>
192  *       </code></td>
193  *     </tr>
194  *     <tr align="top">
195  *       <td nowrap valign="top" align="right"><code>pattern-expr :=&nbsp; </code></td>
196  *       <td valign="top"><code>pattern | pattern-expr pattern |
197  *       pattern-expr op pattern<br>
198  *       </code></td>
199  *     </tr>
200  *     <tr align="top">
201  *       <td nowrap valign="top" align="right"><code>op :=&nbsp; </code></td>
202  *       <td valign="top"><code>'&amp;' | '-'<br>
203  *       </code></td>
204  *     </tr>
205  *     <tr align="top">
206  *       <td nowrap valign="top" align="right"><code>special :=&nbsp; </code></td>
207  *       <td valign="top"><code>'[' | ']' | '-'<br>
208  *       </code></td>
209  *     </tr>
210  *     <tr align="top">
211  *       <td nowrap valign="top" align="right"><code>char :=&nbsp; </code></td>
212  *       <td valign="top"><em>any character that is not</em><code> special<br>
213  *       | ('\\' </code><em>any character</em><code>)<br>
214  *       | ('&#92;u' hex hex hex hex)<br>
215  *       </code></td>
216  *     </tr>
217  *     <tr align="top">
218  *       <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>
219  *       <td valign="top"><em>any character for which
220  *       </em><code>Character.digit(c, 16)</code><em>
221  *       returns a non-negative result</em></td>
222  *     </tr>
223  *     <tr>
224  *       <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>
225  *       <td valign="top"><em>a Unicode property set pattern</em></td>
226  *     </tr>
227  *   </table>
228  *   <br>
229  *   <table border="1">
230  *     <tr>
231  *       <td>Legend: <table>
232  *         <tr>
233  *           <td nowrap valign="top"><code>a := b</code></td>
234  *           <td width="20" valign="top">&nbsp; </td>
235  *           <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
236  *         </tr>
237  *         <tr>
238  *           <td nowrap valign="top"><code>a?</code></td>
239  *           <td valign="top"></td>
240  *           <td valign="top">zero or one instance of <code>a</code><br>
241  *           </td>
242  *         </tr>
243  *         <tr>
244  *           <td nowrap valign="top"><code>a*</code></td>
245  *           <td valign="top"></td>
246  *           <td valign="top">zero or more instances of <code>a</code><br>
247  *           </td>
248  *         </tr>
249  *         <tr>
250  *           <td nowrap valign="top"><code>a | b</code></td>
251  *           <td valign="top"></td>
252  *           <td valign="top">either <code>a</code> or <code>b</code><br>
253  *           </td>
254  *         </tr>
255  *         <tr>
256  *           <td nowrap valign="top"><code>'a'</code></td>
257  *           <td valign="top"></td>
258  *           <td valign="top">the literal string between the quotes </td>
259  *         </tr>
260  *       </table>
261  *       </td>
262  *     </tr>
263  *   </table>
264  * </blockquote>
265  * <p>To iterate over contents of UnicodeSet, use UnicodeSetIterator class.
266  *
267  * @author Alan Liu
268  * @stable ICU 2.0
269  * @see UnicodeSetIterator
270  */
271 public class UnicodeSet extends UnicodeFilter implements Freezable {
272 
273     private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
274     private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
275                                              // 110000 for codepoints
276 
277     /**
278      * Minimum value that can be stored in a UnicodeSet.
279      * @stable ICU 2.0
280      */
281     public static final int MIN_VALUE = LOW;
282 
283     /**
284      * Maximum value that can be stored in a UnicodeSet.
285      * @stable ICU 2.0
286      */
287     public static final int MAX_VALUE = HIGH - 1;
288 
289     private int len;      // length used; list may be longer to minimize reallocs
290     private int[] list;   // MUST be terminated with HIGH
291     private int[] rangeList; // internal buffer
292     private int[] buffer; // internal buffer
293 
294     // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!!
295     // is not private so that UnicodeSetIterator can get access
296     TreeSet   strings = new TreeSet  ();
297 
298     /**
299      * The pattern representation of this set.  This may not be the
300      * most economical pattern.  It is the pattern supplied to
301      * applyPattern(), with variables substituted and whitespace
302      * removed.  For sets constructed without applyPattern(), or
303      * modified using the non-pattern API, this string will be null,
304      * indicating that toPattern() must generate a pattern
305      * representation from the inversion list.
306      */
307     private String   pat = null;
308 
309     private static final int START_EXTRA = 16;         // initial storage. Must be >= 0
310     private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
311 
312     // Special property set IDs
313     private static final String   ANY_ID   = "ANY";   // [\u0000-\U0010FFFF]
314     private static final String   ASCII_ID = "ASCII"; // [\u0000-\u007F]
315     private static final String   ASSIGNED = "Assigned"; // [:^Cn:]
316 
317     /**
318      * A set of all characters _except_ the second through last characters of
319      * certain ranges.  These ranges are ranges of characters whose
320      * properties are all exactly alike, e.g. CJK Ideographs from
321      * U+4E00 to U+9FA5.
322      */
323     private static UnicodeSet INCLUSIONS[] = null;
324 
325     //----------------------------------------------------------------
326     // Public API
327     //----------------------------------------------------------------
328 
329     /**
330      * Constructs an empty set.
331      * @stable ICU 2.0
332      */
333     public UnicodeSet() {
334         list = new int[1 + START_EXTRA];
335         list[len++] = HIGH;
336     }
337 
338     /**
339      * Constructs a copy of an existing set.
340      * @stable ICU 2.0
341      */
public UnicodeSet(UnicodeSet other) {
    // All copying is delegated to set(UnicodeSet), which assigns every
    // field this constructor needs (list, len, pat, strings).
    this.set(other);
}
345 
346     /**
347      * Constructs a set containing the given range. If <code>start &gt;
348      * end</code> then an empty set is created.
349      *
350      * @param start first character, inclusive, of range
351      * @param end last character, inclusive, of range
352      * @stable ICU 2.0
353      */
public UnicodeSet(int start, int end) {
    // Start from the empty set ...
    this();
    // ... then toggle membership of the requested range.
    this.complement(start, end);
}
358 
359     /**
360      * Constructs a set from the given pattern.  See the class description
361      * for the syntax of the pattern language.  Whitespace is ignored.
362      * @param pattern a string specifying what characters are in the set
363      * @exception java.lang.IllegalArgumentException if the pattern contains
364      * a syntax error.
365      * @stable ICU 2.0
366      */
367     public UnicodeSet(String   pattern) {
368         this();
369         applyPattern(pattern, null, null, IGNORE_SPACE);
370     }
371 
372     /**
373      * Constructs a set from the given pattern.  See the class description
374      * for the syntax of the pattern language.
375      * @param pattern a string specifying what characters are in the set
376      * @param ignoreWhitespace if true, ignore characters for which
377      * UCharacterProperty.isRuleWhiteSpace() returns true
378      * @exception java.lang.IllegalArgumentException if the pattern contains
379      * a syntax error.
380      * @stable ICU 2.0
381      */
382     public UnicodeSet(String   pattern, boolean ignoreWhitespace) {
383         this();
384         applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0);
385     }
386 
387     /**
388      * Constructs a set from the given pattern.  See the class description
389      * for the syntax of the pattern language.
390      * @param pattern a string specifying what characters are in the set
391      * @param options a bitmask indicating which options to apply.
392      * Valid options are IGNORE_SPACE and CASE.
393      * @exception java.lang.IllegalArgumentException if the pattern contains
394      * a syntax error.
395      * @internal
396      * @deprecated This API is ICU internal only.
397      */
public UnicodeSet(String pattern, int options) {
    // Begin with the empty set, then parse the whole pattern string
    // (no parse position, no symbol table) with the caller's options.
    this();
    this.applyPattern(pattern, null, null, options);
}
402 
403     /**
404      * Constructs a set from the given pattern.  See the class description
405      * for the syntax of the pattern language.
406      * @param pattern a string specifying what characters are in the set
407      * @param pos on input, the position in pattern at which to start parsing.
408      * On output, the position after the last character parsed.
409      * @param symbols a symbol table mapping variables to char[] arrays
410      * and chars to UnicodeSets
411      * @exception java.lang.IllegalArgumentException if the pattern
412      * contains a syntax error.
413      * @stable ICU 2.0
414      */
415     public UnicodeSet(String   pattern, ParsePosition pos, SymbolTable symbols) {
416         this();
417         applyPattern(pattern, pos, symbols, IGNORE_SPACE);
418     }
419 
420     /**
421      * Constructs a set from the given pattern.  See the class description
422      * for the syntax of the pattern language.
423      * @param pattern a string specifying what characters are in the set
424      * @param pos on input, the position in pattern at which to start parsing.
425      * On output, the position after the last character parsed.
426      * @param symbols a symbol table mapping variables to char[] arrays
427      * and chars to UnicodeSets
428      * @param options a bitmask indicating which options to apply.
429      * Valid options are IGNORE_SPACE and CASE.
430      * @exception java.lang.IllegalArgumentException if the pattern
431      * contains a syntax error.
432      * @draft ICU 3.2
433      * @provisional This API might change or be removed in a future release.
434      */
public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options) {
    // Begin with the empty set, then let the pattern parser fill it in
    // using exactly the arguments supplied by the caller.
    this();
    this.applyPattern(pattern, pos, symbols, options);
}
439 
440 
441     /**
442      * Return a new set that is equivalent to this one.
443      * @stable ICU 2.0
444      */
445     public Object   clone() {
446         UnicodeSet result = new UnicodeSet(this);
447         result.frozen = this.frozen;
448         return result;
449     }
450 
451     /**
452      * Make this object represent the range <code>start - end</code>.
453      * If <code>start &gt; end</code> then this object is set to
454      * an empty range.
455      *
456      * @param start first character in the set, inclusive
457      * @param end last character in the set, inclusive
458      * @stable ICU 2.0
459      */
460     public UnicodeSet set(int start, int end) {
461         checkFrozen();
462         clear();
463         complement(start, end);
464         return this;
465     }
466 
467     /**
468      * Make this object represent the same set as <code>other</code>.
469      * @param other a <code>UnicodeSet</code> whose value will be
470      * copied to this object
471      * @stable ICU 2.0
472      */
473     public UnicodeSet set(UnicodeSet other) {
474         checkFrozen();
475         list = (int[]) other.list.clone();
476         len = other.len;
477         pat = other.pat;
478         strings = (TreeSet  )other.strings.clone();
479         return this;
480     }
481 
482     /**
483      * Modifies this set to represent the set specified by the given pattern.
484      * See the class description for the syntax of the pattern language.
485      * Whitespace is ignored.
486      * @param pattern a string specifying what characters are in the set
487      * @exception java.lang.IllegalArgumentException if the pattern
488      * contains a syntax error.
489      * @stable ICU 2.0
490      */
491     public final UnicodeSet applyPattern(String   pattern) {
492         checkFrozen();
493         return applyPattern(pattern, null, null, IGNORE_SPACE);
494     }
495 
496     /**
497      * Modifies this set to represent the set specified by the given pattern,
498      * optionally ignoring whitespace.
499      * See the class description for the syntax of the pattern language.
500      * @param pattern a string specifying what characters are in the set
501      * @param ignoreWhitespace if true then characters for which
502      * UCharacterProperty.isRuleWhiteSpace() returns true are ignored
503      * @exception java.lang.IllegalArgumentException if the pattern
504      * contains a syntax error.
505      * @stable ICU 2.0
506      */
507     public UnicodeSet applyPattern(String   pattern, boolean ignoreWhitespace) {
508         checkFrozen();
509         return applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0);
510     }
511 
512     /**
513      * Modifies this set to represent the set specified by the given pattern,
514      * optionally ignoring whitespace.
515      * See the class description for the syntax of the pattern language.
516      * @param pattern a string specifying what characters are in the set
517      * @param options a bitmask indicating which options to apply.
518      * Valid options are IGNORE_SPACE and CASE.
519      * @exception java.lang.IllegalArgumentException if the pattern
520      * contains a syntax error.
521      * @internal
522      * @deprecated This API is ICU internal only.
523      */
524     public UnicodeSet applyPattern(String   pattern, int options) {
525         checkFrozen();
526         return applyPattern(pattern, null, null, options);
527     }
528 
529     /**
530      * Return true if the given position, in the given pattern, appears
531      * to be the start of a UnicodeSet pattern.
532      * @stable ICU 2.0
533      */
534     public static boolean resemblesPattern(String   pattern, int pos) {
535         return ((pos+1) < pattern.length() &&
536                 pattern.charAt(pos) == '[') ||
537             resemblesPropertyPattern(pattern, pos);
538     }
539 
540     /**
541      * Append the <code>toPattern()</code> representation of a
542      * string to the given <code>StringBuffer</code>.
543      */
544     private static void _appendToPat(StringBuffer   buf, String   s, boolean escapeUnprintable) {
545         for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {
546             _appendToPat(buf, UTF16.charAt(s, i), escapeUnprintable);
547         }
548     }
549 
550     /**
551      * Append the <code>toPattern()</code> representation of a
552      * character to the given <code>StringBuffer</code>.
553      */
554     private static void _appendToPat(StringBuffer   buf, int c, boolean escapeUnprintable) {
555         if (escapeUnprintable && Utility.isUnprintable(c)) {
556             // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
557             // unprintable
558             if (Utility.escapeUnprintable(buf, c)) {
559                 return;
560             }
561         }
562         // Okay to let ':' pass through
563         switch (c) {
564         case '[': // SET_OPEN:
565         case ']': // SET_CLOSE:
566         case '-': // HYPHEN:
567         case '^': // COMPLEMENT:
568         case '&': // INTERSECTION:
569         case '\\': //BACKSLASH:
570         case '{':
571         case '}':
572         case '$':
573         case ':':
574             buf.append('\\');
575             break;
576         default:
577             // Escape whitespace
578             if (UCharacterProperty.isRuleWhiteSpace(c)) {
579                 buf.append('\\');
580             }
581             break;
582         }
583         UTF16.append(buf, c);
584     }
585 
586     /**
587      * Returns a string representation of this set.  If the result of
588      * calling this function is passed to a UnicodeSet constructor, it
589      * will produce another set that is equal to this one.
590      * @stable ICU 2.0
591      */
592     public String   toPattern(boolean escapeUnprintable) {
593         StringBuffer   result = new StringBuffer  ();
594         return _toPattern(result, escapeUnprintable).toString();
595     }
596 
597     /**
598      * Append a string representation of this set to result.  This will be
599      * a cleaned version of the string passed to applyPattern(), if there
600      * is one.  Otherwise it will be generated.
601      */
602     private StringBuffer   _toPattern(StringBuffer   result,
603                                     boolean escapeUnprintable) {
604         if (pat != null) {
605             int i;
606             int backslashCount = 0;
607             for (i=0; i<pat.length(); ) {
608                 int c = UTF16.charAt(pat, i);
609                 i += UTF16.getCharCount(c);
610                 if (escapeUnprintable && Utility.isUnprintable(c)) {
611                     // If the unprintable character is preceded by an odd
612                     // number of backslashes, then it has been escaped.
613                     // Before unescaping it, we delete the final
614                     // backslash.
615                     if ((backslashCount % 2) == 1) {
616                         result.setLength(result.length() - 1);
617                     }
618                     Utility.escapeUnprintable(result, c);
619                     backslashCount = 0;
620                 } else {
621                     UTF16.append(result, c);
622                     if (c == '\\') {
623                         ++backslashCount;
624                     } else {
625                         backslashCount = 0;
626                     }
627                 }
628             }
629             return result;
630         }
631 
632         return _generatePattern(result, escapeUnprintable, true);
633     }
634 
635     /**
636      * Generate and append a string representation of this set to result.
637      * This does not use this.pat, the cleaned up copy of the string
638      * passed to applyPattern().
639      * @param result the buffer into which to generate the pattern
640      * @param escapeUnprintable escape unprintable characters if true
641      * @stable ICU 2.0
642      */
643     public StringBuffer   _generatePattern(StringBuffer   result, boolean escapeUnprintable) {
644         return _generatePattern(result, escapeUnprintable, true);
645     }
646 
647     /**
648      * Generate and append a string representation of this set to result.
649      * This does not use this.pat, the cleaned up copy of the string
650      * passed to applyPattern().
651      * @param includeStrings if false, doesn't include the strings.
652      * @internal
653      * @deprecated This API is ICU internal only.
654      */
655     public StringBuffer   _generatePattern(StringBuffer   result,
656                                          boolean escapeUnprintable, boolean includeStrings) {
657         result.append('[');
658 
659 //      // Check against the predefined categories.  We implicitly build
660 //      // up ALL category sets the first time toPattern() is called.
661 //      for (int cat=0; cat<CATEGORY_COUNT; ++cat) {
662 //          if (this.equals(getCategorySet(cat))) {
663 //              result.append(':');
664 //              result.append(CATEGORY_NAMES.substring(cat*2, cat*2+2));
665 //              return result.append(":]");
666 //          }
667 //      }
668 
669         int count = getRangeCount();
670 
671         // If the set contains at least 2 intervals and includes both
672         // MIN_VALUE and MAX_VALUE, then the inverse representation will
673         // be more economical.
674         if (count > 1 &&
675             getRangeStart(0) == MIN_VALUE &&
676             getRangeEnd(count-1) == MAX_VALUE) {
677 
678             // Emit the inverse
679             result.append('^');
680 
681             for (int i = 1; i < count; ++i) {
682                 int start = getRangeEnd(i-1)+1;
683                 int end = getRangeStart(i)-1;
684                 _appendToPat(result, start, escapeUnprintable);
685                 if (start != end) {
686                     if ((start+1) != end) {
687                         result.append('-');
688                     }
689                     _appendToPat(result, end, escapeUnprintable);
690                 }
691             }
692         }
693 
694         // Default; emit the ranges as pairs
695         else {
696             for (int i = 0; i < count; ++i) {
697                 int start = getRangeStart(i);
698                 int end = getRangeEnd(i);
699                 _appendToPat(result, start, escapeUnprintable);
700                 if (start != end) {
701                     if ((start+1) != end) {
702                         result.append('-');
703                     }
704                     _appendToPat(result, end, escapeUnprintable);
705                 }
706             }
707         }
708 
709         if (includeStrings && strings.size() > 0) {
710             Iterator it = strings.iterator();
711             while (it.hasNext()) {
712                 result.append('{');
713                 _appendToPat(result, (String  ) it.next(), escapeUnprintable);
714                 result.append('}');
715             }
716         }
717         return result.append(']');
718     }
719 
720     /**
721      * Returns the number of elements in this set (its cardinality).
722      * Note that the elements of a set may include both individual
723      * codepoints and strings.
724      *
725      * @return the number of elements in this set (its cardinality).
726      * @stable ICU 2.0
727      */
728     public int size() {
729         int n = 0;
730         int count = getRangeCount();
731         for (int i = 0; i < count; ++i) {
732             n += getRangeEnd(i) - getRangeStart(i) + 1;
733         }
734         return n + strings.size();
735     }
736 
737     /**
738      * Returns <tt>true</tt> if this set contains no elements.
739      *
740      * @return <tt>true</tt> if this set contains no elements.
741      * @stable ICU 2.0
742      */
743     public boolean isEmpty() {
744         return len == 1 && strings.size() == 0;
745     }
746 
747     /**
748      * Implementation of UnicodeMatcher API.  Returns <tt>true</tt> if
749      * this set contains any character whose low byte is the given
750      * value.  This is used by <tt>RuleBasedTransliterator</tt> for
751      * indexing.
752      * @stable ICU 2.0
753      */
754     public boolean matchesIndexValue(int v) {
755         /* The index value v, in the range [0,255], is contained in this set if
756          * it is contained in any pair of this set.  Pairs either have the high
757          * bytes equal, or unequal.  If the high bytes are equal, then we have
758          * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
759          * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
760          * Then v is contained if xx <= v || v <= yy.  (This is identical to the
761          * time zone month containment logic.)
762          */
763         for (int i=0; i<getRangeCount(); ++i) {
764             int low = getRangeStart(i);
765             int high = getRangeEnd(i);
766             if ((low & ~0xFF) == (high & ~0xFF)) {
767                 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
768                     return true;
769                 }
770             } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
771                 return true;
772             }
773         }
774         if (strings.size() != 0) {
775             Iterator it = strings.iterator();
776             while (it.hasNext()) {
777                 String   s = (String  ) it.next();
778                 //if (s.length() == 0) {
779                 //    // Empty strings match everything
780                 //    return true;
781                 //}
782                 // assert(s.length() != 0); // We enforce this elsewhere
783                 int c = UTF16.charAt(s, 0);
784                 if ((c & 0xFF) == v) {
785                     return true;
786                 }
787             }
788         }
789         return false;
790     }
791 
    /**
     * Implementation of UnicodeMatcher.matches().  Always matches the
     * longest possible multichar string.
     *
     * <p>The direction is derived from the arguments: the match runs
     * forward when <code>offset[0] &lt; limit</code> and backward when
     * <code>offset[0] &gt; limit</code>.  Strings in this set are tried
     * first; if none matches completely, matching is delegated to the
     * superclass implementation.
     *
     * @param text the text to match against
     * @param offset on input, <code>offset[0]</code> is the position at
     * which to start matching.  When a string of this set matches,
     * <code>offset[0]</code> is moved past the match (increased in the
     * forward direction, decreased in the reverse direction).
     * @param limit the position at which matching must stop
     * @param incremental if true, a match that runs into
     * <code>limit</code> is reported as <code>U_PARTIAL_MATCH</code>
     * @return <code>U_MATCH</code>, <code>U_PARTIAL_MATCH</code>,
     * <code>U_MISMATCH</code>, or whatever the superclass
     * implementation returns when no string matches
     * @stable ICU 2.0
     */
    public int matches(Replaceable text,
                       int[] offset,
                       int limit,
                       boolean incremental) {

        if (offset[0] == limit) {
            // Strings, if any, have length != 0, so we don't worry
            // about them here.  If we ever allow zero-length strings
            // we must check for them here.
            if (contains(UnicodeMatcher.ETHER)) {
                return incremental ? U_PARTIAL_MATCH : U_MATCH;
            } else {
                return U_MISMATCH;
            }
        } else {
            if (strings.size() != 0) { // try strings first

                // might separate forward and backward loops later
                // for now they are combined

                // TODO Improve efficiency of this, at least in the forward
                // direction, if not in both.  In the forward direction we
                // can assume the strings are sorted.

                Iterator it = strings.iterator();
                boolean forward = offset[0] < limit;

                // firstChar is the leftmost char to match in the
                // forward direction or the rightmost char to match in
                // the reverse direction.
                char firstChar = text.charAt(offset[0]);

                // If there are multiple strings that can match we
                // return the longest match.
                int highWaterLength = 0;

                while (it.hasNext()) {
                    String   trial = (String  ) it.next();

                    //if (trial.length() == 0) {
                    //    return U_MATCH; // null-string always matches
                    //}
                    // assert(trial.length() != 0); // We ensure this elsewhere

                    // Compare the char of the trial string that lines up
                    // with text.charAt(offset[0]): its first char going
                    // forward, its last char going backward.
                    char c = trial.charAt(forward ? 0 : trial.length() - 1);

                    // Strings are sorted, so we can optimize in the
                    // forward direction.
                    if (forward && c > firstChar) break;
                    if (c != firstChar) continue;

                    // The first char matches; matchRest compares the
                    // remaining chars and returns 0 on a mismatch.
                    // NOTE: this local 'len' shadows the field of the same name.
                    int len = matchRest(text, offset[0], limit, trial);

                    if (incremental) {
                        // maxLen is the number of chars available between
                        // the start position and limit.
                        int maxLen = forward ? limit-offset[0] : offset[0]-limit;
                        if (len == maxLen) {
                            // We have successfully matched but only up to limit.
                            return U_PARTIAL_MATCH;
                        }
                    }

                    if (len == trial.length()) {
                        // We have successfully matched the whole string.
                        if (len > highWaterLength) {
                            highWaterLength = len;
                        }
                        // In the forward direction we know strings
                        // are sorted so we can bail early.
                        if (forward && len < highWaterLength) {
                            break;
                        }
                        continue;
                    }
                }

                // We've checked all strings without a partial match.
                // If we have full matches, return the longest one.
                if (highWaterLength != 0) {
                    // Advance (or retreat, when matching backward) past
                    // the longest matched string.
                    offset[0] += forward ? highWaterLength : -highWaterLength;
                    return U_MATCH;
                }
            }
            // No string matched completely; defer to the superclass.
            return super.matches(text, offset, limit, incremental);
        }
    }
882 
883     /**
884      * Returns the longest match for s in text at the given position.
885      * If limit > start then match forward from start+1 to limit
886      * matching all characters except s.charAt(0).  If limit < start,
887      * go backward starting from start-1 matching all characters
888      * except s.charAt(s.length()-1).  This method assumes that the
889      * first character, text.charAt(start), matches s, so it does not
890      * check it.
891      * @param text the text to match
892      * @param start the first character to match.  In the forward
893      * direction, text.charAt(start) is matched against s.charAt(0).
894      * In the reverse direction, it is matched against
895      * s.charAt(s.length()-1).
896      * @param limit the limit offset for matching, either last+1 in
897      * the forward direction, or last-1 in the reverse direction,
898      * where last is the index of the last character to match.
899      * @return If part of s matches up to the limit, return |limit -
900      * start|.  If all of s matches before reaching the limit, return
901      * s.length().  If there is a mismatch between s and text, return
902      * 0
903      */
904     private static int matchRest (Replaceable text, int start, int limit, String   s) {
905         int maxLen;
906         int slen = s.length();
907         if (start < limit) {
908             maxLen = limit - start;
909             if (maxLen > slen) maxLen = slen;
910             for (int i = 1; i < maxLen; ++i) {
911                 if (text.charAt(start + i) != s.charAt(i)) return 0;
912             }
913         } else {
914             maxLen = start - limit;
915             if (maxLen > slen) maxLen = slen;
916             --slen; // <=> slen = s.length() - 1;
917             for (int i = 1; i < maxLen; ++i) {
918                 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
919             }
920         }
921         return maxLen;
922     }
923 
924 //#ifndef FOUNDATION
925 //##    /**
926 //##     * Tests whether the text matches at the offset. If so, returns the end of the longest substring that it matches. If not, returns -1. For now, an internal routine.
927 //##     * @internal
928 //##     * @deprecated This API is ICU internal only.
929 //##     */
930 //##    public int matchesAt(CharSequence text, int offset) {
931 //##        int len = -1;
932 //##        strings:
933 //##        if (strings.size() != 0) {
934 //##            char firstChar = text.charAt(offset);
935 //##            String trial = null;
936 //##            // find the first string starting with firstChar
937 //##            Iterator it = strings.iterator();
938 //##            while (it.hasNext()) {
939 //##                trial = (String) it.next();
940 //##                char firstStringChar = trial.charAt(0);
941 //##                if (firstStringChar < firstChar) continue;
942 //##                if (firstStringChar > firstChar) break strings;
943 //##            }
944 //##            // now keep checking string until we get the longest one
945 //##            while (true) {
946 //##                int tempLen = CollectionUtilities.matchesAt(text, offset, trial);
947 //##                if (len > tempLen) break strings;
948 //##                len = tempLen;
949 //##                if (!it.hasNext()) break;
950 //##                trial = (String) it.next();
951 //##            }
952 //##        }
953 //##        if (len < 2) {
954 //##            int cp = UTF16.charAt(text, offset);
955 //##            if (contains(cp)) {
956 //##                len = UTF16.getCharCount(cp);
957 //##            }
958 //##        }
959 //##        return offset+len;
960 //##    }
961 //#endif
962 
963     /**
964      * Implementation of UnicodeMatcher API.  Union the set of all
965      * characters that may be matched by this object into the given
966      * set.
967      * @param toUnionTo the set into which to union the source characters
968      * @stable ICU 2.2
969      */
970     public void addMatchSetTo(UnicodeSet toUnionTo) {
971         toUnionTo.addAll(this);
972     }
973 
974     /**
975      * Returns the index of the given character within this set, where
976      * the set is ordered by ascending code point.  If the character
977      * is not in this set, return -1.  The inverse of this method is
978      * <code>charAt()</code>.
979      * @return an index from 0..size()-1, or -1
980      * @stable ICU 2.0
981      */
982     public int indexOf(int c) {
983         if (c < MIN_VALUE || c > MAX_VALUE) {
984             throw new IllegalArgumentException  ("Invalid code point U+" + Utility.hex(c, 6));
985         }
986         int i = 0;
987         int n = 0;
988         for (;;) {
989             int start = list[i++];
990             if (c < start) {
991                 return -1;
992             }
993             int limit = list[i++];
994             if (c < limit) {
995                 return n + c - start;
996             }
997             n += limit - start;
998         }
999     }
1000
1001    /**
1002     * Returns the character at the given index within this set, where
1003     * the set is ordered by ascending code point.  If the index is
1004     * out of range, return -1.  The inverse of this method is
1005     * <code>indexOf()</code>.
1006     * @param index an index from 0..size()-1
1007     * @return the character at the given index, or -1.
1008     * @stable ICU 2.0
1009     */
1010    public int charAt(int index) {
1011        if (index >= 0) {
1012            // len2 is the largest even integer <= len, that is, it is len
1013            // for even values and len-1 for odd values.  With odd values
1014            // the last entry is UNICODESET_HIGH.
1015            int len2 = len & ~1;
1016            for (int i=0; i < len2;) {
1017                int start = list[i++];
1018                int count = list[i++] - start;
1019                if (index < count) {
1020                    return start + index;
1021                }
1022                index -= count;
1023            }
1024        }
1025        return -1;
1026    }
1027
1028    /**
1029     * Adds the specified range to this set if it is not already
1030     * present.  If this set already contains the specified range,
1031     * the call leaves this set unchanged.  If <code>end > start</code>
1032     * then an empty range is added, leaving the set unchanged.
1033     *
1034     * @param start first character, inclusive, of range to be added
1035     * to this set.
1036     * @param end last character, inclusive, of range to be added
1037     * to this set.
1038     * @stable ICU 2.0
1039     */
1040    public UnicodeSet add(int start, int end) {
1041        checkFrozen();
1042        return add_unchecked(start, end);
1043    }
1044    
1045    // for internal use, after checkFrozen has been called
1046    private UnicodeSet add_unchecked(int start, int end) {
1047        if (start < MIN_VALUE || start > MAX_VALUE) {
1048            throw new IllegalArgumentException  ("Invalid code point U+" + Utility.hex(start, 6));
1049        }
1050        if (end < MIN_VALUE || end > MAX_VALUE) {
1051            throw new IllegalArgumentException  ("Invalid code point U+" + Utility.hex(end, 6));
1052        }
1053        if (start < end) {
1054            add(range(start, end), 2, 0);
1055        } else if (start == end) {
1056            add(start);
1057        }
1058        return this;
1059    }
1060
1061//    /**
1062//     * Format out the inversion list as a string, for debugging.  Uncomment when
1063//     * needed.
1064//     */
1065//    public final String dump() {
1066//        StringBuffer buf = new StringBuffer("[");
1067//        for (int i=0; i<len; ++i) {
1068//            if (i != 0) buf.append(", ");
1069//            int c = list[i];
1070//            //if (c <= 0x7F && c != '\n' && c != '\r' && c != '\t' && c != ' ') {
1071//            //    buf.append((char) c);
1072//            //} else {
1073//                buf.append("U+").append(Utility.hex(c, (c<0x10000)?4:6));
1074//            //}
1075//        }
1076//        buf.append("]");
1077//        return buf.toString();
1078//    }
1079
1080    /**
1081     * Adds the specified character to this set if it is not already
1082     * present.  If this set already contains the specified character,
1083     * the call leaves this set unchanged.
1084     * @stable ICU 2.0
1085     */
1086    public final UnicodeSet add(int c) {
1087        checkFrozen();
1088        return add_unchecked(c);
1089    }
1090    
    // Adds the single code point c directly to the inversion list, without
    // checking whether the set is frozen.
    // For internal use only, after checkFrozen has been called.
    // Throws IllegalArgumentException if c is outside MIN_VALUE..MAX_VALUE.
    // Returns this object, for chaining.
    private final UnicodeSet add_unchecked(int c) {
        if (c < MIN_VALUE || c > MAX_VALUE) {
            throw new IllegalArgumentException  ("Invalid code point U+" + Utility.hex(c, 6));
        }

        // find smallest i such that c < list[i]
        // if odd, then it is IN the set
        // if even, then it is OUT of the set
        int i = findCodePoint(c);

        // already in set?
        if ((i & 1) != 0) return this;

        // HIGH is 0x110000
        // assert(list[len-1] == HIGH);

        // empty = [HIGH]
        // [start_0, limit_0, start_1, limit_1, HIGH]

        // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
        //                             ^
        //                             list[i]

        // i == 0 means c is before the first range

        // Case 1: c sits immediately below the start of the next range
        // (or immediately below the HIGH terminator), so that entry is
        // simply moved down by one.
        if (c == list[i]-1) {
            // c is before start of next range
            list[i] = c;
            // if we touched the HIGH mark, then add a new one
            if (c == MAX_VALUE) {
                ensureCapacity(len+1);
                list[len++] = HIGH;
            }
            if (i > 0 && c == list[i-1]) {
                // collapse adjacent ranges

                // [..., start_k-1, c, c, limit_k, ..., HIGH]
                //                     ^
                //                     list[i]

                // Drop the two equal entries list[i-1] and list[i] by
                // shifting the tail of the list down over them.
                System.arraycopy(list, i+1, list, i-1, len-i-1);
                len -= 2;
            }
        }

        // Case 2: c sits immediately at the limit of the previous range,
        // so that range grows by one.
        else if (i > 0 && c == list[i-1]) {
            // c is after end of prior range
            list[i-1]++;
            // no need to check for collapse here
        }

        // Case 3: c is isolated; insert a new one-element range.
        else {
            // At this point we know the new char is not adjacent to
            // any existing ranges, and it is not 10FFFF.


            // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
            //                             ^
            //                             list[i]

            // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
            //                             ^
            //                             list[i]

            // Don't use ensureCapacity() to save on copying.
            // NOTE: This has no measurable impact on performance,
            // but it might help in some usage patterns.
            if (len+2 > list.length) {
                // Grow into a new array, copying the head as-is and the
                // tail already shifted up by two slots.
                int[] temp = new int[len + 2 + GROW_EXTRA];
                if (i != 0) System.arraycopy(list, 0, temp, 0, i);
                System.arraycopy(list, i, temp, i+2, len-i);
                list = temp;
            } else {
                // Enough room: shift the tail up by two slots in place.
                System.arraycopy(list, i, list, i+2, len-i);
            }

            list[i] = c;
            list[i+1] = c+1;
            len += 2;
        }

        // The set changed, so discard the stored pattern.
        pat = null;
        return this;
    }
1175
1176    /**
1177     * Adds the specified multicharacter to this set if it is not already
1178     * present.  If this set already contains the multicharacter,
1179     * the call leaves this set unchanged.
1180     * Thus "ch" => {"ch"}
1181     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1182     * @param s the source string
1183     * @return this object, for chaining
1184     * @stable ICU 2.0
1185     */
1186    public final UnicodeSet add(String   s) {
1187        checkFrozen();
1188        int cp = getSingleCP(s);
1189        if (cp < 0) {
1190            strings.add(s);
1191            pat = null;
1192        } else {
1193            add_unchecked(cp, cp);
1194        }
1195        return this;
1196    }
1197
1198    /**
1199     * @return a code point IF the string consists of a single one.
1200     * otherwise returns -1.
1201     * @param string to test
1202     */
1203    private static int getSingleCP(String   s) {
1204        if (s.length() < 1) {
1205            throw new IllegalArgumentException  ("Can't use zero-length strings in UnicodeSet");
1206        }
1207        if (s.length() > 2) return -1;
1208        if (s.length() == 1) return s.charAt(0);
1209
1210        // at this point, len = 2
1211        int cp = UTF16.charAt(s, 0);
1212        if (cp > 0xFFFF) { // is surrogate pair
1213            return cp;
1214        }
1215        return -1;
1216    }
1217
1218    /**
1219     * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1220     * If this set already any particular character, it has no effect on that character.
1221     * @param s the source string
1222     * @return this object, for chaining
1223     * @stable ICU 2.0
1224     */
1225    public final UnicodeSet addAll(String   s) {
1226        checkFrozen();
1227        int cp;
1228        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1229            cp = UTF16.charAt(s, i);
1230            add_unchecked(cp, cp);
1231        }
1232        return this;
1233    }
1234
1235    /**
1236     * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1237     * If this set already any particular character, it has no effect on that character.
1238     * @param s the source string
1239     * @return this object, for chaining
1240     * @stable ICU 2.0
1241     */
1242    public final UnicodeSet retainAll(String   s) {
1243        return retainAll(fromAll(s));
1244    }
1245
1246    /**
1247     * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1248     * If this set already any particular character, it has no effect on that character.
1249     * @param s the source string
1250     * @return this object, for chaining
1251     * @stable ICU 2.0
1252     */
1253    public final UnicodeSet complementAll(String   s) {
1254        return complementAll(fromAll(s));
1255    }
1256
1257    /**
1258     * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1259     * If this set already any particular character, it has no effect on that character.
1260     * @param s the source string
1261     * @return this object, for chaining
1262     * @stable ICU 2.0
1263     */
1264    public final UnicodeSet removeAll(String   s) {
1265        return removeAll(fromAll(s));
1266    }
1267
1268    /**
1269     * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1270     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1271     * @param s the source string
1272     * @return a newly created set containing the given string
1273     * @stable ICU 2.0
1274     */
1275    public static UnicodeSet from(String   s) {
1276        return new UnicodeSet().add(s);
1277    }
1278
1279
1280    /**
1281     * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1282     * @param s the source string
1283     * @return a newly created set containing the given characters
1284     * @stable ICU 2.0
1285     */
1286    public static UnicodeSet fromAll(String   s) {
1287        return new UnicodeSet().addAll(s);
1288    }
1289
1290
1291    /**
1292     * Retain only the elements in this set that are contained in the
1293     * specified range.  If <code>end > start</code> then an empty range is
1294     * retained, leaving the set empty.
1295     *
1296     * @param start first character, inclusive, of range to be retained
1297     * to this set.
1298     * @param end last character, inclusive, of range to be retained
1299     * to this set.
1300     * @stable ICU 2.0
1301     */
1302    public UnicodeSet retain(int start, int end) {
1303        checkFrozen();
1304        if (start < MIN_VALUE || start > MAX_VALUE) {
1305            throw new IllegalArgumentException  ("Invalid code point U+" + Utility.hex(start, 6));
1306        }
1307        if (end < MIN_VALUE || end > MAX_VALUE) {
1308            throw new IllegalArgumentException  ("Invalid code point U+" + Utility.hex(end, 6));
1309        }
1310        if (start <= end) {
1311            retain(range(start, end), 2, 0);
1312        } else {
1313            clear();
1314        }
1315        return this;
1316    }
1317
1318    /**
1319     * Retain the specified character from this set if it is present.
1320     * Upon return this set will be empty if it did not contain c, or
1321     * will only contain c if it did contain c.
1322     * @param c the character to be retained
1323     * @return this object, for chaining
1324     * @stable ICU 2.0
1325     */
1326    public final UnicodeSet retain(int c) {
1327        return retain(c, c);
1328    }
1329
1330    /**
1331     * Retain the specified string in this set if it is present.
1332     * Upon return this set will be empty if it did not contain s, or
1333     * will only contain s if it did contain s.
1334     * @param s the string to be retained
1335     * @return this object, for chaining
1336     * @stable ICU 2.0
1337     */
1338    public final UnicodeSet retain(String   s) {
1339        int cp = getSingleCP(s);
1340        if (cp < 0) {
1341            boolean isIn = strings.contains(s);
1342            if (isIn && size() == 1) {
1343                return this;
1344            }
1345            clear();
1346            strings.add(s);
1347            pat = null;
1348        } else {
1349            retain(cp, cp);
1350        }
1351        return this;
1352    }
1353
1354    /**
1355     * Removes the specified range from this set if it is present.
1356     * The set will not contain the specified range once the call
1357     * returns.  If <code>end > start</code> then an empty range is
1358     * removed, leaving the set unchanged.
1359     *
1360     * @param start first character, inclusive, of range to be removed
1361     * from this set.
1362     * @param end last character, inclusive, of range to be removed
1363     * from this set.
1364     * @stable ICU 2.0
1365     */
1366    public UnicodeSet remove(int start, int end) {
1367        checkFrozen();
1368        if (start < MIN_VALUE || start > MAX_VALUE) {
1369            throw new IllegalArgumentException  ("Invalid code point U+" + Utility.hex(start, 6));
1370        }
1371        if (end < MIN_VALUE || end > MAX_VALUE) {
1372            throw new IllegalArgumentException  ("Invalid code point U+" + Utility.hex(end, 6));
1373        }
1374        if (start <= end) {
1375            retain(range(start, end), 2, 2);
1376        }
1377        return this;
1378    }
1379
1380    /**
1381     * Removes the specified character from this set if it is present.
1382     * The set will not contain the specified character once the call
1383     * returns.
1384     * @param c the character to be removed
1385     * @return this object, for chaining
1386     * @stable ICU 2.0
1387     */
1388    public final UnicodeSet remove(int c) {
1389        return remove(c, c);
1390    }
1391
1392    /**
1393     * Removes the specified string from this set if it is present.
1394     * The set will not contain the specified string once the call
1395     * returns.
1396     * @param s the string to be removed
1397     * @return this object, for chaining
1398     * @stable ICU 2.0
1399     */
1400    public final UnicodeSet remove(String   s) {
1401        int cp = getSingleCP(s);
1402        if (cp < 0) {
1403            strings.remove(s);
1404            pat = null;
1405        } else {
1406            remove(cp, cp);
1407        }
1408        return this;
1409    }
1410
1411    /**
1412     * Complements the specified range in this set.  Any character in
1413     * the range will be removed if it is in this set, or will be
1414     * added if it is not in this set.  If <code>end > start</code>
1415     * then an empty range is complemented, leaving the set unchanged.
1416     *
1417     * @param start first character, inclusive, of range to be removed
1418     * from this set.
1419     * @param end last character, inclusive, of range to be removed
1420     * from this set.
1421     * @stable ICU 2.0
1422     */
1423    public UnicodeSet complement(int start, int end) {
1424        checkFrozen();
1425        if (start < MIN_VALUE || start > MAX_VALUE) {
1426            throw new IllegalArgumentException  ("Invalid code point U+" + Utility.hex(start, 6));
1427        }
1428        if (end < MIN_VALUE || end > MAX_VALUE) {
1429            throw new IllegalArgumentException  ("Invalid code point U+" + Utility.hex(end, 6));
1430        }
1431        if (start <= end) {
1432            xor(range(start, end), 2, 0);
1433        }
1434        pat = null;
1435        return this;
1436    }
1437
1438    /**
1439     * Complements the specified character in this set.  The character
1440     * will be removed if it is in this set, or will be added if it is
1441     * not in this set.
1442     * @stable ICU 2.0
1443     */
1444    public final UnicodeSet complement(int c) {
1445        return complement(c, c);
1446    }
1447
1448    /**
1449     * This is equivalent to
1450     * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1451     * @stable ICU 2.0
1452     */
1453    public UnicodeSet complement() {
1454        checkFrozen();
1455        if (list[0] == LOW) {
1456            System.arraycopy(list, 1, list, 0, len-1);
1457            --len;
1458        } else {
1459            ensureCapacity(len+1);
1460            System.arraycopy(list, 0, list, 1, len);
1461            list[0] = LOW;
1462            ++len;
1463        }
1464        pat = null;
1465        return this;
1466    }
1467
1468    /**
1469     * Complement the specified string in this set.
1470     * The set will not contain the specified string once the call
1471     * returns.
1472     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1473     * @param s the string to complement
1474     * @return this object, for chaining
1475     * @stable ICU 2.0
1476     */
1477    public final UnicodeSet complement(String   s) {
1478        checkFrozen();
1479        int cp = getSingleCP(s);
1480        if (cp < 0) {
1481            if (strings.contains(s)) strings.remove(s);
1482            else strings.add(s);
1483            pat = null;
1484        } else {
1485            complement(cp, cp);
1486        }
1487        return this;
1488    }
1489
1490    /**
1491     * Returns true if this set contains the given character.
1492     * @param c character to be checked for containment
1493     * @return true if the test condition is met
1494     * @stable ICU 2.0
1495     */
1496    public boolean contains(int c) {
1497        if (c < MIN_VALUE || c > MAX_VALUE) {
1498            throw new IllegalArgumentException  ("Invalid code point U+" + Utility.hex(c, 6));
1499        }
1500
1501        /*
1502        // Set i to the index of the start item greater than ch
1503        // We know we will terminate without length test!
1504        int i = -1;
1505        while (true) {
1506            if (c < list[++i]) break;
1507        }
1508        */
1509
1510        int i = findCodePoint(c);
1511
1512        return ((i & 1) != 0); // return true if odd
1513    }
1514
1515    /**
1516     * Returns the smallest value i such that c < list[i].  Caller
1517     * must ensure that c is a legal value or this method will enter
1518     * an infinite loop.  This method performs a binary search.
1519     * @param c a character in the range MIN_VALUE..MAX_VALUE
1520     * inclusive
1521     * @return the smallest integer i in the range 0..len-1,
1522     * inclusive, such that c < list[i]
1523     */
1524    private final int findCodePoint(int c) {
1525        /* Examples:
1526                                           findCodePoint(c)
1527           set              list[]         c=0 1 3 4 7 8
1528           ===              ==============   ===========
1529           []               [110000]         0 0 0 0 0 0
1530           [-]  [0, 4, 110000]   1 1 1 2 2 2
1531           [-]  [4, 8, 110000]   0 0 0 1 1 2
1532           [:all:]          [0, 110000]      1 1 1 1 1 1
1533         */
1534
1535        // Return the smallest i such that c < list[i].  Assume
1536        // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
1537        if (c < list[0]) return 0;
1538        // High runner test.  c is often after the last range, so an
1539        // initial check for this condition pays off.
1540        if (len >= 2 && c >= list[len-2]) return len-1;
1541        int lo = 0;
1542        int hi = len - 1;
1543        // invariant: c >= list[lo]
1544        // invariant: c < list[hi]
1545        for (;;) {
1546            int i = (lo + hi) >>> 1;
1547            if (i == lo) return hi;
1548            if (c < list[i]) {
1549                hi = i;
1550            } else {
1551                lo = i;
1552            }
1553        }
1554    }
1555
1556//    //----------------------------------------------------------------
1557//    // Unrolled binary search
1558//    //----------------------------------------------------------------
1559//
1560//    private int validLen = -1; // validated value of len
1561//    private int topOfLow;
1562//    private int topOfHigh;
1563//    private int power;
1564//    private int deltaStart;
1565//
1566//    private void validate() {
1567//        if (len <= 1) {
1568//            throw new IllegalArgumentException("list.len==" + len + "; must be >1");
1569//        }
1570//
1571//        // find greatest power of 2 less than or equal to len
1572//        for (power = exp2.length-1; power > 0 && exp2[power] > len; power--) {}
1573//
1574//        // assert(exp2[power] <= len);
1575//
1576//        // determine the starting points
1577//        topOfLow = exp2[power] - 1;
1578//        topOfHigh = len - 1;
1579//        deltaStart = exp2[power-1];
1580//        validLen = len;
1581//    }
1582//
1583//    private static final int exp2[] = {
1584//        0x1, 0x2, 0x4, 0x8,
1585//        0x10, 0x20, 0x40, 0x80,
1586//        0x100, 0x200, 0x400, 0x800,
1587//        0x1000, 0x2000, 0x4000, 0x8000,
1588//        0x10000, 0x20000, 0x40000, 0x80000,
1589//        0x100000, 0x200000, 0x400000, 0x800000,
1590//        0x1000000, 0x2000000, 0x4000000, 0x8000000,
1591//        0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java
1592//    };
1593//
1594//    /**
1595//     * Unrolled lowest index GT.
1596//     */
1597//    private final int leastIndexGT(int searchValue) {
1598//
1599//        if (len != validLen) {
1600//            if (len == 1) return 0;
1601//            validate();
1602//        }
1603//        int temp;
1604//
1605//        // set up initial range to search. Each subrange is a power of two in length
1606//        int high = searchValue < list[topOfLow] ? topOfLow : topOfHigh;
1607//
//        // Completely unrolled binary search, following "Programming Pearls"
1609//        // Each case deliberately falls through to the next
1610//        // Logically, list[-1] < all_search_values && list[count] > all_search_values
1611//        // although the values -1 and count are never actually touched.
1612//
1613//        // The bounds at each point are low & high,
1614//        // where low == high - delta*2
1615//        // so high - delta is the midpoint
1616//
1617//        // The invariant AFTER each line is that list[low] < searchValue <= list[high]
1618//
1619//        switch (power) {
1620//        //case 31: if (searchValue < list[temp = high-0x40000000]) high = temp; // no unsigned int in Java
1621//        case 30: if (searchValue < list[temp = high-0x20000000]) high = temp;
1622//        case 29: if (searchValue < list[temp = high-0x10000000]) high = temp;
1623//
1624//        case 28: if (searchValue < list[temp = high- 0x8000000]) high = temp;
1625//        case 27: if (searchValue < list[temp = high- 0x4000000]) high = temp;
1626//        case 26: if (searchValue < list[temp = high- 0x2000000]) high = temp;
1627//        case 25: if (searchValue < list[temp = high- 0x1000000]) high = temp;
1628//
1629//        case 24: if (searchValue < list[temp = high-  0x800000]) high = temp;
1630//        case 23: if (searchValue < list[temp = high-  0x400000]) high = temp;
1631//        case 22: if (searchValue < list[temp = high-  0x200000]) high = temp;
1632//        case 21: if (searchValue < list[temp = high-  0x100000]) high = temp;
1633//
1634//        case 20: if (searchValue < list[temp = high-   0x80000]) high = temp;
1635//        case 19: if (searchValue < list[temp = high-   0x40000]) high = temp;
1636//        case 18: if (searchValue < list[temp = high-   0x20000]) high = temp;
1637//        case 17: if (searchValue < list[temp = high-   0x10000]) high = temp;
1638//
1639//        case 16: if (searchValue < list[temp = high-    0x8000]) high = temp;
1640//        case 15: if (searchValue < list[temp = high-    0x4000]) high = temp;
1641//        case 14: if (searchValue < list[temp = high-    0x2000]) high = temp;
1642//        case 13: if (searchValue < list[temp = high-    0x1000]) high = temp;
1643//
1644//        case 12: if (searchValue < list[temp = high-     0x800]) high = temp;
1645//        case 11: if (searchValue < list[temp = high-     0x400]) high = temp;
1646//        case 10: if (searchValue < list[temp = high-     0x200]) high = temp;
1647//        case  9: if (searchValue < list[temp = high-     0x100]) high = temp;
1648//
1649//        case  8: if (searchValue < list[temp = high-      0x80]) high = temp;
1650//        case  7: if (searchValue < list[temp = high-      0x40]) high = temp;
1651//        case  6: if (searchValue < list[temp = high-      0x20]) high = temp;
1652//        case  5: if (searchValue < list[temp = high-      0x10]) high = temp;
1653//
1654//        case  4: if (searchValue < list[temp = high-       0x8]) high = temp;
1655//        case  3: if (searchValue < list[temp = high-       0x4]) high = temp;
1656//        case  2: if (searchValue < list[temp = high-       0x2]) high = temp;
1657//        case  1: if (searchValue < list[temp = high-       0x1]) high = temp;
1658//        }
1659//
1660//        return high;
1661//    }
1662//
1663//    // For debugging only
1664//    public int len() {
1665//        return len;
1666//    }
1667//
1668//    //----------------------------------------------------------------
1669//    //----------------------------------------------------------------
1670
1671    /**
1672     * Returns true if this set contains every character
1673     * of the given range.
1674     * @param start first character, inclusive, of the range
1675     * @param end last character, inclusive, of the range
1676     * @return true if the test condition is met
1677     * @stable ICU 2.0
1678     */
1679    public boolean contains(int start, int end) {
1680        if (start < MIN_VALUE || start > MAX_VALUE) {
1681            throw new IllegalArgumentException  ("Invalid code point U+" + Utility.hex(start, 6));
1682        }
1683        if (end < MIN_VALUE || end > MAX_VALUE) {
1684            throw new IllegalArgumentException  ("Invalid code point U+" + Utility.hex(end, 6));
1685        }
1686        //int i = -1;
1687        //while (true) {
1688        //    if (start < list[++i]) break;
1689        //}
1690        int i = findCodePoint(start);
1691        return ((i & 1) != 0 && end < list[i]);
1692    }
1693
1694    /**
1695     * Returns <tt>true</tt> if this set contains the given
1696     * multicharacter string.
1697     * @param s string to be checked for containment
1698     * @return <tt>true</tt> if this set contains the specified string
1699     * @stable ICU 2.0
1700     */
1701    public final boolean contains(String   s) {
1702
1703        int cp = getSingleCP(s);
1704        if (cp < 0) {
1705            return strings.contains(s);
1706        } else {
1707            return contains(cp);
1708        }
1709    }
1710
1711    /**
1712     * Returns true if this set contains all the characters and strings
1713     * of the given set.
1714     * @param c set to be checked for containment
1715     * @return true if the test condition is met
1716     * @stable ICU 2.0
1717     */
1718    public boolean containsAll(UnicodeSet c) {
1719        // The specified set is a subset if all of its pairs are contained in
1720        // this set.  It's possible to code this more efficiently in terms of
1721        // direct manipulation of the inversion lists if the need arises.
1722        int n = c.getRangeCount();
1723        for (int i=0; i<n; ++i) {
1724            if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
1725                return false;
1726            }
1727        }
1728        if (!strings.containsAll(c.strings)) return false;
1729        return true;
1730    }
1731
1732    /**
1733     * Returns true if there is a partition of the string such that this set contains each of the partitioned strings.
1734     * For example, for the Unicode set [a{bc}{cd}]<br>
1735     * containsAll is true for each of: "a", "bc", ""cdbca"<br>
1736     * containsAll is false for each of: "acb", "bcda", "bcx"<br>
1737     * @param s string containing characters to be checked for containment
1738     * @return true if the test condition is met
1739     * @stable ICU 2.0
1740     */
1741     public boolean containsAll(String   s) {
1742        int cp;
1743        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1744            cp = UTF16.charAt(s, i);
1745            if (!contains(cp))  {
1746                if (strings.size() == 0) {
1747                    return false;
1748                }
1749                return containsAll(s, 0);
1750            }
1751        }
1752        return true;
1753    }
1754
1755    /**
1756     * Recursive routine called if we fail to find a match in containsAll, and there are strings
1757     * @param s source string
1758     * @param i point to match to the end on
1759     * @return true if ok
1760     */
1761    private boolean containsAll(String   s, int i) {
1762        if (i >= s.length()) {
1763            return true;
1764        }
1765        int  cp= UTF16.charAt(s, i);
1766        if (contains(cp) && containsAll(s, i+UTF16.getCharCount(cp))) {
1767            return true;
1768        }
1769        
1770        Iterator it = strings.iterator();
1771        while (it.hasNext()) {
1772            String   setStr = (String  )it.next();
1773            if (s.startsWith(setStr, i) &&  containsAll(s, i+setStr.length())) {
1774                return true;
1775            }
1776        }
1777        return false;
1778        
1779    }
1780
1781    /**
1782     * @return regex pattern equivalent to this UnicodeSet
1783     * @internal
1784     * @deprecated This API is ICU internal only.
1785     */
1786    public String   getRegexEquivalent() {
1787        if (strings.size() == 0) return toString();
1788        StringBuffer   result = new StringBuffer  ("(?:");
1789        _generatePattern(result, true, false);
1790        Iterator it = strings.iterator();
1791        while (it.hasNext()) {
1792            result.append('|');
1793            _appendToPat(result, (String  ) it.next(), true);
1794        }
1795        return result.append(")").toString();
1796    }
1797
1798    /**
1799     * Returns true if this set contains none of the characters
1800     * of the given range.
1801     * @param start first character, inclusive, of the range
1802     * @param end last character, inclusive, of the range
1803     * @return true if the test condition is met
1804     * @stable ICU 2.0
1805     */
1806    public boolean containsNone(int start, int end) {
1807        if (start < MIN_VALUE || start > MAX_VALUE) {
1808            throw new IllegalArgumentException  ("Invalid code point U+" + Utility.hex(start, 6));
1809        }
1810        if (end < MIN_VALUE || end > MAX_VALUE) {
1811            throw new IllegalArgumentException  ("Invalid code point U+" + Utility.hex(end, 6));
1812        }
1813        int i = -1;
1814        while (true) {
1815            if (start < list[++i]) break;
1816        }
1817        return ((i & 1) == 0 && end < list[i]);
1818    }
1819
1820    /**
1821     * Returns true if none of the characters or strings in this UnicodeSet appears in the string.
1822     * For example, for the Unicode set [a{bc}{cd}]<br>
1823     * containsNone is true for: "xy", "cb"<br>
1824     * containsNone is false for: "a", "bc", "bcd"<br>
1825     * @param c set to be checked for containment
1826     * @return true if the test condition is met
1827     * @stable ICU 2.0
1828     */
1829    public boolean containsNone(UnicodeSet c) {
1830        // The specified set is a subset if all of its pairs are contained in
1831        // this set.  It's possible to code this more efficiently in terms of
1832        // direct manipulation of the inversion lists if the need arises.
1833        int n = c.getRangeCount();
1834        for (int i=0; i<n; ++i) {
1835            if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
1836                return false;
1837            }
1838        }
1839        if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, c.strings)) return false;
1840        return true;
1841    }
1842
1843    /**
1844     * Returns true if this set contains none of the characters
1845     * of the given string.
1846     * @param s string containing characters to be checked for containment
1847     * @return true if the test condition is met
1848     * @stable ICU 2.0
1849     */
1850    public boolean containsNone(String   s) {
1851        int cp;
1852        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1853            cp = UTF16.charAt(s, i);
1854            if (contains(cp)) return false;
1855        }
1856        if (strings.size() == 0) return true;
1857        // do a last check to make sure no strings are in.
1858        for (Iterator it = strings.iterator(); it.hasNext();) {
1859            String   item = (String  )it.next();
1860            if (s.indexOf(item) >= 0) return false;
1861        }
1862        return true;
1863    }
1864
1865    /**
1866     * Returns true if this set contains one or more of the characters
1867     * in the given range.
1868     * @param start first character, inclusive, of the range
1869     * @param end last character, inclusive, of the range
1870     * @return true if the condition is met
1871     * @stable ICU 2.0
1872     */
1873    public final boolean containsSome(int start, int end) {
1874        return !containsNone(start, end);
1875    }
1876
1877    /**
1878     * Returns true if this set contains one or more of the characters
1879     * and strings of the given set.
1880     * @param s set to be checked for containment
1881     * @return true if the condition is met
1882     * @stable ICU 2.0
1883     */
1884    public final boolean containsSome(UnicodeSet s) {
1885        return !containsNone(s);
1886    }
1887
1888    /**
1889     * Returns true if this set contains one or more of the characters
1890     * of the given string.
1891     * @param s string containing characters to be checked for containment
1892     * @return true if the condition is met
1893     * @stable ICU 2.0
1894     */
1895    public final boolean containsSome(String   s) {
1896        return !containsNone(s);
1897    }
1898
1899
1900    /**
1901     * Adds all of the elements in the specified set to this set if
1902     * they're not already present.  This operation effectively
1903     * modifies this set so that its value is the <i>union</i> of the two
1904     * sets.  The behavior of this operation is unspecified if the specified
1905     * collection is modified while the operation is in progress.
1906     *
1907     * @param c set whose elements are to be added to this set.
1908     * @stable ICU 2.0
1909     */
1910    public UnicodeSet addAll(UnicodeSet c) {
1911        checkFrozen();
1912        add(c.list, c.len, 0);
1913        strings.addAll(c.strings);
1914        return this;
1915    }
1916
1917    /**
1918     * Retains only the elements in this set that are contained in the
1919     * specified set.  In other words, removes from this set all of
1920     * its elements that are not contained in the specified set.  This
1921     * operation effectively modifies this set so that its value is
1922     * the <i>intersection</i> of the two sets.
1923     *
1924     * @param c set that defines which elements this set will retain.
1925     * @stable ICU 2.0
1926     */
1927    public UnicodeSet retainAll(UnicodeSet c) {
1928        checkFrozen();
1929        retain(c.list, c.len, 0);
1930        strings.retainAll(c.strings);
1931        return this;
1932    }
1933
1934    /**
1935     * Removes from this set all of its elements that are contained in the
1936     * specified set.  This operation effectively modifies this
1937     * set so that its value is the <i>asymmetric set difference</i> of
1938     * the two sets.
1939     *
1940     * @param c set that defines which elements will be removed from
1941     *          this set.
1942     * @stable ICU 2.0
1943     */
1944    public UnicodeSet removeAll(UnicodeSet c) {
1945        checkFrozen();
1946        retain(c.list, c.len, 2);
1947        strings.removeAll(c.strings);
1948        return this;
1949    }
1950
1951    /**
1952     * Complements in this set all elements contained in the specified
1953     * set.  Any character in the other set will be removed if it is
1954     * in this set, or will be added if it is not in this set.
1955     *
1956     * @param c set that defines which elements will be complemented from
1957     *          this set.
1958     * @stable ICU 2.0
1959     */
1960    public UnicodeSet complementAll(UnicodeSet c) {
1961        checkFrozen();
1962        xor(c.list, c.len, 0);
1963        SortedSetRelation.doOperation(strings, SortedSetRelation.COMPLEMENTALL, c.strings);
1964        return this;
1965    }
1966
1967    /**
1968     * Removes all of the elements from this set.  This set will be
1969     * empty after this call returns.
1970     * @stable ICU 2.0
1971     */
1972    public UnicodeSet clear() {
1973        checkFrozen();
1974        list[0] = HIGH;
1975        len = 1;
1976        pat = null;
1977        strings.clear();
1978        return this;
1979    }
1980
1981    /**
1982     * Iteration method that returns the number of ranges contained in
1983     * this set.
1984     * @see #getRangeStart
1985     * @see #getRangeEnd
1986     * @stable ICU 2.0
1987     */
1988    public int getRangeCount() {
1989        return len/2;
1990    }
1991
1992    /**
1993     * Iteration method that returns the first character in the
1994     * specified range of this set.
1995     * @exception ArrayIndexOutOfBoundsException if index is outside
1996     * the range <code>0..getRangeCount()-1</code>
1997     * @see #getRangeCount
1998     * @see #getRangeEnd
1999     * @stable ICU 2.0
2000     */
2001    public int getRangeStart(int index) {
2002        return list[index*2];
2003    }
2004
2005    /**
2006     * Iteration method that returns the last character in the
2007     * specified range of this set.
2008     * @exception ArrayIndexOutOfBoundsException if index is outside
2009     * the range <code>0..getRangeCount()-1</code>
2010     * @see #getRangeStart
2011     * @see #getRangeEnd
2012     * @stable ICU 2.0
2013     */
2014    public int getRangeEnd(int index) {
2015        return (list[index*2 + 1] - 1);
2016    }
2017
2018    /**
2019     * Reallocate this objects internal structures to take up the least
2020     * possible space, without changing this object's value.
2021     * @stable ICU 2.0
2022     */
2023    public UnicodeSet compact() {
2024        checkFrozen();
2025        if (len != list.length) {
2026            int[] temp = new int[len];
2027            System.arraycopy(list, 0, temp, 0, len);
2028            list = temp;
2029        }
2030        rangeList = null;
2031        buffer = null;
2032        return this;
2033    }
2034
2035    /**
2036     * Compares the specified object with this set for equality.  Returns
2037     * <tt>true</tt> if the specified object is also a set, the two sets
2038     * have the same size, and every member of the specified set is
2039     * contained in this set (or equivalently, every member of this set is
2040     * contained in the specified set).
2041     *
2042     * @param o Object to be compared for equality with this set.
2043     * @return <tt>true</tt> if the specified Object is equal to this set.
2044     * @stable ICU 2.0
2045     */
2046    public boolean equals(Object   o) {
2047        try {
2048            UnicodeSet that = (UnicodeSet) o;
2049            if (len != that.len) return false;
2050            for (int i = 0; i < len; ++i) {
2051                if (list[i] != that.list[i]) return false;
2052            }
2053            if (!strings.equals(that.strings)) return false;
2054        } catch (Exception   e) {
2055            return false;
2056        }
2057        return true;
2058    }
2059
2060    /**
2061     * Returns the hash code value for this set.
2062     *
2063     * @return the hash code value for this set.
2064     * @see java.lang.Object#hashCode()
2065     * @stable ICU 2.0
2066     */
2067    public int hashCode() {
2068        int result = len;
2069        for (int i = 0; i < len; ++i) {
2070            result *= 1000003;
2071            result += list[i];
2072        }
2073        return result;
2074    }
2075
2076    /**
2077     * Return a programmer-readable string representation of this object.
2078     * @stable ICU 2.0
2079     */
2080    public String   toString() {
2081        return toPattern(true);
2082    }
2083
2084    //----------------------------------------------------------------
2085    // Implementation: Pattern parsing
2086    //----------------------------------------------------------------
2087
2088    /**
2089     * Parses the given pattern, starting at the given position.  The character
2090     * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails.
2091     * Parsing continues until the corresponding closing ']'.  If a syntax error
2092     * is encountered between the opening and closing brace, the parse fails.
2093     * Upon return from a successful parse, the ParsePosition is updated to
2094     * point to the character following the closing ']', and an inversion
2095     * list for the parsed pattern is returned.  This method
2096     * calls itself recursively to parse embedded subpatterns.
2097     *
2098     * @param pattern the string containing the pattern to be parsed.  The
2099     * portion of the string from pos.getIndex(), which must be a '[', to the
2100     * corresponding closing ']', is parsed.
2101     * @param pos upon entry, the position at which to being parsing.  The
2102     * character at pattern.charAt(pos.getIndex()) must be a '['.  Upon return
2103     * from a successful parse, pos.getIndex() is either the character after the
2104     * closing ']' of the parsed pattern, or pattern.length() if the closing ']'
2105     * is the last character of the pattern string.
2106     * @return an inversion list for the parsed substring
2107     * of <code>pattern</code>
2108     * @exception java.lang.IllegalArgumentException if the parse fails.
2109     */
2110    UnicodeSet applyPattern(String   pattern,
2111                      ParsePosition pos,
2112                      SymbolTable symbols,
2113                      int options) {
2114
2115        // Need to build the pattern in a temporary string because
2116        // _applyPattern calls add() etc., which set pat to empty.
2117        boolean parsePositionWasNull = pos == null;
2118        if (parsePositionWasNull) {
2119            pos = new ParsePosition(0);
2120        }
2121
2122        StringBuffer   rebuiltPat = new StringBuffer  ();
2123        RuleCharacterIterator chars =
2124            new RuleCharacterIterator(pattern, symbols, pos);
2125        applyPattern(chars, symbols, rebuiltPat, options);
2126        if (chars.inVariable()) {
2127            syntaxError(chars, "Extra chars in variable value");
2128        }
2129        pat = rebuiltPat.toString();
2130        if (parsePositionWasNull) {
2131            int i = pos.getIndex();
2132
2133            // Skip over trailing whitespace
2134            if ((options & IGNORE_SPACE) != 0) {
2135                i = Utility.skipWhitespace(pattern, i);
2136            }
2137
2138            if (i != pattern.length()) {
2139                throw new IllegalArgumentException  ("Parse of \"" + pattern +
2140                                                   "\" failed at " + i);
2141            }
2142        }
2143        return this;
2144    }
2145
2146    /**
2147     * Parse the pattern from the given RuleCharacterIterator.  The
2148     * iterator is advanced over the parsed pattern.
2149     * @param chars iterator over the pattern characters.  Upon return
2150     * it will be advanced to the first character after the parsed
2151     * pattern, or the end of the iteration if all characters are
2152     * parsed.
2153     * @param symbols symbol table to use to parse and dereference
2154     * variables, or null if none.
2155     * @param rebuiltPat the pattern that was parsed, rebuilt or
2156     * copied from the input pattern, as appropriate.
2157     * @param options a bit mask of zero or more of the following:
2158     * IGNORE_SPACE, CASE.
2159     */
2160    void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
2161                      StringBuffer   rebuiltPat, int options) {
2162
2163        // Syntax characters: [ ] ^ - & { }
2164
2165        // Recognized special forms for chars, sets: c-c s-s s&s
2166
2167        int opts = RuleCharacterIterator.PARSE_VARIABLES |
2168                   RuleCharacterIterator.PARSE_ESCAPES;
2169        if ((options & IGNORE_SPACE) != 0) {
2170            opts |= RuleCharacterIterator.SKIP_WHITESPACE;
2171        }
2172
2173        StringBuffer   pat = new StringBuffer  (), buf = null;
2174        boolean usePat = false;
2175        UnicodeSet scratch = null;
2176        Object   backup = null;
2177
2178        // mode: 0=before [, 1=between [...], 2=after ]
2179        // lastItem: 0=none, 1=char, 2=set
2180        int lastItem = 0, lastChar = 0, mode = 0;
2181        char op = 0;
2182
2183        boolean invert = false;
2184
2185        clear();
2186
2187        while (mode != 2 && !chars.atEnd()) {
2188            if (false) {
2189                // Debugging assertion
2190                if (!((lastItem == 0 && op == 0) ||
2191                      (lastItem == 1 && (op == 0 || op == '-')) ||
2192                      (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) {
2193                    throw new IllegalArgumentException  ();
2194                }
2195            }
2196
2197            int c = 0;
2198            boolean literal = false;
2199            UnicodeSet nested = null;
2200
2201            // -------- Check for property pattern
2202
2203            // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
2204            int setMode = 0;
2205            if (resemblesPropertyPattern(chars, opts)) {
2206                setMode = 2;
2207            }
2208
2209            // -------- Parse '[' of opening delimiter OR nested set.
2210            // If there is a nested set, use `setMode' to define how
2211            // the set should be parsed.  If the '[' is part of the
2212            // opening delimiter for this pattern, parse special
2213            // strings "[", "[^", "[-", and "[^-".  Check for stand-in
2214            // characters representing a nested set in the symbol
2215            // table.
2216
2217            else {
2218                // Prepare to backup if necessary
2219                backup = chars.getPos(backup);
2220                c = chars.next(opts);
2221                literal = chars.isEscaped();
2222
2223                if (c == '[' && !literal) {
2224                    if (mode == 1) {
2225                        chars.setPos(backup); // backup
2226                        setMode = 1;
2227                    } else {
2228                        // Handle opening '[' delimiter
2229                        mode = 1;
2230                        pat.append('[');
2231                        backup = chars.getPos(backup); // prepare to backup
2232                        c = chars.next(opts);
2233                        literal = chars.isEscaped();
2234                        if (c == '^' && !literal) {
2235                            invert = true;
2236                            pat.append('^');
2237                            backup = chars.getPos(backup); // prepare to backup
2238                            c = chars.next(opts);
2239                            literal = chars.isEscaped();
2240                        }
2241                        // Fall through to handle special leading '-';
2242                        // otherwise restart loop for nested [], \p{}, etc.
2243                        if (c == '-') {
2244                            literal = true;
2245                            // Fall through to handle literal '-' below
2246                        } else {
2247                            chars.setPos(backup); // backup
2248                            continue;
2249                        }
2250                    }
2251                } else if (symbols != null) {
2252                     UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
2253                     if (m != null) {
2254                         try {
2255                             nested = (UnicodeSet) m;
2256                             setMode = 3;
2257                         } catch (ClassCastException   e) {
2258                             syntaxError(chars, "Syntax error");
2259                         }
2260                     }
2261                }
2262            }
2263
2264            // -------- Handle a nested set.  This either is inline in
2265            // the pattern or represented by a stand-in that has
2266            // previously been parsed and was looked up in the symbol
2267            // table.
2268
2269            if (setMode != 0) {
2270                if (lastItem == 1) {
2271                    if (op != 0) {
2272                        syntaxError(chars, "Char expected after operator");
2273                    }
2274                    add_unchecked(lastChar, lastChar);
2275                    _appendToPat(pat, lastChar, false);
2276                    lastItem = op = 0;
2277                }
2278
2279                if (op == '-' || op == '&') {
2280                    pat.append(op);
2281                }
2282
2283                if (nested == null) {
2284                    if (scratch == null) scratch = new UnicodeSet();
2285                    nested = scratch;
2286                }
2287                switch (setMode) {
2288                case 1:
2289                    nested.applyPattern(chars, symbols, pat, options);
2290                    break;
2291                case 2:
2292                    chars.skipIgnored(opts);
2293                    nested.applyPropertyPattern(chars, pat, symbols);
2294                    break;
2295                case 3: // `nested' already parsed
2296                    nested._toPattern(pat, false);
2297                    break;
2298                }
2299
2300                usePat = true;
2301
2302                if (mode == 0) {
2303                    // Entire pattern is a category; leave parse loop
2304                    set(nested);
2305                    mode = 2;
2306                    break;
2307                }
2308
2309                switch (op) {
2310                case '-':
2311                    removeAll(nested);
2312                    break;
2313                case '&':
2314                    retainAll(nested);
2315                    break;
2316                case 0:
2317                    addAll(nested);
2318                    break;
2319                }
2320
2321                op = 0;
2322                lastItem = 2;
2323
2324                continue;
2325            }
2326
2327            if (mode == 0) {
2328                syntaxError(chars, "Missing '['");
2329            }
2330
2331            // -------- Parse special (syntax) characters.  If the
2332            // current character is not special, or if it is escaped,
2333            // then fall through and handle it below.
2334
2335            if (!literal) {
2336                switch (c) {
2337                case ']':
2338                    if (lastItem == 1) {
2339                        add_unchecked(lastChar, lastChar);
2340                        _appendToPat(pat, lastChar, false);
2341                    }
2342                    // Treat final trailing '-' as a literal
2343                    if (op == '-') {
2344                        add_unchecked(op, op);
2345                        pat.append(op);
2346                    } else if (op == '&') {
2347                        syntaxError(chars, "Trailing '&'");
2348                    }
2349                    pat.append(']');
2350                    mode = 2;
2351                    continue;
2352                case '-':
2353                    if (op == 0) {
2354                        if (lastItem != 0) {
2355                            op = (char) c;
2356                            continue;
2357                        } else {
2358                            // Treat final trailing '-' as a literal
2359                            add_unchecked(c, c);
2360                            c = chars.next(opts);
2361                            literal = chars.isEscaped();
2362                            if (c == ']' && !literal) {
2363                                pat.append("-]");
2364                                mode = 2;
2365                                continue;
2366                            }
2367                        }
2368                    }
2369                    syntaxError(chars, "'-' not after char or set");
2370                case '&':
2371                    if (lastItem == 2 && op == 0) {
2372                        op = (char) c;
2373                        continue;
2374                    }
2375                    syntaxError(chars, "'&' not after set");
2376                case '^':
2377                    syntaxError(chars, "'^' not after '['");
2378                case '{':
2379                    if (op != 0) {
2380                        syntaxError(chars, "Missing operand after operator");
2381                    }
2382                    if (lastItem == 1) {
2383                        add_unchecked(lastChar, lastChar);
2384                        _appendToPat(pat, lastChar, false);
2385                    }
2386                    lastItem = 0;
2387                    if (buf == null) {
2388                        buf = new StringBuffer  ();
2389                    } else {
2390                        buf.setLength(0);
2391                    }
2392                    boolean ok = false;
2393                    while (!chars.atEnd()) {
2394                        c = chars.next(opts);
2395                        literal = chars.isEscaped();
2396                        if (c == '}' && !literal) {
2397                            ok = true;
2398                            break;
2399                        }
2400                        UTF16.append(buf, c);
2401                    }
2402                    if (buf.length() < 1 || !ok) {
2403                        syntaxError(chars, "Invalid multicharacter string");
2404                    }
2405                    // We have new string. Add it to set and continue;
2406                    // we don't need to drop through to the further
2407                    // processing
2408                    add(buf.toString());
2409                    pat.append('{');
2410                    _appendToPat(pat, buf.toString(), false);
2411                    pat.append('}');
2412                    continue;
2413                case SymbolTable.SYMBOL_REF:
2414                    //         symbols  nosymbols
2415                    // [a-$]   error    error (ambiguous)
2416                    // [a$]    anchor   anchor
2417                    // [a-$x]  var "x"* literal '$'
2418                    // [a-$.]  error    literal '$'
2419                    // *We won't get here in the case of var "x"
2420                    backup = chars.getPos(backup);
2421                    c = chars.next(opts);
2422                    literal = chars.isEscaped();
2423                    boolean anchor = (c == ']' && !literal);
2424                    if (symbols == null && !anchor) {
2425                        c = SymbolTable.SYMBOL_REF;
2426                        chars.setPos(backup);
2427                        break; // literal '$'
2428                    }
2429                    if (anchor && op == 0) {
2430                        if (lastItem == 1) {
2431                            add_unchecked(lastChar, lastChar);
2432                            _appendToPat(pat, lastChar, false);
2433                        }
2434                        add_unchecked(UnicodeMatcher.ETHER);
2435                        usePat = true;
2436                        pat.append(SymbolTable.SYMBOL_REF).append(']');
2437                        mode = 2;
2438                        continue;
2439                    }
2440                    syntaxError(chars, "Unquoted '$'");
2441                default:
2442                    break;
2443                }
2444            }
2445
2446            // -------- Parse literal characters.  This includes both
2447            // escaped chars ("\u4E01") and non-syntax characters
2448            // ("a").
2449
2450            switch (lastItem) {
2451            case 0:
2452                lastItem = 1;
2453                lastChar = c;
2454                break;
2455            case 1:
2456                if (op == '-') {
2457                    if (lastChar >= c) {
2458                        // Don't allow redundant (a-a) or empty (b-a) ranges;
2459                        // these are most likely typos.
2460                        syntaxError(chars, "Invalid range");
2461                    }
2462                    add_unchecked(lastChar, c);
2463                    _appendToPat(pat, lastChar, false);
2464                    pat.append(op);
2465                    _appendToPat(pat, c, false);
2466                    lastItem = op = 0;
2467                } else {
2468                    add_unchecked(lastChar, lastChar);
2469                    _appendToPat(pat, lastChar, false);
2470                    lastChar = c;
2471                }
2472                break;
2473            case 2:
2474                if (op != 0) {
2475                    syntaxError(chars, "Set expected after operator");
2476                }
2477                lastChar = c;
2478                lastItem = 1;
2479                break;
2480            }
2481        }
2482
2483        if (mode != 2) {
2484            syntaxError(chars, "Missing ']'");
2485        }
2486
2487        chars.skipIgnored(opts);
2488
2489        /**
2490         * Handle global flags (invert, case insensitivity).  If this
2491         * pattern should be compiled case-insensitive, then we need
2492         * to close over case BEFORE COMPLEMENTING.  This makes
2493         * patterns like /[^abc]/i work.
2494         */
2495        if ((options & CASE) != 0) {
2496            closeOver(CASE);
2497        }
2498        if (invert) {
2499            complement();
2500        }
2501
2502        // Use the rebuilt pattern (pat) only if necessary.  Prefer the
2503        // generated pattern.
2504        if (usePat) {
2505            rebuiltPat.append(pat.toString());
2506        } else {
2507            _generatePattern(rebuiltPat, false, true);
2508        }
2509    }
2510
2511    private static void syntaxError(RuleCharacterIterator chars, String   msg) {
2512        throw new IllegalArgumentException  ("Error: " + msg + " at \"" +
2513                                           Utility.escape(chars.toString()) +
2514                                           '"');
2515    }
2516
2517    /**
2518     * Add the contents of the UnicodeSet (as strings) into a collection.
2519     * @param target collection to add into
2520     * @stable ICU 2.8
2521     */
2522    public void addAllTo(Collection   target) {
2523        UnicodeSetIterator it = new UnicodeSetIterator(this);
2524        while (it.next()) {
2525            target.add(it.getString());
2526        }
2527    }
2528
2529    /**
2530     * Add the contents of the collection (as strings) into this UnicodeSet.
2531     * @param source the collection to add
2532     * @stable ICU 2.8
2533     */
2534    public void addAll(Collection   source) {
2535        checkFrozen();
2536        Iterator it = source.iterator();
2537        while (it.hasNext()) {
2538            add(it.next().toString());
2539        }
2540    }
2541
2542    //----------------------------------------------------------------
2543    // Implementation: Utility methods
2544    //----------------------------------------------------------------
2545
2546    private void ensureCapacity(int newLen) {
2547        if (newLen <= list.length) return;
2548        int[] temp = new int[newLen + GROW_EXTRA];
2549        System.arraycopy(list, 0, temp, 0, len);
2550        list = temp;
2551    }
2552
2553    private void ensureBufferCapacity(int newLen) {
2554        if (buffer != null && newLen <= buffer.length) return;
2555        buffer = new int[newLen + GROW_EXTRA];
2556    }
2557
2558    /**
2559     * Assumes start <= end.
2560     */
2561    private int[] range(int start, int end) {
2562        if (rangeList == null) {
2563            rangeList = new int[] { start, end+1, HIGH };
2564        } else {
2565            rangeList[0] = start;
2566            rangeList[1] = end+1;
2567        }
2568        return rangeList;
2569    }
2570
2571    //----------------------------------------------------------------
2572    // Implementation: Fundamental operations
2573    //----------------------------------------------------------------
2574
2575    // polarity = 0, 3 is normal: x xor y
2576    // polarity = 1, 2: x xor ~y == x === y
2577
2578    private UnicodeSet xor(int[] other, int otherLen, int polarity) {
2579        ensureBufferCapacity(len + otherLen);
2580        int i = 0, j = 0, k = 0;
2581        int a = list[i++];
2582        int b;
2583        if (polarity == 1 || polarity == 2) {
2584            b = LOW;
2585            if (other[j] == LOW) { // skip base if already LOW
2586                ++j;
2587                b = other[j];
2588            }
2589        } else {
2590            b = other[j++];
2591        }
2592        // simplest of all the routines
2593        // sort the values, discarding identicals!
2594        while (true) {
2595            if (a < b) {
2596                buffer[k++] = a;
2597                a = list[i++];
2598            } else if (b < a) {
2599                buffer[k++] = b;
2600                b = other[j++];
2601            } else if (a != HIGH) { // at this point, a == b
2602                // discard both values!
2603                a = list[i++];
2604                b = other[j++];
2605            } else { // DONE!
2606                buffer[k++] = HIGH;
2607                len = k;
2608                break;
2609            }
2610        }
2611        // swap list and buffer
2612        int[] temp = list;
2613        list = buffer;
2614        buffer = temp;
2615        pat = null;
2616        return this;
2617    }
2618
2619    // polarity = 0 is normal: x union y
2620    // polarity = 2: x union ~y
2621    // polarity = 1: ~x union y
2622    // polarity = 3: ~x union ~y
2623
2624    private UnicodeSet add(int[] other, int otherLen, int polarity) {
2625        ensureBufferCapacity(len + otherLen);
2626        int i = 0, j = 0, k = 0;
2627        int a = list[i++];
2628        int b = other[j++];
2629        // change from xor is that we have to check overlapping pairs
2630        // polarity bit 1 means a is second, bit 2 means b is.
2631        main:
2632        while (true) {
2633            switch (polarity) {
2634              case 0: // both first; take lower if unequal
2635                if (a < b) { // take a
2636                    // Back up over overlapping ranges in buffer[]
2637                    if (k > 0 && a <= buffer[k-1]) {
2638                        // Pick latter end value in buffer[] vs. list[]
2639                        a = max(list[i], buffer[--k]);
2640                    } else {
2641                        // No overlap
2642                        buffer[k++] = a;
2643                        a = list[i];
2644                    }
2645                    i++; // Common if/else code factored out
2646                    polarity ^= 1;
2647                } else if (b < a) { // take b
2648                    if (k > 0 && b <= buffer[k-1]) {
2649                        b = max(other[j], buffer[--k]);
2650                    } else {
2651                        buffer[k++] = b;
2652                        b = other[j];
2653                    }
2654                    j++;
2655                    polarity ^= 2;
2656                } else { // a == b, take a, drop b
2657                    if (a == HIGH) break main;
2658                    // This is symmetrical; it doesn't matter if
2659                    // we backtrack with a or b. - liu
2660                    if (k > 0 && a <= buffer[k-1]) {
2661                        a = max(list[i], buffer[--k]);
2662                    } else {
2663                        // No overlap
2664                        buffer[k++] = a;
2665                        a = list[i];
2666                    }
2667                    i++;
2668                    polarity ^= 1;
2669                    b = other[j++]; polarity ^= 2;
2670                }
2671                break;
2672              case 3: // both second; take higher if unequal, and drop other
2673                if (b <= a) { // take a
2674                    if (a == HIGH) break main;
2675                    buffer[k++] = a;
2676                } else { // take b
2677                    if (b == HIGH) break main;
2678                    buffer[k++] = b;
2679                }
2680                a = list[i++]; polarity ^= 1;   // factored common code
2681                b = other[j++]; polarity ^= 2;
2682                break;
2683              case 1: // a second, b first; if b < a, overlap
2684                if (a < b) { // no overlap, take a
2685                    buffer[k++] = a; a = list[i++]; polarity ^= 1;
2686                } else if (b < a) { // OVERLAP, drop b
2687                    b = other[j++]; polarity ^= 2;
2688                } else { // a == b, drop both!
2689                    if (a == HIGH) break main;
2690                    a = list[i++]; polarity ^= 1;
2691                    b = other[j++]; polarity ^= 2;
2692                }
2693                break;
2694              case 2: // a first, b second; if a < b, overlap
2695                if (b < a) { // no overlap, take b
2696                    buffer[k++] = b; b = other[j++]; polarity ^= 2;
2697                } else  if (a < b) { // OVERLAP, drop a
2698                    a = list[i++]; polarity ^= 1;
2699                } else { // a == b, drop both!
2700                    if (a == HIGH) break main;
2701                    a = list[i++]; polarity ^= 1;
2702                    b = other[j++]; polarity ^= 2;
2703                }
2704                break;
2705            }
2706        }
2707        buffer[k++] = HIGH;    // terminate
2708        len = k;
2709        // swap list and buffer
2710        int[] temp = list;
2711        list = buffer;
2712        buffer = temp;
2713        pat = null;
2714        return this;
2715    }
2716
2717    // polarity = 0 is normal: x intersect y
2718    // polarity = 2: x intersect ~y == set-minus
2719    // polarity = 1: ~x intersect y
2720    // polarity = 3: ~x intersect ~y
2721
2722    private UnicodeSet retain(int[] other, int otherLen, int polarity) {
2723        ensureBufferCapacity(len + otherLen);
2724        int i = 0, j = 0, k = 0;
2725        int a = list[i++];
2726        int b = other[j++];
2727        // change from xor is that we have to check overlapping pairs
2728        // polarity bit 1 means a is second, bit 2 means b is.
2729        main:
2730        while (true) {
2731            switch (polarity) {
2732              case 0: // both first; drop the smaller
2733                if (a < b) { // drop a
2734                    a = list[i++]; polarity ^= 1;
2735                } else if (b < a) { // drop b
2736                    b = other[j++]; polarity ^= 2;
2737                } else { // a == b, take one, drop other
2738                    if (a == HIGH) break main;
2739                    buffer[k++] = a; a = list[i++]; polarity ^= 1;
2740                    b = other[j++]; polarity ^= 2;
2741                }
2742                break;
2743              case 3: // both second; take lower if unequal
2744                if (a < b) { // take a
2745                    buffer[k++] = a; a = list[i++]; polarity ^= 1;
2746                } else if (b < a) { // take b
2747                    buffer[k++] = b; b = other[j++]; polarity ^= 2;
2748                } else { // a == b, take one, drop other
2749                    if (a == HIGH) break main;
2750                    buffer[k++] = a; a = list[i++]; polarity ^= 1;
2751                    b = other[j++]; polarity ^= 2;
2752                }
2753                break;
2754              case 1: // a second, b first;
2755                if (a < b) { // NO OVERLAP, drop a
2756                    a = list[i++]; polarity ^= 1;
2757                } else if (b < a) { // OVERLAP, take b
2758                    buffer[k++] = b; b = other[j++]; polarity ^= 2;
2759                } else { // a == b, drop both!
2760                    if (a == HIGH) break main;
2761                    a = list[i++]; polarity ^= 1;
2762                    b = other[j++]; polarity ^= 2;
2763                }
2764                break;
2765              case 2: // a first, b second; if a < b, overlap
2766                if (b < a) { // no overlap, drop b
2767                    b = other[j++]; polarity ^= 2;
2768                } else  if (a < b) { // OVERLAP, take a
2769                    buffer[k++] = a; a = list[i++]; polarity ^= 1;
2770                } else { // a == b, drop both!
2771                    if (a == HIGH) break main;
2772                    a = list[i++]; polarity ^= 1;
2773                    b = other[j++]; polarity ^= 2;
2774                }
2775                break;
2776            }
2777        }
2778        buffer[k++] = HIGH;    // terminate
2779        len = k;
2780        // swap list and buffer
2781        int[] temp = list;
2782        list = buffer;
2783        buffer = temp;
2784        pat = null;
2785        return this;
2786    }
2787
2788    private static final int max(int a, int b) {
2789        return (a > b) ? a : b;
2790    }
2791
2792    //----------------------------------------------------------------
2793    // Generic filter-based scanning code
2794    //----------------------------------------------------------------
2795
2796    private static interface Filter {
2797        boolean contains(int codePoint);
2798    }
2799
2800    private static class NumericValueFilter implements Filter {
2801        double value;
2802        NumericValueFilter(double value) { this.value = value; }
2803        public boolean contains(int ch) {
2804            return UCharacter.getUnicodeNumericValue(ch) == value;
2805        }
2806    }
2807
2808    private static class GeneralCategoryMaskFilter implements Filter {
2809        int mask;
2810        GeneralCategoryMaskFilter(int mask) { this.mask = mask; }
2811        public boolean contains(int ch) {
2812            return ((1 << UCharacter.getType(ch)) & mask) != 0;
2813        }
2814    }
2815
2816    private static class IntPropertyFilter implements Filter {
2817        int prop;
2818        int value;
2819        IntPropertyFilter(int prop, int value) {
2820            this.prop = prop;
2821            this.value = value;
2822        }
2823        public boolean contains(int ch) {
2824            return UCharacter.getIntPropertyValue(ch, prop) == value;
2825        }
2826    }
2827
2828    // VersionInfo for unassigned characters
2829    static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
2830
2831    private static class VersionFilter implements Filter {
2832        VersionInfo version;
2833        VersionFilter(VersionInfo version) { this.version = version; }
2834        public boolean contains(int ch) {
2835            VersionInfo v = UCharacter.getAge(ch);
2836            // Reference comparison ok; VersionInfo caches and reuses
2837            // unique objects.
2838            return v != NO_VERSION &&
2839                   v.compareTo(version) <= 0;
2840        }
2841    }
2842
2843    private static synchronized UnicodeSet getInclusions(int src) {
2844        if (INCLUSIONS == null) {
2845            INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
2846        }
2847        if(INCLUSIONS[src] == null) {
2848            UnicodeSet incl = new UnicodeSet();
2849            switch(src) {
2850            case UCharacterProperty.SRC_CHAR:
2851                UCharacterProperty.getInstance().addPropertyStarts(incl);
2852                break;
2853            case UCharacterProperty.SRC_PROPSVEC:
2854                UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl);
2855                break;
2856            case UCharacterProperty.SRC_CHAR_AND_PROPSVEC:
2857                UCharacterProperty.getInstance().addPropertyStarts(incl);
2858                UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl);
2859                break;
2860            case UCharacterProperty.SRC_HST:
2861                UCharacterProperty.getInstance().uhst_addPropertyStarts(incl);
2862                break;
2863            case UCharacterProperty.SRC_NORM:
2864                NormalizerImpl.addPropertyStarts(incl);
2865                break;
2866            case UCharacterProperty.SRC_CASE:
2867                try {
2868                    UCaseProps.getSingleton().addPropertyStarts(incl);
2869                } catch(IOException   e) {
2870                    throw new MissingResourceException  (e.getMessage(),"","");
2871                }
2872                break;
2873            case UCharacterProperty.SRC_BIDI:
2874                try {
2875                    UBiDiProps.getSingleton().addPropertyStarts(incl);
2876                } catch(IOException   e) {
2877                    throw new MissingResourceException  (e.getMessage(),"","");
2878                }
2879                break;
2880            default:
2881                throw new IllegalStateException  ("UnicodeSet.getInclusions(unknown src "+src+")");
2882            }
2883            INCLUSIONS[src] = incl;
2884        }
2885        return INCLUSIONS[src];
2886    }
2887
2888    /**
2889     * Generic filter-based scanning code for UCD property UnicodeSets.
2890     */
2891    private UnicodeSet applyFilter(Filter filter, int src) {
2892        // Walk through all Unicode characters, noting the start
2893        // and end of each range for which filter.contain(c) is
2894        // true.  Add each range to a set.
2895        //
2896        // To improve performance, use the INCLUSIONS set, which
2897        // encodes information about character ranges that are known
2898        // to have identical properties, such as the CJK Ideographs
2899        // from U+4E00 to U+9FA5.  INCLUSIONS contains all characters
2900        // except the first characters of such ranges.
2901        //
2902        // TODO Where possible, instead of scanning over code points,
2903        // use internal property data to initialize UnicodeSets for
2904        // those properties.  Scanning code points is slow.
2905
2906        clear();
2907
2908        int startHasProperty = -1;
2909        UnicodeSet inclusions = getInclusions(src);
2910        int limitRange = inclusions.getRangeCount();
2911
2912        for (int j=0; j<limitRange; ++j) {
2913            // get current range
2914            int start = inclusions.getRangeStart(j);
2915            int end = inclusions.getRangeEnd(j);
2916
2917            // for all the code points in the range, process
2918            for (int ch = start; ch <= end; ++ch) {
2919                // only add to the unicodeset on inflection points --
2920                // where the hasProperty value changes to false
2921                if (filter.contains(ch)) {
2922                    if (startHasProperty < 0) {
2923                        startHasProperty = ch;
2924                    }
2925                } else if (startHasProperty >= 0) {
2926                    add_unchecked(startHasProperty, ch-1);
2927                    startHasProperty = -1;
2928                }
2929            }
2930        }
2931        if (startHasProperty >= 0) {
2932            add_unchecked(startHasProperty, 0x10FFFF);
2933        }
2934
2935        return this;
2936    }
2937
2938
2939    /**
2940     * Remove leading and trailing rule white space and compress
2941     * internal rule white space to a single space character.
2942     *
2943     * @see UCharacterProperty#isRuleWhiteSpace
2944     */
2945    private static String   mungeCharName(String   source) {
2946        StringBuffer   buf = new StringBuffer  ();
2947        for (int i=0; i<source.length(); ) {
2948            int ch = UTF16.charAt(source, i);
2949            i += UTF16.getCharCount(ch);
2950            if (UCharacterProperty.isRuleWhiteSpace(ch)) {
2951                if (buf.length() == 0 ||
2952                    buf.charAt(buf.length() - 1) == ' ') {
2953                    continue;
2954                }
2955                ch = ' '; // convert to ' '
2956            }
2957            UTF16.append(buf, ch);
2958        }
2959        if (buf.length() != 0 &&
2960            buf.charAt(buf.length() - 1) == ' ') {
2961            buf.setLength(buf.length() - 1);
2962        }
2963        return buf.toString();
2964    }
2965
2966    //----------------------------------------------------------------
2967    // Property set API
2968    //----------------------------------------------------------------
2969
2970    /**
2971     * Modifies this set to contain those code points which have the
2972     * given value for the given binary or enumerated property, as
2973     * returned by UCharacter.getIntPropertyValue.  Prior contents of
2974     * this set are lost.
2975     *
2976     * @param prop a property in the range
2977     * UProperty.BIN_START..UProperty.BIN_LIMIT-1 or
2978     * UProperty.INT_START..UProperty.INT_LIMIT-1 or.
2979     * UProperty.MASK_START..UProperty.MASK_LIMIT-1.
2980     *
2981     * @param value a value in the range
2982     * UCharacter.getIntPropertyMinValue(prop)..
2983     * UCharacter.getIntPropertyMaxValue(prop), with one exception.
2984     * If prop is UProperty.GENERAL_CATEGORY_MASK, then value should not be
2985     * a UCharacter.getType() result, but rather a mask value produced
2986     * by logically ORing (1 << UCharacter.getType()) values together.
2987     * This allows grouped categories such as [:L:] to be represented.
2988     *
2989     * @return a reference to this set
2990     *
2991     * @stable ICU 2.4
2992     */
2993    public UnicodeSet applyIntPropertyValue(int prop, int value) {
2994        checkFrozen();
2995        if (prop == UProperty.GENERAL_CATEGORY_MASK) {
2996            applyFilter(new GeneralCategoryMaskFilter(value), UCharacterProperty.SRC_CHAR);
2997        } else {
2998            applyFilter(new IntPropertyFilter(prop, value), UCharacterProperty.getInstance().getSource(prop));
2999        }
3000        return this;
3001    }
3002
3003
3004
3005    /**
3006     * Modifies this set to contain those code points which have the
3007     * given value for the given property.  Prior contents of this
3008     * set are lost.
3009     *
3010     * @param propertyAlias a property alias, either short or long.
3011     * The name is matched loosely.  See PropertyAliases.txt for names
3012     * and a description of loose matching.  If the value string is
3013     * empty, then this string is interpreted as either a
3014     * General_Category value alias, a Script value alias, a binary
3015     * property alias, or a special ID.  Special IDs are matched
3016     * loosely and correspond to the following sets:
3017     *
3018     * "ANY" = [-\U0010FFFF],
3019     * "ASCII" = [-].
3020     *
3021     * @param valueAlias a value alias, either short or long.  The
3022     * name is matched loosely.  See PropertyValueAliases.txt for
3023     * names and a description of loose matching.  In addition to
3024     * aliases listed, numeric values and canonical combining classes
3025     * may be expressed numerically, e.g., ("nv", "0.5") or ("ccc",
3026     * "220").  The value string may also be empty.
3027     *
3028     * @return a reference to this set
3029     *
3030     * @stable ICU 2.4
3031     */
3032    public UnicodeSet applyPropertyAlias(String   propertyAlias, String   valueAlias) {
3033        return applyPropertyAlias(propertyAlias, valueAlias, null);
3034    }
3035
3036    /**
3037     * Modifies this set to contain those code points which have the
3038     * given value for the given property.  Prior contents of this
3039     * set are lost.
3040     * @param propertyAlias
3041     * @param valueAlias
3042     * @param symbols if not null, then symbols are first called to see if a property
3043     * is available. If true, then everything else is skipped.
3044     * @return this set
3045     * @draft ICU 3.2
3046     * @provisional This API might change or be removed in a future release.
3047     */
3048    public UnicodeSet applyPropertyAlias(String   propertyAlias,
3049                                         String   valueAlias, SymbolTable symbols) {
3050        checkFrozen();
3051        int p;
3052        int v;
3053        boolean mustNotBeEmpty = false, invert = false;
3054
3055        if (symbols != null
3056                && (symbols instanceof XSymbolTable)
3057                && ((XSymbolTable)symbols).applyPropertyAlias(propertyAlias, valueAlias, this)) {
3058                return this;
3059        }
3060
3061        if (valueAlias.length() > 0) {
3062            p = UCharacter.getPropertyEnum(propertyAlias);
3063
3064            // Treat gc as gcm
3065            if (p == UProperty.GENERAL_CATEGORY) {
3066                p = UProperty.GENERAL_CATEGORY_MASK;
3067            }
3068
3069            if ((p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) ||
3070                (p >= UProperty.INT_START && p < UProperty.INT_LIMIT) ||
3071                (p >= UProperty.MASK_START && p < UProperty.MASK_LIMIT)) {
3072                try {
3073                    v = UCharacter.getPropertyValueEnum(p, valueAlias);
3074                } catch (IllegalArgumentException   e) {
3075                    // Handle numeric CCC
3076                    if (p == UProperty.CANONICAL_COMBINING_CLASS ||
3077                        p == UProperty.LEAD_CANONICAL_COMBINING_CLASS ||
3078                        p == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) {
3079                        v = Integer.parseInt(Utility.deleteRuleWhiteSpace(valueAlias));
3080                        // If the resultant set is empty then the numeric value
3081                        // was invalid.
3082                        mustNotBeEmpty = true;
3083                    } else {
3084                        throw e;
3085                    }
3086                }
3087            }
3088
3089            else {
3090
3091                switch (p) {
3092                case UProperty.NUMERIC_VALUE:
3093                    {
3094                        double value = Double.parseDouble(Utility.deleteRuleWhiteSpace(valueAlias));
3095                        applyFilter(new NumericValueFilter(value), UCharacterProperty.SRC_CHAR);
3096                        return this;
3097                    }
3098                case UProperty.NAME:
3099                case UProperty.UNICODE_1_NAME:
3100                    {
3101                        // Must munge name, since
3102                        // UCharacter.charFromName() does not do
3103                        // 'loose' matching.
3104                        String   buf = mungeCharName(valueAlias);
3105                        int ch =
3106                            (p == UProperty.NAME) ?
3107                            UCharacter.getCharFromExtendedName(buf) :
3108                            UCharacter.getCharFromName1_0(buf);
3109                        if (ch == -1) {
3110                            throw new IllegalArgumentException  ("Invalid character name");
3111                        }
3112                        clear();
3113                        add_unchecked(ch);
3114                        return this;
3115                    }
3116                case UProperty.AGE:
3117                    {
3118                        // Must munge name, since
3119                        // VersionInfo.getInstance() does not do
3120                        // 'loose' matching.
3121                        VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
3122                        applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
3123                        return this;
3124                    }
3125                }
3126
3127                // p is a non-binary, non-enumerated property that we
3128                // don't support (yet).
3129                throw new IllegalArgumentException  ("Unsupported property");
3130            }
3131        }
3132
3133        else {
3134            // valueAlias is empty.  Interpret as General Category, Script,
3135            // Binary property, or ANY or ASCII.  Upon success, p and v will
3136            // be set.
3137            try {
3138                p = UProperty.GENERAL_CATEGORY_MASK;
3139                v = UCharacter.getPropertyValueEnum(p, propertyAlias);
3140            } catch (IllegalArgumentException   e) {
3141                try {
3142                    p = UProperty.SCRIPT;
3143                    v = UCharacter.getPropertyValueEnum(p, propertyAlias);
3144                } catch (IllegalArgumentException   e2) {
3145                    try {
3146                        p = UCharacter.getPropertyEnum(propertyAlias);
3147                    } catch (IllegalArgumentException   e3) {
3148                        p = -1;
3149                    }
3150                    if (p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) {
3151                        v = 1;
3152                    } else if (p == -1) {
3153                        if (0 == UPropertyAliases.compare(ANY_ID, propertyAlias)) {
3154                            set(MIN_VALUE, MAX_VALUE);
3155                            return this;
3156                        } else if (0 == UPropertyAliases.compare(ASCII_ID, propertyAlias)) {
3157                            set(0, 0x7F);
3158                            return this;
3159                        } else if (0 == UPropertyAliases.compare(ASSIGNED, propertyAlias)) {
3160                            // [:Assigned:]=[:^Cn:]
3161                            p = UProperty.GENERAL_CATEGORY_MASK;
3162                            v = (1<<UCharacter.UNASSIGNED);
3163                            invert = true;
3164                        } else {
3165                            // Property name was never matched.
3166                            throw new IllegalArgumentException  ("Invalid property alias: " + propertyAlias + "=" + valueAlias);
3167                        }
3168                    } else {
3169                        // Valid property name, but it isn't binary, so the value
3170                        // must be supplied.
3171                        throw new IllegalArgumentException  ("Missing property value");
3172                    }
3173                }
3174            }
3175        }
3176
3177        applyIntPropertyValue(p, v);
3178        if(invert) {
3179            complement();
3180        }
3181
3182        if (mustNotBeEmpty && isEmpty()) {
3183            // mustNotBeEmpty is set to true if an empty set indicates
3184            // invalid input.
3185            throw new IllegalArgumentException  ("Invalid property value");
3186        }
3187
3188        return this;
3189    }
3190
3191    //----------------------------------------------------------------
3192    // Property set patterns
3193    //----------------------------------------------------------------
3194
3195    /**
3196     * Return true if the given position, in the given pattern, appears
3197     * to be the start of a property set pattern.
3198     */
3199    private static boolean resemblesPropertyPattern(String   pattern, int pos) {
3200        // Patterns are at least 5 characters long
3201        if ((pos+5) > pattern.length()) {
3202            return false;
3203        }
3204
3205        // Look for an opening [:, [:^, \p, or \P
3206        return pattern.regionMatches(pos, "[:", 0, 2) ||
3207            pattern.regionMatches(true, pos, "\\p", 0, 2) ||
3208            pattern.regionMatches(pos, "\\N", 0, 2);
3209    }
3210
3211    /**
3212     * Return true if the given iterator appears to point at a
3213     * property pattern.  Regardless of the result, return with the
3214     * iterator unchanged.
3215     * @param chars iterator over the pattern characters.  Upon return
3216     * it will be unchanged.
3217     * @param iterOpts RuleCharacterIterator options
3218     */
3219    private static boolean resemblesPropertyPattern(RuleCharacterIterator chars,
3220                                                    int iterOpts) {
3221        boolean result = false;
3222        iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
3223        Object   pos = chars.getPos(null);
3224        int c = chars.next(iterOpts);
3225        if (c == '[' || c == '\\') {
3226            int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
3227            result = (c == '[') ? (d == ':') :
3228                     (d == 'N' || d == 'p' || d == 'P');
3229        }
3230        chars.setPos(pos);
3231        return result;
3232    }
3233
3234    /**
3235     * Parse the given property pattern at the given parse position.
3236     * @param symbols TODO
3237     */
3238    private UnicodeSet applyPropertyPattern(String   pattern, ParsePosition ppos, SymbolTable symbols) {
3239        int pos = ppos.getIndex();
3240
3241        // On entry, ppos should point to one of the following locations:
3242
3243        // Minimum length is 5 characters, e.g. \p{L}
3244        if ((pos+5) > pattern.length()) {
3245            return null;
3246        }
3247
3248        boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
3249        boolean isName = false; // true for \N{pat}, o/w false
3250        boolean invert = false;
3251
3252        // Look for an opening [:, [:^, \p, or \P
3253        if (pattern.regionMatches(pos, "[:", 0, 2)) {
3254            posix = true;
3255            pos = Utility.skipWhitespace(pattern, pos+2);
3256            if (pos < pattern.length() && pattern.charAt(pos) == '^') {
3257                ++pos;
3258                invert = true;
3259            }
3260        } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) ||
3261                   pattern.regionMatches(pos, "\\N", 0, 2)) {
3262            char c = pattern.charAt(pos+1);
3263            invert = (c == 'P');
3264            isName = (c == 'N');
3265            pos = Utility.skipWhitespace(pattern, pos+2);
3266            if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
3267                // Syntax error; "\p" or "\P" not followed by "{"
3268                return null;
3269            }
3270        } else {
3271            // Open delimiter not seen
3272            return null;
3273        }
3274
3275        // Look for the matching close delimiter, either :] or }
3276        int close = pattern.indexOf(posix ? ":]" : "}", pos);
3277        if (close < 0) {
3278            // Syntax error; close delimiter missing
3279            return null;
3280        }
3281
3282        // Look for an '=' sign.  If this is present, we will parse a
3283        // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
3284        // pattern.
3285        int equals = pattern.indexOf('=', pos);
3286        String   propName, valueName;
3287        if (equals >= 0 && equals < close && !isName) {
3288            // Equals seen; parse medium/long pattern
3289            propName = pattern.substring(pos, equals);
3290            valueName = pattern.substring(equals+1, close);
3291        }
3292
3293        else {
3294            // Handle case where no '=' is seen, and \N{}
3295            propName = pattern.substring(pos, close);
3296            valueName = "";
3297
3298            // Handle \N{name}
3299            if (isName) {
3300                // This is a little inefficient since it means we have to
3301                // parse "na" back to UProperty.NAME even though we already
3302                // know it's UProperty.NAME.  If we refactor the API to
3303                // support args of (int, String) then we can remove
3304                // "na" and make this a little more efficient.
3305                valueName = propName;
3306                propName = "na";
3307            }
3308        }
3309
3310        applyPropertyAlias(propName, valueName, symbols);
3311
3312        if (invert) {
3313            complement();
3314        }
3315
3316        // Move to the limit position after the close delimiter
3317        ppos.setIndex(close + (posix ? 2 : 1));
3318
3319        return this;
3320    }
3321
3322    /**
3323     * Parse a property pattern.
3324     * @param chars iterator over the pattern characters.  Upon return
3325     * it will be advanced to the first character after the parsed
3326     * pattern, or the end of the iteration if all characters are
3327     * parsed.
3328     * @param rebuiltPat the pattern that was parsed, rebuilt or
3329     * copied from the input pattern, as appropriate.
3330     * @param symbols TODO
3331     */
3332    private void applyPropertyPattern(RuleCharacterIterator chars,
3333                                      StringBuffer   rebuiltPat, SymbolTable symbols) {
3334        String   pat = chars.lookahead();
3335        ParsePosition pos = new ParsePosition(0);
3336        applyPropertyPattern(pat, pos, symbols);
3337        if (pos.getIndex() == 0) {
3338            syntaxError(chars, "Invalid property pattern");
3339        }
3340        chars.jumpahead(pos.getIndex());
3341        rebuiltPat.append(pat.substring(0, pos.getIndex()));
3342    }
3343
3344    //----------------------------------------------------------------
3345    // Case folding API
3346    //----------------------------------------------------------------
3347
3348    /**
3349     * Bitmask for constructor and applyPattern() indicating that
3350     * white space should be ignored.  If set, ignore characters for
3351     * which UCharacterProperty.isRuleWhiteSpace() returns true,
3352     * unless they are quoted or escaped.  This may be ORed together
3353     * with other selectors.
3354     * @internal
3355     * @deprecated This API is ICU internal only.
3356     */
3357    public static final int IGNORE_SPACE = 1;
3358
3359    /**
3360     * Bitmask for constructor, applyPattern(), and closeOver()
3361     * indicating letter case.  This may be ORed together with other
3362     * selectors.
3363     *
3364     * Enable case insensitive matching.  E.g., "[ab]" with this flag
3365     * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
3366     * match all except 'a', 'A', 'b', and 'B'. This performs a full
3367     * closure over case mappings, e.g. U+017F for s.
3368     *
3369     * The resulting set is a superset of the input for the code points but
3370     * not for the strings.
3371     * It performs a case mapping closure of the code points and adds
3372     * full case folding strings for the code points, and reduces strings of
3373     * the original set to their full case folding equivalents.
3374     *
3375     * This is designed for case-insensitive matches, for example
3376     * in regular expressions. The full code point case closure allows checking of
3377     * an input character directly against the closure set.
3378     * Strings are matched by comparing the case-folded form from the closure
3379     * set with an incremental case folding of the string in question.
3380     *
3381     * The closure set will also contain single code points if the original
3382     * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
3383     * This is not necessary (that is, redundant) for the above matching method
3384     * but results in the same closure sets regardless of whether the original
3385     * set contained the code point or a string.
3386     *
3387     * @internal
3388     * @deprecated This API is ICU internal only.
3389     */
3390    public static final int CASE = 2;
3391
3392    /**
3393     * Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C
3394     * also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h).
3395     * @see #CASE
3396     * @draft ICU 3.4
3397     * @provisional This API might change or be removed in a future release.
3398     */
3399    public static final int CASE_INSENSITIVE = 2;
3400
3401    /**
3402     * Bitmask for constructor, applyPattern(), and closeOver()
3403     * indicating letter case.  This may be ORed together with other
3404     * selectors.
3405     *
3406     * Enable case insensitive matching.  E.g., "[ab]" with this flag
3407     * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
3408     * match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
3409     * title-, and uppercase mappings as well as the case folding
3410     * of each existing element in the set.
3411     * @draft ICU 3.4
3412     * @provisional This API might change or be removed in a future release.
3413     */
3414    public static final int ADD_CASE_MAPPINGS = 4;
3415
3416    //  add the result of a full case mapping to the set
3417    //  use str as a temporary string to avoid constructing one
3418    private static final void addCaseMapping(UnicodeSet set, int result, StringBuffer   full) {
3419        if(result >= 0) {
3420            if(result > UCaseProps.MAX_STRING_LENGTH) {
3421                // add a single-code point case mapping
3422                set.add(result);
3423            } else {
3424                // add a string case mapping from full with length result
3425                set.add(full.toString());
3426                full.setLength(0);
3427            }
3428        }
3429        // result < 0: the code point mapped to itself, no need to add it
3430        // see UCaseProps
3431    }
3432
3433    /**
3434     * Close this set over the given attribute.  For the attribute
3435     * CASE, the result is to modify this set so that:
3436     *
3437     * 1. For each character or string 'a' in this set, all strings
3438     * 'b' such that foldCase(a) == foldCase(b) are added to this set.
3439     * (For most 'a' that are single characters, 'b' will have
3440     * b.length() == 1.)
3441     *
3442     * 2. For each string 'e' in the resulting set, if e !=
3443     * foldCase(e), 'e' will be removed.
3444     *
3445     * Example: [aq�{Bc}{bC}{Fi}] => [aAqQ�?{ss}{bc}{fi}]
3446     *
3447     * (Here foldCase(x) refers to the operation
3448     * UCharacter.foldCase(x, true), and a == b actually denotes
3449     * a.equals(b), not pointer comparison.)
3450     *
3451     * @param attribute bitmask for attributes to close over.
3452     * Currently only the CASE bit is supported.  Any undefined bits
3453     * are ignored.
3454     * @return a reference to this set.
3455     * @internal
3456     * @deprecated This API is ICU internal only.
3457     */
3458    public UnicodeSet closeOver(int attribute) {
3459        checkFrozen();
3460        if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) {
3461            UCaseProps csp;
3462            try {
3463                csp = UCaseProps.getSingleton();
3464            } catch(IOException   e) {
3465                return this;
3466            }
3467            UnicodeSet foldSet = new UnicodeSet(this);
3468            ULocale root = ULocale.ROOT;
3469
3470            // start with input set to guarantee inclusion
3471            // CASE: remove strings because the strings will actually be reduced (folded);
3472            //       therefore, start with no strings and add only those needed
3473            if((attribute & CASE) != 0) {
3474                foldSet.strings.clear();
3475            }
3476
3477            int n = getRangeCount();
3478            int result;
3479            StringBuffer   full = new StringBuffer  ();
3480            int locCache[] = new int[1];
3481
3482            for (int i=0; i<n; ++i) {
3483                int start = getRangeStart(i);
3484                int end   = getRangeEnd(i);
3485
3486                if((attribute & CASE) != 0) {
3487                    // full case closure
3488                    for (int cp=start; cp<=end; ++cp) {
3489                        csp.addCaseClosure(cp, foldSet);
3490                    }
3491                } else {
3492                    // add case mappings
3493                    // (does not add long s for regular s, or Kelvin for k, for example)
3494                    for (int cp=start; cp<=end; ++cp) {
3495                        result = csp.toFullLower(cp, null, full, root, locCache);
3496                        addCaseMapping(foldSet, result, full);
3497
3498                        result = csp.toFullTitle(cp, null, full, root, locCache);
3499                        addCaseMapping(foldSet, result, full);
3500
3501                        result = csp.toFullUpper(cp, null, full, root, locCache);
3502                        addCaseMapping(foldSet, result, full);
3503
3504                        result = csp.toFullFolding(cp, full, 0);
3505                        addCaseMapping(foldSet, result, full);
3506                    }
3507                }
3508            }
3509            if (!strings.isEmpty()) {
3510                String   str;
3511                if ((attribute & CASE) != 0) {
3512                    Iterator it = strings.iterator();
3513                    while (it.hasNext()) {
3514                        str = UCharacter.foldCase((String  )it.next(), 0);
3515                        if(!csp.addStringCaseClosure(str, foldSet)) {
3516                            foldSet.add(str); // does not map to code points: add the folded string itself
3517                        }
3518                    }
3519                } else {
3520                    BreakIterator bi = BreakIterator.getWordInstance(root);
3521                    Iterator it = strings.iterator();
3522                    while (it.hasNext()) {
3523                        str = (String  )it.next();
3524                        foldSet.add(UCharacter.toLowerCase(root, str));
3525                        foldSet.add(UCharacter.toTitleCase(root, str, bi));
3526                        foldSet.add(UCharacter.toUpperCase(root, str));
3527                        foldSet.add(UCharacter.foldCase(str, 0));
3528                    }
3529                }
3530            }
3531            set(foldSet);
3532        }
3533        return this;
3534    }
3535
3536    /**
3537     * Internal class for customizing UnicodeSet parsing of properties.
3538     * TODO: extend to allow customizing of codepoint ranges
3539     * @internal
3540     * @deprecated This API is ICU internal only.
3541     * @author medavis
3542     */
3543    abstract public static class XSymbolTable implements SymbolTable {
3544        /**
3545         * Default constructor
3546         * @internal
3547         * @deprecated This API is ICU internal only.
3548         */
3549        public XSymbolTable(){}
3550        /**
3551         * @internal
3552         * @deprecated This API is ICU internal only.
3553         */
3554        public UnicodeMatcher lookupMatcher(int i) {
3555            return null;
3556        }
3557        /**
3558         * @internal
3559         * @deprecated This API is ICU internal only.
3560         */
3561        public boolean applyPropertyAlias(String   propertyName, String   propertyValue, UnicodeSet result) {
3562            return false;
3563        }
3564        /**
3565         * @internal
3566         * @deprecated This API is ICU internal only.
3567         */
3568        public char[] lookup(String   s) {
3569            return null;
3570        }
3571        /**
3572         * @internal
3573         * @deprecated This API is ICU internal only.
3574         */
3575        public String   parseReference(String   text, ParsePosition pos, int limit) {
3576            return null;
3577        }
3578    }
3579
3580    private boolean frozen;
3581    
3582    /**
3583     * Is this frozen, according to the Freezable interface?
3584     * @return value
3585     * @internal
3586     * @deprecated This API is ICU internal only.
3587     */
3588    public boolean isFrozen() {
3589        return frozen;
3590    }
3591
3592    /**
3593     * Freeze this class, according to the Freezable interface.
3594     * @return this
3595     * @internal
3596     * @deprecated This API is ICU internal only.
3597     */
3598    public Object   freeze() {
3599        frozen = true;
3600        return this;
3601    }
3602    
3603    /**
3604     * Clone a thawed version of this class, according to the Freezable interface.
3605     * @return this
3606     * @internal
3607     * @deprecated This API is ICU internal only.
3608     */
3609    public Object   cloneAsThawed() {
3610        UnicodeSet result = (UnicodeSet) clone();
3611        result.frozen = false;
3612        return result;
3613    }
3614    
3615    // internal function
3616    private void checkFrozen() {
3617        if (frozen) {
3618            throw new UnsupportedOperationException  ("Attempt to modify frozen object");
3619        }
3620    }
3621}
3622//eof
3623
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags