KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > UnicodeSet


1 //##header 1189099963000 FOUNDATION
2
/*
3  *******************************************************************************
4  * Copyright (C) 1996-2006, International Business Machines Corporation and *
5  * others. All Rights Reserved. *
6  *******************************************************************************
7  */

8 package com.ibm.icu.text;
9
10 import java.text.*;
11 import com.ibm.icu.lang.*;
12
13 import java.io.IOException JavaDoc;
14
15 import com.ibm.icu.impl.CollectionUtilities;
16 import com.ibm.icu.impl.NormalizerImpl;
17 import com.ibm.icu.impl.Utility;
18 import com.ibm.icu.impl.UCharacterProperty;
19 import com.ibm.icu.impl.UBiDiProps;
20 import com.ibm.icu.impl.UCaseProps;
21 import com.ibm.icu.impl.UPropertyAliases;
22 import com.ibm.icu.impl.SortedSetRelation;
23 import com.ibm.icu.impl.RuleCharacterIterator;
24
25 import com.ibm.icu.util.Freezable;
26 import com.ibm.icu.util.ULocale;
27 import com.ibm.icu.util.VersionInfo;
28
29 import com.ibm.icu.text.BreakIterator;
30
31 import java.util.Map JavaDoc;
32 import java.util.HashMap JavaDoc;
33 import java.util.MissingResourceException JavaDoc;
34 import java.util.TreeSet JavaDoc;
35 import java.util.Iterator JavaDoc;
36 import java.util.Collection JavaDoc;
37
38 /**
39  * A mutable set of Unicode characters and multicharacter strings. Objects of this class
40  * represent <em>character classes</em> used in regular expressions.
41  * A character class specifies a subset of Unicode code points. Legal
42  * code points are U+0000 to U+10FFFF, inclusive.
43  *
44  * <p>The UnicodeSet class is not designed to be subclassed.
45  *
46  * <p><code>UnicodeSet</code> supports two APIs. The first is the
47  * <em>operand</em> API that allows the caller to modify the value of
48  * a <code>UnicodeSet</code> object. It conforms to Java 2's
49  * <code>java.util.Set</code> interface, although
50  * <code>UnicodeSet</code> does not actually implement that
51  * interface. All methods of <code>Set</code> are supported, with the
52  * modification that they take a character range or single character
53  * instead of an <code>Object</code>, and they take a
54  * <code>UnicodeSet</code> instead of a <code>Collection</code>. The
55  * operand API may be thought of in terms of boolean logic: a boolean
56  * OR is implemented by <code>add</code>, a boolean AND is implemented
57  * by <code>retain</code>, a boolean XOR is implemented by
58  * <code>complement</code> taking an argument, and a boolean NOT is
59  * implemented by <code>complement</code> with no argument. In terms
60  * of traditional set theory function names, <code>add</code> is a
61  * union, <code>retain</code> is an intersection, <code>remove</code>
62  * is an asymmetric difference, and <code>complement</code> with no
63  * argument is a set complement with respect to the superset range
64  * <code>MIN_VALUE-MAX_VALUE</code>.
65  *
66  * <p>The second API is the
67  * <code>applyPattern()</code>/<code>toPattern()</code> API from the
68  * <code>java.text.Format</code>-derived classes. Unlike the
69  * methods that add characters, add categories, and control the logic
70  * of the set, the method <code>applyPattern()</code> sets all
71  * attributes of a <code>UnicodeSet</code> at once, based on a
72  * string pattern.
73  *
74  * <p><b>Pattern syntax</b></p>
75  *
76  * Patterns are accepted by the constructors and the
77  * <code>applyPattern()</code> methods and returned by the
78  * <code>toPattern()</code> method. These patterns follow a syntax
79  * similar to that employed by version 8 regular expression character
80  * classes. Here are some simple examples:
81  *
82  * <blockquote>
83  * <table>
84  * <tr align="top">
85  * <td nowrap valign="top" align="left"><code>[]</code></td>
86  * <td valign="top">No characters</td>
87  * </tr><tr align="top">
88  * <td nowrap valign="top" align="left"><code>[a]</code></td>
89  * <td valign="top">The character 'a'</td>
90  * </tr><tr align="top">
91  * <td nowrap valign="top" align="left"><code>[ae]</code></td>
92  * <td valign="top">The characters 'a' and 'e'</td>
93  * </tr>
94  * <tr>
95  * <td nowrap valign="top" align="left"><code>[a-e]</code></td>
96  * <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
97  * point order</td>
98  * </tr>
99  * <tr>
100  * <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
101  * <td valign="top">The character U+4E01</td>
102  * </tr>
103  * <tr>
104  * <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
105  * <td valign="top">The character 'a' and the multicharacter strings &quot;ab&quot; and
106  * &quot;ac&quot;</td>
107  * </tr>
108  * <tr>
109  * <td nowrap valign="top" align="left"><code>[\p{Lu}]</code></td>
110  * <td valign="top">All characters in the general category Uppercase Letter</td>
111  * </tr>
112  * </table>
113  * </blockquote>
114  *
115  * Any character may be preceded by a backslash in order to remove any special
116  * meaning. White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are
117  * ignored, unless they are escaped.
118  *
119  * <p>Property patterns specify a set of characters having a certain
120  * property as defined by the Unicode standard. Both the POSIX-like
121  * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a
122  * complete list of supported property patterns, see the User's Guide
123  * for UnicodeSet at
124  * <a HREF="http://icu.sourceforge.net/userguide/unicodeSet.html">
125  * http://icu.sourceforge.net/userguide/unicodeSet.html</a>.
126  * Actual determination of property data is defined by the underlying
127  * Unicode database as implemented by UCharacter.
128  *
129  * <p>Patterns specify individual characters, ranges of characters, and
130  * Unicode property sets. When elements are concatenated, they
131  * specify their union. To complement a set, place a '^' immediately
132  * after the opening '['. Property patterns are inverted by modifying
133  * their delimiters; "[:^foo:]" and "\P{foo}". In any other location,
134  * '^' has no special meaning.
135  *
136  * <p>Ranges are indicated by placing a '-' between two
137  * characters, as in "a-z". This specifies the range of all
138  * characters from the left to the right, in Unicode order. If the
139  * left character is greater than or equal to the
140  * right character it is a syntax error. If a '-' occurs as the first
141  * character after the opening '[' or '[^', or if it occurs as the
142  * last character before the closing ']', then it is taken as a
143  * literal. Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
144  * set of three characters, 'a', 'b', and '-'.
145  *
146  * <p>Sets may be intersected using the '&' operator or the asymmetric
147  * set difference may be taken using the '-' operator, for example,
148  * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
149  * with values less than 4096. Operators ('&' and '-') have equal
150  * precedence and bind left-to-right. Thus
151  * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
152  * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
153  * difference; intersection is commutative.
154  *
155  * <table>
156  * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
157  * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
158  * through 'z' and all letters in between, in Unicode order
159  * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
160  * all characters but 'a' through 'z',
161  * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
162  * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
163  * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
164  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
165  * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
166  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
167  * <td>The asymmetric difference of sets specified by <em>pat1</em> and
168  * <em>pat2</em>
169  * <tr valign=top><td nowrap><code>[:Lu:] or \p{Lu}</code>
170  * <td>The set of characters having the specified
171  * Unicode property; in
172  * this case, Unicode uppercase letters
173  * <tr valign=top><td nowrap><code>[:^Lu:] or \P{Lu}</code>
174  * <td>The set of characters <em>not</em> having the given
175  * Unicode property
176  * </table>
177  *
178  * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
179  *
180  * <p><b>Formal syntax</b></p>
181  *
182  * <blockquote>
183  * <table>
184  * <tr align="top">
185  * <td nowrap valign="top" align="right"><code>pattern :=&nbsp; </code></td>
186  * <td valign="top"><code>('[' '^'? item* ']') |
187  * property</code></td>
188  * </tr>
189  * <tr align="top">
190  * <td nowrap valign="top" align="right"><code>item :=&nbsp; </code></td>
191  * <td valign="top"><code>char | (char '-' char) | pattern-expr<br>
192  * </code></td>
193  * </tr>
194  * <tr align="top">
195  * <td nowrap valign="top" align="right"><code>pattern-expr :=&nbsp; </code></td>
196  * <td valign="top"><code>pattern | pattern-expr pattern |
197  * pattern-expr op pattern<br>
198  * </code></td>
199  * </tr>
200  * <tr align="top">
201  * <td nowrap valign="top" align="right"><code>op :=&nbsp; </code></td>
202  * <td valign="top"><code>'&amp;' | '-'<br>
203  * </code></td>
204  * </tr>
205  * <tr align="top">
206  * <td nowrap valign="top" align="right"><code>special :=&nbsp; </code></td>
207  * <td valign="top"><code>'[' | ']' | '-'<br>
208  * </code></td>
209  * </tr>
210  * <tr align="top">
211  * <td nowrap valign="top" align="right"><code>char :=&nbsp; </code></td>
212  * <td valign="top"><em>any character that is not</em><code> special<br>
213  * | ('\\' </code><em>any character</em><code>)<br>
214  * | ('&#92;u' hex hex hex hex)<br>
215  * </code></td>
216  * </tr>
217  * <tr align="top">
218  * <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>
219  * <td valign="top"><em>any character for which
220  * </em><code>Character.digit(c, 16)</code><em>
221  * returns a non-negative result</em></td>
222  * </tr>
223  * <tr>
224  * <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>
225  * <td valign="top"><em>a Unicode property set pattern</em></td>
226  * </tr>
227  * </table>
228  * <br>
229  * <table border="1">
230  * <tr>
231  * <td>Legend: <table>
232  * <tr>
233  * <td nowrap valign="top"><code>a := b</code></td>
234  * <td width="20" valign="top">&nbsp; </td>
235  * <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
236  * </tr>
237  * <tr>
238  * <td nowrap valign="top"><code>a?</code></td>
239  * <td valign="top"></td>
240  * <td valign="top">zero or one instance of <code>a</code><br>
241  * </td>
242  * </tr>
243  * <tr>
244  * <td nowrap valign="top"><code>a*</code></td>
245  * <td valign="top"></td>
246  * <td valign="top">zero or more instances of <code>a</code><br>
247  * </td>
248  * </tr>
249  * <tr>
250  * <td nowrap valign="top"><code>a | b</code></td>
251  * <td valign="top"></td>
252  * <td valign="top">either <code>a</code> or <code>b</code><br>
253  * </td>
254  * </tr>
255  * <tr>
256  * <td nowrap valign="top"><code>'a'</code></td>
257  * <td valign="top"></td>
258  * <td valign="top">the literal string between the quotes </td>
259  * </tr>
260  * </table>
261  * </td>
262  * </tr>
263  * </table>
264  * </blockquote>
265  * <p>To iterate over contents of UnicodeSet, use UnicodeSetIterator class.
266  *
267  * @author Alan Liu
268  * @stable ICU 2.0
269  * @see UnicodeSetIterator
270  */

271 public class UnicodeSet extends UnicodeFilter implements Freezable {
272
273     private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
274
private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
275
// 110000 for codepoints
276

277     /**
278      * Minimum value that can be stored in a UnicodeSet.
279      * @stable ICU 2.0
280      */

281     public static final int MIN_VALUE = LOW;
282
283     /**
284      * Maximum value that can be stored in a UnicodeSet.
285      * @stable ICU 2.0
286      */

287     public static final int MAX_VALUE = HIGH - 1;
288
289     private int len; // length used; list may be longer to minimize reallocs
290
private int[] list; // MUST be terminated with HIGH
291
private int[] rangeList; // internal buffer
292
private int[] buffer; // internal buffer
293

294     // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!!
295
// is not private so that UnicodeSetIterator can get access
296
TreeSet JavaDoc strings = new TreeSet JavaDoc();
297
298     /**
299      * The pattern representation of this set. This may not be the
300      * most economical pattern. It is the pattern supplied to
301      * applyPattern(), with variables substituted and whitespace
302      * removed. For sets constructed without applyPattern(), or
303      * modified using the non-pattern API, this string will be null,
304      * indicating that toPattern() must generate a pattern
305      * representation from the inversion list.
306      */

307     private String JavaDoc pat = null;
308
309     private static final int START_EXTRA = 16; // initial storage. Must be >= 0
310
private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
311

312     // Special property set IDs
313
private static final String JavaDoc ANY_ID = "ANY"; // [\u0000-\U0010FFFF]
314
private static final String JavaDoc ASCII_ID = "ASCII"; // [\u0000-\u007F]
315
private static final String JavaDoc ASSIGNED = "Assigned"; // [:^Cn:]
316

317     /**
318      * A set of all characters _except_ the second through last characters of
319      * certain ranges. These ranges are ranges of characters whose
320      * properties are all exactly alike, e.g. CJK Ideographs from
321      * U+4E00 to U+9FA5.
322      */

323     private static UnicodeSet INCLUSIONS[] = null;
324
325     //----------------------------------------------------------------
326
// Public API
327
//----------------------------------------------------------------
328

329     /**
330      * Constructs an empty set.
331      * @stable ICU 2.0
332      */

333     public UnicodeSet() {
334         list = new int[1 + START_EXTRA];
335         list[len++] = HIGH;
336     }
337
/**
 * Constructs a copy of an existing set by delegating to
 * {@link #set(UnicodeSet)}.
 * @param other the set whose contents are copied into the new set
 * @stable ICU 2.0
 */
public UnicodeSet(UnicodeSet other) {
    set(other);
}
345
/**
 * Constructs a set containing the given range. If <code>start &gt;
 * end</code> then an empty set is created.
 *
 * <p>The set starts out empty and the range is then added by
 * complementing it.
 *
 * @param start first character, inclusive, of range
 * @param end last character, inclusive, of range
 * @stable ICU 2.0
 */
public UnicodeSet(int start, int end) {
    this();
    complement(start, end);
}
358
/**
 * Constructs a set from the given pattern, ignoring whitespace in the
 * pattern. See the class description for the syntax of the pattern
 * language.
 * @param pattern a string specifying what characters are in the set
 * @exception java.lang.IllegalArgumentException if the pattern contains
 * a syntax error.
 * @stable ICU 2.0
 */
public UnicodeSet(String pattern) {
    this();
    // No parse position and no symbol table; whitespace is skipped.
    applyPattern(pattern, null, null, IGNORE_SPACE);
}
371
372     /**
373      * Constructs a set from the given pattern. See the class description
374      * for the syntax of the pattern language.
375      * @param pattern a string specifying what characters are in the set
376      * @param ignoreWhitespace if true, ignore characters for which
377      * UCharacterProperty.isRuleWhiteSpace() returns true
378      * @exception java.lang.IllegalArgumentException if the pattern contains
379      * a syntax error.
380      * @stable ICU 2.0
381      */

382     public UnicodeSet(String JavaDoc pattern, boolean ignoreWhitespace) {
383         this();
384         applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0);
385     }
386
/**
 * Constructs a set from the given pattern using an explicit option
 * bitmask. See the class description for the syntax of the pattern
 * language.
 * @param pattern a string specifying what characters are in the set
 * @param options a bitmask indicating which options to apply.
 * Valid options are IGNORE_SPACE and CASE.
 * @exception java.lang.IllegalArgumentException if the pattern contains
 * a syntax error.
 * @internal
 * @deprecated This API is ICU internal only.
 */
public UnicodeSet(String pattern, int options) {
    this();
    // No parse position and no symbol table are supplied.
    applyPattern(pattern, null, null, options);
}
402
/**
 * Constructs a set by parsing the given pattern from a caller-supplied
 * position, ignoring whitespace. See the class description for the
 * syntax of the pattern language.
 * @param pattern a string specifying what characters are in the set
 * @param pos on input, the position in pattern at which to start parsing.
 * On output, the position after the last character parsed.
 * @param symbols a symbol table mapping variables to char[] arrays
 * and chars to UnicodeSets
 * @exception java.lang.IllegalArgumentException if the pattern
 * contains a syntax error.
 * @stable ICU 2.0
 */
public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols) {
    this();
    applyPattern(pattern, pos, symbols, IGNORE_SPACE);
}
419
/**
 * Constructs a set by parsing the given pattern from a caller-supplied
 * position with an explicit option bitmask. See the class description
 * for the syntax of the pattern language.
 * @param pattern a string specifying what characters are in the set
 * @param pos on input, the position in pattern at which to start parsing.
 * On output, the position after the last character parsed.
 * @param symbols a symbol table mapping variables to char[] arrays
 * and chars to UnicodeSets
 * @param options a bitmask indicating which options to apply.
 * Valid options are IGNORE_SPACE and CASE.
 * @exception java.lang.IllegalArgumentException if the pattern
 * contains a syntax error.
 * @draft ICU 3.2
 * @provisional This API might change or be removed in a future release.
 */
public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options) {
    this();
    applyPattern(pattern, pos, symbols, options);
}
439
440
441     /**
442      * Return a new set that is equivalent to this one.
443      * @stable ICU 2.0
444      */

445     public Object JavaDoc clone() {
446         UnicodeSet result = new UnicodeSet(this);
447         result.frozen = this.frozen;
448         return result;
449     }
450
451     /**
452      * Make this object represent the range <code>start - end</code>.
453      * If <code>end > start</code> then this object is set to an
454      * an empty range.
455      *
456      * @param start first character in the set, inclusive
457      * @param end last character in the set, inclusive
458      * @stable ICU 2.0
459      */

460     public UnicodeSet set(int start, int end) {
461         checkFrozen();
462         clear();
463         complement(start, end);
464         return this;
465     }
466
467     /**
468      * Make this object represent the same set as <code>other</code>.
469      * @param other a <code>UnicodeSet</code> whose value will be
470      * copied to this object
471      * @stable ICU 2.0
472      */

473     public UnicodeSet set(UnicodeSet other) {
474         checkFrozen();
475         list = (int[]) other.list.clone();
476         len = other.len;
477         pat = other.pat;
478         strings = (TreeSet JavaDoc)other.strings.clone();
479         return this;
480     }
481
482     /**
483      * Modifies this set to represent the set specified by the given pattern.
484      * See the class description for the syntax of the pattern language.
485      * Whitespace is ignored.
486      * @param pattern a string specifying what characters are in the set
487      * @exception java.lang.IllegalArgumentException if the pattern
488      * contains a syntax error.
489      * @stable ICU 2.0
490      */

491     public final UnicodeSet applyPattern(String JavaDoc pattern) {
492         checkFrozen();
493         return applyPattern(pattern, null, null, IGNORE_SPACE);
494     }
495
496     /**
497      * Modifies this set to represent the set specified by the given pattern,
498      * optionally ignoring whitespace.
499      * See the class description for the syntax of the pattern language.
500      * @param pattern a string specifying what characters are in the set
501      * @param ignoreWhitespace if true then characters for which
502      * UCharacterProperty.isRuleWhiteSpace() returns true are ignored
503      * @exception java.lang.IllegalArgumentException if the pattern
504      * contains a syntax error.
505      * @stable ICU 2.0
506      */

507     public UnicodeSet applyPattern(String JavaDoc pattern, boolean ignoreWhitespace) {
508         checkFrozen();
509         return applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0);
510     }
511
512     /**
513      * Modifies this set to represent the set specified by the given pattern,
514      * optionally ignoring whitespace.
515      * See the class description for the syntax of the pattern language.
516      * @param pattern a string specifying what characters are in the set
517      * @param options a bitmask indicating which options to apply.
518      * Valid options are IGNORE_SPACE and CASE.
519      * @exception java.lang.IllegalArgumentException if the pattern
520      * contains a syntax error.
521      * @internal
522      * @deprecated This API is ICU internal only.
523      */

524     public UnicodeSet applyPattern(String JavaDoc pattern, int options) {
525         checkFrozen();
526         return applyPattern(pattern, null, null, options);
527     }
528
529     /**
530      * Return true if the given position, in the given pattern, appears
531      * to be the start of a UnicodeSet pattern.
532      * @stable ICU 2.0
533      */

534     public static boolean resemblesPattern(String JavaDoc pattern, int pos) {
535         return ((pos+1) < pattern.length() &&
536                 pattern.charAt(pos) == '[') ||
537             resemblesPropertyPattern(pattern, pos);
538     }
539
540     /**
541      * Append the <code>toPattern()</code> representation of a
542      * string to the given <code>StringBuffer</code>.
543      */

544     private static void _appendToPat(StringBuffer JavaDoc buf, String JavaDoc s, boolean escapeUnprintable) {
545         for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {
546             _appendToPat(buf, UTF16.charAt(s, i), escapeUnprintable);
547         }
548     }
549
550     /**
551      * Append the <code>toPattern()</code> representation of a
552      * character to the given <code>StringBuffer</code>.
553      */

554     private static void _appendToPat(StringBuffer JavaDoc buf, int c, boolean escapeUnprintable) {
555         if (escapeUnprintable && Utility.isUnprintable(c)) {
556             // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
557
// unprintable
558
if (Utility.escapeUnprintable(buf, c)) {
559                 return;
560             }
561         }
562         // Okay to let ':' pass through
563
switch (c) {
564         case '[': // SET_OPEN:
565
case ']': // SET_CLOSE:
566
case '-': // HYPHEN:
567
case '^': // COMPLEMENT:
568
case '&': // INTERSECTION:
569
case '\\': //BACKSLASH:
570
case '{':
571         case '}':
572         case '$':
573         case ':':
574             buf.append('\\');
575             break;
576         default:
577             // Escape whitespace
578
if (UCharacterProperty.isRuleWhiteSpace(c)) {
579                 buf.append('\\');
580             }
581             break;
582         }
583         UTF16.append(buf, c);
584     }
585
586     /**
587      * Returns a string representation of this set. If the result of
588      * calling this function is passed to a UnicodeSet constructor, it
589      * will produce another set that is equal to this one.
590      * @stable ICU 2.0
591      */

592     public String JavaDoc toPattern(boolean escapeUnprintable) {
593         StringBuffer JavaDoc result = new StringBuffer JavaDoc();
594         return _toPattern(result, escapeUnprintable).toString();
595     }
596
597     /**
598      * Append a string representation of this set to result. This will be
599      * a cleaned version of the string passed to applyPattern(), if there
600      * is one. Otherwise it will be generated.
601      */

602     private StringBuffer JavaDoc _toPattern(StringBuffer JavaDoc result,
603                                     boolean escapeUnprintable) {
604         if (pat != null) {
605             int i;
606             int backslashCount = 0;
607             for (i=0; i<pat.length(); ) {
608                 int c = UTF16.charAt(pat, i);
609                 i += UTF16.getCharCount(c);
610                 if (escapeUnprintable && Utility.isUnprintable(c)) {
611                     // If the unprintable character is preceded by an odd
612
// number of backslashes, then it has been escaped.
613
// Before unescaping it, we delete the final
614
// backslash.
615
if ((backslashCount % 2) == 1) {
616                         result.setLength(result.length() - 1);
617                     }
618                     Utility.escapeUnprintable(result, c);
619                     backslashCount = 0;
620                 } else {
621                     UTF16.append(result, c);
622                     if (c == '\\') {
623                         ++backslashCount;
624                     } else {
625                         backslashCount = 0;
626                     }
627                 }
628             }
629             return result;
630         }
631
632         return _generatePattern(result, escapeUnprintable, true);
633     }
634
635     /**
636      * Generate and append a string representation of this set to result.
637      * This does not use this.pat, the cleaned up copy of the string
638      * passed to applyPattern().
639      * @param result the buffer into which to generate the pattern
640      * @param escapeUnprintable escape unprintable characters if true
641      * @stable ICU 2.0
642      */

643     public StringBuffer JavaDoc _generatePattern(StringBuffer JavaDoc result, boolean escapeUnprintable) {
644         return _generatePattern(result, escapeUnprintable, true);
645     }
646
647     /**
648      * Generate and append a string representation of this set to result.
649      * This does not use this.pat, the cleaned up copy of the string
650      * passed to applyPattern().
651      * @param includeStrings if false, doesn't include the strings.
652      * @internal
653      * @deprecated This API is ICU internal only.
654      */

655     public StringBuffer JavaDoc _generatePattern(StringBuffer JavaDoc result,
656                                          boolean escapeUnprintable, boolean includeStrings) {
657         result.append('[');
658
659 // // Check against the predefined categories. We implicitly build
660
// // up ALL category sets the first time toPattern() is called.
661
// for (int cat=0; cat<CATEGORY_COUNT; ++cat) {
662
// if (this.equals(getCategorySet(cat))) {
663
// result.append(':');
664
// result.append(CATEGORY_NAMES.substring(cat*2, cat*2+2));
665
// return result.append(":]");
666
// }
667
// }
668

669         int count = getRangeCount();
670
671         // If the set contains at least 2 intervals and includes both
672
// MIN_VALUE and MAX_VALUE, then the inverse representation will
673
// be more economical.
674
if (count > 1 &&
675             getRangeStart(0) == MIN_VALUE &&
676             getRangeEnd(count-1) == MAX_VALUE) {
677
678             // Emit the inverse
679
result.append('^');
680
681             for (int i = 1; i < count; ++i) {
682                 int start = getRangeEnd(i-1)+1;
683                 int end = getRangeStart(i)-1;
684                 _appendToPat(result, start, escapeUnprintable);
685                 if (start != end) {
686                     if ((start+1) != end) {
687                         result.append('-');
688                     }
689                     _appendToPat(result, end, escapeUnprintable);
690                 }
691             }
692         }
693
694         // Default; emit the ranges as pairs
695
else {
696             for (int i = 0; i < count; ++i) {
697                 int start = getRangeStart(i);
698                 int end = getRangeEnd(i);
699                 _appendToPat(result, start, escapeUnprintable);
700                 if (start != end) {
701                     if ((start+1) != end) {
702                         result.append('-');
703                     }
704                     _appendToPat(result, end, escapeUnprintable);
705                 }
706             }
707         }
708
709         if (includeStrings && strings.size() > 0) {
710             Iterator it = strings.iterator();
711             while (it.hasNext()) {
712                 result.append('{');
713                 _appendToPat(result, (String JavaDoc) it.next(), escapeUnprintable);
714                 result.append('}');
715             }
716         }
717         return result.append(']');
718     }
719
720     /**
721      * Returns the number of elements in this set (its cardinality)
722      * Note than the elements of a set may include both individual
723      * codepoints and strings.
724      *
725      * @return the number of elements in this set (its cardinality).
726      * @stable ICU 2.0
727      */

728     public int size() {
729         int n = 0;
730         int count = getRangeCount();
731         for (int i = 0; i < count; ++i) {
732             n += getRangeEnd(i) - getRangeStart(i) + 1;
733         }
734         return n + strings.size();
735     }
736
737     /**
738      * Returns <tt>true</tt> if this set contains no elements.
739      *
740      * @return <tt>true</tt> if this set contains no elements.
741      * @stable ICU 2.0
742      */

743     public boolean isEmpty() {
744         return len == 1 && strings.size() == 0;
745     }
746
747     /**
748      * Implementation of UnicodeMatcher API. Returns <tt>true</tt> if
749      * this set contains any character whose low byte is the given
750      * value. This is used by <tt>RuleBasedTransliterator</tt> for
751      * indexing.
752      * @stable ICU 2.0
753      */

754     public boolean matchesIndexValue(int v) {
755         /* The index value v, in the range [0,255], is contained in this set if
756          * it is contained in any pair of this set. Pairs either have the high
757          * bytes equal, or unequal. If the high bytes are equal, then we have
758          * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
759          * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
760          * Then v is contained if xx <= v || v <= yy. (This is identical to the
761          * time zone month containment logic.)
762          */

763         for (int i=0; i<getRangeCount(); ++i) {
764             int low = getRangeStart(i);
765             int high = getRangeEnd(i);
766             if ((low & ~0xFF) == (high & ~0xFF)) {
767                 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
768                     return true;
769                 }
770             } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
771                 return true;
772             }
773         }
774         if (strings.size() != 0) {
775             Iterator it = strings.iterator();
776             while (it.hasNext()) {
777                 String JavaDoc s = (String JavaDoc) it.next();
778                 //if (s.length() == 0) {
779
// // Empty strings match everything
780
// return true;
781
//}
782
// assert(s.length() != 0); // We enforce this elsewhere
783
int c = UTF16.charAt(s, 0);
784                 if ((c & 0xFF) == v) {
785                     return true;
786                 }
787             }
788         }
789         return false;
790     }
791
792     /**
793      * Implementation of UnicodeMatcher.matches(). Always matches the
794      * longest possible multichar string.
795      * @stable ICU 2.0
796      */

797     public int matches(Replaceable text,
798                        int[] offset,
799                        int limit,
800                        boolean incremental) {
801
802         if (offset[0] == limit) {
803             // Strings, if any, have length != 0, so we don't worry
804
// about them here. If we ever allow zero-length strings
805
// we much check for them here.
806
if (contains(UnicodeMatcher.ETHER)) {
807                 return incremental ? U_PARTIAL_MATCH : U_MATCH;
808             } else {
809                 return U_MISMATCH;
810             }
811         } else {
812             if (strings.size() != 0) { // try strings first
813

814                 // might separate forward and backward loops later
815
// for now they are combined
816

817                 // TODO Improve efficiency of this, at least in the forward
818
// direction, if not in both. In the forward direction we
819
// can assume the strings are sorted.
820

821                 Iterator it = strings.iterator();
822                 boolean forward = offset[0] < limit;
823
824                 // firstChar is the leftmost char to match in the
825
// forward direction or the rightmost char to match in
826
// the reverse direction.
827
char firstChar = text.charAt(offset[0]);
828
829                 // If there are multiple strings that can match we
830
// return the longest match.
831
int highWaterLength = 0;
832
833                 while (it.hasNext()) {
834                     String JavaDoc trial = (String JavaDoc) it.next();
835
836                     //if (trial.length() == 0) {
837
// return U_MATCH; // null-string always matches
838
//}
839
// assert(trial.length() != 0); // We ensure this elsewhere
840

841                     char c = trial.charAt(forward ? 0 : trial.length() - 1);
842
843                     // Strings are sorted, so we can optimize in the
844
// forward direction.
845
if (forward && c > firstChar) break;
846                     if (c != firstChar) continue;
847
848                     int len = matchRest(text, offset[0], limit, trial);
849
850                     if (incremental) {
851                         int maxLen = forward ? limit-offset[0] : offset[0]-limit;
852                         if (len == maxLen) {
853                             // We have successfully matched but only up to limit.
854
return U_PARTIAL_MATCH;
855                         }
856                     }
857
858                     if (len == trial.length()) {
859                         // We have successfully matched the whole string.
860
if (len > highWaterLength) {
861                             highWaterLength = len;
862                         }
863                         // In the forward direction we know strings
864
// are sorted so we can bail early.
865
if (forward && len < highWaterLength) {
866                             break;
867                         }
868                         continue;
869                     }
870                 }
871
872                 // We've checked all strings without a partial match.
873
// If we have full matches, return the longest one.
874
if (highWaterLength != 0) {
875                     offset[0] += forward ? highWaterLength : -highWaterLength;
876                     return U_MATCH;
877                 }
878             }
879             return super.matches(text, offset, limit, incremental);
880         }
881     }
882
883     /**
884      * Returns the longest match for s in text at the given position.
885      * If limit > start then match forward from start+1 to limit
886      * matching all characters except s.charAt(0). If limit < start,
887      * go backward starting from start-1 matching all characters
888      * except s.charAt(s.length()-1). This method assumes that the
889      * first character, text.charAt(start), matches s, so it does not
890      * check it.
891      * @param text the text to match
892      * @param start the first character to match. In the forward
893      * direction, text.charAt(start) is matched against s.charAt(0).
894      * In the reverse direction, it is matched against
895      * s.charAt(s.length()-1).
896      * @param limit the limit offset for matching, either last+1 in
897      * the forward direction, or last-1 in the reverse direction,
898      * where last is the index of the last character to match.
899      * @return If part of s matches up to the limit, return |limit -
900      * start|. If all of s matches before reaching the limit, return
901      * s.length(). If there is a mismatch between s and text, return
902      * 0
903      */

904     private static int matchRest (Replaceable text, int start, int limit, String JavaDoc s) {
905         int maxLen;
906         int slen = s.length();
907         if (start < limit) {
908             maxLen = limit - start;
909             if (maxLen > slen) maxLen = slen;
910             for (int i = 1; i < maxLen; ++i) {
911                 if (text.charAt(start + i) != s.charAt(i)) return 0;
912             }
913         } else {
914             maxLen = start - limit;
915             if (maxLen > slen) maxLen = slen;
916             --slen; // <=> slen = s.length() - 1;
917
for (int i = 1; i < maxLen; ++i) {
918                 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
919             }
920         }
921         return maxLen;
922     }
923
924 //#ifndef FOUNDATION
925
//## /**
926
//## * Tests whether the text matches at the offset. If so, returns the end of the longest substring that it matches. If not, returns -1. For now, an internal routine.
927
//## * @internal
928
//## * @deprecated This API is ICU internal only.
929
//## */
930
//## public int matchesAt(CharSequence text, int offset) {
931
//## int len = -1;
932
//## strings:
933
//## if (strings.size() != 0) {
934
//## char firstChar = text.charAt(offset);
935
//## String trial = null;
936
//## // find the first string starting with firstChar
937
//## Iterator it = strings.iterator();
938
//## while (it.hasNext()) {
939
//## trial = (String) it.next();
940
//## char firstStringChar = trial.charAt(0);
941
//## if (firstStringChar < firstChar) continue;
942
//## if (firstStringChar > firstChar) break strings;
943
//## }
944
//## // now keep checking string until we get the longest one
945
//## while (true) {
946
//## int tempLen = CollectionUtilities.matchesAt(text, offset, trial);
947
//## if (len > tempLen) break strings;
948
//## len = tempLen;
949
//## if (!it.hasNext()) break;
950
//## trial = (String) it.next();
951
//## }
952
//## }
953
//## if (len < 2) {
954
//## int cp = UTF16.charAt(text, offset);
955
//## if (contains(cp)) {
956
//## len = UTF16.getCharCount(cp);
957
//## }
958
//## }
959
//## return offset+len;
960
//## }
961
//#endif
962

963     /**
964      * Implementation of UnicodeMatcher API. Union the set of all
965      * characters that may be matched by this object into the given
966      * set.
967      * @param toUnionTo the set into which to union the source characters
968      * @stable ICU 2.2
969      */

970     public void addMatchSetTo(UnicodeSet toUnionTo) {
971         toUnionTo.addAll(this);
972     }
973
974     /**
975      * Returns the index of the given character within this set, where
976      * the set is ordered by ascending code point. If the character
977      * is not in this set, return -1. The inverse of this method is
978      * <code>charAt()</code>.
979      * @return an index from 0..size()-1, or -1
980      * @stable ICU 2.0
981      */

982     public int indexOf(int c) {
983         if (c < MIN_VALUE || c > MAX_VALUE) {
984             throw new IllegalArgumentException JavaDoc("Invalid code point U+" + Utility.hex(c, 6));
985         }
986         int i = 0;
987         int n = 0;
988         for (;;) {
989             int start = list[i++];
990             if (c < start) {
991                 return -1;
992             }
993             int limit = list[i++];
994             if (c < limit) {
995                 return n + c - start;
996             }
997             n += limit - start;
998         }
999     }
1000
1001    /**
1002     * Returns the character at the given index within this set, where
1003     * the set is ordered by ascending code point. If the index is
1004     * out of range, return -1. The inverse of this method is
1005     * <code>indexOf()</code>.
1006     * @param index an index from 0..size()-1
1007     * @return the character at the given index, or -1.
1008     * @stable ICU 2.0
1009     */

1010    public int charAt(int index) {
1011        if (index >= 0) {
1012            // len2 is the largest even integer <= len, that is, it is len
1013
// for even values and len-1 for odd values. With odd values
1014
// the last entry is UNICODESET_HIGH.
1015
int len2 = len & ~1;
1016            for (int i=0; i < len2;) {
1017                int start = list[i++];
1018                int count = list[i++] - start;
1019                if (index < count) {
1020                    return start + index;
1021                }
1022                index -= count;
1023            }
1024        }
1025        return -1;
1026    }
1027
1028    /**
1029     * Adds the specified range to this set if it is not already
1030     * present. If this set already contains the specified range,
1031     * the call leaves this set unchanged. If <code>start > end</code>
1032     * then an empty range is added, leaving the set unchanged.
1033     *
1034     * @param start first character, inclusive, of range to be added
1035     * to this set.
1036     * @param end last character, inclusive, of range to be added
1037     * to this set.
1038     * @stable ICU 2.0
1039     */

1040    public UnicodeSet add(int start, int end) {
1041        checkFrozen();
1042        return add_unchecked(start, end);
1043    }
1044    
1045    // for internal use, after checkFrozen has been called
1046
private UnicodeSet add_unchecked(int start, int end) {
1047        if (start < MIN_VALUE || start > MAX_VALUE) {
1048            throw new IllegalArgumentException JavaDoc("Invalid code point U+" + Utility.hex(start, 6));
1049        }
1050        if (end < MIN_VALUE || end > MAX_VALUE) {
1051            throw new IllegalArgumentException JavaDoc("Invalid code point U+" + Utility.hex(end, 6));
1052        }
1053        if (start < end) {
1054            add(range(start, end), 2, 0);
1055        } else if (start == end) {
1056            add(start);
1057        }
1058        return this;
1059    }
1060
1061// /**
1062
// * Format out the inversion list as a string, for debugging. Uncomment when
1063
// * needed.
1064
// */
1065
// public final String dump() {
1066
// StringBuffer buf = new StringBuffer("[");
1067
// for (int i=0; i<len; ++i) {
1068
// if (i != 0) buf.append(", ");
1069
// int c = list[i];
1070
// //if (c <= 0x7F && c != '\n' && c != '\r' && c != '\t' && c != ' ') {
1071
// // buf.append((char) c);
1072
// //} else {
1073
// buf.append("U+").append(Utility.hex(c, (c<0x10000)?4:6));
1074
// //}
1075
// }
1076
// buf.append("]");
1077
// return buf.toString();
1078
// }
1079

1080    /**
1081     * Adds the specified character to this set if it is not already
1082     * present. If this set already contains the specified character,
1083     * the call leaves this set unchanged.
1084     * @stable ICU 2.0
1085     */

1086    public final UnicodeSet add(int c) {
1087        checkFrozen();
1088        return add_unchecked(c);
1089    }
1090    
1091    // for internal use only, after checkFrozen has been called
1092
private final UnicodeSet add_unchecked(int c) {
1093        if (c < MIN_VALUE || c > MAX_VALUE) {
1094            throw new IllegalArgumentException JavaDoc("Invalid code point U+" + Utility.hex(c, 6));
1095        }
1096
1097        // find smallest i such that c < list[i]
1098
// if odd, then it is IN the set
1099
// if even, then it is OUT of the set
1100
int i = findCodePoint(c);
1101
1102        // already in set?
1103
if ((i & 1) != 0) return this;
1104
1105        // HIGH is 0x110000
1106
// assert(list[len-1] == HIGH);
1107

1108        // empty = [HIGH]
1109
// [start_0, limit_0, start_1, limit_1, HIGH]
1110

1111        // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
1112
// ^
1113
// list[i]
1114

1115        // i == 0 means c is before the first range
1116

1117        if (c == list[i]-1) {
1118            // c is before start of next range
1119
list[i] = c;
1120            // if we touched the HIGH mark, then add a new one
1121
if (c == MAX_VALUE) {
1122                ensureCapacity(len+1);
1123                list[len++] = HIGH;
1124            }
1125            if (i > 0 && c == list[i-1]) {
1126                // collapse adjacent ranges
1127

1128                // [..., start_k-1, c, c, limit_k, ..., HIGH]
1129
// ^
1130
// list[i]
1131
System.arraycopy(list, i+1, list, i-1, len-i-1);
1132                len -= 2;
1133            }
1134        }
1135
1136        else if (i > 0 && c == list[i-1]) {
1137            // c is after end of prior range
1138
list[i-1]++;
1139            // no need to chcek for collapse here
1140
}
1141
1142        else {
1143            // At this point we know the new char is not adjacent to
1144
// any existing ranges, and it is not 10FFFF.
1145

1146
1147            // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
1148
// ^
1149
// list[i]
1150

1151            // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
1152
// ^
1153
// list[i]
1154

1155            // Don't use ensureCapacity() to save on copying.
1156
// NOTE: This has no measurable impact on performance,
1157
// but it might help in some usage patterns.
1158
if (len+2 > list.length) {
1159                int[] temp = new int[len + 2 + GROW_EXTRA];
1160                if (i != 0) System.arraycopy(list, 0, temp, 0, i);
1161                System.arraycopy(list, i, temp, i+2, len-i);
1162                list = temp;
1163            } else {
1164                System.arraycopy(list, i, list, i+2, len-i);
1165            }
1166
1167            list[i] = c;
1168            list[i+1] = c+1;
1169            len += 2;
1170        }
1171
1172        pat = null;
1173        return this;
1174    }
1175
1176    /**
1177     * Adds the specified multicharacter to this set if it is not already
1178     * present. If this set already contains the multicharacter,
1179     * the call leaves this set unchanged.
1180     * Thus "ch" => {"ch"}
1181     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1182     * @param s the source string
1183     * @return this object, for chaining
1184     * @stable ICU 2.0
1185     */

1186    public final UnicodeSet add(String JavaDoc s) {
1187        checkFrozen();
1188        int cp = getSingleCP(s);
1189        if (cp < 0) {
1190            strings.add(s);
1191            pat = null;
1192        } else {
1193            add_unchecked(cp, cp);
1194        }
1195        return this;
1196    }
1197
1198    /**
1199     * @return a code point IF the string consists of a single one.
1200     * otherwise returns -1.
1201     * @param s the string to test
1202     */

1203    private static int getSingleCP(String JavaDoc s) {
1204        if (s.length() < 1) {
1205            throw new IllegalArgumentException JavaDoc("Can't use zero-length strings in UnicodeSet");
1206        }
1207        if (s.length() > 2) return -1;
1208        if (s.length() == 1) return s.charAt(0);
1209
1210        // at this point, len = 2
1211
int cp = UTF16.charAt(s, 0);
1212        if (cp > 0xFFFF) { // is surrogate pair
1213
return cp;
1214        }
1215        return -1;
1216    }
1217
1218    /**
1219     * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1220     * If this set already contains any particular character, it has no effect on that character.
1221     * @param s the source string
1222     * @return this object, for chaining
1223     * @stable ICU 2.0
1224     */

1225    public final UnicodeSet addAll(String JavaDoc s) {
1226        checkFrozen();
1227        int cp;
1228        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1229            cp = UTF16.charAt(s, i);
1230            add_unchecked(cp, cp);
1231        }
1232        return this;
1233    }
1234
1235    /**
1236     * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1237     * If this set already any particular character, it has no effect on that character.
1238     * @param s the source string
1239     * @return this object, for chaining
1240     * @stable ICU 2.0
1241     */

1242    public final UnicodeSet retainAll(String JavaDoc s) {
1243        return retainAll(fromAll(s));
1244    }
1245
1246    /**
1247     * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1248     * If this set already any particular character, it has no effect on that character.
1249     * @param s the source string
1250     * @return this object, for chaining
1251     * @stable ICU 2.0
1252     */

1253    public final UnicodeSet complementAll(String JavaDoc s) {
1254        return complementAll(fromAll(s));
1255    }
1256
1257    /**
1258     * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1259     * If this set already any particular character, it has no effect on that character.
1260     * @param s the source string
1261     * @return this object, for chaining
1262     * @stable ICU 2.0
1263     */

1264    public final UnicodeSet removeAll(String JavaDoc s) {
1265        return removeAll(fromAll(s));
1266    }
1267
1268    /**
1269     * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1270     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1271     * @param s the source string
1272     * @return a newly created set containing the given string
1273     * @stable ICU 2.0
1274     */

1275    public static UnicodeSet from(String JavaDoc s) {
1276        return new UnicodeSet().add(s);
1277    }
1278
1279
1280    /**
1281     * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1282     * @param s the source string
1283     * @return a newly created set containing the given characters
1284     * @stable ICU 2.0
1285     */

1286    public static UnicodeSet fromAll(String JavaDoc s) {
1287        return new UnicodeSet().addAll(s);
1288    }
1289
1290
1291    /**
1292     * Retain only the elements in this set that are contained in the
1293     * specified range. If <code>start > end</code> then an empty range is
1294     * retained, leaving the set empty.
1295     *
1296     * @param start first character, inclusive, of range to be retained
1297     * to this set.
1298     * @param end last character, inclusive, of range to be retained
1299     * to this set.
1300     * @stable ICU 2.0
1301     */

1302    public UnicodeSet retain(int start, int end) {
1303        checkFrozen();
1304        if (start < MIN_VALUE || start > MAX_VALUE) {
1305            throw new IllegalArgumentException JavaDoc("Invalid code point U+" + Utility.hex(start, 6));
1306        }
1307        if (end < MIN_VALUE || end > MAX_VALUE) {
1308            throw new IllegalArgumentException JavaDoc("Invalid code point U+" + Utility.hex(end, 6));
1309        }
1310        if (start <= end) {
1311            retain(range(start, end), 2, 0);
1312        } else {
1313            clear();
1314        }
1315        return this;
1316    }
1317
1318    /**
1319     * Retain the specified character from this set if it is present.
1320     * Upon return this set will be empty if it did not contain c, or
1321     * will only contain c if it did contain c.
1322     * @param c the character to be retained
1323     * @return this object, for chaining
1324     * @stable ICU 2.0
1325     */

1326    public final UnicodeSet retain(int c) {
1327        return retain(c, c);
1328    }
1329
1330    /**
1331     * Retain the specified string in this set if it is present.
1332     * Upon return this set will be empty if it did not contain s, or
1333     * will only contain s if it did contain s.
1334     * @param s the string to be retained
1335     * @return this object, for chaining
1336     * @stable ICU 2.0
1337     */

1338    public final UnicodeSet retain(String JavaDoc s) {
1339        int cp = getSingleCP(s);
1340        if (cp < 0) {
1341            boolean isIn = strings.contains(s);
1342            if (isIn && size() == 1) {
1343                return this;
1344            }
1345            clear();
1346            strings.add(s);
1347            pat = null;
1348        } else {
1349            retain(cp, cp);
1350        }
1351        return this;
1352    }
1353
1354    /**
1355     * Removes the specified range from this set if it is present.
1356     * The set will not contain the specified range once the call
1357     * returns. If <code>start > end</code> then an empty range is
1358     * removed, leaving the set unchanged.
1359     *
1360     * @param start first character, inclusive, of range to be removed
1361     * from this set.
1362     * @param end last character, inclusive, of range to be removed
1363     * from this set.
1364     * @stable ICU 2.0
1365     */

1366    public UnicodeSet remove(int start, int end) {
1367        checkFrozen();
1368        if (start < MIN_VALUE || start > MAX_VALUE) {
1369            throw new IllegalArgumentException JavaDoc("Invalid code point U+" + Utility.hex(start, 6));
1370        }
1371        if (end < MIN_VALUE || end > MAX_VALUE) {
1372            throw new IllegalArgumentException JavaDoc("Invalid code point U+" + Utility.hex(end, 6));
1373        }
1374        if (start <= end) {
1375            retain(range(start, end), 2, 2);
1376        }
1377        return this;
1378    }
1379
1380    /**
1381     * Removes the specified character from this set if it is present.
1382     * The set will not contain the specified character once the call
1383     * returns.
1384     * @param c the character to be removed
1385     * @return this object, for chaining
1386     * @stable ICU 2.0
1387     */

1388    public final UnicodeSet remove(int c) {
1389        return remove(c, c);
1390    }
1391
1392    /**
1393     * Removes the specified string from this set if it is present.
1394     * The set will not contain the specified string once the call
1395     * returns.
1396     * @param s the string to be removed
1397     * @return this object, for chaining
1398     * @stable ICU 2.0
1399     */

1400    public final UnicodeSet remove(String JavaDoc s) {
1401        int cp = getSingleCP(s);
1402        if (cp < 0) {
1403            strings.remove(s);
1404            pat = null;
1405        } else {
1406            remove(cp, cp);
1407        }
1408        return this;
1409    }
1410
1411    /**
1412     * Complements the specified range in this set. Any character in
1413     * the range will be removed if it is in this set, or will be
1414     * added if it is not in this set. If <code>start > end</code>
1415     * then an empty range is complemented, leaving the set unchanged.
1416     *
1417     * @param start first character, inclusive, of range to be complemented
1418     * in this set.
1419     * @param end last character, inclusive, of range to be complemented
1420     * in this set.
1421     * @stable ICU 2.0
1422     */

1423    public UnicodeSet complement(int start, int end) {
1424        checkFrozen();
1425        if (start < MIN_VALUE || start > MAX_VALUE) {
1426            throw new IllegalArgumentException JavaDoc("Invalid code point U+" + Utility.hex(start, 6));
1427        }
1428        if (end < MIN_VALUE || end > MAX_VALUE) {
1429            throw new IllegalArgumentException JavaDoc("Invalid code point U+" + Utility.hex(end, 6));
1430        }
1431        if (start <= end) {
1432            xor(range(start, end), 2, 0);
1433        }
1434        pat = null;
1435        return this;
1436    }
1437
1438    /**
1439     * Complements the specified character in this set. The character
1440     * will be removed if it is in this set, or will be added if it is
1441     * not in this set.
1442     * @stable ICU 2.0
1443     */

1444    public final UnicodeSet complement(int c) {
1445        return complement(c, c);
1446    }
1447
1448    /**
1449     * This is equivalent to
1450     * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1451     * @stable ICU 2.0
1452     */

1453    public UnicodeSet complement() {
1454        checkFrozen();
1455        if (list[0] == LOW) {
1456            System.arraycopy(list, 1, list, 0, len-1);
1457            --len;
1458        } else {
1459            ensureCapacity(len+1);
1460            System.arraycopy(list, 0, list, 1, len);
1461            list[0] = LOW;
1462            ++len;
1463        }
1464        pat = null;
1465        return this;
1466    }
1467
1468    /**
1469     * Complement the specified string in this set.
1470     * If the set contains the specified string it is removed; otherwise
1471     * the string is added.
1472     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1473     * @param s the string to complement
1474     * @return this object, for chaining
1475     * @stable ICU 2.0
1476     */

1477    public final UnicodeSet complement(String JavaDoc s) {
1478        checkFrozen();
1479        int cp = getSingleCP(s);
1480        if (cp < 0) {
1481            if (strings.contains(s)) strings.remove(s);
1482            else strings.add(s);
1483            pat = null;
1484        } else {
1485            complement(cp, cp);
1486        }
1487        return this;
1488    }
1489
1490    /**
1491     * Returns true if this set contains the given character.
1492     * @param c character to be checked for containment
1493     * @return true if the test condition is met
1494     * @stable ICU 2.0
1495     */

1496    public boolean contains(int c) {
1497        if (c < MIN_VALUE || c > MAX_VALUE) {
1498            throw new IllegalArgumentException JavaDoc("Invalid code point U+" + Utility.hex(c, 6));
1499        }
1500
1501        /*
1502        // Set i to the index of the start item greater than ch
1503        // We know we will terminate without length test!
1504        int i = -1;
1505        while (true) {
1506            if (c < list[++i]) break;
1507        }
1508        */

1509
1510        int i = findCodePoint(c);
1511
1512        return ((i & 1) != 0); // return true if odd
1513
}
1514
1515    /**
1516     * Returns the smallest value i such that c < list[i]. Caller
1517     * must ensure that c is a legal value or this method will enter
1518     * an infinite loop. This method performs a binary search.
1519     * @param c a character in the range MIN_VALUE..MAX_VALUE
1520     * inclusive
1521     * @return the smallest integer i in the range 0..len-1,
1522     * inclusive, such that c < list[i]
1523     */

1524    private final int findCodePoint(int c) {
1525        /* Examples:
1526                                           findCodePoint(c)
1527           set list[] c=0 1 3 4 7 8
1528           === ============== ===========
1529           [] [110000] 0 0 0 0 0 0
1530           [-] [0, 4, 110000] 1 1 1 2 2 2
1531           [-] [4, 8, 110000] 0 0 0 1 1 2
1532           [:all:] [0, 110000] 1 1 1 1 1 1
1533         */

1534
1535        // Return the smallest i such that c < list[i]. Assume
1536
// list[len - 1] == HIGH and that c is legal (0..HIGH-1).
1537
if (c < list[0]) return 0;
1538        // High runner test. c is often after the last range, so an
1539
// initial check for this condition pays off.
1540
if (len >= 2 && c >= list[len-2]) return len-1;
1541        int lo = 0;
1542        int hi = len - 1;
1543        // invariant: c >= list[lo]
1544
// invariant: c < list[hi]
1545
for (;;) {
1546            int i = (lo + hi) >>> 1;
1547            if (i == lo) return hi;
1548            if (c < list[i]) {
1549                hi = i;
1550            } else {
1551                lo = i;
1552            }
1553        }
1554    }
1555
1556// //----------------------------------------------------------------
1557
// // Unrolled binary search
1558
// //----------------------------------------------------------------
1559
//
1560
// private int validLen = -1; // validated value of len
1561
// private int topOfLow;
1562
// private int topOfHigh;
1563
// private int power;
1564
// private int deltaStart;
1565
//
1566
// private void validate() {
1567
// if (len <= 1) {
1568
// throw new IllegalArgumentException("list.len==" + len + "; must be >1");
1569
// }
1570
//
1571
// // find greatest power of 2 less than or equal to len
1572
// for (power = exp2.length-1; power > 0 && exp2[power] > len; power--) {}
1573
//
1574
// // assert(exp2[power] <= len);
1575
//
1576
// // determine the starting points
1577
// topOfLow = exp2[power] - 1;
1578
// topOfHigh = len - 1;
1579
// deltaStart = exp2[power-1];
1580
// validLen = len;
1581
// }
1582
//
1583
// private static final int exp2[] = {
1584
// 0x1, 0x2, 0x4, 0x8,
1585
// 0x10, 0x20, 0x40, 0x80,
1586
// 0x100, 0x200, 0x400, 0x800,
1587
// 0x1000, 0x2000, 0x4000, 0x8000,
1588
// 0x10000, 0x20000, 0x40000, 0x80000,
1589
// 0x100000, 0x200000, 0x400000, 0x800000,
1590
// 0x1000000, 0x2000000, 0x4000000, 0x8000000,
1591
// 0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java
1592
// };
1593
//
1594
// /**
1595
// * Unrolled lowest index GT.
1596
// */
1597
// private final int leastIndexGT(int searchValue) {
1598
//
1599
// if (len != validLen) {
1600
// if (len == 1) return 0;
1601
// validate();
1602
// }
1603
// int temp;
1604
//
1605
// // set up initial range to search. Each subrange is a power of two in length
1606
// int high = searchValue < list[topOfLow] ? topOfLow : topOfHigh;
1607
//
1608
// // Completely unrolled binary search, folhighing "Programming Pearls"
1609
// // Each case deliberately falls through to the next
1610
// // Logically, list[-1] < all_search_values && list[count] > all_search_values
1611
// // although the values -1 and count are never actually touched.
1612
//
1613
// // The bounds at each point are low & high,
1614
// // where low == high - delta*2
1615
// // so high - delta is the midpoint
1616
//
1617
// // The invariant AFTER each line is that list[low] < searchValue <= list[high]
1618
//
1619
// switch (power) {
1620
// //case 31: if (searchValue < list[temp = high-0x40000000]) high = temp; // no unsigned int in Java
1621
// case 30: if (searchValue < list[temp = high-0x20000000]) high = temp;
1622
// case 29: if (searchValue < list[temp = high-0x10000000]) high = temp;
1623
//
1624
// case 28: if (searchValue < list[temp = high- 0x8000000]) high = temp;
1625
// case 27: if (searchValue < list[temp = high- 0x4000000]) high = temp;
1626
// case 26: if (searchValue < list[temp = high- 0x2000000]) high = temp;
1627
// case 25: if (searchValue < list[temp = high- 0x1000000]) high = temp;
1628
//
1629
// case 24: if (searchValue < list[temp = high- 0x800000]) high = temp;
1630
// case 23: if (searchValue < list[temp = high- 0x400000]) high = temp;
1631
// case 22: if (searchValue < list[temp = high- 0x200000]) high = temp;
1632
// case 21: if (searchValue < list[temp = high- 0x100000]) high = temp;
1633
//
1634
// case 20: if (searchValue < list[temp = high- 0x80000]) high = temp;
1635
// case 19: if (searchValue < list[temp = high- 0x40000]) high = temp;
1636
// case 18: if (searchValue < list[temp = high- 0x20000]) high = temp;
1637
// case 17: if (searchValue < list[temp = high- 0x10000]) high = temp;
1638
//
1639
// case 16: if (searchValue < list[temp = high- 0x8000]) high = temp;
1640
// case 15: if (searchValue < list[temp = high- 0x4000]) high = temp;
1641
// case 14: if (searchValue < list[temp = high- 0x2000]) high = temp;
1642
// case 13: if (searchValue < list[temp = high- 0x1000]) high = temp;
1643
//
1644
// case 12: if (searchValue < list[temp = high- 0x800]) high = temp;
1645
// case 11: if (searchValue < list[temp = high- 0x400]) high = temp;
1646
// case 10: if (searchValue < list[temp = high- 0x200]) high = temp;
1647
// case 9: if (searchValue < list[temp = high- 0x100]) high = temp;
1648
//
1649
// case 8: if (searchValue < list[temp = high- 0x80]) high = temp;
1650
// case 7: if (searchValue < list[temp = high- 0x40]) high = temp;
1651
// case 6: if (searchValue < list[temp = high- 0x20]) high = temp;
1652
// case 5: if (searchValue < list[temp = high- 0x10]) high = temp;
1653
//
1654
// case 4: if (searchValue < list[temp = high- 0x8]) high = temp;
1655
// case 3: if (searchValue < list[temp = high- 0x4]) high = temp;
1656
// case 2: if (searchValue < list[temp = high- 0x2]) high = temp;
1657
// case 1: if (searchValue < list[temp = high- 0x1]) high = temp;
1658
// }
1659
//
1660
// return high;
1661
// }
1662
//
1663
// // For debugging only
1664
// public int len() {
1665
// return len;
1666
// }
1667
//
1668
// //----------------------------------------------------------------
1669
// //----------------------------------------------------------------
1670

1671    /**
1672     * Returns true if this set contains every character
1673     * of the given range.
1674     * @param start first character, inclusive, of the range
1675     * @param end last character, inclusive, of the range
1676     * @return true if the test condition is met
1677     * @stable ICU 2.0
1678     */

1679    public boolean contains(int start, int end) {
1680        if (start < MIN_VALUE || start > MAX_VALUE) {
1681            throw new IllegalArgumentException JavaDoc("Invalid code point U+" + Utility.hex(start, 6));
1682        }
1683        if (end < MIN_VALUE || end > MAX_VALUE) {
1684            throw new IllegalArgumentException JavaDoc("Invalid code point U+" + Utility.hex(end, 6));
1685        }
1686        //int i = -1;
1687
//while (true) {
1688
// if (start < list[++i]) break;
1689
//}
1690
int i = findCodePoint(start);
1691        return ((i & 1) != 0 && end < list[i]);
1692    }
1693
1694    /**
1695     * Returns <tt>true</tt> if this set contains the given
1696     * multicharacter string.
1697     * @param s string to be checked for containment
1698     * @return <tt>true</tt> if this set contains the specified string
1699     * @stable ICU 2.0
1700     */

1701    public final boolean contains(String JavaDoc s) {
1702
1703        int cp = getSingleCP(s);
1704        if (cp < 0) {
1705            return strings.contains(s);
1706        } else {
1707            return contains(cp);
1708        }
1709    }
1710
1711    /**
1712     * Returns true if this set contains all the characters and strings
1713     * of the given set.
1714     * @param c set to be checked for containment
1715     * @return true if the test condition is met
1716     * @stable ICU 2.0
1717     */

1718    public boolean containsAll(UnicodeSet c) {
1719        // The specified set is a subset if all of its pairs are contained in
1720
// this set. It's possible to code this more efficiently in terms of
1721
// direct manipulation of the inversion lists if the need arises.
1722
int n = c.getRangeCount();
1723        for (int i=0; i<n; ++i) {
1724            if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
1725                return false;
1726            }
1727        }
1728        if (!strings.containsAll(c.strings)) return false;
1729        return true;
1730    }
1731
1732    /**
1733     * Returns true if there is a partition of the string such that this set contains each of the partitioned strings.
1734     * For example, for the Unicode set [a{bc}{cd}]<br>
1735     * containsAll is true for each of: "a", "bc", ""cdbca"<br>
1736     * containsAll is false for each of: "acb", "bcda", "bcx"<br>
1737     * @param s string containing characters to be checked for containment
1738     * @return true if the test condition is met
1739     * @stable ICU 2.0
1740     */

1741     public boolean containsAll(String JavaDoc s) {
1742        int cp;
1743        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1744            cp = UTF16.charAt(s, i);
1745            if (!contains(cp)) {
1746                if (strings.size() == 0) {
1747                    return false;
1748                }
1749                return containsAll(s, 0);
1750            }
1751        }
1752        return true;
1753    }
1754
1755    /**
1756     * Recursive routine called if we fail to find a match in containsAll, and there are strings
1757     * @param s source string
1758     * @param i point to match to the end on
1759     * @return true if ok
1760     */

1761    private boolean containsAll(String JavaDoc s, int i) {
1762        if (i >= s.length()) {
1763            return true;
1764        }
1765        int cp= UTF16.charAt(s, i);
1766        if (contains(cp) && containsAll(s, i+UTF16.getCharCount(cp))) {
1767            return true;
1768        }
1769        
1770        Iterator it = strings.iterator();
1771        while (it.hasNext()) {
1772            String JavaDoc setStr = (String JavaDoc)it.next();
1773            if (s.startsWith(setStr, i) && containsAll(s, i+setStr.length())) {
1774                return true;
1775            }
1776        }
1777        return false;
1778        
1779    }
1780
1781    /**
1782     * @return regex pattern equivalent to this UnicodeSet
1783     * @internal
1784     * @deprecated This API is ICU internal only.
1785     */

1786    public String JavaDoc getRegexEquivalent() {
1787        if (strings.size() == 0) return toString();
1788        StringBuffer JavaDoc result = new StringBuffer JavaDoc("(?:");
1789        _generatePattern(result, true, false);
1790        Iterator it = strings.iterator();
1791        while (it.hasNext()) {
1792            result.append('|');
1793            _appendToPat(result, (String JavaDoc) it.next(), true);
1794        }
1795        return result.append(")").toString();
1796    }
1797
1798    /**
1799     * Returns true if this set contains none of the characters
1800     * of the given range.
1801     * @param start first character, inclusive, of the range
1802     * @param end last character, inclusive, of the range
1803     * @return true if the test condition is met
1804     * @stable ICU 2.0
1805     */

1806    public boolean containsNone(int start, int end) {
1807        if (start < MIN_VALUE || start > MAX_VALUE) {
1808            throw new IllegalArgumentException JavaDoc("Invalid code point U+" + Utility.hex(start, 6));
1809        }
1810        if (end < MIN_VALUE || end > MAX_VALUE) {
1811            throw new IllegalArgumentException JavaDoc("Invalid code point U+" + Utility.hex(end, 6));
1812        }
1813        int i = -1;
1814        while (true) {
1815            if (start < list[++i]) break;
1816        }
1817        return ((i & 1) == 0 && end < list[i]);
1818    }
1819
1820    /**
1821     * Returns true if none of the characters or strings in this UnicodeSet appears in the string.
1822     * For example, for the Unicode set [a{bc}{cd}]<br>
1823     * containsNone is true for: "xy", "cb"<br>
1824     * containsNone is false for: "a", "bc", "bcd"<br>
1825     * @param c set to be checked for containment
1826     * @return true if the test condition is met
1827     * @stable ICU 2.0
1828     */

1829    public boolean containsNone(UnicodeSet c) {
1830        // The specified set is a subset if all of its pairs are contained in
1831
// this set. It's possible to code this more efficiently in terms of
1832
// direct manipulation of the inversion lists if the need arises.
1833
int n = c.getRangeCount();
1834        for (int i=0; i<n; ++i) {
1835            if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
1836                return false;
1837            }
1838        }
1839        if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, c.strings)) return false;
1840        return true;
1841    }
1842
1843    /**
1844     * Returns true if this set contains none of the characters
1845     * of the given string.
1846     * @param s string containing characters to be checked for containment
1847     * @return true if the test condition is met
1848     * @stable ICU 2.0
1849     */

1850    public boolean containsNone(String JavaDoc s) {
1851        int cp;
1852        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1853            cp = UTF16.charAt(s, i);
1854            if (contains(cp)) return false;
1855        }
1856        if (strings.size() == 0) return true;
1857        // do a last check to make sure no strings are in.
1858
for (Iterator it = strings.iterator(); it.hasNext();) {
1859            String JavaDoc item = (String JavaDoc)it.next();
1860            if (s.indexOf(item) >= 0) return false;
1861        }
1862        return true;
1863    }
1864
1865    /**
1866     * Returns true if this set contains one or more of the characters
1867     * in the given range.
1868     * @param start first character, inclusive, of the range
1869     * @param end last character, inclusive, of the range
1870     * @return true if the condition is met
1871     * @stable ICU 2.0
1872     */

1873    public final boolean containsSome(int start, int end) {
1874        return !containsNone(start, end);
1875    }
1876
1877    /**
1878     * Returns true if this set contains one or more of the characters
1879     * and strings of the given set.
1880     * @param s set to be checked for containment
1881     * @return true if the condition is met
1882     * @stable ICU 2.0
1883     */

1884    public final boolean containsSome(UnicodeSet s) {
1885        return !containsNone(s);
1886    }
1887
1888    /**
1889     * Returns true if this set contains one or more of the characters
1890     * of the given string.
1891     * @param s string containing characters to be checked for containment
1892     * @return true if the condition is met
1893     * @stable ICU 2.0
1894     */

1895    public final boolean containsSome(String JavaDoc s) {
1896        return !containsNone(s);
1897    }
1898
1899
1900    /**
1901     * Adds all of the elements in the specified set to this set if
1902     * they're not already present. This operation effectively
1903     * modifies this set so that its value is the <i>union</i> of the two
1904     * sets. The behavior of this operation is unspecified if the specified
1905     * collection is modified while the operation is in progress.
1906     *
1907     * @param c set whose elements are to be added to this set.
1908     * @stable ICU 2.0
1909     */

1910    public UnicodeSet addAll(UnicodeSet c) {
1911        checkFrozen();
1912        add(c.list, c.len, 0);
1913        strings.addAll(c.strings);
1914        return this;
1915    }
1916
1917    /**
1918     * Retains only the elements in this set that are contained in the
1919     * specified set. In other words, removes from this set all of
1920     * its elements that are not contained in the specified set. This
1921     * operation effectively modifies this set so that its value is
1922     * the <i>intersection</i> of the two sets.
1923     *
1924     * @param c set that defines which elements this set will retain.
1925     * @stable ICU 2.0
1926     */

1927    public UnicodeSet retainAll(UnicodeSet c) {
1928        checkFrozen();
1929        retain(c.list, c.len, 0);
1930        strings.retainAll(c.strings);
1931        return this;
1932    }
1933
1934    /**
1935     * Removes from this set all of its elements that are contained in the
1936     * specified set. This operation effectively modifies this
1937     * set so that its value is the <i>asymmetric set difference</i> of
1938     * the two sets.
1939     *
1940     * @param c set that defines which elements will be removed from
1941     * this set.
1942     * @stable ICU 2.0
1943     */

1944    public UnicodeSet removeAll(UnicodeSet c) {
1945        checkFrozen();
1946        retain(c.list, c.len, 2);
1947        strings.removeAll(c.strings);
1948        return this;
1949    }
1950
1951    /**
1952     * Complements in this set all elements contained in the specified
1953     * set. Any character in the other set will be removed if it is
1954     * in this set, or will be added if it is not in this set.
1955     *
1956     * @param c set that defines which elements will be complemented from
1957     * this set.
1958     * @stable ICU 2.0
1959     */

1960    public UnicodeSet complementAll(UnicodeSet c) {
1961        checkFrozen();
1962        xor(c.list, c.len, 0);
1963        SortedSetRelation.doOperation(strings, SortedSetRelation.COMPLEMENTALL, c.strings);
1964        return this;
1965    }
1966
1967    /**
1968     * Removes all of the elements from this set. This set will be
1969     * empty after this call returns.
1970     * @stable ICU 2.0
1971     */

1972    public UnicodeSet clear() {
1973        checkFrozen();
1974        list[0] = HIGH;
1975        len = 1;
1976        pat = null;
1977        strings.clear();
1978        return this;
1979    }
1980
1981    /**
1982     * Iteration method that returns the number of ranges contained in
1983     * this set.
1984     * @see #getRangeStart
1985     * @see #getRangeEnd
1986     * @stable ICU 2.0
1987     */

1988    public int getRangeCount() {
1989        return len/2;
1990    }
1991
1992    /**
1993     * Iteration method that returns the first character in the
1994     * specified range of this set.
1995     * @exception ArrayIndexOutOfBoundsException if index is outside
1996     * the range <code>0..getRangeCount()-1</code>
1997     * @see #getRangeCount
1998     * @see #getRangeEnd
1999     * @stable ICU 2.0
2000     */

2001    public int getRangeStart(int index) {
2002        return list[index*2];
2003    }
2004
2005    /**
2006     * Iteration method that returns the last character in the
2007     * specified range of this set.
2008     * @exception ArrayIndexOutOfBoundsException if index is outside
2009     * the range <code>0..getRangeCount()-1</code>
2010     * @see #getRangeStart
2011     * @see #getRangeEnd
2012     * @stable ICU 2.0
2013     */

2014    public int getRangeEnd(int index) {
2015        return (list[index*2 + 1] - 1);
2016    }
2017
2018    /**
2019     * Reallocate this objects internal structures to take up the least
2020     * possible space, without changing this object's value.
2021     * @stable ICU 2.0
2022     */

2023    public UnicodeSet compact() {
2024        checkFrozen();
2025        if (len != list.length) {
2026            int[] temp = new int[len];
2027            System.arraycopy(list, 0, temp, 0, len);
2028            list = temp;
2029        }
2030        rangeList = null;
2031        buffer = null;
2032        return this;
2033    }
2034
2035    /**
2036     * Compares the specified object with this set for equality. Returns
2037     * <tt>true</tt> if the specified object is also a set, the two sets
2038     * have the same size, and every member of the specified set is
2039     * contained in this set (or equivalently, every member of this set is
2040     * contained in the specified set).
2041     *
2042     * @param o Object to be compared for equality with this set.
2043     * @return <tt>true</tt> if the specified Object is equal to this set.
2044     * @stable ICU 2.0
2045     */

2046    public boolean equals(Object JavaDoc o) {
2047        try {
2048            UnicodeSet that = (UnicodeSet) o;
2049            if (len != that.len) return false;
2050            for (int i = 0; i < len; ++i) {
2051                if (list[i] != that.list[i]) return false;
2052            }
2053            if (!strings.equals(that.strings)) return false;
2054        } catch (Exception JavaDoc e) {
2055            return false;
2056        }
2057        return true;
2058    }
2059
2060    /**
2061     * Returns the hash code value for this set.
2062     *
2063     * @return the hash code value for this set.
2064     * @see java.lang.Object#hashCode()
2065     * @stable ICU 2.0
2066     */

2067    public int hashCode() {
2068        int result = len;
2069        for (int i = 0; i < len; ++i) {
2070            result *= 1000003;
2071            result += list[i];
2072        }
2073        return result;
2074    }
2075
2076    /**
2077     * Return a programmer-readable string representation of this object.
2078     * @stable ICU 2.0
2079     */

2080    public String JavaDoc toString() {
2081        return toPattern(true);
2082    }
2083
2084    //----------------------------------------------------------------
2085
// Implementation: Pattern parsing
2086
//----------------------------------------------------------------
2087

2088    /**
2089     * Parses the given pattern, starting at the given position. The character
2090     * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails.
2091     * Parsing continues until the corresponding closing ']'. If a syntax error
2092     * is encountered between the opening and closing brace, the parse fails.
2093     * Upon return from a successful parse, the ParsePosition is updated to
2094     * point to the character following the closing ']', and an inversion
2095     * list for the parsed pattern is returned. This method
2096     * calls itself recursively to parse embedded subpatterns.
2097     *
2098     * @param pattern the string containing the pattern to be parsed. The
2099     * portion of the string from pos.getIndex(), which must be a '[', to the
2100     * corresponding closing ']', is parsed.
2101     * @param pos upon entry, the position at which to being parsing. The
2102     * character at pattern.charAt(pos.getIndex()) must be a '['. Upon return
2103     * from a successful parse, pos.getIndex() is either the character after the
2104     * closing ']' of the parsed pattern, or pattern.length() if the closing ']'
2105     * is the last character of the pattern string.
2106     * @return an inversion list for the parsed substring
2107     * of <code>pattern</code>
2108     * @exception java.lang.IllegalArgumentException if the parse fails.
2109     */

2110    UnicodeSet applyPattern(String JavaDoc pattern,
2111                      ParsePosition pos,
2112                      SymbolTable symbols,
2113                      int options) {
2114
2115        // Need to build the pattern in a temporary string because
2116
// _applyPattern calls add() etc., which set pat to empty.
2117
boolean parsePositionWasNull = pos == null;
2118        if (parsePositionWasNull) {
2119            pos = new ParsePosition(0);
2120        }
2121
2122        StringBuffer JavaDoc rebuiltPat = new StringBuffer JavaDoc();
2123        RuleCharacterIterator chars =
2124            new RuleCharacterIterator(pattern, symbols, pos);
2125        applyPattern(chars, symbols, rebuiltPat, options);
2126        if (chars.inVariable()) {
2127            syntaxError(chars, "Extra chars in variable value");
2128        }
2129        pat = rebuiltPat.toString();
2130        if (parsePositionWasNull) {
2131            int i = pos.getIndex();
2132
2133            // Skip over trailing whitespace
2134
if ((options & IGNORE_SPACE) != 0) {
2135                i = Utility.skipWhitespace(pattern, i);
2136            }
2137
2138            if (i != pattern.length()) {
2139                throw new IllegalArgumentException JavaDoc("Parse of \"" + pattern +
2140                                                   "\" failed at " + i);
2141            }
2142        }
2143        return this;
2144    }
2145
2146    /**
2147     * Parse the pattern from the given RuleCharacterIterator. The
2148     * iterator is advanced over the parsed pattern.
2149     * @param chars iterator over the pattern characters. Upon return
2150     * it will be advanced to the first character after the parsed
2151     * pattern, or the end of the iteration if all characters are
2152     * parsed.
2153     * @param symbols symbol table to use to parse and dereference
2154     * variables, or null if none.
2155     * @param rebuiltPat the pattern that was parsed, rebuilt or
2156     * copied from the input pattern, as appropriate.
2157     * @param options a bit mask of zero or more of the following:
2158     * IGNORE_SPACE, CASE.
2159     */

2160    void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
2161                      StringBuffer JavaDoc rebuiltPat, int options) {
2162
2163        // Syntax characters: [ ] ^ - & { }
2164

2165        // Recognized special forms for chars, sets: c-c s-s s&s
2166

2167        int opts = RuleCharacterIterator.PARSE_VARIABLES |
2168                   RuleCharacterIterator.PARSE_ESCAPES;
2169        if ((options & IGNORE_SPACE) != 0) {
2170            opts |= RuleCharacterIterator.SKIP_WHITESPACE;
2171        }
2172
2173        StringBuffer JavaDoc pat = new StringBuffer JavaDoc(), buf = null;
2174        boolean usePat = false;
2175        UnicodeSet scratch = null;
2176        Object JavaDoc backup = null;
2177
2178        // mode: 0=before [, 1=between [...], 2=after ]
2179
// lastItem: 0=none, 1=char, 2=set
2180
int lastItem = 0, lastChar = 0, mode = 0;
2181        char op = 0;
2182
2183        boolean invert = false;
2184
2185        clear();
2186
2187        while (mode != 2 && !chars.atEnd()) {
2188            if (false) {
2189                // Debugging assertion
2190
if (!((lastItem == 0 && op == 0) ||
2191                      (lastItem == 1 && (op == 0 || op == '-')) ||
2192                      (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) {
2193                    throw new IllegalArgumentException JavaDoc();
2194                }
2195            }
2196
2197            int c = 0;
2198            boolean literal = false;
2199            UnicodeSet nested = null;
2200
2201            // -------- Check for property pattern
2202

2203            // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
2204
int setMode = 0;
2205            if (resemblesPropertyPattern(chars, opts)) {
2206                setMode = 2;
2207            }
2208
2209            // -------- Parse '[' of opening delimiter OR nested set.
2210
// If there is a nested set, use `setMode' to define how
2211
// the set should be parsed. If the '[' is part of the
2212
// opening delimiter for this pattern, parse special
2213
// strings "[", "[^", "[-", and "[^-". Check for stand-in
2214
// characters representing a nested set in the symbol
2215
// table.
2216

2217            else {
2218                // Prepare to backup if necessary
2219
backup = chars.getPos(backup);
2220                c = chars.next(opts);
2221                literal = chars.isEscaped();
2222
2223                if (c == '[' && !literal) {
2224                    if (mode == 1) {
2225                        chars.setPos(backup); // backup
2226
setMode = 1;
2227                    } else {
2228                        // Handle opening '[' delimiter
2229
mode = 1;
2230                        pat.append('[');
2231                        backup = chars.getPos(backup); // prepare to backup
2232
c = chars.next(opts);
2233                        literal = chars.isEscaped();
2234                        if (c == '^' && !literal) {
2235                            invert = true;
2236                            pat.append('^');
2237                            backup = chars.getPos(backup); // prepare to backup
2238
c = chars.next(opts);
2239                            literal = chars.isEscaped();
2240                        }
2241                        // Fall through to handle special leading '-';
2242
// otherwise restart loop for nested [], \p{}, etc.
2243
if (c == '-') {
2244                            literal = true;
2245                            // Fall through to handle literal '-' below
2246
} else {
2247                            chars.setPos(backup); // backup
2248
continue;
2249                        }
2250                    }
2251                } else if (symbols != null) {
2252                     UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
2253
if (m != null) {
2254                         try {
2255                             nested = (UnicodeSet) m;
2256                             setMode = 3;
2257                         } catch (ClassCastException JavaDoc e) {
2258                             syntaxError(chars, "Syntax error");
2259                         }
2260                     }
2261                }
2262            }
2263
2264            // -------- Handle a nested set. This either is inline in
2265
// the pattern or represented by a stand-in that has
2266
// previously been parsed and was looked up in the symbol
2267
// table.
2268

2269            if (setMode != 0) {
2270                if (lastItem == 1) {
2271                    if (op != 0) {
2272                        syntaxError(chars, "Char expected after operator");
2273                    }
2274                    add_unchecked(lastChar, lastChar);
2275                    _appendToPat(pat, lastChar, false);
2276                    lastItem = op = 0;
2277                }
2278
2279                if (op == '-' || op == '&') {
2280                    pat.append(op);
2281                }
2282
2283                if (nested == null) {
2284                    if (scratch == null) scratch = new UnicodeSet();
2285                    nested = scratch;
2286                }
2287                switch (setMode) {
2288                case 1:
2289                    nested.applyPattern(chars, symbols, pat, options);
2290                    break;
2291                case 2:
2292                    chars.skipIgnored(opts);
2293                    nested.applyPropertyPattern(chars, pat, symbols);
2294                    break;
2295                case 3: // `nested' already parsed
2296
nested._toPattern(pat, false);
2297                    break;
2298                }
2299
2300                usePat = true;
2301
2302                if (mode == 0) {
2303                    // Entire pattern is a category; leave parse loop
2304
set(nested);
2305                    mode = 2;
2306                    break;
2307                }
2308
2309                switch (op) {
2310                case '-':
2311                    removeAll(nested);
2312                    break;
2313                case '&':
2314                    retainAll(nested);
2315                    break;
2316                case 0:
2317                    addAll(nested);
2318                    break;
2319                }
2320
2321                op = 0;
2322                lastItem = 2;
2323
2324                continue;
2325            }
2326
2327            if (mode == 0) {
2328                syntaxError(chars, "Missing '['");
2329            }
2330
2331            // -------- Parse special (syntax) characters. If the
2332
// current character is not special, or if it is escaped,
2333
// then fall through and handle it below.
2334

2335            if (!literal) {
2336                switch (c) {
2337                case ']':
2338                    if (lastItem == 1) {
2339                        add_unchecked(lastChar, lastChar);
2340                        _appendToPat(pat, lastChar, false);
2341                    }
2342                    // Treat final trailing '-' as a literal
2343
if (op == '-') {
2344                        add_unchecked(op, op);
2345                        pat.append(op);
2346                    } else if (op == '&') {
2347                        syntaxError(chars, "Trailing '&'");
2348                    }
2349                    pat.append(']');
2350                    mode = 2;
2351                    continue;
2352                case '-':
2353                    if (op == 0) {
2354                        if (lastItem != 0) {
2355                            op = (char) c;
2356                            continue;
2357                        } else {
2358                            // Treat final trailing '-' as a literal
2359
add_unchecked(c, c);
2360                            c = chars.next(opts);
2361                            literal = chars.isEscaped();
2362                            if (c == ']' && !literal) {
2363                                pat.append("-]");
2364                                mode = 2;
2365                                continue;
2366                            }
2367                        }
2368                    }
2369                    syntaxError(chars, "'-' not after char or set");
2370                case '&':
2371                    if (lastItem == 2 && op == 0) {
2372                        op = (char) c;
2373                        continue;
2374                    }
2375                    syntaxError(chars, "'&' not after set");
2376                case '^':
2377                    syntaxError(chars, "'^' not after '['");
2378                case '{':
2379                    if (op != 0) {
2380                        syntaxError(chars, "Missing operand after operator");
2381                    }
2382                    if (lastItem == 1) {
2383                        add_unchecked(lastChar, lastChar);
2384                        _appendToPat(pat, lastChar, false);
2385                    }
2386                    lastItem = 0;
2387                    if (buf == null) {
2388                        buf = new StringBuffer JavaDoc();
2389                    } else {
2390                        buf.setLength(0);
2391                    }
2392                    boolean ok = false;
2393                    while (!chars.atEnd()) {
2394                        c = chars.next(opts);
2395                        literal = chars.isEscaped();
2396                        if (c == '}' && !literal) {
2397                            ok = true;
2398                            break;
2399                        }
2400                        UTF16.append(buf, c);
2401                    }
2402                    if (buf.length() < 1 || !ok) {
2403                        syntaxError(chars, "Invalid multicharacter string");
2404                    }
2405                    // We have new string. Add it to set and continue;
2406
// we don't need to drop through to the further
2407
// processing
2408
add(buf.toString());
2409                    pat.append('{');
2410                    _appendToPat(pat, buf.toString(), false);
2411                    pat.append('}');
2412                    continue;
2413                case SymbolTable.SYMBOL_REF:
2414                    // symbols nosymbols
2415
// [a-$] error error (ambiguous)
2416
// [a$] anchor anchor
2417
// [a-$x] var "x"* literal '$'
2418
// [a-$.] error literal '$'
2419
// *We won't get here in the case of var "x"
2420
backup = chars.getPos(backup);
2421                    c = chars.next(opts);
2422                    literal = chars.isEscaped();
2423                    boolean anchor = (c == ']' && !literal);
2424                    if (symbols == null && !anchor) {
2425                        c = SymbolTable.SYMBOL_REF;
2426                        chars.setPos(backup);
2427                        break; // literal '$'
2428
}
2429                    if (anchor && op == 0) {
2430                        if (lastItem == 1) {
2431                            add_unchecked(lastChar, lastChar);
2432                            _appendToPat(pat, lastChar, false);
2433                        }
2434                        add_unchecked(UnicodeMatcher.ETHER);
2435                        usePat = true;
2436                        pat.append(SymbolTable.SYMBOL_REF).append(']');
2437                        mode = 2;
2438                        continue;
2439                    }
2440                    syntaxError(chars, "Unquoted '$'");
2441                default:
2442                    break;
2443                }
2444            }
2445
2446            // -------- Parse literal characters. This includes both
2447
// escaped chars ("\u4E01") and non-syntax characters
2448
// ("a").
2449

2450            switch (lastItem) {
2451            case 0:
2452                lastItem = 1;
2453                lastChar = c;
2454                break;
2455            case 1:
2456                if (op == '-') {
2457                    if (lastChar >= c) {
2458                        // Don't allow redundant (a-a) or empty (b-a) ranges;
2459
// these are most likely typos.
2460
syntaxError(chars, "Invalid range");
2461                    }
2462                    add_unchecked(lastChar, c);
2463                    _appendToPat(pat, lastChar, false);
2464                    pat.append(op);
2465                    _appendToPat(pat, c, false);
2466                    lastItem = op = 0;
2467                } else {
2468                    add_unchecked(lastChar, lastChar);
2469                    _appendToPat(pat, lastChar, false);
2470                    lastChar = c;
2471                }
2472                break;
2473            case 2:
2474                if (op != 0) {
2475                    syntaxError(chars, "Set expected after operator");
2476                }
2477                lastChar = c;
2478                lastItem = 1;
2479                break;
2480            }
2481        }
2482
2483        if (mode != 2) {
2484            syntaxError(chars, "Missing ']'");
2485        }
2486
2487        chars.skipIgnored(opts);
2488
2489        /**
2490         * Handle global flags (invert, case insensitivity). If this
2491         * pattern should be compiled case-insensitive, then we need
2492         * to close over case BEFORE COMPLEMENTING. This makes
2493         * patterns like /[^abc]/i work.
2494         */

2495        if ((options & CASE) != 0) {
2496            closeOver(CASE);
2497        }
2498        if (invert) {
2499            complement();
2500        }
2501
2502        // Use the rebuilt pattern (pat) only if necessary. Prefer the
2503
// generated pattern.
2504
if (usePat) {
2505            rebuiltPat.append(pat.toString());
2506        } else {
2507            _generatePattern(rebuiltPat, false, true);
2508        }
2509    }
2510
2511    private static void syntaxError(RuleCharacterIterator chars, String JavaDoc msg) {
2512        throw new IllegalArgumentException JavaDoc("Error: " + msg + " at \"" +
2513                                           Utility.escape(chars.toString()) +
2514                                           '"');
2515    }
2516
2517    /**
2518     * Add the contents of the UnicodeSet (as strings) into a collection.
2519     * @param target collection to add into
2520     * @stable ICU 2.8
2521     */

2522    public void addAllTo(Collection JavaDoc target) {
2523        UnicodeSetIterator it = new UnicodeSetIterator(this);
2524        while (it.next()) {
2525            target.add(it.getString());
2526        }
2527    }
2528
2529    /**
2530     * Add the contents of the collection (as strings) into this UnicodeSet.
2531     * @param source the collection to add
2532     * @stable ICU 2.8
2533     */

2534    public void addAll(Collection JavaDoc source) {
2535        checkFrozen();
2536        Iterator it = source.iterator();
2537        while (it.hasNext()) {
2538            add(it.next().toString());
2539        }
2540    }
2541
2542    //----------------------------------------------------------------
2543
// Implementation: Utility methods
2544
//----------------------------------------------------------------
2545

2546    private void ensureCapacity(int newLen) {
2547        if (newLen <= list.length) return;
2548        int[] temp = new int[newLen + GROW_EXTRA];
2549        System.arraycopy(list, 0, temp, 0, len);
2550        list = temp;
2551    }
2552
2553    private void ensureBufferCapacity(int newLen) {
2554        if (buffer != null && newLen <= buffer.length) return;
2555        buffer = new int[newLen + GROW_EXTRA];
2556    }
2557
2558    /**
2559     * Assumes start <= end.
2560     */

2561    private int[] range(int start, int end) {
2562        if (rangeList == null) {
2563            rangeList = new int[] { start, end+1, HIGH };
2564        } else {
2565            rangeList[0] = start;
2566            rangeList[1] = end+1;
2567        }
2568        return rangeList;
2569    }
2570
2571    //----------------------------------------------------------------
2572
// Implementation: Fundamental operations
2573
//----------------------------------------------------------------
2574

2575    // polarity = 0, 3 is normal: x xor y
2576
// polarity = 1, 2: x xor ~y == x === y
2577

2578    private UnicodeSet xor(int[] other, int otherLen, int polarity) {
2579        ensureBufferCapacity(len + otherLen);
2580        int i = 0, j = 0, k = 0;
2581        int a = list[i++];
2582        int b;
2583        if (polarity == 1 || polarity == 2) {
2584            b = LOW;
2585            if (other[j] == LOW) { // skip base if already LOW
2586
++j;
2587                b = other[j];
2588            }
2589        } else {
2590            b = other[j++];
2591        }
2592        // simplest of all the routines
2593
// sort the values, discarding identicals!
2594
while (true) {
2595            if (a < b) {
2596                buffer[k++] = a;
2597                a = list[i++];
2598            } else if (b < a) {
2599                buffer[k++] = b;
2600                b = other[j++];
2601            } else if (a != HIGH) { // at this point, a == b
2602
// discard both values!
2603
a = list[i++];
2604                b = other[j++];
2605            } else { // DONE!
2606
buffer[k++] = HIGH;
2607                len = k;
2608                break;
2609            }
2610        }
2611        // swap list and buffer
2612
int[] temp = list;
2613        list = buffer;
2614        buffer = temp;
2615        pat = null;
2616        return this;
2617    }
2618
2619    // polarity = 0 is normal: x union y
2620
// polarity = 2: x union ~y
2621
// polarity = 1: ~x union y
2622
// polarity = 3: ~x union ~y
2623

2624    private UnicodeSet add(int[] other, int otherLen, int polarity) {
2625        ensureBufferCapacity(len + otherLen);
2626        int i = 0, j = 0, k = 0;
2627        int a = list[i++];
2628        int b = other[j++];
2629        // change from xor is that we have to check overlapping pairs
2630
// polarity bit 1 means a is second, bit 2 means b is.
2631
main:
2632        while (true) {
2633            switch (polarity) {
2634              case 0: // both first; take lower if unequal
2635
if (a < b) { // take a
2636
// Back up over overlapping ranges in buffer[]
2637
if (k > 0 && a <= buffer[k-1]) {
2638                        // Pick latter end value in buffer[] vs. list[]
2639
a = max(list[i], buffer[--k]);
2640                    } else {
2641                        // No overlap
2642
buffer[k++] = a;
2643                        a = list[i];
2644                    }
2645                    i++; // Common if/else code factored out
2646
polarity ^= 1;
2647                } else if (b < a) { // take b
2648
if (k > 0 && b <= buffer[k-1]) {
2649                        b = max(other[j], buffer[--k]);
2650                    } else {
2651                        buffer[k++] = b;
2652                        b = other[j];
2653                    }
2654                    j++;
2655                    polarity ^= 2;
2656                } else { // a == b, take a, drop b
2657
if (a == HIGH) break main;
2658                    // This is symmetrical; it doesn't matter if
2659
// we backtrack with a or b. - liu
2660
if (k > 0 && a <= buffer[k-1]) {
2661                        a = max(list[i], buffer[--k]);
2662                    } else {
2663                        // No overlap
2664
buffer[k++] = a;
2665                        a = list[i];
2666                    }
2667                    i++;
2668                    polarity ^= 1;
2669                    b = other[j++]; polarity ^= 2;
2670                }
2671                break;
2672              case 3: // both second; take higher if unequal, and drop other
2673
if (b <= a) { // take a
2674
if (a == HIGH) break main;
2675                    buffer[k++] = a;
2676                } else { // take b
2677
if (b == HIGH) break main;
2678                    buffer[k++] = b;
2679                }
2680                a = list[i++]; polarity ^= 1; // factored common code
2681
b = other[j++]; polarity ^= 2;
2682                break;
2683              case 1: // a second, b first; if b < a, overlap
2684
if (a < b) { // no overlap, take a
2685
buffer[k++] = a; a = list[i++]; polarity ^= 1;
2686                } else if (b < a) { // OVERLAP, drop b
2687
b = other[j++]; polarity ^= 2;
2688                } else { // a == b, drop both!
2689
if (a == HIGH) break main;
2690                    a = list[i++]; polarity ^= 1;
2691                    b = other[j++]; polarity ^= 2;
2692                }
2693                break;
2694              case 2: // a first, b second; if a < b, overlap
2695
if (b < a) { // no overlap, take b
2696
buffer[k++] = b; b = other[j++]; polarity ^= 2;
2697                } else if (a < b) { // OVERLAP, drop a
2698
a = list[i++]; polarity ^= 1;
2699                } else { // a == b, drop both!
2700
if (a == HIGH) break main;
2701                    a = list[i++]; polarity ^= 1;
2702                    b = other[j++]; polarity ^= 2;
2703                }
2704                break;
2705            }
2706        }
2707        buffer[k++] = HIGH; // terminate
2708
len = k;
2709        // swap list and buffer
2710
int[] temp = list;
2711        list = buffer;
2712        buffer = temp;
2713        pat = null;
2714        return this;
2715    }
2716
2717    // polarity = 0 is normal: x intersect y
2718
// polarity = 2: x intersect ~y == set-minus
2719
// polarity = 1: ~x intersect y
2720
// polarity = 3: ~x intersect ~y
2721

2722    private UnicodeSet retain(int[] other, int otherLen, int polarity) {
2723        ensureBufferCapacity(len + otherLen);
2724        int i = 0, j = 0, k = 0;
2725        int a = list[i++];
2726        int b = other[j++];
2727        // change from xor is that we have to check overlapping pairs
2728
// polarity bit 1 means a is second, bit 2 means b is.
2729
main:
2730        while (true) {
2731            switch (polarity) {
2732              case 0: // both first; drop the smaller
2733
if (a < b) { // drop a
2734
a = list[i++]; polarity ^= 1;
2735                } else if (b < a) { // drop b
2736
b = other[j++]; polarity ^= 2;
2737                } else { // a == b, take one, drop other
2738
if (a == HIGH) break main;
2739                    buffer[k++] = a; a = list[i++]; polarity ^= 1;
2740                    b = other[j++]; polarity ^= 2;
2741                }
2742                break;
2743              case 3: // both second; take lower if unequal
2744
if (a < b) { // take a
2745
buffer[k++] = a; a = list[i++]; polarity ^= 1;
2746                } else if (b < a) { // take b
2747
buffer[k++] = b; b = other[j++]; polarity ^= 2;
2748                } else { // a == b, take one, drop other
2749
if (a == HIGH) break main;
2750                    buffer[k++] = a; a = list[i++]; polarity ^= 1;
2751                    b = other[j++]; polarity ^= 2;
2752                }
2753                break;
2754              case 1: // a second, b first;
2755
if (a < b) { // NO OVERLAP, drop a
2756
a = list[i++]; polarity ^= 1;
2757                } else if (b < a) { // OVERLAP, take b
2758
buffer[k++] = b; b = other[j++]; polarity ^= 2;
2759                } else { // a == b, drop both!
2760
if (a == HIGH) break main;
2761                    a = list[i++]; polarity ^= 1;
2762                    b = other[j++]; polarity ^= 2;
2763                }
2764                break;
2765              case 2: // a first, b second; if a < b, overlap
2766
if (b < a) { // no overlap, drop b
2767
b = other[j++]; polarity ^= 2;
2768                } else if (a < b) { // OVERLAP, take a
2769
buffer[k++] = a; a = list[i++]; polarity ^= 1;
2770                } else { // a == b, drop both!
2771
if (a == HIGH) break main;
2772                    a = list[i++]; polarity ^= 1;
2773                    b = other[j++]; polarity ^= 2;
2774                }
2775                break;
2776            }
2777        }
2778        buffer[k++] = HIGH; // terminate
2779
len = k;
2780        // swap list and buffer
2781
int[] temp = list;
2782        list = buffer;
2783        buffer = temp;
2784        pat = null;
2785        return this;
2786    }
2787
2788    private static final int max(int a, int b) {
2789        return (a > b) ? a : b;
2790    }
2791
2792    //----------------------------------------------------------------
2793
// Generic filter-based scanning code
2794
//----------------------------------------------------------------
2795

2796    private static interface Filter {
2797        boolean contains(int codePoint);
2798    }
2799
2800    private static class NumericValueFilter implements Filter {
2801        double value;
2802        NumericValueFilter(double value) { this.value = value; }
2803        public boolean contains(int ch) {
2804            return UCharacter.getUnicodeNumericValue(ch) == value;
2805        }
2806    }
2807
2808    private static class GeneralCategoryMaskFilter implements Filter {
2809        int mask;
2810        GeneralCategoryMaskFilter(int mask) { this.mask = mask; }
2811        public boolean contains(int ch) {
2812            return ((1 << UCharacter.getType(ch)) & mask) != 0;
2813        }
2814    }
2815
2816    private static class IntPropertyFilter implements Filter {
2817        int prop;
2818        int value;
2819        IntPropertyFilter(int prop, int value) {
2820            this.prop = prop;
2821            this.value = value;
2822        }
2823        public boolean contains(int ch) {
2824            return UCharacter.getIntPropertyValue(ch, prop) == value;
2825        }
2826    }
2827
2828    // VersionInfo for unassigned characters
2829
static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
2830
2831    private static class VersionFilter implements Filter {
2832        VersionInfo version;
2833        VersionFilter(VersionInfo version) { this.version = version; }
2834        public boolean contains(int ch) {
2835            VersionInfo v = UCharacter.getAge(ch);
2836            // Reference comparison ok; VersionInfo caches and reuses
2837
// unique objects.
2838
return v != NO_VERSION &&
2839                   v.compareTo(version) <= 0;
2840        }
2841    }
2842
2843    private static synchronized UnicodeSet getInclusions(int src) {
2844        if (INCLUSIONS == null) {
2845            INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
2846        }
2847        if(INCLUSIONS[src] == null) {
2848            UnicodeSet incl = new UnicodeSet();
2849            switch(src) {
2850            case UCharacterProperty.SRC_CHAR:
2851                UCharacterProperty.getInstance().addPropertyStarts(incl);
2852                break;
2853            case UCharacterProperty.SRC_PROPSVEC:
2854                UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl);
2855                break;
2856            case UCharacterProperty.SRC_CHAR_AND_PROPSVEC:
2857                UCharacterProperty.getInstance().addPropertyStarts(incl);
2858                UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl);
2859                break;
2860            case UCharacterProperty.SRC_HST:
2861                UCharacterProperty.getInstance().uhst_addPropertyStarts(incl);
2862                break;
2863            case UCharacterProperty.SRC_NORM:
2864                NormalizerImpl.addPropertyStarts(incl);
2865                break;
2866            case UCharacterProperty.SRC_CASE:
2867                try {
2868                    UCaseProps.getSingleton().addPropertyStarts(incl);
2869                } catch(IOException JavaDoc e) {
2870                    throw new MissingResourceException JavaDoc(e.getMessage(),"","");
2871                }
2872                break;
2873            case UCharacterProperty.SRC_BIDI:
2874                try {
2875                    UBiDiProps.getSingleton().addPropertyStarts(incl);
2876                } catch(IOException JavaDoc e) {
2877                    throw new MissingResourceException JavaDoc(e.getMessage(),"","");
2878                }
2879                break;
2880            default:
2881                throw new IllegalStateException JavaDoc("UnicodeSet.getInclusions(unknown src "+src+")");
2882            }
2883            INCLUSIONS[src] = incl;
2884        }
2885        return INCLUSIONS[src];
2886    }
2887
2888    /**
2889     * Generic filter-based scanning code for UCD property UnicodeSets.
2890     */

2891    private UnicodeSet applyFilter(Filter filter, int src) {
2892        // Walk through all Unicode characters, noting the start
2893
// and end of each range for which filter.contain(c) is
2894
// true. Add each range to a set.
2895
//
2896
// To improve performance, use the INCLUSIONS set, which
2897
// encodes information about character ranges that are known
2898
// to have identical properties, such as the CJK Ideographs
2899
// from U+4E00 to U+9FA5. INCLUSIONS contains all characters
2900
// except the first characters of such ranges.
2901
//
2902
// TODO Where possible, instead of scanning over code points,
2903
// use internal property data to initialize UnicodeSets for
2904
// those properties. Scanning code points is slow.
2905

2906        clear();
2907
2908        int startHasProperty = -1;
2909        UnicodeSet inclusions = getInclusions(src);
2910        int limitRange = inclusions.getRangeCount();
2911
2912        for (int j=0; j<limitRange; ++j) {
2913            // get current range
2914
int start = inclusions.getRangeStart(j);
2915            int end = inclusions.getRangeEnd(j);
2916
2917            // for all the code points in the range, process
2918
for (int ch = start; ch <= end; ++ch) {
2919                // only add to the unicodeset on inflection points --
2920
// where the hasProperty value changes to false
2921
if (filter.contains(ch)) {
2922                    if (startHasProperty < 0) {
2923                        startHasProperty = ch;
2924                    }
2925                } else if (startHasProperty >= 0) {
2926                    add_unchecked(startHasProperty, ch-1);
2927                    startHasProperty = -1;
2928                }
2929            }
2930        }
2931        if (startHasProperty >= 0) {
2932            add_unchecked(startHasProperty, 0x10FFFF);
2933        }
2934
2935        return this;
2936    }
2937
2938
2939    /**
2940     * Remove leading and trailing rule white space and compress
2941     * internal rule white space to a single space character.
2942     *
2943     * @see UCharacterProperty#isRuleWhiteSpace
2944     */

2945    private static String JavaDoc mungeCharName(String JavaDoc source) {
2946        StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
2947        for (int i=0; i<source.length(); ) {
2948            int ch = UTF16.charAt(source, i);
2949            i += UTF16.getCharCount(ch);
2950            if (UCharacterProperty.isRuleWhiteSpace(ch)) {
2951                if (buf.length() == 0 ||
2952                    buf.charAt(buf.length() - 1) == ' ') {
2953                    continue;
2954                }
2955                ch = ' '; // convert to ' '
2956
}
2957            UTF16.append(buf, ch);
2958        }
2959        if (buf.length() != 0 &&
2960            buf.charAt(buf.length() - 1) == ' ') {
2961            buf.setLength(buf.length() - 1);
2962        }
2963        return buf.toString();
2964    }
2965
2966    //----------------------------------------------------------------
2967
// Property set API
2968
//----------------------------------------------------------------
2969

2970    /**
2971     * Modifies this set to contain those code points which have the
2972     * given value for the given binary or enumerated property, as
2973     * returned by UCharacter.getIntPropertyValue. Prior contents of
2974     * this set are lost.
2975     *
2976     * @param prop a property in the range
2977     * UProperty.BIN_START..UProperty.BIN_LIMIT-1 or
2978     * UProperty.INT_START..UProperty.INT_LIMIT-1 or.
2979     * UProperty.MASK_START..UProperty.MASK_LIMIT-1.
2980     *
2981     * @param value a value in the range
2982     * UCharacter.getIntPropertyMinValue(prop)..
2983     * UCharacter.getIntPropertyMaxValue(prop), with one exception.
2984     * If prop is UProperty.GENERAL_CATEGORY_MASK, then value should not be
2985     * a UCharacter.getType() result, but rather a mask value produced
2986     * by logically ORing (1 << UCharacter.getType()) values together.
2987     * This allows grouped categories such as [:L:] to be represented.
2988     *
2989     * @return a reference to this set
2990     *
2991     * @stable ICU 2.4
2992     */

2993    public UnicodeSet applyIntPropertyValue(int prop, int value) {
2994        checkFrozen();
2995        if (prop == UProperty.GENERAL_CATEGORY_MASK) {
2996            applyFilter(new GeneralCategoryMaskFilter(value), UCharacterProperty.SRC_CHAR);
2997        } else {
2998            applyFilter(new IntPropertyFilter(prop, value), UCharacterProperty.getInstance().getSource(prop));
2999        }
3000        return this;
3001    }
3002
3003
3004
3005    /**
3006     * Modifies this set to contain those code points which have the
3007     * given value for the given property. Prior contents of this
3008     * set are lost.
3009     *
3010     * @param propertyAlias a property alias, either short or long.
3011     * The name is matched loosely. See PropertyAliases.txt for names
3012     * and a description of loose matching. If the value string is
3013     * empty, then this string is interpreted as either a
3014     * General_Category value alias, a Script value alias, a binary
3015     * property alias, or a special ID. Special IDs are matched
3016     * loosely and correspond to the following sets:
3017     *
3018     * "ANY" = [-\U0010FFFF],
3019     * "ASCII" = [-].
3020     *
3021     * @param valueAlias a value alias, either short or long. The
3022     * name is matched loosely. See PropertyValueAliases.txt for
3023     * names and a description of loose matching. In addition to
3024     * aliases listed, numeric values and canonical combining classes
3025     * may be expressed numerically, e.g., ("nv", "0.5") or ("ccc",
3026     * "220"). The value string may also be empty.
3027     *
3028     * @return a reference to this set
3029     *
3030     * @stable ICU 2.4
3031     */

3032    public UnicodeSet applyPropertyAlias(String JavaDoc propertyAlias, String JavaDoc valueAlias) {
3033        return applyPropertyAlias(propertyAlias, valueAlias, null);
3034    }
3035
3036    /**
3037     * Modifies this set to contain those code points which have the
3038     * given value for the given property. Prior contents of this
3039     * set are lost.
3040     * @param propertyAlias
3041     * @param valueAlias
3042     * @param symbols if not null, then symbols are first called to see if a property
3043     * is available. If true, then everything else is skipped.
3044     * @return this set
3045     * @draft ICU 3.2
3046     * @provisional This API might change or be removed in a future release.
3047     */

3048    public UnicodeSet applyPropertyAlias(String JavaDoc propertyAlias,
3049                                         String JavaDoc valueAlias, SymbolTable symbols) {
3050        checkFrozen();
3051        int p;
3052        int v;
3053        boolean mustNotBeEmpty = false, invert = false;
3054
3055        if (symbols != null
3056                && (symbols instanceof XSymbolTable)
3057                && ((XSymbolTable)symbols).applyPropertyAlias(propertyAlias, valueAlias, this)) {
3058                return this;
3059        }
3060
3061        if (valueAlias.length() > 0) {
3062            p = UCharacter.getPropertyEnum(propertyAlias);
3063
3064            // Treat gc as gcm
3065
if (p == UProperty.GENERAL_CATEGORY) {
3066                p = UProperty.GENERAL_CATEGORY_MASK;
3067            }
3068
3069            if ((p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) ||
3070                (p >= UProperty.INT_START && p < UProperty.INT_LIMIT) ||
3071                (p >= UProperty.MASK_START && p < UProperty.MASK_LIMIT)) {
3072                try {
3073                    v = UCharacter.getPropertyValueEnum(p, valueAlias);
3074                } catch (IllegalArgumentException JavaDoc e) {
3075                    // Handle numeric CCC
3076
if (p == UProperty.CANONICAL_COMBINING_CLASS ||
3077                        p == UProperty.LEAD_CANONICAL_COMBINING_CLASS ||
3078                        p == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) {
3079                        v = Integer.parseInt(Utility.deleteRuleWhiteSpace(valueAlias));
3080                        // If the resultant set is empty then the numeric value
3081
// was invalid.
3082
mustNotBeEmpty = true;
3083                    } else {
3084                        throw e;
3085                    }
3086                }
3087            }
3088
3089            else {
3090
3091                switch (p) {
3092                case UProperty.NUMERIC_VALUE:
3093                    {
3094                        double value = Double.parseDouble(Utility.deleteRuleWhiteSpace(valueAlias));
3095                        applyFilter(new NumericValueFilter(value), UCharacterProperty.SRC_CHAR);
3096                        return this;
3097                    }
3098                case UProperty.NAME:
3099                case UProperty.UNICODE_1_NAME:
3100                    {
3101                        // Must munge name, since
3102
// UCharacter.charFromName() does not do
3103
// 'loose' matching.
3104
String JavaDoc buf = mungeCharName(valueAlias);
3105                        int ch =
3106                            (p == UProperty.NAME) ?
3107                            UCharacter.getCharFromExtendedName(buf) :
3108                            UCharacter.getCharFromName1_0(buf);
3109                        if (ch == -1) {
3110                            throw new IllegalArgumentException JavaDoc("Invalid character name");
3111                        }
3112                        clear();
3113                        add_unchecked(ch);
3114                        return this;
3115                    }
3116                case UProperty.AGE:
3117                    {
3118                        // Must munge name, since
3119
// VersionInfo.getInstance() does not do
3120
// 'loose' matching.
3121
VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
3122                        applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
3123                        return this;
3124                    }
3125                }
3126
3127                // p is a non-binary, non-enumerated property that we
3128
// don't support (yet).
3129
throw new IllegalArgumentException JavaDoc("Unsupported property");
3130            }
3131        }
3132
3133        else {
3134            // valueAlias is empty. Interpret as General Category, Script,
3135
// Binary property, or ANY or ASCII. Upon success, p and v will
3136
// be set.
3137
try {
3138                p = UProperty.GENERAL_CATEGORY_MASK;
3139                v = UCharacter.getPropertyValueEnum(p, propertyAlias);
3140            } catch (IllegalArgumentException JavaDoc e) {
3141                try {
3142                    p = UProperty.SCRIPT;
3143                    v = UCharacter.getPropertyValueEnum(p, propertyAlias);
3144                } catch (IllegalArgumentException JavaDoc e2) {
3145                    try {
3146                        p = UCharacter.getPropertyEnum(propertyAlias);
3147                    } catch (IllegalArgumentException JavaDoc e3) {
3148                        p = -1;
3149                    }
3150                    if (p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) {
3151                        v = 1;
3152                    } else if (p == -1) {
3153                        if (0 == UPropertyAliases.compare(ANY_ID, propertyAlias)) {
3154                            set(MIN_VALUE, MAX_VALUE);
3155                            return this;
3156                        } else if (0 == UPropertyAliases.compare(ASCII_ID, propertyAlias)) {
3157                            set(0, 0x7F);
3158                            return this;
3159                        } else if (0 == UPropertyAliases.compare(ASSIGNED, propertyAlias)) {
3160                            // [:Assigned:]=[:^Cn:]
3161
p = UProperty.GENERAL_CATEGORY_MASK;
3162                            v = (1<<UCharacter.UNASSIGNED);
3163                            invert = true;
3164                        } else {
3165                            // Property name was never matched.
3166
throw new IllegalArgumentException JavaDoc("Invalid property alias: " + propertyAlias + "=" + valueAlias);
3167                        }
3168                    } else {
3169                        // Valid propery name, but it isn't binary, so the value
3170
// must be supplied.
3171
throw new IllegalArgumentException JavaDoc("Missing property value");
3172                    }
3173                }
3174            }
3175        }
3176
3177        applyIntPropertyValue(p, v);
3178        if(invert) {
3179            complement();
3180        }
3181
3182        if (mustNotBeEmpty && isEmpty()) {
3183            // mustNotBeEmpty is set to true if an empty set indicates
3184
// invalid input.
3185
throw new IllegalArgumentException JavaDoc("Invalid property value");
3186        }
3187
3188        return this;
3189    }
3190
3191    //----------------------------------------------------------------
3192
// Property set patterns
3193
//----------------------------------------------------------------
3194

3195    /**
3196     * Return true if the given position, in the given pattern, appears
3197     * to be the start of a property set pattern.
3198     */

3199    private static boolean resemblesPropertyPattern(String JavaDoc pattern, int pos) {
3200        // Patterns are at least 5 characters long
3201
if ((pos+5) > pattern.length()) {
3202            return false;
3203        }
3204
3205        // Look for an opening [:, [:^, \p, or \P
3206
return pattern.regionMatches(pos, "[:", 0, 2) ||
3207            pattern.regionMatches(true, pos, "\\p", 0, 2) ||
3208            pattern.regionMatches(pos, "\\N", 0, 2);
3209    }
3210
3211    /**
3212     * Return true if the given iterator appears to point at a
3213     * property pattern. Regardless of the result, return with the
3214     * iterator unchanged.
3215     * @param chars iterator over the pattern characters. Upon return
3216     * it will be unchanged.
3217     * @param iterOpts RuleCharacterIterator options
3218     */

3219    private static boolean resemblesPropertyPattern(RuleCharacterIterator chars,
3220                                                    int iterOpts) {
3221        boolean result = false;
3222        iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
3223        Object JavaDoc pos = chars.getPos(null);
3224        int c = chars.next(iterOpts);
3225        if (c == '[' || c == '\\') {
3226            int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
3227            result = (c == '[') ? (d == ':') :
3228                     (d == 'N' || d == 'p' || d == 'P');
3229        }
3230        chars.setPos(pos);
3231        return result;
3232    }
3233
3234    /**
3235     * Parse the given property pattern at the given parse position.
3236     * @param symbols TODO
3237     */

3238    private UnicodeSet applyPropertyPattern(String JavaDoc pattern, ParsePosition ppos, SymbolTable symbols) {
3239        int pos = ppos.getIndex();
3240
3241        // On entry, ppos should point to one of the following locations:
3242

3243        // Minimum length is 5 characters, e.g. \p{L}
3244
if ((pos+5) > pattern.length()) {
3245            return null;
3246        }
3247
3248        boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
3249
boolean isName = false; // true for \N{pat}, o/w false
3250
boolean invert = false;
3251
3252        // Look for an opening [:, [:^, \p, or \P
3253
if (pattern.regionMatches(pos, "[:", 0, 2)) {
3254            posix = true;
3255            pos = Utility.skipWhitespace(pattern, pos+2);
3256            if (pos < pattern.length() && pattern.charAt(pos) == '^') {
3257                ++pos;
3258                invert = true;
3259            }
3260        } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) ||
3261                   pattern.regionMatches(pos, "\\N", 0, 2)) {
3262            char c = pattern.charAt(pos+1);
3263            invert = (c == 'P');
3264            isName = (c == 'N');
3265            pos = Utility.skipWhitespace(pattern, pos+2);
3266            if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
3267                // Syntax error; "\p" or "\P" not followed by "{"
3268
return null;
3269            }
3270        } else {
3271            // Open delimiter not seen
3272
return null;
3273        }
3274
3275        // Look for the matching close delimiter, either :] or }
3276
int close = pattern.indexOf(posix ? ":]" : "}", pos);
3277        if (close < 0) {
3278            // Syntax error; close delimiter missing
3279
return null;
3280        }
3281
3282        // Look for an '=' sign. If this is present, we will parse a
3283
// medium \p{gc=Cf} or long \p{GeneralCategory=Format}
3284
// pattern.
3285
int equals = pattern.indexOf('=', pos);
3286        String JavaDoc propName, valueName;
3287        if (equals >= 0 && equals < close && !isName) {
3288            // Equals seen; parse medium/long pattern
3289
propName = pattern.substring(pos, equals);
3290            valueName = pattern.substring(equals+1, close);
3291        }
3292
3293        else {
3294            // Handle case where no '=' is seen, and \N{}
3295
propName = pattern.substring(pos, close);
3296            valueName = "";
3297
3298            // Handle \N{name}
3299
if (isName) {
3300                // This is a little inefficient since it means we have to
3301
// parse "na" back to UProperty.NAME even though we already
3302
// know it's UProperty.NAME. If we refactor the API to
3303
// support args of (int, String) then we can remove
3304
// "na" and make this a little more efficient.
3305
valueName = propName;
3306                propName = "na";
3307            }
3308        }
3309
3310        applyPropertyAlias(propName, valueName, symbols);
3311
3312        if (invert) {
3313            complement();
3314        }
3315
3316        // Move to the limit position after the close delimiter
3317
ppos.setIndex(close + (posix ? 2 : 1));
3318
3319        return this;
3320    }
3321
3322    /**
3323     * Parse a property pattern.
3324     * @param chars iterator over the pattern characters. Upon return
3325     * it will be advanced to the first character after the parsed
3326     * pattern, or the end of the iteration if all characters are
3327     * parsed.
3328     * @param rebuiltPat the pattern that was parsed, rebuilt or
3329     * copied from the input pattern, as appropriate.
3330     * @param symbols TODO
3331     */

3332    private void applyPropertyPattern(RuleCharacterIterator chars,
3333                                      StringBuffer JavaDoc rebuiltPat, SymbolTable symbols) {
3334        String JavaDoc pat = chars.lookahead();
3335        ParsePosition pos = new ParsePosition(0);
3336        applyPropertyPattern(pat, pos, symbols);
3337        if (pos.getIndex() == 0) {
3338            syntaxError(chars, "Invalid property pattern");
3339        }
3340        chars.jumpahead(pos.getIndex());
3341        rebuiltPat.append(pat.substring(0, pos.getIndex()));
3342    }
3343
3344    //----------------------------------------------------------------
3345
// Case folding API
3346
//----------------------------------------------------------------
3347

3348    /**
3349     * Bitmask for constructor and applyPattern() indicating that
3350     * white space should be ignored. If set, ignore characters for
3351     * which UCharacterProperty.isRuleWhiteSpace() returns true,
3352     * unless they are quoted or escaped. This may be ORed together
3353     * with other selectors.
3354     * @internal
3355     * @deprecated This API is ICU internal only.
3356     */

3357    public static final int IGNORE_SPACE = 1;
3358
3359    /**
3360     * Bitmask for constructor, applyPattern(), and closeOver()
3361     * indicating letter case. This may be ORed together with other
3362     * selectors.
3363     *
3364     * Enable case insensitive matching. E.g., "[ab]" with this flag
3365     * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
3366     * match all except 'a', 'A', 'b', and 'B'. This performs a full
3367     * closure over case mappings, e.g. U+017F for s.
3368     *
3369     * The resulting set is a superset of the input for the code points but
3370     * not for the strings.
3371     * It performs a case mapping closure of the code points and adds
3372     * full case folding strings for the code points, and reduces strings of
3373     * the original set to their full case folding equivalents.
3374     *
3375     * This is designed for case-insensitive matches, for example
3376     * in regular expressions. The full code point case closure allows checking of
3377     * an input character directly against the closure set.
3378     * Strings are matched by comparing the case-folded form from the closure
3379     * set with an incremental case folding of the string in question.
3380     *
3381     * The closure set will also contain single code points if the original
3382     * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
3383     * This is not necessary (that is, redundant) for the above matching method
3384     * but results in the same closure sets regardless of whether the original
3385     * set contained the code point or a string.
3386     *
3387     * @internal
3388     * @deprecated This API is ICU internal only.
3389     */

3390    public static final int CASE = 2;
3391
3392    /**
3393     * Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C
3394     * also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h).
3395     * @see #CASE
3396     * @draft ICU 3.4
3397     * @provisional This API might change or be removed in a future release.
3398     */

3399    public static final int CASE_INSENSITIVE = 2;
3400
3401    /**
3402     * Bitmask for constructor, applyPattern(), and closeOver()
3403     * indicating letter case. This may be ORed together with other
3404     * selectors.
3405     *
3406     * Enable case insensitive matching. E.g., "[ab]" with this flag
3407     * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
3408     * match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
3409     * title-, and uppercase mappings as well as the case folding
3410     * of each existing element in the set.
3411     * @draft ICU 3.4
3412     * @provisional This API might change or be removed in a future release.
3413     */

3414    public static final int ADD_CASE_MAPPINGS = 4;
3415
3416    // add the result of a full case mapping to the set
3417
// use str as a temporary string to avoid constructing one
3418
private static final void addCaseMapping(UnicodeSet set, int result, StringBuffer JavaDoc full) {
3419        if(result >= 0) {
3420            if(result > UCaseProps.MAX_STRING_LENGTH) {
3421                // add a single-code point case mapping
3422
set.add(result);
3423            } else {
3424                // add a string case mapping from full with length result
3425
set.add(full.toString());
3426                full.setLength(0);
3427            }
3428        }
3429        // result < 0: the code point mapped to itself, no need to add it
3430
// see UCaseProps
3431
}
3432
3433    /**
3434     * Close this set over the given attribute. For the attribute
3435     * CASE, the result is to modify this set so that:
3436     *
3437     * 1. For each character or string 'a' in this set, all strings
3438     * 'b' such that foldCase(a) == foldCase(b) are added to this set.
3439     * (For most 'a' that are single characters, 'b' will have
3440     * b.length() == 1.)
3441     *
3442     * 2. For each string 'e' in the resulting set, if e !=
3443     * foldCase(e), 'e' will be removed.
3444     *
3445     * Example: [aqß{Bc}{bC}{Fi}] => [aAqQß?{ss}{bc}{fi}]
3446     *
3447     * (Here foldCase(x) refers to the operation
3448     * UCharacter.foldCase(x, true), and a == b actually denotes
3449     * a.equals(b), not pointer comparison.)
3450     *
3451     * @param attribute bitmask for attributes to close over.
3452     * Currently only the CASE bit is supported. Any undefined bits
3453     * are ignored.
3454     * @return a reference to this set.
3455     * @internal
3456     * @deprecated This API is ICU internal only.
3457     */

3458    public UnicodeSet closeOver(int attribute) {
3459        checkFrozen();
3460        if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) {
3461            UCaseProps csp;
3462            try {
3463                csp = UCaseProps.getSingleton();
3464            } catch(IOException JavaDoc e) {
3465                return this;
3466            }
3467            UnicodeSet foldSet = new UnicodeSet(this);
3468            ULocale root = ULocale.ROOT;
3469
3470            // start with input set to guarantee inclusion
3471
// CASE: remove strings because the strings will actually be reduced (folded);
3472
// therefore, start with no strings and add only those needed
3473
if((attribute & CASE) != 0) {
3474                foldSet.strings.clear();
3475            }
3476
3477            int n = getRangeCount();
3478            int result;
3479            StringBuffer JavaDoc full = new StringBuffer JavaDoc();
3480            int locCache[] = new int[1];
3481
3482            for (int i=0; i<n; ++i) {
3483                int start = getRangeStart(i);
3484                int end = getRangeEnd(i);
3485
3486                if((attribute & CASE) != 0) {
3487                    // full case closure
3488
for (int cp=start; cp<=end; ++cp) {
3489                        csp.addCaseClosure(cp, foldSet);
3490                    }
3491                } else {
3492                    // add case mappings
3493
// (does not add long s for regular s, or Kelvin for k, for example)
3494
for (int cp=start; cp<=end; ++cp) {
3495                        result = csp.toFullLower(cp, null, full, root, locCache);
3496                        addCaseMapping(foldSet, result, full);
3497
3498                        result = csp.toFullTitle(cp, null, full, root, locCache);
3499                        addCaseMapping(foldSet, result, full);
3500
3501                        result = csp.toFullUpper(cp, null, full, root, locCache);
3502                        addCaseMapping(foldSet, result, full);
3503
3504                        result = csp.toFullFolding(cp, full, 0);
3505                        addCaseMapping(foldSet, result, full);
3506                    }
3507                }
3508            }
3509            if (!strings.isEmpty()) {
3510                String JavaDoc str;
3511                if ((attribute & CASE) != 0) {
3512                    Iterator it = strings.iterator();
3513                    while (it.hasNext()) {
3514                        str = UCharacter.foldCase((String JavaDoc)it.next(), 0);
3515                        if(!csp.addStringCaseClosure(str, foldSet)) {
3516                            foldSet.add(str); // does not map to code points: add the folded string itself
3517
}
3518                    }
3519                } else {
3520                    BreakIterator bi = BreakIterator.getWordInstance(root);
3521                    Iterator it = strings.iterator();
3522                    while (it.hasNext()) {
3523                        str = (String JavaDoc)it.next();
3524                        foldSet.add(UCharacter.toLowerCase(root, str));
3525                        foldSet.add(UCharacter.toTitleCase(root, str, bi));
3526                        foldSet.add(UCharacter.toUpperCase(root, str));
3527                        foldSet.add(UCharacter.foldCase(str, 0));
3528                    }
3529                }
3530            }
3531            set(foldSet);
3532        }
3533        return this;
3534    }
3535
3536    /**
3537     * Internal class for customizing UnicodeSet parsing of properties.
3538     * TODO: extend to allow customizing of codepoint ranges
3539     * @internal
3540     * @deprecated This API is ICU internal only.
3541     * @author medavis
3542     */

3543    abstract public static class XSymbolTable implements SymbolTable {
3544        /**
3545         * Default constructor
3546         * @internal
3547         * @deprecated This API is ICU internal only.
3548         */

3549        public XSymbolTable(){}
3550        /**
3551         * @internal
3552         * @deprecated This API is ICU internal only.
3553         */

3554        public UnicodeMatcher lookupMatcher(int i) {
3555            return null;
3556        }
3557        /**
3558         * @internal
3559         * @deprecated This API is ICU internal only.
3560         */

3561        public boolean applyPropertyAlias(String JavaDoc propertyName, String JavaDoc propertyValue, UnicodeSet result) {
3562            return false;
3563        }
3564        /**
3565         * @internal
3566         * @deprecated This API is ICU internal only.
3567         */

3568        public char[] lookup(String JavaDoc s) {
3569            return null;
3570        }
3571        /**
3572         * @internal
3573         * @deprecated This API is ICU internal only.
3574         */

3575        public String JavaDoc parseReference(String JavaDoc text, ParsePosition pos, int limit) {
3576            return null;
3577        }
3578    }
3579
3580    private boolean frozen;
3581    
3582    /**
3583     * Is this frozen, according to the Freezable interface?
3584     * @return value
3585     * @internal
3586     * @deprecated This API is ICU internal only.
3587     */

3588    public boolean isFrozen() {
3589        return frozen;
3590    }
3591
3592    /**
3593     * Freeze this class, according to the Freezable interface.
3594     * @return this
3595     * @internal
3596     * @deprecated This API is ICU internal only.
3597     */

3598    public Object JavaDoc freeze() {
3599        frozen = true;
3600        return this;
3601    }
3602    
3603    /**
3604     * Clone a thawed version of this class, according to the Freezable interface.
3605     * @return this
3606     * @internal
3607     * @deprecated This API is ICU internal only.
3608     */

3609    public Object JavaDoc cloneAsThawed() {
3610        UnicodeSet result = (UnicodeSet) clone();
3611        result.frozen = false;
3612        return result;
3613    }
3614    
3615    // internal function
3616
private void checkFrozen() {
3617        if (frozen) {
3618            throw new UnsupportedOperationException JavaDoc("Attempt to modify frozen object");
3619        }
3620    }
3621}
3622//eof
3623
Popular Tags