UTF16


1   //##header 1189099963000 FOUNDATION
2   /**
3   *******************************************************************************
4   * Copyright (C) 1996-2006, International Business Machines Corporation and    *
5   * others. All Rights Reserved.                                                *
6   *******************************************************************************
7   */
8   
9   package com.ibm.icu.text;
10  
11  import com.ibm.icu.impl.UCharacterProperty;
12  import com.ibm.icu.impl.NormalizerImpl;
13  
14  /**
15   * <p>Standalone utility class providing UTF16 character conversions and
16   * indexing conversions.</p>
17   * <p>Code that uses strings alone rarely needs modification.
18   * By design, UTF-16 does not allow overlap, so searching for strings is a safe
19   * operation. Similarly, concatenation is always safe. Substringing is safe if
20   * the start and end are both on UTF-32 boundaries. In normal code, the values
21   * for start and end are on those boundaries, since they arose from operations
22   * like searching. If not, the nearest UTF-32 boundaries can be determined
23   * using <code>bounds()</code>.</p>
24   * <strong>Examples:</strong>
25   * <p>The following examples illustrate use of some of these methods.
26   * <pre>
27   * // iteration forwards: Original
28   * for (int i = 0; i &lt; s.length(); ++i) {
29   *     char ch = s.charAt(i);
30   *     doSomethingWith(ch);
31   * }
32   *
33   * // iteration forwards: Changes for UTF-32
34   * int ch;
35   * for (int i = 0; i &lt; s.length(); i+=UTF16.getCharCount(ch)) {
36   *     ch = UTF16.charAt(s,i);
37   *     doSomethingWith(ch);
38   * }
39   *
40   * // iteration backwards: Original
41   * for (int i = s.length() -1; i >= 0; --i) {
42   *     char ch = s.charAt(i);
43   *     doSomethingWith(ch);
44   * }
45   *
46   * // iteration backwards: Changes for UTF-32
47   * int ch;
48   * for (int i = s.length() -1; i >= 0; i-=UTF16.getCharCount(ch)) {
49   *     ch = UTF16.charAt(s,i);
50   *     doSomethingWith(ch);
51   * }
52   * </pre>
53   * <strong>Notes:</strong>
54   * <ul>
55   *   <li>
56   *   <strong>Naming:</strong> For clarity, High and Low surrogates are called
57   *   <code>Lead</code> and <code>Trail</code> in the API, which gives a better
58   *   sense of their ordering in a string. <code>offset16</code> and
59   *   <code>offset32</code> are used to distinguish offsets to UTF-16
60   *   boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
61   *   used to contain UTF-32 characters, as opposed to <code>char16</code>,
62   *   which is a UTF-16 code unit.
63   *   </li>
64   *   <li>
65   *   <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
66   *   UTF-32 offset to a UTF-16 offset and back. Because of the difference in
67   *   structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
68   *   back if and only if <code>bounds(string, offset16) != TRAIL</code>.
69   *   </li>
70   *   <li>
71   *    <strong>Exceptions:</strong> The error checking will throw an exception
72   *   if indices are out of bounds. Other than that, all methods will
73   *   behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32
74   *   values are present. <code>UCharacter.isLegal()</code> can be used to check
75   *   for validity if desired.
76   *   </li>
77   *   <li>
78   *   <strong>Unmatched Surrogates:</strong> If the string contains unmatched
79   *   surrogates, then these are counted as one UTF-32 value. This matches
80   *   their iteration behavior, which is vital. It also matches common display
81   *   practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
82   *   </li>
83   *   <li>
84   *     <strong>Optimization:</strong> The method implementations may need
85   *     optimization if the compiler doesn't fold static final methods. Since
86   *     surrogate pairs will form an exceedingly small percentage of all the text
87   *     in the world, the singleton case should always be optimized for.
88   *   </li>
89   * </ul>
90   * @author Mark Davis, with help from Markus Scherer
91   * @stable ICU 2.1
92   */
93  
94  public final class UTF16
95  {
96      // public variables ---------------------------------------------------
97  
98      /**
99       * Value returned in <code><a HREF="#bounds(java.lang.String, int)">
100      * bounds()</a></code>.
101      * These values are chosen specifically so that it actually represents
102      * the position of the character
103      * [offset16 - (value >> 2), offset16 + (value & 3)]
104      * @stable ICU 2.1
105      */
106     public static final int SINGLE_CHAR_BOUNDARY = 1,
107         LEAD_SURROGATE_BOUNDARY = 2,
108         TRAIL_SURROGATE_BOUNDARY = 5;
109     /**
110      * The lowest Unicode code point value.
111      * @stable ICU 2.1
112      */
113     public static final int CODEPOINT_MIN_VALUE = 0;
114     /**
115      * The highest Unicode code point value (scalar value) according to the
116      * Unicode Standard.
117      * @stable ICU 2.1
118      */
119     public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
120     /**
121      * The minimum value for Supplementary code points
122      * @stable ICU 2.1
123      */
124     public static final int SUPPLEMENTARY_MIN_VALUE  = 0x10000;
125     /**
126      * Lead surrogate minimum value
127      * @stable ICU 2.1
128      */
129     public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
130     /**
131      * Trail surrogate minimum value
132      * @stable ICU 2.1
133      */
134     public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
135     /**
136      * Lead surrogate maximum value
137      * @stable ICU 2.1
138      */
139     public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
140     /**
141      * Trail surrogate maximum value
142      * @stable ICU 2.1
143      */
144     public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
145     /**
146      * Surrogate minimum value
147      * @stable ICU 2.1
148      */
149     public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
150     /**
151      * Maximum surrogate value
152      * @stable ICU 2.1
153      */
154     public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE;
155 
156     // constructor --------------------------------------------------------
157 
158     ///CLOVER:OFF
159     /**
160      * Prevent instance from being created.
161      */
162     private UTF16()
163     {
164     }
165     ///CLOVER:ON
166     // public method ------------------------------------------------------
167 
168     /**
169      * Extract a single UTF-32 value from a string.
170      * Used when iterating forwards or backwards (with
171      * <code>UTF16.getCharCount()</code>, as well as random access. If a
172      * validity check is required, use
173      * <code><a HREF="../lang/UCharacter.html#isLegal(char)">
174      * UCharacter.isLegal()</a></code> on the return value.
175      * If the char retrieved is part of a surrogate pair, its supplementary
176      * character will be returned. If a complete supplementary character is
177      * not found the incomplete character will be returned
178      * @param source array of UTF-16 chars
179      * @param offset16 UTF-16 offset to the start of the character.
180      * @return UTF-32 value for the UTF-32 value that contains the char at
181      *         offset16. The boundaries of that codepoint are the same as in
182      *         <code>bounds32()</code>.
183      * @exception IndexOutOfBoundsException thrown if offset16 is out of
184      *            bounds.
185      * @stable ICU 2.1
186      */
187     public static int charAt(String   source, int offset16)
188     {
189         char single = source.charAt(offset16);
190         if (single < LEAD_SURROGATE_MIN_VALUE) {
191             return single;
192         }
193         return _charAt(source, offset16, single);
194     }
195 
196     private static int _charAt(String   source, int offset16, char single)
197     {
198         if (single > TRAIL_SURROGATE_MAX_VALUE) {
199             return single;
200         }
201 
202         // Convert the UTF-16 surrogate pair if necessary.
203         // For simplicity in usage, and because the frequency of pairs is
204         // low, look both directions.
205 
206         if (single <= LEAD_SURROGATE_MAX_VALUE) {
207             ++ offset16;
208             if (source.length() != offset16) {
209                 char trail = source.charAt(offset16);
210                 if (trail >= TRAIL_SURROGATE_MIN_VALUE &&
211                     trail <= TRAIL_SURROGATE_MAX_VALUE) {
212                     return UCharacterProperty.getRawSupplementary(single,
213                                                                   trail);
214                 }
215             }
216         }
217         else
218             {
219                 -- offset16;
220                 if (offset16 >= 0) {
221                     // single is a trail surrogate so
222                     char lead = source.charAt(offset16);
223                     if (lead >= LEAD_SURROGATE_MIN_VALUE &&
224                         lead <= LEAD_SURROGATE_MAX_VALUE) {
225                         return UCharacterProperty.getRawSupplementary(lead,
226                                                                       single);
227                     }
228                 }
229             }
230         return single; // return unmatched surrogate
231     }
232 
233 //#ifndef FOUNDATION
234 //##    /**
235 //##     * Extract a single UTF-32 value from a string.
236 //##     * Used when iterating forwards or backwards (with
237 //##     * <code>UTF16.getCharCount()</code>, as well as random access. If a
238 //##     * validity check is required, use
239 //##     * <code><a HREF="../lang/UCharacter.html#isLegal(char)">
240 //##     * UCharacter.isLegal()</a></code> on the return value.
241 //##     * If the char retrieved is part of a surrogate pair, its supplementary
242 //##     * character will be returned. If a complete supplementary character is
243 //##     * not found the incomplete character will be returned
244 //##     * @param source array of UTF-16 chars
245 //##     * @param offset16 UTF-16 offset to the start of the character.
246 //##     * @return UTF-32 value for the UTF-32 value that contains the char at
247 //##     *         offset16. The boundaries of that codepoint are the same as in
248 //##     *         <code>bounds32()</code>.
249 //##     * @exception IndexOutOfBoundsException thrown if offset16 is out of
250 //##     *            bounds.
251 //##     * @stable ICU 2.1
252 //##     */
253 //##    public static int charAt(CharSequence source, int offset16)
254 //##    {
255 //##        char single = source.charAt(offset16);
256 //##        if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
257 //##            return single;
258 //##        }
259 //##        return _charAt(source, offset16, single);
260 //##    }
261 //##
262 //##    private static int _charAt(CharSequence source, int offset16, char single)
263 //##    {
264 //##        if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
265 //##            return single;
266 //##        }
267 //##
268 //##        // Convert the UTF-16 surrogate pair if necessary.
269 //##        // For simplicity in usage, and because the frequency of pairs is
270 //##        // low, look both directions.
271 //##
272 //##        if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
273 //##            ++ offset16;
274 //##            if (source.length() != offset16) {
275 //##                char trail = source.charAt(offset16);
276 //##                if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE &&
277 //##                    trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
278 //##                    return UCharacterProperty.getRawSupplementary(single,
279 //##                                                                  trail);
280 //##                }
281 //##            }
282 //##        }
283 //##        else
284 //##            {
285 //##                -- offset16;
286 //##                if (offset16 >= 0) {
287 //##                    // single is a trail surrogate so
288 //##                    char lead = source.charAt(offset16);
289 //##                    if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE &&
290 //##                        lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
291 //##                        return UCharacterProperty.getRawSupplementary(lead,
292 //##                                                                      single);
293 //##                    }
294 //##                }
295 //##            }
296 //##        return single; // return unmatched surrogate
297 //##    }
298 //#endif
299 
300     /**
301      * Extract a single UTF-32 value from a string.
302      * Used when iterating forwards or backwards (with
303      * <code>UTF16.getCharCount()</code>, as well as random access. If a
304      * validity check is required, use
305      * <code><a HREF="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
306      * </a></code> on the return value.
307      * If the char retrieved is part of a surrogate pair, its supplementary
308      * character will be returned. If a complete supplementary character is
309      * not found the incomplete character will be returned
310      * @param source UTF-16 chars string buffer
311      * @param offset16 UTF-16 offset to the start of the character.
312      * @return UTF-32 value for the UTF-32 value that contains the char at
313      *         offset16. The boundaries of that codepoint are the same as in
314      *         <code>bounds32()</code>.
315      * @exception IndexOutOfBoundsException thrown if offset16 is out of
316      *            bounds.
317      * @stable ICU 2.1
318      */
319     public static int charAt(StringBuffer   source, int offset16)
320     {
321         if (offset16 < 0 || offset16 >= source.length()) {
322             throw new StringIndexOutOfBoundsException  (offset16);
323         }
324 
325         char single = source.charAt(offset16);
326         if (!isSurrogate(single)) {
327             return single;
328         }
329 
330         // Convert the UTF-16 surrogate pair if necessary.
331         // For simplicity in usage, and because the frequency of pairs is
332         // low, look both directions.
333 
334         if (single <= LEAD_SURROGATE_MAX_VALUE)
335             {
336                 ++ offset16;
337                 if (source.length() != offset16)
338                     {
339                         char trail = source.charAt(offset16);
340                         if (isTrailSurrogate(trail))
341                             return UCharacterProperty.getRawSupplementary(single, trail);
342                     }
343             }
344         else
345             {
346                 -- offset16;
347                 if (offset16 >= 0)
348                     {
349                         // single is a trail surrogate so
350                         char lead = source.charAt(offset16);
351                         if (isLeadSurrogate(lead)) {
352                             return UCharacterProperty.getRawSupplementary(lead, single);
353                         }
354                     }
355             }
356         return single; // return unmatched surrogate
357     }
358 
359     /**
360      * Extract a single UTF-32 value from a substring.
361      * Used when iterating forwards or backwards (with
362      * <code>UTF16.getCharCount()</code>, as well as random access. If a
363      * validity check is required, use
364      * <code><a HREF="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
365      * </a></code> on the return value.
366      * If the char retrieved is part of a surrogate pair, its supplementary
367      * character will be returned. If a complete supplementary character is
368      * not found the incomplete character will be returned
369      * @param source array of UTF-16 chars
370      * @param start offset to substring in the source array for analyzing
371      * @param limit offset to substring in the source array for analyzing
372      * @param offset16 UTF-16 offset relative to start
373      * @return UTF-32 value for the UTF-32 value that contains the char at
374      *         offset16. The boundaries of that codepoint are the same as in
375      *         <code>bounds32()</code>.
376      * @exception IndexOutOfBoundsException thrown if offset16 is not within
377      *            the range of start and limit.
378      * @stable ICU 2.1
379      */
380     public static int charAt(char source[], int start, int limit,
381                              int offset16)
382     {
383         offset16 += start;
384         if (offset16 < start || offset16 >= limit) {
385             throw new ArrayIndexOutOfBoundsException  (offset16);
386         }
387 
388         char single = source[offset16];
389         if (!isSurrogate(single)) {
390             return single;
391         }
392 
393         // Convert the UTF-16 surrogate pair if necessary.
394         // For simplicity in usage, and because the frequency of pairs is
395         // low, look both directions.
396         if (single <= LEAD_SURROGATE_MAX_VALUE) {
397             offset16 ++;
398             if (offset16 >= limit) {
399                 return single;
400             }
401             char trail = source[offset16];
402             if (isTrailSurrogate(trail)) {
403                 return UCharacterProperty.getRawSupplementary(single, trail);
404             }
405         }
406         else { // isTrailSurrogate(single), so
407             if (offset16 == start) {
408                 return single;
409             }
410             offset16 --;
411             char lead = source[offset16];
412             if (isLeadSurrogate(lead))
413                 return UCharacterProperty.getRawSupplementary(lead, single);
414         }
415         return single; // return unmatched surrogate
416     }
417 
418     /**
419      * Extract a single UTF-32 value from a string.
420      * Used when iterating forwards or backwards (with
421      * <code>UTF16.getCharCount()</code>, as well as random access. If a
422      * validity check is required, use
423      * <code><a HREF="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
424      * </a></code> on the return value.
425      * If the char retrieved is part of a surrogate pair, its supplementary
426      * character will be returned. If a complete supplementary character is
427      * not found the incomplete character will be returned
428      * @param source UTF-16 chars string buffer
429      * @param offset16 UTF-16 offset to the start of the character.
430      * @return UTF-32 value for the UTF-32 value that contains the char at
431      *         offset16. The boundaries of that codepoint are the same as in
432      *         <code>bounds32()</code>.
433      * @exception IndexOutOfBoundsException thrown if offset16 is out of
434      *            bounds.
435      * @stable ICU 2.1
436      */
437     public static int charAt(Replaceable source, int offset16)
438     {
439         if (offset16 < 0 || offset16 >= source.length()) {
440             throw new StringIndexOutOfBoundsException  (offset16);
441         }
442 
443         char single = source.charAt(offset16);
444         if (!isSurrogate(single)) {
445             return single;
446         }
447 
448         // Convert the UTF-16 surrogate pair if necessary.
449         // For simplicity in usage, and because the frequency of pairs is
450         // low, look both directions.
451 
452         if (single <= LEAD_SURROGATE_MAX_VALUE)
453             {
454                 ++ offset16;
455                 if (source.length() != offset16)
456                     {
457                         char trail = source.charAt(offset16);
458                         if (isTrailSurrogate(trail))
459                             return UCharacterProperty.getRawSupplementary(single, trail);
460                     }
461             }
462         else
463             {
464                 -- offset16;
465                 if (offset16 >= 0)
466                     {
467                         // single is a trail surrogate so
468                         char lead = source.charAt(offset16);
469                         if (isLeadSurrogate(lead)) {
470                             return UCharacterProperty.getRawSupplementary(lead, single);
471                         }
472                     }
473             }
474         return single; // return unmatched surrogate
475     }
476 
477     /**
478      * Determines how many chars this char32 requires.
479      * If a validity check is required, use <code>
480      * <a HREF="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
481      * char32 before calling.
482      * @param char32 the input codepoint.
483      * @return 2 if is in supplementary space, otherwise 1.
484      * @stable ICU 2.1
485      */
486     public static int getCharCount(int char32)
487     {
488         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
489             return 1;
490         }
491         return 2;
492     }
493 
494     /**
495      * Returns the type of the boundaries around the char at offset16.
496      * Used for random access.
497      * @param source text to analyse
498      * @param offset16 UTF-16 offset
499      * @return <ul>
500      *           <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are
501      *                                       [offset16, offset16+1]
502      *           <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at
503      *                                          offset16;
504      *                                          the bounds are
505      *                                          [offset16, offset16 + 2]
506      *           <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at
507      *                                           offset16 - 1; the bounds are
508      *                                           [offset16 - 1, offset16 + 1]
509      *         </ul>
510      *         For bit-twiddlers, the return values for these are chosen so
511      *         that the boundaries can be gotten by:
512      *         [offset16 - (value >> 2), offset16 + (value & 3)].
513      * @exception IndexOutOfBoundsException if offset16 is out of bounds.
514      * @stable ICU 2.1
515      */
516     public static int bounds(String   source, int offset16)
517     {
518         char ch = source.charAt(offset16);
519         if (isSurrogate(ch)) {
520             if (isLeadSurrogate(ch))
521                 {
522                     if (++ offset16 < source.length() &&
523                         isTrailSurrogate(source.charAt(offset16))) {
524                         return LEAD_SURROGATE_BOUNDARY;
525                     }
526                 }
527             else {
528                 // isTrailSurrogate(ch), so
529                 -- offset16;
530                 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
531                     return TRAIL_SURROGATE_BOUNDARY;
532                 }
533             }
534         }
535         return SINGLE_CHAR_BOUNDARY;
536     }
537 
538     /**
539      * Returns the type of the boundaries around the char at offset16. Used
540      * for random access.
541      * @param source string buffer to analyse
542      * @param offset16 UTF16 offset
543      * @return
544      *     <ul>
545      *     <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are
546      *                                               [offset16, offset16 + 1]
547      *     <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at
548      *                                    offset16; the bounds are
549      *                                    [offset16, offset16 + 2]
550      *     <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at
551      *                                     offset16 - 1; the bounds are
552      *                                     [offset16 - 1, offset16 + 1]
553      *     </ul>
554      * For bit-twiddlers, the return values for these are chosen so that the
555      * boundaries can be gotten by:
556      *                    [offset16 - (value >> 2), offset16 + (value & 3)].
557      * @exception IndexOutOfBoundsException if offset16 is out of bounds.
558      * @stable ICU 2.1
559      */
560     public static int bounds(StringBuffer   source, int offset16)
561     {
562         char ch = source.charAt(offset16);
563         if (isSurrogate(ch)) {
564             if (isLeadSurrogate(ch))
565                 {
566                     if (++ offset16 < source.length() &&
567                         isTrailSurrogate(source.charAt(offset16))) {
568                         return LEAD_SURROGATE_BOUNDARY;
569                     }
570                 }
571             else {
572                 // isTrailSurrogate(ch), so
573                 -- offset16;
574                 if (offset16 >= 0 &&
575                     isLeadSurrogate(source.charAt(offset16))) {
576                     return TRAIL_SURROGATE_BOUNDARY;
577                 }
578             }
579         }
580         return SINGLE_CHAR_BOUNDARY;
581     }
582 
583     /**
584      * Returns the type of the boundaries around the char at offset16. Used
585      * for random access. Note that the boundaries are determined with respect
586      * to the subarray, hence the char array {0xD800, 0xDC00} has the result
587      * SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1.
588      * @param source char array to analyse
589      * @param start offset to substring in the source array for analyzing
590      * @param limit offset to substring in the source array for analyzing
591      * @param offset16 UTF16 offset relative to start
592      * @return
593      *     <ul>
594      *         <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are
595      *         <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at
596      *                       offset16; the bounds are [offset16, offset16 + 2]
597      *         <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at
598      *               offset16 - 1; the bounds are [offset16 - 1, offset16 + 1]
599      *     </ul>
600      * For bit-twiddlers, the boundary values for these are chosen so that the
601      * boundaries can be gotten by: [offset16 - (boundvalue >> 2), offset16
602      *                                                    + (boundvalue & 3)].
603      * @exception IndexOutOfBoundsException if offset16 is not within the
604      *                                      range of start and limit.
605      * @stable ICU 2.1
606      */
607     public static int bounds(char source[], int start, int limit,
608                              int offset16)
609     {
610         offset16 += start;
611         if (offset16 < start || offset16 >= limit) {
612             throw new ArrayIndexOutOfBoundsException  (offset16);
613         }
614         char ch = source[offset16];
615         if (isSurrogate(ch)) {
616             if (isLeadSurrogate(ch)) {
617                 ++ offset16;
618                 if (offset16 < limit && isTrailSurrogate(source[offset16])) {
619                     return LEAD_SURROGATE_BOUNDARY;
620                 }
621             }
622             else { // isTrailSurrogate(ch), so
623                 -- offset16;
624                 if (offset16 >= start && isLeadSurrogate(source[offset16])) {
625                     return TRAIL_SURROGATE_BOUNDARY;
626                 }
627             }
628         }
629         return SINGLE_CHAR_BOUNDARY;
630     }
631 
632     /**
633      * Determines whether the code value is a surrogate.
634      * @param char16 the input character.
635      * @return true iff the input character is a surrogate.
636      * @stable ICU 2.1
637      */
638     public static boolean isSurrogate(char char16)
639     {
640         return LEAD_SURROGATE_MIN_VALUE <= char16 &&
641             char16 <= TRAIL_SURROGATE_MAX_VALUE;
642     }
643 
644     /**
645      * Determines whether the character is a trail surrogate.
646      * @param char16 the input character.
647      * @return true iff the input character is a trail surrogate.
648      * @stable ICU 2.1
649      */
650     public static boolean isTrailSurrogate(char char16)
651     {
652         return (TRAIL_SURROGATE_MIN_VALUE <= char16 &&
653                 char16 <= TRAIL_SURROGATE_MAX_VALUE);
654     }
655 
656     /**
657      * Determines whether the character is a lead surrogate.
658      * @param char16 the input character.
659      * @return true iff the input character is a lead surrogate
660      * @stable ICU 2.1
661      */
662     public static boolean isLeadSurrogate(char char16)
663     {
664         return LEAD_SURROGATE_MIN_VALUE <= char16 &&
665             char16 <= LEAD_SURROGATE_MAX_VALUE;
666     }
667 
668     /**
669      * Returns the lead surrogate.
670      * If a validity check is required, use
671      * <code><a HREF="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
672      * on char32 before calling.
673      * @param char32 the input character.
674      * @return lead surrogate if the getCharCount(ch) is 2; <br>
675      *         and 0 otherwise (note: 0 is not a valid lead surrogate).
676      * @stable ICU 2.1
677      */
678     public static char getLeadSurrogate(int char32)
679     {
680         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
681             return (char)(LEAD_SURROGATE_OFFSET_ +
682                           (char32 >> LEAD_SURROGATE_SHIFT_));
683         }
684 
685         return 0;
686     }
687 
688     /**
689      * Returns the trail surrogate.
690      * If a validity check is required, use
691      * <code><a HREF="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
692      * on char32 before calling.
693      * @param char32 the input character.
694      * @return the trail surrogate if the getCharCount(ch) is 2; <br>otherwise
695      *         the character itself
696      * @stable ICU 2.1
697      */
698     public static char getTrailSurrogate(int char32)
699     {
700         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
701             return (char)(TRAIL_SURROGATE_MIN_VALUE +
702                           (char32 & TRAIL_SURROGATE_MASK_));
703         }
704 
705         return (char)char32;
706     }
707 
708     /**
709      * Convenience method corresponding to String.valueOf(char). Returns a one
710      * or two char string containing the UTF-32 value in UTF16 format. If a
711      * validity check is required, use
712      * <a HREF="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
713      * char32 before calling.
714      * @param char32 the input character.
715      * @return string value of char32 in UTF16 format
716      * @exception IllegalArgumentException thrown if char32 is a invalid
717      *            codepoint.
718      * @stable ICU 2.1
719      */
720     public static String   valueOf(int char32)
721     {
722         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
723             throw new IllegalArgumentException  ("Illegal codepoint");
724         }
725         return toString(char32);
726     }
727 
728     /**
729      * Convenience method corresponding to String.valueOf(codepoint at
730      * offset16).
731      * Returns a one or two char string containing the UTF-32 value in UTF16
732      * format. If offset16 indexes a surrogate character, the whole
733      * supplementary codepoint will be returned.
734      * If a validity check is required, use
735      * <a HREF="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the
736      * codepoint at offset16 before calling.
737      * The result returned will be a newly created String obtained by calling
738      * source.substring(..) with the appropriate indexes.
739      * @param source the input string.
740      * @param offset16 the UTF16 index to the codepoint in source
741      * @return string value of char32 in UTF16 format
742      * @stable ICU 2.1
743      */
744     public static String   valueOf(String   source, int offset16)
745     {
746         switch (bounds(source, offset16)) {
747         case LEAD_SURROGATE_BOUNDARY:
748             return source.substring(offset16, offset16 + 2);
749         case TRAIL_SURROGATE_BOUNDARY:
750             return source.substring(offset16 - 1, offset16 + 1);
751         default: return source.substring(offset16, offset16 + 1);
752         }
753     }
754 
755     /**
756      * Convenience method corresponding to
757      * StringBuffer.valueOf(codepoint at offset16).
758      * Returns a one or two char string containing the UTF-32 value in UTF16
759      * format. If offset16 indexes a surrogate character, the whole
760      * supplementary codepoint will be returned.
761      * If a validity check is required, use
762      * <a HREF="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the
763      * codepoint at offset16 before calling.
764      * The result returned will be a newly created String obtained by calling
765      * source.substring(..) with the appropriate indexes.
766      * @param source the input string buffer.
767      * @param offset16 the UTF16 index to the codepoint in source
768      * @return string value of char32 in UTF16 format
769      * @stable ICU 2.1
770      */
771     public static String   valueOf(StringBuffer   source, int offset16)
772     {
773         switch (bounds(source, offset16)) {
774         case LEAD_SURROGATE_BOUNDARY:
775             return source.substring(offset16, offset16 + 2);
776         case TRAIL_SURROGATE_BOUNDARY:
777             return source.substring(offset16 - 1, offset16 + 1);
778         default: return source.substring(offset16, offset16 + 1);
779         }
780     }
781 
782     /**
783      * Convenience method.
784      * Returns a one or two char string containing the UTF-32 value in UTF16
785      * format. If offset16 indexes a surrogate character, the whole
786      * supplementary codepoint will be returned, except when either the
787      * leading or trailing surrogate character lies out of the specified
788      * subarray. In the latter case, only the surrogate character within
789      * bounds will be returned.
790      * If a validity check is required, use
791      * <a HREF="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the
792      * codepoint at offset16 before calling.
793      * The result returned will be a newly created String containing the
794      * relevant characters.
795      * @param source the input char array.
796      * @param start start index of the subarray
797      * @param limit end index of the subarray
798      * @param offset16 the UTF16 index to the codepoint in source relative to
799      *        start
800      * @return string value of char32 in UTF16 format
801      * @stable ICU 2.1
802      */
803     public static String   valueOf(char source[], int start, int limit,
804                                  int offset16)
805     {
806         switch (bounds(source, start, limit, offset16)) {
807         case LEAD_SURROGATE_BOUNDARY:
808             return new String  (source, start + offset16, 2);
809         case TRAIL_SURROGATE_BOUNDARY:
810             return new String  (source, start + offset16 - 1, 2);
811         }
812         return new String  (source, start + offset16, 1);
813     }
814 
815     /**
816      * Returns the UTF-16 offset that corresponds to a UTF-32 offset.
817      * Used for random access. See the <a name="_top_">class description</a>
818      * for notes on roundtripping.
819      * @param source the UTF-16 string
820      * @param offset32 UTF-32 offset
821      * @return UTF-16 offset
822      * @exception IndexOutOfBoundsException if offset32 is out of bounds.
823      * @stable ICU 2.1
824      */
825     public static int findOffsetFromCodePoint(String   source, int offset32)
826     {
827         char ch;
828         int size = source.length(),
829             result = 0,
830             count = offset32;
831         if (offset32 < 0 || offset32 > size) {
832             throw new StringIndexOutOfBoundsException  (offset32);
833         }
834         while (result < size && count > 0)
835             {
836                 ch = source.charAt(result);
837                 if (isLeadSurrogate(ch) && ((result + 1) < size) &&
838                     isTrailSurrogate(source.charAt(result + 1))) {
839                     result ++;
840                 }
841 
842                 count --;
843                 result ++;
844             }
845         if (count != 0) {
846             throw new StringIndexOutOfBoundsException  (offset32);
847         }
848         return result;
849     }
850 
851     /**
852      * Returns the UTF-16 offset that corresponds to a UTF-32 offset.
853      * Used for random access. See the <a name="_top_">class description</a>
854      * for notes on roundtripping.
855      * @param source the UTF-16 string buffer
856      * @param offset32 UTF-32 offset
857      * @return UTF-16 offset
858      * @exception IndexOutOfBoundsException if offset32 is out of bounds.
859      * @stable ICU 2.1
860      */
861     public static int findOffsetFromCodePoint(StringBuffer   source,
862                                               int offset32)
863     {
864         char ch;
865         int size = source.length(),
866             result = 0,
867             count = offset32;
868         if (offset32 < 0 || offset32 > size) {
869             throw new StringIndexOutOfBoundsException  (offset32);
870         }
871         while (result < size && count > 0)
872             {
873                 ch = source.charAt(result);
874                 if (isLeadSurrogate(ch) && ((result + 1) < size) &&
875                     isTrailSurrogate(source.charAt(result + 1))) {
876                     result ++;
877                 }
878 
879                 count --;
880                 result ++;
881             }
882         if (count != 0) {
883             throw new StringIndexOutOfBoundsException  (offset32);
884         }
885         return result;
886     }
887 
888     /**
889      * Returns the UTF-16 offset that corresponds to a UTF-32 offset.
890      * Used for random access. See the <a name="_top_">class description</a>
891      * for notes on roundtripping.
892      * @param source the UTF-16 char array whose substring is to be analysed
893      * @param start offset of the substring to be analysed
894      * @param limit offset of the substring to be analysed
895      * @param offset32 UTF-32 offset relative to start
896      * @return UTF-16 offset relative to start
897      * @exception IndexOutOfBoundsException if offset32 is out of bounds.
898      * @stable ICU 2.1
899      */
900     public static int findOffsetFromCodePoint(char source[], int start,
901                                               int limit, int offset32)
902     {
903         char ch;
904         int result = start,
905             count = offset32;
906         if (offset32 > limit - start) {
907             throw new ArrayIndexOutOfBoundsException  (offset32);
908         }
909         while (result < limit && count > 0)
910             {
911                 ch = source[result];
912                 if (isLeadSurrogate(ch) && ((result + 1) < limit) &&
913                     isTrailSurrogate(source[result + 1])) {
914                     result ++;
915                 }
916 
917                 count --;
918                 result ++;
919             }
920         if (count != 0) {
921             throw new ArrayIndexOutOfBoundsException  (offset32);
922         }
923         return result - start;
924     }
925 
926     /**
927      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at
928      * or after the given UTF-16 offset. Used for random access. See the
929      * <a name="_top_">class description</a> for notes on roundtripping.<br>
930      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair,
931      * then the UTF-32 offset of the <strong>lead</strong> of the pair is
932      * returned.
933      * </i>
934      * <p>
935      * To find the UTF-32 length of a string, use:
936      *   <pre>
937      *     len32 = countCodePoint(source, source.length());
938      *   </pre>
939      * </p>
940      * <p>
941      * @param source text to analyse
942      * @param offset16 UTF-16 offset < source text length.
943      * @return UTF-32 offset
944      * @exception IndexOutOfBoundsException if offset16 is out of bounds.
945      * @stable ICU 2.1
946      */
947     public static int findCodePointOffset(String   source, int offset16)
948     {
949         if (offset16 < 0 || offset16 > source.length()) {
950             throw new StringIndexOutOfBoundsException  (offset16);
951         }
952 
953         int result = 0;
954         char ch;
955         boolean hadLeadSurrogate = false;
956 
957         for (int i = 0; i < offset16; ++ i)
958             {
959                 ch = source.charAt(i);
960                 if (hadLeadSurrogate && isTrailSurrogate(ch)) {
961                     hadLeadSurrogate = false;           // count valid trail as zero
962                 }
963                 else
964                     {
965                         hadLeadSurrogate = isLeadSurrogate(ch);
966                         ++ result;                          // count others as 1
967                     }
968             }
969 
970         if (offset16 == source.length()) {
971             return result;
972         }
973 
974         // end of source being the less significant surrogate character
975         // shift result back to the start of the supplementary character
976         if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
977             result --;
978         }
979 
980         return result;
981     }
982 
983     /**
984      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at
985      * the given UTF-16 offset. Used for random access. See the
986      * <a name="_top_">class description</a> for notes on roundtripping.<br>
987      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair,
988      * then the UTF-32 offset of the <strong>lead</strong> of the pair is
989      * returned.
990      * </i>
991      * <p>
992      * To find the UTF-32 length of a string, use:
993      *   <pre>
994      *     len32 = countCodePoint(source);
995      *   </pre>
996      * </p>
997      * <p>
998      * @param source text to analyse
999      * @param offset16 UTF-16 offset < source text length.
1000     * @return UTF-32 offset
1001     * @exception IndexOutOfBoundsException if offset16 is out of bounds.
1002     * @stable ICU 2.1
1003     */
1004    public static int findCodePointOffset(StringBuffer   source, int offset16)
1005    {
1006        if (offset16 < 0 || offset16 > source.length()) {
1007            throw new StringIndexOutOfBoundsException  (offset16);
1008        }
1009
1010        int result = 0;
1011        char ch;
1012        boolean hadLeadSurrogate = false;
1013
1014        for (int i = 0; i < offset16; ++ i)
1015            {
1016                ch = source.charAt(i);
1017                if (hadLeadSurrogate && isTrailSurrogate(ch)) {
1018                    hadLeadSurrogate = false;           // count valid trail as zero
1019                }
1020                else
1021                    {
1022                        hadLeadSurrogate = isLeadSurrogate(ch);
1023                        ++ result;                          // count others as 1
1024                    }
1025            }
1026
1027        if (offset16 == source.length()) {
1028            return result;
1029        }
1030
1031        // end of source being the less significant surrogate character
1032        // shift result back to the start of the supplementary character
1033        if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16))))
1034            {
1035                result --;
1036            }
1037
1038        return result;
1039    }
1040
1041    /**
1042     * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at
1043     * the given UTF-16 offset. Used for random access. See the
1044     * <a name="_top_">class description</a> for notes on roundtripping.<br>
1045     * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair,
1046     * then the UTF-32 offset of the <strong>lead</strong> of the pair is
1047     * returned.
1048     * </i>
1049     * <p>
1050     * To find the UTF-32 length of a substring, use:
1051     *   <pre>
1052     *     len32 = countCodePoint(source, start, limit);
1053     *   </pre>
1054     * </p>
1055     * <p>
1056     * @param source text to analyse
1057     * @param start offset of the substring
1058     * @param limit offset of the substring
1059     * @param offset16 UTF-16 relative to start
1060     * @return UTF-32 offset relative to start
1061     * @exception IndexOutOfBoundsException if offset16 is not within the
1062     *            range of start and limit.
1063     * @stable ICU 2.1
1064     */
1065    public static int findCodePointOffset(char source[], int start, int limit,
1066                                          int offset16)
1067    {
1068        offset16 += start;
1069        if (offset16 > limit) {
1070            throw new StringIndexOutOfBoundsException  (offset16);
1071        }
1072
1073        int result = 0;
1074        char ch;
1075        boolean hadLeadSurrogate = false;
1076
1077        for (int i = start; i < offset16; ++ i)
1078            {
1079                ch = source[i];
1080                if (hadLeadSurrogate && isTrailSurrogate(ch)) {
1081                    hadLeadSurrogate = false; // count valid trail as zero
1082                }
1083                else
1084                    {
1085                        hadLeadSurrogate = isLeadSurrogate(ch);
1086                        ++ result;                          // count others as 1
1087                    }
1088            }
1089
1090        if (offset16 == limit) {
1091            return result;
1092        }
1093
1094        // end of source being the less significant surrogate character
1095        // shift result back to the start of the supplementary character
1096        if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) {
1097            result --;
1098        }
1099
1100        return result;
1101    }
1102
1103    /**
1104     * Append a single UTF-32 value to the end of a StringBuffer.
1105     * If a validity check is required, use
1106     * <a HREF="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
1107     * char32 before calling.
1108     * @param target the buffer to append to
1109     * @param char32 value to append.
1110     * @return the updated StringBuffer
1111     * @exception IllegalArgumentException thrown when char32 does not lie
1112     *            within the range of the Unicode codepoints
1113     * @stable ICU 2.1
1114     */
1115    public static StringBuffer   append(StringBuffer   target, int char32)
1116    {
1117        // Check for irregular values
1118        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1119            throw new IllegalArgumentException  ("Illegal codepoint: " + Integer.toHexString(char32));
1120        }
1121
1122        // Write the UTF-16 values
1123        if (char32 >= SUPPLEMENTARY_MIN_VALUE)
1124            {
1125                target.append(getLeadSurrogate(char32));
1126                target.append(getTrailSurrogate(char32));
1127            }
1128        else {
1129            target.append((char)char32);
1130        }
1131        return target;
1132    }
1133
1134    /**
1135     * Cover JDK 1.5 APIs.  Append the code point to the buffer and return the buffer
1136     * as a convenience.
1137     *
1138     * @param target the buffer to append to
1139     * @param cp the code point to append
1140     * @return the updated StringBuffer
1141     * @throws IllegalArgumentException if cp is not a valid code point
1142     * @stable ICU 3.0
1143     */
1144    public static StringBuffer   appendCodePoint(StringBuffer   target, int cp) {
1145        return append(target, cp);
1146    }
1147
1148    /**
1149     * Adds a codepoint to offset16 position of the argument char array.
1150     * @param target char array to be append with the new code point
1151     * @param limit UTF16 offset which the codepoint will be appended.
1152     * @param char32 code point to be appended
1153     * @return offset after char32 in the array.
1154     * @exception IllegalArgumentException thrown if there is not enough
1155     *            space for the append, or when char32 does not lie within
1156     *            the range of the Unicode codepoints.
1157     * @stable ICU 2.1
1158     */
1159    public static int append(char[] target, int limit, int char32)
1160    {
1161        // Check for irregular values
1162        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1163            throw new IllegalArgumentException  ("Illegal codepoint");
1164        }
1165        // Write the UTF-16 values
1166        if (char32 >= SUPPLEMENTARY_MIN_VALUE)
1167            {
1168                target[limit ++] = getLeadSurrogate(char32);
1169                target[limit ++] = getTrailSurrogate(char32);
1170            }
1171        else {
1172            target[limit ++] = (char)char32;
1173        }
1174        return limit;
1175    }
1176
1177    /**
1178     * Number of codepoints in a UTF16 String
1179     * @param source UTF16 string
1180     * @return number of codepoint in string
1181     * @stable ICU 2.1
1182     */
1183    public static int countCodePoint(String   source)
1184    {
1185        if (source == null || source.length() == 0) {
1186            return 0;
1187        }
1188        return findCodePointOffset(source, source.length());
1189    }
1190
1191    /**
1192     * Number of codepoints in a UTF16 String buffer
1193     * @param source UTF16 string buffer
1194     * @return number of codepoint in string
1195     * @stable ICU 2.1
1196     */
1197    public static int countCodePoint(StringBuffer   source)
1198    {
1199        if (source == null || source.length() == 0) {
1200            return 0;
1201        }
1202        return findCodePointOffset(source, source.length());
1203    }
1204
1205    /**
1206     * Number of codepoints in a UTF16 char array substring
1207     * @param source UTF16 char array
1208     * @param start offset of the substring
1209     * @param limit offset of the substring
1210     * @return number of codepoint in the substring
1211     * @exception IndexOutOfBoundsException if start and limit are not valid.
1212     * @stable ICU 2.1
1213     */
1214    public static int countCodePoint(char source[], int start, int limit)
1215    {
1216        if (source == null || source.length == 0) {
1217            return 0;
1218        }
1219        return findCodePointOffset(source, start, limit, limit - start);
1220    }
1221
1222    /**
1223     * Set a code point into a UTF16 position.
1224     * Adjusts target according if we are replacing a non-supplementary
1225     * codepoint with a supplementary and vice versa.
1226     * @param target stringbuffer
1227     * @param offset16 UTF16 position to insert into
1228     * @param char32 code point
1229     * @stable ICU 2.1
1230     */
1231    public static void setCharAt(StringBuffer   target, int offset16,
1232                                 int char32)
1233    {
1234        int count = 1;
1235        char single = target.charAt(offset16);
1236
1237        if (isSurrogate(single))
1238            {
1239                // pairs of the surrogate with offset16 at the lead char found
1240                if (isLeadSurrogate(single) && (target.length() > offset16 + 1)
1241                    && isTrailSurrogate(target.charAt(offset16 + 1))) {
1242                    count ++;
1243                }
1244                else {
1245                    // pairs of the surrogate with offset16 at the trail char
1246                    // found
1247                    if (isTrailSurrogate(single) && (offset16 > 0) &&
1248                        isLeadSurrogate(target.charAt(offset16 -1)))
1249                        {
1250                            offset16 --;
1251                            count ++;
1252                        }
1253                }
1254            }
1255        target.replace(offset16, offset16 + count, valueOf(char32));
1256    }
1257
1258    /**
1259     * Set a code point into a UTF16 position in a char array.
1260     * Adjusts target according if we are replacing a non-supplementary
1261     * codepoint with a supplementary and vice versa.
1262     * @param target char array
1263     * @param limit numbers of valid chars in target, different from
1264     *        target.length. limit counts the number of chars in target
1265     *        that represents a string, not the size of array target.
1266     * @param offset16 UTF16 position to insert into
1267     * @param char32 code point
1268     * @return new number of chars in target that represents a string
1269     * @exception IndexOutOfBoundsException if offset16 is out of range
1270     * @stable ICU 2.1
1271     */
1272    public static int setCharAt(char target[], int limit,
1273                                int offset16, int char32)
1274    {
1275        if (offset16 >= limit) {
1276            throw new ArrayIndexOutOfBoundsException  (offset16);
1277        }
1278        int count = 1;
1279        char single = target[offset16];
1280
1281        if (isSurrogate(single))
1282            {
1283                // pairs of the surrogate with offset16 at the lead char found
1284                if (isLeadSurrogate(single) && (target.length > offset16 + 1) &&
1285                    isTrailSurrogate(target[offset16 + 1])) {
1286                    count ++;
1287                }
1288                else {
1289                    // pairs of the surrogate with offset16 at the trail char
1290                    // found
1291                    if (isTrailSurrogate(single) && (offset16 > 0) &&
1292                        isLeadSurrogate(target[offset16 -1]))
1293                        {
1294                            offset16 --;
1295                            count ++;
1296                        }
1297                }
1298            }
1299
1300        String   str = valueOf(char32);
1301        int result = limit;
1302        int strlength = str.length();
1303        target[offset16] = str.charAt(0);
1304        if (count == strlength) {
1305            if (count == 2) {
1306                target[offset16 + 1] = str.charAt(1);
1307            }
1308        }
1309        else {
1310            // this is not exact match in space, we'll have to do some
1311            // shifting
1312            System.arraycopy(target, offset16 + count, target,
1313                             offset16 + strlength, limit - (offset16 + count));
1314            if (count < strlength) {
1315                // char32 is a supplementary character trying to squeeze into
1316                // a non-supplementary space
1317                target[offset16 + 1] = str.charAt(1);
1318                result ++;
1319                if (result < target.length) {
1320                    target[result] = 0;
1321                }
1322            }
1323            else {
1324                // char32 is a non-supplementary character trying to fill
1325                // into a supplementary space
1326                result --;
1327                target[result] = 0;
1328            }
1329        }
1330        return result;
1331    }
1332
1333    /**
1334     * Shifts offset16 by the argument number of codepoints
1335     * @param source string
1336     * @param offset16 UTF16 position to shift
1337     * @param shift32 number of codepoints to shift
1338     * @return new shifted offset16
1339     * @exception IndexOutOfBoundsException if the new offset16 is out of
1340     *                                      bounds.
1341     * @stable ICU 2.1
1342     */
1343    public static int moveCodePointOffset(String   source, int offset16,
1344                                          int shift32)
1345    {
1346        int result = offset16;
1347        int size = source.length();
1348        int count;
1349        char ch;
1350        if (offset16<0 || offset16>size) {
1351            throw new StringIndexOutOfBoundsException  (offset16);
1352        }
1353        if (shift32 > 0 ) {
1354            if (shift32 + offset16 > size) {
1355                throw new StringIndexOutOfBoundsException  (offset16);
1356            }
1357            count = shift32;
1358            while (result < size && count > 0)
1359            {
1360                ch = source.charAt(result);
1361                if (isLeadSurrogate(ch) && ((result + 1) < size) &&
1362                        isTrailSurrogate(source.charAt(result + 1))) {
1363                    result ++;
1364                }
1365                count --;
1366                result ++;
1367            }
1368        } else {
1369            if (offset16 + shift32 < 0) {
1370                throw new StringIndexOutOfBoundsException  (offset16);
1371            }
1372            for (count=-shift32; count>0; count--) {
1373                result--;
1374                if (result<0) {
1375                    break;
1376                }
1377                ch = source.charAt(result);
1378                if (isTrailSurrogate(ch) && result>0 && isLeadSurrogate(source.charAt(result-1))) {
1379                    result--;
1380                }
1381            }
1382        }
1383        if (count != 0)  {
1384            throw new StringIndexOutOfBoundsException  (shift32);
1385        }
1386        return result;
1387    }
1388
1389    /**
1390     * Shifts offset16 by the argument number of codepoints
1391     * @param source string buffer
1392     * @param offset16 UTF16 position to shift
1393     * @param shift32 number of codepoints to shift
1394     * @return new shifted offset16
1395     * @exception IndexOutOfBoundsException if the new offset16 is out of
1396     *                                      bounds.
1397     * @stable ICU 2.1
1398     */
1399    public static int moveCodePointOffset(StringBuffer   source, int offset16,
1400                                          int shift32)
1401    {
1402        int result = offset16;
1403        int size = source.length();
1404        int count;
1405        char ch;
1406        if (offset16<0 || offset16>size) {
1407            throw new StringIndexOutOfBoundsException  (offset16);
1408        }
1409        if (shift32 > 0 ) {
1410            if (shift32 + offset16 > size) {
1411                throw new StringIndexOutOfBoundsException  (offset16);
1412            }
1413            count = shift32;
1414            while (result < size && count > 0)
1415            {
1416                ch = source.charAt(result);
1417                if (isLeadSurrogate(ch) && ((result + 1) < size) &&
1418                        isTrailSurrogate(source.charAt(result + 1))) {
1419                    result ++;
1420                }
1421                count --;
1422                result ++;
1423            }
1424        } else {
1425            if (offset16 + shift32 < 0) {
1426                throw new StringIndexOutOfBoundsException  (offset16);
1427            }
1428            for (count=-shift32; count>0; count--) {
1429                result--;
1430                if (result<0) {
1431                    break;
1432                }
1433                ch = source.charAt(result);
1434                if (isTrailSurrogate(ch) && result>0 && isLeadSurrogate(source.charAt(result-1))) {
1435                    result--;
1436                }
1437            }
1438        }
1439        if (count != 0)  {
1440            throw new StringIndexOutOfBoundsException  (shift32);
1441        }
1442        return result;
1443    }
1444
1445    /**
1446     * Shifts offset16 by the argument number of codepoints within a subarray.
1447     * @param source char array
1448     * @param start position of the subarray to be performed on
1449     * @param limit position of the subarray to be performed on
1450     * @param offset16 UTF16 position to shift relative to start
1451     * @param shift32 number of codepoints to shift
1452     * @return new shifted offset16 relative to start
1453     * @exception IndexOutOfBoundsException if the new offset16 is out of
1454     *            bounds with respect to the subarray or the subarray bounds
1455     *            are out of range.
1456     * @stable ICU 2.1
1457     */
1458    public static int moveCodePointOffset(char source[], int start, int limit,
1459                                          int offset16, int shift32)
1460    {
1461        int         size = source.length;
1462        int         count;
1463        char        ch;
1464        int         result = offset16 + start;
1465        if (start<0 || limit<start) {
1466            throw new StringIndexOutOfBoundsException  (start);
1467        }
1468        if (limit>size) {
1469            throw new StringIndexOutOfBoundsException  (limit);
1470        }
1471        if (offset16<0 || result>limit) {
1472            throw new StringIndexOutOfBoundsException  (offset16);
1473        }
1474        if (shift32 > 0 ) {
1475            if (shift32 + result > size) {
1476                throw new StringIndexOutOfBoundsException  (result);
1477            }
1478            count = shift32;
1479            while (result < limit && count > 0)
1480            {
1481                ch = source[result];
1482                if (isLeadSurrogate(ch) && (result+1 < limit) &&
1483                        isTrailSurrogate(source[result+1])) {
1484                    result ++;
1485                }
1486                count --;
1487                result ++;
1488            }
1489        } else {
1490            if (result + shift32 < start) {
1491                throw new StringIndexOutOfBoundsException  (result);
1492            }
1493            for (count=-shift32; count>0; count--) {
1494                result--;
1495                if (result<start) {
1496                    break;
1497                }
1498                ch = source[result];
1499                if (isTrailSurrogate(ch) && result>start && isLeadSurrogate(source[result-1])) {
1500                    result--;
1501                }
1502            }
1503        }
1504        if (count != 0)  {
1505            throw new StringIndexOutOfBoundsException  (shift32);
1506        }
1507        result -= start;
1508        return result;
1509    }
1510
1511    /**
1512     * Inserts char32 codepoint into target at the argument offset16.
1513     * If the offset16 is in the middle of a supplementary codepoint, char32
1514     * will be inserted after the supplementary codepoint.
1515     * The length of target increases by one if codepoint is non-supplementary,
1516     * 2 otherwise.
1517     * <p>
1518     * The overall effect is exactly as if the argument were converted to a
1519     * string by the method valueOf(char) and the characters in that string
1520     * were then inserted into target at the position indicated by offset16.
1521     * </p>
1522     * <p>
1523     * The offset argument must be greater than or equal to 0, and less than
1524     * or equal to the length of source.
1525     * @param target string buffer to insert to
1526     * @param offset16 offset which char32 will be inserted in
1527     * @param char32 codepoint to be inserted
1528     * @return a reference to target
1529     * @exception IndexOutOfBoundsException thrown if offset16 is invalid.
1530     * @stable ICU 2.1
1531     */
1532    public static StringBuffer   insert(StringBuffer   target, int offset16,
1533                                      int char32)
1534    {
1535        String   str = valueOf(char32);
1536        if (offset16 != target.length() &&
1537            bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1538            offset16 ++;
1539        }
1540        target.insert(offset16, str);
1541        return target;
1542    }
1543
1544    /**
1545     * Inserts char32 codepoint into target at the argument offset16.
1546     * If the offset16 is in the middle of a supplementary codepoint, char32
1547     * will be inserted after the supplementary codepoint.
1548     * Limit increases by one if codepoint is non-supplementary, 2 otherwise.
1549     * <p>
1550     * The overall effect is exactly as if the argument were converted to a
1551     * string by the method valueOf(char) and the characters in that string
1552     * were then inserted into target at the position indicated by offset16.
1553     * </p>
1554     * <p>
1555     * The offset argument must be greater than or equal to 0, and less than
1556     * or equal to the limit.
1557     * @param target char array to insert to
1558     * @param limit end index of the char array, limit <= target.length
1559     * @param offset16 offset which char32 will be inserted in
1560     * @param char32 codepoint to be inserted
1561     * @return new limit size
1562     * @exception IndexOutOfBoundsException thrown if offset16 is invalid.
1563     * @stable ICU 2.1
1564     */
1565    public static int insert(char target[], int limit, int offset16,
1566                             int char32)
1567    {
1568        String   str = valueOf(char32);
1569        if (offset16 != limit &&
1570            bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1571            offset16 ++;
1572        }
1573        int size = str.length();
1574        if (limit + size > target.length) {
1575            throw new ArrayIndexOutOfBoundsException  (offset16 + size);
1576        }
1577        System.arraycopy(target, offset16, target, offset16 + size,
1578                         limit - offset16);
1579        target[offset16] = str.charAt(0);
1580        if (size == 2) {
1581            target[offset16 + 1] = str.charAt(1);
1582        }
1583        return limit + size;
1584    }
1585
1586    /**
1587     * Removes the codepoint at the specified position in this target
1588     * (shortening target by 1 character if the codepoint is a
1589     * non-supplementary, 2 otherwise).
1590     * @param target string buffer to remove codepoint from
1591     * @param offset16 offset which the codepoint will be removed
1592     * @return a reference to target
1593     * @exception IndexOutOfBoundsException thrown if offset16 is invalid.
1594     * @stable ICU 2.1
1595     */
1596    public static StringBuffer   delete(StringBuffer   target, int offset16)
1597    {
1598        int count = 1;
1599        switch (bounds(target, offset16)) {
1600        case LEAD_SURROGATE_BOUNDARY:
1601            count ++;
1602            break;
1603        case TRAIL_SURROGATE_BOUNDARY:
1604            count ++;
1605            offset16 --;
1606            break;
1607        }
1608        target.delete(offset16, offset16 + count);
1609        return target;
1610    }
1611
1612    /**
1613     * Removes the codepoint at the specified position in this target
1614     * (shortening target by 1 character if the codepoint is a
1615     * non-supplementary, 2 otherwise).
1616     * @param target string buffer to remove codepoint from
1617     * @param limit end index of the char array, limit <= target.length
1618     * @param offset16 offset which the codepoint will be removed
1619     * @return a new limit size
1620     * @exception IndexOutOfBoundsException thrown if offset16 is invalid.
1621     * @stable ICU 2.1
1622     */
1623    public static int delete(char target[], int limit, int offset16)
1624    {
1625        int count = 1;
1626        switch (bounds(target, 0, limit, offset16)) {
1627        case LEAD_SURROGATE_BOUNDARY:
1628            count ++;
1629            break;
1630        case TRAIL_SURROGATE_BOUNDARY:
1631            count ++;
1632            offset16 --;
1633            break;
1634        }
1635        System.arraycopy(target, offset16 + count, target, offset16,
1636                         limit - (offset16 + count));
1637        target[limit - count] = 0;
1638        return limit - count;
1639    }
1640
1641    /**
1642     * Returns the index within the argument UTF16 format Unicode string of
1643     * the first occurrence of the argument codepoint. I.e., the smallest
1644     * index <code>i</code> such that <code>UTF16.charAt(source, i) ==
1645     * char32</code> is true.
1646     * <p>If no such character occurs in this string, then -1 is returned.</p>
1647     * <p>
1648     * Examples:<br>
1649     * UTF16.indexOf("abc", 'a') returns 0<br>
1650     * UTF16.indexOf("abc?", 0x10000) returns 3<br>
1651     * UTF16.indexOf("abc?", 0xd800) returns -1<br>
1652     * </p>
1653     * Note this method is provided as support to jdk 1.3, which does not
1654     * support supplementary characters to its fullest.
1655     * @param source UTF16 format Unicode string that will be searched
1656     * @param char32 codepoint to search for
1657     * @return the index of the first occurrence of the codepoint in the
1658     *         argument Unicode string, or -1 if the codepoint does not occur.
1659     * @stable ICU 2.6
1660     */
1661    public static int indexOf(String   source, int char32)
1662    {
1663        if (char32 < CODEPOINT_MIN_VALUE ||
1664            char32 > CODEPOINT_MAX_VALUE) {
1665            throw new IllegalArgumentException  (
1666                                               "Argument char32 is not a valid codepoint");
1667        }
1668        // non-surrogate bmp
1669        if (char32 < LEAD_SURROGATE_MIN_VALUE ||
1670            (char32 > TRAIL_SURROGATE_MAX_VALUE &&
1671             char32 < SUPPLEMENTARY_MIN_VALUE)) {
1672            return source.indexOf((char)char32);
1673        }
1674        // surrogate
1675        if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1676            int result = source.indexOf((char)char32);
1677            if (result >= 0) {
1678                if (isLeadSurrogate((char)char32) &&
1679                    (result < source.length() - 1) &&
1680                    isTrailSurrogate(source.charAt(result + 1))) {
1681                    return indexOf(source, char32, result + 1);
1682                }
1683                // trail surrogate
1684                if (result > 0 &&
1685                    isLeadSurrogate(source.charAt(result - 1))) {
1686                    return indexOf(source, char32, result + 1);
1687                }
1688            }
1689            return result;
1690        }
1691        // supplementary
1692        String   char32str = toString(char32);
1693        return source.indexOf(char32str);
1694    }
1695
1696    /**
1697     * Returns the index within the argument UTF16 format Unicode string of
1698     * the first occurrence of the argument string str. This method is
1699     * implemented based on codepoints, hence a "lead surrogate character +
1700     * trail surrogate character" is treated as one entity.e
1701     * Hence if the str starts with trail surrogate character at index 0, a
1702     * source with a leading a surrogate character before str found at in
1703     * source will not have a valid match. Vice versa for lead surrogates
1704     * that ends str.
1705     * See example below.
1706     * <p>If no such string str occurs in this source, then -1 is returned.
1707     * </p> <p>
1708     * Examples:<br>
1709     * UTF16.indexOf("abc", "ab") returns 0<br>
1710     * UTF16.indexOf("abc?", "?") returns 3<br>
1711     * UTF16.indexOf("abc?", "?") returns -1<br>
1712     * </p>
1713     * Note this method is provided as support to jdk 1.3, which does not
1714     * support supplementary characters to its fullest.
1715     * @param source UTF16 format Unicode string that will be searched
1716     * @param str UTF16 format Unicode string to search for
1717     * @return the index of the first occurrence of the codepoint in the
1718     *         argument Unicode string, or -1 if the codepoint does not occur.
1719     * @stable ICU 2.6
1720     */
1721    public static int indexOf(String   source, String   str)
1722    {
1723        int strLength = str.length();
1724        // non-surrogate ends
1725        if (!isTrailSurrogate(str.charAt(0)) &&
1726            !isLeadSurrogate(str.charAt(strLength - 1))) {
1727            return source.indexOf(str);
1728        }
1729
1730        int result    = source.indexOf(str);
1731        int resultEnd = result + strLength;
1732        if (result >= 0) {
1733            // check last character
1734            if (isLeadSurrogate(str.charAt(strLength - 1)) &&
1735                (result < source.length() - 1) &&
1736                isTrailSurrogate(source.charAt(resultEnd + 1))) {
1737                return indexOf(source, str, resultEnd + 1);
1738            }
1739            // check first character which is a trail surrogate
1740            if (isTrailSurrogate(str.charAt(0)) && result > 0 &&
1741                isLeadSurrogate(source.charAt(result - 1))) {
1742                return indexOf(source, str, resultEnd + 1);
1743            }
1744        }
1745        return result;
1746    }
1747
1748    /**
1749     * Returns the index within the argument UTF16 format Unicode string of
1750     * the first occurrence of the argument codepoint. I.e., the smallest
1751     * index i such that: <br>
1752     * (UTF16.charAt(source, i) == char32 && i >= fromIndex) is true.
1753     * <p>If no such character occurs in this string, then -1 is returned.</p>
1754     * <p>
1755     * Examples:<br>
1756     * UTF16.indexOf("abc", 'a', 1) returns -1<br>
1757     * UTF16.indexOf("abc?", 0x10000, 1) returns 3<br>
1758     * UTF16.indexOf("abc?", 0xd800, 1) returns -1<br>
1759     * </p>
1760     * Note this method is provided as support to jdk 1.3, which does not
1761     * support supplementary characters to its fullest.
1762     * @param source UTF16 format Unicode string that will be searched
1763     * @param char32 codepoint to search for
1764     * @param fromIndex the index to start the search from.
1765     * @return the index of the first occurrence of the codepoint in the
1766     *         argument Unicode string at or after fromIndex, or -1 if the
1767     *         codepoint does not occur.
1768     * @stable ICU 2.6
1769     */
1770    public static int indexOf(String   source, int char32, int fromIndex)
1771    {
1772        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1773            throw new IllegalArgumentException  (
1774                                               "Argument char32 is not a valid codepoint");
1775        }
1776        // non-surrogate bmp
1777        if (char32 < LEAD_SURROGATE_MIN_VALUE ||
1778            (char32 > TRAIL_SURROGATE_MAX_VALUE &&
1779             char32 < SUPPLEMENTARY_MIN_VALUE)) {
1780            return source.indexOf((char)char32, fromIndex);
1781        }
1782        // surrogate
1783        if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1784            int result = source.indexOf((char)char32, fromIndex);
1785            if (result >= 0) {
1786                if (isLeadSurrogate((char)char32) &&
1787                    (result < source.length() - 1) &&
1788                    isTrailSurrogate(source.charAt(result + 1))) {
1789                    return indexOf(source, char32, result + 1);
1790                }
1791                // trail surrogate
1792                if (result > 0 &&
1793                    isLeadSurrogate(source.charAt(result - 1))) {
1794                    return indexOf(source, char32, result + 1);
1795                }
1796            }
1797            return result;
1798        }
1799        // supplementary
1800        String   char32str = toString(char32);
1801        return source.indexOf(char32str, fromIndex);
1802    }
1803
1804    /**
1805     * Returns the index within the argument UTF16 format Unicode string of
1806     * the first occurrence of the argument string str. This method is
1807     * implemented based on codepoints, hence a "lead surrogate character +
1808     * trail surrogate character" is treated as one entity.e
1809     * Hence if the str starts with trail surrogate character at index 0, a
1810     * source with a leading a surrogate character before str found at in
1811     * source will not have a valid match. Vice versa for lead surrogates
1812     * that ends str.
1813     * See example below.
1814     * <p>If no such string str occurs in this source, then -1 is returned.
1815     * </p> <p>
1816     * Examples:<br>
1817     * UTF16.indexOf("abc", "ab", 0) returns 0<br>
1818     * UTF16.indexOf("abc?", "?", 0) returns 3<br>
1819     * UTF16.indexOf("abc?", "?", 2) returns 3<br>
1820     * UTF16.indexOf("abc?", "?", 0) returns -1<br>
1821     * </p>
1822     * Note this method is provided as support to jdk 1.3, which does not
1823     * support supplementary characters to its fullest.
1824     * @param source UTF16 format Unicode string that will be searched
1825     * @param str UTF16 format Unicode string to search for
1826     * @param fromIndex the index to start the search from.
1827     * @return the index of the first occurrence of the codepoint in the
1828     *         argument Unicode string, or -1 if the codepoint does not occur.
1829     * @stable ICU 2.6
1830     */
1831    public static int indexOf(String   source, String   str, int fromIndex)
1832    {
1833        int strLength = str.length();
1834        // non-surrogate ends
1835        if (!isTrailSurrogate(str.charAt(0)) &&
1836            !isLeadSurrogate(str.charAt(strLength - 1))) {
1837            return source.indexOf(str, fromIndex);
1838        }
1839
1840        int result    = source.indexOf(str, fromIndex);
1841        int resultEnd = result + strLength;
1842        if (result >= 0) {
1843            // check last character
1844            if (isLeadSurrogate(str.charAt(strLength - 1)) &&
1845                (result < source.length() - 1) &&
1846                isTrailSurrogate(source.charAt(resultEnd))) {
1847                return indexOf(source, str, resultEnd + 1);
1848            }
1849            // check first character which is a trail surrogate
1850            if (isTrailSurrogate(str.charAt(0)) && result > 0 &&
1851                isLeadSurrogate(source.charAt(result - 1))) {
1852                return indexOf(source, str, resultEnd + 1);
1853            }
1854        }
1855        return result;
1856    }
1857
1858    /**
1859     * Returns the index within the argument UTF16 format Unicode string of
1860     * the last occurrence of the argument codepoint. I.e., the index returned
1861     * is the largest value i such that: UTF16.charAt(source, i) == char32
1862     * is true.
1863     * <p>
1864     * Examples:<br>
1865     * UTF16.lastIndexOf("abc", 'a') returns 0<br>
1866     * UTF16.lastIndexOf("abc?", 0x10000) returns 3<br>
1867     * UTF16.lastIndexOf("abc?", 0xd800) returns -1<br>
1868     * </p>
1869     * <p>source is searched backwards starting at the last character.</p>
1870     * Note this method is provided as support to jdk 1.3, which does not
1871     * support supplementary characters to its fullest.
1872     * @param source UTF16 format Unicode string that will be searched
1873     * @param char32 codepoint to search for
1874     * @return the index of the last occurrence of the codepoint in source,
1875     *         or -1 if the codepoint does not occur.
1876     * @stable ICU 2.6
1877     */
1878    public static int lastIndexOf(String   source, int char32)
1879    {
1880        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1881            throw new IllegalArgumentException  (
1882                                               "Argument char32 is not a valid codepoint");
1883        }
1884        // non-surrogate bmp
1885        if (char32 < LEAD_SURROGATE_MIN_VALUE ||
1886            (char32 > TRAIL_SURROGATE_MAX_VALUE &&
1887             char32 < SUPPLEMENTARY_MIN_VALUE)) {
1888            return source.lastIndexOf((char)char32);
1889        }
1890        // surrogate
1891        if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1892            int result = source.lastIndexOf((char)char32);
1893            if (result >= 0) {
1894                if (isLeadSurrogate((char)char32) &&
1895                    (result < source.length() - 1) &&
1896                    isTrailSurrogate(source.charAt(result + 1))) {
1897                    return lastIndexOf(source, char32, result - 1);
1898                }
1899                // trail surrogate
1900                if (result > 0 &&
1901                    isLeadSurrogate(source.charAt(result - 1))) {
1902                    return lastIndexOf(source, char32, result - 1);
1903                }
1904            }
1905            return result;
1906        }
1907        // supplementary
1908        String   char32str = toString(char32);
1909        return source.lastIndexOf(char32str);
1910    }
1911
1912    /**
1913     * Returns the index within the argument UTF16 format Unicode string of
1914     * the last occurrence of the argument string str. This method is
1915     * implemented based on codepoints, hence a "lead surrogate character +
1916     * trail surrogate character" is treated as one entity.e
1917     * Hence if the str starts with trail surrogate character at index 0, a
1918     * source with a leading a surrogate character before str found at in
1919     * source will not have a valid match. Vice versa for lead surrogates
1920     * that ends str.
1921     * See example below.
1922     * <p>
1923     * Examples:<br>
1924     * UTF16.lastIndexOf("abc", "a") returns 0<br>
1925     * UTF16.lastIndexOf("abc?", "?") returns 3<br>
1926     * UTF16.lastIndexOf("abc?", "?") returns -1<br>
1927     * </p>
1928     * <p>source is searched backwards starting at the last character.</p>
1929     * Note this method is provided as support to jdk 1.3, which does not
1930     * support supplementary characters to its fullest.
1931     * @param source UTF16 format Unicode string that will be searched
1932     * @param str UTF16 format Unicode string to search for
1933     * @return the index of the last occurrence of the codepoint in source,
1934     *         or -1 if the codepoint does not occur.
1935     * @stable ICU 2.6
1936     */
1937    public static int lastIndexOf(String   source, String   str)
1938    {
1939        int strLength = str.length();
1940        // non-surrogate ends
1941        if (!isTrailSurrogate(str.charAt(0)) &&
1942            !isLeadSurrogate(str.charAt(strLength - 1))) {
1943            return source.lastIndexOf(str);
1944        }
1945
1946        int result    = source.lastIndexOf(str);
1947        if (result >= 0) {
1948            // check last character
1949            if (isLeadSurrogate(str.charAt(strLength - 1)) &&
1950                (result < source.length() - 1) &&
1951                isTrailSurrogate(source.charAt(result + strLength + 1))) {
1952                return lastIndexOf(source, str, result - 1);
1953            }
1954            // check first character which is a trail surrogate
1955            if (isTrailSurrogate(str.charAt(0)) && result > 0 &&
1956                isLeadSurrogate(source.charAt(result - 1))) {
1957                return lastIndexOf(source, str, result - 1);
1958            }
1959        }
1960        return result;
1961    }
1962
1963    /**
1964     * <p>Returns the index within the argument UTF16 format Unicode string of
1965     * the last occurrence of the argument codepoint, where the result is less
1966     * than or equals to fromIndex.</p>
1967     * <p>This method is implemented based on codepoints, hence a single
1968     * surrogate character will not match a supplementary character.</p>
1969     * <p>source is searched backwards starting at the last character starting
1970     * at the specified index.</p>
1971     * <p>
1972     * Examples:<br>
1973     * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br>
1974     * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br>
1975     * UTF16.lastIndexOf("abc?", 0x10000, 5) returns 3<br>
1976     * UTF16.lastIndexOf("abc?", 0x10000, 3) returns 3<br>
1977     * UTF16.lastIndexOf("abc?", 0xd800) returns -1<br>
1978     * </p>
1979     * Note this method is provided as support to jdk 1.3, which does not
1980     * support supplementary characters to its fullest.
1981     * @param source UTF16 format Unicode string that will be searched
1982     * @param char32 codepoint to search for
1983     * @param fromIndex the index to start the search from. There is no
1984     *                  restriction on the value of fromIndex. If it is
1985     *                  greater than or equal to the length of this string,
1986     *                  it has the same effect as if it were equal to one
1987     *                  less than the length of this string: this entire
1988     *                  string may be searched. If it is negative, it has
1989     *                  the same effect as if it were -1: -1 is returned.
1990     * @return the index of the last occurrence of the codepoint in source,
1991     *         or -1 if the codepoint does not occur.
1992     * @stable ICU 2.6
1993     */
1994    public static int lastIndexOf(String   source, int char32, int fromIndex)
1995    {
1996        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1997            throw new IllegalArgumentException  (
1998                                               "Argument char32 is not a valid codepoint");
1999        }
2000        // non-surrogate bmp
2001        if (char32 < LEAD_SURROGATE_MIN_VALUE ||
2002            (char32 > TRAIL_SURROGATE_MAX_VALUE &&
2003             char32 < SUPPLEMENTARY_MIN_VALUE)) {
2004            return source.lastIndexOf((char)char32, fromIndex);
2005        }
2006        // surrogate
2007        if (char32 < SUPPLEMENTARY_MIN_VALUE) {
2008            int result = source.lastIndexOf((char)char32, fromIndex);
2009            if (result >= 0) {
2010                if (isLeadSurrogate((char)char32) &&
2011                    (result < source.length() - 1) &&
2012                    isTrailSurrogate(source.charAt(result + 1))) {
2013                    return lastIndexOf(source, char32, result - 1);
2014                }
2015                // trail surrogate
2016                if (result > 0 &&
2017                    isLeadSurrogate(source.charAt(result - 1))) {
2018                    return lastIndexOf(source, char32, result - 1);
2019                }
2020            }
2021            return result;
2022        }
2023        // supplementary
2024        String   char32str = toString(char32);
2025        return source.lastIndexOf(char32str, fromIndex);
2026    }
2027
2028    /**
2029     * <p>Returns the index within the argument UTF16 format Unicode string of
2030     * the last occurrence of the argument string str, where the result is less
2031     * than or equals to fromIndex.</p>
2032     * <p>This method is implemented based on codepoints, hence a
2033     * "lead surrogate character + trail surrogate character" is treated as one
2034     * entity.
2035     * Hence if the str starts with trail surrogate character at index 0, a
2036     * source with a leading a surrogate character before str found at in
2037     * source will not have a valid match. Vice versa for lead surrogates
2038     * that ends str.
2039     * </p>
2040     * See example below.
2041     * <p>
2042     * Examples:<br>
2043     * UTF16.lastIndexOf("abc", "c", 2) returns 2<br>
2044     * UTF16.lastIndexOf("abc", "c", 1) returns -1<br>
2045     * UTF16.lastIndexOf("abc?", "?", 5) returns 3<br>
2046     * UTF16.lastIndexOf("abc?", "?", 3) returns 3<br>
2047     * UTF16.lastIndexOf("abc?", "?", 4) returns -1<br>
2048     * </p>
2049     * <p>source is searched backwards starting at the last character.</p>
2050     * Note this method is provided as support to jdk 1.3, which does not
2051     * support supplementary characters to its fullest.
2052     * @param source UTF16 format Unicode string that will be searched
2053     * @param str UTF16 format Unicode string to search for
2054     * @param fromIndex the index to start the search from. There is no
2055     *                  restriction on the value of fromIndex. If it is
2056     *                  greater than or equal to the length of this string,
2057     *                  it has the same effect as if it were equal to one
2058     *                  less than the length of this string: this entire
2059     *                  string may be searched. If it is negative, it has
2060     *                  the same effect as if it were -1: -1 is returned.
2061     * @return the index of the last occurrence of the codepoint in source,
2062     *         or -1 if the codepoint does not occur.
2063     * @stable ICU 2.6
2064     */
2065    public static int lastIndexOf(String   source, String   str, int fromIndex)
2066    {
2067        int strLength = str.length();
2068        // non-surrogate ends
2069        if (!isTrailSurrogate(str.charAt(0)) &&
2070            !isLeadSurrogate(str.charAt(strLength - 1))) {
2071            return source.lastIndexOf(str, fromIndex);
2072        }
2073
2074        int result    = source.lastIndexOf(str, fromIndex);
2075        if (result >= 0) {
2076            // check last character
2077            if (isLeadSurrogate(str.charAt(strLength - 1)) &&
2078                (result < source.length() - 1) &&
2079                isTrailSurrogate(source.charAt(result + strLength))) {
2080                return lastIndexOf(source, str, result - 1);
2081            }
2082            // check first character which is a trail surrogate
2083            if (isTrailSurrogate(str.charAt(0)) && result > 0 &&
2084                isLeadSurrogate(source.charAt(result - 1))) {
2085                return lastIndexOf(source, str, result - 1);
2086            }
2087        }
2088        return result;
2089    }
2090
2091    /**
2092     * Returns a new UTF16 format Unicode string resulting from replacing all
2093     * occurrences of oldChar32 in source with newChar32.
2094     * If the character oldChar32 does not occur in the UTF16 format Unicode
2095     * string source, then source will be returned. Otherwise, a new String
2096     * object is created that represents a codepoint sequence identical to the
2097     * codepoint sequence represented by source, except that every occurrence
2098     * of oldChar32 is replaced by an occurrence of newChar32.
2099     * <p>
2100     * Examples: <br>
2101     * UTF16.replace("mesquite in your cellar", 'e', 'o');<br>
2102     *        returns "mosquito in your collar"<br>
2103     * UTF16.replace("JonL", 'q', 'x');<br>
2104     *        returns "JonL" (no change)<br>
2105     * UTF16.replace("Supplementary character ?", 0x10000, '!');
2106     * <br>   returns "Supplementary character !"<br>
2107     * UTF16.replace("Supplementary character ?", 0xd800, '!');
2108     * <br>   returns "Supplementary character ?"<br>
2109     * </p>
2110     * Note this method is provided as support to jdk 1.3, which does not
2111     * support supplementary characters to its fullest.
2112     * @param source UTF16 format Unicode string which the codepoint
2113     *               replacements will be based on.
2114     * @param oldChar32 non-zero old codepoint to be replaced.
2115     * @param newChar32 the new codepoint to replace oldChar32
2116     * @return new String derived from source by replacing every occurrence
2117     *         of oldChar32 with newChar32, unless when no oldChar32 is found
2118     *         in source then source will be returned.
2119     * @stable ICU 2.6
2120     */
2121    public static String   replace(String   source, int oldChar32,
2122                                 int newChar32)
2123    {
2124        if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) {
2125            throw new IllegalArgumentException  (
2126                                               "Argument oldChar32 is not a valid codepoint");
2127        }
2128        if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) {
2129            throw new IllegalArgumentException  (
2130                                               "Argument newChar32 is not a valid codepoint");
2131        }
2132
2133        int index     = indexOf(source, oldChar32);
2134        if (index == -1) {
2135            return source;
2136        }
2137        String         newChar32Str    = toString(newChar32);
2138        int          oldChar32Size   = 1;
2139        int          newChar32Size   = newChar32Str.length();
2140        StringBuffer   result = new StringBuffer  (source);
2141        int          resultIndex     = index;
2142
2143        if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) {
2144            oldChar32Size = 2;
2145        }
2146
2147        while (index != -1) {
2148            int endResultIndex  = resultIndex + oldChar32Size;
2149            result.replace(resultIndex, endResultIndex, newChar32Str);
2150            int lastEndIndex    = index + oldChar32Size;
2151            index       = indexOf(source, oldChar32, lastEndIndex);
2152            resultIndex += newChar32Size + index - lastEndIndex;
2153        }
2154        return result.toString();
2155    }
2156
2157    /**
2158     * Returns a new UTF16 format Unicode string resulting from replacing all
2159     * occurrences of oldStr in source with newStr.
2160     * If the string oldStr does not occur in the UTF16 format Unicode
2161     * string source, then source will be returned. Otherwise, a new String
2162     * object is created that represents a codepoint sequence identical to the
2163     * codepoint sequence represented by source, except that every occurrence
2164     * of oldStr is replaced by an occurrence of newStr.
2165     * <p>
2166     * Examples: <br>
2167     * UTF16.replace("mesquite in your cellar", "e", "o");<br>
2168     *        returns "mosquito in your collar"<br>
2169     * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br>
2170     *        returns "cat in your cellar"<br>
2171     * UTF16.replace("JonL", "q", "x");<br>
2172     *        returns "JonL" (no change)<br>
2173     * UTF16.replace("Supplementary character ?", "?",
2174     *               '!');
2175     * <br>   returns "Supplementary character !"<br>
2176     * UTF16.replace("Supplementary character ?", "?", '!');
2177     * <br>   returns "Supplementary character ?"<br>
2178     * </p>
2179     * Note this method is provided as support to jdk 1.3, which does not
2180     * support supplementary characters to its fullest.
2181     * @param source UTF16 format Unicode string which the
2182     *               replacements will be based on.
2183     * @param oldStr non-zero-length string to be replaced.
2184     * @param newStr the new string to replace oldStr
2185     * @return new String derived from source by replacing every occurrence
2186     *         of oldStr with newStr.  When no oldStr is found
2187     *         in source, then source will be returned.
2188     * @stable ICU 2.6
2189     */
2190    public static String   replace(String   source, String   oldStr,
2191                                 String   newStr)
2192    {
2193        int index     = indexOf(source, oldStr);
2194        if (index == -1) {
2195            return source;
2196        }
2197        int          oldStrSize   = oldStr.length();
2198        int          newStrSize   = newStr.length();
2199        StringBuffer   result       = new StringBuffer  (source);
2200        int          resultIndex     = index;
2201
2202        while (index != -1) {
2203            int endResultIndex  = resultIndex + oldStrSize;
2204            result.replace(resultIndex, endResultIndex, newStr);
2205            int lastEndIndex    = index + oldStrSize;
2206            index       = indexOf(source, oldStr, lastEndIndex);
2207            resultIndex += newStrSize + index - lastEndIndex;
2208        }
2209        return result.toString();
2210    }
2211
2212    /**
2213     * Reverses a UTF16 format Unicode string and replaces source's content
2214     * with it.
2215     * This method will reverse surrogate characters correctly, instead of
2216     * blindly reversing every character.
2217     * <p>
2218     * Examples:<br>
2219     * UTF16.reverse(new StringBuffer(
2220     *             "Supplementary characters ??"))<br>
2221     * returns "?? sretcarahc yratnemelppuS".
2222     * @param source the source StringBuffer that contains UTF16 format
2223     *        Unicode string to be reversed
2224     * @return a modified source with reversed UTF16 format Unicode string.
2225     * @stable ICU 2.6
2226     */
2227    public static StringBuffer   reverse(StringBuffer   source)
2228    {
2229    int length = source.length();
2230    StringBuffer   result = new StringBuffer  (length);
2231    for (int i = length; i-- > 0;) {
2232        char ch = source.charAt(i);
2233        if (isTrailSurrogate(ch) && i > 0) {
2234        char ch2 = source.charAt(i-1);
2235        if (isLeadSurrogate(ch2)) {
2236            result.append(ch2);
2237            result.append(ch);
2238            --i;
2239            continue;
2240        }
2241        }
2242        result.append(ch);
2243    }
2244    return result;
2245    }
2246
2247    /**
2248     * Check if the string contains more Unicode code points than a certain
2249     * number. This is more efficient than counting all code points in the
2250     * entire string and comparing that number with a threshold.
2251     * This function may not need to scan the string at all if the length is
2252     * within a certain range, and never needs to count more than 'number + 1'
2253     * code points. Logically equivalent to (countCodePoint(s) > number). A
2254     * Unicode code point may occupy either one or two code units.
2255     * @param source The input string.
2256     * @param number The number of code points in the string is compared
2257     *               against the 'number' parameter.
2258     * @return boolean value for whether the string contains more Unicode code
2259     *         points than 'number'.
2260     * @stable ICU 2.4
2261     */
2262    public static boolean hasMoreCodePointsThan(String   source, int number)
2263    {
2264        if (number < 0) {
2265            return true;
2266        }
2267        if (source == null) {
2268            return false;
2269        }
2270        int length = source.length();
2271
2272        // length >= 0 known
2273        // source contains at least (length + 1) / 2 code points: <= 2
2274        // chars per cp
2275        if (((length + 1) >> 1) > number) {
2276            return true;
2277        }
2278
2279        // check if source does not even contain enough chars
2280        int maxsupplementary = length - number;
2281        if (maxsupplementary <= 0) {
2282            return false;
2283        }
2284
2285        // there are maxsupplementary = length - number more chars than
2286        // asked-for code points
2287
2288        // count code points until they exceed and also check that there are
2289        // no more than maxsupplementary supplementary code points (char pairs)
2290        int start = 0;
2291        while (true) {
2292            if (length == 0) {
2293                return false;
2294            }
2295            if (number == 0) {
2296                return true;
2297            }
2298            if (isLeadSurrogate(source.charAt(start ++)) && start != length
2299                && isTrailSurrogate(source.charAt(start))) {
2300                start ++;
2301                if (-- maxsupplementary <= 0) {
2302                    // too many pairs - too few code points
2303                    return false;
2304                }
2305            }
2306            -- number;
2307        }
2308    }
2309
2310    /**
2311     * Check if the sub-range of char array, from argument start to limit,
2312     * contains more Unicode code points than a certain
2313     * number. This is more efficient than counting all code points in the
2314     * entire char array range and comparing that number with a threshold.
2315     * This function may not need to scan the char array at all if start and
2316     * limit is within a certain range, and never needs to count more than
2317     * 'number + 1' code points.
2318     * Logically equivalent to (countCodePoint(source, start, limit) > number).
2319     * A Unicode code point may occupy either one or two code units.
2320     * @param source array of UTF-16 chars
2321     * @param start offset to substring in the source array for analyzing
2322     * @param limit offset to substring in the source array for analyzing
2323     * @param number The number of code points in the string is compared
2324     *               against the 'number' parameter.
2325     * @return boolean value for whether the string contains more Unicode code
2326     *         points than 'number'.
2327     * @exception IndexOutOfBoundsException thrown when limit &lt; start
2328     * @stable ICU 2.4
2329     */
2330    public static boolean hasMoreCodePointsThan(char source[], int start,
2331                                                int limit, int number)
2332    {
2333        int length = limit - start;
2334        if (length < 0 || start < 0 || limit < 0) {
2335            throw new IndexOutOfBoundsException  (
2336                                                "Start and limit indexes should be non-negative and start <= limit");
2337        }
2338        if (number < 0) {
2339            return true;
2340        }
2341        if (source == null) {
2342            return false;
2343        }
2344
2345        // length >= 0 known
2346        // source contains at least (length + 1) / 2 code points: <= 2
2347        // chars per cp
2348        if (((length + 1) >> 1) > number) {
2349            return true;
2350        }
2351
2352        // check if source does not even contain enough chars
2353        int maxsupplementary = length - number;
2354        if (maxsupplementary <= 0) {
2355            return false;
2356        }
2357
2358        // there are maxsupplementary = length - number more chars than
2359        // asked-for code points
2360
2361        // count code points until they exceed and also check that there are
2362        // no more than maxsupplementary supplementary code points (char pairs)
2363        while (true) {
2364            if (length == 0) {
2365                return false;
2366            }
2367            if (number == 0) {
2368                return true;
2369            }
2370            if (isLeadSurrogate(source[start ++]) && start != limit
2371                && isTrailSurrogate(source[start])) {
2372                start ++;
2373                if (-- maxsupplementary <= 0) {
2374                    // too many pairs - too few code points
2375                    return false;
2376                }
2377            }
2378            -- number;
2379        }
2380    }
2381
2382    /**
2383     * Check if the string buffer contains more Unicode code points than a
2384     * certain number. This is more efficient than counting all code points in
2385     * the entire string buffer and comparing that number with a threshold.
2386     * This function may not need to scan the string buffer at all if the
2387     * length is within a certain range, and never needs to count more than
2388     * 'number + 1' code points. Logically equivalent to
2389     * (countCodePoint(s) > number). A Unicode code point may occupy either one
2390     * or two code units.
2391     * @param source The input string buffer.
2392     * @param number The number of code points in the string buffer is compared
2393     *               against the 'number' parameter.
2394     * @return boolean value for whether the string buffer contains more
2395     *         Unicode code points than 'number'.
2396     * @stable ICU 2.4
2397     */
2398    public static boolean hasMoreCodePointsThan(StringBuffer   source, int number)
2399    {
2400        if (number < 0) {
2401            return true;
2402        }
2403        if (source == null) {
2404            return false;
2405        }
2406        int length = source.length();
2407
2408        // length >= 0 known
2409        // source contains at least (length + 1) / 2 code points: <= 2
2410        // chars per cp
2411        if (((length + 1) >> 1) > number) {
2412            return true;
2413        }
2414
2415        // check if source does not even contain enough chars
2416        int maxsupplementary = length - number;
2417        if (maxsupplementary <= 0) {
2418            return false;
2419        }
2420
2421        // there are maxsupplementary = length - number more chars than
2422        // asked-for code points
2423
2424        // count code points until they exceed and also check that there are
2425        // no more than maxsupplementary supplementary code points (char pairs)
2426        int start = 0;
2427        while (true) {
2428            if (length == 0) {
2429                return false;
2430            }
2431            if (number == 0) {
2432                return true;
2433            }
2434            if (isLeadSurrogate(source.charAt(start ++)) && start != length
2435                && isTrailSurrogate(source.charAt(start))) {
2436                start ++;
2437                if (-- maxsupplementary <= 0) {
2438                    // too many pairs - too few code points
2439                    return false;
2440                }
2441            }
2442            -- number;
2443        }
2444    }
2445
2446    /**
2447     * Cover JDK 1.5 API.  Create a String from an array of codePoints.
2448     * @param codePoints the code array
2449     * @param offset     the start of the text in the code point array
2450     * @param count      the number of code points
2451     * @return a String representing the code points between offset and count
2452     * @throws IllegalArgumentException if an invalid code point is encountered
2453     * @throws IndexOutOfBoundsException  if the offset or count are out of bounds.
2454     * @stable ICU 3.0
2455     */
2456    public static String   newString(int[] codePoints, int offset, int count) {
2457        if (count < 0) {
2458            throw new IllegalArgumentException  ();
2459        }
2460        char[] chars = new char[count];
2461        int w = 0;
2462        for (int r = offset, e = offset + count; r < e; ++r) {
2463            int cp = codePoints[r];
2464            if (cp < 0 || cp > 0x10ffff) {
2465                throw new IllegalArgumentException  ();
2466            }
2467            while (true) {
2468                try {
2469                    if (cp < 0x010000) {
2470                        chars[w] = (char)cp;
2471                        w++;
2472                    } else {
2473                        chars[w] = (char)(LEAD_SURROGATE_OFFSET_ +
2474                                          (cp >> LEAD_SURROGATE_SHIFT_));
2475                        chars[w+1] = (char)(TRAIL_SURROGATE_MIN_VALUE +
2476                                            (cp & TRAIL_SURROGATE_MASK_));
2477                        w += 2;
2478                    }
2479                    break;
2480                }
2481                catch (IndexOutOfBoundsException   ex) {
2482                    int newlen = (int)(Math.ceil((double)codePoints.length * (w+2) / (r-offset+1)));
2483                    char[] temp = new char[newlen];
2484                    System.arraycopy(chars, 0, temp, 0, w);
2485                    chars = temp;
2486                }
2487            }
2488        }
2489        return new String  (chars, 0, w);
2490    }
2491
2492    /**
2493     * <p>UTF16 string comparator class.
2494     * Allows UTF16 string comparison to be done with the various modes</p>
2495     * <ul>
2496     * <li> Code point comparison or code unit comparison
2497     * <li> Case sensitive comparison, case insensitive comparison or case
2498     *      insensitive comparison with special handling for character 'i'.
2499     * </ul>
2500     * <p>The code unit or code point comparison differ only when comparing
2501     * supplementary code points (&#92;u10000..&#92;u10ffff) to BMP code points
2502     * near the end of the BMP (i.e., &#92;ue000..&#92;uffff). In code unit
2503     * comparison, high BMP code points sort after supplementary code points
2504     * because they are stored as pairs of surrogates which are at
2505     * &#92;ud800..&#92;udfff.</p>
2506     * @see #FOLD_CASE_DEFAULT
2507     * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2508     * @stable ICU 2.1
2509     */
2510    public static final class StringComparator implements java.util.Comparator  
2511    {
2512        // public constructor ------------------------------------------------
2513
2514        /**
2515         * Default constructor that does code unit comparison and case
2516         * sensitive comparison.
2517         * @stable ICU 2.1
2518         */
2519        public StringComparator()
2520        {
2521            this(false, false, FOLD_CASE_DEFAULT);
2522        }
2523
2524        /**
2525         * Constructor that does comparison based on the argument options.
2526         * @param codepointcompare flag to indicate true for code point
2527         *        comparison or false for code unit comparison.
2528         * @param ignorecase false for case sensitive comparison, true for
2529         *        case-insensitive comparison
2530         * @param foldcaseoption FOLD_CASE_DEFAULT or
2531         *        FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only when
2532         *        ignorecase is set to true. If ignorecase is false, this option
2533         *        is ignored.
2534         * @see #FOLD_CASE_DEFAULT
2535         * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2536         * @throws IllegalArgumentException if foldcaseoption is out of range
2537         * @stable ICU 2.4
2538         */
2539        public StringComparator(boolean codepointcompare,
2540                                boolean ignorecase,
2541                                int foldcaseoption)
2542        {
2543            setCodePointCompare(codepointcompare);
2544            m_ignoreCase_ = ignorecase;
2545            if (foldcaseoption < FOLD_CASE_DEFAULT
2546                || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2547                throw new IllegalArgumentException  ("Invalid fold case option");
2548            }
2549            m_foldCase_ = foldcaseoption;
2550        }
2551
2552        // public data member ------------------------------------------------
2553
2554        /**
2555         * <p>Option value for case folding comparison:</p>
2556         * <p>Comparison is case insensitive, strings are folded using default
2557         * mappings defined in Unicode data file CaseFolding.txt, before
2558         * comparison.
2559         * </p>
2560         * @stable ICU 2.4
2561         */
2562        public static final int FOLD_CASE_DEFAULT = 0;
2563        /**
2564         * <p>Option value for case folding comparison:</p>
2565         * <p>Comparison is case insensitive, strings are folded using modified
2566         * mappings defined in Unicode data file CaseFolding.txt, before
2567         * comparison.
2568         * </p>
2569         * <p>The modified set of mappings is provided in a Unicode data file
2570         * CaseFolding.txt to handle dotted I and dotless i appropriately for
2571         * Turkic languages (tr, az).</p>
2572         * <p>Before Unicode 3.2, CaseFolding.txt contains mappings marked with
2573         * 'I' that are to be included for default mappings and excluded for
2574         * the Turkic-specific mappings.</p>
2575         * <p>Unicode 3.2 CaseFolding.txt instead contains mappings marked with
2576         * 'T' that are to be excluded for default mappings and included for
2577         * the Turkic-specific mappings.</p>
2578         * @stable ICU 2.4
2579         */
2580        public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1;
2581
2582        // public methods ----------------------------------------------------
2583
2584        // public setters ----------------------------------------------------
2585
2586        /**
2587         * Sets the comparison mode to code point compare if flag is true.
2588         * Otherwise comparison mode is set to code unit compare
2589         * @param flag true for code point compare, false for code unit compare
2590         * @stable ICU 2.4
2591         */
2592        public void setCodePointCompare(boolean flag)
2593        {
2594            if (flag) {
2595                m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER;
2596            }
2597            else {
2598                m_codePointCompare_ = 0;
2599            }
2600        }
2601
2602        /**
2603         * Sets the Comparator to case-insensitive comparison mode if argument
2604         * is true, otherwise case sensitive comparison mode if set to false.
2605         * @param ignorecase true for case-insitive comparison, false for
2606         *        case sensitive comparison
2607         * @param foldcaseoption FOLD_CASE_DEFAULT or
2608         *        FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only when
2609         *        ignorecase is set to true. If ignorecase is false, this option
2610         *        is ignored.
2611         * @see #FOLD_CASE_DEFAULT
2612         * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2613         * @stable ICU 2.4
2614         */
2615        public void setIgnoreCase(boolean ignorecase, int foldcaseoption)
2616        {
2617            m_ignoreCase_ = ignorecase;
2618            if (foldcaseoption < FOLD_CASE_DEFAULT
2619                || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2620                throw new IllegalArgumentException  ("Invalid fold case option");
2621            }
2622            m_foldCase_ = foldcaseoption;
2623        }
2624
2625        // public getters ----------------------------------------------------
2626
2627        /**
2628         * Checks if the comparison mode is code point compare.
2629         * @return true for code point compare, false for code unit compare
2630         * @stable ICU 2.4
2631         */
2632        public boolean getCodePointCompare()
2633        {
2634            return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2635        }
2636
2637        /**
2638         * Checks if Comparator is in the case insensitive mode.
2639         * @return true if Comparator performs case insensitive comparison,
2640         *         false otherwise
2641         * @stable ICU 2.4
2642         */
2643        public boolean getIgnoreCase()
2644        {
2645            return m_ignoreCase_;
2646        }
2647
2648        /**
2649         * Gets the fold case options set in Comparator to be used with case
2650         * insensitive comparison.
2651         * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I
2652         * @see #FOLD_CASE_DEFAULT
2653         * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2654         * @stable ICU 2.4
2655         */
2656        public int getIgnoreCaseOption()
2657        {
2658            return m_foldCase_;
2659        }
2660
2661        // public other methods ----------------------------------------------
2662
2663        /**
2664         * Compare two strings depending on the options selected during
2665         * construction.
2666         * @param a first source string.
2667         * @param b second source string.
2668         * @return 0 returned if a == b. If a < b, a negative value is returned.
2669         *         Otherwise if a > b, a positive value is returned.
2670         * @exception ClassCastException thrown when either a or b is not a
2671         *            String object
2672         * @stable ICU 2.4
2673         */
2674        public int compare(Object   a, Object   b)
2675        {
2676            String   str1 = (String  )a;
2677            String   str2 = (String  )b;
2678
2679            if (str1 == str2) {
2680                return 0;
2681            }
2682            if (str1 == null) {
2683                return -1;
2684            }
2685            if (str2 == null) {
2686                return 1;
2687            }
2688
2689            if (m_ignoreCase_) {
2690                return compareCaseInsensitive(str1, str2);
2691            }
2692            return compareCaseSensitive(str1, str2);
2693        }
2694
2695        // private data member ----------------------------------------------
2696
2697        /**
2698         * Code unit comparison flag. True if code unit comparison is required.
2699         * False if code point comparison is required.
2700         */
2701        private int m_codePointCompare_;
2702
2703        /**
2704         * Fold case comparison option.
2705         */
2706        private int m_foldCase_;
2707
2708        /**
2709         * Flag indicator if ignore case is to be used during comparison
2710         */
2711        private boolean m_ignoreCase_;
2712
2713        /**
2714         * Code point order offset for surrogate characters
2715         */
2716        private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800;
2717
2718        // private method ---------------------------------------------------
2719
2720        /**
2721         * Compares case insensitive. This is a direct port of ICU4C, to make
2722         * maintainence life easier.
2723         * @param s1 first string to compare
2724         * @param s2 second string to compare
2725         * @return -1 is s1 &lt; s2, 0 if equals,
2726         */
2727        private int compareCaseInsensitive(String   s1, String   s2)
2728        {
2729            return NormalizerImpl.cmpEquivFold(s1, s2,
2730                                               m_foldCase_ | m_codePointCompare_
2731                                               | Normalizer.COMPARE_IGNORE_CASE);
2732        }
2733
2734        /**
2735         * Compares case sensitive. This is a direct port of ICU4C, to make
2736         * maintainence life easier.
2737         * @param s1 first string to compare
2738         * @param s2 second string to compare
2739         * @return -1 is s1 &lt; s2, 0 if equals,
2740         */
2741        private int compareCaseSensitive(String   s1, String   s2)
2742        {
2743            // compare identical prefixes - they do not need to be fixed up
2744            // limit1 = start1 + min(lenght1, length2)
2745            int length1 = s1.length();
2746            int length2 = s2.length();
2747            int minlength = length1;
2748            int result = 0;
2749            if (length1 < length2) {
2750                result = -1;
2751            }
2752            else if (length1 > length2) {
2753                result = 1;
2754                minlength = length2;
2755            }
2756
2757            char c1 = 0;
2758            char c2 = 0;
2759            int index = 0;
2760            for (; index < minlength; index ++) {
2761                c1 = s1.charAt(index);
2762                c2 = s2.charAt(index);
2763                // check pseudo-limit
2764                if (c1 != c2) {
2765                    break;
2766                }
2767            }
2768
2769            if (index == minlength) {
2770                return result;
2771            }
2772
2773            boolean codepointcompare
2774                = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2775            // if both values are in or above the surrogate range, fix them up
2776            if (c1 >= LEAD_SURROGATE_MIN_VALUE
2777                && c2 >= LEAD_SURROGATE_MIN_VALUE && codepointcompare) {
2778                // subtract 0x2800 from BMP code points to make them smaller
2779                // than supplementary ones
2780                if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1
2781                     && isTrailSurrogate(s1.charAt(index + 1)))
2782                    || (isTrailSurrogate(c1) && index != 0
2783                        && isLeadSurrogate(s1.charAt(index - 1)))) {
2784                    // part of a surrogate pair, leave >=d800
2785                }
2786                else {
2787                    // BMP code point - may be surrogate code point - make
2788                    // < d800
2789                    c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2790                }
2791
2792                if ((c2 <= LEAD_SURROGATE_MAX_VALUE
2793                     && (index + 1) != length2
2794                     && isTrailSurrogate(s2.charAt(index + 1))) ||
2795                    (isTrailSurrogate(c2) && index != 0
2796                     && isLeadSurrogate(s2.charAt(index - 1)))) {
2797                    // part of a surrogate pair, leave >=d800
2798                }
2799                else {
2800                    // BMP code point - may be surrogate code point - make <d800
2801                    c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2802                }
2803            }
2804
2805            // now c1 and c2 are in UTF-32-compatible order
2806            return c1 - c2;
2807        }
2808    }
2809
2810    // private data members -------------------------------------------------
2811
2812    /**
2813     * Shift value for lead surrogate to form a supplementary character.
2814     */
2815    private static final int LEAD_SURROGATE_SHIFT_ = 10;
2816
2817    /**
2818     * Mask to retrieve the significant value from a trail surrogate.
2819     */
2820    private static final int TRAIL_SURROGATE_MASK_     = 0x3FF;
2821
2822    /**
2823     * Value that all lead surrogate starts with
2824     */
2825    private static final int LEAD_SURROGATE_OFFSET_ =
2826        LEAD_SURROGATE_MIN_VALUE -
2827        (SUPPLEMENTARY_MIN_VALUE
2828         >> LEAD_SURROGATE_SHIFT_);
2829
2830    // private methods ------------------------------------------------------
2831
2832    /**
2833     * <p>Converts argument code point and returns a String object representing
2834     * the code point's value in UTF16 format.</p>
2835     * <p>This method does not check for the validity of the codepoint, the
2836     * results are not guaranteed if a invalid codepoint is passed as
2837     * argument.</p>
2838     * <p>The result is a string whose length is 1 for non-supplementary code
2839     * points, 2 otherwise.</p>
2840     * @param ch code point
2841     * @return string representation of the code point
2842     */
2843    private static String   toString(int ch)
2844    {
2845        if (ch < SUPPLEMENTARY_MIN_VALUE) {
2846            return String.valueOf((char)ch);
2847        }
2848
2849        StringBuffer   result = new StringBuffer  ();
2850        result.append(getLeadSurrogate(ch));
2851        result.append(getTrailSurrogate(ch));
2852        return result.toString();
2853    }
2854}
2855//eof
2856
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags