Normalizer


1   /*
2    *******************************************************************************
3    * Copyright (C) 2000-2006, International Business Machines Corporation and    *
4    * others. All Rights Reserved.                                                *
5    *******************************************************************************
6    */
7   package com.ibm.icu.text;
8   import com.ibm.icu.impl.NormalizerImpl;
9   import com.ibm.icu.impl.UCharacterProperty;
10  import com.ibm.icu.lang.UCharacter;
11  import com.ibm.icu.util.VersionInfo;
12  
13  import java.text.CharacterIterator  ;
14  import com.ibm.icu.impl.Utility;
15  
16  /**
17   * Unicode Normalization 
18   *
19   * <h2>Unicode normalization API</h2>
20   *
21   * <code>normalize</code> transforms Unicode text into an equivalent composed or
22   * decomposed form, allowing for easier sorting and searching of text.
23   * <code>normalize</code> supports the standard normalization forms described in
24   * <a HREF="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
25   * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
26   *
27   * Characters with accents or other adornments can be encoded in
28   * several different ways in Unicode.  For example, take the character A-acute.
29   * In Unicode, this can be encoded as a single character (the
30   * "composed" form):
31   *
32   * <p>
33   *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
34   * </p>
35   *
36   * or as two separate characters (the "decomposed" form):
37   *
38   * <p>
39   *      0041    LATIN CAPITAL LETTER A
40   *      0301    COMBINING ACUTE ACCENT
41   * </p>
42   *
43   * To a user of your program, however, both of these sequences should be
44   * treated as the same "user-level" character "A with acute accent".  When you 
45   * are searching or comparing text, you must ensure that these two sequences are 
46   * treated equivalently.  In addition, you must handle characters with more than
47   * one accent.  Sometimes the order of a character's combining accents is
48   * significant, while in other cases accent sequences in different orders are
49   * really equivalent.
50   *
51   * Similarly, the string "ffi" can be encoded as three separate letters:
52   *
53   * <p>
54   *      0066    LATIN SMALL LETTER F
55   *      0066    LATIN SMALL LETTER F
56   *      0069    LATIN SMALL LETTER I
57   * </p>
58   *
59   * or as the single character
60   *
61   * <p>
62   *      FB03    LATIN SMALL LIGATURE FFI
63   * </p>
64   *
65   * The ffi ligature is not a distinct semantic character, and strictly speaking
66   * it shouldn't be in Unicode at all, but it was included for compatibility
67   * with existing character sets that already provided it.  The Unicode standard
68   * identifies such characters by giving them "compatibility" decompositions
69   * into the corresponding semantic characters.  When sorting and searching, you
70   * will often want to use these mappings.
71   *
72   * <code>normalize</code> helps solve these problems by transforming text into 
73   * the canonical composed and decomposed forms as shown in the first example 
74   * above. In addition, you can have it perform compatibility decompositions so 
75   * that you can treat compatibility characters the same as their equivalents.
76   * Finally, <code>normalize</code> rearranges accents into the proper canonical
77   * order, so that you do not have to worry about accent rearrangement on your
78   * own.
79   *
80   * Form FCD, "Fast C or D", is also designed for collation.
81   * It allows working on strings that are not necessarily normalized
82   * with an algorithm (like in collation) that works under "canonical closure", 
83   * i.e., it treats precomposed characters and their decomposed equivalents the 
84   * same.
85   *
86   * It is not a normalization form because it does not provide for uniqueness of 
87   * representation. Multiple strings may be canonically equivalent (their NFDs 
88   * are identical) and may all conform to FCD without being identical themselves.
89   *
90   * The form is defined such that the "raw decomposition", the recursive 
91   * canonical decomposition of each character, results in a string that is 
92   * canonically ordered. This means that precomposed characters are allowed for 
93   * as long as their decompositions do not need canonical reordering.
94   *
95   * Its advantage for a process like collation is that all NFD and most NFC texts
96   * - and many unnormalized texts - already conform to FCD and do not need to be 
97   * normalized (NFD) for such a process. The FCD quick check will return YES for 
98   * most strings in practice.
99   *
100  * normalize(FCD) may be implemented with NFD.
101  *
102  * For more details on FCD see the collation design document:
103  * http://dev.icu-project.org/cgi-bin/viewcvs.cgi/~checkout~/icuhtml/design/collation/ICU_collation_design.htm
104  *
105  * ICU collation performs either NFD or FCD normalization automatically if 
106  * normalization is turned on for the collator object. Beyond collation and 
107  * string search, normalized strings may be useful for string equivalence 
108  * comparisons, transliteration/transcription, unique representations, etc.
109  *
110  * The W3C generally recommends exchanging text in NFC.
111  * Note also that most legacy character encodings use only precomposed forms and
112  * often do not encode any combining marks by themselves. For conversion to such
113  * character encodings the Unicode text needs to be normalized to NFC.
114  * For more usage examples, see the Unicode Standard Annex.
115  * @stable ICU 2.8
116  */
117 
118 public final class Normalizer implements Cloneable   {
119     
120     //-------------------------------------------------------------------------
121     // Private data
122     //-------------------------------------------------------------------------  
123     private char[] buffer = new char[100];
124     private int bufferStart = 0;
125     private int bufferPos   = 0;
126     private int bufferLimit = 0;
127     
128     // This tells us what the bits in the "mode" object mean.
129     private static final int COMPAT_BIT = 1;
130     private static final int DECOMP_BIT = 2;
131     private static final int COMPOSE_BIT = 4;
132     
133     // The input text and our position in it
134     private UCharacterIterator  text;
135     private Mode                mode = NFC;
136     private int                 options = 0;
137     private int                 currentIndex;
138     private int                 nextIndex;
139     
140     /**
141      * Options bit set value to select Unicode 3.2 normalization
142      * (except NormalizationCorrections).
143      * At most one Unicode version can be selected at a time.
144      * @stable ICU 2.6
145      */
146     public static final int UNICODE_3_2=0x20;
147 
148     /**
149      * Constant indicating that the end of the iteration has been reached.
150      * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
151      * @stable ICU 2.8
152      */
153     public static final int DONE = UCharacterIterator.DONE;
154 
155     /**
156      * Constants for normalization modes.
157      * @stable ICU 2.8
158      */
159     public static class Mode {
160         private int modeValue;
161         private Mode(int value) {
162             modeValue = value;
163         }
164 
165         /**
166          * This method is used for method dispatch
167          * @stable ICU 2.6
168          */
169         protected int normalize(char[] src, int srcStart, int srcLimit,
170                                 char[] dest,int destStart,int destLimit, 
171                                 UnicodeSet nx) {
172             int srcLen = (srcLimit - srcStart);
173             int destLen = (destLimit - destStart);
174             if( srcLen > destLen ) {
175                 return srcLen;
176             }
177             System.arraycopy(src,srcStart,dest,destStart,srcLen);
178             return srcLen;
179         }
180 
181         /**
182          * This method is used for method dispatch
183          * @stable ICU 2.6
184          */
185         protected int normalize(char[] src, int srcStart, int srcLimit,
186                                 char[] dest,int destStart,int destLimit,
187                                 int options) {
188             return normalize(   src, srcStart, srcLimit,
189                                 dest,destStart,destLimit,
190                                 NormalizerImpl.getNX(options)
191                                 );
192         }
193         
194         /**
195          * This method is used for method dispatch
196          * @stable ICU 2.6
197          */
198         protected String   normalize(String   src, int options) {
199             return src;
200         }
201 
202         /**
203          * This method is used for method dispatch
204          * @stable ICU 2.8
205          */
206         protected int getMinC() {
207             return -1;
208         }
209 
210         /**
211          * This method is used for method dispatch
212          * @stable ICU 2.8
213          */
214         protected int getMask() {
215             return -1;
216         }
217 
218         /**
219          * This method is used for method dispatch
220          * @stable ICU 2.8
221          */
222         protected IsPrevBoundary getPrevBoundary() {
223             return null;
224         }
225 
226         /**
227          * This method is used for method dispatch
228          * @stable ICU 2.8
229          */
230         protected IsNextBoundary getNextBoundary() {
231             return null;
232         }
233 
234         /**
235          * This method is used for method dispatch
236          * @stable ICU 2.6
237          */
238         protected QuickCheckResult quickCheck(char[] src,int start, int limit, 
239                                               boolean allowMaybe,UnicodeSet nx) {
240             if(allowMaybe) {
241                 return MAYBE;
242             }
243             return NO;
244         }
245 
246         /**
247          * This method is used for method dispatch
248          * @stable ICU 2.8
249          */
250         protected boolean isNFSkippable(int c) {
251             return true;
252         }
253     }
254     
255     /** 
256      * No decomposition/composition.  
257      * @stable ICU 2.8
258      */
259     public static final Mode NONE = new Mode(1);
260 
261     /** 
262      * Canonical decomposition.  
263      * @stable ICU 2.8
264      */
265     public static final Mode NFD = new NFDMode(2);
266     
267     private static final class NFDMode extends Mode {
268         private NFDMode(int value) {
269             super(value);
270         }
271 
272         protected int normalize(char[] src, int srcStart, int srcLimit,
273                                 char[] dest,int destStart,int destLimit, 
274                                 UnicodeSet nx) {
275             int[] trailCC = new int[1];
276             return NormalizerImpl.decompose(src,  srcStart,srcLimit,
277                                             dest, destStart,destLimit,
278                                             false, trailCC,nx);
279         }
280         
281         protected String   normalize( String   src, int options) {
282             return decompose(src,false);
283         }
284 
285         protected int getMinC() {
286             return NormalizerImpl.MIN_WITH_LEAD_CC;
287         }
288 
289         protected IsPrevBoundary getPrevBoundary() {
290             return new IsPrevNFDSafe();
291         }
292 
293         protected IsNextBoundary getNextBoundary() {
294             return new IsNextNFDSafe();
295         }
296 
297         protected int getMask() {
298             return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFD);
299         }
300 
301         protected QuickCheckResult quickCheck(char[] src,int start, 
302                                               int limit,boolean allowMaybe,
303                                               UnicodeSet nx) {
304             return NormalizerImpl.quickCheck(
305                                              src, start,limit,
306                                              NormalizerImpl.getFromIndexesArr(
307                                                                               NormalizerImpl.INDEX_MIN_NFD_NO_MAYBE
308                                                                               ),
309                                              NormalizerImpl.QC_NFD,
310                                              0,
311                                              allowMaybe,
312                                              nx
313                                              );
314         }
315 
316         protected boolean isNFSkippable(int c) {
317             return NormalizerImpl.isNFSkippable(c,this,
318                                                 (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFD)
319                                                 );
320         }           
321     }
322                                          
323     /** 
324      * Compatibility decomposition.  
325      * @stable ICU 2.8
326      */
327     public static final Mode NFKD = new NFKDMode(3);
328     
329     private static final class NFKDMode extends Mode {
330         private NFKDMode(int value) {
331             super(value);
332         }
333 
334         protected int normalize(char[] src, int srcStart, int srcLimit,
335                                 char[] dest,int destStart,int destLimit, 
336                                 UnicodeSet nx) {
337             int[] trailCC = new int[1];
338             return NormalizerImpl.decompose(src,  srcStart,srcLimit,
339                                             dest, destStart,destLimit,
340                                             true, trailCC, nx);
341         }
342 
343         protected String   normalize( String   src, int options) {
344             return decompose(src,true);
345         }
346 
347         protected int getMinC() {
348             return NormalizerImpl.MIN_WITH_LEAD_CC;
349         }
350 
351         protected IsPrevBoundary getPrevBoundary() {
352             return new IsPrevNFDSafe();
353         }
354 
355         protected IsNextBoundary getNextBoundary() {
356             return new IsNextNFDSafe();
357         }
358 
359         protected int getMask() {
360             return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKD);
361         }
362 
363         protected QuickCheckResult quickCheck(char[] src,int start, 
364                                               int limit,boolean allowMaybe,
365                                               UnicodeSet nx) {
366             return NormalizerImpl.quickCheck(
367                                              src,start,limit,
368                                              NormalizerImpl.getFromIndexesArr(
369                                                                               NormalizerImpl.INDEX_MIN_NFKD_NO_MAYBE
370                                                                               ),
371                                              NormalizerImpl.QC_NFKD,
372                                              NormalizerImpl.OPTIONS_COMPAT,
373                                              allowMaybe,
374                                              nx
375                                              );
376         }
377 
378         protected boolean isNFSkippable(int c) {
379             return NormalizerImpl.isNFSkippable(c, this,
380                                                 (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKD)
381                                                 );
382         }                                         
383     }
384                                          
385     /** 
386      * Canonical decomposition followed by canonical composition.  
387      * @stable ICU 2.8
388      */
389     public static final Mode NFC = new NFCMode(4);
390     
391     private static final class NFCMode extends Mode{
392         private NFCMode(int value) {
393             super(value);
394         }
395         protected int normalize(char[] src, int srcStart, int srcLimit,
396                                 char[] dest,int destStart,int destLimit,
397                                 UnicodeSet nx) {
398             return NormalizerImpl.compose( src, srcStart, srcLimit,
399                                            dest,destStart,destLimit,
400                                            0, nx);
401         }
402   
403         protected String   normalize( String   src, int options) {
404             return compose(src, false, options);
405         }
406        
407         protected int getMinC() {
408             return NormalizerImpl.getFromIndexesArr(
409                                                     NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE
410                                                     );
411         }
412         protected IsPrevBoundary getPrevBoundary() {
413             return new IsPrevTrueStarter();
414         }
415         protected IsNextBoundary getNextBoundary() {
416             return new IsNextTrueStarter();
417         }
418         protected int getMask() {
419             return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFC);
420         }
421         protected QuickCheckResult quickCheck(char[] src,int start, 
422                                               int limit,boolean allowMaybe,
423                                               UnicodeSet nx) {
424             return NormalizerImpl.quickCheck(
425                                              src,start,limit,
426                                              NormalizerImpl.getFromIndexesArr(
427                                                                               NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE
428                                                                               ),
429                                              NormalizerImpl.QC_NFC,
430                                              0,
431                                              allowMaybe,
432                                              nx
433                                              );
434         }
435         protected boolean isNFSkippable(int c) {
436             return NormalizerImpl.isNFSkippable(c,this,
437                                                 ( NormalizerImpl.CC_MASK|NormalizerImpl.COMBINES_ANY|
438                                                   (NormalizerImpl.QC_NFC & NormalizerImpl.QC_ANY_NO)
439                                                   )
440                                                 );
441         } 
442     };
443                                          
444     /** 
445      * Default normalization.  
446      * @stable ICU 2.8
447      */
448     public static final Mode DEFAULT = NFC; 
449     
450     /** 
451      * Compatibility decomposition followed by canonical composition. 
452      * @stable ICU 2.8
453      */
454     public static final Mode NFKC =new NFKCMode(5);
455     
456     private static final class NFKCMode extends Mode{
457         private NFKCMode(int value) {
458             super(value);
459         }
460         protected int normalize(char[] src, int srcStart, int srcLimit,
461                                 char[] dest,int destStart,int destLimit, 
462                                 UnicodeSet nx) {
463             return NormalizerImpl.compose(src,  srcStart,srcLimit,
464                                           dest, destStart,destLimit,
465                                           NormalizerImpl.OPTIONS_COMPAT, nx);
466         }
467 
468         protected String   normalize( String   src, int options) {
469             return compose(src, true, options);
470         }
471         protected int getMinC() {
472             return NormalizerImpl.getFromIndexesArr(
473                                                     NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE
474                                                     );
475         }
476         protected IsPrevBoundary getPrevBoundary() {
477             return new IsPrevTrueStarter();
478         }
479         protected IsNextBoundary getNextBoundary() {
480             return new IsNextTrueStarter();
481         }
482         protected int getMask() {
483             return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKC);
484         }
485         protected QuickCheckResult quickCheck(char[] src,int start, 
486                                               int limit,boolean allowMaybe,
487                                               UnicodeSet nx) {
488             return NormalizerImpl.quickCheck(
489                                              src,start,limit,
490                                              NormalizerImpl.getFromIndexesArr(
491                                                                               NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE
492                                                                               ),
493                                              NormalizerImpl.QC_NFKC,
494                                              NormalizerImpl.OPTIONS_COMPAT,
495                                              allowMaybe,
496                                              nx
497                                              );
498         }
499         protected boolean isNFSkippable(int c) {
500             return NormalizerImpl.isNFSkippable(c, this,
501                                                 ( NormalizerImpl.CC_MASK|NormalizerImpl.COMBINES_ANY|
502                                                   (NormalizerImpl.QC_NFKC & NormalizerImpl.QC_ANY_NO)
503                                                   )
504                                                 );
505         } 
506     };
507                                         
508     /** 
509      * "Fast C or D" form. 
510      * @stable ICU 2.8 
511      */
512     public static final Mode FCD = new FCDMode(6);
513     
514     private static final class FCDMode extends Mode{
515         private FCDMode(int value) {
516             super(value);
517         }
518         protected int normalize(char[] src, int srcStart, int srcLimit,
519                                 char[] dest,int destStart,int destLimit, 
520                                 UnicodeSet nx) {
521             return NormalizerImpl.makeFCD(src, srcStart,srcLimit,
522                                           dest, destStart,destLimit, nx);
523         }
524         protected String   normalize( String   src, int options) {
525             return makeFCD(src, options);
526         }
527         protected int getMinC() {
528             return NormalizerImpl.MIN_WITH_LEAD_CC;
529         }
530         protected IsPrevBoundary getPrevBoundary() {
531             return new IsPrevNFDSafe();
532         }
533         protected IsNextBoundary getNextBoundary() {
534             return new IsNextNFDSafe();
535         }
536         protected int getMask() {
537             return NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFD;
538         }
539         protected QuickCheckResult quickCheck(char[] src,int start, 
540                                               int limit,boolean allowMaybe,
541                                               UnicodeSet nx) {
542             return NormalizerImpl.checkFCD(src,start,limit,nx) ? YES : NO;
543         }
544         protected boolean isNFSkippable(int c) {
545             /* FCD: skippable if lead cc==0 and trail cc<=1 */
546             return (NormalizerImpl.getFCD16(c)>1);
547         }   
548     };
549 
550     
551     /**
552      * Null operation for use with the {@link #Normalizer constructors}
553      * and the static {@link #normalize normalize} method.  This value tells
554      * the <tt>Normalizer</tt> to do nothing but return unprocessed characters
555      * from the underlying String or CharacterIterator.  If you have code which
556      * requires raw text at some times and normalized text at others, you can
557      * use <tt>NO_OP</tt> for the cases where you want raw text, rather
558      * than having a separate code path that bypasses <tt>Normalizer</tt>
559      * altogether.
560      * <p>
561      * @see #setMode
562      * @deprecated ICU 2.8. Use Normalizer.NONE
563      * @see #NONE
564      */
565     public static final Mode NO_OP = NONE;
566 
567     /**
568      * Canonical decomposition followed by canonical composition.  Used with the
569      * {@link #Normalizer constructors} and the static 
570      * {@link #normalize normalize} method to determine the operation to be 
571      * performed.
572      * <p>
573      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
574      * off, this operation produces output that is in
575      * <a HREF=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical 
576      * Form</a>
577      * <b>C</b>.
578      * <p>
579      * @see #setMode
580      * @deprecated ICU 2.8. Use Normalizer.NFC
581      * @see #NFC
582      */
583     public static final Mode COMPOSE = NFC;
584 
585     /**
586      * Compatibility decomposition followed by canonical composition.
587      * Used with the {@link #Normalizer constructors} and the static
588      * {@link #normalize normalize} method to determine the operation to be 
589      * performed.
590      * <p>
591      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
592      * off, this operation produces output that is in
593      * <a HREF=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical 
594      * Form</a>
595      * <b>KC</b>.
596      * <p>
597      * @see #setMode
598      * @deprecated ICU 2.8. Use Normalizer.NFKC
599      * @see #NFKC
600      */
601     public static final Mode COMPOSE_COMPAT = NFKC;
602 
603     /**
604      * Canonical decomposition.  This value is passed to the
605      * {@link #Normalizer constructors} and the static
606      * {@link #normalize normalize}
607      * method to determine the operation to be performed.
608      * <p>
609      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
610      * off, this operation produces output that is in
611      * <a HREF=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical 
612      * Form</a>
613      * <b>D</b>.
614      * <p>
615      * @see #setMode
616      * @deprecated ICU 2.8. Use Normalizer.NFD
617      * @see #NFD
618      */
619     public static final Mode DECOMP = NFD;
620 
621     /**
622      * Compatibility decomposition.  This value is passed to the
623      * {@link #Normalizer constructors} and the static 
624      * {@link #normalize normalize}
625      * method to determine the operation to be performed.
626      * <p>
627      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
628      * off, this operation produces output that is in
629      * <a HREF=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical 
630      * Form</a>
631      * <b>KD</b>.
632      * <p>
633      * @see #setMode
634      * @deprecated ICU 2.8. Use Normalizer.NFKD
635      * @see #NFKD
636      */
637     public static final Mode DECOMP_COMPAT = NFKD;
638 
639     /**
640      * Option to disable Hangul/Jamo composition and decomposition.
641      * This option applies to Korean text,
642      * which can be represented either in the Jamo alphabet or in Hangul
643      * characters, which are really just two or three Jamo combined
644      * into one visual glyph.  Since Jamo takes up more storage space than
645      * Hangul, applications that process only Hangul text may wish to turn
646      * this option on when decomposing text.
647      * <p>
648      * The Unicode standard treats Hangul to Jamo conversion as a
649      * canonical decomposition, so this option must be turned <b>off</b> if you
650      * wish to transform strings into one of the standard
651      * <a HREF="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
652      * Unicode Normalization Forms</a>.
653      * <p>
654      * @see #setOption
655      * @deprecated ICU 2.8. This option is no longer supported. TODO: check with Ram
656      */
657     public static final int IGNORE_HANGUL = 0x0001;
658           
659     /**
660      * Result values for quickCheck().
661      * For details see Unicode Technical Report 15.
662      * @stable ICU 2.8
663      */
664     public static final class QuickCheckResult{
665         private int resultValue;
666         private QuickCheckResult(int value) {
667             resultValue=value;
668         }
669     }
670     /** 
671      * Indicates that string is not in the normalized format
672      * @stable ICU 2.8
673      */
674     public static final QuickCheckResult NO = new QuickCheckResult(0);
675         
676     /** 
677      * Indicates that string is in the normalized format
678      * @stable ICU 2.8
679      */
680     public static final QuickCheckResult YES = new QuickCheckResult(1);
681 
682     /** 
683      * Indicates it cannot be determined if string is in the normalized 
684      * format without further thorough checks.
685      * @stable ICU 2.8
686      */
687     public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
688     
689     /**
690      * Option bit for compare:
691      * Case sensitively compare the strings
692      * @stable ICU 2.8
693      */
694     public static final int FOLD_CASE_DEFAULT =  UCharacter.FOLD_CASE_DEFAULT;
695     
696     /**
697      * Option bit for compare:
698      * Both input strings are assumed to fulfill FCD conditions.
699      * @stable ICU 2.8
700      */
701     public static final int INPUT_IS_FCD    =      0x20000;
702         
703     /**
704      * Option bit for compare:
705      * Perform case-insensitive comparison.
706      * @stable ICU 2.8
707      */
708     public static final int COMPARE_IGNORE_CASE  =     0x10000;
709         
710     /**
711      * Option bit for compare:
712      * Compare strings in code point order instead of code unit order.
713      * @stable ICU 2.8
714      */
715     public static final int COMPARE_CODE_POINT_ORDER = 0x8000;
716     
717     /** 
718      * Option value for case folding: exclude the mappings for dotted I 
719      * and dotless i marked with 'I' in CaseFolding.txt. 
720      * @stable ICU 2.8
721      */
722     public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I;
723     
724     /**
725      * Lowest-order bit number of compare() options bits corresponding to
726      * normalization options bits.
727      *
728      * The options parameter for compare() uses most bits for
729      * itself and for various comparison and folding flags.
730      * The most significant bits, however, are shifted down and passed on
731      * to the normalization implementation.
732      * (That is, from compare(..., options, ...),
733      * options>>COMPARE_NORM_OPTIONS_SHIFT will be passed on to the
734      * internal normalization functions.)
735      *
736      * @see #compare
737      * @stable ICU 2.6
738      */
739     public static final int COMPARE_NORM_OPTIONS_SHIFT  = 20;
740         
741     //-------------------------------------------------------------------------
742     // Constructors
743     //-------------------------------------------------------------------------
744 
745     /**
746      * Creates a new <tt>Normalizer</tt> object for iterating over the
747      * normalized form of a given string.
748      * <p>
749      * The <tt>options</tt> parameter specifies which optional
750      * <tt>Normalizer</tt> features are to be enabled for this object.
751      * <p>
752      * @param str  The string to be normalized.  The normalization
753      *              will start at the beginning of the string.
754      *
755      * @param mode The normalization mode.
756      *
757      * @param opt Any optional features to be enabled.
758      *            Currently the only available option is {@link #UNICODE_3_2}.
759      *            If you want the default behavior corresponding to one of the
760      *            standard Unicode Normalization Forms, use 0 for this argument.
761      * @stable ICU 2.6
762      */
763     public Normalizer(String   str, Mode mode, int opt) {
764         this.text = UCharacterIterator.getInstance(str);
765         this.mode = mode; 
766         this.options=opt;
767     }
768 
769     /**
770      * Creates a new <tt>Normalizer</tt> object for iterating over the
771      * normalized form of the given text.
772      * <p>
773      * @param iter  The input text to be normalized.  The normalization
774      *              will start at the beginning of the string.
775      *
776      * @param mode  The normalization mode.
777      *
778      * @param opt Any optional features to be enabled.
779      *            Currently the only available option is {@link #UNICODE_3_2}.
780      *            If you want the default behavior corresponding to one of the
781      *            standard Unicode Normalization Forms, use 0 for this argument.
782      * @stable ICU 2.6
783      */
784     public Normalizer(CharacterIterator   iter, Mode mode, int opt) {
785         this.text = UCharacterIterator.getInstance(
786                                                    (CharacterIterator  )iter.clone()
787                                                    );
788         this.mode = mode;
789         this.options = opt;
790     }
791     
792     /**
793      * Creates a new <tt>Normalizer</tt> object for iterating over the
794      * normalized form of the given text.
795      * <p>
796      * @param iter  The input text to be normalized.  The normalization
797      *              will start at the beginning of the string.
798      *
799      * @param mode  The normalization mode.
800      * @param options The normalization options, ORed together (0 for no options).
801      * @stable ICU 2.6
802      */
803     public Normalizer(UCharacterIterator iter, Mode mode, int options) {
804         try {
805             this.text     = (UCharacterIterator)iter.clone();
806             this.mode     = mode;
807             this.options  = options;
808         } catch (CloneNotSupportedException   e) {
809             throw new IllegalStateException  (e.toString());
810         }
811     }
812 
813     /**
814      * Clones this <tt>Normalizer</tt> object.  All properties of this
815      * object are duplicated in the new object, including the cloning of any
816      * {@link CharacterIterator} that was passed in to the constructor
817      * or to {@link #setText(CharacterIterator) setText}.
818      * However, the text storage underlying
819      * the <tt>CharacterIterator</tt> is not duplicated unless the
820      * iterator's <tt>clone</tt> method does so.
821      * @stable ICU 2.8
822      */
823     public Object   clone() {
824         try {
825             Normalizer copy = (Normalizer) super.clone();
826             copy.text = (UCharacterIterator) text.clone();
827             //clone the internal buffer
828             if (buffer != null) {
829                 copy.buffer = new char[buffer.length];
830                 System.arraycopy(buffer,0,copy.buffer,0,buffer.length);
831             }
832             return copy;
833         }
834         catch (CloneNotSupportedException   e) {
835             throw new IllegalStateException  (e.toString());
836         }
837     }
838     
839     //--------------------------------------------------------------------------
840     // Static Utility methods
841     //--------------------------------------------------------------------------
842     
843     /**
844      * Compose a string.
845      * The string will be composed to according the the specified mode.
846      * @param str        The string to compose.
847      * @param compat     If true the string will be composed accoding to 
848      *                    NFKC rules and if false will be composed according to 
849      *                    NFC rules.
850      * @return String    The composed string   
851      * @stable ICU 2.8
852      */            
853     public static String   compose(String   str, boolean compat) {
854         return compose(str,compat,0);           
855     }
856     
857     /**
858      * Compose a string.
859      * The string will be composed to according the the specified mode.
860      * @param str        The string to compose.
861      * @param compat     If true the string will be composed accoding to 
862      *                    NFKC rules and if false will be composed according to 
863      *                    NFC rules.
864      * @param options    The only recognized option is UNICODE_3_2
865      * @return String    The composed string   
866      * @stable ICU 2.6
867      */            
868     public static String   compose(String   str, boolean compat, int options) {
869            
870         char[] dest = new char[str.length()*MAX_BUF_SIZE_COMPOSE];
871         int destSize=0;
872         char[] src = str.toCharArray();
873         UnicodeSet nx = NormalizerImpl.getNX(options);
874 
875         /* reset options bits that should only be set here or inside compose() */
876         options&=~(NormalizerImpl.OPTIONS_SETS_MASK|NormalizerImpl.OPTIONS_COMPAT|NormalizerImpl.OPTIONS_COMPOSE_CONTIGUOUS);
877 
878         if(compat) {
879             options|=NormalizerImpl.OPTIONS_COMPAT;
880         }
881 
882         for(;;) {
883             destSize=NormalizerImpl.compose(src,0,src.length,
884                                             dest,0,dest.length,options,
885                                             nx);
886             if(destSize<=dest.length) {
887                 return new String  (dest,0,destSize);  
888             } else {
889                 dest = new char[destSize];
890             }
891         }                   
892     }
893     
894     /**
895      * Compose a string.
896      * The string will be composed to according the the specified mode.
897      * @param source The char array to compose.
898      * @param target A char buffer to receive the normalized text.
899      * @param compat If true the char array will be composed accoding to 
900      *                NFKC rules and if false will be composed according to 
901      *                NFC rules.
902      * @param options The normalization options, ORed together (0 for no options).
903      * @return int   The total buffer size needed;if greater than length of 
904      *                result, the output was truncated.
905      * @exception IndexOutOfBoundsException if target.length is less than the 
906      *             required length
907      * @stable ICU 2.6  
908      */         
909     public static int compose(char[] source,char[] target, boolean compat, int options) {
910         UnicodeSet nx = NormalizerImpl.getNX(options);
911 
912         /* reset options bits that should only be set here or inside compose() */
913         options&=~(NormalizerImpl.OPTIONS_SETS_MASK|NormalizerImpl.OPTIONS_COMPAT|NormalizerImpl.OPTIONS_COMPOSE_CONTIGUOUS);
914 
915         if(compat) {
916             options|=NormalizerImpl.OPTIONS_COMPAT;
917         }
918 
919         int length = NormalizerImpl.compose(source,0,source.length,
920                                             target,0,target.length,
921                                             options,nx);
922         if(length<=target.length) {
923             return length;
924         } else {
925             throw new IndexOutOfBoundsException  (Integer.toString(length));
926         } 
927     }
928     
929     /**
930      * Compose a string.
931      * The string will be composed to according the the specified mode.
932      * @param src       The char array to compose.
933      * @param srcStart  Start index of the source
934      * @param srcLimit  Limit index of the source
935      * @param dest      The char buffer to fill in
936      * @param destStart Start index of the destination buffer  
937      * @param destLimit End index of the destination buffer
938      * @param compat If true the char array will be composed accoding to 
939      *                NFKC rules and if false will be composed according to 
940      *                NFC rules.
941      * @param options The normalization options, ORed together (0 for no options).
942      * @return int   The total buffer size needed;if greater than length of 
943      *                result, the output was truncated.
944      * @exception IndexOutOfBoundsException if target.length is less than the 
945      *             required length 
946      * @stable ICU 2.6 
947      */         
948     public static int compose(char[] src,int srcStart, int srcLimit,
949                               char[] dest,int destStart, int destLimit,
950                               boolean compat, int options) {
951         UnicodeSet nx = NormalizerImpl.getNX(options);
952 
953         /* reset options bits that should only be set here or inside compose() */
954         options&=~(NormalizerImpl.OPTIONS_SETS_MASK|NormalizerImpl.OPTIONS_COMPAT|NormalizerImpl.OPTIONS_COMPOSE_CONTIGUOUS);
955 
956         if(compat) {
957             options|=NormalizerImpl.OPTIONS_COMPAT;
958         }
959 
960         int length = NormalizerImpl.compose(src,srcStart,srcLimit,
961                                             dest,destStart,destLimit,
962                                             options, nx);
963         if(length<=(destLimit-destStart)) {
964             return length;
965         } else {
966             throw new IndexOutOfBoundsException  (Integer.toString(length));
967         } 
968     }
969     
    // Factor applied to the input length when compose(String, boolean, int)
    // sizes its first destination buffer; a larger buffer is allocated on a
    // retry when the first one turns out to be too small.
    private static final int MAX_BUF_SIZE_COMPOSE = 2;
    // Same role for decompose(String, boolean, int) and makeFCD(String, int).
    private static final int MAX_BUF_SIZE_DECOMPOSE = 3;
972     
973     /**
974      * Decompose a string.
975      * The string will be decomposed to according the the specified mode.
976      * @param str       The string to decompose.
977      * @param compat    If true the string will be decomposed accoding to NFKD 
978      *                   rules and if false will be decomposed according to NFD 
979      *                   rules.
980      * @return String   The decomposed string  
981      * @stable ICU 2.8 
982      */         
983     public static String   decompose(String   str, boolean compat) {
984         return decompose(str,compat,0);                  
985     }
986     
987     /**
988      * Decompose a string.
989      * The string will be decomposed to according the the specified mode.
990      * @param str     The string to decompose.
991      * @param compat  If true the string will be decomposed accoding to NFKD 
992      *                 rules and if false will be decomposed according to NFD 
993      *                 rules.
994      * @param options The normalization options, ORed together (0 for no options).
995      * @return String The decomposed string 
996      * @stable ICU 2.6
997      */         
998     public static String   decompose(String   str, boolean compat, int options) {
999         
1000        char[] dest = new char[str.length()*MAX_BUF_SIZE_DECOMPOSE];
1001        int[] trailCC = new int[1];
1002        int destSize=0;
1003        UnicodeSet nx = NormalizerImpl.getNX(options);
1004        for(;;) {
1005            destSize=NormalizerImpl.decompose(str.toCharArray(),0,str.length(),
1006                                              dest,0,dest.length,
1007                                              compat,trailCC, nx);
1008            if(destSize<=dest.length) {
1009                return new String  (dest,0,destSize); 
1010            } else {
1011                dest = new char[destSize];
1012            }
1013        } 
1014                
1015    }
1016    
1017    /**
1018     * Decompose a string.
1019     * The string will be decomposed to according the the specified mode.
1020     * @param source The char array to decompose.
1021     * @param target A char buffer to receive the normalized text.
1022     * @param compat If true the char array will be decomposed accoding to NFKD 
1023     *                rules and if false will be decomposed according to 
1024     *                NFD rules.
1025     * @return int   The total buffer size needed;if greater than length of 
1026     *                result,the output was truncated.
1027     * @param options The normalization options, ORed together (0 for no options).
1028     * @exception IndexOutOfBoundsException if the target capacity is less than
1029     *             the required length   
1030     * @stable ICU 2.6
1031     */
1032    public static int decompose(char[] source,char[] target, boolean compat, int options) {
1033        int[] trailCC = new int[1];
1034        UnicodeSet nx = NormalizerImpl.getNX(options);
1035        int length = NormalizerImpl.decompose(source,0,source.length,
1036                                              target,0,target.length,
1037                                              compat,trailCC,nx);
1038        if(length<=target.length) {
1039            return length;
1040        } else {
1041            throw new IndexOutOfBoundsException  (Integer.toString(length));
1042        } 
1043    }
1044    
1045    /**
1046     * Decompose a string.
1047     * The string will be decomposed to according the the specified mode.
1048     * @param src       The char array to compose.
1049     * @param srcStart  Start index of the source
1050     * @param srcLimit  Limit index of the source
1051     * @param dest      The char buffer to fill in
1052     * @param destStart Start index of the destination buffer  
1053     * @param destLimit End index of the destination buffer
1054     * @param compat If true the char array will be decomposed accoding to NFKD 
1055     *                rules and if false will be decomposed according to 
1056     *                NFD rules.
1057     * @param options The normalization options, ORed together (0 for no options).
1058     * @return int   The total buffer size needed;if greater than length of 
1059     *                result,the output was truncated.
1060     * @exception IndexOutOfBoundsException if the target capacity is less than
1061     *             the required length  
1062     * @stable ICU 2.6 
1063     */
1064    public static int decompose(char[] src,int srcStart, int srcLimit,
1065                                char[] dest,int destStart, int destLimit,
1066                                boolean compat, int options) {
1067        int[] trailCC = new int[1];
1068        UnicodeSet nx = NormalizerImpl.getNX(options);
1069        int length = NormalizerImpl.decompose(src,srcStart,srcLimit,
1070                                              dest,destStart,destLimit,
1071                                              compat,trailCC,nx);
1072        if(length<=(destLimit-destStart)) {
1073            return length;
1074        } else {
1075            throw new IndexOutOfBoundsException  (Integer.toString(length));
1076        } 
1077    }
1078        
1079    private static String   makeFCD(String   src,int options) {
1080        int srcLen = src.length();
1081        char[] dest = new char[MAX_BUF_SIZE_DECOMPOSE*srcLen];
1082        int length = 0;
1083        UnicodeSet nx = NormalizerImpl.getNX(options);
1084        for(;;) {
1085            length = NormalizerImpl.makeFCD(src.toCharArray(),0,srcLen,
1086                                            dest,0,dest.length,nx);
1087            if(length <= dest.length) {
1088                return new String  (dest,0,length);
1089            } else {
1090                dest = new char[length];
1091            }
1092        }
1093    }
1094    
1095    /**
1096     * Normalizes a <tt>String</tt> using the given normalization operation.
1097     * <p>
1098     * The <tt>options</tt> parameter specifies which optional
1099     * <tt>Normalizer</tt> features are to be enabled for this operation.
1100     * Currently the only available option is {@link #UNICODE_3_2}.
1101     * If you want the default behavior corresponding to one of the standard
1102     * Unicode Normalization Forms, use 0 for this argument.
1103     * <p>
1104     * @param str       the input string to be normalized.
1105     * @param mode      the normalization mode
1106     * @param options   the optional features to be enabled.
1107     * @return String   the normalized string
1108     * @stable ICU 2.6
1109     */
1110    public static String   normalize(String   str, Mode mode, int options) {
1111        return mode.normalize(str,options);
1112    }
1113    
1114    /**
1115     * Normalize a string.
1116     * The string will be normalized according the the specified normalization 
1117     * mode and options.
1118     * @param src        The string to normalize.
1119     * @param mode       The normalization mode; one of Normalizer.NONE, 
1120     *                    Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC, 
1121     *                    Normalizer.NFKD, Normalizer.DEFAULT
1122     * @return the normalized string
1123     * @stable ICU 2.8
1124     *   
1125     */
1126    public static String   normalize(String   src,Mode mode) {
1127        return normalize(src, mode, 0);    
1128    }
1129    /**
1130     * Normalize a string.
1131     * The string will be normalized according the the specified normalization 
1132     * mode and options.
1133     * @param source The char array to normalize.
1134     * @param target A char buffer to receive the normalized text.
1135     * @param mode   The normalization mode; one of Normalizer.NONE, 
1136     *                Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC, 
1137     *                Normalizer.NFKD, Normalizer.DEFAULT
1138     * @param options The normalization options, ORed together (0 for no options).
1139     * @return int   The total buffer size needed;if greater than length of 
1140     *                result, the output was truncated.
1141     * @exception    IndexOutOfBoundsException if the target capacity is less 
1142     *                than the required length
1143     * @stable ICU 2.6     
1144     */
1145    public static int normalize(char[] source,char[] target, Mode  mode, int options) {
1146        int length = normalize(source,0,source.length,target,0,target.length,mode, options);
1147        if(length<=target.length) {
1148            return length;
1149        } else {
1150            throw new IndexOutOfBoundsException  (Integer.toString(length));
1151        } 
1152    }
1153    
1154    /**
1155     * Normalize a string.
1156     * The string will be normalized according the the specified normalization
1157     * mode and options.
1158     * @param src       The char array to compose.
1159     * @param srcStart  Start index of the source
1160     * @param srcLimit  Limit index of the source
1161     * @param dest      The char buffer to fill in
1162     * @param destStart Start index of the destination buffer  
1163     * @param destLimit End index of the destination buffer
1164     * @param mode      The normalization mode; one of Normalizer.NONE, 
1165     *                   Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC, 
1166     *                   Normalizer.NFKD, Normalizer.DEFAULT
1167     * @param options The normalization options, ORed together (0 for no options). 
1168     * @return int      The total buffer size needed;if greater than length of 
1169     *                   result, the output was truncated.
1170     * @exception       IndexOutOfBoundsException if the target capacity is 
1171     *                   less than the required length
1172     * @stable ICU 2.6    
1173     */       
1174    public static int normalize(char[] src,int srcStart, int srcLimit, 
1175                                char[] dest,int destStart, int destLimit,
1176                                Mode  mode, int options) {
1177        int length = mode.normalize(src,srcStart,srcLimit,dest,destStart,destLimit, options);
1178       
1179        if(length<=(destLimit-destStart)) {
1180            return length;
1181        } else {
1182            throw new IndexOutOfBoundsException  (Integer.toString(length));
1183        } 
1184    }
1185    
1186    /**
1187     * Normalize a codepoint accoding to the given mode
1188     * @param char32    The input string to be normalized.
1189     * @param mode      The normalization mode
1190     * @param options   Options for use with exclusion set an tailored Normalization
1191     *                                   The only option that is currently recognized is UNICODE_3_2
1192     * @return String   The normalized string
1193     * @stable ICU 2.6
1194     * @see #UNICODE_3_2
1195     */
1196    // TODO: actually do the optimization when the guts of Normalizer are 
1197    // upgraded --has just dumb implementation for now
1198    public static String   normalize(int char32, Mode mode, int options) {
1199        return normalize(UTF16.valueOf(char32), mode, options);
1200    }
1201
1202    /**
1203     * Conveinience method to normalize a codepoint accoding to the given mode
1204     * @param char32    The input string to be normalized.
1205     * @param mode      The normalization mode
1206     * @return String   The normalized string
1207     * @see #UNICODE_3_2                
1208     * @stable ICU 2.6
1209     */
1210    // TODO: actually do the optimization when the guts of Normalizer are 
1211    // upgraded --has just dumb implementation for now
1212    public static String   normalize(int char32, Mode mode) {
1213        return normalize(UTF16.valueOf(char32), mode, 0);
1214    }
1215    
1216    /**
1217     * Convenience method.
1218     *
1219     * @param source   string for determining if it is in a normalized format
1220     * @param mode     normalization format (Normalizer.NFC,Normalizer.NFD,  
1221     *                  Normalizer.NFKC,Normalizer.NFKD)
1222     * @return         Return code to specify if the text is normalized or not 
1223     *                     (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
1224     * @stable ICU 2.8
1225     */
1226    public static QuickCheckResult quickCheck( String   source, Mode mode) {
1227        return mode.quickCheck(source.toCharArray(),0,source.length(),true,null);
1228    }
1229    
1230    /**
1231     * Convenience method.
1232     *
1233     * @param source   string for determining if it is in a normalized format
1234     * @param mode     normalization format (Normalizer.NFC,Normalizer.NFD,  
1235     *                  Normalizer.NFKC,Normalizer.NFKD)
1236     * @param options   Options for use with exclusion set an tailored Normalization
1237     *                                   The only option that is currently recognized is UNICODE_3_2     
1238     * @return         Return code to specify if the text is normalized or not 
1239     *                     (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
1240     * @stable ICU 2.6
1241     */
1242    public static QuickCheckResult quickCheck( String   source, Mode mode, int options) {
1243        return mode.quickCheck(source.toCharArray(),0,source.length(),true,NormalizerImpl.getNX(options));
1244    }
1245    
1246    /**
1247     * Convenience method.
1248     *
1249     * @param source Array of characters for determining if it is in a 
1250     *                normalized format
1251     * @param mode   normalization format (Normalizer.NFC,Normalizer.NFD,  
1252     *                Normalizer.NFKC,Normalizer.NFKD)
1253     * @param options   Options for use with exclusion set an tailored Normalization
1254     *                                   The only option that is currently recognized is UNICODE_3_2
1255     * @return       Return code to specify if the text is normalized or not 
1256     *                (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
1257     * @stable ICU 2.6
1258     */
1259    public static QuickCheckResult quickCheck(char[] source, Mode mode, int options) {
1260        return mode.quickCheck(source,0,source.length,true, NormalizerImpl.getNX(options));
1261    }
1262    
1263    /**
1264     * Performing quick check on a string, to quickly determine if the string is 
1265     * in a particular normalization format.
1266     * Three types of result can be returned Normalizer.YES, Normalizer.NO or
1267     * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
1268     * string is in the desired normalized format, Normalizer.NO determines that
1269     * argument string is not in the desired normalized format. A 
1270     * Normalizer.MAYBE result indicates that a more thorough check is required, 
1271     * the user may have to put the string in its normalized form and compare 
1272     * the results.
1273     *
1274     * @param source    string for determining if it is in a normalized format
1275     * @param start     the start index of the source
1276     * @param limit     the limit index of the source it is equal to the length
1277     * @param mode      normalization format (Normalizer.NFC,Normalizer.NFD,  
1278     *                   Normalizer.NFKC,Normalizer.NFKD)
1279     * @param options   Options for use with exclusion set an tailored Normalization
1280     *                                   The only option that is currently recognized is UNICODE_3_2    
1281     * @return          Return code to specify if the text is normalized or not 
1282     *                   (Normalizer.YES, Normalizer.NO or
1283     *                   Normalizer.MAYBE)
1284     * @stable ICU 2.6
1285     */
1286
1287    public static QuickCheckResult quickCheck(char[] source,int start, 
1288                                              int limit, Mode mode,int options) {       
1289        return mode.quickCheck(source,start,limit,true,NormalizerImpl.getNX(options));
1290    }
1291    
1292    //-------------------------------------------------------------------------
1293    // Internal methods (for now)
1294    //-------------------------------------------------------------------------
1295
1296    /**
1297     * Test if a string is in a given normalization form.
1298     * This is semantically equivalent to source.equals(normalize(source, mode)).
1299     *
1300     * Unlike quickCheck(), this function returns a definitive result,
1301     * never a "maybe".
1302     * For NFD, NFKD, and FCD, both functions work exactly the same.
1303     * For NFC and NFKC where quickCheck may return "maybe", this function will
1304     * perform further tests to arrive at a true/false result.
1305     * @param src       The input array of characters to be checked to see if 
1306     *                   it is normalized
1307     * @param start     The strart index in the source
1308     * @param limit     The limit index in the source
1309     * @param mode      the normalization mode
1310     * @param options   Options for use with exclusion set an tailored Normalization
1311     *                                   The only option that is currently recognized is UNICODE_3_2    
1312     * @return Boolean value indicating whether the source string is in the
1313     *         "mode" normalization form
1314     * @stable ICU 2.6
1315     */
1316    public static boolean isNormalized(char[] src,int start,
1317                                       int limit, Mode mode, 
1318                                       int options) {
1319        return (mode.quickCheck(src,start,limit,false,NormalizerImpl.getNX(options))==YES);
1320    }
1321    
1322    /**
1323     * Convenience Method
1324     * @param str       the input string to be checked to see if it is 
1325     *                   normalized
1326     * @param mode      the normalization mode
1327     * @param options   Options for use with exclusion set an tailored Normalization
1328     *                                   The only option that is currently recognized is UNICODE_3_2   
1329     * @see #isNormalized
1330     * @stable ICU 2.6
1331     */
1332    public static boolean isNormalized(String   str, Mode mode, int options) {
1333        return (mode.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES);
1334    }
1335    
1336    /**
1337     * Convenience Method
1338     * @param char32    the input code point to be checked to see if it is 
1339     *                   normalized
1340     * @param mode      the normalization mode
1341     * @param options   Options for use with exclusion set an tailored Normalization
1342     *                                   The only option that is currently recognized is UNICODE_3_2    
1343     *
1344     * @see #isNormalized
1345     * @stable ICU 2.6
1346     */
1347    // TODO: actually do the optimization when the guts of Normalizer are 
1348    // upgraded --has just dumb implementation for now
1349    public static boolean isNormalized(int char32, Mode mode,int options) {
1350        return isNormalized(UTF16.valueOf(char32), mode, options);
1351    }
1352     
1353    /**
1354     * Compare two strings for canonical equivalence.
1355     * Further options include case-insensitive comparison and
1356     * code point order (as opposed to code unit order).
1357     *
1358     * Canonical equivalence between two strings is defined as their normalized
1359     * forms (NFD or NFC) being identical.
1360     * This function compares strings incrementally instead of normalizing
1361     * (and optionally case-folding) both strings entirely,
1362     * improving performance significantly.
1363     *
1364     * Bulk normalization is only necessary if the strings do not fulfill the 
1365     * FCD conditions. Only in this case, and only if the strings are relatively 
1366     * long, is memory allocated temporarily.
1367     * For FCD strings and short non-FCD strings there is no memory allocation.
1368     *
1369     * Semantically, this is equivalent to
1370     *   strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
1371     * where code point order and foldCase are all optional.
1372     *
1373     * @param s1        First source character array.
1374     * @param s1Start   start index of source
1375     * @param s1Limit   limit of the source
1376     *
1377     * @param s2        Second source character array.
1378     * @param s2Start   start index of the source
1379     * @param s2Limit   limit of the source
1380     * 
1381     * @param options A bit set of options:
1382     *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1383     *     Case-sensitive comparison in code unit order, and the input strings
1384     *     are quick-checked for FCD.
1385     *
1386     *   - INPUT_IS_FCD
1387     *     Set if the caller knows that both s1 and s2 fulfill the FCD 
1388     *     conditions.If not set, the function will quickCheck for FCD
1389     *     and normalize if necessary.
1390     *
1391     *   - COMPARE_CODE_POINT_ORDER
1392     *     Set to choose code point order instead of code unit order
1393     *
1394     *   - COMPARE_IGNORE_CASE
1395     *     Set to compare strings case-insensitively using case folding,
1396     *     instead of case-sensitively.
1397     *     If set, then the following case folding options are used.
1398     *
1399     *
1400     * @return <0 or 0 or >0 as usual for string comparisons
1401     *
1402     * @see #normalize
1403     * @see #FCD
1404     * @stable ICU 2.8
1405     */
1406    public static int compare(char[] s1, int s1Start, int s1Limit,
1407                              char[] s2, int s2Start, int s2Limit,
1408                              int options) {
1409        return internalCompare(s1, s1Start, s1Limit, 
1410                               s2, s2Start, s2Limit, 
1411                               options);
1412    } 
1413       
1414    /**
1415     * Compare two strings for canonical equivalence.
1416     * Further options include case-insensitive comparison and
1417     * code point order (as opposed to code unit order).
1418     * Convenience method.
1419     *
1420     * @param s1 First source string.
1421     * @param s2 Second source string.
1422     *
1423     * @param options A bit set of options:
1424     *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1425     *     Case-sensitive comparison in code unit order, and the input strings
1426     *     are quick-checked for FCD.
1427     *
1428     *   - INPUT_IS_FCD
1429     *     Set if the caller knows that both s1 and s2 fulfill the FCD 
1430     *     conditions. If not set, the function will quickCheck for FCD
1431     *     and normalize if necessary.
1432     *
1433     *   - COMPARE_CODE_POINT_ORDER
1434     *     Set to choose code point order instead of code unit order
1435     *
1436     *   - COMPARE_IGNORE_CASE
1437     *     Set to compare strings case-insensitively using case folding,
1438     *     instead of case-sensitively.
1439     *     If set, then the following case folding options are used.
1440     *
1441     * @return <0 or 0 or >0 as usual for string comparisons
1442     *
1443     * @see #normalize
1444     * @see #FCD
1445     * @stable ICU 2.8
1446     */
1447    public static int compare(String   s1, String   s2, int options) {
1448         
1449        return compare(s1.toCharArray(),0,s1.length(),
1450                       s2.toCharArray(),0,s2.length(),
1451                       options);
1452    }
1453     
1454    /**
1455     * Compare two strings for canonical equivalence.
1456     * Further options include case-insensitive comparison and
1457     * code point order (as opposed to code unit order).
1458     * Convenience method.
1459     *
1460     * @param s1 First source string.
1461     * @param s2 Second source string.
1462     *
1463     * @param options A bit set of options:
1464     *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1465     *     Case-sensitive comparison in code unit order, and the input strings
1466     *     are quick-checked for FCD.
1467     *
1468     *   - INPUT_IS_FCD
1469     *     Set if the caller knows that both s1 and s2 fulfill the FCD 
1470     *     conditions. If not set, the function will quickCheck for FCD
1471     *     and normalize if necessary.
1472     *
1473     *   - COMPARE_CODE_POINT_ORDER
1474     *     Set to choose code point order instead of code unit order
1475     *
1476     *   - COMPARE_IGNORE_CASE
1477     *     Set to compare strings case-insensitively using case folding,
1478     *     instead of case-sensitively.
1479     *     If set, then the following case folding options are used.
1480     *
1481     * @return <0 or 0 or >0 as usual for string comparisons
1482     *
1483     * @see #normalize
1484     * @see #FCD
1485     * @stable ICU 2.8
1486     */
1487    public static int compare(char[] s1, char[] s2, int options) {
1488        return compare(s1,0,s1.length,s2,0,s2.length,options);
1489    } 
1490        
1491    /**
1492     * Convenience method that can have faster implementation
1493     * by not allocating buffers.
1494     * @param char32a    the first code point to be checked against the
1495     * @param char32b    the second code point
1496     * @param options    A bit set of options
1497     * @stable ICU 2.8
1498     */
1499    // TODO: actually do the optimization when the guts of Normalizer are 
1500    // upgraded --has just dumb implementation for now
1501    public static int compare(int char32a, int char32b,int options) {
1502        return compare(UTF16.valueOf(char32a), UTF16.valueOf(char32b), options);
1503    }
1504    
1505    /**
1506     * Convenience method that can have faster implementation
1507     * by not allocating buffers.
1508     * @param char32a   the first code point to be checked against
1509     * @param str2      the second string
1510     * @param options   A bit set of options
1511     * @stable ICU 2.8
1512     */
1513    // TODO: actually do the optimization when the guts of Normalizer are 
1514    // upgraded --has just dumb implementation for now
1515    public static int compare(int char32a, String   str2, int options) {
1516        return compare(UTF16.valueOf(char32a), str2, options);
1517    }
1518   
1519    /**
1520     * Concatenate normalized strings, making sure that the result is normalized
1521     * as well.
1522     *
1523     * If both the left and the right strings are in
1524     * the normalization form according to "mode",
1525     * then the result will be
1526     *
1527     * <code>
1528     *     dest=normalize(left+right, mode)
1529     * </code>
1530     *
1531     * With the input strings already being normalized,
1532     * this function will use next() and previous()
1533     * to find the adjacent end pieces of the input strings.
1534     * Only the concatenation of these end pieces will be normalized and
1535     * then concatenated with the remaining parts of the input strings.
1536     *
1537     * It is allowed to have dest==left to avoid copying the entire left string.
1538     *
1539     * @param left Left source array, may be same as dest.
1540     * @param leftStart start in the left array.
1541     * @param leftLimit limit in the left array (==length)
1542     * @param right Right source array.
1543     * @param rightStart start in the right array.
1544     * @param rightLimit limit in the right array (==length)
1545     * @param dest The output buffer; can be null if destStart==destLimit==0 
1546     *              for pure preflighting.
1547     * @param destStart start in the destination array
1548     * @param destLimit limit in the destination array (==length)
1549     * @param mode The normalization mode.
1550     * @param options The normalization options, ORed together (0 for no options).
1551     * @return Length of output (number of chars) when successful or 
1552     *          IndexOutOfBoundsException
1553     * @exception IndexOutOfBoundsException whose message has the string 
1554     *             representation of destination capacity required. 
1555     * @see #normalize
1556     * @see #next
1557     * @see #previous
1558     * @exception IndexOutOfBoundsException if target capacity is less than the
1559     *             required length
1560     * @stable ICU 2.8
1561     */
1562    /* Concatenation of normalized strings --------------------------------- */
1563    
1564    public static int concatenate(char[] left,  int leftStart,  int leftLimit,
1565                                  char[] right, int rightStart, int rightLimit, 
1566                                  char[] dest,  int destStart,  int destLimit,
1567                                  Normalizer.Mode mode, int options) {
1568                               
1569    
1570        UCharacterIterator iter;
1571        
1572        int leftBoundary, rightBoundary, destLength;
1573    
1574        if(dest == null) {
1575            throw new IllegalArgumentException  ();
1576        }
1577    
1578        /* check for overlapping right and destination */
1579        if (right == dest && rightStart < destLimit && destStart < rightLimit) {
1580            throw new IllegalArgumentException  ("overlapping right and dst ranges");
1581        }
1582    
1583        /* allow left==dest */
1584    
1585        /*
1586         * Input: left[0..leftLength[ + right[0..rightLength[
1587         *
1588         * Find normalization-safe boundaries leftBoundary and rightBoundary
1589         * and copy the end parts together:
1590         * buffer=left[leftBoundary..leftLength[ + right[0..rightBoundary[
1591         *
1592         * dest=left[0..leftBoundary[ +
1593         *      normalize(buffer) +
1594         *      right[rightBoundary..rightLength[
1595         */
1596    
1597        /*
1598         * find a normalization boundary at the end of the left string
1599         * and copy the end part into the buffer
1600         */
1601
1602        iter = UCharacterIterator.getInstance(left, leftStart, leftLimit);
1603                                             
1604        iter.setIndex(iter.getLength()); /* end of left string */
1605        char[] buffer=new char[100];
1606        int bufferLength;
1607        bufferLength=previous(iter, buffer,0,buffer.length,mode,false,null,options);
1608        
1609        leftBoundary=iter.getIndex();
1610        
1611        if(bufferLength>buffer.length) {
1612            char[] newBuf = new char[buffer.length*2];
1613            buffer = newBuf;
1614            newBuf = null; // null the reference for GC
1615            /* just copy from the left string: we know the boundary already */
1616            System.arraycopy(left,leftBoundary,buffer,0,bufferLength);
1617        }
1618    
1619        /*
1620         * find a normalization boundary at the beginning of the right string
1621         * and concatenate the beginning part to the buffer
1622         */
1623
1624        iter = UCharacterIterator.getInstance(right, rightStart, rightLimit);
1625        
1626        rightBoundary=next(iter,buffer,bufferLength, buffer.length-bufferLength,
1627                           mode, false,null, options);
1628                           
1629        if(bufferLength>buffer.length) {
1630            char[] newBuf = new char[buffer.length*2];
1631            buffer = newBuf;
1632            newBuf = null; // null the reference for GC
1633            /* just copy from the right string: we know the boundary already */
1634            System.arraycopy(right,rightBoundary,buffer,
1635                             bufferLength,rightBoundary);
1636        }
1637
1638        bufferLength+=rightBoundary;
1639    
1640        /* copy left[0..leftBoundary[ to dest */
1641        if(left!=dest && leftBoundary>0 && (destLimit)>0) {
1642            System.arraycopy(left,0,dest,0, Math.min(leftBoundary,destLimit)); 
1643        }
1644        destLength=leftBoundary;
1645    
1646        /* concatenate the normalization of the buffer to dest */
1647        if(destLimit>destLength) {
1648            destLength+=Normalizer.normalize(buffer,0,bufferLength,dest,
1649                                             destLength,destLimit,mode,options);
1650            
1651        } else {
1652            destLength+=Normalizer.normalize(buffer, 0, bufferLength,null,0,0,mode,options);
1653        }
1654    
1655        /* concatenate right[rightBoundary..rightLength[ to dest */
1656        rightStart+=rightBoundary;
1657        int rightLength=(rightLimit-rightStart);
1658        if(rightLength>0 && destLimit>destLength) {
1659            System.arraycopy(right,rightStart,dest,destLength,
1660                             Math.min(rightLength,destLength)
1661                             );
1662        }
1663        destLength+=rightLength;
1664        
1665        if(destLength<=(destLimit-destStart)) {
1666            return destLength;
1667        } else {
1668            throw new IndexOutOfBoundsException  (Integer.toString(destLength));
1669        }  
1670    }
1671    
1672    /**
1673     * Concatenate normalized strings, making sure that the result is normalized
1674     * as well.
1675     *
1676     * If both the left and the right strings are in
1677     * the normalization form according to "mode",
1678     * then the result will be
1679     *
1680     * <code>
1681     *     dest=normalize(left+right, mode)
1682     * </code>
1683     *
1684     * For details see concatenate 
1685     *
1686     * @param left Left source string.
1687     * @param right Right source string.
1688     * @param mode The normalization mode.
1689     * @param options The normalization options, ORed together (0 for no options).
1690     * @return result
1691     *
1692     * @see #concatenate
1693     * @see #normalize
1694     * @see #next
1695     * @see #previous
1696     * @see #concatenate
1697     * @stable ICU 2.8
1698     */
1699    public static String   concatenate(char[] left, char[] right,Mode mode, int options) {
1700        char[] result = new char[(left.length+right.length)* MAX_BUF_SIZE_DECOMPOSE];
1701        for(;;) {
1702               
1703            int length = concatenate(left,  0, left.length,
1704                                     right, 0, right.length,
1705                                     result,0, result.length,
1706                                     mode, options);
1707            if(length<=result.length) {
1708                return new String  (result,0,length);
1709            } else {
1710                result = new char[length];
1711            }
1712        }            
1713    }
1714    
1715    /**
1716     * Concatenate normalized strings, making sure that the result is normalized
1717     * as well.
1718     *
1719     * If both the left and the right strings are in
1720     * the normalization form according to "mode",
1721     * then the result will be
1722     *
1723     * <code>
1724     *     dest=normalize(left+right, mode)
1725     * </code>
1726     *
1727     * For details see concatenate
1728     *
1729     * @param left Left source string.
1730     * @param right Right source string.
1731     * @param mode The normalization mode.
1732     * @param options The normalization options, ORed together (0 for no options).
1733     * @return result
1734     *
1735     * @see #concatenate
1736     * @see #normalize
1737     * @see #next
1738     * @see #previous
1739     * @see #concatenate
1740     * @stable ICU 2.8
1741     */
1742    public static String   concatenate(String   left, String   right,Mode mode, int options) {
1743        char[] result = new char[(left.length()+right.length())* MAX_BUF_SIZE_DECOMPOSE];
1744        for(;;) {
1745               
1746            int length = concatenate(left.toCharArray(), 0, left.length(),
1747                                     right.toCharArray(),0, right.length(),
1748                                     result,             0, result.length,
1749                                     mode, options);
1750            if(length<=result.length) {
1751                return new String  (result,0,length);
1752            } else {
1753                result = new char[length];
1754            }
1755        }            
1756    }
1757    
1758    /**
1759     * Gets the FC_NFKC closure set from the normalization data
1760     * @param c The code point whose closure set is to be retrieved
1761     * @param dest The char array to recive the closure set
1762     * @internal
1763     * @deprecated This API is ICU internal only.
1764     */
1765    public static int getFC_NFKC_Closure(int c,char[] dest) {
1766        return NormalizerImpl.getFC_NFKC_Closure(c,dest);
1767    }
1768    /**
1769     * Gets the FC_NFKC closure set from the normalization data
1770     * @param c The the code point whose closure set is to be retrieved
1771     * @return String representation of the closure set
1772     * @internal
1773     * @deprecated This API is ICU internal only.
1774     */ 
1775    public static String   getFC_NFKC_Closure(int c) {
1776        char[] dest = new char[10];
1777        for(;;) {
1778            int length = getFC_NFKC_Closure(c,dest);
1779            if(length<=dest.length) {
1780                return new String  (dest,0,length);
1781            } else {
1782                dest = new char[length];
1783            }
1784        }
1785    }
1786    //-------------------------------------------------------------------------
1787    // Iteration API
1788    //-------------------------------------------------------------------------
1789        
1790    /**
1791     * Return the current character in the normalized text->
1792     * @return The codepoint as an int
1793     * @stable ICU 2.8
1794     */
1795    public int current() {
1796        if(bufferPos<bufferLimit || nextNormalize()) {
1797            return getCodePointAt(bufferPos);
1798        } else {
1799            return DONE;
1800        }
1801    }
1802        
1803    /**
1804     * Return the next character in the normalized text and advance
1805     * the iteration position by one.  If the end
1806     * of the text has already been reached, {@link #DONE} is returned.
1807     * @return The codepoint as an int
1808     * @stable ICU 2.8
1809     */
1810    public int next() {
1811        if(bufferPos<bufferLimit ||  nextNormalize()) {
1812            int c=getCodePointAt(bufferPos);
1813            bufferPos+=(c>0xFFFF) ? 2 : 1;
1814            return c;
1815        } else {
1816            return DONE;
1817        }
1818    }
1819        
1820        
1821    /**
1822     * Return the previous character in the normalized text and decrement
1823     * the iteration position by one.  If the beginning
1824     * of the text has already been reached, {@link #DONE} is returned.
1825     * @return The codepoint as an int
1826     * @stable ICU 2.8
1827     */
1828    public int previous() {
1829        if(bufferPos>0 || previousNormalize()) {
1830            int c=getCodePointAt(bufferPos-1);
1831            bufferPos-=(c>0xFFFF) ? 2 : 1;
1832            return c;
1833        } else {
1834            return DONE;
1835        }
1836    }
1837        
1838    /**
1839     * Reset the index to the beginning of the text.
1840     * This is equivalent to setIndexOnly(startIndex)).
1841     * @stable ICU 2.8
1842     */
1843    public void reset() {
1844        text.setIndex(0);
1845        currentIndex=nextIndex=0;
1846        clearBuffer();
1847    }
1848    
1849    /**
1850     * Set the iteration position in the input text that is being normalized,
1851     * without any immediate normalization.
1852     * After setIndexOnly(), getIndex() will return the same index that is
1853     * specified here.
1854     *
1855     * @param index the desired index in the input text.
1856     * @stable ICU 2.8
1857     */
1858    public void setIndexOnly(int index) {
1859        text.setIndex(index);
1860        currentIndex=nextIndex=index; // validates index
1861        clearBuffer();
1862    }
1863        
1864    /**
1865     * Set the iteration position in the input text that is being normalized
1866     * and return the first normalized character at that position.
1867     * <p>
1868     * <b>Note:</b> This method sets the position in the <em>input</em> text,
1869     * while {@link #next} and {@link #previous} iterate through characters
1870     * in the normalized <em>output</em>.  This means that there is not
1871     * necessarily a one-to-one correspondence between characters returned
1872     * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
1873     * returned from <tt>setIndex</tt> and {@link #getIndex}.
1874     * <p>
1875     * @param index the desired index in the input text->
1876     *
1877     * @return   the first normalized character that is the result of iterating
1878     *            forward starting at the given index.
1879     *
1880     * @throws IllegalArgumentException if the given index is less than
1881     *          {@link #getBeginIndex} or greater than {@link #getEndIndex}.
1882     * @return The codepoint as an int
1883     * @deprecated ICU 3.2
1884     * @obsolete ICU 3.2
1885     */
1886     ///CLOVER:OFF
1887     public int setIndex(int index) {
1888         setIndexOnly(index);
1889         return current();
1890     }
1891     ///CLOVER:ON
1892    /**
1893     * Retrieve the index of the start of the input text. This is the begin 
1894     * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the 
1895     * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
1896     * @deprecated ICU 2.2. Use startIndex() instead.
1897     * @return The codepoint as an int
1898     * @see #startIndex
1899     */
1900    public int getBeginIndex() {
1901        return 0;
1902    }
1903
1904    /**
1905     * Retrieve the index of the end of the input text.  This is the end index
1906     * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
1907     * over which this <tt>Normalizer</tt> is iterating
1908     * @deprecated ICU 2.2. Use endIndex() instead.
1909     * @return The codepoint as an int
1910     * @see #endIndex
1911     */
1912    public int getEndIndex() {
1913        return endIndex();
1914    }
1915    /**
1916     * Return the first character in the normalized text->  This resets
1917     * the <tt>Normalizer's</tt> position to the beginning of the text->
1918     * @return The codepoint as an int
1919     * @stable ICU 2.8
1920     */
1921    public int first() {
1922        reset();
1923        return next();
1924    }
1925        
1926    /**
1927     * Return the last character in the normalized text->  This resets
1928     * the <tt>Normalizer's</tt> position to be just before the
1929     * the input text corresponding to that normalized character.
1930     * @return The codepoint as an int
1931     * @stable ICU 2.8
1932     */
1933    public int last() {
1934        text.setToLimit();
1935        currentIndex=nextIndex=text.getIndex();
1936        clearBuffer();
1937        return previous();
1938    }
1939        
1940    /**
1941     * Retrieve the current iteration position in the input text that is
1942     * being normalized.  This method is useful in applications such as
1943     * searching, where you need to be able to determine the position in
1944     * the input text that corresponds to a given normalized output character.
1945     * <p>
1946     * <b>Note:</b> This method sets the position in the <em>input</em>, while
1947     * {@link #next} and {@link #previous} iterate through characters in the
1948     * <em>output</em>.  This means that there is not necessarily a one-to-one
1949     * correspondence between characters returned by <tt>next</tt> and
1950     * <tt>previous</tt> and the indices passed to and returned from
1951     * <tt>setIndex</tt> and {@link #getIndex}.
1952     * @return The current iteration position
1953     * @stable ICU 2.8
1954     */
1955    public int getIndex() {
1956        if(bufferPos<bufferLimit) {
1957            return currentIndex;
1958        } else {
1959            return nextIndex;
1960        }
1961    }
1962        
1963    /**
1964     * Retrieve the index of the start of the input text. This is the begin 
1965     * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the 
1966     * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
1967     * @return The current iteration position
1968     * @stable ICU 2.8
1969     */
1970    public int startIndex() {
1971        return 0;
1972    }
1973        
1974    /**
1975     * Retrieve the index of the end of the input text->  This is the end index
1976     * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
1977     * over which this <tt>Normalizer</tt> is iterating
1978     * @return The current iteration position
1979     * @stable ICU 2.8
1980     */
1981    public int endIndex() {
1982        return text.getLength();
1983    }
1984    
1985    //-------------------------------------------------------------------------
1986    // Property access methods
1987    //-------------------------------------------------------------------------
1988    /**
1989     * Set the normalization mode for this object.
1990     * <p>
1991     * <b>Note:</b>If the normalization mode is changed while iterating
1992     * over a string, calls to {@link #next} and {@link #previous} may
1993     * return previously buffers characters in the old normalization mode
1994     * until the iteration is able to re-sync at the next base character.
1995     * It is safest to call {@link #setText setText()}, {@link #first},
1996     * {@link #last}, etc. after calling <tt>setMode</tt>.
1997     * <p>
1998     * @param newMode the new mode for this <tt>Normalizer</tt>.
1999     * The supported modes are:
2000     * <ul>
2001     *  <li>{@link #COMPOSE}        - Unicode canonical decompositiion
2002     *                                  followed by canonical composition.
2003     *  <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decompositiion
2004     *                                  follwed by canonical composition.
2005     *  <li>{@link #DECOMP}         - Unicode canonical decomposition
2006     *  <li>{@link #DECOMP_COMPAT}  - Unicode compatibility decomposition.
2007     *  <li>{@link #NO_OP}          - Do nothing but return characters
2008     *                                  from the underlying input text.
2009     * </ul>
2010     *
2011     * @see #getMode
2012     * @stable ICU 2.8
2013     */
2014    public void setMode(Mode newMode) {
2015        mode = newMode;
2016    }
2017    /**
2018     * Return the basic operation performed by this <tt>Normalizer</tt>
2019     *
2020     * @see #setMode
2021     * @stable ICU 2.8
2022     */
2023    public Mode getMode() {
2024        return mode;
2025    }
2026    /**
2027     * Set options that affect this <tt>Normalizer</tt>'s operation.
2028     * Options do not change the basic composition or decomposition operation
2029     * that is being performed , but they control whether
2030     * certain optional portions of the operation are done.
2031     * Currently the only available option is:
2032     * <p>
2033     * <ul>
2034     *   <li>{@link #UNICODE_3_2} - Use Normalization conforming to Unicode version 3.2.
2035     * </ul>
2036     * <p>
2037     * @param   option  the option whose value is to be set.
2038     * @param   value   the new setting for the option.  Use <tt>true</tt> to
2039     *                  turn the option on and <tt>false</tt> to turn it off.
2040     *
2041     * @see #getOption
2042     * @stable ICU 2.6
2043     */
2044    public void setOption(int option,boolean value) {
2045        if (value) {
2046            options |= option;
2047        } else {
2048            options &= (~option);
2049        }
2050    }
2051        
2052    /**
2053     * Determine whether an option is turned on or off.
2054     * <p>
2055     * @see #setOption
2056     * @stable ICU 2.6
2057     */
2058    public int getOption(int option) {
2059        if((options & option)!=0) {
2060            return 1 ;
2061        } else {
2062            return 0;
2063        }
2064    }
2065    
2066    /**
2067     * Gets the underlying text storage
2068     * @param fillIn the char buffer to fill the UTF-16 units.
2069     *         The length of the buffer should be equal to the length of the
2070     *         underlying text storage
2071     * @throws IndexOutOfBoundsException
2072     * @see   #getLength
2073     * @stable ICU 2.8
2074     */
2075    public int getText(char[] fillIn) {
2076        return text.getText(fillIn);
2077    }
2078    
2079    /**
2080     * Gets the length of underlying text storage
2081     * @return the length
2082     * @stable ICU 2.8
2083     */ 
2084    public int getLength() {
2085        return text.getLength();
2086    }
2087    
2088    /**
2089     * Returns the text under iteration as a string
2090     * @return a copy of the text under iteration.
2091     * @stable ICU 2.8
2092     */
2093    public String   getText() {
2094        return text.getText();
2095    }
2096    
2097    /**
2098     * Set the input text over which this <tt>Normalizer</tt> will iterate.
2099     * The iteration position is set to the beginning of the input text->
2100     * @param newText   The new string to be normalized.
2101     * @stable ICU 2.8
2102     */
2103    public void setText(StringBuffer   newText) {
2104        
2105        UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
2106        if (newIter == null) {
2107            throw new IllegalStateException  ("Could not create a new UCharacterIterator");
2108        }  
2109        text = newIter;
2110        reset();
2111    }
2112        
2113    /**
2114     * Set the input text over which this <tt>Normalizer</tt> will iterate.
2115     * The iteration position is set to the beginning of the input text->
2116     * @param newText   The new string to be normalized.
2117     * @stable ICU 2.8
2118     */
2119    public void setText(char[] newText) {
2120        
2121        UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
2122        if (newIter == null) {
2123            throw new IllegalStateException  ("Could not create a new UCharacterIterator");
2124        }  
2125        text = newIter;
2126        reset();
2127    }
2128    
2129    /**
2130     * Set the input text over which this <tt>Normalizer</tt> will iterate.
2131     * The iteration position is set to the beginning of the input text->
2132     * @param newText   The new string to be normalized.
2133     * @stable ICU 2.8
2134     */
2135    public void setText(String   newText) {
2136            
2137        UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
2138        if (newIter == null) {
2139            throw new IllegalStateException  ("Could not create a new UCharacterIterator");
2140        }  
2141        text = newIter;
2142        reset();
2143    }
2144    
2145    /**
2146     * Set the input text over which this <tt>Normalizer</tt> will iterate.
2147     * The iteration position is set to the beginning of the input text->
2148     * @param newText   The new string to be normalized.
2149     * @stable ICU 2.8
2150     */
2151    public void setText(CharacterIterator   newText) {
2152        
2153        UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
2154        if (newIter == null) {
2155            throw new IllegalStateException  ("Could not create a new UCharacterIterator");
2156        }  
2157        text = newIter;
2158        reset();
2159    }
2160    
2161    /**
2162     * Set the input text over which this <tt>Normalizer</tt> will iterate.
2163     * The iteration position is set to the beginning of the string.
2164     * @param newText   The new string to be normalized.
2165     * @stable ICU 2.8
2166     */
2167    public void setText(UCharacterIterator newText) { 
2168        try{
2169            UCharacterIterator newIter = (UCharacterIterator)newText.clone();
2170            if (newIter == null) {
2171                throw new IllegalStateException  ("Could not create a new UCharacterIterator");
2172            }
2173            text = newIter;
2174            reset();
2175        }catch(CloneNotSupportedException   e) {
2176            throw new IllegalStateException  ("Could not clone the UCharacterIterator");
2177        }
2178    }
2179    
2180    //-------------------------------------------------------------------------
2181    // Private utility methods
2182    //-------------------------------------------------------------------------
2183    
2184
2185    /* backward iteration --------------------------------------------------- */
2186               
2187    /*
2188     * read backwards and get norm32
2189     * return 0 if the character is <minC
2190     * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first 
2191     * surrogate but read second!)
2192     */
2193
2194    private static  long getPrevNorm32(UCharacterIterator src, 
2195                                       int/*unsigned*/ minC, 
2196                                       int/*unsigned*/ mask, 
2197                                       char[] chars) {
2198        long norm32;
2199        int ch=0;
2200        /* need src.hasPrevious() */
2201        if((ch=src.previous()) == UCharacterIterator.DONE) {
2202            return 0;
2203        }
2204        chars[0]=(char)ch;
2205        chars[1]=0;
2206    
2207        /* check for a surrogate before getting norm32 to see if we need to 
2208         * predecrement further */
2209        if(chars[0]<minC) {
2210            return 0;
2211        } else if(!UTF16.isSurrogate(chars[0])) {
2212            return NormalizerImpl.getNorm32(chars[0]);
2213        } else if(UTF16.isLeadSurrogate(chars[0]) || (src.getIndex()==0)) {
2214            /* unpaired surrogate */
2215            chars[1]=(char)src.current();
2216            return 0;
2217        } else if(UTF16.isLeadSurrogate(chars[1]=(char)src.previous())) {
2218            norm32=NormalizerImpl.getNorm32(chars[1]);
2219            if((norm32&mask)==0) {
2220                /* all surrogate pairs with this lead surrogate have irrelevant 
2221                 * data */
2222                return 0;
2223            } else {
2224                /* norm32 must be a surrogate special */
2225                return NormalizerImpl.getNorm32FromSurrogatePair(norm32,chars[0]);
2226            }
2227        } else {
2228            /* unpaired second surrogate, undo the c2=src.previous() movement */
2229            src.moveIndex( 1);
2230            return 0;
2231        }
2232    }
2233 
2234    private interface IsPrevBoundary{
2235        public boolean isPrevBoundary(UCharacterIterator src,
2236                                      int/*unsigned*/ minC, 
2237                                      int/*unsigned*/ mask, 
2238                                      char[] chars);
2239    }
2240    private static final class IsPrevNFDSafe implements IsPrevBoundary{
2241        /*
2242         * for NF*D:
2243         * read backwards and check if the lead combining class is 0
2244         * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first 
2245         * surrogate but read second!)
2246         */
2247        public boolean isPrevBoundary(UCharacterIterator src,
2248                                      int/*unsigned*/ minC, 
2249                                      int/*unsigned*/ ccOrQCMask, 
2250                                      char[] chars) {
2251    
2252            return NormalizerImpl.isNFDSafe(getPrevNorm32(src, minC, 
2253                                                          ccOrQCMask, chars), 
2254                                            ccOrQCMask, 
2255                                            ccOrQCMask& NormalizerImpl.QC_MASK);
2256        }
2257    }
2258    
2259    private static final class IsPrevTrueStarter implements IsPrevBoundary{
2260        /*
2261         * read backwards and check if the character is (or its decomposition 
2262         * begins with) a "true starter" (cc==0 and NF*C_YES)
2263         * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first 
2264         * surrogate but read second!)
2265         */
2266        public boolean isPrevBoundary(UCharacterIterator src, 
2267                                      int/*unsigned*/ minC,
2268                                      int/*unsigned*/ ccOrQCMask,
2269                                      char[] chars) {
2270            long norm32; 
2271            int/*unsigned*/ decompQCMask;
2272            
2273            decompQCMask=(ccOrQCMask<<2)&0xf; /*decomposition quick check mask*/
2274            norm32=getPrevNorm32(src, minC, ccOrQCMask|decompQCMask, chars);
2275            return NormalizerImpl.isTrueStarter(norm32,ccOrQCMask,decompQCMask);
2276        }
2277    }
2278    
2279    private static int findPreviousIterationBoundary(UCharacterIterator src,
2280                                                     IsPrevBoundary obj, 
2281                                                     int/*unsigned*/ minC,
2282                                                     int/*mask*/ mask,
2283                                                     char[] buffer, 
2284                                                     int[] startIndex) {
2285        char[] chars=new char[2];
2286        boolean isBoundary;
2287    
2288        /* fill the buffer from the end backwards */
2289        startIndex[0] = buffer.length;
2290        chars[0]=0;
2291        while(src.getIndex()>0 && chars[0]!=UCharacterIterator.DONE) {
2292            isBoundary=obj.isPrevBoundary(src, minC, mask, chars);
2293    
2294            /* always write this character to the front of the buffer */
2295            /* make sure there is enough space in the buffer */
2296            if(startIndex[0] < (chars[1]==0 ? 1 : 2)) {
2297
2298                // grow the buffer
2299                char[] newBuf = new char[buffer.length*2];
2300                /* move the current buffer contents up */
2301                System.arraycopy(buffer,startIndex[0],newBuf,
2302                                 newBuf.length-(buffer.length-startIndex[0]),
2303                                 buffer.length-startIndex[0]);
2304                //adjust the startIndex
2305                startIndex[0]+=newBuf.length-buffer.length;
2306                
2307                buffer=newBuf;
2308                newBuf=null;                
2309                
2310            }
2311    
2312            buffer[--startIndex[0]]=chars[0];
2313            if(chars[1]!=0) {
2314                buffer[--startIndex[0]]=chars[1];
2315            }
2316    
2317            /* stop if this just-copied character is a boundary */
2318            if(isBoundary) {
2319                break;
2320            }
2321        }
2322    
2323        /* return the length of the buffer contents */
2324        return buffer.length-startIndex[0];
2325    }
2326    
2327    private static int previous(UCharacterIterator src,
2328                                char[] dest, int destStart, int destLimit, 
2329                                Mode mode, 
2330                                boolean doNormalize, 
2331                                boolean[] pNeededToNormalize,
2332                                int options) {
2333
2334        IsPrevBoundary isPreviousBoundary;
2335        int destLength, bufferLength;
2336        int/*unsigned*/ mask;
2337        
2338        int c,c2;
2339        
2340        char minC;
2341        int destCapacity = destLimit-destStart;
2342        destLength=0;
2343        
2344        
2345        if(pNeededToNormalize!=null) {
2346            pNeededToNormalize[0]=false;
2347        }
2348        minC = (char)mode.getMinC();
2349        mask = mode.getMask();
2350        isPreviousBoundary = mode.getPrevBoundary();
2351
2352        if(isPreviousBoundary==null) {
2353            destLength=0;
2354            if((c=src.previous())>=0) {
2355                destLength=1;
2356                if(UTF16.isTrailSurrogate((char)c)) {
2357                    c2= src.previous();
2358                    if(c2!= UCharacterIterator.DONE) {
2359                        if(UTF16.isLeadSurrogate((char)c2)) {
2360                            if(destCapacity>=2) {
2361                                dest[1]=(char)c; // trail surrogate 
2362                                destLength=2;
2363                            }
2364                            // lead surrogate to be written below 
2365                            c=c2; 
2366                        } else {
2367                            src.moveIndex(1);
2368                        }
2369                    }
2370                }
2371    
2372                if(destCapacity>0) {
2373                    dest[0]=(char)c;
2374                }
2375            }
2376            return destLength;
2377        }
2378        
2379        char[] buffer = new char[100];
2380        int[] startIndex= new int[1];
2381        bufferLength=findPreviousIterationBoundary(src,
2382                                                   isPreviousBoundary, 
2383                                                   minC, mask,buffer, 
2384                                                   startIndex);
2385        if(bufferLength>0) {
2386            if(doNormalize) {
2387                destLength=Normalizer.normalize(buffer,startIndex[0],
2388                                                startIndex[0]+bufferLength,
2389                                                dest, destStart,destLimit,
2390                                                mode, options);
2391                
2392                if(pNeededToNormalize!=null) {
2393                    pNeededToNormalize[0]=(boolean)(destLength!=bufferLength ||
2394                                                    Utility.arrayRegionMatches(
2395                                                                               buffer,0,dest,
2396                                                                               destStart,destLimit
2397                                                                               ));
2398                }
2399            } else {
2400                /* just copy the source characters */
2401                if(destCapacity>0) {
2402                    System.arraycopy(buffer,startIndex[0],dest,0,
2403                                     (bufferLength<destCapacity) ? 
2404                                     bufferLength : destCapacity
2405                                     );
2406                }
2407            }
2408        } 
2409
2410    
2411        return destLength;
2412    }
2413
2414 
2415    
2416    /* forward iteration ---------------------------------------------------- */
2417    /*
2418     * read forward and check if the character is a next-iteration boundary
2419     * if c2!=0 then (c, c2) is a surrogate pair
2420     */
2421    private interface IsNextBoundary{
2422        boolean isNextBoundary(UCharacterIterator src, 
2423                               int/*unsigned*/ minC, 
2424                               int/*unsigned*/ mask, 
2425                               int[] chars);
2426    }   
2427    /*
2428     * read forward and get norm32
2429     * return 0 if the character is <minC
2430     * if c2!=0 then (c2, c) is a surrogate pair
2431     * always reads complete characters
2432     */
2433    private static long /*unsigned*/ getNextNorm32(UCharacterIterator src, 
2434                                                   int/*unsigned*/ minC, 
2435                                                   int/*unsigned*/ mask, 
2436                                                   int[] chars) {
2437        long norm32;
2438    
2439        /* need src.hasNext() to be true */
2440        chars[0]=src.next();
2441        chars[1]=0;
2442    
2443        if(chars[0]<minC) {
2444            return 0;
2445        }
2446    
2447        norm32=NormalizerImpl.getNorm32((char)chars[0]);
2448        if(UTF16.isLeadSurrogate((char)chars[0])) {
2449            if(src.current()!=UCharacterIterator.DONE &&
2450               UTF16.isTrailSurrogate((char)(chars[1]=src.current()))) {
2451                src.moveIndex(1); /* skip the c2 surrogate */
2452                if((norm32&mask)==0) {
2453                    /* irrelevant data */
2454                    return 0;
2455                } else {
2456                    /* norm32 must be a surrogate special */
2457                    return NormalizerImpl.getNorm32FromSurrogatePair(norm32,(char)chars[1]);
2458                }
2459            } else {
2460                /* unmatched surrogate */
2461                return 0;
2462            }
2463        }
2464        return norm32;
2465    }
2466
2467
2468    /*
2469     * for NF*D:
2470     * read forward and check if the lead combining class is 0
2471     * if c2!=0 then (c, c2) is a surrogate pair
2472     */
2473    private static final class IsNextNFDSafe implements IsNextBoundary{
2474        public boolean isNextBoundary(UCharacterIterator src, 
2475                                      int/*unsigned*/ minC, 
2476                                      int/*unsigned*/ ccOrQCMask, 
2477                                      int[] chars) {
2478            return NormalizerImpl.isNFDSafe(getNextNorm32(src,minC,ccOrQCMask,chars), 
2479                                            ccOrQCMask, ccOrQCMask&NormalizerImpl.QC_MASK);
2480        }
2481    }
2482    
2483    /*
2484     * for NF*C:
2485     * read forward and check if the character is (or its decomposition begins 
2486     * with) a "true starter" (cc==0 and NF*C_YES)
2487     * if c2!=0 then (c, c2) is a surrogate pair
2488     */
2489    private static final class IsNextTrueStarter implements IsNextBoundary{
2490        public boolean isNextBoundary(UCharacterIterator src, 
2491                                      int/*unsigned*/ minC, 
2492                                      int/*unsigned*/ ccOrQCMask, 
2493                                      int[] chars) {
2494            long norm32;
2495            int/*unsigned*/ decompQCMask;
2496            
2497            decompQCMask=(ccOrQCMask<<2)&0xf; /*decomposition quick check mask*/
2498            norm32=getNextNorm32(src, minC, ccOrQCMask|decompQCMask, chars);
2499            return NormalizerImpl.isTrueStarter(norm32, ccOrQCMask, decompQCMask);
2500        }
2501    }
2502    
2503    private static int findNextIterationBoundary(UCharacterIterator src,
2504                                                 IsNextBoundary obj, 
2505                                                 int/*unsigned*/ minC, 
2506                                                 int/*unsigned*/ mask,
2507                                                 char[] buffer) {
2508        int[] chars = new int[2];
2509        int bufferIndex =0;
2510        
2511        if(src.current()==UCharacterIterator.DONE) {
2512            return 0;
2513        }
2514        /* get one character and ignore its properties */
2515        chars[0]=src.next();
2516        buffer[0]=(char)chars[0];
2517        bufferIndex=1;
2518        
2519        if(UTF16.isLeadSurrogate((char)chars[0])&& 
2520           src.current()!=UCharacterIterator.DONE) {
2521            if(UTF16.isTrailSurrogate((char)(chars[1]=src.next()))) {
2522                buffer[bufferIndex++]=(char)chars[1];
2523            } else {
2524                src.moveIndex(-1); /* back out the non-trail-surrogate */
2525            }
2526        }
2527    
2528        /* get all following characters until we see a boundary */
2529        /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff 
2530         * is part of the string */
2531        while( src.current()!=UCharacterIterator.DONE) {
2532            if(obj.isNextBoundary(src, minC, mask, chars)) {
2533                /* back out the latest movement to stop at the boundary */
2534                src.moveIndex(chars[1]==0 ? -1 : -2);
2535                break;
2536            } else {
2537                if(bufferIndex+(chars[1]==0 ? 1 : 2)<=buffer.length) {
2538                    buffer[bufferIndex++]=(char)chars[0];
2539                    if(chars[1]!=0) {
2540                        buffer[bufferIndex++]=(char)chars[1];
2541                    }
2542                } else {
2543                    char[] newBuf = new char[buffer.length    *2];
2544                    System.arraycopy(buffer,0,newBuf,0,bufferIndex);
2545                    buffer = newBuf;
2546                    buffer[bufferIndex++]=(char)chars[0];
2547                    if(chars[1]!=0) {
2548                        buffer[bufferIndex++]=(char)chars[1];
2549                    }
2550                }
2551            }
2552        }
2553    
2554        /* return the length of the buffer contents */
2555        return bufferIndex;
2556    }
2557    
2558    private static int next(UCharacterIterator src,
2559                            char[] dest, int destStart, int destLimit,
2560                            Normalizer.Mode mode,
2561                            boolean doNormalize, 
2562                            boolean[] pNeededToNormalize,
2563                            int options) {
2564                                
2565        IsNextBoundary isNextBoundary;
2566        int /*unsigned*/ mask;
2567        int /*unsigned*/ bufferLength;
2568        int c,c2;
2569        char minC;
2570        int destCapacity = destLimit - destStart;
2571        int destLength = 0;
2572        
2573        if(pNeededToNormalize!=null) {
2574            pNeededToNormalize[0]=false;
2575        }
2576
2577        minC = (char)mode.getMinC();
2578        mask = mode.getMask();
2579        isNextBoundary = mode.getNextBoundary();
2580        
2581        if(isNextBoundary==null) {
2582            destLength=0;
2583            c=src.next();
2584            if(c!=UCharacterIterator.DONE) {
2585                destLength=1;
2586                if(UTF16.isLeadSurrogate((char)c)) {
2587                    c2= src.next();
2588                    if(c2!= UCharacterIterator.DONE) {
2589                        if(UTF16.isTrailSurrogate((char)c2)) {
2590                            if(destCapacity>=2) {
2591                                dest[1]=(char)c2; // trail surrogate 
2592                                destLength=2;
2593                            }
2594                            // lead surrogate to be written below 
2595                        } else {
2596                            src.moveIndex(-1);
2597                        }
2598                    }
2599                }
2600    
2601                if(destCapacity>0) {
2602                    dest[0]=(char)c;
2603                }
2604            }
2605            return destLength;
2606        }
2607
2608        char[] buffer=new char[100];
2609        int[] startIndex = new int[1];
2610        
2611        bufferLength=findNextIterationBoundary(src,isNextBoundary, minC, mask,
2612                                               buffer);
2613        if(bufferLength>0) {
2614            if(doNormalize) {
2615                destLength=mode.normalize(buffer,startIndex[0],bufferLength,
2616                                          dest,destStart,destLimit, options);
2617                
2618                if(pNeededToNormalize!=null) {
2619                    pNeededToNormalize[0]=(boolean)(destLength!=bufferLength ||
2620                                                    Utility.arrayRegionMatches(buffer,startIndex[0],
2621                                                                               dest,destStart,
2622                                                                               destLength));
2623                }
2624            } else {
2625                /* just copy the source characters */
2626                if(destCapacity>0) {
2627                    System.arraycopy(buffer,0,dest,destStart,
2628                                     Math.min(bufferLength,destCapacity)
2629                                     );
2630                }
2631                                      
2632               
2633            }
2634        }
2635        return destLength;
2636    } 
2637
2638    private void clearBuffer() {
2639        bufferLimit=bufferStart=bufferPos=0;
2640    }
2641        
2642    private boolean nextNormalize() {
2643        
2644        clearBuffer();
2645        currentIndex=nextIndex;
2646        text.setIndex(nextIndex);
2647                
2648        bufferLimit=next(text,buffer,bufferStart,buffer.length,mode,true,null,options);
2649                        
2650        nextIndex=text.getIndex();
2651        return (bufferLimit>0);
2652    }
2653        
2654    private boolean     previousNormalize() {
2655
2656        clearBuffer();
2657        nextIndex=currentIndex;
2658        text.setIndex(currentIndex);
2659        bufferLimit=previous(text,buffer,bufferStart,buffer.length,mode,true,null,options);
2660                
2661        currentIndex=text.getIndex();
2662        bufferPos = bufferLimit;
2663        return bufferLimit>0;
2664    }
2665    
2666    private int getCodePointAt(int index) {
2667        if( UTF16.isSurrogate(buffer[index])) {
2668            if(UTF16.isLeadSurrogate(buffer[index])) {
2669                if((index+1)<bufferLimit &&
2670                   UTF16.isTrailSurrogate(buffer[index+1])) {
2671                    return UCharacterProperty.getRawSupplementary(
2672                                                                  buffer[index], 
2673                                                                  buffer[index+1]
2674                                                                  );
2675                }
2676            }else if(UTF16.isTrailSurrogate(buffer[index])) {
2677                if(index>0 && UTF16.isLeadSurrogate(buffer[index-1])) {
2678                    return UCharacterProperty.getRawSupplementary(
2679                                                                  buffer[index-1],
2680                                                                  buffer[index]
2681                                                                  );
2682                }
2683            }   
2684        }
2685        return buffer[index];
2686        
2687    }
2688    
2689    /**
2690     * Internal API
2691     * @internal
2692     * @deprecated This API is ICU internal only.
2693     */
2694    public static boolean isNFSkippable(int c, Mode mode) {
2695        return mode.isNFSkippable(c);
2696    }    
2697
2698        
2699    private static int internalCompare(char[] s1, int s1Start,int s1Limit,
2700                                       char[] s2, int s2Start,int s2Limit,
2701                                       int options) {
2702                                  
2703        char[] fcd1  = new char[300];
2704        char[] fcd2  = new char[300];
2705        
2706        Normalizer.Mode mode;
2707        int result;
2708        
2709        if(    s1==null || s1Start<0 || s1Limit<0 || 
2710               s2==null || s2Start<0 || s2Limit<0 ||
2711               s1Limit<s1Start || s2Limit<s2Start
2712               ) {
2713                
2714            throw new IllegalArgumentException  ();
2715        }
2716
2717        UnicodeSet nx=NormalizerImpl.getNX((int)(options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT));
2718        options|= NormalizerImpl.COMPARE_EQUIV;
2719        result=0;
2720
2721        /*
2722         * UAX #21 Case Mappings, as fixed for Unicode version 4
2723         * (see Jitterbug 2021), defines a canonical caseless match as
2724         *
2725         * A string X is a canonical caseless match
2726         * for a string Y if and only if
2727         * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
2728         *
2729         * For better performance, we check for FCD (or let the caller tell us that
2730         * both strings are in FCD) for the inner normalization.
2731         * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
2732         * case-folding preserves the FCD-ness of a string.
2733         * The outer normalization is then only performed by NormalizerImpl.cmpEquivFold()
2734         * when there is a difference.
2735         *
2736         * Exception: When using the Turkic case-folding option, we do perform
2737         * full NFD first. This is because in the Turkic case precomposed characters
2738         * with 0049 capital I or 0069 small i fold differently whether they
2739         * are first decomposed or not, so an FCD check - a check only for
2740         * canonical order - is not sufficient.
2741         */
2742        if((options& Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) >0 ) {
2743            mode=Normalizer.NFD;
2744            options&=~ Normalizer.INPUT_IS_FCD;
2745        } else {
2746            mode=Normalizer.FCD;
2747        }
2748        if((options& Normalizer.INPUT_IS_FCD)==0) {
2749            char[] dest;
2750            int fcdLen1, fcdLen2;
2751            boolean isFCD1, isFCD2;
2752        
2753            // check if s1 and/or s2 fulfill the FCD conditions
2754            isFCD1= Normalizer.YES==mode.quickCheck(s1, s1Start, s1Limit, true, nx);
2755            isFCD2= Normalizer.YES==mode.quickCheck(s2, s2Start, s2Limit, true, nx);
2756            /*
2757             * ICU 2.4 had a further optimization:
2758             * If both strings were not in FCD, then they were both NFD'ed,
2759             * and the COMPARE_EQUIV option was turned off.
2760             * It is not entirely clear that this is valid with the current
2761             * definition of the canonical caseless match.
2762             * Therefore, ICU 2.6 removes that optimization.
2763             */
2764
2765            if(!isFCD1) {
2766                fcdLen1=mode.normalize(s1, 0, s1.length,
2767                                       fcd1, 0, fcd1.length,
2768                                       nx);
2769                                       
2770                if(fcdLen1>fcd1.length) {
2771                    dest=new char[fcdLen1];
2772                    fcdLen1=mode.normalize( s1, 0, s1.length,
2773                                            dest, 0, dest.length,
2774                                            nx);
2775                    s1=dest;
2776                } else {
2777                    s1=fcd1;
2778                }
2779                s1Limit=fcdLen1;
2780                s1Start=0;
2781            }
2782
2783            if(!isFCD2) {
2784                fcdLen2=mode.normalize(s2,s2Start,s2Limit,
2785                                       fcd2,0,fcd2.length,
2786                                       nx);
2787                
2788                if(fcdLen2>fcd2.length) {
2789                    dest=new char[fcdLen2];
2790                    fcdLen2=mode.normalize( s2,s2Start,s2Limit,
2791                                            dest,0,dest.length,
2792                                            nx);
2793                    s2=dest;
2794                } else {
2795                    s2=fcd2;
2796                }
2797                s2Limit=fcdLen2;
2798                s2Start=0;
2799            }
2800                
2801        }
2802        
2803        result=NormalizerImpl.cmpEquivFold(s1, s1Start, s1Limit, 
2804                                           s2, s2Start, s2Limit, options);
2805        return result;
2806    }    
2807    
2808    /**
2809     * Fetches the Unicode version burned into the Normalization data file
2810     * @return VersionInfo version information of the normalizer
2811     * @internal
2812     * @deprecated This API is ICU internal only.
2813     */
2814    static VersionInfo getUnicodeVersion() {
2815        return NormalizerImpl.getUnicodeVersion();
2816    }
2817}
2818
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Free Books Free Magazines
Popular Tags