KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > impl > NormalizerImpl


1  /*
2  *******************************************************************************
3  * Copyright (C) 1996-2005, International Business Machines Corporation and *
4  * others. All Rights Reserved. *
5  *******************************************************************************
6  */

7  
8 package com.ibm.icu.impl;
9 import java.io.ByteArrayInputStream JavaDoc;
10 import java.io.IOException JavaDoc;
11 import java.io.BufferedInputStream JavaDoc;
12 import java.io.InputStream JavaDoc;
13 import java.util.MissingResourceException JavaDoc;
14
15 import com.ibm.icu.text.Normalizer;
16 import com.ibm.icu.text.UTF16;
17 import com.ibm.icu.text.UnicodeSet;
18 import com.ibm.icu.text.UnicodeSetIterator;
19 import com.ibm.icu.util.RangeValueIterator;
20 import com.ibm.icu.util.VersionInfo;
21 import com.ibm.icu.lang.UCharacter;
22
23 /**
24  * @version 1.0
25  * @author Ram Viswanadha
26  */

27 public final class NormalizerImpl {
// Eagerly creates the single instance so that the normalization data is
// loaded as part of initializing this class.
static final NormalizerImpl IMPL;

static {
    try {
        IMPL = new NormalizerImpl();
    } catch (Exception e) {
        // MissingResourceException has no constructor that accepts a cause,
        // so the original failure is attached with initCause(); otherwise
        // its type and stack trace would be lost to whoever diagnoses this.
        MissingResourceException failure =
                new MissingResourceException(e.getMessage(), "", "");
        failure.initCause(e);
        throw failure;
    }
}
42     
43     static final int UNSIGNED_BYTE_MASK =0xFF;
44     static final long UNSIGNED_INT_MASK = 0xffffffffL;
45     /*
46      * This new implementation of the normalization code loads its data from
47      * unorm.icu, which is generated with the gennorm tool.
48      * The format of that file is described at the end of this file.
49      */

50     private static final String JavaDoc DATA_FILE_NAME = ICUResourceBundle.ICU_BUNDLE+"/unorm.icu";
51     
52     // norm32 value constants
53

54     // quick check flags 0..3 set mean "no" for their forms
55
public static final int QC_NFC=0x11; /* no|maybe */
56     public static final int QC_NFKC=0x22; /* no|maybe */
57     public static final int QC_NFD=4; /* no */
58     public static final int QC_NFKD=8; /* no */
59     
60     public static final int QC_ANY_NO=0xf;
61
62     /* quick check flags 4..5 mean "maybe" for their forms;
63      * test flags>=QC_MAYBE
64      */

65     public static final int QC_MAYBE=0x10;
66     public static final int QC_ANY_MAYBE=0x30;
67
68     public static final int QC_MASK=0x3f;
69
70     private static final int COMBINES_FWD=0x40;
71     private static final int COMBINES_BACK=0x80;
72     public static final int COMBINES_ANY=0xc0;
73     // UnicodeData.txt combining class in bits 15.
74
private static final int CC_SHIFT=8;
75     public static final int CC_MASK=0xff00;
76     // 16 bits for the index to UChars and other extra data
77
private static final int EXTRA_SHIFT=16;
78     // start of surrogate specials after shift
79
private static final int EXTRA_INDEX_TOP=0xfc00;
80
81     private static final int EXTRA_SURROGATE_MASK=0x3ff;
82     private static final int EXTRA_SURROGATE_TOP=0x3f0; /* hangul etc. */
83
84     private static final int EXTRA_HANGUL=EXTRA_SURROGATE_TOP;
85     private static final int EXTRA_JAMO_L=EXTRA_SURROGATE_TOP+1;/* ### not used */
86     private static final int EXTRA_JAMO_V=EXTRA_SURROGATE_TOP+2;
87     private static final int EXTRA_JAMO_T=EXTRA_SURROGATE_TOP+3;
88     
89     /* norm32 value constants using >16 bits */
90     private static final long MIN_SPECIAL = (long)(0xfc000000 & UNSIGNED_INT_MASK);
91     private static final long SURROGATES_TOP = (long)(0xfff00000 & UNSIGNED_INT_MASK);
92     private static final long MIN_HANGUL = (long)(0xfff00000 & UNSIGNED_INT_MASK);
93     private static final long MIN_JAMO_V = (long)(0xfff20000 & UNSIGNED_INT_MASK);
94     private static final long JAMO_V_TOP = (long)(0xfff30000 & UNSIGNED_INT_MASK);
95     
96     
97     /* indexes[] value names */
98     /* number of bytes in normalization trie */
99     static final int INDEX_TRIE_SIZE = 0;
100      /* number of chars in extra data */
101     static final int INDEX_CHAR_COUNT = 1;
102     /* number of uint16_t words for combining data */
103     static final int INDEX_COMBINE_DATA_COUNT = 2;
104     /* number of code points that combine forward */
105     static final int INDEX_COMBINE_FWD_COUNT = 3;
106     /* number of code points that combine forward and backward */
107     static final int INDEX_COMBINE_BOTH_COUNT = 4;
108     /* number of code points that combine backward */
109     static final int INDEX_COMBINE_BACK_COUNT = 5;
110      /* first code point with quick check NFC NO/MAYBE */
111     public static final int INDEX_MIN_NFC_NO_MAYBE = 6;
112     /* first code point with quick check NFKC NO/MAYBE */
113     public static final int INDEX_MIN_NFKC_NO_MAYBE = 7;
114      /* first code point with quick check NFD NO/MAYBE */
115     public static final int INDEX_MIN_NFD_NO_MAYBE = 8;
116     /* first code point with quick check NFKD NO/MAYBE */
117     public static final int INDEX_MIN_NFKD_NO_MAYBE = 9;
118     /* number of bytes in FCD trie */
119     static final int INDEX_FCD_TRIE_SIZE = 10;
120     /* number of bytes in the auxiliary trie */
121     static final int INDEX_AUX_TRIE_SIZE = 11;
122     /* number of uint16_t in the array of serialized USet */
123     static final int INDEX_CANON_SET_COUNT = 12;
124     /* changing this requires a new formatVersion */
125     static final int INDEX_TOP = 32;
126     
127     
128     /* AUX constants */
129     /* value constants for auxTrie */
130     private static final int AUX_UNSAFE_SHIFT = 11;
131     private static final int AUX_COMP_EX_SHIFT = 10;
132     private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12;
133     
134     private static final int AUX_MAX_FNC = ((int)1<<AUX_COMP_EX_SHIFT);
135     private static final int AUX_UNSAFE_MASK = (int)((1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK);
136     private static final int AUX_FNC_MASK = (int)((AUX_MAX_FNC-1) & UNSIGNED_INT_MASK);
137     private static final int AUX_COMP_EX_MASK = (int)((1<<AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK);
138     private static final long AUX_NFC_SKIP_F_MASK = ((UNSIGNED_INT_MASK&1)<<AUX_NFC_SKIPPABLE_F_SHIFT);
139     
140     /* canonStartSets[0..31] contains indexes for what is in the array */
141     /* number of uint16_t in canonical starter sets */
142     static final int SET_INDEX_CANON_SETS_LENGTH = 0;
143     /* number of uint16_t in the BMP search table (contains pairs) */
144     static final int SET_INDEX_CANON_BMP_TABLE_LENGTH = 1;
145     /* number of uint16_t in the supplementary search table(contains triplets)*/
146     static final int SET_INDEX_CANON_SUPP_TABLE_LENGTH = 2;
147     /* changing this requires a new formatVersion */
148     static final int SET_INDEX_TOP = 32;
149     
150     static final int CANON_SET_INDICIES_INDEX = 0;
151     static final int CANON_SET_START_SETS_INDEX = 1;
152     static final int CANON_SET_BMP_TABLE_INDEX = 2;
153     static final int CANON_SET_SUPP_TABLE_INDEX = 3;
154     /* 14 bit indexes to canonical USerializedSets */
155     static final int CANON_SET_MAX_CANON_SETS = 0x4000;
156     /* single-code point BMP sets are encoded directly in the search table
157      * except if result=0x4000..0x7fff
158      */

159     static final int CANON_SET_BMP_MASK = 0xc000;
160     static final int CANON_SET_BMP_IS_INDEX = 0x4000;
161     
162     private static final int MAX_BUFFER_SIZE = 20;
163     
164     /**
165      * Internal option for cmpEquivFold() for decomposing.
166      * If not set, just do strcasecmp().
167      * @internal
168      */

169      public static final int COMPARE_EQUIV = 0x80000;
170     
171     /*******************************/
172
173     /* Wrappers for Trie implementations */
174     static final class NormTrieImpl implements Trie.DataManipulate{
175         static IntTrie normTrie= null;
176        /**
177         * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
178         * data the index array offset of the indexes for that lead surrogate.
179         * @param property data value for a surrogate from the trie, including
180         * the folding offset
181         * @return data offset or 0 if there is no data for the lead surrogate
182         */

183         /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
184         public int getFoldingOffset(int value){
185             return BMP_INDEX_LENGTH+
186                     ((value>>(EXTRA_SHIFT-SURROGATE_BLOCK_BITS))&
187                     (0x3ff<<SURROGATE_BLOCK_BITS));
188         }
189         
190     }
191     static final class FCDTrieImpl implements Trie.DataManipulate{
192         static CharTrie fcdTrie=null;
193        /**
194         * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
195         * data the index array offset of the indexes for that lead surrogate.
196         * @param property data value for a surrogate from the trie, including
197         * the folding offset
198         * @return data offset or 0 if there is no data for the lead surrogate
199         */

200         /* fcdTrie: the folding offset is the lead FCD value itself */
201         public int getFoldingOffset(int value){
202             return value;
203         }
204     }
205     
206     static final class AuxTrieImpl implements Trie.DataManipulate{
207         static CharTrie auxTrie = null;
208        /**
209         * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
210         * data the index array offset of the indexes for that lead surrogate.
211         * @param property data value for a surrogate from the trie, including
212         * the folding offset
213         * @return data offset or 0 if there is no data for the lead surrogate
214         */

215         /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
216         public int getFoldingOffset(int value){
217             return (int)(value &AUX_FNC_MASK)<<SURROGATE_BLOCK_BITS;
218         }
219     }
220          
221     /****************************************************/
222     
223     
224     private static FCDTrieImpl fcdTrieImpl;
225     private static NormTrieImpl normTrieImpl;
226     private static AuxTrieImpl auxTrieImpl;
227     private static int[] indexes;
228     private static char[] combiningTable;
229     private static char[] extraData;
230     private static Object JavaDoc[] canonStartSets;
231     
232     private static boolean isDataLoaded;
233     private static boolean isFormatVersion_2_1;
234     private static boolean isFormatVersion_2_2;
235     private static byte[] unicodeVersion;
236     
237     /**
238      * Default buffer size of datafile
239      */

240     private static final int DATA_BUFFER_SIZE = 25000;
241     
242     /**
243      * FCD check: everything below this code point is known to have a 0
244      * lead combining class
245      */

246     public static final int MIN_WITH_LEAD_CC=0x300;
247
248
249     /**
250      * Bit 7 of the length byte for a decomposition string in extra data is
251      * a flag indicating whether the decomposition string is
252      * preceded by a 16-bit word with the leading and trailing cc
253      * of the decomposition (like for A-umlaut);
254      * if not, then both cc's are zero (like for compatibility ideographs).
255      */

256     private static final int DECOMP_FLAG_LENGTH_HAS_CC=0x80;
257     /**
258      * Bits 6..0 of the length byte contain the actual length.
259      */

260     private static final int DECOMP_LENGTH_MASK=0x7f;
261     
262     /** Length of the BMP portion of the index (stage 1) array. */
263     private static final int BMP_INDEX_LENGTH=0x10000>>Trie.INDEX_STAGE_1_SHIFT_;
264     /** Number of bits of a trail surrogate that are used in index table
265      * lookups.
266      */

267     private static final int SURROGATE_BLOCK_BITS=10-Trie.INDEX_STAGE_1_SHIFT_;
268
269
270    // public utility
271
public static int getFromIndexesArr(int index){
272         return indexes[index];
273    }
274    
275    // protected constructor ---------------------------------------------
276

277     /**
278     * Constructor
279     * @exception thrown when data reading fails or data corrupted
280     */

281     private NormalizerImpl() throws IOException JavaDoc {
282         //data should be loaded only once
283
if(!isDataLoaded){
284             
285             // jar access
286
InputStream JavaDoc i = ICUData.getRequiredStream(DATA_FILE_NAME);
287             BufferedInputStream JavaDoc b = new BufferedInputStream JavaDoc(i,DATA_BUFFER_SIZE);
288             NormalizerDataReader reader = new NormalizerDataReader(b);
289             
290             // read the indexes
291
indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP);
292             
293             byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]];
294             
295             int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT];
296             combiningTable = new char[combiningTableTop];
297             
298             int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT];
299             extraData = new char[extraDataTop];
300
301             byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]];
302             byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]];
303             canonStartSets=new Object JavaDoc[NormalizerImpl.CANON_SET_MAX_CANON_SETS];
304             
305             fcdTrieImpl = new FCDTrieImpl();
306             normTrieImpl = new NormTrieImpl();
307             auxTrieImpl = new AuxTrieImpl();
308                         
309             // load the rest of the data data and initialize the data members
310
reader.read(normBytes, fcdBytes,auxBytes, extraData, combiningTable,
311                         canonStartSets);
312                                        
313             NormTrieImpl.normTrie = new IntTrie( new ByteArrayInputStream JavaDoc(normBytes),normTrieImpl );
314             FCDTrieImpl.fcdTrie = new CharTrie( new ByteArrayInputStream JavaDoc(fcdBytes),fcdTrieImpl );
315             AuxTrieImpl.auxTrie = new CharTrie( new ByteArrayInputStream JavaDoc(auxBytes),auxTrieImpl );
316             
317             // we reached here without any exceptions so the data is fully
318
// loaded set the variable to true
319
isDataLoaded = true;
320             
321             // get the data format version
322
byte[] formatVersion = reader.getDataFormatVersion();
323             
324             isFormatVersion_2_1 =( formatVersion[0]>2
325                                     ||
326                                    (formatVersion[0]==2 && formatVersion[1]>=1)
327                                  );
328             isFormatVersion_2_2 =( formatVersion[0]>2
329                                     ||
330                                    (formatVersion[0]==2 && formatVersion[1]>=2)
331                                  );
332             unicodeVersion = reader.getUnicodeVersion();
333             b.close();
334         }
335     }
336         
337     /* ---------------------------------------------------------------------- */
338     
339     /* Korean Hangul and Jamo constants */
340     
341     public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */
342     public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */
343     public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */
344     
345     public static final int HANGUL_BASE=0xac00;
346     
347     public static final int JAMO_L_COUNT=19;
348     public static final int JAMO_V_COUNT=21;
349     public static final int JAMO_T_COUNT=28;
350     public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
351     
352     private static boolean isHangulWithoutJamoT(char c) {
353         c-=HANGUL_BASE;
354         return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
355     }
356     
357     /* norm32 helpers */
358     
359     /* is this a norm32 with a regular index? */
360     private static boolean isNorm32Regular(long norm32) {
361         return norm32<MIN_SPECIAL;
362     }
363     
364     /* is this a norm32 with a special index for a lead surrogate? */
365     private static boolean isNorm32LeadSurrogate(long norm32) {
366         return MIN_SPECIAL<=norm32 && norm32<SURROGATES_TOP;
367     }
368     
369     /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
370     private static boolean isNorm32HangulOrJamo(long norm32) {
371         return norm32>=MIN_HANGUL;
372     }
373     
374     /*
375      * Given isNorm32HangulOrJamo(),
376      * is this a Hangul syllable or a Jamo?
377      */

378      ///CLOVER:OFF
379
private static boolean isHangulJamoNorm32HangulOrJamoL(long norm32) {
380         return norm32<MIN_JAMO_V;
381     }
382     ///CLOVER:ON
383

384     /*
385      * Given norm32 for Jamo V or T,
386      * is this a Jamo V?
387      */

388     private static boolean isJamoVTNorm32JamoV(long norm32) {
389         return norm32<JAMO_V_TOP;
390     }
391     
392     /* data access primitives ----------------------------------------------- */
393     
394     public static long/*unsigned*/ getNorm32(char c) {
395         return ((UNSIGNED_INT_MASK) & (NormTrieImpl.normTrie.getLeadValue(c)));
396     }
397     
398     public static long/*unsigned*/ getNorm32FromSurrogatePair(long norm32,
399                                                                char c2) {
400         /*
401          * the surrogate index in norm32 stores only the number of the surrogate
402          * index block see gennorm/store.c/getFoldedNormValue()
403          */

404         return ((UNSIGNED_INT_MASK) &
405                     NormTrieImpl.normTrie.getTrailValue((int)norm32, c2));
406     }
407     ///CLOVER:OFF
408
private static long getNorm32(int c){
409         return (UNSIGNED_INT_MASK&(NormTrieImpl.normTrie.getCodePointValue(c)));
410     }
411     
412     private static long getNorm32(int c,int mask){
413         long/*unsigned*/ norm32= getNorm32(UTF16.getLeadSurrogate(c));
414         if(((norm32&mask)>0) && isNorm32LeadSurrogate(norm32)) {
415             /* c is a lead surrogate, get the real norm32 */
416             norm32=getNorm32FromSurrogatePair(norm32,UTF16.getTrailSurrogate(c));
417         }
418         return norm32;
419     }
420     ///CLOVER:ON
421

422     
423     /*
424      * get a norm32 from text with complete code points
425      * (like from decompositions)
426      */

427     private static long/*unsigned*/ getNorm32(char[] p,int start,
428                                               int/*unsigned*/ mask) {
429         long/*unsigned*/ norm32= getNorm32(p[start]);
430         if(((norm32&mask)>0) && isNorm32LeadSurrogate(norm32)) {
431             /* *p is a lead surrogate, get the real norm32 */
432             norm32=getNorm32FromSurrogatePair(norm32, p[start+1]);
433         }
434         return norm32;
435     }
436     public static VersionInfo getUnicodeVersion(){
437         return VersionInfo.getInstance(unicodeVersion[0], unicodeVersion[1],
438                                        unicodeVersion[2], unicodeVersion[3]);
439     }
440     public static char getFCD16(char c) {
441         return FCDTrieImpl.fcdTrie.getLeadValue(c);
442     }
443     
444     public static char getFCD16FromSurrogatePair(char fcd16, char c2) {
445         /* the surrogate index in fcd16 is an absolute offset over the
446          * start of stage 1
447          * */

448         return FCDTrieImpl.fcdTrie.getTrailValue(fcd16, c2);
449     }
450     public static int getFCD16(int c) {
451         return FCDTrieImpl.fcdTrie.getCodePointValue(c);
452     }
453         
454     private static int getExtraDataIndex(long norm32) {
455         return (int)(norm32>>EXTRA_SHIFT);
456     }
457     
458     private static final class DecomposeArgs{
459         int /*unsigned byte*/ cc;
460         int /*unsigned byte*/ trailCC;
461         int length;
462     }
463     /**
464      *
465      * get the canonical or compatibility decomposition for one character
466      *
467      * @return index into the extraData array
468      */

469     private static int/*index*/ decompose(long/*unsigned*/ norm32,
470                                           int/*unsigned*/ qcMask,
471                                           DecomposeArgs args) {
472         int p= getExtraDataIndex(norm32);
473         args.length=extraData[p++];
474     
475         if((norm32&qcMask&QC_NFKD)!=0 && args.length>=0x100) {
476             /* use compatibility decomposition, skip canonical data */
477             p+=((args.length>>7)&1)+(args.length&DECOMP_LENGTH_MASK);
478             args.length>>=8;
479         }
480     
481         if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) {
482             /* get the lead and trail cc's */
483             char bothCCs=extraData[p++];
484             args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8);
485             args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs;
486         } else {
487             /* lead and trail cc's are both 0 */
488             args.cc=args.trailCC=0;
489         }
490     
491         args.length&=DECOMP_LENGTH_MASK;
492         return p;
493     }
494     
495        
496     /**
497      * get the canonical decomposition for one character
498      * @return index into the extraData array
499      */

500     private static int decompose(long/*unsigned*/ norm32,
501                                  DecomposeArgs args) {
502                              
503         int p= getExtraDataIndex(norm32);
504         args.length=extraData[p++];
505     
506         if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) {
507             /* get the lead and trail cc's */
508             char bothCCs=extraData[p++];
509             args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8);
510             args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs;
511         } else {
512             /* lead and trail cc's are both 0 */
513             args.cc=args.trailCC=0;
514         }
515     
516         args.length&=DECOMP_LENGTH_MASK;
517         return p;
518     }
519     
520     
521     private static final class NextCCArgs{
522         char[] source;
523         int next;
524         int limit;
525         char c;
526         char c2;
527     }
528     
529     /*
530      * get the combining class of (c, c2)= args.source[args.next++]
531      * before: args.next<args.limit after: args.next<=args.limit
532      * if only one code unit is used, then c2==0
533      */

534     private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
535         long /*unsigned*/ norm32;
536     
537         args.c=args.source[args.next++];
538         
539         norm32= getNorm32(args.c);
540         if((norm32 & CC_MASK)==0) {
541             args.c2=0;
542             return 0;
543         } else {
544             if(!isNorm32LeadSurrogate(norm32)) {
545                 args.c2=0;
546             } else {
547                 /* c is a lead surrogate, get the real norm32 */
548                 if(args.next!=args.limit &&
549                         UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
550                     ++args.next;
551                     norm32=getNorm32FromSurrogatePair(norm32, args.c2);
552                 } else {
553                     args.c2=0;
554                     return 0;
555                 }
556             }
557     
558             return (int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT));
559         }
560     }
561
562     private static final class PrevArgs{
563         char[] src;
564         int start;
565         int current;
566         char c;
567         char c2;
568     }
569     
570     /*
571      * read backwards and get norm32
572      * return 0 if the character is <minC
573      * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
574      * surrogate but read second!)
575      */

576     private static long /*unsigned*/ getPrevNorm32(PrevArgs args,
577                                                       int/*unsigned*/ minC,
578                                                       int/*unsigned*/ mask) {
579         long/*unsigned*/ norm32;
580     
581         args.c=args.src[--args.current];
582         args.c2=0;
583     
584         /* check for a surrogate before getting norm32 to see if we need to
585          * predecrement further
586          */

587         if(args.c<minC) {
588             return 0;
589         } else if(!UTF16.isSurrogate(args.c)) {
590             return getNorm32(args.c);
591         } else if(UTF16.isLeadSurrogate(args.c)) {
592             /* unpaired first surrogate */
593             return 0;
594         } else if(args.current!=args.start &&
595                     UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
596             --args.current;
597             norm32=getNorm32(args.c2);
598     
599             if((norm32&mask)==0) {
600                 /* all surrogate pairs with this lead surrogate have
601                  * only irrelevant data
602                  */

603                 return 0;
604             } else {
605                 /* norm32 must be a surrogate special */
606                 return getNorm32FromSurrogatePair(norm32, args.c);
607             }
608         } else {
609             /* unpaired second surrogate */
610             args.c2=0;
611             return 0;
612         }
613     }
614     
615     /*
616      * get the combining class of (c, c2)=*--p
617      * before: start<p after: start<=p
618      */

619     private static int /*unsigned byte*/ getPrevCC(PrevArgs args) {
620
621         return (int)((UNSIGNED_BYTE_MASK)&(getPrevNorm32(args, MIN_WITH_LEAD_CC,
622                                                          CC_MASK)>>CC_SHIFT));
623     }
624
625     /*
626      * is this a safe boundary character for NF*D?
627      * (lead cc==0)
628      */

629     public static boolean isNFDSafe(long/*unsigned*/ norm32,
630                                      int/*unsigned*/ccOrQCMask,
631                                      int/*unsigned*/ decompQCMask) {
632         if((norm32&ccOrQCMask)==0) {
633             return true; /* cc==0 and no decomposition: this is NF*D safe */
634         }
635     
636         /* inspect its decomposition - maybe a Hangul but not a surrogate here*/
637         if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {
638             DecomposeArgs args=new DecomposeArgs();
639             /* decomposes, get everything from the variable-length extra data */
640             decompose(norm32, decompQCMask, args);
641             return args.cc==0;
642         } else {
643             /* no decomposition (or Hangul), test the cc directly */
644             return (norm32&CC_MASK)==0;
645         }
646     }
647     
648     /*
649      * is this (or does its decomposition begin with) a "true starter"?
650      * (cc==0 and NF*C_YES)
651      */

652     public static boolean isTrueStarter(long/*unsigned*/ norm32,
653                                           int/*unsigned*/ ccOrQCMask,
654                                           int/*unsigned*/ decompQCMask) {
655         if((norm32&ccOrQCMask)==0) {
656             return true; /* this is a true starter (could be Hangul or Jamo L)*/
657         }
658     
659         /* inspect its decomposition - not a Hangul or a surrogate here */
660         if((norm32&decompQCMask)!=0) {
661             int p; /* index into extra data array */
662             DecomposeArgs args=new DecomposeArgs();
663             /* decomposes, get everything from the variable-length extra data */
664             p=decompose(norm32, decompQCMask, args);
665           
666             if(args.cc==0) {
667                 int/*unsigned*/ qcMask=ccOrQCMask&QC_MASK;
668     
669                 /* does it begin with NFC_YES? */
670                 if((getNorm32(extraData,p, qcMask)&qcMask)==0) {
671                     /* yes, the decomposition begins with a true starter */
672                     return true;
673                 }
674             }
675         }
676         return false;
677     }
678
679     /* reorder UTF-16 in-place ---------------------------------------------- */
680     
681     /**
682      * simpler, single-character version of mergeOrdered() -
683      * bubble-insert one single code point into the preceding string
684      * which is already canonically ordered
685      * (c, c2) may or may not yet have been inserted at src[current]..src[p]
686      *
687      * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
688      *
689      * before: src[start]..src[current] is already ordered, and
690      * src[current]..src[p] may or may not hold (c, c2) but
691      * must be exactly the same length as (c, c2)
692      * after: src[start]..src[p] is ordered
693      *
694      * @return the trailing combining class
695      */

696     private static int/*unsigned byte*/ insertOrdered(char[] source,
697                                                       int start,
698                                                       int current, int p,
699                                                          char c, char c2,
700                                                          int/*unsigned byte*/ cc) {
701         int back, preBack;
702         int r;
703         int prevCC, trailCC=cc;
704     
705         if(start<current && cc!=0) {
706             // search for the insertion point where cc>=prevCC
707
preBack=back=current;
708             PrevArgs prevArgs = new PrevArgs();
709             prevArgs.current = current;
710             prevArgs.start = start;
711             prevArgs.src = source;
712             // get the prevCC
713
prevCC=getPrevCC(prevArgs);
714             preBack = prevArgs.current;
715             
716             if(cc<prevCC) {
717                 // this will be the last code point, so keep its cc
718
trailCC=prevCC;
719                 back=preBack;
720                 while(start<preBack) {
721                     prevCC=getPrevCC(prevArgs);
722                     preBack=prevArgs.current;
723                     if(cc>=prevCC) {
724                         break;
725                     }
726                     back=preBack;
727                 }
728     
729                 
730                 // this is where we are right now with all these indicies:
731
// [start]..[pPreBack] 0..? code points that we can ignore
732
// [pPreBack]..[pBack] 0..1 code points with prevCC<=cc
733
// [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
734
// [current]..[p] 1 code point (c, c2) with cc
735

736                 // move the code units in between up
737
r=p;
738                 do {
739                     source[--r]=source[--current];
740                 } while(back!=current);
741             }
742         }
743     
744         // insert (c, c2)
745
source[current]=c;
746         if(c2!=0) {
747             source[(current+1)]=c2;
748         }
749     
750         // we know the cc of the last code point
751
return trailCC;
752     }
753     
754     /**
755      * merge two UTF-16 string parts together
756      * to canonically order (order by combining classes) their concatenation
757      *
     * the two strings may already be adjacent, so that the merging is done
     * in-place; if the two strings are not adjacent, then the buffer holding
     * the first one must be large enough to also receive the second one;
     * the second string may or may not be ordered in itself
762      *
763      * before: [start]..[current] is already ordered, and
764      * [next]..[limit] may be ordered in itself, but
765      * is not in relation to [start..current[
766      * after: [start..current+(limit-next)[ is ordered
767      *
768      * the algorithm is a simple bubble-sort that takes the characters from
769      * src[next++] and inserts them in correct combining class order into the
770      * preceding part of the string
771      *
772      * since this function is called much less often than the single-code point
773      * insertOrdered(), it just uses that for easier maintenance
774      *
775      * @return the trailing combining class
776      */

777     private static int /*unsigned byte*/ mergeOrdered(char[] source,
778                                                       int start,
779                                                       int current,
780                                                       char[] data,
781                                                         int next,
782                                                         int limit,
783                                                         boolean isOrdered) {
784             int r;
785             int /*unsigned byte*/ cc, trailCC=0;
786             boolean adjacent;
787         
788             adjacent= current==next;
789             NextCCArgs ncArgs = new NextCCArgs();
790             ncArgs.source = data;
791             ncArgs.next = next;
792             ncArgs.limit = limit;
793             
794             if(start!=current || !isOrdered) {
795                     
796                 while(ncArgs.next<ncArgs.limit) {
797                     cc=getNextCC(ncArgs);
798                     if(cc==0) {
799                         // does not bubble back
800
trailCC=0;
801                         if(adjacent) {
802                             current=ncArgs.next;
803                         } else {
804                             data[current++]=ncArgs.c;
805                             if(ncArgs.c2!=0) {
806                                 data[current++]=ncArgs.c2;
807                             }
808                         }
809                         if(isOrdered) {
810                             break;
811                         } else {
812                             start=current;
813                         }
814                     } else {
815                         r=current+(ncArgs.c2==0 ? 1 : 2);
816                         trailCC=insertOrdered(source,start, current, r,
817                                               ncArgs.c, ncArgs.c2, cc);
818                         current=r;
819                     }
820                 }
821             }
822         
823             if(ncArgs.next==ncArgs.limit) {
824                 // we know the cc of the last code point
825
return trailCC;
826             } else {
827                 if(!adjacent) {
828                     // copy the second string part
829
do {
830                         source[current++]=data[ncArgs.next++];
831                     } while(ncArgs.next!=ncArgs.limit);
832                     ncArgs.limit=current;
833                 }
834                 PrevArgs prevArgs = new PrevArgs();
835                 prevArgs.src = data;
836                 prevArgs.start = start;
837                 prevArgs.current = ncArgs.limit;
838                 return getPrevCC(prevArgs);
839             }
840
841     }
842     private static int /*unsigned byte*/ mergeOrdered(char[] source,
843                                                       int start,
844                                                       int current,
845                                                       char[] data,
846                                                         final int next,
847                                                         final int limit) {
848         return mergeOrdered(source,start,current,data,next,limit,true);
849     }
850
851     
852       
853     public static boolean checkFCD(char[] src,int srcStart, int srcLimit,
854                                    UnicodeSet nx) {
855
856         char fcd16,c,c2;
857         int prevCC=0, cc;
858         int i =srcStart, length = srcLimit;
859     
860         for(;;) {
861             for(;;) {
862                 if(i==length) {
863                     return true;
864                 } else if((c=src[i++])<MIN_WITH_LEAD_CC) {
865                     prevCC=(int)-c;
866                 } else if((fcd16=getFCD16(c))==0) {
867                     prevCC=0;
868                 } else {
869                     break;
870                 }
871             }
872
873             // check one above-minimum, relevant code unit
874
if(UTF16.isLeadSurrogate(c)) {
875                 // c is a lead surrogate, get the real fcd16
876
if(i!=length && UTF16.isTrailSurrogate(c2=src[i])) {
877                     ++i;
878                     fcd16=getFCD16FromSurrogatePair(fcd16, c2);
879                 } else {
880                     c2=0;
881                     fcd16=0;
882                 }
883             }else{
884                 c2=0;
885             }
886             
887             if(nx_contains(nx, c, c2)) {
888                 prevCC=0; /* excluded: fcd16==0 */
889                 continue;
890             }
891
892             // prevCC has values from the following ranges:
893
// 0..0xff -the previous trail combining class
894
// <0 -the negative value of the previous code unit;
895
// that code unit was <MIN_WITH_LEAD_CC and its getFCD16()
896
// was deferred so that average text is checked faster
897
//
898

899             // check the combining order
900
cc=(int)(fcd16>>8);
901             if(cc!=0) {
902                 if(prevCC<0) {
903                     // the previous character was <_NORM_MIN_WITH_LEAD_CC,
904
// we need to get its trail cc
905
//
906
if(!nx_contains(nx, (int)-prevCC)) {
907                         prevCC=(int)(FCDTrieImpl.fcdTrie.getBMPValue(
908                                              (char)-prevCC)&0xff
909                                              );
910                     } else {
911                         prevCC=0; /* excluded: fcd16==0 */
912                     }
913                                       
914                 }
915     
916                 if(cc<prevCC) {
917                     return false;
918                 }
919             }
920             prevCC=(int)(fcd16&0xff);
921         }
922     }
923     
924     public static Normalizer.QuickCheckResult quickCheck(char[] src,
925                                                             int srcStart,
926                                                             int srcLimit,
927                                                             int minNoMaybe,
928                                                             int qcMask,
929                                                             int options,
930                                                             boolean allowMaybe,
931                                                             UnicodeSet nx){
932
933         int ccOrQCMask;
934         long norm32;
935         char c, c2;
936         char cc, prevCC;
937         long qcNorm32;
938         Normalizer.QuickCheckResult result;
939         ComposePartArgs args = new ComposePartArgs();
940         char[] buffer ;
941         int start = srcStart;
942         
943         if(!isDataLoaded) {
944             return Normalizer.MAYBE;
945         }
946         // initialize
947
ccOrQCMask=CC_MASK|qcMask;
948         result=Normalizer.YES;
949         prevCC=0;
950                 
951         for(;;) {
952             for(;;) {
953                 if(srcStart==srcLimit) {
954                     return result;
955                 } else if((c=src[srcStart++])>=minNoMaybe &&
956                                   (( norm32=getNorm32(c)) & ccOrQCMask)!=0) {
957                     break;
958                 }
959                 prevCC=0;
960             }
961             
962     
963             // check one above-minimum, relevant code unit
964
if(isNorm32LeadSurrogate(norm32)) {
965                 // c is a lead surrogate, get the real norm32
966
if(srcStart!=srcLimit&& UTF16.isTrailSurrogate(c2=src[srcStart])) {
967                     ++srcStart;
968                     norm32=getNorm32FromSurrogatePair(norm32,c2);
969                 } else {
970                     norm32=0;
971                     c2=0;
972                 }
973             }else{
974                 c2=0;
975             }
976             if(nx_contains(nx, c, c2)) {
977                 /* excluded: norm32==0 */
978                 norm32=0;
979             }
980     
981             // check the combining order
982
cc=(char)((norm32>>CC_SHIFT)&0xFF);
983             if(cc!=0 && cc<prevCC) {
984                 return Normalizer.NO;
985             }
986             prevCC=cc;
987     
988             // check for "no" or "maybe" quick check flags
989
qcNorm32 = norm32 & qcMask;
990             if((qcNorm32& QC_ANY_NO)>=1) {
991                 result= Normalizer.NO;
992                 break;
993             } else if(qcNorm32!=0) {
994                 // "maybe" can only occur for NFC and NFKC
995
if(allowMaybe){
996                     result=Normalizer.MAYBE;
997                 }else{
998                     // normalize a section around here to see if it is really
999
// normalized or not
1000
int prevStarter;
1001                    int/*unsigned*/ decompQCMask;
1002    
1003                    decompQCMask=(qcMask<<2)&0xf; // decomposition quick check mask
1004

1005                    // find the previous starter
1006

1007                    // set prevStarter to the beginning of the current character
1008
prevStarter=srcStart-1;
1009                    if(UTF16.isTrailSurrogate(src[prevStarter])) {
1010                        // safe because unpaired surrogates do not result
1011
// in "maybe"
1012
--prevStarter;
1013                    }
1014
1015                    prevStarter=findPreviousStarter(src, start, prevStarter,
1016                                                    ccOrQCMask, decompQCMask,
1017                                                    (char)minNoMaybe);
1018    
1019                    // find the next true starter in [src..limit[ - modifies
1020
// src to point to the next starter
1021
srcStart=findNextStarter(src,srcStart, srcLimit, qcMask,
1022                                             decompQCMask,(char) minNoMaybe);
1023                    
1024                    //set the args for compose part
1025
args.prevCC = prevCC;
1026                       
1027                    // decompose and recompose [prevStarter..src[
1028
buffer = composePart(args,prevStarter,src,srcStart,srcLimit,options,nx);
1029    
1030                    // compare the normalized version with the original
1031
if(0!=strCompare(buffer,0,args.length,src,prevStarter,srcStart, false)) {
1032                        result=Normalizer.NO; // normalization differs
1033
break;
1034                    }
1035    
1036                    // continue after the next starter
1037
}
1038            }
1039        }
1040        return result;
1041    }
1042 
1043       
1044    //------------------------------------------------------
1045
// make NFD & NFKD
1046
//------------------------------------------------------
1047
public static int getDecomposition(int c /*UTF-32*/ ,
1048                                        boolean compat,
1049                                           char[] dest,
1050                                           int destStart,
1051                                           int destCapacity) {
1052            
1053        if( (UNSIGNED_INT_MASK & c)<=0x10ffff) {
1054            long /*unsigned*/ norm32;
1055            int qcMask;
1056            int minNoMaybe;
1057            int length;
1058    
1059            // initialize
1060
if(!compat) {
1061                minNoMaybe=(int)indexes[INDEX_MIN_NFD_NO_MAYBE];
1062                qcMask=QC_NFD;
1063            } else {
1064                minNoMaybe=(int)indexes[INDEX_MIN_NFKD_NO_MAYBE];
1065                qcMask=QC_NFKD;
1066            }
1067    
1068            if(c<minNoMaybe) {
1069                // trivial case
1070
if(destCapacity>0) {
1071                    dest[0]=(char)c;
1072                }
1073                return -1;
1074            }
1075    
1076            /* data lookup */
1077            norm32=getNorm32(c);
1078            if((norm32&qcMask)==0) {
1079                /* simple case: no decomposition */
1080                if(c<=0xffff) {
1081                    if(destCapacity>0) {
1082                        dest[0]=(char)c;
1083                    }
1084                    return -1;
1085                } else {
1086                    if(destCapacity>=2) {
1087                        dest[0]=UTF16.getLeadSurrogate(c);
1088                        dest[1]=UTF16.getTrailSurrogate(c);
1089                    }
1090                    return -2;
1091                }
1092            } else if(isNorm32HangulOrJamo(norm32)) {
1093                /* Hangul syllable: decompose algorithmically */
1094                char c2;
1095    
1096                c-=HANGUL_BASE;
1097    
1098                c2=(char)(c%JAMO_T_COUNT);
1099                c/=JAMO_T_COUNT;
1100                if(c2>0) {
1101                    if(destCapacity>=3) {
1102                        dest[2]=(char)(JAMO_T_BASE+c2);
1103                    }
1104                    length=3;
1105                } else {
1106                    length=2;
1107                }
1108    
1109                if(destCapacity>=2) {
1110                    dest[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT);
1111                    dest[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT);
1112                }
1113                return length;
1114            } else {
1115                /* c decomposes, get everything from the variable-length extra
1116                 * data
1117                 */

1118                int p, limit;
1119                DecomposeArgs args = new DecomposeArgs();
1120                /* the index into extra data array*/
1121                p=decompose(norm32, qcMask, args);
1122                if(args.length<=destCapacity) {
1123                    limit=p+args.length;
1124                    do {
1125                        dest[destStart++]=extraData[p++];
1126                    } while(p<limit);
1127                }
1128                return args.length;
1129            }
1130        } else {
1131            return 0;
1132        }
1133    }
1134
1135    
1136    public static int decompose(char[] src,int srcStart,int srcLimit,
1137                                char[] dest,int destStart,int destLimit,
1138                                 boolean compat,int[] outTrailCC,
1139                                 UnicodeSet nx) {
1140                                
1141        char[] buffer = new char[3];
1142        int prevSrc;
1143        long norm32;
1144        int ccOrQCMask, qcMask;
1145        int reorderStartIndex, length;
1146        char c, c2, minNoMaybe;
1147        int/*unsigned byte*/ cc, prevCC, trailCC;
1148        char[] p;
1149        int pStart;
1150        int destIndex = destStart;
1151        int srcIndex = srcStart;
1152        if(!compat) {
1153            minNoMaybe=(char)indexes[INDEX_MIN_NFD_NO_MAYBE];
1154            qcMask=QC_NFD;
1155        } else {
1156            minNoMaybe=(char)indexes[INDEX_MIN_NFKD_NO_MAYBE];
1157            qcMask=QC_NFKD;
1158        }
1159    
1160        /* initialize */
1161        ccOrQCMask=CC_MASK|qcMask;
1162        reorderStartIndex=0;
1163        prevCC=0;
1164        norm32=0;
1165        c=0;
1166        pStart=0;
1167        
1168        cc=trailCC=-1;//initialize to bogus value
1169

1170        for(;;) {
1171            /* count code units below the minimum or with irrelevant data for
1172             * the quick check
1173             */

1174            prevSrc=srcIndex;
1175
1176            while(srcIndex!=srcLimit &&((c=src[srcIndex])<minNoMaybe ||
1177                                        ((norm32=getNorm32(c))&ccOrQCMask)==0)){
1178                prevCC=0;
1179                ++srcIndex;
1180            }
1181
1182            /* copy these code units all at once */
1183            if(srcIndex!=prevSrc) {
1184                length=(int)(srcIndex-prevSrc);
1185                if((destIndex+length)<=destLimit) {
1186                    System.arraycopy(src,prevSrc,dest,destIndex,length);
1187                }
1188              
1189                destIndex+=length;
1190                reorderStartIndex=destIndex;
1191            }
1192    
1193            /* end of source reached? */
1194            if(srcIndex==srcLimit) {
1195                break;
1196            }
1197    
1198            /* c already contains *src and norm32 is set for it, increment src*/
1199            ++srcIndex;
1200    
1201            /* check one above-minimum, relevant code unit */
1202            /*
1203             * generally, set p and length to the decomposition string
1204             * in simple cases, p==NULL and (c, c2) will hold the length code
1205             * units to append in all cases, set cc to the lead and trailCC to
1206             * the trail combining class
1207             *
1208             * the following merge-sort of the current character into the
1209             * preceding, canonically ordered result text will use the
1210             * optimized insertOrdered()
1211             * if there is only one single code point to process;
1212             * this is indicated with p==NULL, and (c, c2) is the character to
1213             * insert
1214             * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
1215             * for a supplementary character)
1216             * otherwise, p[length] is merged in with _mergeOrdered()
1217             */

1218            if(isNorm32HangulOrJamo(norm32)) {
1219                if(nx_contains(nx, c)) {
1220                    c2=0;
1221                    p=null;
1222                    length=1;
1223                } else {
1224                    // Hangul syllable: decompose algorithmically
1225
p=buffer;
1226                    pStart=0;
1227                    cc=trailCC=0;
1228    
1229                    c-=HANGUL_BASE;
1230    
1231                    c2=(char)(c%JAMO_T_COUNT);
1232                    c/=JAMO_T_COUNT;
1233                    if(c2>0) {
1234                        buffer[2]=(char)(JAMO_T_BASE+c2);
1235                        length=3;
1236                    } else {
1237                        length=2;
1238                    }
1239    
1240                    buffer[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT);
1241                    buffer[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT);
1242                }
1243            } else {
1244                if(isNorm32Regular(norm32)) {
1245                    c2=0;
1246                    length=1;
1247                } else {
1248                    // c is a lead surrogate, get the real norm32
1249
if(srcIndex!=srcLimit &&
1250                                    UTF16.isTrailSurrogate(c2=src[srcIndex])) {
1251                        ++srcIndex;
1252                        length=2;
1253                        norm32=getNorm32FromSurrogatePair(norm32, c2);
1254                    } else {
1255                        c2=0;
1256                        length=1;
1257                        norm32=0;
1258                    }
1259                }
1260    
1261                /* get the decomposition and the lead and trail cc's */
1262                if(nx_contains(nx, c, c2)) {
1263                    /* excluded: norm32==0 */
1264                    cc=trailCC=0;
1265                    p=null;
1266                } else if((norm32&qcMask)==0) {
1267                    /* c does not decompose */
1268                    cc=trailCC=(int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT));
1269                    p=null;
1270                    pStart=-1;
1271                } else {
1272                    DecomposeArgs arg = new DecomposeArgs();
1273                    /* c decomposes, get everything from the variable-length
1274                     * extra data
1275                     */

1276                    pStart=decompose(norm32, qcMask, arg);
1277                    p=extraData;
1278                    length=arg.length;
1279                    cc=arg.cc;
1280                    trailCC=arg.trailCC;
1281                    if(length==1) {
1282                        /* fastpath a single code unit from decomposition */
1283                        c=p[pStart];
1284                        c2=0;
1285                        p=null;
1286                        pStart=-1;
1287                    }
1288                }
1289            }
1290    
1291            /* append the decomposition to the destination buffer, assume
1292             * length>0
1293             */

1294            if((destIndex+length)<=destLimit) {
1295                int reorderSplit=destIndex;
1296                if(p==null) {
1297                    /* fastpath: single code point */
1298                    if(cc!=0 && cc<prevCC) {
1299                        /* (c, c2) is out of order with respect to the preceding
1300                         * text
1301                         */

1302                        destIndex+=length;
1303                        trailCC=insertOrdered(dest,reorderStartIndex,
1304                                            reorderSplit, destIndex, c, c2, cc);
1305                    } else {
1306                        /* just append (c, c2) */
1307                        dest[destIndex++]=c;
1308                        if(c2!=0) {
1309                            dest[destIndex++]=c2;
1310                        }
1311                    }
1312                } else {
1313                    /* general: multiple code points (ordered by themselves)
1314                     * from decomposition
1315                     */

1316                    if(cc!=0 && cc<prevCC) {
1317                        /* the decomposition is out of order with respect to the
1318                         * preceding text
1319                         */

1320                        destIndex+=length;
1321                        trailCC=mergeOrdered(dest,reorderStartIndex,
1322                                          reorderSplit,p, pStart,pStart+length);
1323                    } else {
1324                        /* just append the decomposition */
1325                        do {
1326                            dest[destIndex++]=p[pStart++];
1327                        } while(--length>0);
1328                    }
1329                }
1330            } else {
1331                /* buffer overflow */
1332                /* keep incrementing the destIndex for preflighting */
1333                destIndex+=length;
1334            }
1335    
1336            prevCC=trailCC;
1337            if(prevCC==0) {
1338                reorderStartIndex=destIndex;
1339            }
1340        }
1341    
1342        outTrailCC[0]=prevCC;
1343
1344        return destIndex - destStart;
1345    }
1346    
1347    /* make NFC & NFKC ------------------------------------------------------ */
1348    private static final class NextCombiningArgs{
1349        char[] source;
1350        int start;
1351        //int limit;
1352
char c;
1353        char c2;
1354        int/*unsigned*/ combiningIndex;
1355        char /*unsigned byte*/ cc;
1356    }
1357    
1358    /* get the composition properties of the next character */
1359    private static int /*unsigned*/ getNextCombining(NextCombiningArgs args,
1360                                                    int limit,
1361                                                    UnicodeSet nx) {
1362        long/*unsigned*/ norm32;
1363        int combineFlags;
1364        /* get properties */
1365        args.c=args.source[args.start++];
1366        norm32=getNorm32(args.c);
1367        
1368        /* preset output values for most characters */
1369        args.c2=0;
1370        args.combiningIndex=0;
1371        args.cc=0;
1372        
1373        if((norm32&(CC_MASK|COMBINES_ANY))==0) {
1374            return 0;
1375        } else {
1376            if(isNorm32Regular(norm32)) {
1377                /* set cc etc. below */
1378            } else if(isNorm32HangulOrJamo(norm32)) {
1379                /* a compatibility decomposition contained Jamos */
1380                args.combiningIndex=(int)((UNSIGNED_INT_MASK)&(0xfff0|
1381                                                        (norm32>>EXTRA_SHIFT)));
1382                return (int)(norm32&COMBINES_ANY);
1383            } else {
1384                /* c is a lead surrogate, get the real norm32 */
1385                if(args.start!=limit && UTF16.isTrailSurrogate(args.c2=
1386                                                     args.source[args.start])) {
1387                    ++args.start;
1388                    norm32=getNorm32FromSurrogatePair(norm32, args.c2);
1389                } else {
1390                    args.c2=0;
1391                    return 0;
1392                }
1393            }
1394            
1395            if(nx_contains(nx, args.c, args.c2)) {
1396                return 0; /* excluded: norm32==0 */
1397            }
1398    
1399            args.cc= (char)((norm32>>CC_SHIFT)&0xff);
1400        
1401            combineFlags=(int)(norm32&COMBINES_ANY);
1402            if(combineFlags!=0) {
1403                int index = getExtraDataIndex(norm32);
1404                args.combiningIndex=index>0 ? extraData[(index-1)] :0;
1405            }
1406    
1407            return combineFlags;
1408        }
1409    }
1410    
1411    /*
1412     * given a composition-result starter (c, c2) - which means its cc==0,
1413     * it combines forward, it has extra data, its norm32!=0,
1414     * it is not a Hangul or Jamo,
1415     * get just its combineFwdIndex
1416     *
1417     * norm32(c) is special if and only if c2!=0
1418     */

1419    private static int/*unsigned*/ getCombiningIndexFromStarter(char c,char c2){
1420        long/*unsigned*/ norm32;
1421    
1422        norm32=getNorm32(c);
1423        if(c2!=0) {
1424            norm32=getNorm32FromSurrogatePair(norm32, c2);
1425        }
1426        return extraData[(getExtraDataIndex(norm32)-1)];
1427    }
1428    
1429    /*
1430     * Find the recomposition result for
1431     * a forward-combining character
1432     * (specified with a pointer to its part of the combiningTable[])
1433     * and a backward-combining character
1434     * (specified with its combineBackIndex).
1435     *
1436     * If these two characters combine, then set (value, value2)
1437     * with the code unit(s) of the composition character.
1438     *
1439     * Return value:
1440     * 0 do not combine
1441     * 1 combine
1442     * >1 combine, and the composition is a forward-combining starter
1443     *
1444     * See unormimp.h for a description of the composition table format.
1445     */

1446    private static int/*unsigned*/ combine(char[]table,int tableStart,
1447                                   int/*unsinged*/ combineBackIndex,
1448                                    int[] outValues) {
1449        int/*unsigned*/ key;
1450        int value,value2;
1451        
1452        if(outValues.length<2){
1453            throw new IllegalArgumentException JavaDoc();
1454        }
1455        
1456        /* search in the starter's composition table */
1457        for(;;) {
1458            key=table[tableStart++];
1459            if(key>=combineBackIndex) {
1460                break;
1461            }
1462            tableStart+= ((table[tableStart]&0x8000) != 0)? 2 : 1;
1463        }
1464    
1465        /* mask off bit 15, the last-entry-in-the-list flag */
1466        if((key&0x7fff)==combineBackIndex) {
1467            /* found! combine! */
1468            value=table[tableStart];
1469    
1470            /* is the composition a starter that combines forward? */
1471            key=(int)((UNSIGNED_INT_MASK)&((value&0x2000)+1));
1472    
1473            /* get the composition result code point from the variable-length
1474             * result value
1475             */

1476            if((value&0x8000) != 0) {
1477                if((value&0x4000) != 0) {
1478                    /* surrogate pair composition result */
1479                    value=(int)((UNSIGNED_INT_MASK)&((value&0x3ff)|0xd800));
1480                    value2=table[tableStart+1];
1481                } else {
1482                    /* BMP composition result U+2000..U+ffff */
1483                    value=table[tableStart+1];
1484                    value2=0;
1485                }
1486            } else {
1487                /* BMP composition result U+0000..U+1fff */
1488                value&=0x1fff;
1489                value2=0;
1490            }
1491            outValues[0]=value;
1492            outValues[1]=value2;
1493            return key;
1494        } else {
1495            /* not found */
1496            return 0;
1497        }
1498    }
1499    
1500    
1501    private static final class RecomposeArgs{
1502        char[] source;
1503        int start;
1504        int limit;
1505    }
1506    /*
1507     * recompose the characters in [p..limit[
1508     * (which is in NFD - decomposed and canonically ordered),
1509     * adjust limit, and return the trailing cc
1510     *
1511     * since for NFKC we may get Jamos in decompositions, we need to
1512     * recompose those too
1513     *
1514     * note that recomposition never lengthens the text:
1515     * any character consists of either one or two code units;
1516     * a composition may contain at most one more code unit than the original
1517     * starter, while the combining mark that is removed has at least one code
1518     * unit
1519     */

1520    private static char/*unsigned byte*/ recompose(RecomposeArgs args, int options, UnicodeSet nx) {
1521        int remove, q, r;
1522        int /*unsigned*/ combineFlags;
1523        int /*unsigned*/ combineFwdIndex, combineBackIndex;
1524        int /*unsigned*/ result, value=0, value2=0;
1525        int /*unsigned byte*/ prevCC;
1526        boolean starterIsSupplementary;
1527        int starter;
1528        int[] outValues = new int[2];
1529        starter=-1; /* no starter */
1530        combineFwdIndex=0; /* will not be used until starter!=NULL */
1531        starterIsSupplementary=false; /* will not be used until starter!=NULL */
1532        prevCC=0;
1533        
1534        NextCombiningArgs ncArg = new NextCombiningArgs();
1535        ncArg.source = args.source;
1536        
1537        ncArg.cc =0;
1538        ncArg.c2 =0;
1539
1540        for(;;) {
1541            ncArg.start = args.start;
1542            combineFlags=getNextCombining(ncArg,args.limit,nx);
1543            combineBackIndex=ncArg.combiningIndex;
1544            args.start = ncArg.start;
1545                        
1546            if(((combineFlags&COMBINES_BACK)!=0) && starter!=-1) {
1547                if((combineBackIndex&0x8000)!=0) {
1548                    /* c is a Jamo V/T, see if we can compose it with the
1549                     * previous character
1550                     */

1551                    /* for the PRI #29 fix, check that there is no intervening combining mark */
1552                    if((options&BEFORE_PRI_29)!=0 || prevCC==0) {
1553                        remove=-1; /* NULL while no Hangul composition */
1554                        combineFlags=0;
1555                        ncArg.c2=args.source[starter];
1556                        if(combineBackIndex==0xfff2) {
1557                            /* Jamo V, compose with previous Jamo L and following
1558                             * Jamo T
1559                             */

1560                            ncArg.c2=(char)(ncArg.c2-JAMO_L_BASE);
1561                            if(ncArg.c2<JAMO_L_COUNT) {
1562                                remove=args.start-1;
1563                                ncArg.c=(char)(HANGUL_BASE+(ncArg.c2*JAMO_V_COUNT+
1564                                               (ncArg.c-JAMO_V_BASE))*JAMO_T_COUNT);
1565                                if(args.start!=args.limit &&
1566                                            (ncArg.c2=(char)(args.source[args.start]
1567                                             -JAMO_T_BASE))<JAMO_T_COUNT) {
1568                                    ++args.start;
1569                                    ncArg.c+=ncArg.c2;
1570                                 } else {
1571                                     /* the result is an LV syllable, which is a starter (unlike LVT) */
1572                                     combineFlags=COMBINES_FWD;
1573                                }
1574                                if(!nx_contains(nx, ncArg.c)) {
1575                                    args.source[starter]=ncArg.c;
1576                                   } else {
1577                                    /* excluded */
1578                                    if(!isHangulWithoutJamoT(ncArg.c)) {
1579                                        --args.start; /* undo the ++args.start from reading the Jamo T */
1580                                    }
1581                                    /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
1582                                    remove=args.start;
1583                                }
1584                            }
1585
1586                        /*
1587                         * Normally, the following can not occur:
1588                         * Since the input is in NFD, there are no Hangul LV syllables that
1589                         * a Jamo T could combine with.
1590                         * All Jamo Ts are combined above when handling Jamo Vs.
1591                         *
1592                         * However, before the PRI #29 fix, this can occur due to
1593                         * an intervening combining mark between the Hangul LV and the Jamo T.
1594                         */

1595                        } else {
1596                            /* Jamo T, compose with previous Hangul that does not have a Jamo T */
1597                            if(isHangulWithoutJamoT(ncArg.c2)) {
1598                                ncArg.c2+=ncArg.c-JAMO_T_BASE;
1599                                if(!nx_contains(nx, ncArg.c2)) {
1600                                    remove=args.start-1;
1601                                    args.source[starter]=ncArg.c2;
1602                                }
1603                            }
1604                        }
1605        
1606                        if(remove!=-1) {
1607                            /* remove the Jamo(s) */
1608                            q=remove;
1609                            r=args.start;
1610                            while(r<args.limit) {
1611                                args.source[q++]=args.source[r++];
1612                            }
1613                            args.start=remove;
1614                            args.limit=q;
1615                        }
1616        
1617                        ncArg.c2=0; /* c2 held *starter temporarily */
1618
1619                        if(combineFlags!=0) {
1620                            /*
1621                             * not starter=NULL because the composition is a Hangul LV syllable
1622                             * and might combine once more (but only before the PRI #29 fix)
1623                             */

1624
1625                            /* done? */
1626                            if(args.start==args.limit) {
1627                                return (char)prevCC;
1628                            }
1629
1630                            /* the composition is a Hangul LV syllable which is a starter that combines forward */
1631                            combineFwdIndex=0xfff0;
1632
1633                            /* we combined; continue with looking for compositions */
1634                            continue;
1635                        }
1636                    }
1637
1638                    /*
1639                     * now: cc==0 and the combining index does not include
1640                     * "forward" -> the rest of the loop body will reset starter
1641                     * to NULL; technically, a composed Hangul syllable is a
1642                     * starter, but it does not combine forward now that we have
1643                     * consumed all eligible Jamos; for Jamo V/T, combineFlags
1644                     * does not contain _NORM_COMBINES_FWD
1645                     */

1646    
1647                } else if(
1648                    /* the starter is not a Hangul LV or Jamo V/T and */
1649                    !((combineFwdIndex&0x8000)!=0) &&
1650                    /* the combining mark is not blocked and */
1651                    ((options&BEFORE_PRI_29)!=0 ?
1652                        (prevCC!=ncArg.cc || prevCC==0) :
1653                        (prevCC<ncArg.cc || prevCC==0)) &&
1654                    /* the starter and the combining mark (c, c2) do combine */
1655                    0!=(result=combine(combiningTable,combineFwdIndex,
1656                                       combineBackIndex, outValues)) &&
1657                    /* the composition result is not excluded */
1658                    !nx_contains(nx, (char)value, (char)value2)
1659                ) {
1660                    value=outValues[0];
1661                    value2=outValues[1];
1662                    /* replace the starter with the composition, remove the
1663                     * combining mark
1664                     */

1665                    remove= ncArg.c2==0 ? args.start-1 : args.start-2; /* index to the combining mark */
1666    
1667                    /* replace the starter with the composition */
1668                    args.source[starter]=(char)value;
1669                    if(starterIsSupplementary) {
1670                        if(value2!=0) {
1671                            /* both are supplementary */
1672                            args.source[starter+1]=(char)value2;
1673                        } else {
1674                            /* the composition is shorter than the starter,
1675                             * move the intermediate characters forward one */

1676                            starterIsSupplementary=false;
1677                            q=starter+1;
1678                            r=q+1;
1679                            while(r<remove) {
1680                                args.source[q++]=args.source[r++];
1681                            }
1682                            --remove;
1683                        }
1684                    } else if(value2!=0) {
1685                        /* the composition is longer than the starter,
1686                         * move the intermediate characters back one */

1687                        starterIsSupplementary=true;
1688                        /* temporarily increment for the loop boundary */
1689                        ++starter;
1690                        q=remove;
1691                        r=++remove;
1692                        while(starter<q) {
1693                            args.source[--r]=args.source[--q];
1694                        }
1695                        args.source[starter]=(char)value2;
1696                        --starter; /* undo the temporary increment */
1697                    /* } else { both are on the BMP, nothing more to do */
1698                    }
1699    
1700                    /* remove the combining mark by moving the following text
1701                     * over it */

1702                    if(remove<args.start) {
1703                        q=remove;
1704                        r=args.start;
1705                        while(r<args.limit) {
1706                            args.source[q++]=args.source[r++];
1707                        }
1708                        args.start=remove;
1709                        args.limit=q;
1710                    }
1711    
1712                    /* keep prevCC because we removed the combining mark */
1713    
1714                    /* done? */
1715                    if(args.start==args.limit) {
1716                        return (char)prevCC;
1717                    }
1718    
1719                    /* is the composition a starter that combines forward? */
1720                    if(result>1) {
1721                       combineFwdIndex=getCombiningIndexFromStarter((char)value,
1722                                                                  (char)value2);
1723                    } else {
1724                       starter=-1;
1725                    }
1726    
1727                    /* we combined; continue with looking for compositions */
1728                    continue;
1729                }
1730            }
1731    
1732            /* no combination this time */
1733            prevCC=ncArg.cc;
1734            if(args.start==args.limit) {
1735                return (char)prevCC;
1736            }
1737    
1738            /* if (c, c2) did not combine, then check if it is a starter */
1739            if(ncArg.cc==0) {
1740                /* found a new starter; combineFlags==0 if (c, c2) is excluded */
1741                if((combineFlags&COMBINES_FWD)!=0) {
1742                    /* it may combine with something, prepare for it */
1743                    if(ncArg.c2==0) {
1744                        starterIsSupplementary=false;
1745                        starter=args.start-1;
1746                    } else {
1747                        starterIsSupplementary=false;
1748                        starter=args.start-2;
1749                    }
1750                    combineFwdIndex=combineBackIndex;
1751                } else {
1752                    /* it will not combine with anything */
1753                    starter=-1;
1754                }
1755            } else if((options&OPTIONS_COMPOSE_CONTIGUOUS)!=0) {
1756                /* FCC: no discontiguous compositions; any intervening character blocks */
1757                starter=-1;
1758            }
1759        }
1760    }
1761   
1762    // find the last true starter between src[start]....src[current] going
1763
// backwards and return its index
1764
private static int findPreviousStarter(char[]src, int srcStart, int current,
1765                                          int/*unsigned*/ ccOrQCMask,
1766                                          int/*unsigned*/ decompQCMask,
1767                                          char minNoMaybe) {
1768       long norm32;
1769       PrevArgs args = new PrevArgs();
1770       args.src = src;
1771       args.start = srcStart;
1772       args.current = current;
1773       
1774       while(args.start<args.current) {
1775           norm32= getPrevNorm32(args, minNoMaybe, ccOrQCMask|decompQCMask);
1776           if(isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
1777               break;
1778           }
1779       }
1780       return args.current;
1781    }
1782    
1783    /* find the first true starter in [src..limit[ and return the
1784     * pointer to it
1785     */

1786    private static int/*index*/ findNextStarter(char[] src,int start,int limit,
1787                                                 int/*unsigned*/ qcMask,
1788                                                 int/*unsigned*/ decompQCMask,
1789                                                 char minNoMaybe) {
1790        int p;
1791        long/*unsigned*/ norm32;
1792        int ccOrQCMask;
1793        char c, c2;
1794    
1795        ccOrQCMask=CC_MASK|qcMask;
1796        
1797        DecomposeArgs decompArgs = new DecomposeArgs();
1798
1799        for(;;) {
1800            if(start==limit) {
1801                break; /* end of string */
1802            }
1803            c=src[start];
1804            if(c<minNoMaybe) {
1805                break; /* catches NUL terminater, too */
1806            }
1807    
1808            norm32=getNorm32(c);
1809            if((norm32&ccOrQCMask)==0) {
1810                break; /* true starter */
1811            }
1812    
1813            if(isNorm32LeadSurrogate(norm32)) {
1814                /* c is a lead surrogate, get the real norm32 */
1815                if((start+1)==limit ||
1816                                   !UTF16.isTrailSurrogate(c2=(src[start+1]))){
1817                    /* unmatched first surrogate: counts as a true starter */
1818                    break;
1819                }
1820                norm32=getNorm32FromSurrogatePair(norm32, c2);
1821    
1822                if((norm32&ccOrQCMask)==0) {
1823                    break; /* true starter */
1824                }
1825            } else {
1826                c2=0;
1827            }
1828    
1829            /* (c, c2) is not a true starter but its decomposition may be */
1830            if((norm32&decompQCMask)!=0) {
1831                /* (c, c2) decomposes, get everything from the variable-length
1832                 * extra data */

1833                p=decompose(norm32, decompQCMask, decompArgs);
1834    
1835                /* get the first character's norm32 to check if it is a true
1836                 * starter */

1837                if(decompArgs.cc==0 && (getNorm32(extraData,p, qcMask)&qcMask)==0) {
1838                    break; /* true starter */
1839                }
1840            }
1841    
1842            start+= c2==0 ? 1 : 2; /* not a true starter, continue */
1843        }
1844    
1845        return start;
1846    }
1847    
1848    
1849    private static final class ComposePartArgs{
1850        int prevCC;
1851        int length; /* length of decomposed part */
1852    }
1853        
1854     /* decompose and recompose [prevStarter..src[ */
1855    private static char[] composePart(ComposePartArgs args,
1856                                      int prevStarter,
1857                                         char[] src, int start, int limit,
1858                                       int options,
1859                                       UnicodeSet nx) {
1860        int recomposeLimit;
1861        boolean compat =((options&OPTIONS_COMPAT)!=0);
1862        
1863        /* decompose [prevStarter..src[ */
1864        int[] outTrailCC = new int[1];
1865        char[] buffer = new char[(limit-prevStarter)*MAX_BUFFER_SIZE];
1866
1867        for(;;){
1868            args.length=decompose(src,prevStarter,(start),
1869                                      buffer,0,buffer.length,
1870                                      compat,outTrailCC,nx);
1871            if(args.length<=buffer.length){
1872                break;
1873            }else{
1874                buffer = new char[args.length];
1875            }
1876        }
1877    
1878        /* recompose the decomposition */
1879        recomposeLimit=args.length;
1880          
1881        if(args.length>=2) {
1882            RecomposeArgs rcArgs = new RecomposeArgs();
1883            rcArgs.source = buffer;
1884            rcArgs.start = 0;
1885            rcArgs.limit = recomposeLimit;
1886            args.prevCC=recompose(rcArgs, options, nx);
1887            recomposeLimit = rcArgs.limit;
1888        }
1889        
1890        /* return with a pointer to the recomposition and its length */
1891        args.length=recomposeLimit;
1892        return buffer;
1893    }
1894    
1895    private static boolean composeHangul(char prev, char c,
1896                                         long/*unsigned*/ norm32,
1897                                         char[] src,int[] srcIndex, int limit,
1898                                            boolean compat,
1899                                         char[] dest,int destIndex,
1900                                         UnicodeSet nx) {
1901        int start=srcIndex[0];
1902        if(isJamoVTNorm32JamoV(norm32)) {
1903            /* c is a Jamo V, compose with previous Jamo L and
1904             * following Jamo T */

1905            prev=(char)(prev-JAMO_L_BASE);
1906            if(prev<JAMO_L_COUNT) {
1907                c=(char)(HANGUL_BASE+(prev*JAMO_V_COUNT+
1908                                                 (c-JAMO_V_BASE))*JAMO_T_COUNT);
1909    
1910                /* check if the next character is a Jamo T (normal or
1911                 * compatibility) */

1912                if(start!=limit) {
1913                    char next, t;
1914    
1915                    next=src[start];
1916                    if((t=(char)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
1917                        /* normal Jamo T */
1918                        ++start;
1919                        c+=t;
1920                    } else if(compat) {
1921                        /* if NFKC, then check for compatibility Jamo T
1922                         * (BMP only) */

1923                        norm32=getNorm32(next);
1924                        if(isNorm32Regular(norm32) && ((norm32&QC_NFKD)!=0)) {
1925                            int p /*index into extra data array*/;
1926                            DecomposeArgs dcArgs = new DecomposeArgs();
1927                            p=decompose(norm32, QC_NFKD, dcArgs);
1928                            if(dcArgs.length==1 &&
1929                                   (t=(char)(extraData[p]-JAMO_T_BASE))
1930                                                   <JAMO_T_COUNT) {
1931                                /* compatibility Jamo T */
1932                                ++start;
1933                                c+=t;
1934                            }
1935                        }
1936                    }
1937                }
1938                if(nx_contains(nx, c)) {
1939                    if(!isHangulWithoutJamoT(c)) {
1940                        --start; /* undo ++start from reading the Jamo T */
1941                    }
1942                    return false;
1943                }
1944                dest[destIndex]=c;
1945                srcIndex[0]=start;
1946                return true;
1947            }
1948        } else if(isHangulWithoutJamoT(prev)) {
1949            /* c is a Jamo T, compose with previous Hangul LV that does not
1950             * contain a Jamo T */

1951            c=(char)(prev+(c-JAMO_T_BASE));
1952            if(nx_contains(nx, c)) {
1953                return false;
1954            }
1955            dest[destIndex]=c;
1956            srcIndex[0]=start;
1957            return true;
1958        }
1959        return false;
1960    }
1961    /*
1962    public static int compose(char[] src, char[] dest,boolean compat, UnicodeSet nx){
1963        return compose(src,0,src.length,dest,0,dest.length,compat, nx);
1964    }
1965    */

1966    
1967    public static int compose(char[] src, int srcStart, int srcLimit,
1968                              char[] dest,int destStart,int destLimit,
1969                              int options,UnicodeSet nx) {
1970        
1971        int prevSrc, prevStarter;
1972        long/*unsigned*/ norm32;
1973        int ccOrQCMask, qcMask;
1974        int reorderStartIndex, length;
1975        char c, c2, minNoMaybe;
1976        int/*unsigned byte*/ cc, prevCC;
1977        int[] ioIndex = new int[1];
1978        int destIndex = destStart;
1979        int srcIndex = srcStart;
1980        
1981        if((options&OPTIONS_COMPAT)!=0) {
1982            minNoMaybe=(char)indexes[INDEX_MIN_NFKC_NO_MAYBE];
1983            qcMask=QC_NFKC;
1984        } else {
1985            minNoMaybe=(char)indexes[INDEX_MIN_NFC_NO_MAYBE];
1986            qcMask=QC_NFC;
1987        }
1988    
1989        /*
1990         * prevStarter points to the last character before the current one
1991         * that is a "true" starter with cc==0 and quick check "yes".
1992         *
1993         * prevStarter will be used instead of looking for a true starter
1994         * while incrementally decomposing [prevStarter..prevSrc[
1995         * in _composePart(). Having a good prevStarter allows to just decompose
1996         * the entire [prevStarter..prevSrc[.
1997         *
1998         * When _composePart() backs out from prevSrc back to prevStarter,
1999         * then it also backs out destIndex by the same amount.
2000         * Therefore, at all times, the (prevSrc-prevStarter) source units
2001         * must correspond 1:1 to destination units counted with destIndex,
2002         * except for reordering.
2003         * This is true for the qc "yes" characters copied in the fast loop,
2004         * and for pure reordering.
2005         * prevStarter must be set forward to src when this is not true:
2006         * In _composePart() and after composing a Hangul syllable.
2007         *
2008         * This mechanism relies on the assumption that the decomposition of a
2009         * true starter also begins with a true starter. gennorm/store.c checks
2010         * for this.
2011         */

2012        prevStarter=srcIndex;
2013    
2014        ccOrQCMask=CC_MASK|qcMask;
2015        /*destIndex=*/reorderStartIndex=0;/* ####TODO#### check this **/
2016        prevCC=0;
2017    
2018        /* avoid compiler warnings */
2019        norm32=0;
2020        c=0;
2021    
2022        for(;;) {
2023            /* count code units below the minimum or with irrelevant data for
2024             * the quick check */

2025            prevSrc=srcIndex;
2026
2027            while(srcIndex!=srcLimit && ((c=src[srcIndex])<minNoMaybe ||
2028                     ((norm32=getNorm32(c))&ccOrQCMask)==0)) {
2029                prevCC=0;
2030                ++srcIndex;
2031            }
2032
2033    
2034            /* copy these code units all at once */
2035            if(srcIndex!=prevSrc) {
2036                length=(int)(srcIndex-prevSrc);
2037                if((destIndex+length)<=destLimit) {
2038                    System.arraycopy(src,prevSrc,dest,destIndex,length);
2039                }
2040                destIndex+=length;
2041                reorderStartIndex=destIndex;
2042    
2043                /* set prevStarter to the last character in the quick check
2044                 * loop */

2045                prevStarter=srcIndex-1;
2046                if(UTF16.isTrailSurrogate(src[prevStarter]) &&
2047                    prevSrc<prevStarter &&
2048                    UTF16.isLeadSurrogate(src[(prevStarter-1)])) {
2049                    --prevStarter;
2050                }
2051    
2052                prevSrc=srcIndex;
2053            }
2054    
2055            /* end of source reached? */
2056            if(srcIndex==srcLimit) {
2057                break;
2058            }
2059    
2060            /* c already contains *src and norm32 is set for it, increment src*/
2061            ++srcIndex;
2062    
2063            /*
2064             * source buffer pointers:
2065             *
2066             * all done quick check current char not yet
2067             * "yes" but (c, c2) processed
2068             * may combine
2069             * forward
2070             * [-------------[-------------[-------------[-------------[
2071             * | | | | |
2072             * start prevStarter prevSrc src limit
2073             *
2074             *
2075             * destination buffer pointers and indexes:
2076             *
2077             * all done might take not filled yet
2078             * characters for
2079             * reordering
2080             * [-------------[-------------[-------------[
2081             * | | | |
2082             * dest reorderStartIndex destIndex destCapacity
2083             */

2084    
2085            /* check one above-minimum, relevant code unit */
2086            /*
2087             * norm32 is for c=*(src-1), and the quick check flag is "no" or
2088             * "maybe", and/or cc!=0
2089             * check for Jamo V/T, then for surrogates and regular characters
2090             * c is not a Hangul syllable or Jamo L because
2091             * they are not marked with no/maybe for NFC & NFKC(and their cc==0)
2092             */

2093            if(isNorm32HangulOrJamo(norm32)) {
2094                /*
2095                 * c is a Jamo V/T:
2096                 * try to compose with the previous character, Jamo V also with
2097                 * a following Jamo T, and set values here right now in case we
2098                 * just continue with the main loop
2099                 */

2100                prevCC=cc=0;
2101                reorderStartIndex=destIndex;
2102                ioIndex[0]=srcIndex;
2103                if(
2104                    destIndex>0 &&
2105                    composeHangul(src[(prevSrc-1)], c, norm32,src, ioIndex,
2106                                  srcLimit, (options&OPTIONS_COMPAT)!=0, dest,
2107                                  destIndex<=destLimit ? destIndex-1: 0,
2108                                  nx)
2109                ) {
2110                    srcIndex=ioIndex[0];
2111                    prevStarter=srcIndex;
2112                    continue;
2113                }
2114                
2115                srcIndex = ioIndex[0];
2116    
2117                /* the Jamo V/T did not compose into a Hangul syllable, just
2118                 * append to dest */

2119                c2=0;
2120                length=1;
2121                prevStarter=prevSrc;
2122            } else {
2123                if(isNorm32Regular(norm32)) {
2124                    c2=0;
2125                    length=1;
2126                } else {
2127                    /* c is a lead surrogate, get the real norm32 */
2128                    if(srcIndex!=srcLimit &&
2129                                     UTF16.isTrailSurrogate(c2=src[srcIndex])) {
2130                        ++srcIndex;
2131                        length=2;
2132                        norm32=getNorm32FromSurrogatePair(norm32, c2);
2133                    } else {
2134                        /* c is an unpaired lead surrogate, nothing to do */
2135                        c2=0;
2136                        length=1;
2137                        norm32=0;
2138                    }
2139                }
2140                ComposePartArgs args =new ComposePartArgs();
2141                
2142                /* we are looking at the character (c, c2) at [prevSrc..src[ */
2143                if(nx_contains(nx, c, c2)) {
2144                    /* excluded: norm32==0 */
2145                    cc=0;
2146                } else if((norm32&qcMask)==0) {
2147                    cc=(int)((UNSIGNED_BYTE_MASK)&(norm32>>CC_SHIFT));
2148                } else {
2149                    char[] p;
2150    
2151                    /*
2152                     * find appropriate boundaries around this character,
2153                     * decompose the source text from between the boundaries,
2154                     * and recompose it
2155                     *
2156                     * this puts the intermediate text into the side buffer because
2157                     * it might be longer than the recomposition end result,
2158                     * or the destination buffer may be too short or missing
2159                     *
2160                     * note that destIndex may be adjusted backwards to account
2161                     * for source text that passed the quick check but needed to
2162                     * take part in the recomposition
2163                     */

2164                    int decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
2165                    /*
2166                     * find the last true starter in [prevStarter..src[
2167                     * it is either the decomposition of the current character (at prevSrc),
2168                     * or prevStarter
2169                     */

2170                    if(isTrueStarter(norm32, CC_MASK|qcMask, decompQCMask)) {
2171                        prevStarter=prevSrc;
2172                    } else {
2173                        /* adjust destIndex: back out what had been copied with qc "yes" */
2174                        destIndex-=prevSrc-prevStarter;
2175                    }
2176                
2177                    /* find the next true starter in [src..limit[ */
2178                    srcIndex=findNextStarter(src, srcIndex,srcLimit, qcMask,
2179                                               decompQCMask, minNoMaybe);
2180                    //args.prevStarter = prevStarter;
2181
args.prevCC = prevCC;
2182                    //args.destIndex = destIndex;
2183
args.length = length;
2184                    p=composePart(args,prevStarter,src,srcIndex,srcLimit,options,nx);
2185                        
2186                    if(p==null) {
2187                        /* an error occurred (out of memory) */
2188                        break;
2189                    }
2190                    
2191                    prevCC = args.prevCC;
2192                    length = args.length;
2193                    
2194                    /* append the recomposed buffer contents to the destination
2195                     * buffer */

2196                    if((destIndex+args.length)<=destLimit) {
2197                        int i=0;
2198                        while(i<args.length) {
2199                            dest[destIndex++]=p[i++];
2200                            --length;
2201                        }
2202                    } else {
2203                        /* buffer overflow */
2204                        /* keep incrementing the destIndex for preflighting */
2205                        destIndex+=length;
2206                    }
2207    
2208                    prevStarter=srcIndex;
2209                    continue;
2210                }
2211            }
2212    
2213            /* append the single code point (c, c2) to the destination buffer */
2214            if((destIndex+length)<=destLimit) {
2215                if(cc!=0 && cc<prevCC) {
2216                    /* (c, c2) is out of order with respect to the preceding
2217                     * text */

2218                    int reorderSplit= destIndex;
2219                    destIndex+=length;
2220                    prevCC=insertOrdered(dest,reorderStartIndex, reorderSplit,
2221                                         destIndex, c, c2, cc);
2222                } else {
2223                    /* just append (c, c2) */
2224                    dest[destIndex++]=c;
2225                    if(c2!=0) {
2226                        dest[destIndex++]=c2;
2227                    }
2228                    prevCC=cc;
2229                }
2230            } else {
2231                /* buffer overflow */
2232                /* keep incrementing the destIndex for preflighting */
2233                destIndex+=length;
2234                prevCC=cc;
2235            }
2236        }
2237
2238        return destIndex - destStart;
2239    }
2240    /* make FCD --------------------------------------------------------------*/
2241    
2242    private static int/*index*/ findSafeFCD(char[] src, int start, int limit,
2243                                            char fcd16) {
2244        char c, c2;
2245    
2246        /*
2247         * find the first position in [src..limit[ after some cc==0 according
2248         * to FCD data
2249         *
2250         * at the beginning of the loop, we have fcd16 from before src
2251         *
2252         * stop at positions:
2253         * - after trail cc==0
2254         * - at the end of the source
2255         * - before lead cc==0
2256         */

2257        for(;;) {
2258            /* stop if trail cc==0 for the previous character */
2259            if((fcd16&0xff)==0) {
2260                break;
2261            }
2262    
2263            /* get c=*src - stop at end of string */
2264            if(start==limit) {
2265                break;
2266            }
2267            c=src[start];
2268    
2269            /* stop if lead cc==0 for this character */
2270            if(c<MIN_WITH_LEAD_CC || (fcd16=getFCD16(c))==0) {
2271                break; /* catches terminating NUL, too */
2272            }
2273    
2274            if(!UTF16.isLeadSurrogate(c)) {
2275                if(fcd16<=0xff) {
2276                    break;
2277                }
2278                ++start;
2279            } else if(start+1!=limit &&
2280                                    (UTF16.isTrailSurrogate(c2=src[start+1]))) {
2281                /* c is a lead surrogate, get the real fcd16 */
2282                fcd16=getFCD16FromSurrogatePair(fcd16, c2);
2283                if(fcd16<=0xff) {
2284                    break;
2285                }
2286                start+=2;
2287            } else {
2288                /* c is an unpaired first surrogate, lead cc==0 */
2289                break;
2290            }
2291        }
2292    
2293        return start;
2294    }
2295    
/**
 * Canonically decomposes src[start..decompLimit[ into dest, keeping the
 * output in combining-class order.
 *
 * Each character is either appended as is (when it is in nx or its norm32
 * has no QC_NFD bit) or replaced by its decomposition from extraData.
 * A character or decomposition whose lead cc is lower than the trail cc of
 * the preceding output is placed with insertOrdered()/mergeOrdered()
 * instead of being appended.
 *
 * @param src          source text
 * @param start        index of the first code unit to decompose
 * @param decompLimit  index just past the last code unit to decompose
 * @param dest         destination buffer
 * @param destIndexArr in/out: element 0 holds the destination write index on
 *                     entry and is updated to the new index on return; the
 *                     index keeps growing even when nothing can be written
 * @param nx           exclusion set consulted through nx_contains(); excluded
 *                     characters are treated as cc==0 and are not decomposed
 * @return the trail combining class of the last piece processed
 */
2296    private static int/*unsigned byte*/ decomposeFCD(char[] src,
2297                                                     int start,int decompLimit,
2298                                                     char[] dest,
2299                                                     int[] destIndexArr,
2300                                                     UnicodeSet nx) {
2301        char[] p=null;
2302        int pStart=-1;
2303        
2304        long /*unsigned int*/ norm32;
2305        int reorderStartIndex;
2306        char c, c2;
2307        int/*unsigned byte*/ prevCC;
2308        DecomposeArgs args = new DecomposeArgs();
2309        int destIndex = destIndexArr[0];
2310        /*
2311         * canonically decompose [src..decompLimit[
2312         *
2313         * all characters in this range have some non-zero cc,
2314         * directly or in decomposition,
2315         * so that we do not need to check in the following for quick-check
2316         * limits etc.
2317         *
2318         * there _are_ _no_ Hangul syllables or Jamos in here because they are
2319         * FCD-safe (cc==0)!
2320         *
2321         * we also do not need to check for c==0 because we have an established
2322         * decompLimit
2323         */

2324        reorderStartIndex=destIndex;
2325        prevCC=0;
2326
2327        while(start<decompLimit) {
2328            c=src[start++];
2329            norm32=getNorm32(c);
2330            if(isNorm32Regular(norm32)) {
2331                c2=0;
2332                args.length=1;
2333            } else {
2334                /*
2335                 * reminder: this function is called with [src..decompLimit[
2336                 * not containing any Hangul/Jamo characters,
2337                 * therefore the only specials are lead surrogates
2338                 */

2339                /* c is a lead surrogate, get the real norm32 */
2340                if(start!=decompLimit && UTF16.isTrailSurrogate(c2=src[start])){
2341                    ++start;
2342                    args.length=2;
2343                    norm32=getNorm32FromSurrogatePair(norm32, c2);
2344                } else {
2345                    c2=0;
2346                    args.length=1;
2347                    norm32=0;
2348                }
2349            }
2350    
2351            /* get the decomposition and the lead and trail cc's */
2352            if(nx_contains(nx, c, c2)) {
2353                /* excluded: norm32==0 */
2354                args.cc=args.trailCC=0;
2355                p=null;
2356            } else if((norm32&QC_NFD)==0) {
2357                /* c does not decompose */
2358                args.cc=args.trailCC=(int)((UNSIGNED_BYTE_MASK)&
2359                                                            (norm32>>CC_SHIFT));
2360                p=null;
2361            } else {
2362                /* c decomposes, get everything from the variable-length extra
2363                 * data */

2364                pStart=decompose(norm32, args);
2365                p=extraData;
2366                if(args.length==1) {
2367                    /* fastpath a single code unit from decomposition */
2368                    c=p[pStart];
2369                    c2=0;
2370                    p=null;
2371                }
2372            }
2373    
2374            /* append the decomposition to the destination buffer, assume
2375             * length>0 */

            /* NOTE(review): capacity is checked against dest.length here,
             * while makeFCD() checks its own writes against its destLimit
             * argument - confirm that callers always pass
             * destLimit==dest.length */
2376            if((destIndex+args.length)<=dest.length) {
2377                int reorderSplit=destIndex;
2378                if(p==null) {
2379                    /* fastpath: single code point */
2380                    if(args.cc!=0 && args.cc<prevCC) {
2381                        /* (c, c2) is out of order with respect to the preceding
2382                         * text */

2383                        destIndex+=args.length;
2384                        args.trailCC=insertOrdered(dest,reorderStartIndex,
2385                                                   reorderSplit, destIndex,
2386                                                   c, c2, args.cc);
2387                    } else {
2388                        /* just append (c, c2) */
2389                        dest[destIndex++]=c;
2390                        if(c2!=0) {
2391                            dest[destIndex++]=c2;
2392                        }
2393                    }
2394                } else {
2395                    /* general: multiple code points (ordered by themselves)
2396                     * from decomposition */

2397                    if(args.cc!=0 && args.cc<prevCC) {
2398                        /* the decomposition is out of order with respect to
2399                         * the preceding text */

2400                        destIndex+=args.length;
2401                        args.trailCC=mergeOrdered(dest,reorderStartIndex,
2402                                                  reorderSplit, p, pStart,
2403                                                  pStart+args.length);
2404                    } else {
2405                        /* just append the decomposition */
2406                        do {
2407                            dest[destIndex++]=p[pStart++];
2408                        } while(--args.length>0);
2409                    }
2410                }
2411            } else {
2412                /* buffer overflow */
2413                /* keep incrementing the destIndex for preflighting */
2414                destIndex+=args.length;
2415            }
2416    
            /* a trail cc of 0 starts a new reordering segment at the current
             * output position */
2417            prevCC=args.trailCC;
2418            if(prevCC==0) {
2419                reorderStartIndex=destIndex;
2420            }
2421        }
2422        destIndexArr[0]=destIndex;
2423        return prevCC;
2424    }
2425    
/**
 * Copies src[srcStart..srcLimit[ to dest, decomposing and reordering only
 * the pieces whose combining-class order fails the FCD check.
 *
 * Runs of code units below MIN_WITH_LEAD_CC or with fcd16==0 are copied in
 * bulk. Every other character has its lead cc compared with the previous
 * trail cc; when the order is wrong, the output is backed up to decompStart
 * and the range up to the position returned by findSafeFCD() is re-emitted
 * through decomposeFCD().
 *
 * @param src       source text
 * @param srcStart  index of the first source code unit
 * @param srcLimit  index just past the last source code unit
 * @param dest      destination buffer
 * @param destStart index in dest at which output begins
 * @param destLimit limit used for the capacity checks made in this method;
 *                  output that does not fit is not written but is still
 *                  counted
 * @param nx        exclusion set consulted through nx_contains(); excluded
 *                  characters are treated as fcd16==0
 * @return the final destination index minus destStart
 */
2426    public static int makeFCD(char[] src, int srcStart, int srcLimit,
2427                              char[] dest, int destStart, int destLimit,
2428                              UnicodeSet nx) {
2429                           
2430        int prevSrc, decompStart;
2431        int destIndex, length;
2432        char c, c2;
2433        int /* unsigned int*/ fcd16;
2434        int prevCC, cc;
2435    
2436        /* initialize */
2437        decompStart=srcStart;
2438        destIndex=destStart;
2439        prevCC=0;
2440        c=0;
2441        fcd16=0;
2442        int[] destIndexArr = new int[1];
2443        destIndexArr[0]=destIndex;
2444        
2445        for(;;) {
2446            /* skip a run of code units below the minimum or with irrelevant
2447             * data for the FCD check */

2448            prevSrc=srcStart;
2449
2450            for(;;) {
2451                if(srcStart==srcLimit) {
2452                    break;
2453                } else if((c=src[srcStart])<MIN_WITH_LEAD_CC) {
2454                    prevCC=(int)-c;
2455                } else if((fcd16=getFCD16(c))==0) {
2456                    prevCC=0;
2457                } else {
2458                    break;
2459                }
2460                ++srcStart;
2461            }
2462
2463    
2464            /*
2465             * prevCC has values from the following ranges:
2466             * 0..0xff - the previous trail combining class
2467             * <0 - the negative value of the previous code unit;
2468             * that code unit was <_NORM_MIN_WITH_LEAD_CC and its
2469             * getFCD16()
2470             * was deferred so that average text is checked faster
2471             */

2472    
2473            /* copy these code units all at once */
2474            if(srcStart!=prevSrc) {
2475                length=(int)(srcStart-prevSrc);
2476                if((destIndex+length)<=destLimit) {
2477                    System.arraycopy(src,prevSrc,dest,destIndex,length);
2478                }
2479                destIndex+=length;
2480                prevSrc=srcStart;
2481    
2482                /* prevCC<0 is only possible from the above loop, i.e., only if
2483                 * prevSrc<src */

2484                if(prevCC<0) {
2485                    /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we
2486                     * need to get its trail cc */

2487                    if(!nx_contains(nx, (int)-prevCC)) {
2488                        prevCC=(int)(getFCD16((int)-prevCC)&0xff);
2489                    } else {
2490                        prevCC=0; /* excluded: fcd16==0 */
2491                    }
2492                    /*
2493                     * set a pointer to this below-U+0300 character;
2494                     * if prevCC==0 then it will be moved to after this character
2495                     * below
2496                     */

2497                    decompStart=prevSrc-1;
2498                }
2499            }
2500            /*
2501             * now:
2502             * prevSrc==src - used later to adjust destIndex before
2503             * decomposition
2504             * prevCC>=0
2505             */

2506    
2507            /* end of source reached? */
2508            if(srcStart==srcLimit) {
2509                break;
2510            }
2511    
2512            /* set a pointer to after the last source position where prevCC==0*/
2513            if(prevCC==0) {
2514                decompStart=prevSrc;
2515            }
2516    
2517            /* c already contains *src and fcd16 is set for it, increment src */
2518            ++srcStart;
2519    
2520            /* check one above-minimum, relevant code unit */
2521            if(UTF16.isLeadSurrogate(c)) {
2522                /* c is a lead surrogate, get the real fcd16 */
2523                if(srcStart!=srcLimit &&
2524                                     UTF16.isTrailSurrogate(c2=src[srcStart])) {
2525                    ++srcStart;
2526                    fcd16=getFCD16FromSurrogatePair((char)fcd16, c2);
2527                } else {
2528                    c2=0;
2529                    fcd16=0;
2530                }
2531            } else {
2532                c2=0;
2533            }
2534    
2535            /* we are looking at the character (c, c2) at [prevSrc..src[ */
2536            if(nx_contains(nx, c, c2)) {
2537                fcd16=0; /* excluded: fcd16==0 */
2538            }
2539            /* check the combining order, get the lead cc */
2540            cc=(int)(fcd16>>8);
2541            if(cc==0 || cc>=prevCC) {
2542                /* the order is ok */
2543                if(cc==0) {
2544                    decompStart=prevSrc;
2545                }
2546                prevCC=(int)(fcd16&0xff);
2547    
2548                /* just append (c, c2) */
2549                length= c2==0 ? 1 : 2;
2550                if((destIndex+length)<=destLimit) {
2551                    dest[destIndex++]=c;
2552                    if(c2!=0) {
2553                        dest[destIndex++]=c2;
2554                    }
2555                } else {
2556                    destIndex+=length;
2557                }
2558            } else {
2559                /*
2560                 * back out the part of the source that we copied already but
2561                 * is now going to be decomposed;
2562                 * prevSrc is set to after what was copied
2563                 */

2564                destIndex-=(int)(prevSrc-decompStart);
2565    
2566                /*
2567                 * find the part of the source that needs to be decomposed;
2568                 * to be safe and simple, decompose to before the next character
2569                 * with lead cc==0
2570                 */

2571                srcStart=findSafeFCD(src,srcStart, srcLimit, (char)fcd16);
2572    
2573                /*
2574                 * the source text does not fulfill the conditions for FCD;
2575                 * decompose and reorder a limited piece of the text
2576                 */

                /* destIndexArr carries destIndex into decomposeFCD() and
                 * brings the updated value back out */
2577                destIndexArr[0] = destIndex;
2578                prevCC=decomposeFCD(src,decompStart, srcStart,dest,
2579                                    destIndexArr,nx);
2580                decompStart=srcStart;
2581                destIndex=destIndexArr[0];
2582            }
2583        }
2584        
2585        return destIndex - destStart;
2586    
2587    }
2588
2589    public static int getCombiningClass(int c) {
2590        long norm32;
2591        norm32=getNorm32(c);
2592        return (char)((norm32>>CC_SHIFT)&0xFF);
2593    }
2594    
2595    public static boolean isFullCompositionExclusion(int c) {
2596        if(isFormatVersion_2_1) {
2597            int aux =AuxTrieImpl.auxTrie.getCodePointValue(c);
2598            return (boolean)((aux & AUX_COMP_EX_MASK)!=0);
2599        } else {
2600            return false;
2601        }
2602    }
2603    
2604    public static boolean isCanonSafeStart(int c) {
2605        if(isFormatVersion_2_1) {
2606            int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2607            return (boolean)((aux & AUX_UNSAFE_MASK)==0);
2608        } else {
2609            return false;
2610        }
2611    }
2612    
/**
 * Looks up the entry for code point c in the canonStartSets search tables
 * and, if one exists, stores the associated set in fillSet.
 *
 * Code points up to 0xffff are searched in the BMP table, whose entries are
 * pairs { c, result }; all others are searched in the supplementary table,
 * whose entries are triplets { high(c), low(c), result }. A result is either
 * an index handed to fillSet.getSet() or a single code point handed to
 * fillSet.setToOne().
 *
 * When ICUDebug.enabled() is true, the supplementary search prints its
 * state to System.err on every iteration.
 *
 * @param c       code point to look up
 * @param fillSet receives the set; nothing is looked up when it is null
 * @return the result of fillSet.getSet() for an indexed set, true for a
 *         single-code point set, and false when fillSet or canonStartSets
 *         is null or no entry for c is found
 */
2613    public static boolean getCanonStartSet(int c, USerializedSet fillSet) {
2614
2615        if(fillSet!=null && canonStartSets!=null) {
2616             /*
2617             * binary search for c
2618             *
2619             * There are two search tables,
2620             * one for BMP code points and one for supplementary ones.
2621             * See unormimp.h for details.
2622             */

2623            char[] table;
2624            int i=0, start, limit;
2625            
2626            int[] indexes = (int[]) canonStartSets[CANON_SET_INDICIES_INDEX];
2627            char[] startSets = (char[]) canonStartSets[CANON_SET_START_SETS_INDEX];
2628            
2629            if(c<=0xffff) {
2630                table=(char[]) canonStartSets[CANON_SET_BMP_TABLE_INDEX];
2631                start=0;
2632                limit=table.length;
2633    
2634                /* each entry is a pair { c, result } */
2635                while(start<limit-2) {
                    /* midpoint rounded down to an even index so that it
                     * addresses the first unit of a pair */
2636                    i=(char)(((start+limit)/4)*2);
2637                    if(c<table[i]) {
2638                        limit=i;
2639                    } else {
2640                        start=i;
2641                    }
2642                }
2643                //System.out.println(i);
2644
/* found? */
2645                if(c==table[start]) {
2646                    i=table[start+1];
2647                    if((i & CANON_SET_BMP_MASK)==CANON_SET_BMP_IS_INDEX) {
2648                        /* result 01xxxxxx xxxxxx contains index x to a
2649                         * USerializedSet */

2650                        i&=(CANON_SET_MAX_CANON_SETS-1);
2651                        return fillSet.getSet(startSets,(i-indexes.length));
2652                    } else {
2653                        /* other result values are BMP code points for
2654                         * single-code point sets */

2655                        fillSet.setToOne(i);
2656                        return true;
2657                    }
2658                }
2659            } else {
2660                char high, low, h,j=0;
2661    
2662                table=(char[]) canonStartSets[CANON_SET_SUPP_TABLE_INDEX];
2663                start=0;
2664                limit=table.length;
2665    
2666                high=(char)(c>>16);
2667                low=(char)c;
2668    
2669                /* each entry is a triplet { high(c), low(c), result } */
2670                while(start<limit-3) {
2671                    /* (start+limit)/2 and address triplets */
2672                    i=(char)(((start+limit)/6)*3);
2673                    j=(char)(table[i]&0x1f); /* high word */
2674                    int tableVal = table[i+1];
2675                    int lowInt = low;
2676                    if(high<j || ((tableVal>lowInt) && (high==j))) {
2677                        limit=i;
2678                    } else {
2679                        start=i;
2680                    }
2681                    
2682                    //System.err.println("\t((high==j) && (table[i+1]>low)) == " + ((high==j) && (tableVal>lowInt)) );
2683

2684                    // KLUDGE: IBM JIT in 1.4.0 is sooo broken
2685
// The below lines make TestExhaustive pass
2686
if(ICUDebug.enabled()){
2687                        System.err.println("\t\t j = " + Utility.hex(j,4) +
2688                                           "\t i = " + Utility.hex(i,4) +
2689                                           "\t high = "+ Utility.hex(high) +
2690                                           "\t low = " + Utility.hex(lowInt,4) +
2691                                           "\t table[i+1]: "+ Utility.hex(tableVal,4)
2692                                           );
2693                    }
2694                   
2695                }
2696
2697                /* found? */
2698                h=table[start];
2699
2700                //System.err.println("c: \\U"+ Integer.toHexString(c)+" i : "+Integer.toHexString(i) +" h : " + Integer.toHexString(h));
2701
int tableVal1 = table[start+1];
2702                int lowInt = low;
2703
2704                if(high==(h&0x1f) && lowInt==tableVal1) {
2705                    int tableVal2 = table[start+2];
2706                    i=tableVal2;
2707                    if((h&0x8000)==0) {
2708                        /* the result is an index to a USerializedSet */
2709                        return fillSet.getSet(startSets,(i-indexes.length));
2710                    } else {
2711                        /*
2712                         * single-code point set {x} in
2713                         * triplet { 100xxxxx 000hhhhh llllllll llllllll xxxxxxxx xxxxxxxx }
2714                         */

2715                        //i|=((int)h & 0x1f00)<<8; /* add high bits from high(c) */
2716
int temp = ((int)h & 0x1f00)<<8;
2717                        i|=temp; /* add high bits from high(c) */
2718                        fillSet.setToOne((int)i);
2719                        return true;
2720                    }
2721                }
2722            }
2723        }
2724    
2725        return false; /* not found */
2726    }
2727    
2728    public static int getFC_NFKC_Closure(int c, char[] dest) {
2729        
2730        int destCapacity;
2731         
2732        if(dest==null ) {
2733            destCapacity=0;
2734        }else{
2735            destCapacity = dest.length;
2736        }
2737        
2738        int aux =AuxTrieImpl.auxTrie.getCodePointValue(c);
2739
2740        aux&= AUX_FNC_MASK;
2741        if(aux!=0) {
2742            int s;
2743            int index=aux;
2744            int length;
2745            
2746            s =extraData[index];
2747            if(s<0xff00) {
2748                /* s points to the single-unit string */
2749                length=1;
2750            } else {
2751                length=s&0xff;
2752                ++index;
2753            }
2754            if(0<length && length<=destCapacity) {
2755                System.arraycopy(extraData,index,dest,0,length);
2756            }
2757            return length;
2758        } else {
2759            return 0;
2760        }
2761    }
2762
2763
2764    /* Is c an NF<mode>-skippable code point? See unormimp.h. */
2765    public static boolean isNFSkippable(int c, Normalizer.Mode mode, long mask) {
2766        long /*unsigned int*/ norm32;
2767        mask = mask & UNSIGNED_INT_MASK;
2768        char aux;
2769   
2770        /* check conditions (a)..(e), see unormimp.h */
2771        norm32 = getNorm32(c);
2772
2773        if((norm32&mask)!=0) {
2774            return false; /* fails (a)..(e), not skippable */
2775        }
2776    
2777        if(mode == Normalizer.NFD || mode == Normalizer.NFKD || mode == Normalizer.NONE){
2778            return true; /* NF*D, passed (a)..(c), is skippable */
2779        }
2780        /* check conditions (a)..(e), see unormimp.h */
2781
2782        /* NF*C/FCC, passed (a)..(e) */
2783        if((norm32& QC_NFD)==0) {
2784            return true; /* no canonical decomposition, is skippable */
2785        }
2786    
2787        /* check Hangul syllables algorithmically */
2788        if(isNorm32HangulOrJamo(norm32)) {
2789            /* Jamo passed (a)..(e) above, must be Hangul */
2790            return !isHangulWithoutJamoT((char)c); /* LVT are skippable, LV are not */
2791        }
2792    
2793        /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
2794        /* NF*C, test (f) flag */
2795        if(!isFormatVersion_2_2) {
2796            return false; /* no (f) data, say not skippable to be safe */
2797        }
2798    
2799
2800        aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2801        return (aux&AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */
2802    
2803        /* } else { FCC, test fcd<=1 instead of the above } */
2804    }
2805    
2806    /*
2807        private static final boolean
2808    _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
2809        // add the start code point to the USet
2810        uset_add((USet *)context, start);
2811        return TRUE;
2812    }
2813    */

2814
2815    public static UnicodeSet addPropertyStarts(UnicodeSet set) {
2816        int c;
2817       
2818        /* add the start code point of each same-value range of each trie */
2819        //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set);
2820
TrieIterator normIter = new TrieIterator(NormTrieImpl.normTrie);
2821        RangeValueIterator.Element normResult = new RangeValueIterator.Element();
2822        
2823        while(normIter.next(normResult)){
2824            set.add(normResult.start);
2825        }
2826        
2827        //utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
2828
TrieIterator fcdIter = new TrieIterator(FCDTrieImpl.fcdTrie);
2829        RangeValueIterator.Element fcdResult = new RangeValueIterator.Element();
2830
2831        while(fcdIter.next(fcdResult)){
2832            set.add(fcdResult.start);
2833        }
2834        
2835        if(isFormatVersion_2_1){
2836            //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set);
2837
TrieIterator auxIter = new TrieIterator(AuxTrieImpl.auxTrie);
2838            RangeValueIterator.Element auxResult = new RangeValueIterator.Element();
2839            while(auxIter.next(auxResult)){
2840                set.add(auxResult.start);
2841            }
2842        }
2843        /* add Hangul LV syllables and LV+1 because of skippables */
2844        for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
2845            set.add(c);
2846            set.add(c+1);
2847        }
2848        set.add(HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
2849        return set; // for chaining
2850
}
2851
2852    /**
2853     * Internal API, used in UCharacter.getIntPropertyValue().
2854     * @internal
2855     * @param c code point
2856     * @param modeValue numeric value compatible with Mode
2857     * @return numeric value compatible with QuickCheck
2858     */

2859    public static final int quickCheck(int c, int modeValue) {
2860        final int qcMask[/*UNORM_MODE_COUNT*/]={
2861            0, 0, QC_NFD, QC_NFKD, QC_NFC, QC_NFKC
2862        };
2863
2864        int norm32=(int)getNorm32(c)&qcMask[modeValue];
2865
2866        if(norm32==0) {
2867            return 1; // YES
2868
} else if((norm32&QC_ANY_NO)!=0) {
2869            return 0; // NO
2870
} else /* _NORM_QC_ANY_MAYBE */ {
2871            return 2; // MAYBE;
2872
}
2873    }
2874
2875    /**
2876     * Internal API, used by collation code.
2877     * Get access to the internal FCD trie table to be able to perform
2878     * incremental, per-code unit, FCD checks in collation.
2879     * One pointer is sufficient because the trie index values are offset
2880     * by the index size, so that the same pointer is used to access the trie
2881     * data.
2882     * @internal
     * @return the FCDTrieImpl.fcdTrie object itself, not a copy; note that
     *         this is an instance method although the trie is held in a
     *         static field
2883     */

2884    ///CLOVER:OFF
2885

public CharTrie getFCDTrie(){
2886        return FCDTrieImpl.fcdTrie;
2887    }
2888    ///CLOVER:ON
2889

2890
2891    
2892   /* compare canonically equivalent ---------------------------------------- */
2893
2894    /*
2895     * Compare two strings for canonical equivalence.
2896     * Further options include case-insensitive comparison and
2897     * code point order (as opposed to code unit order).
2898     *
2899     * In this function, canonical equivalence is optional as well.
2900     * If canonical equivalence is tested, then both strings must fulfill
2901     * the FCD check.
2902     *
2903     * Semantically, this is equivalent to
2904     * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
2905     * where code point order, NFD and foldCase are all optional.
2906     *
2907     * String comparisons almost always yield results before processing both
2908     * strings completely.
2909     * They are generally more efficient working incrementally instead of
2910     * performing the sub-processing (strlen, normalization, case-folding)
2911     * on the entire strings first.
2912     *
2913     * It is also unnecessary to normalize identical characters.
2914     *
2915     * This function works in principle as follows:
2916     *
2917     * loop {
2918     * get one code unit c1 from s1 (-1 if end of source)
2919     * get one code unit c2 from s2 (-1 if end of source)
2920     *
2921     * if(either string finished) {
2922     * return result;
2923     * }
2924     * if(c1==c2) {
2925     * continue;
2926     * }
2927     *
2928     * // c1!=c2
2929     * try to decompose/case-fold c1/c2, and continue if one does;
2930     *
2931     * // still c1!=c2 and neither decomposes/case-folds, return result
2932     * return c1-c2;
2933     * }
2934     *
2935     * When a character decomposes, then the pointer for that source changes to
2936     * the decomposition, pushing the previous pointer onto a stack.
2937     * When the end of the decomposition is reached, then the code unit reader
2938     * pops the previous source from the stack.
2939     * (Same for case-folding.)
2940     *
2941     * This is complicated further by operating on variable-width UTF-16.
2942     * The top part of the loop works on code units, while lookups for decomposition
2943     * and case-folding need code points.
2944     * Code points are assembled after the equality/end-of-source part.
2945     * The source pointer is only advanced beyond all code units when the code point
2946     * actually decomposes/case-folds.
2947     *
2948     * If we were on a trail surrogate unit when assembling a code point,
2949     * and the code point decomposes/case-folds, then the decomposition/folding
2950     * result must be compared with the part of the other string that corresponds to
2951     * this string's lead surrogate.
2952     * Since we only assemble a code point when hitting a trail unit when the
2953     * preceding lead units were identical, we back up the other string by one unit
2954     * in such a case.
2955     *
2956     * The optional code point order comparison at the end works with
2957     * the same fix-up as the other code point order comparison functions.
2958     * See ustring.c and the comment near the end of this function.
2959     *
2960     * Assumption: A decomposition or case-folding result string never contains
2961     * a single surrogate. This is a safe assumption in the Unicode Standard.
2962     * Therefore, we do not need to check for surrogate pairs across
2963     * decomposition/case-folding boundaries.
2964     * Further assumptions (see verifications tstnorm.cpp):
2965     * The API function checks for FCD first, while the core function
2966     * first case-folds and then decomposes. This requires that case-folding does not
2967     * un-FCD any strings.
2968     *
2969     * The API function may also NFD the input and turn off decomposition.
2970     * This requires that case-folding does not un-NFD strings either.
2971     *
2972     * TODO If any of the above two assumptions is violated,
2973     * then this entire code must be re-thought.
2974     * If this happens, then a simple solution is to case-fold both strings up front
2975     * and to turn off UNORM_INPUT_IS_FCD.
2976     * We already do this when not both strings are in FCD because makeFCD
2977     * would be a partial NFD before the case folding, which does not work.
2978     * Note that all of this is only a problem when case-folding _and_
2979     * canonical equivalence come together.
2980     *
2981     * This function could be moved to a different source file, at increased cost
2982     * for calling the decomposition access function.
2983     */

2984    
2985    // stack element for previous-level source/decomposition pointers
2986
private static class CmpEquivLevel {
    // One saved reading position of cmpEquivFold: the state that is pushed
    // before switching to a case-folding or decomposition buffer and
    // restored when that buffer has been consumed.
2987        char[] source; // array that was being read when this level was pushed
2988        int start;     // start index of that level's range; -1 marks an empty, skipped level
2989        int s;         // index at which reading resumes after the pop
2990        int limit;     // end index (exclusive) of that level's range
2991    }
2992    
2993    /**
2994     * Get the canonical decomposition for one code point.
2995     * @param c code point
2996     * @param buffer out-only buffer for algorithmic decompositions of Hangul
2997     * @param length out-only, takes the length of the decomposition, if any
2998     * @return index into the extraData array, or 0 if none
2999     * @internal
3000     */

3001     private static int decompose(int c, char[] buffer) {
3002        
3003        long norm32;
3004        int length=0;
3005        norm32 = (long) ((UNSIGNED_INT_MASK) & NormTrieImpl.normTrie.getCodePointValue(c));
3006        if((norm32 & QC_NFD)!=0) {
3007            if(isNorm32HangulOrJamo(norm32)) {
3008                /* Hangul syllable: decompose algorithmically */
3009                char c2;
3010    
3011                c-=HANGUL_BASE;
3012    
3013                c2=(char)(c%JAMO_T_COUNT);
3014                c/=JAMO_T_COUNT;
3015                if(c2>0) {
3016                    buffer[2]=(char)(JAMO_T_BASE+c2);
3017                    length=3;
3018                } else {
3019                    length=2;
3020                }
3021                buffer[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT);
3022                buffer[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT);
3023                return length;
3024            } else {
3025                /* normal decomposition */
3026                DecomposeArgs args = new DecomposeArgs();
3027                int index = decompose(norm32, args);
3028                System.arraycopy(extraData,index,buffer,0,args.length);
3029                return args.length ;
3030            }
3031        } else {
3032            return 0;
3033        }
3034    }
3035
3036    private static int foldCase(int c, char[] dest, int destStart, int destLimit,
3037                                 int options){
3038        String JavaDoc src = UTF16.valueOf(c);
3039        String JavaDoc foldedStr = UCharacter.foldCase(src,options);
3040        char[] foldedC = foldedStr.toCharArray();
3041        for(int i=0;i<foldedC.length;i++){
3042            if(destStart<destLimit){
3043                dest[destStart]=foldedC[i];
3044            }
3045            // always increment destStart so that we can return
3046
// the required length
3047
destStart++;
3048        }
3049        return (c==UTF16.charAt(foldedStr,0)) ? -destStart : destStart;
3050    }
3051    
3052    /*
3053     private static int foldCase(char[] src,int srcStart,int srcLimit,
3054                                char[] dest, int destStart, int destLimit,
3055                                int options){
3056        String source =new String(src,srcStart,(srcLimit-srcStart));
3057        String foldedStr = UCharacter.foldCase(source,options);
3058        char[] foldedC = foldedStr.toCharArray();
3059        for(int i=0;i<foldedC.length;i++){
3060            if(destStart<destLimit){
3061                dest[destStart]=foldedC[i];
3062            }
3063            // always increment destStart so that we can return
3064            // the required length
3065            destStart++;
3066            
3067        }
3068        return destStart;
3069    }
3070    */

3071    public static int cmpEquivFold(String JavaDoc s1, String JavaDoc s2,int options){
3072        return cmpEquivFold(s1.toCharArray(),0,s1.length(),
3073                            s2.toCharArray(),0,s2.length(),
3074                            options);
3075    }
3076    
3077    
3078    // internal function
3079
public static int cmpEquivFold(char[] s1, int s1Start,int s1Limit,
3080                                   char[] s2, int s2Start,int s2Limit,
3081                                   int options) {
3082        // current-level start/limit - s1/s2 as current
3083
int start1, start2, limit1, limit2;
3084        char[] cSource1, cSource2;
3085        
3086        cSource1 = s1;
3087        cSource2 = s2;
3088        // decomposition variables
3089
int length;
3090    
3091        // stacks of previous-level start/current/limit
3092
CmpEquivLevel[] stack1 = new CmpEquivLevel[]{
3093                                                    new CmpEquivLevel(),
3094                                                    new CmpEquivLevel()
3095                                                  };
3096        CmpEquivLevel[] stack2 = new CmpEquivLevel[]{
3097                                                    new CmpEquivLevel(),
3098                                                    new CmpEquivLevel()
3099                                                  };
3100    
3101        // decomposition buffers for Hangul
3102
char[] decomp1 = new char[8];
3103        char[] decomp2 = new char[8];
3104    
3105        // case folding buffers, only use current-level start/limit
3106
char[] fold1 = new char[32];
3107        char[] fold2 = new char[32];
3108    
3109        // track which is the current level per string
3110
int level1, level2;
3111    
3112        // current code units, and code points for lookups
3113
int c1, c2;
3114        int cp1, cp2;
3115    
3116        // no argument error checking because this itself is not an API
3117

3118        // assume that at least one of the options COMPARE_EQUIV and
3119
// COMPARE_IGNORE_CASE is set
3120
// otherwise this function must behave exactly as uprv_strCompare()
3121
// not checking for that here makes testing this function easier
3122

3123    
3124        // initialize
3125
start1=s1Start;
3126        limit1=s1Limit;
3127        
3128        start2=s2Start;
3129        limit2=s2Limit;
3130        
3131        level1=level2=0;
3132        c1=c2=-1;
3133        cp1=cp2=-1;
3134        // comparison loop
3135
for(;;) {
3136            // here a code unit value of -1 means "get another code unit"
3137
// below it will mean "this source is finished"
3138

3139            if(c1<0) {
3140                // get next code unit from string 1, post-increment
3141
for(;;) {
3142                    if(s1Start>=limit1) {
3143                        if(level1==0) {
3144                            c1=-1;
3145                            break;
3146                        }
3147                    } else {
3148                        c1=cSource1[s1Start];
3149                        ++s1Start;
3150                        break;
3151                    }
3152    
3153                    // reached end of level buffer, pop one level
3154
do {
3155                        --level1;
3156                        start1=stack1[level1].start;
3157                    } while(start1==-1); //###### check this
3158
s1Start=stack1[level1].s;
3159                    limit1=stack1[level1].limit;
3160                    cSource1=stack1[level1].source;
3161                }
3162            }
3163    
3164            if(c2<0) {
3165                // get next code unit from string 2, post-increment
3166
for(;;) {
3167                    if(s2Start>=limit2) {
3168                        if(level2==0) {
3169                            c2=-1;
3170                            break;
3171                        }
3172                    } else {
3173                        c2=cSource2[s2Start];
3174                        ++s2Start;
3175                        break;
3176                    }
3177    
3178                    // reached end of level buffer, pop one level
3179
do {
3180                        --level2;
3181                        start2=stack2[level2].start;
3182                    } while(start2==-1);
3183                    s2Start=stack2[level2].s;
3184                    limit2=stack2[level2].limit;
3185                    cSource2=stack2[level2].source;
3186                }
3187            }
3188    
3189            // compare c1 and c2
3190
// either variable c1, c2 is -1 only if the corresponding string
3191
// is finished
3192
if(c1==c2) {
3193                if(c1<0) {
3194                    return 0; // c1==c2==-1 indicating end of strings
3195
}
3196                c1=c2=-1; // make us fetch new code units
3197
continue;
3198            } else if(c1<0) {
3199                return -1; // string 1 ends before string 2
3200
} else if(c2<0) {
3201                return 1; // string 2 ends before string 1
3202
}
3203            // c1!=c2 && c1>=0 && c2>=0
3204

3205            // get complete code points for c1, c2 for lookups if either is a
3206
// surrogate
3207
cp1=c1;
3208            if(UTF16.isSurrogate((char)c1)) {
3209                char c;
3210    
3211                if(UTF16.isLeadSurrogate((char)c1)) {
3212                    if( s1Start!=limit1 &&
3213                           UTF16.isTrailSurrogate(c=cSource1[s1Start])
3214                      ) {
3215                        // advance ++s1; only below if cp1 decomposes/case-folds
3216
cp1=UCharacterProperty.getRawSupplementary((char)c1, c);
3217                    }
3218                } else /* isTrail(c1) */ {
3219                    if( start1<=(s1Start-2) &&
3220                            UTF16.isLeadSurrogate(c=cSource1[(s1Start-2)])
3221                      ) {
3222                        cp1=UCharacterProperty.getRawSupplementary(c, (char)c1);
3223                    }
3224                }
3225            }
3226            cp2=c2;
3227            if(UTF16.isSurrogate((char)c2)) {
3228                char c;
3229    
3230                if(UTF16.isLeadSurrogate((char)c2)) {
3231                    if( s2Start!=limit2 &&
3232                           UTF16.isTrailSurrogate(c=cSource2[s2Start])
3233                      ) {
3234                        // advance ++s2; only below if cp2 decomposes/case-folds
3235
cp2=UCharacterProperty.getRawSupplementary((char)c2, c);
3236                    }
3237                } else /* isTrail(c2) */ {
3238                    if( start2<=(s2Start-2) &&
3239                           UTF16.isLeadSurrogate(c=cSource2[s2Start-2])
3240                      ) {
3241                        cp2=UCharacterProperty.getRawSupplementary(c, (char)c2);
3242                    }
3243                }
3244            }
3245    
3246            // go down one level for each string
3247
// continue with the main loop as soon as there is a real change
3248
if( level1<2 && ((options & Normalizer.COMPARE_IGNORE_CASE)!=0)&&
3249                (length=foldCase(cp1, fold1, 0,32,options))>=0
3250            ) {
3251                // cp1 case-folds to fold1[length]
3252
if(UTF16.isSurrogate((char)c1)) {
3253                    if(UTF16.isLeadSurrogate((char)c1)) {
3254                        // advance beyond source surrogate pair if it
3255
// case-folds
3256
++s1Start;
3257                    } else /* isTrail(c1) */ {
3258                        // we got a supplementary code point when hitting its
3259
// trail surrogate, therefore the lead surrogate must
3260
// have been the same as in the other string;
3261
// compare this decomposition with the lead surrogate
3262
// in the other string
3263
--s2Start;
3264                        c2=cSource2[(s2Start-1)];
3265                    }
3266                }
3267    
3268                // push current level pointers
3269
stack1[0].start=start1;
3270                stack1[0].s=s1Start;
3271                stack1[0].limit=limit1;
3272                stack1[0].source=cSource1;
3273                ++level1;
3274    
3275                cSource1 = fold1;
3276                start1=s1Start=0;
3277                limit1=length;
3278    
3279                // get ready to read from decomposition, continue with loop
3280
c1=-1;
3281                continue;
3282            }
3283    
3284            if( level2<2 && ((options& Normalizer.COMPARE_IGNORE_CASE)!=0) &&
3285                (length=foldCase(cp2, fold2,0,32, options))>=0
3286            ) {
3287                // cp2 case-folds to fold2[length]
3288
if(UTF16.isSurrogate((char)c2)) {
3289                    if(UTF16.isLeadSurrogate((char)c2)) {
3290                        // advance beyond source surrogate pair if it
3291
// case-folds
3292
++s2Start;
3293                    } else /* isTrail(c2) */ {
3294                        // we got a supplementary code point when hitting its
3295
// trail surrogate, therefore the lead surrogate must
3296
// have been the same as in the other string;
3297
// compare this decomposition with the lead surrogate
3298
// in the other string
3299
--s1Start;
3300                        c1=cSource1[(s1Start-1)];
3301                    }
3302                }
3303    
3304                // push current level pointers
3305
stack2[0].start=start2;
3306                stack2[0].s=s2Start;
3307                stack2[0].limit=limit2;
3308                stack2[0].source=cSource2;
3309                ++level2;
3310                
3311                cSource2 = fold2;
3312                start2=s2Start=0;
3313                limit2=length;
3314    
3315                // get ready to read from decomposition, continue with loop
3316
c2=-1;
3317                continue;
3318            }
3319            
3320            if( level1<2 && ((options&COMPARE_EQUIV)!=0) &&
3321                0!=(length=decompose(cp1,decomp1))
3322            ) {
3323                // cp1 decomposes into p[length]
3324
if(UTF16.isSurrogate((char)c1)) {
3325                    if(UTF16.isLeadSurrogate((char)c1)) {
3326                        // advance beyond source surrogate pair if it
3327
//decomposes
3328
++s1Start;
3329                    } else /* isTrail(c1) */ {
3330                        // we got a supplementary code point when hitting
3331
// its trail surrogate, therefore the lead surrogate
3332
// must have been the same as in the other string;
3333
// compare this decomposition with the lead surrogate
3334
// in the other string
3335
--s2Start;
3336                        c2=cSource2[(s2Start-1)];
3337                    }
3338                }
3339    
3340                // push current level pointers
3341
stack1[level1].start=start1;
3342                stack1[level1].s=s1Start;
3343                stack1[level1].limit=limit1;
3344                stack1[level1].source=cSource1;
3345                ++level1;
3346    
3347                // set next level pointers to decomposition
3348
cSource1 = decomp1;
3349                start1=s1Start=0;
3350                limit1=length;
3351                
3352                // set empty intermediate level if skipped
3353
if(level1<2) {
3354                    stack1[level1++].start=-1;
3355                }
3356                // get ready to read from decomposition, continue with loop
3357
c1=-1;
3358                continue;
3359            }
3360    
3361            if( level2<2 && ((options&COMPARE_EQUIV)!=0) &&
3362                0!=(length=decompose(cp2, decomp2))
3363            ) {
3364                // cp2 decomposes into p[length]
3365
if(UTF16.isSurrogate((char)c2)) {
3366                    if(UTF16.isLeadSurrogate((char)c2)) {
3367                        // advance beyond source surrogate pair if it
3368
// decomposes
3369
++s2Start;
3370                    } else /* isTrail(c2) */ {
3371                        // we got a supplementary code point when hitting its
3372
// trail surrogate, therefore the lead surrogate must
3373
// have been the same as in the other string;
3374
// compare this decomposition with the lead surrogate
3375
// in the other string
3376
--s1Start;
3377                        c1=cSource1[(s1Start-1)];
3378                    }
3379                }
3380    
3381                // push current level pointers
3382
stack2[level2].start=start2;
3383                stack2[level2].s=s2Start;
3384                stack2[level2].limit=limit2;
3385                stack2[level2].source=cSource2;
3386                ++level2;
3387    
3388                // set next level pointers to decomposition
3389
cSource2=decomp2;
3390                start2=s2Start=0;
3391                limit2=length;
3392                
3393                // set empty intermediate level if skipped
3394
if(level2<2) {
3395                    stack2[level2++].start=-1;
3396                }
3397                
3398                // get ready to read from decomposition, continue with loop
3399
c2=-1;
3400                continue;
3401            }
3402    
3403    
3404            // no decomposition/case folding, max level for both sides:
3405
// return difference result
3406

3407            // code point order comparison must not just return cp1-cp2
3408
// because when single surrogates are present then the surrogate
3409
// pairs that formed cp1 and cp2 may be from different string
3410
// indexes
3411

3412            // example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at
3413
// second code units
3414
// c1=d800 cp1=10001 c2=dc00 cp2=10000
3415
// cp1-cp2>0 but c1-c2<0 and in fact in UTF-32
3416
// it is { d800 10001 } < { 10000 }
3417
// therefore fix-up
3418

3419            if( c1>=0xd800 && c2>=0xd800 &&
3420                    ((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0)
3421              ) {
3422                /* subtract 0x2800 from BMP code points to make them smaller
3423                 * than supplementary ones */

3424                if(
3425                    ( c1<=0xdbff && s1Start!=limit1
3426                         &&
3427                         UTF16.isTrailSurrogate(cSource1[s1Start])
3428                    )
3429                     ||
3430                    ( UTF16.isTrailSurrogate((char)c1) && start1!=(s1Start-1)
3431                         &&
3432                         UTF16.isLeadSurrogate(cSource1[(s1Start-2)])
3433                    )
3434                ) {
3435                    /* part of a surrogate pair, leave >=d800 */
3436                } else {
3437                    /* BMP code point - may be surrogate code point -
3438                     * make <d800 */

3439                    c1-=0x2800;
3440                }
3441    
3442                if(
3443                    ( c2<=0xdbff && s2Start!=limit2
3444                         &&
3445                         UTF16.isTrailSurrogate(cSource2[s2Start])
3446                    )
3447                     ||
3448                    ( UTF16.isTrailSurrogate((char)c2) && start2!=(s2Start-1)
3449                         &&
3450                         UTF16.isLeadSurrogate(cSource2[(s2Start-2)])
3451                    )
3452                ) {
3453                    /* part of a surrogate pair, leave >=d800 */
3454                } else {
3455                    /* BMP code point - may be surrogate code point -
3456                     * make <d800 */

3457                    c2-=0x2800;
3458                }
3459            }
3460    
3461            return c1-c2;
3462        }
3463    }
3464    private static int strCompare(char[] s1, int s1Start, int s1Limit,
3465                                  char[] s2, int s2Start, int s2Limit,
3466                                  boolean codePointOrder) {
3467                        
3468        int start1, start2, limit1, limit2;
3469 
3470        char c1, c2;
3471    
3472        /* setup for fix-up */
3473        start1=s1Start;
3474        start2=s2Start;
3475        
3476        int length1, length2;
3477        
3478        length1 = s1Limit - s1Start;
3479        length2 = s2Limit - s2Start;
3480            
3481        int lengthResult;
3482
3483        if(length1<length2) {
3484            lengthResult=-1;
3485            limit1=start1+length1;
3486        } else if(length1==length2) {
3487            lengthResult=0;
3488            limit1=start1+length1;
3489        } else /* length1>length2 */ {
3490            lengthResult=1;
3491            limit1=start1+length2;
3492        }
3493
3494        if(s1==s2) {
3495            return lengthResult;
3496        }
3497
3498        for(;;) {
3499            /* check pseudo-limit */
3500            if(s1Start==limit1) {
3501                return lengthResult;
3502            }
3503
3504            c1=s1[s1Start];
3505            c2=s2[s2Start];
3506            if(c1!=c2) {
3507                break;
3508            }
3509            ++s1Start;
3510            ++s2Start;
3511        }
3512
3513        /* setup for fix-up */
3514        limit1=start1+length1;
3515        limit2=start2+length2;
3516
3517    
3518        /* if both values are in or above the surrogate range, fix them up */
3519        if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
3520            /* subtract 0x2800 from BMP code points to make them smaller than
3521             * supplementary ones */

3522            if(
3523                ( c1<=0xdbff && (s1Start+1)!=limit1 &&
3524                  UTF16.isTrailSurrogate(s1[(s1Start+1)])
3525                ) ||
3526                ( UTF16.isTrailSurrogate(c1) && start1!=s1Start &&
3527                  UTF16.isLeadSurrogate(s1[(s1Start-1)])
3528                )
3529            ) {
3530                /* part of a surrogate pair, leave >=d800 */
3531            } else {
3532                /* BMP code point - may be surrogate code point - make <d800 */
3533                c1-=0x2800;
3534            }
3535    
3536            if(
3537                ( c2<=0xdbff && (s2Start+1)!=limit2 &&
3538                  UTF16.isTrailSurrogate(s2[(s2Start+1)])
3539                ) ||
3540                ( UTF16.isTrailSurrogate(c2) && start2!=s2Start &&
3541                  UTF16.isLeadSurrogate(s2[(s2Start-1)])
3542                )
3543            ) {
3544                /* part of a surrogate pair, leave >=d800 */
3545            } else {
3546                /* BMP code point - may be surrogate code point - make <d800 */
3547                c2-=0x2800;
3548            }
3549        }
3550    
3551        /* now c1 and c2 are in UTF-32-compatible order */
3552        return (int)c1-(int)c2;
3553    }
3554
3555
3556    /*
3557     * Status of tailored normalization
3558     *
3559     * This was done initially for investigation on Unicode public review issue 7
3560     * (http://www.unicode.org/review/). See Jitterbug 2481.
3561     * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
3562     * a permanent feature in ICU 2.6 in support of IDNA which requires true
3563     * Unicode 3.2 normalization.
3564     * (NormalizationCorrections are rolled into IDNA mapping tables.)
3565     *
3566     * Tailored normalization as implemented here allows one to "normalize less"
3567     * than full Unicode normalization would.
3568     * Based internally on a UnicodeSet of code points that are
3569     * "excluded from normalization", the normalization functions leave those
3570     * code points alone ("inert"). This means that tailored normalization
3571     * still transforms text into a canonically equivalent form.
3572     * It does not add decompositions to code points that do not have any or
3573     * change decomposition results.
3574     *
3575     * Any function that searches for a safe boundary has not been touched,
3576     * which means that these functions will be over-pessimistic when
3577     * exclusions are applied.
3578     * This should not matter because subsequent checks and normalizations
3579     * do apply the exclusions; only a little more of the text may be processed
3580     * than necessary under exclusions.
3581     *
3582     * Normalization exclusions have the following effect on excluded code points c:
3583     * - c is not decomposed
3584     * - c is not a composition target
3585     * - c does not combine forward or backward for composition
3586     * except that this is not implemented for Jamo
3587     * - c is treated as having a combining class of 0
3588     */

3589     
3590    /*
3591     * Constants for the bit fields in the options bit set parameter.
3592     * These need not be public.
3593     * A user only needs to know the currently assigned values.
3594     * The number and positions of reserved bits per field can remain private.
3595     */

3596    private static final int OPTIONS_NX_MASK=0x1f; // bits 0..4: the NX_* exclusion-set flags (NX_HANGUL, NX_CJK_COMPAT)
3597    private static final int OPTIONS_UNICODE_MASK=0xe0; // bits 5..7: Unicode version selector, compared against Normalizer.UNICODE_3_2
3598    public static final int OPTIONS_SETS_MASK=0xff; // bits 0..7: both fields above together; masks the index into nxCache
3599    private static final int OPTIONS_UNICODE_SHIFT=5; // bit position of the lowest bit of OPTIONS_UNICODE_MASK
3600    private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK+1]; // lazily filled exclusion sets, one slot per value of (options & OPTIONS_SETS_MASK)
3601     
3602    /* Constants for options flags for normalization.*/
3603
3604    /**
3605     * Options bit 0, do not decompose Hangul syllables.
3606     * @draft ICU 2.6
3607     */

3608    private static final int NX_HANGUL = 1;
3609    /**
3610     * Options bit 1, do not decompose CJK compatibility characters.
3611     * @draft ICU 2.6
3612     */

3613    private static final int NX_CJK_COMPAT=2;
3614    /**
3615     * Options bit 8, use buggy recomposition described in
3616     * Unicode Public Review Issue #29
3617     * at http://www.unicode.org/review/resolved-pri.html#pri29
3618     *
3619     * Used in IDNA implementation according to strict interpretation
3620     * of IDNA definition based on Unicode 3.2 which predates PRI #29.
3621     *
3622     * See ICU4C unormimp.h
3623     *
3624     * @draft ICU 3.2
3625     */

3626    public static final int BEFORE_PRI_29=0x100;
3627
3628    /*
3629     * The following options are used only in some composition functions.
3630     * They use bits 12 and up to preserve lower bits for the available options
3631     * space in unorm_compare() -
3632     * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
3633     */

3634
3635    /** Options bit 12, for compatibility vs. canonical decomposition. */
3636    public static final int OPTIONS_COMPAT=0x1000;
3637    /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
3638    public static final int OPTIONS_COMPOSE_CONTIGUOUS=0x2000;
3639
3640    /* normalization exclusion sets --------------------------------------------- */
3641    
3642    /*
3643     * Normalization exclusion UnicodeSets are used for tailored normalization;
3644     * see the "Status of tailored normalization" comment earlier in this file.
3645     *
3646     * By specifying one or several sets of code points,
3647     * those code points become inert for normalization.
3648     */

3649    private static final synchronized UnicodeSet internalGetNXHangul() {
3650        /* internal function, does not check for incoming U_FAILURE */
3651    
3652        if(nxCache[NX_HANGUL]==null) {
3653             nxCache[NX_HANGUL]=new UnicodeSet(0xac00, 0xd7a3);
3654        }
3655        return nxCache[NX_HANGUL];
3656    }
3657    
3658    private static final synchronized UnicodeSet internalGetNXCJKCompat() {
3659        /* internal function, does not check for incoming U_FAILURE */
3660    
3661        if(nxCache[NX_CJK_COMPAT]==null) {
3662
3663            /* build a set from [CJK Ideographs]&[has canonical decomposition] */
3664            UnicodeSet set, hasDecomp;
3665    
3666            set=new UnicodeSet("[:Ideographic:]");
3667    
3668            /* start with an empty set for [has canonical decomposition] */
3669            hasDecomp=new UnicodeSet();
3670    
3671            /* iterate over all ideographs and remember which canonically decompose */
3672            UnicodeSetIterator it = new UnicodeSetIterator(set);
3673            int start, end;
3674            long norm32;
3675    
3676            while(it.nextRange() && (it.codepoint != UnicodeSetIterator.IS_STRING)) {
3677                start=it.codepoint;
3678                end=it.codepointEnd;
3679                while(start<=end) {
3680                    norm32 = getNorm32(start);
3681                    if((norm32 & QC_NFD)>0) {
3682                        hasDecomp.add(start);
3683                    }
3684                    ++start;
3685                }
3686            }
3687    
3688            /* hasDecomp now contains all ideographs that decompose canonically */
3689             nxCache[NX_CJK_COMPAT]=hasDecomp;
3690         
3691        }
3692    
3693        return nxCache[NX_CJK_COMPAT];
3694    }
3695    
3696    private static final synchronized UnicodeSet internalGetNXUnicode(int options) {
3697        options &= OPTIONS_UNICODE_MASK;
3698        if(options==0) {
3699            return null;
3700        }
3701    
3702        if(nxCache[options]==null) {
3703            /* build a set with all code points that were not designated by the specified Unicode version */
3704            UnicodeSet set = new UnicodeSet();
3705
3706            switch(options) {
3707            case Normalizer.UNICODE_3_2:
3708                set.applyPattern("[:^Age=3.2:]");
3709                break;
3710            default:
3711                return null;
3712            }
3713            
3714            nxCache[options]=set;
3715        }
3716    
3717        return nxCache[options];
3718    }
3719    
3720    /* Get a decomposition exclusion set. The data must be loaded. */
3721    private static final synchronized UnicodeSet internalGetNX(int options) {
3722        options&=OPTIONS_SETS_MASK;
3723    
3724        if(nxCache[options]==null) {
3725            /* return basic sets */
3726            if(options==NX_HANGUL) {
3727                return internalGetNXHangul();
3728            }
3729            if(options==NX_CJK_COMPAT) {
3730                return internalGetNXCJKCompat();
3731            }
3732            if((options & OPTIONS_UNICODE_MASK)!=0 && (options & OPTIONS_NX_MASK)==0) {
3733                return internalGetNXUnicode(options);
3734            }
3735    
3736            /* build a set from multiple subsets */
3737            UnicodeSet set;
3738            UnicodeSet other;
3739    
3740            set=new UnicodeSet();
3741
3742    
3743            if((options & NX_HANGUL)!=0 && null!=(other=internalGetNXHangul())) {
3744                set.addAll(other);
3745            }
3746            if((options&NX_CJK_COMPAT)!=0 && null!=(other=internalGetNXCJKCompat())) {
3747                set.addAll(other);
3748            }
3749            if((options&OPTIONS_UNICODE_MASK)!=0 && null!=(other=internalGetNXUnicode(options))) {
3750                set.addAll(other);
3751            }
3752
3753               nxCache[options]=set;
3754        }
3755        return nxCache[options];
3756    }
3757    
3758    public static final UnicodeSet getNX(int options) {
3759        if((options&=OPTIONS_SETS_MASK)==0) {
3760            /* incoming failure, or no decomposition exclusions requested */
3761            return null;
3762        } else {
3763            return internalGetNX(options);
3764        }
3765    }
3766    
3767    private static final boolean nx_contains(UnicodeSet nx, int c) {
3768        return nx!=null && nx.contains(c);
3769    }
3770    
3771    private static final boolean nx_contains(UnicodeSet nx, char c, char c2) {
3772        return nx!=null && nx.contains(c2==0 ? c : UCharacterProperty.getRawSupplementary(c, c2));
3773    }
3774
3775
3776}
3777
Popular Tags