UCaseProps


1   /*
2   *******************************************************************************
3   *
4   *   Copyright (C) 2004-2006, International Business Machines
5   *   Corporation and others.  All Rights Reserved.
6   *
7   *******************************************************************************
8   *   file name:  UCaseProps.java
9   *   encoding:   US-ASCII
10  *   tab size:   8 (not used)
11  *   indentation:4
12  *
13  *   created on: 2005jan29
14  *   created by: Markus W. Scherer
15  *
16  *   Low-level Unicode character/string case mapping code.
17  *   Java port of ucase.h/.c.
18  */
19  
20  package com.ibm.icu.impl;
21  
22  import java.io.InputStream  ;
23  import java.io.DataInputStream  ;
24  import java.io.BufferedInputStream  ;
25  import java.io.IOException  ;
26  
27  import com.ibm.icu.util.RangeValueIterator;
28  import com.ibm.icu.util.ULocale;
29  
30  import com.ibm.icu.text.UTF16;
31  import com.ibm.icu.text.UnicodeSet;
32  
33  import com.ibm.icu.lang.UCharacter;
34  
35  public final class UCaseProps {
36      // constructors etc. --------------------------------------------------- ***
37  
38      // port of ucase_openProps()
39      public UCaseProps() throws IOException   {
40          InputStream   is=ICUData.getRequiredStream(ICUResourceBundle.ICU_BUNDLE+"/"+DATA_FILE_NAME);
41          BufferedInputStream   b=new BufferedInputStream  (is, 4096 /* data buffer size */);
42          readData(b);
43          b.close();
44          is.close();
45      }
46  
47      private final void readData(InputStream   is) throws IOException   {
48          DataInputStream   inputStream=new DataInputStream  (is);
49  
50          // read the header
51          unicodeVersion=ICUBinary.readHeader(inputStream, FMT, new IsAcceptable());
52  
53          // read indexes[]
54          int i, count;
55          count=inputStream.readInt();
56          if(count<IX_INDEX_TOP) {
57              throw new IOException  ("indexes[0] too small in "+DATA_FILE_NAME);
58          }
59          indexes=new int[count];
60  
61          indexes[0]=count;
62          for(i=1; i<count; ++i) {
63              indexes[i]=inputStream.readInt();
64          }
65  
66          // read the trie
67          trie=new CharTrie(inputStream, null);
68  
69          // read exceptions[]
70          count=indexes[IX_EXC_LENGTH];
71          if(count>0) {
72              exceptions=new char[count];
73              for(i=0; i<count; ++i) {
74                  exceptions[i]=inputStream.readChar();
75              }
76          }
77  
78          // read unfold[]
79          count=indexes[IX_UNFOLD_LENGTH];
80          if(count>0) {
81              unfold=new char[count];
82              for(i=0; i<count; ++i) {
83                  unfold[i]=inputStream.readChar();
84              }
85          }
86      }
87  
88      // implement ICUBinary.Authenticate
89      private final class IsAcceptable implements ICUBinary.Authenticate {
90          public boolean isDataVersionAcceptable(byte version[]) {
91              formatVersion=version;
92              return version[0]==1 &&
93                     version[2]==Trie.INDEX_STAGE_1_SHIFT_ && version[3]==Trie.INDEX_STAGE_2_SHIFT_;
94          }
95      }
96  
97      // UCaseProps singleton
98      private static UCaseProps gCsp=null;
99  
100     // port of ucase_getSingleton()
101     public static final synchronized UCaseProps getSingleton() throws IOException   {
102         if(gCsp==null) {
103             gCsp=new UCaseProps();
104         }
105         return gCsp;
106     }
107 
108     // UCaseProps dummy singleton
109     private static UCaseProps gCspDummy=null;
110 
111     private UCaseProps(boolean makeDummy) { // ignore makeDummy, only creates a unique signature
112         formatVersion=new byte[] { 1, 0, Trie.INDEX_STAGE_1_SHIFT_, Trie.INDEX_STAGE_2_SHIFT_ };
113         unicodeVersion=new byte[] { 2, 0, 0, 0 };
114         indexes=new int[IX_TOP];
115         indexes[0]=IX_TOP;
116         trie=new CharTrie(0, 0, null); // dummy trie, always returns 0
117     }
118 
119     /**
120      * Get a singleton dummy object, one that works with no real data.
121      * This can be used when the real data is not available.
122      * Using the dummy can reduce checks for available data after an initial failure.
123      * Port of ucase_getDummy().
124      */
125     public static final synchronized UCaseProps getDummy() {
126         if(gCspDummy==null) {
127             gCspDummy=new UCaseProps(true);
128         }
129         return gCspDummy;
130     }
131 
132     // set of property starts for UnicodeSet ------------------------------- ***
133 
134     public final void addPropertyStarts(UnicodeSet set) {
135         /* add the start code point of each same-value range of the trie */
136         TrieIterator iter=new TrieIterator(trie);
137         RangeValueIterator.Element element=new RangeValueIterator.Element();
138 
139         while(iter.next(element)){
140             set.add(element.start);
141         }
142 
143         /* add code points with hardcoded properties, plus the ones following them */
144 
145         /* (none right now, see comment below) */
146 
147         /*
148          * Omit code points with hardcoded specialcasing properties
149          * because we do not build property UnicodeSets for them right now.
150          */
151     }
152 
153     // data access primitives ---------------------------------------------- ***
154     private static final int getExceptionsOffset(int props) {
155         return props>>EXC_SHIFT;
156     }
157 
158     private static final boolean propsHasException(int props) {
159         return (props&EXCEPTION)!=0;
160     }
161 
162     /* number of bits in an 8-bit integer value */
163     private static final byte flagsOffset[/*256*/]={
164         0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
165         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
166         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
167         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
168         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
169         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
170         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
171         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
172         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
173         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
174         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
175         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
176         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
177         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
178         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
179         4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
180     };
181 
182     private static final boolean hasSlot(int flags, int index) {
183         return (flags&(1<<index))!=0;
184     }
185     private static final byte slotOffset(int flags, int index) {
186         return flagsOffset[flags&((1<<index)-1)];
187     }
188 
189     /*
190      * Get the value of an optional-value slot where hasSlot(excWord, index).
191      *
192      * @param excWord (in) initial exceptions word
193      * @param index (in) desired slot index
194      * @param excOffset (in) offset into exceptions[] after excWord=exceptions[excOffset++];
195      * @return bits 31..0: slot value
196      *             63..32: modified excOffset, moved to the last char of the value, use +1 for beginning of next slot 
197      */
198     private final long getSlotValueAndOffset(int excWord, int index, int excOffset) {
199         long value;
200         if((excWord&EXC_DOUBLE_SLOTS)==0) {
201             excOffset+=slotOffset(excWord, index);
202             value=exceptions[excOffset];
203         } else {
204             excOffset+=2*slotOffset(excWord, index);
205             value=exceptions[excOffset++];
206             value=(value<<16)|exceptions[excOffset];
207         }
208         return (long)value|((long)excOffset<<32);
209     }
210 
211     /* same as getSlotValueAndOffset() but does not return the slot offset */
212     private final int getSlotValue(int excWord, int index, int excOffset) {
213         int value;
214         if((excWord&EXC_DOUBLE_SLOTS)==0) {
215             excOffset+=slotOffset(excWord, index);
216             value=exceptions[excOffset];
217         } else {
218             excOffset+=2*slotOffset(excWord, index);
219             value=exceptions[excOffset++];
220             value=(value<<16)|exceptions[excOffset];
221         }
222         return value;
223     }
224 
225     // simple case mappings ------------------------------------------------ ***
226 
227     public final int tolower(int c) {
228         int props=trie.getCodePointValue(c);
229         if(!propsHasException(props)) {
230             if(getTypeFromProps(props)>=UPPER) {
231                 c+=getDelta(props);
232             }
233         } else {
234             int excOffset=getExceptionsOffset(props);
235             int excWord=exceptions[excOffset++];
236             if(hasSlot(excWord, EXC_LOWER)) {
237                 c=getSlotValue(excWord, EXC_LOWER, excOffset);
238             }
239         }
240         return c;
241     }
242 
243     public final int toupper(int c) {
244         int props=trie.getCodePointValue(c);
245         if(!propsHasException(props)) {
246             if(getTypeFromProps(props)==LOWER) {
247                 c+=getDelta(props);
248             }
249         } else {
250             int excOffset=getExceptionsOffset(props);
251             int excWord=exceptions[excOffset++];
252             if(hasSlot(excWord, EXC_UPPER)) {
253                 c=getSlotValue(excWord, EXC_UPPER, excOffset);
254             }
255         }
256         return c;
257     }
258 
259     public final int totitle(int c) {
260         int props=trie.getCodePointValue(c);
261         if(!propsHasException(props)) {
262             if(getTypeFromProps(props)==LOWER) {
263                 c+=getDelta(props);
264             }
265         } else {
266             int excOffset=getExceptionsOffset(props);
267             int excWord=exceptions[excOffset++];
268             int index;
269             if(hasSlot(excWord, EXC_TITLE)) {
270                 index=EXC_TITLE;
271             } else if(hasSlot(excWord, EXC_UPPER)) {
272                 index=EXC_UPPER;
273             } else {
274                 return c;
275             }
276             c=getSlotValue(excWord, index, excOffset);
277         }
278         return c;
279     }
280 
281     /**
282      * Adds all simple case mappings and the full case folding for c to sa,
283      * and also adds special case closure mappings.
284      * c itself is not added.
285      * For example, the mappings
286      * - for s include long s
287      * - for sharp s include ss
288      * - for k include the Kelvin sign
289      */
290     public final void addCaseClosure(int c, UnicodeSet set) {
291         /*
292          * Hardcode the case closure of i and its relatives and ignore the
293          * data file data for these characters.
294          * The Turkic dotless i and dotted I with their case mapping conditions
295          * and case folding option make the related characters behave specially.
296          * This code matches their closure behavior to their case folding behavior.
297          */
298 
299         switch(c) {
300         case 0x49:
301             /* regular i and I are in one equivalence class */
302             set.add(0x69);
303             return;
304         case 0x69:
305             set.add(0x49);
306             return;
307         case 0x130:
308             /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
309             set.add(iDot);
310             return;
311         case 0x131:
312             /* dotless i is in a class by itself */
313             return;
314         default:
315             /* otherwise use the data file data */
316             break;
317         }
318 
319         int props=trie.getCodePointValue(c);
320         if(!propsHasException(props)) {
321             if(getTypeFromProps(props)!=NONE) {
322                 /* add the one simple case mapping, no matter what type it is */
323                 int delta=getDelta(props);
324                 if(delta!=0) {
325                     set.add(c+delta);
326                 }
327             }
328         } else {
329             /*
330              * c has exceptions, so there may be multiple simple and/or
331              * full case mappings. Add them all.
332              */
333             int excOffset0, excOffset=getExceptionsOffset(props);
334             int closureOffset;
335             int excWord=exceptions[excOffset++];
336             int index, closureLength, fullLength, length;
337 
338             excOffset0=excOffset;
339 
340             /* add all simple case mappings */
341             for(index=EXC_LOWER; index<=EXC_TITLE; ++index) {
342                 if(hasSlot(excWord, index)) {
343                     excOffset=excOffset0;
344                     c=getSlotValue(excWord, index, excOffset);
345                     set.add(c);
346                 }
347             }
348 
349             /* get the closure string pointer & length */
350             if(hasSlot(excWord, EXC_CLOSURE)) {
351                 excOffset=excOffset0;
352                 long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset);
353                 closureLength=(int)value&CLOSURE_MAX_LENGTH; /* higher bits are reserved */
354                 closureOffset=(int)(value>>32)+1; /* behind this slot, unless there are full case mappings */
355             } else {
356                 closureLength=0;
357                 closureOffset=0;
358             }
359 
360             /* add the full case folding */
361             if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
362                 excOffset=excOffset0;
363                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
364                 fullLength=(int)value;
365 
366                 /* start of full case mapping strings */
367                 excOffset=(int)(value>>32)+1;
368 
369                 fullLength&=0xffff; /* bits 16 and higher are reserved */
370 
371                 /* skip the lowercase result string */
372                 excOffset+=fullLength&FULL_LOWER;
373                 fullLength>>=4;
374 
375                 /* add the full case folding string */
376                 length=fullLength&0xf;
377                 if(length!=0) {
378                     set.add(new String  (exceptions, excOffset, length));
379                     excOffset+=length;
380                 }
381 
382                 /* skip the uppercase and titlecase strings */
383                 fullLength>>=4;
384                 excOffset+=fullLength&0xf;
385                 fullLength>>=4;
386                 excOffset+=fullLength;
387 
388                 closureOffset=excOffset; /* behind full case mappings */
389             }
390 
391             /* add each code point in the closure string */
392             for(index=0; index<closureLength; index+=UTF16.getCharCount(c)) {
393                 c=UTF16.charAt(exceptions, closureOffset, exceptions.length, index);
394                 set.add(c);
395             }
396         }
397     }
398 
399     /*
400      * compare s, which has a length, with t=unfold[unfoldOffset..], which has a maximum length or is NUL-terminated
401      * must be s.length()>0 and max>0 and s.length()<=max
402      */
403     private final int strcmpMax(String   s, int unfoldOffset, int max) {
404         int i1, length, c1, c2;
405 
406         length=s.length();
407         max-=length; /* we require length<=max, so no need to decrement max in the loop */
408         i1=0;
409         do {
410             c1=s.charAt(i1++);
411             c2=unfold[unfoldOffset++];
412             if(c2==0) {
413                 return 1; /* reached the end of t but not of s */
414             }
415             c1-=c2;
416             if(c1!=0) {
417                 return c1; /* return difference result */
418             }
419         } while(--length>0);
420         /* ends with length==0 */
421 
422         if(max==0 || unfold[unfoldOffset]==0) {
423             return 0; /* equal to length of both strings */
424         } else {
425             return -max; /* return lengh difference */
426         }
427     }
428 
429     /**
430      * Maps the string to single code points and adds the associated case closure
431      * mappings.
432      * The string is mapped to code points if it is their full case folding string.
433      * In other words, this performs a reverse full case folding and then
434      * adds the case closure items of the resulting code points.
435      * If the string is found and its closure applied, then
436      * the string itself is added as well as part of its code points' closure.
437      *
438      * @return true if the string was found
439      */
440     public final boolean addStringCaseClosure(String   s, UnicodeSet set) {
441         int i, length, start, limit, result, unfoldOffset, unfoldRows, unfoldRowWidth, unfoldStringWidth;
442 
443         if(unfold==null || s==null) {
444             return false; /* no reverse case folding data, or no string */
445         }
446         length=s.length();
447         if(length<=1) {
448             /* the string is too short to find any match */
449             /*
450              * more precise would be:
451              * if(!u_strHasMoreChar32Than(s, length, 1))
452              * but this does not make much practical difference because
453              * a single supplementary code point would just not be found
454              */
455             return false;
456         }
457 
458         unfoldRows=unfold[UNFOLD_ROWS];
459         unfoldRowWidth=unfold[UNFOLD_ROW_WIDTH];
460         unfoldStringWidth=unfold[UNFOLD_STRING_WIDTH];
461         //unfoldCPWidth=unfoldRowWidth-unfoldStringWidth;
462 
463         if(length>unfoldStringWidth) {
464             /* the string is too long to find any match */
465             return false;
466         }
467 
468         /* do a binary search for the string */
469         start=0;
470         limit=unfoldRows;
471         while(start<limit) {
472             i=(start+limit)/2;
473             unfoldOffset=((i+1)*unfoldRowWidth); // +1 to skip the header values above
474             result=strcmpMax(s, unfoldOffset, unfoldStringWidth);
475 
476             if(result==0) {
477                 /* found the string: add each code point, and its case closure */
478                 int c;
479 
480                 for(i=unfoldStringWidth; i<unfoldRowWidth && unfold[unfoldOffset+i]!=0; i+=UTF16.getCharCount(c)) {
481                     c=UTF16.charAt(unfold, unfoldOffset, unfold.length, i);
482                     set.add(c);
483                     addCaseClosure(c, set);
484                 }
485                 return true;
486             } else if(result<0) {
487                 limit=i;
488             } else /* result>0 */ {
489                 start=i+1;
490             }
491         }
492 
493         return false; /* string not found */
494     }
495 
496     /** @return NONE, LOWER, UPPER, TITLE */
497     public final int getType(int c) {
498         return getTypeFromProps(trie.getCodePointValue(c));
499     }
500 
501     /** @return same as getType(), or <0 if c is case-ignorable */
502     public final int getTypeOrIgnorable(int c) {
503         int props=trie.getCodePointValue(c);
504         int type=getTypeFromProps(props);
505         if(type!=NONE) {
506             return type;
507         } else if(
508             c==0x307 ||
509             (props&(EXCEPTION|CASE_IGNORABLE))==CASE_IGNORABLE
510         ) {
511             return -1; /* case-ignorable */
512         } else {
513             return 0; /* c is neither cased nor case-ignorable */
514         }
515     }
516 
517     /** @return NO_DOT, SOFT_DOTTED, ABOVE, OTHER_ACCENT */
518     public final int getDotType(int c) {
519         int props=trie.getCodePointValue(c);
520         if(!propsHasException(props)) {
521             return props&DOT_MASK;
522         } else {
523             return (exceptions[getExceptionsOffset(props)]>>EXC_DOT_SHIFT)&DOT_MASK;
524         }
525     }
526 
527     public final boolean isSoftDotted(int c) {
528         return getDotType(c)==SOFT_DOTTED;
529     }
530 
531     public final boolean isCaseSensitive(int c) {
532         return (trie.getCodePointValue(c)&SENSITIVE)!=0;
533     }
534 
535     // string casing ------------------------------------------------------- ***
536 
537     /*
538      * These internal functions form the core of string case mappings.
539      * They map single code points to result code points or strings and take
540      * all necessary conditions (context, locale ID, options) into account.
541      *
542      * They do not iterate over the source or write to the destination
543      * so that the same functions are useful for non-standard string storage,
544      * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
545      * For the same reason, the "surrounding text" context is passed in as a
546      * ContextIterator which does not make any assumptions about
547      * the underlying storage.
548      *
549      * This section contains helper functions that check for conditions
550      * in the input text surrounding the current code point
551      * according to SpecialCasing.txt.
552      *
553      * Each helper function gets the index
554      * - after the current code point if it looks at following text
555      * - before the current code point if it looks at preceding text
556      *
557      * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
558      *
559      * Final_Sigma
560      *   C is preceded by a sequence consisting of
561      *     a cased letter and a case-ignorable sequence,
562      *   and C is not followed by a sequence consisting of
563      *     an ignorable sequence and then a cased letter.
564      *
565      * More_Above
566      *   C is followed by one or more characters of combining class 230 (ABOVE)
567      *   in the combining character sequence.
568      *
569      * After_Soft_Dotted
570      *   The last preceding character with combining class of zero before C
571      *   was Soft_Dotted,
572      *   and there is no intervening combining character class 230 (ABOVE).
573      *
574      * Before_Dot
575      *   C is followed by combining dot above (U+0307).
576      *   Any sequence of characters with a combining class that is neither 0 nor 230
577      *   may intervene between the current character and the combining dot above.
578      *
579      * The erratum from 2002-10-31 adds the condition
580      *
581      * After_I
582      *   The last preceding base character was an uppercase I, and there is no
583      *   intervening combining character class 230 (ABOVE).
584      *
585      *   (See Jitterbug 2344 and the comments on After_I below.)
586      *
587      * Helper definitions in Unicode 3.2 UAX 21:
588      *
589      * D1. A character C is defined to be cased
590      *     if it meets any of the following criteria:
591      *
592      *   - The general category of C is Titlecase Letter (Lt)
593      *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
594      *   - Given D = NFD(C), then it is not the case that:
595      *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
     *     (This third criterion does not add any characters to the list
     *      for Unicode 3.2. Ignored.)
598      *
599      * D2. A character C is defined to be case-ignorable
600      *     if it meets either of the following criteria:
601      *
602      *   - The general category of C is
603      *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
604      *     Letter Modifier (Lm), or Symbol Modifier (Sk)
605      *   - C is one of the following characters 
606      *     U+0027 APOSTROPHE
607      *     U+00AD SOFT HYPHEN (SHY)
608      *     U+2019 RIGHT SINGLE QUOTATION MARK
609      *            (the preferred character for apostrophe)
610      *
611      * D3. A case-ignorable sequence is a sequence of
612      *     zero or more case-ignorable characters.
613      */
614 
615     /**
616      * Iterator for string case mappings, which need to look at the
617      * context (surrounding text) of a given character for conditional mappings.
618      *
619      * The iterator only needs to go backward or forward away from the
620      * character in question. It does not use any indexes on this interface.
621      * It does not support random access or an arbitrary change of
622      * iteration direction.
623      *
624      * The code point being case-mapped itself is never returned by
625      * this iterator.
626      */
627     public interface ContextIterator {
628         /**
629          * Reset the iterator for forward or backward iteration.
630          * @param dir >0: Begin iterating forward from the first code point
631          * after the one that is being case-mapped.
632          *            <0: Begin iterating backward from the first code point
633          * before the one that is being case-mapped.   
634          */
635         public void reset(int dir);
636         /**
637          * Iterate and return the next code point, moving in the direction
638          * determined by the reset() call.
639          * @return Next code point, or <0 when the iteration is done. 
640          */
641         public int next();
642     }
643 
    /**
     * Maximum length of a string result of a string case mapping.
     *
     * For string case mappings, a single character (a code point) is mapped
     * either to itself (in which case in-place mapping functions do nothing),
     * or to another single code point, or to a string.
     * Aside from the string contents, these are indicated with a single int
     * value as follows:
     *
     * Mapping to self: Negative values (~self instead of -self to support U+0000)
     *
     * Mapping to another code point: Positive values >MAX_STRING_LENGTH
     *
     * Mapping to a string: The string length (0..MAX_STRING_LENGTH) is
     * returned. Note that the string result may indeed have zero length.
     */
    public static final int MAX_STRING_LENGTH=0x1f;
659 
660     private static final int LOC_UNKNOWN=0;
661     private static final int LOC_ROOT=1;
662     private static final int LOC_TURKISH=2;
663     private static final int LOC_LITHUANIAN=3;
664 
665     /*
666      * Checks and caches the type of locale ID as it is relevant for case mapping.
667      * If the locCache is not null, then it must be initialized with locCache[0]=0 .
668      */
669     private static final int getCaseLocale(ULocale locale, int[] locCache) {
670         int result;
671 
672         if(locCache!=null && (result=locCache[0])!=LOC_UNKNOWN) {
673             return result;
674         }
675 
676         result=LOC_ROOT;
677 
678         String   language=locale.getLanguage();
679         if(language.equals("tr") || language.equals("tur") || language.equals("az") || language.equals("aze")) {
680             result=LOC_TURKISH;
681         } else if(language.equals("lt") || language.equals("lit")) {
682             result=LOC_LITHUANIAN;
683         }
684 
685         if(locCache!=null) {
686             locCache[0]=result;
687         }
688         return result;
689     }
690 
691     /* Is followed by {case-ignorable}* cased  ? (dir determines looking forward/backward) */
692     private final boolean isFollowedByCasedLetter(ContextIterator iter, int dir) {
693         int c;
694         int props;
695 
696         if(iter==null) {
697             return false;
698         }
699 
700         for(iter.reset(dir); (c=iter.next())>=0;) {
701             props=trie.getCodePointValue(c);
702             if(getTypeFromProps(props)!=NONE) {
703                 return true; /* followed by cased letter */
704             } else if(c==0x307 || (props&(EXCEPTION|CASE_IGNORABLE))==CASE_IGNORABLE) {
705                 /* case-ignorable, continue with the loop */
706             } else {
707                 return false; /* not ignorable */
708             }
709         }
710 
711         return false; /* not followed by cased letter */
712     }
713 
714     /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
715     private final boolean isPrecededBySoftDotted(ContextIterator iter) {
716         int c;
717         int dotType;
718 
719         if(iter==null) {
720             return false;
721         }
722 
723         for(iter.reset(-1); (c=iter.next())>=0;) {
724             dotType=getDotType(c);
725             if(dotType==SOFT_DOTTED) {
726                 return true; /* preceded by TYPE_i */
727             } else if(dotType!=OTHER_ACCENT) {
728                 return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
729             }
730         }
731 
732         return false; /* not preceded by TYPE_i */
733     }
734 
735     /*
736      * See Jitterbug 2344:
737      * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
738      * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
739      * we made those releases compatible with Unicode 3.2 which had not fixed
740      * a related bug in SpecialCasing.txt.
741      *
742      * From the Jitterbug 2344 text:
743      * ... this bug is listed as a Unicode erratum
744      * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
745      * <quote>
746      * There are two errors in SpecialCasing.txt.
747      * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
748      * 2. An incorrect context definition. Correct as follows:
749      * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
750      * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
751      * ---
752      * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
753      * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
754      * where the context After_I is defined as:
755      * The last preceding base character was an uppercase I, and there is no
756      * intervening combining character class 230 (ABOVE).
757      * </quote>
758      *
759      * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
760      *
761      * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
762      * # This matches the behavior of the canonically equivalent I-dot_above
763      *
764      * See also the description in this place in older versions of uchar.c (revision 1.100).
765      *
766      * Markus W. Scherer 2003-feb-15
767      */
768 
769     /* Is preceded by base character 'I' with no intervening cc=230 ? */
770     private final boolean isPrecededBy_I(ContextIterator iter) {
771         int c;
772         int dotType;
773 
774         if(iter==null) {
775             return false;
776         }
777 
778         for(iter.reset(-1); (c=iter.next())>=0;) {
779             if(c==0x49) {
780                 return true; /* preceded by I */
781             }
782             dotType=getDotType(c);
783             if(dotType!=OTHER_ACCENT) {
784                 return false; /* preceded by different base character (not I), or intervening cc==230 */
785             }
786         }
787 
788         return false; /* not preceded by I */
789     }
790 
791     /* Is followed by one or more cc==230 ? */
792     private final boolean isFollowedByMoreAbove(ContextIterator iter) {
793         int c;
794         int dotType;
795 
796         if(iter==null) {
797             return false;
798         }
799 
800         for(iter.reset(1); (c=iter.next())>=0;) {
801             dotType=getDotType(c);
802             if(dotType==ABOVE) {
803                 return true; /* at least one cc==230 following */
804             } else if(dotType!=OTHER_ACCENT) {
805                 return false; /* next base character, no more cc==230 following */
806             }
807         }
808 
809         return false; /* no more cc==230 following */
810     }
811 
812     /* Is followed by a dot above (without cc==230 in between) ? */
813     private final boolean isFollowedByDotAbove(ContextIterator iter) {
814         int c;
815         int dotType;
816 
817         if(iter==null) {
818             return false;
819         }
820 
821         for(iter.reset(1); (c=iter.next())>=0; ) {
822             if(c==0x307) {
823                 return true;
824             }
825             dotType=getDotType(c);
826             if(dotType!=OTHER_ACCENT) {
827                 return false; /* next base character or cc==230 in between */
828             }
829         }
830 
831         return false; /* no dot above following */
832     }
833 
834     private static final String  
835         iDot=       "i\u0307",
836         jDot=       "j\u0307",
837         iOgonekDot= "\u012f\u0307",
838         iDotGrave=  "i\u0307\u0300",
839         iDotAcute=  "i\u0307\u0301",
840         iDotTilde=  "i\u0307\u0303";
841 
842     /**
843      * Get the full lowercase mapping for c.
844      *
845      * @param c Character to be mapped.
846      * @param iter Character iterator, used for context-sensitive mappings.
847      *             See ContextIterator for details.
848      *             If iter==null then a context-independent result is returned.
849      * @param out If the mapping result is a string, then it is appended to out.
850      * @param locale Locale ID for locale-dependent mappings.
851      * @param locCache Initialize locCache[0] to 0; may be used to cache the result of parsing
852      *                 the locale ID for subsequent calls.
853      *                 Can be null.
854      * @return Output code point or string length, see MAX_STRING_LENGTH.
855      *
856      * @see ContextIterator
857      * @see #MAX_STRING_LENGTH
858      * @internal
859      */
860     public final int toFullLower(int c, ContextIterator iter,
861                                  StringBuffer   out,
862                                  ULocale locale, int[] locCache) {
863         int result, props;
864 
865         result=c;
866         props=trie.getCodePointValue(c);
867         if(!propsHasException(props)) {
868             if(getTypeFromProps(props)>=UPPER) {
869                 result=c+getDelta(props);
870             }
871         } else {
872             int excOffset=getExceptionsOffset(props), excOffset2;
873             int excWord=exceptions[excOffset++];
874             int full;
875 
876             excOffset2=excOffset;
877 
878             if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {
879                 /* use hardcoded conditions and mappings */
880                 int loc=getCaseLocale(locale, locCache);
881 
882                 /*
883                  * Test for conditional mappings first
884                  *   (otherwise the unconditional default mappings are always taken),
885                  * then test for characters that have unconditional mappings in SpecialCasing.txt,
886                  * then get the UnicodeData.txt mappings.
887                  */
888                 if( loc==LOC_LITHUANIAN &&
889                         /* base characters, find accents above */
890                         (((c==0x49 || c==0x4a || c==0x12e) &&
891                             isFollowedByMoreAbove(iter)) ||
892                         /* precomposed with accent above, no need to find one */
893                         (c==0xcc || c==0xcd || c==0x128))
894                 ) {
895                     /*
896                         # Lithuanian
897 
898                         # Lithuanian retains the dot in a lowercase i when followed by accents.
899 
900                         # Introduce an explicit dot above when lowercasing capital I's and J's
901                         # whenever there are more accents above.
902                         # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
903 
904                         0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
905                         004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
906                         012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
907                         00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
908                         00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
909                         0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
910                      */
911                     switch(c) {
912                     case 0x49:  /* LATIN CAPITAL LETTER I */
913                         out.append(iDot);
914                         return 2;
915                     case 0x4a:  /* LATIN CAPITAL LETTER J */
916                         out.append(jDot);
917                         return 2;
918                     case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
919                         out.append(iOgonekDot);
920                         return 2;
921                     case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
922                         out.append(iDotGrave);
923                         return 3;
924                     case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
925                         out.append(iDotAcute);
926                         return 3;
927                     case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
928                         out.append(iDotTilde);
929                         return 3;
930                     default:
931                         return 0; /* will not occur */
932                     }
933                 /* # Turkish and Azeri */
934                 } else if(loc==LOC_TURKISH && c==0x130) {
935                     /*
936                         # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
937                         # The following rules handle those cases.
938 
939                         0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
940                         0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
941                      */
942                     return 0x69;
943                 } else if(loc==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter)) {
944                     /*
945                         # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
946                         # This matches the behavior of the canonically equivalent I-dot_above
947 
948                         0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
949                         0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
950                      */
951                     return 0; /* remove the dot (continue without output) */
952                 } else if(loc==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter)) {
953                     /*
954                         # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
955 
956                         0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
957                         0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
958                      */
959                     return 0x131;
960                 } else if(c==0x130) {
961                     /*
962                         # Preserve canonical equivalence for I with dot. Turkic is handled below.
963 
964                         0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
965                      */
966                     out.append(iDot);
967                     return 2;
968                 } else if(  c==0x3a3 &&
969                             !isFollowedByCasedLetter(iter, 1) &&
970                             isFollowedByCasedLetter(iter, -1) /* -1=preceded */
971                 ) {
972                     /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
973                     /*
974                         # Special case for final form of sigma
975 
976                         03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
977                      */
978                     return 0x3c2; /* greek small final sigma */
979                 } else {
980                     /* no known conditional special case mapping, use a normal mapping */
981                 }
982             } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
983                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
984                 full=(int)value&FULL_LOWER;
985                 if(full!=0) {
986                     /* start of full case mapping strings */
987                     excOffset=(int)(value>>32)+1;
988 
989                     /* set the output pointer to the lowercase mapping */
990                     out.append(new String  (exceptions, excOffset, full));
991 
992                     /* return the string length */
993                     return full;
994                 }
995             }
996 
997             if(hasSlot(excWord, EXC_LOWER)) {
998                 result=getSlotValue(excWord, EXC_LOWER, excOffset2);
999             }
1000        }
1001
1002        return (result==c) ? ~result : result;
1003    }
1004
1005    /* internal */
1006    private final int toUpperOrTitle(int c, ContextIterator iter,
1007                                     StringBuffer   out,
1008                                     ULocale locale, int[] locCache,
1009                                     boolean upperNotTitle) {
1010        int result;
1011        int props;
1012
1013        result=c;
1014        props=trie.getCodePointValue(c);
1015        if(!propsHasException(props)) {
1016            if(getTypeFromProps(props)==LOWER) {
1017                result=c+getDelta(props);
1018            }
1019        } else {
1020            int excOffset=getExceptionsOffset(props), excOffset2;
1021            int excWord=exceptions[excOffset++];
1022            int full, index;
1023
1024            excOffset2=excOffset;
1025
1026            if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {
1027                /* use hardcoded conditions and mappings */
1028                int loc=getCaseLocale(locale, locCache);
1029
1030                if(loc==LOC_TURKISH && c==0x69) {
1031                    /*
1032                        # Turkish and Azeri
1033
1034                        # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1035                        # The following rules handle those cases.
1036
1037                        # When uppercasing, i turns into a dotted capital I
1038
1039                        0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1040                        0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1041                    */
1042                    return 0x130;
1043                } else if(loc==LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter)) {
1044                    /*
1045                        # Lithuanian
1046
1047                        # Lithuanian retains the dot in a lowercase i when followed by accents.
1048
1049                        # Remove DOT ABOVE after "i" with upper or titlecase
1050
1051                        0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1052                     */
1053                    return 0; /* remove the dot (continue without output) */
1054                } else {
1055                    /* no known conditional special case mapping, use a normal mapping */
1056                }
1057            } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
1058                long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
1059                full=(int)value&0xffff;
1060
1061                /* start of full case mapping strings */
1062                excOffset=(int)(value>>32)+1;
1063
1064                /* skip the lowercase and case-folding result strings */
1065                excOffset+=full&FULL_LOWER;
1066                full>>=4;
1067                excOffset+=full&0xf;
1068                full>>=4;
1069
1070                if(upperNotTitle) {
1071                    full&=0xf;
1072                } else {
1073                    /* skip the uppercase result string */
1074                    excOffset+=full&0xf;
1075                    full=(full>>4)&0xf;
1076                }
1077
1078                if(full!=0) {
1079                    /* set the output pointer to the result string */
1080                    out.append(new String  (exceptions, excOffset, full));
1081
1082                    /* return the string length */
1083                    return full;
1084                }
1085            }
1086
1087            if(!upperNotTitle && hasSlot(excWord, EXC_TITLE)) {
1088                index=EXC_TITLE;
1089            } else if(hasSlot(excWord, EXC_UPPER)) {
1090                /* here, titlecase is same as uppercase */
1091                index=EXC_UPPER;
1092            } else {
1093                return ~c;
1094            }
1095            result=getSlotValue(excWord, index, excOffset2);
1096        }
1097
1098        return (result==c) ? ~result : result;
1099    }
1100
1101    public final int toFullUpper(int c, ContextIterator iter,
1102                                 StringBuffer   out,
1103                                 ULocale locale, int[] locCache) {
1104        return toUpperOrTitle(c, iter, out, locale, locCache, true);
1105    }
1106
1107    public final int toFullTitle(int c, ContextIterator iter,
1108                                 StringBuffer   out,
1109                                 ULocale locale, int[] locCache) {
1110        return toUpperOrTitle(c, iter, out, locale, locCache, false);
1111    }
1112
1113    /* case folding ------------------------------------------------------------- */
1114
1115    /*
1116     * Case folding is similar to lowercasing.
1117     * The result may be a simple mapping, i.e., a single code point, or
1118     * a full mapping, i.e., a string.
1119     * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1120     * then only the lowercase mapping is stored.
1121     *
1122     * Some special cases are hardcoded because their conditions cannot be
1123     * parsed and processed from CaseFolding.txt.
1124     *
1125     * Unicode 3.2 CaseFolding.txt specifies for its status field:
1126
1127    # C: common case folding, common mappings shared by both simple and full mappings.
1128    # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1129    # S: simple case folding, mappings to single characters where different from F.
1130    # T: special case for uppercase I and dotted uppercase I
1131    #    - For non-Turkic languages, this mapping is normally not used.
1132    #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1133    #
1134    # Usage:
1135    #  A. To do a simple case folding, use the mappings with status C + S.
1136    #  B. To do a full case folding, use the mappings with status C + F.
1137    #
1138    #    The mappings with status T can be used or omitted depending on the desired case-folding
1139    #    behavior. (The default option is to exclude them.)
1140
1141     * Unicode 3.2 has 'T' mappings as follows:
1142
1143    0049; T; 0131; # LATIN CAPITAL LETTER I
1144    0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1145
1146     * while the default mappings for these code points are:
1147
1148    0049; C; 0069; # LATIN CAPITAL LETTER I
1149    0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1150
1151     * U+0130 has no simple case folding (simple-case-folds to itself).
1152     */
1153
1154    /**
1155     * Bit mask for getting just the options from a string compare options word
1156     * that are relevant for case folding (of a single string or code point).
1157     * @internal
1158     */
1159    private static final int FOLD_CASE_OPTIONS_MASK = 0xff;
1160    
1161    /* return the simple case folding mapping for c */
1162    public final int fold(int c, int options) {
1163        int props=trie.getCodePointValue(c);
1164        if(!propsHasException(props)) {
1165            if(getTypeFromProps(props)>=UPPER) {
1166                c+=getDelta(props);
1167            }
1168        } else {
1169            int excOffset=getExceptionsOffset(props);
1170            int excWord=exceptions[excOffset++];
1171            int index;
1172            if((excWord&EXC_CONDITIONAL_FOLD)!=0) {
1173                /* special case folding mappings, hardcoded */
1174                if((options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT) {
1175                    /* default mappings */
1176                    if(c==0x49) {
1177                        /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1178                        return 0x69;
1179                    } else if(c==0x130) {
1180                        /* no simple case folding for U+0130 */
1181                        return c;
1182                    }
1183                } else {
1184                    /* Turkic mappings */
1185                    if(c==0x49) {
1186                        /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1187                        return 0x131;
1188                    } else if(c==0x130) {
1189                        /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1190                        return 0x69;
1191                    }
1192                }
1193            }
1194            if(hasSlot(excWord, EXC_FOLD)) {
1195                index=EXC_FOLD;
1196            } else if(hasSlot(excWord, EXC_LOWER)) {
1197                index=EXC_LOWER;
1198            } else {
1199                return c;
1200            }
1201            c=getSlotValue(excWord, index, excOffset);
1202        }
1203        return c;
1204    }
1205
1206    /*
1207     * Issue for canonical caseless match (UAX #21):
1208     * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1209     * canonical equivalence, unlike default-option casefolding.
1210     * For example, I-grave and I + grave fold to strings that are not canonically
1211     * equivalent.
1212     * For more details, see the comment in unorm_compare() in unorm.cpp
1213     * and the intermediate prototype changes for Jitterbug 2021.
1214     * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1215     *
1216     * This did not get fixed because it appears that it is not possible to fix
1217     * it for uppercase and lowercase characters (I-grave vs. i-grave)
1218     * together in a way that they still fold to common result strings.
1219     */
1220
1221    public final int toFullFolding(int c, StringBuffer   out, int options) {
1222        int result;
1223        int props;
1224
1225        result=c;
1226        props=trie.getCodePointValue(c);
1227        if(!propsHasException(props)) {
1228            if(getTypeFromProps(props)>=UPPER) {
1229                result=c+getDelta(props);
1230            }
1231        } else {
1232            int excOffset=getExceptionsOffset(props), excOffset2;
1233            int excWord=exceptions[excOffset++];
1234            int full, index;
1235
1236            excOffset2=excOffset;
1237
1238            if((excWord&EXC_CONDITIONAL_FOLD)!=0) {
1239                /* use hardcoded conditions and mappings */
1240                if((options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT) {
1241                    /* default mappings */
1242                    if(c==0x49) {
1243                        /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1244                        return 0x69;
1245                    } else if(c==0x130) {
1246                        /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1247                        out.append(iDot);
1248                        return 2;
1249                    }
1250                } else {
1251                    /* Turkic mappings */
1252                    if(c==0x49) {
1253                        /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1254                        return 0x131;
1255                    } else if(c==0x130) {
1256                        /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1257                        return 0x69;
1258                    }
1259                }
1260            } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
1261                long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
1262                full=(int)value&0xffff;
1263
1264                /* start of full case mapping strings */
1265                excOffset=(int)(value>>32)+1;
1266
1267                /* skip the lowercase result string */
1268                excOffset+=full&FULL_LOWER;
1269                full=(full>>4)&0xf;
1270
1271                if(full!=0) {
1272                    /* set the output pointer to the result string */
1273                    out.append(new String  (exceptions, excOffset, full));
1274
1275                    /* return the string length */
1276                    return full;
1277                }
1278            }
1279
1280            if(hasSlot(excWord, EXC_FOLD)) {
1281                index=EXC_FOLD;
1282            } else if(hasSlot(excWord, EXC_LOWER)) {
1283                index=EXC_LOWER;
1284            } else {
1285                return ~c;
1286            }
1287            result=getSlotValue(excWord, index, excOffset2);
1288        }
1289
1290        return (result==c) ? ~result : result;
1291    }
1292
1293    // data members -------------------------------------------------------- ***
1294    private int indexes[];
1295    private char exceptions[];
1296    private char unfold[];
1297
1298    private CharTrie trie;
1299    private byte formatVersion[];
1300    private byte unicodeVersion[];  
1301
1302    // data format constants ----------------------------------------------- ***
1303    private static final String   DATA_NAME="ucase";
1304    private static final String   DATA_TYPE="icu";
1305    private static final String   DATA_FILE_NAME=DATA_NAME+"."+DATA_TYPE;
1306
1307    /* format "cAsE" */
1308    private static final byte FMT[]={ 0x63, 0x41, 0x53, 0x45 };
1309
1310    /* indexes into indexes[] */
1311    private static final int IX_INDEX_TOP=0;
1312    private static final int IX_LENGTH=1;
1313    private static final int IX_TRIE_SIZE=2;
1314    private static final int IX_EXC_LENGTH=3;
1315    private static final int IX_UNFOLD_LENGTH=4;
1316
1317    private static final int IX_MAX_FULL_LENGTH=15;
1318    private static final int IX_TOP=16;
1319
1320    // definitions for 16-bit case properties word ------------------------- ***
1321
1322    /* 2-bit constants for types of cased characters */
1323    public static final int TYPE_MASK=3;
1324    public static final int NONE=0;
1325    public static final int LOWER=1;
1326    public static final int UPPER=2;
1327    public static final int TITLE=3;
1328
1329    private static final int getTypeFromProps(int props) {
1330        return props&TYPE_MASK;
1331    }
1332
1333    private static final int SENSITIVE=     4;
1334    private static final int EXCEPTION=     8;
1335
1336    private static final int DOT_MASK=      0x30;
1337    private static final int NO_DOT=        0;      /* normal characters with cc=0 */
1338    private static final int SOFT_DOTTED=   0x10;   /* soft-dotted characters with cc=0 */
1339    private static final int ABOVE=         0x20;   /* "above" accents with cc=230 */
1340    private static final int OTHER_ACCENT=  0x30;   /* other accent character (0<cc!=230) */
1341
1342    /* no exception: bits 15..6 are a 10-bit signed case mapping delta */
1343    private static final int DELTA_SHIFT=   6;
1344    private static final int DELTA_MASK=    0xffc0;
1345    private static final int MAX_DELTA=     0x1ff;
1346    private static final int MIN_DELTA=     (-MAX_DELTA-1);
1347
1348    private static final int getDelta(int props) {
1349        return (short)props>>DELTA_SHIFT;
1350    }
1351
1352    /* case-ignorable uses one of the delta bits, see gencase/store.c */
1353    private static final int CASE_IGNORABLE=0x40;
1354
1355    /* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */
1356    private static final int EXC_SHIFT=     4;
1357    private static final int EXC_MASK=      0xfff0;
1358    private static final int MAX_EXCEPTIONS=0x1000;
1359
1360    /* definitions for 16-bit main exceptions word ------------------------------ */
1361
1362    /* first 8 bits indicate values in optional slots */
1363    private static final int EXC_LOWER=0;
1364    private static final int EXC_FOLD=1;
1365    private static final int EXC_UPPER=2;
1366    private static final int EXC_TITLE=3;
1367    private static final int EXC_4=4;           /* reserved */
1368    private static final int EXC_5=5;           /* reserved */
1369    private static final int EXC_CLOSURE=6;
1370    private static final int EXC_FULL_MAPPINGS=7;
1371    private static final int EXC_ALL_SLOTS=8;   /* one past the last slot */
1372
1373    /* each slot is 2 uint16_t instead of 1 */
1374    private static final int EXC_DOUBLE_SLOTS=          0x100;
1375
1376    /* reserved: exception bits 11..9 */
1377
1378    /* EXC_DOT_MASK=DOT_MASK<<EXC_DOT_SHIFT */
1379    private static final int EXC_DOT_SHIFT=8;
1380
1381    /* normally stored in the main word, but pushed out for larger exception indexes */
1382    private static final int EXC_DOT_MASK=              0x3000;
1383    private static final int EXC_NO_DOT=                0;
1384    private static final int EXC_SOFT_DOTTED=           0x1000;
1385    private static final int EXC_ABOVE=                 0x2000; /* "above" accents with cc=230 */
1386    private static final int EXC_OTHER_ACCENT=          0x3000; /* other character (0<cc!=230) */
1387
1388    /* complex/conditional mappings */
1389    private static final int EXC_CONDITIONAL_SPECIAL=   0x4000;
1390    private static final int EXC_CONDITIONAL_FOLD=      0x8000;
1391
1392    /* definitions for lengths word for full case mappings */
1393    private static final int FULL_LOWER=    0xf;
1394    private static final int FULL_FOLDING=  0xf0;
1395    private static final int FULL_UPPER=    0xf00;
1396    private static final int FULL_TITLE=    0xf000;
1397
1398    /* maximum lengths */
1399    private static final int FULL_MAPPINGS_MAX_LENGTH=4*0xf;
1400    private static final int CLOSURE_MAX_LENGTH=0xf;
1401
1402    /* constants for reverse case folding ("unfold") data */
1403    private static final int UNFOLD_ROWS=0;
1404    private static final int UNFOLD_ROW_WIDTH=1;
1405    private static final int UNFOLD_STRING_WIDTH=2;
1406}
1407
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags