UCharacterProperty


1   /**
2   *******************************************************************************
3   * Copyright (C) 1996-2006, International Business Machines Corporation and    *
4   * others. All Rights Reserved.                                                *
5   *******************************************************************************
6   */
7   
8   package com.ibm.icu.impl;
9   
10  import java.io.BufferedInputStream  ;
11  import java.io.InputStream  ;
12  import java.io.IOException  ;
13  import java.util.Locale  ;
14  import java.util.MissingResourceException  ;
15  
16  import com.ibm.icu.lang.UCharacter;
17  import com.ibm.icu.lang.UCharacterCategory;
18  import com.ibm.icu.lang.UProperty;
19  import com.ibm.icu.text.BreakIterator;
20  import com.ibm.icu.text.Normalizer;
21  import com.ibm.icu.text.UCharacterIterator;
22  import com.ibm.icu.text.UnicodeSet;
23  import com.ibm.icu.text.UTF16;
24  import com.ibm.icu.util.RangeValueIterator;
25  import com.ibm.icu.util.ULocale;
26  import com.ibm.icu.util.UResourceBundle;
27  import com.ibm.icu.util.VersionInfo;
28  
29  /**
30  * <p>Internal class used for Unicode character property database.</p>
31  * <p>This class stores binary data read from uprops.icu.
32  * It does not have the capability to parse the data into more high-level
33  * information. It only returns bytes of information when required.</p>
34  * <p>Due to the form most commonly used for retrieval, array of char is used
35  * to store the binary data.</p>
36  * <p>UCharacterPropertyDB also contains information on accessing indexes to
37  * significant points in the binary data.</p>
38  * <p>Responsibility for molding the binary data into a more meaningful form lies on
39  * <a HREF=UCharacter.html>UCharacter</a>.</p>
40  * @author Syn Wee Quek
41  * @since release 2.1, february 1st 2002
42  * @draft 2.1
43  */
44  
45  public final class UCharacterProperty
46  {
47      // public data members -----------------------------------------------
48  
49      /**
50      * Trie data
51      */
52      public CharTrie m_trie_;
53      /**
54       * Optimization
55       * CharTrie index array
56       */
57      public char[] m_trieIndex_;
58      /**
59       * Optimization
60       * CharTrie data array
61       */
62      public char[] m_trieData_;
63      /**
64       * Optimization
65       * CharTrie initial (default) value
66       */
67      public int m_trieInitialValue_;
68      /**
69      * Unicode version
70      */
71      public VersionInfo m_unicodeVersion_;
72      /**
73      * Latin capital letter i with dot above
74      */
75      public static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130;
76      /**
77      * Latin small letter dotless i
78      */
79      public static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131;
80      /**
81      * Latin lowercase i
82      */
83      public static final char LATIN_SMALL_LETTER_I_ = 0x69;
84      /**
85      * Character type mask
86      */
87      public static final int TYPE_MASK = 0x1F;
88  
89      // uprops.h enum UPropertySource --------------------------------------- ***
90  
91      /** No source, not a supported property. */
92      public static final int SRC_NONE=0;
93      /** From uchar.c/uprops.icu main trie */
94      public static final int SRC_CHAR=1;
95      /** From uchar.c/uprops.icu properties vectors trie */
96      public static final int SRC_PROPSVEC=2;
97      /** Hangul_Syllable_Type, from uchar.c/uprops.icu */
98      public static final int SRC_HST=3;
99      /** From unames.c/unames.icu */
100     public static final int SRC_NAMES=4;
101     /** From unorm.cpp/unorm.icu */
102     public static final int SRC_NORM=5;
103     /** From ucase.c/ucase.icu */
104     public static final int SRC_CASE=6;
105     /** From ubidi_props.c/ubidi.icu */
106     public static final int SRC_BIDI=7;
107     /** From uchar.c/uprops.icu main trie as well as properties vectors trie */
108     public static final int SRC_CHAR_AND_PROPSVEC=8;
109     /** One more than the highest UPropertySource (SRC_) constant. */
110     public static final int SRC_COUNT=9;
111 
112     // public methods ----------------------------------------------------
113 
114     /**
115      * Java friends implementation
116      */
117     public void setIndexData(CharTrie.FriendAgent friendagent)
118     {
119         m_trieIndex_ = friendagent.getPrivateIndex();
120         m_trieData_ = friendagent.getPrivateData();
121         m_trieInitialValue_ = friendagent.getPrivateInitialValue();
122     }
123 
124     /**
125     * Gets the property value at the index.
126     * This is optimized.
127     * Note this is alittle different from CharTrie the index m_trieData_
128     * is never negative.
129     * @param ch code point whose property value is to be retrieved
130     * @return property value of code point
131     */
132     public final int getProperty(int ch)
133     {
134         if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
135             || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
136                 && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
137             // BMP codepoint 0000..D7FF or DC00..FFFF
138             // optimized
139             try { // using try for ch < 0 is faster than using an if statement
140                 return m_trieData_[
141                     (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]
142                           << Trie.INDEX_STAGE_2_SHIFT_)
143                     + (ch & Trie.INDEX_STAGE_3_MASK_)];
144             } catch (ArrayIndexOutOfBoundsException   e) {
145                 return m_trieInitialValue_;
146             }
147         }
148         if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
149             // lead surrogate D800..DBFF
150             return m_trieData_[
151                     (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
152                                   + (ch >> Trie.INDEX_STAGE_1_SHIFT_)]
153                           << Trie.INDEX_STAGE_2_SHIFT_)
154                     + (ch & Trie.INDEX_STAGE_3_MASK_)];
155         }
156         if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
157             // supplementary code point 10000..10FFFF
158             // look at the construction of supplementary characters
159             // trail forms the ends of it.
160             return m_trie_.getSurrogateValue(
161                                           UTF16.getLeadSurrogate(ch),
162                                           (char)(ch & Trie.SURROGATE_MASK_));
163         }
164         // ch is out of bounds
165         // return m_dataOffset_ if there is an error, in this case we return
166         // the default value: m_initialValue_
167         // we cannot assume that m_initialValue_ is at offset 0
168         // this is for optimization.
169         return m_trieInitialValue_;
170 
171         // this all is an inlined form of return m_trie_.getCodePointValue(ch);
172     }
173 
174     /**
175     * Getting the signed numeric value of a character embedded in the property
176     * argument
177     * @param prop the character
178     * @return signed numberic value
179     */
180     public static int getSignedValue(int prop)
181     {
182         return ((short)prop >> VALUE_SHIFT_);
183     }
184 
185     /**
186     * Getting the unsigned numeric value of a character embedded in the property
187     * argument
188     * @param prop the character
189     * @return unsigned numberic value
190     */
191     ///CLOVER:OFF
192     public static int getUnsignedValue(int prop)
193     {
194         return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
195     }
196     ///CLOVER:ON
197 
198     /* internal numeric pseudo-types for special encodings of numeric values */
199     public static final int NT_FRACTION=4; /* ==UCharacter.NumericType.COUNT, must not change unless binary format version changes */
200     public static final int NT_LARGE=5;
201     public static final int NT_COUNT=6;
202 
203     /**
204      * Gets the unicode additional properties.
205      * C version getUnicodeProperties.
206      * @param codepoint codepoint whose additional properties is to be
207      *                  retrieved
208      * @param column
209      * @return unicode properties
210      */
211        public int getAdditional(int codepoint, int column) {
212         if (column == -1) {
213             return getProperty(codepoint);
214         }
215            if (column < 0 || column >= m_additionalColumnsCount_) {
216            return 0;
217        }
218        return m_additionalVectors_[
219                      m_additionalTrie_.getCodePointValue(codepoint) + column];
220        }
221 
222     static final int MY_MASK = UCharacterProperty.TYPE_MASK
223         & ((1<<UCharacterCategory.UPPERCASE_LETTER) |
224             (1<<UCharacterCategory.LOWERCASE_LETTER) |
225             (1<<UCharacterCategory.TITLECASE_LETTER) |
226             (1<<UCharacterCategory.MODIFIER_LETTER) |
227             (1<<UCharacterCategory.OTHER_LETTER));
228 
229 
230        /**
231      * <p>Get the "age" of the code point.</p>
232      * <p>The "age" is the Unicode version when the code point was first
233      * designated (as a non-character or for Private Use) or assigned a
234      * character.</p>
235      * <p>This can be useful to avoid emitting code points to receiving
236      * processes that do not accept newer characters.</p>
237      * <p>The data is from the UCD file DerivedAge.txt.</p>
238      * <p>This API does not check the validity of the codepoint.</p>
239      * @param codepoint The code point.
240      * @return the Unicode version number
241      * @draft ICU 2.1
242      */
243     public VersionInfo getAge(int codepoint)
244     {
245         int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
246         return VersionInfo.getInstance(
247                            (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
248                            version & LAST_NIBBLE_MASK_, 0, 0);
249     }
250 
251     private static final long UNSIGNED_INT_MASK = 0xffffffffL;
252 
253     private static final int GC_CN_MASK = getMask(UCharacter.UNASSIGNED);
254     private static final int GC_CC_MASK = getMask(UCharacter.CONTROL);
255     private static final int GC_CS_MASK = getMask(UCharacter.SURROGATE);
256     private static final int GC_ZS_MASK = getMask(UCharacter.SPACE_SEPARATOR);
257     private static final int GC_ZL_MASK = getMask(UCharacter.LINE_SEPARATOR);
258     private static final int GC_ZP_MASK = getMask(UCharacter.PARAGRAPH_SEPARATOR);
259     /** Mask constant for multiple UCharCategory bits (Z Separators). */
260     private static final int GC_Z_MASK = GC_ZS_MASK|GC_ZL_MASK|GC_ZP_MASK;
261 
262     /**
263      * Checks if c is in
264      * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
265      * with space=\p{Whitespace} and Control=Cc.
266      * Implements UCHAR_POSIX_GRAPH.
267      * @internal
268      */
269     private static final boolean isgraphPOSIX(int c) {
270         /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */
271         /* comparing ==0 returns FALSE for the categories mentioned */
272         return (getMask(UCharacter.getType(c))&
273                 (GC_CC_MASK|GC_CS_MASK|GC_CN_MASK|GC_Z_MASK))
274                ==0;
275     }
276 
277     private static final class BinaryProperties{
278        int column;
279        long mask;
280        public BinaryProperties(int column,long mask){
281                this.column = column;
282                this.mask  = mask;
283        }
284    }
285    BinaryProperties[] binProps={
286        /*
287         * column and mask values for binary properties from u_getUnicodeProperties().
288         * Must be in order of corresponding UProperty,
289         * and there must be exacly one entry per binary UProperty.
290         */
291        new BinaryProperties(  1,                (  1 << ALPHABETIC_PROPERTY_) ),
292        new BinaryProperties(  1,                (  1 << ASCII_HEX_DIGIT_PROPERTY_) ),
293        new BinaryProperties( SRC_BIDI,   0 ),                                       /* UCHAR_BIDI_CONTROL */
294        new BinaryProperties( SRC_BIDI,   0 ),                                       /* UCHAR_BIDI_MIRRORED */
295        new BinaryProperties(  1,                (  1 << DASH_PROPERTY_) ),
296        new BinaryProperties(  1,                (  1 << DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_) ),
297        new BinaryProperties(  1,                (  1 << DEPRECATED_PROPERTY_) ),
298        new BinaryProperties(  1,                (  1 << DIACRITIC_PROPERTY_) ),
299        new BinaryProperties(  1,                (  1 << EXTENDER_PROPERTY_) ),
300        new BinaryProperties( SRC_NORM,   0 ),                                       /* UCHAR_FULL_COMPOSITION_EXCLUSION */
301        new BinaryProperties(  1,                (  1 << GRAPHEME_BASE_PROPERTY_) ),
302        new BinaryProperties(  1,                (  1 << GRAPHEME_EXTEND_PROPERTY_) ),
303        new BinaryProperties(  1,                (  1 << GRAPHEME_LINK_PROPERTY_) ),
304        new BinaryProperties(  1,                (  1 << HEX_DIGIT_PROPERTY_) ),
305        new BinaryProperties(  1,                (  1 << HYPHEN_PROPERTY_) ),
306        new BinaryProperties(  1,                (  1 << ID_CONTINUE_PROPERTY_) ),
307        new BinaryProperties(  1,                (  1 << ID_START_PROPERTY_) ),
308        new BinaryProperties(  1,                (  1 << IDEOGRAPHIC_PROPERTY_) ),
309        new BinaryProperties(  1,                (  1 << IDS_BINARY_OPERATOR_PROPERTY_) ),
310        new BinaryProperties(  1,                (  1 << IDS_TRINARY_OPERATOR_PROPERTY_) ),
311        new BinaryProperties( SRC_BIDI,   0 ),                                       /* UCHAR_JOIN_CONTROL */
312        new BinaryProperties(  1,                (  1 << LOGICAL_ORDER_EXCEPTION_PROPERTY_) ),
313        new BinaryProperties( SRC_CASE,   0 ),                                       /* UCHAR_LOWERCASE */
314        new BinaryProperties(  1,                (  1 << MATH_PROPERTY_) ),
315        new BinaryProperties(  1,                (  1 << NONCHARACTER_CODE_POINT_PROPERTY_) ),
316        new BinaryProperties(  1,                (  1 << QUOTATION_MARK_PROPERTY_) ),
317        new BinaryProperties(  1,                (  1 << RADICAL_PROPERTY_) ),
318        new BinaryProperties( SRC_CASE,   0 ),                                       /* UCHAR_SOFT_DOTTED */
319        new BinaryProperties(  1,                (  1 << TERMINAL_PUNCTUATION_PROPERTY_) ),
320        new BinaryProperties(  1,                (  1 << UNIFIED_IDEOGRAPH_PROPERTY_) ),
321        new BinaryProperties( SRC_CASE,   0 ),                                       /* UCHAR_UPPERCASE */
322        new BinaryProperties(  1,                (  1 << WHITE_SPACE_PROPERTY_) ),
323        new BinaryProperties(  1,                (  1 << XID_CONTINUE_PROPERTY_) ),
324        new BinaryProperties(  1,                (  1 << XID_START_PROPERTY_) ),
325        new BinaryProperties( SRC_CASE,   0 ),                                       /* UCHAR_CASE_SENSITIVE */
326        new BinaryProperties(  2,                (  1 << V2_S_TERM_PROPERTY_) ),
327        new BinaryProperties(  2,                (  1 << V2_VARIATION_SELECTOR_PROPERTY_) ),
328        new BinaryProperties( SRC_NORM,   0 ),                                       /* UCHAR_NFD_INERT */
329        new BinaryProperties( SRC_NORM,   0 ),                                       /* UCHAR_NFKD_INERT */
330        new BinaryProperties( SRC_NORM,   0 ),                                       /* UCHAR_NFC_INERT */
331        new BinaryProperties( SRC_NORM,   0 ),                                       /* UCHAR_NFKC_INERT */
332        new BinaryProperties( SRC_NORM,   0 ),                                       /* UCHAR_SEGMENT_STARTER */
333        new BinaryProperties(  2,                (  1 << V2_PATTERN_SYNTAX) ),
334        new BinaryProperties(  2,                (  1 << V2_PATTERN_WHITE_SPACE) ),
335        new BinaryProperties( SRC_CHAR_AND_PROPSVEC,  0 ),                           /* UCHAR_POSIX_ALNUM */
336        new BinaryProperties( SRC_CHAR,  0 ),                                        /* UCHAR_POSIX_BLANK */
337        new BinaryProperties( SRC_CHAR,  0 ),                                        /* UCHAR_POSIX_GRAPH */
338        new BinaryProperties( SRC_CHAR,  0 ),                                        /* UCHAR_POSIX_PRINT */
339        new BinaryProperties( SRC_CHAR,  0 )                                         /* UCHAR_POSIX_XDIGIT */
340    };
341 
342 
343     /**
344      * <p>Check a binary Unicode property for a code point.</p>
345      * <p>Unicode, especially in version 3.2, defines many more properties
346      * than the original set in UnicodeData.txt.</p>
347      * <p>This API is intended to reflect Unicode properties as defined in
348      * the Unicode Character Database (UCD) and Unicode Technical Reports
349      * (UTR).</p>
350      * <p>For details about the properties see
351      * <a HREF=http://www.unicode.org/>http://www.unicode.org/</a>.</p>
352      * <p>For names of Unicode properties see the UCD file
353      * PropertyAliases.txt.</p>
354      * <p>This API does not check the validity of the codepoint.</p>
355      * <p>Important: If ICU is built with UCD files from Unicode versions
356      * below 3.2, then properties marked with "new" are not or
357      * not fully available.</p>
358      * @param codepoint Code point to test.
359      * @param property selector constant from com.ibm.icu.lang.UProperty,
360      *        identifies which binary property to check.
361      * @return true or false according to the binary Unicode property value
362      *         for ch. Also false if property is out of bounds or if the
363      *         Unicode version does not have data for the property at all, or
364      *         not for this code point.
365      * @see com.ibm.icu.lang.UProperty
366      * @draft ICU 2.1
367      */
368 
369     public boolean hasBinaryProperty(int codepoint, int property)
370     {
371          if(property <UProperty.BINARY_START || UProperty.BINARY_LIMIT<=property) {
372             // not a known binary property
373             return false;
374         } else {
375             long mask=binProps[property].mask;
376             int column=binProps[property].column;
377             if(mask!=0) {
378                 // systematic, directly stored properties
379                 return ((UNSIGNED_INT_MASK & getAdditional(codepoint, column)) & mask)!=0;
380             } else {
381                 if(column==SRC_CASE) {
382                     /* case mapping properties */
383                     UCaseProps csp;
384                     try {
385                         csp = UCaseProps.getSingleton();
386                     } catch (IOException   e) {
387                         return false;
388                     }
389                     switch(property) {
390                     case UProperty.LOWERCASE:
391                         return UCaseProps.LOWER==csp.getType(codepoint);
392                     case UProperty.UPPERCASE:
393                         return UCaseProps.UPPER==csp.getType(codepoint);
394                     case UProperty.SOFT_DOTTED:
395                         return csp.isSoftDotted(codepoint);
396                     case UProperty.CASE_SENSITIVE:
397                         return csp.isCaseSensitive(codepoint);
398                     default:
399                         break;
400                     }
401                 } else if(column==SRC_NORM) {
402                     /* normalization properties from unorm.icu */
403                     switch(property) {
404                     case UProperty.FULL_COMPOSITION_EXCLUSION:
405                         return NormalizerImpl.isFullCompositionExclusion(codepoint);
406                     case UProperty.NFD_INERT:
407                         return Normalizer.isNFSkippable(codepoint, Normalizer.NFD);
408                     case UProperty.NFKD_INERT:
409                         return Normalizer.isNFSkippable(codepoint, Normalizer.NFKD);
410                     case UProperty.NFC_INERT:
411                         return Normalizer.isNFSkippable(codepoint, Normalizer.NFC);
412                     case UProperty.NFKC_INERT:
413                         return Normalizer.isNFSkippable(codepoint, Normalizer.NFKC);
414                     case UProperty.SEGMENT_STARTER:
415                         return NormalizerImpl.isCanonSafeStart(codepoint);
416                     default:
417                         break;
418                     }
419                 } else if(column==SRC_BIDI) {
420                     /* bidi/shaping properties */
421                     UBiDiProps bdp;
422                     try {
423                         bdp = UBiDiProps.getSingleton();
424                     } catch (IOException   e) {
425                         return false;
426                     }
427                     switch(property) {
428                     case UProperty.BIDI_MIRRORED:
429                         return bdp.isMirrored(codepoint);
430                     case UProperty.BIDI_CONTROL:
431                         return bdp.isBidiControl(codepoint);
432                     case UProperty.JOIN_CONTROL:
433                         return bdp.isJoinControl(codepoint);
434                     default:
435                         break;
436                     }
437                 } else if(column==SRC_CHAR) {
438                     switch(property) {
439                     case UProperty.POSIX_BLANK:
440                         // "horizontal space"
441                         if(codepoint<=0x9f) {
442                             return codepoint==9 || codepoint==0x20; /* TAB or SPACE */
443                         } else {
444                             /* Zs */
445                             return UCharacter.getType(codepoint)==UCharacter.SPACE_SEPARATOR;
446                         }
447                     case UProperty.POSIX_GRAPH:
448                         return isgraphPOSIX(codepoint);
449                     case UProperty.POSIX_PRINT:
450                         /*
451                          * Checks if codepoint is in \p{graph}\p{blank} - \p{cntrl}.
452                          *
453                          * The only cntrl character in graph+blank is TAB (in blank).
454                          * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
455                          */
456                         return (UCharacter.getType(codepoint)==UCharacter.SPACE_SEPARATOR) || isgraphPOSIX(codepoint);
457                     case UProperty.POSIX_XDIGIT:
458                         /* check ASCII and Fullwidth ASCII a-fA-F */
459                         if(
460                             (codepoint<=0x66 && codepoint>=0x41 && (codepoint<=0x46 || codepoint>=0x61)) ||
461                             (codepoint>=0xff21 && codepoint<=0xff46 && (codepoint<=0xff26 || codepoint>=0xff41))
462                         ) {
463                             return true;
464                         }
465     
466                         return UCharacter.getType(codepoint)==UCharacter.DECIMAL_DIGIT_NUMBER;
467                     default:
468                         break;
469                     }
470                 } else if(column==SRC_CHAR_AND_PROPSVEC) {
471                     switch(property) {
472                     case UProperty.POSIX_ALNUM:
473                         return UCharacter.isUAlphabetic(codepoint) || UCharacter.isDigit(codepoint);
474                     default:
475                         break;
476                     }
477                 }
478             }
479         }
480         return false;
481     }
482 
483     public final int getSource(int which) {
484         if(which<UProperty.BINARY_START) {
485             return SRC_NONE; /* undefined */
486         } else if(which<UProperty.BINARY_LIMIT) {
487             if(binProps[which].mask!=0) {
488                 return SRC_PROPSVEC;
489             } else {
490                 return binProps[which].column;
491             }
492         } else if(which<UProperty.INT_START) {
493             return SRC_NONE; /* undefined */
494         } else if(which<UProperty.INT_LIMIT) {
495             switch(which) {
496             case UProperty.GENERAL_CATEGORY:
497             case UProperty.NUMERIC_TYPE:
498                 return SRC_CHAR;
499 
500             case UProperty.HANGUL_SYLLABLE_TYPE:
501                 return SRC_HST;
502 
503             case UProperty.CANONICAL_COMBINING_CLASS:
504             case UProperty.NFD_QUICK_CHECK:
505             case UProperty.NFKD_QUICK_CHECK:
506             case UProperty.NFC_QUICK_CHECK:
507             case UProperty.NFKC_QUICK_CHECK:
508             case UProperty.LEAD_CANONICAL_COMBINING_CLASS:
509             case UProperty.TRAIL_CANONICAL_COMBINING_CLASS:
510                 return SRC_NORM;
511 
512             case UProperty.BIDI_CLASS:
513             case UProperty.JOINING_GROUP:
514             case UProperty.JOINING_TYPE:
515                 return SRC_BIDI;
516 
517             default:
518                 return SRC_PROPSVEC;
519             }
520         } else if(which<UProperty.STRING_START) {
521             switch(which) {
522             case UProperty.GENERAL_CATEGORY_MASK:
523             case UProperty.NUMERIC_VALUE:
524                 return SRC_CHAR;
525 
526             default:
527                 return SRC_NONE;
528             }
529         } else if(which<UProperty.STRING_LIMIT) {
530             switch(which) {
531             case UProperty.AGE:
532                 return SRC_PROPSVEC;
533 
534             case UProperty.BIDI_MIRRORING_GLYPH:
535                 return SRC_BIDI;
536 
537             case UProperty.CASE_FOLDING:
538             case UProperty.LOWERCASE_MAPPING:
539             case UProperty.SIMPLE_CASE_FOLDING:
540             case UProperty.SIMPLE_LOWERCASE_MAPPING:
541             case UProperty.SIMPLE_TITLECASE_MAPPING:
542             case UProperty.SIMPLE_UPPERCASE_MAPPING:
543             case UProperty.TITLECASE_MAPPING:
544             case UProperty.UPPERCASE_MAPPING:
545                 return SRC_CASE;
546 
547             case UProperty.ISO_COMMENT:
548             case UProperty.NAME:
549             case UProperty.UNICODE_1_NAME:
550                 return SRC_NAMES;
551 
552             default:
553                 return SRC_NONE;
554             }
555         } else {
556             return SRC_NONE; /* undefined */
557         }
558     }
559 
560     /**
561     * Forms a supplementary code point from the argument character<br>
562     * Note this is for internal use hence no checks for the validity of the
563     * surrogate characters are done
564     * @param lead lead surrogate character
565     * @param trail trailing surrogate character
566     * @return code point of the supplementary character
567     */
568     public static int getRawSupplementary(char lead, char trail)
569     {
570         return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
571     }
572 
573     /**
574     * Loads the property data and initialize the UCharacterProperty instance.
575     * @throws MissingResourceException when data is missing or data has been corrupted
576     */
577     public static UCharacterProperty getInstance()
578     {
579         if(INSTANCE_ == null) {
580             try {
581                 INSTANCE_ = new UCharacterProperty();
582             }
583             catch (Exception   e) {
584                 throw new MissingResourceException  (e.getMessage(),"","");
585             }
586         }
587         return INSTANCE_;
588     }
589 
590     /**
591      * <p>
592      * Unicode property names and property value names are compared
593      * "loosely". Property[Value]Aliases.txt say:
594      * <quote>
595      *   "With loose matching of property names, the case distinctions,
596      *    whitespace, and '_' are ignored."
597      * </quote>
598      * </p>
599      * <p>
600      * This function does just that, for ASCII (char *) name strings.
601      * It is almost identical to ucnv_compareNames() but also ignores
602      * ASCII White_Space characters (U+0009..U+000d).
603      * </p>
604      * @param name1 name to compare
605      * @param name2 name to compare
606      * @return 0 if names are equal, < 0 if name1 is less than name2 and > 0
607      *         if name1 is greater than name2.
608      */
609     /* to be implemented in 2.4
610      * public static int comparePropertyNames(String name1, String name2)
611     {
612         int result = 0;
613         int i1 = 0;
614         int i2 = 0;
615         while (true) {
616             char ch1 = 0;
617             char ch2 = 0;
618             // Ignore delimiters '-', '_', and ASCII White_Space
619             if (i1 < name1.length()) {
620                 ch1 = name1.charAt(i1 ++);
621             }
622             while (ch1 == '-' || ch1 == '_' || ch1 == ' ' || ch1 == '\t'
623                    || ch1 == '\n' // synwee what is || ch1 == '\v'
624                    || ch1 == '\f' || ch1=='\r') {
625                 if (i1 < name1.length()) {
626                     ch1 = name1.charAt(i1 ++);
627                 }
628                 else {
629                     ch1 = 0;
630                 }
631             }
632             if (i2 < name2.length()) {
633                 ch2 = name2.charAt(i2 ++);
634             }
635             while (ch2 == '-' || ch2 == '_' || ch2 == ' ' || ch2 == '\t'
636                    || ch2 == '\n' // synwee what is || ch1 == '\v'
637                    || ch2 == '\f' || ch2=='\r') {
638                 if (i2 < name2.length()) {
639                     ch2 = name2.charAt(i2 ++);
640                 }
641                 else {
642                     ch2 = 0;
643                 }
644             }
645 
646             // If we reach the ends of both strings then they match
647             if (ch1 == 0 && ch2 == 0) {
648                 return 0;
649             }
650 
651             // Case-insensitive comparison
652             if (ch1 != ch2) {
653                 result = Character.toLowerCase(ch1)
654                                                 - Character.toLowerCase(ch2);
655                 if (result != 0) {
656                     return result;
657                 }
658             }
659         }
660     }
661     */
662 
663     /**
664      * Checks if the argument c is to be treated as a white space in ICU
665      * rules. Usually ICU rule white spaces are ignored unless quoted.
666      * Equivalent to test for Pattern_White_Space Unicode property.
667      * Stable set of characters, won't change.
668      * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
669      * @param c codepoint to check
670      * @return true if c is a ICU white space
671      */
672     public static boolean isRuleWhiteSpace(int c)
673     {
674         /* "white space" in the sense of ICU rule parsers
675            This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
676            See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
677            U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
678            Equivalent to test for Pattern_White_Space Unicode property.
679         */
680         return (c >= 0x0009 && c <= 0x2029 &&
681                 (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
682                  c == 0x200E || c == 0x200F || c >= 0x2028));
683     }
684 
685     /**
686      * Get the the maximum values for some enum/int properties.
687      * @return maximum values for the integer properties.
688      */
689     public int getMaxValues(int column)
690     {
691        // return m_maxBlockScriptValue_;
692 
693         switch(column) {
694         case 0:
695             return m_maxBlockScriptValue_;
696         case 2:
697             return m_maxJTGValue_;
698         default:
699             return 0;
700         }
701     }
702 
703     /**
704      * Gets the type mask
705      * @param type character type
706      * @return mask
707      */
708     public static final int getMask(int type)
709     {
710         return 1 << type;
711     }
712 
713     // protected variables -----------------------------------------------
714 
715     /**
716      * Extra property trie
717      */
718     CharTrie m_additionalTrie_;
719     /**
720      * Extra property vectors, 1st column for age and second for binary
721      * properties.
722      */
723     int m_additionalVectors_[];
724     /**
725      * Number of additional columns
726      */
727     int m_additionalColumnsCount_;
728     /**
729      * Maximum values for block, bits used as in vector word
730      * 0
731      */
732     int m_maxBlockScriptValue_;
733     /**
734      * Maximum values for script, bits used as in vector word
735      * 0
736      */
737      int m_maxJTGValue_;
738     // private variables -------------------------------------------------
739 
740       /**
741      * UnicodeData.txt property object
742      */
743     private static UCharacterProperty INSTANCE_ = null;
744 
745     /**
746     * Default name of the datafile
747     */
748     private static final String   DATA_FILE_NAME_ = ICUResourceBundle.ICU_BUNDLE+"/uprops.icu";
749 
750     /**
751     * Default buffer size of datafile
752     */
753     private static final int DATA_BUFFER_SIZE_ = 25000;
754 
755     /**
756     * Numeric value shift
757     */
758     private static final int VALUE_SHIFT_ = 8;
759 
760     /**
761     * Mask to be applied after shifting to obtain an unsigned numeric value
762     */
763     private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;
764 
765     /**
766      *
767      */
768     private static final int NUMERIC_TYPE_SHIFT = 5;
769 
770     /**
771     * To get the last 5 bits out from a data type
772     */
773     private static final int LAST_5_BIT_MASK_ = 0x1F;
774 
775     /**
776     * Shift 5 bits
777     */
778     private static final int SHIFT_5_ = 5;
779     /**
780     * Shift 10 bits
781     */
782     private static final int SHIFT_10_ = 10;
783 
784     /**
785     * Shift value for lead surrogate to form a supplementary character.
786     */
787     private static final int LEAD_SURROGATE_SHIFT_ = 10;
788     /**
789     * Offset to add to combined surrogate pair to avoid msking.
790     */
791     private static final int SURROGATE_OFFSET_ =
792                            UTF16.SUPPLEMENTARY_MIN_VALUE -
793                            (UTF16.SURROGATE_MIN_VALUE <<
794                            LEAD_SURROGATE_SHIFT_) -
795                            UTF16.TRAIL_SURROGATE_MIN_VALUE;
796     /**
797     * Latin uppercase I
798     */
799     private static final char LATIN_CAPITAL_LETTER_I_ = 0x49;
800     /**
801     * Combining dot above
802     */
803     private static final char COMBINING_DOT_ABOVE_ = 0x307;
804     /**
805     * LATIN SMALL LETTER J
806     */
807     private static final int LATIN_SMALL_LETTER_J_ = 0x6a;
808     /**
809     * LATIN SMALL LETTER I WITH OGONEK
810     */
811     private static final int LATIN_SMALL_LETTER_I_WITH_OGONEK_ = 0x12f;
812     /**
813     * LATIN SMALL LETTER I WITH TILDE BELOW
814     */
815     private static final int LATIN_SMALL_LETTER_I_WITH_TILDE_BELOW_ = 0x1e2d;
816     /**
817     * LATIN SMALL LETTER I WITH DOT BELOW
818     */
819     private static final int LATIN_SMALL_LETTER_I_WITH_DOT_BELOW_ = 0x1ecb;
820     /**
821     * Combining class for combining mark above
822     */
823     private static final int COMBINING_MARK_ABOVE_CLASS_ = 230;
824 
825     /**
826     * LATIN CAPITAL LETTER J
827     */
828     private static final int LATIN_CAPITAL_LETTER_J_ = 0x4a;
829 
830     /**
831     * LATIN CAPITAL LETTER I WITH OGONEK
832     */
833     private static final int LATIN_CAPITAL_I_WITH_OGONEK_ = 0x12e;
834     /**
835     * LATIN CAPITAL LETTER I WITH TILDE
836     */
837     private static final int LATIN_CAPITAL_I_WITH_TILDE_ = 0x128;
838     /**
839     * LATIN CAPITAL LETTER I WITH GRAVE
840     */
841     private static final int LATIN_CAPITAL_I_WITH_GRAVE_ = 0xcc;
842     /**
843     * LATIN CAPITAL LETTER I WITH ACUTE
844     */
845     private static final int LATIN_CAPITAL_I_WITH_ACUTE_ = 0xcd;
846     /**
847     * COMBINING GRAVE ACCENT
848     */
849     private static final int COMBINING_GRAVE_ACCENT_ = 0x300;
850     /**
851     * COMBINING ACUTE ACCENT
852     */
853     private static final int COMBINING_ACUTE_ACCENT_ = 0x301;
854     /**
855     * COMBINING TILDE
856     */
857     private static final int COMBINING_TILDE_ = 0x303;
858     /**
859     * Greek capital letter sigma
860     */
861     private static final char GREEK_CAPITAL_LETTER_SIGMA_ = 0x3a3;
862     /**
863     * Greek small letter sigma
864     */
865     private static final char GREEK_SMALL_LETTER_SIGMA_ = 0x3c3;
866     /**
867     * Greek small letter rho
868     */
869     private static final char GREEK_SMALL_LETTER_RHO_ = 0x3c2;
870     /**
871     * Hyphens
872     */
873     private static final int HYPHEN_      = 0x2010;
874     private static final int SOFT_HYPHEN_ = 0xAD;
875     /**
876     * To get the last character out from a data type
877     */
878     private static final int LAST_CHAR_MASK_ = 0xFFFF;
879     /**
880     * To get the last byte out from a data type
881     */
882     private static final int LAST_BYTE_MASK_ = 0xFF;
883     /**
884     * Shift 16 bits
885     */
886     private static final int SHIFT_16_ = 16;
887 
888     // additional properties ----------------------------------------------
889 
890     /**
891      * Additional properties used in internal trie data
892      */
893     /*
894      * Properties in vector word 1
895      * Each bit encodes one binary property.
896      * The following constants represent the bit number, use 1<<UPROPS_XYZ.
897      * UPROPS_BINARY_1_TOP<=32!
898      *
899      * Keep this list of property enums in sync with
900      * propListNames[] in icu/source/tools/genprops/props2.c!
901      *
902      * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
903      */
904     private static final int WHITE_SPACE_PROPERTY_ = 0;
905     private static final int BIDI_CONTROL_PROPERTY_ = 1;
906     private static final int JOIN_CONTROL_PROPERTY_ = 2;
907     private static final int DASH_PROPERTY_ = 3;
908     private static final int HYPHEN_PROPERTY_ = 4;
909     private static final int QUOTATION_MARK_PROPERTY_ = 5;
910     private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 6;
911     private static final int MATH_PROPERTY_ = 7;
912     private static final int HEX_DIGIT_PROPERTY_ = 8;
913     private static final int ASCII_HEX_DIGIT_PROPERTY_ = 9;
914     private static final int ALPHABETIC_PROPERTY_ = 10;
915     private static final int IDEOGRAPHIC_PROPERTY_ = 11;
916     private static final int DIACRITIC_PROPERTY_ = 12;
917     private static final int EXTENDER_PROPERTY_ = 13;
918     private static final int LOWERCASE_PROPERTY_ = 14;
919     private static final int UPPERCASE_PROPERTY_ = 15;
920     private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 16;
921     private static final int GRAPHEME_EXTEND_PROPERTY_ = 17;
922     private static final int GRAPHEME_LINK_PROPERTY_ = 18;
923     private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 19;
924     private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 20;
925     private static final int RADICAL_PROPERTY_ = 21;
926     private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 22;
927     private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 23;
928     private static final int DEPRECATED_PROPERTY_ = 24;
929     private static final int SOFT_DOTTED_PROPERTY_ = 25;
930     private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 26;
931     private static final int XID_START_PROPERTY_ = 27;
932     private static final int XID_CONTINUE_PROPERTY_ = 28;
933     private static final int ID_START_PROPERTY_    = 29;
934     private static final int ID_CONTINUE_PROPERTY_ = 30;
935     private static final int GRAPHEME_BASE_PROPERTY_ = 31;
936     private static final int BINARY_1_TOP_PROPERTY_ = 32;
937 
938     /**
939      * First nibble shift
940      */
941     private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
942     /**
943      * Second nibble mask
944      */
945     private static final int LAST_NIBBLE_MASK_ = 0xF;
946     /**
947      * Age value shift
948      */
949     private static final int AGE_SHIFT_ = 24;
950 
951     // boolean properties in vector word 2
952     private static final int V2_S_TERM_PROPERTY_ = 24;
953     private static final int V2_VARIATION_SELECTOR_PROPERTY_ = 25;
954     private static final int V2_PATTERN_SYNTAX = 26;                   /* new in ICU 3.4 and Unicode 4.1 */
955     private static final int V2_PATTERN_WHITE_SPACE = 27;
956 
957     // private constructors --------------------------------------------------
958 
959     /**
960     * Constructor
961     * @exception IOException thrown when data reading fails or data corrupted
962     */
963     private UCharacterProperty() throws IOException  
964     {
965         // jar access
966         InputStream   is = ICUData.getRequiredStream(DATA_FILE_NAME_);
967         BufferedInputStream   b = new BufferedInputStream  (is, DATA_BUFFER_SIZE_);
968         UCharacterPropertyReader reader = new UCharacterPropertyReader(b);
969         reader.read(this);
970         b.close();
971 
972         m_trie_.putIndexData(this);
973     }
974 
975     // private methods -------------------------------------------------------
976 
977     /*
978      * Compare additional properties to see if it has argument type
979      * @param property 32 bit properties
980      * @param type character type
981      * @return true if property has type
982      */
983     /*private boolean compareAdditionalType(int property, int type)
984     {
985         return (property & (1 << type)) != 0;
986     }*/
987 
988     // property starts for UnicodeSet -------------------------------------- ***
989 
990     private static final int TAB     = 0x0009;
991     private static final int LF      = 0x000a;
992     private static final int FF      = 0x000c;
993     private static final int CR      = 0x000d;
994     private static final int U_A     = 0x0041;
995     private static final int U_F     = 0x0046;
996     private static final int U_Z     = 0x005a;
997     private static final int U_a     = 0x0061;
998     private static final int U_f     = 0x0066;
999     private static final int U_z     = 0x007a;
1000    private static final int DEL     = 0x007f;
1001    private static final int NL      = 0x0085;
1002    private static final int NBSP    = 0x00a0;
1003    private static final int CGJ     = 0x034f;
1004    private static final int FIGURESP= 0x2007;
1005    private static final int HAIRSP  = 0x200a;
1006    private static final int ZWNJ    = 0x200c;
1007    private static final int ZWJ     = 0x200d;
1008    private static final int RLM     = 0x200f;
1009    private static final int NNBSP   = 0x202f;
1010    private static final int WJ      = 0x2060;
1011    private static final int INHSWAP = 0x206a;
1012    private static final int NOMDIG  = 0x206f;
1013    private static final int U_FW_A  = 0xff21;
1014    private static final int U_FW_F  = 0xff26;
1015    private static final int U_FW_Z  = 0xff3a;
1016    private static final int U_FW_a  = 0xff41;
1017    private static final int U_FW_f  = 0xff46;
1018    private static final int U_FW_z  = 0xff5a;
1019    private static final int ZWNBSP  = 0xfeff;
1020
1021    /* for Hangul_Syllable_Type */
1022    public void uhst_addPropertyStarts(UnicodeSet set) {
1023        /* add code points with hardcoded properties, plus the ones following them */
1024
1025        /*
1026         * Add Jamo type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE.
1027         * First, we add fixed boundaries for the blocks of Jamos.
1028         * Then we check in loops to see where the current Unicode version
1029         * actually stops assigning such Jamos. We start each loop
1030         * at the end of the per-Jamo-block assignments in Unicode 4 or earlier.
1031         * (These have not changed since Unicode 2.)
1032         */
1033        int c, value, value2;
1034
1035        set.add(0x1100);
1036        value=UCharacter.HangulSyllableType.LEADING_JAMO;
1037        for(c=0x115a; c<=0x115f; ++c) {
1038            value2= UCharacter.getIntPropertyValue(c, UProperty.HANGUL_SYLLABLE_TYPE);
1039            if(value!=value2) {
1040                value=value2;
1041                set.add(c);
1042            }
1043        }
1044
1045        set.add(0x1160);
1046        value=UCharacter.HangulSyllableType.VOWEL_JAMO;
1047        for(c=0x11a3; c<=0x11a7; ++c) {
1048            value2=UCharacter.getIntPropertyValue(c, UProperty.HANGUL_SYLLABLE_TYPE);
1049            if(value!=value2) {
1050                value=value2;
1051                set.add(c);
1052            }
1053        }
1054
1055        set.add(0x11a8);
1056        value=UCharacter.HangulSyllableType.TRAILING_JAMO;
1057        for(c=0x11fa; c<=0x11ff; ++c) {
1058            value2=UCharacter.getIntPropertyValue(c, UProperty.HANGUL_SYLLABLE_TYPE);
1059            if(value!=value2) {
1060                value=value2;
1061                set.add(c);
1062            }
1063        }
1064    }
1065
1066    public UnicodeSet addPropertyStarts(UnicodeSet set) {
1067        /* add the start code point of each same-value range of the main trie */
1068        TrieIterator propsIter = new TrieIterator(m_trie_);
1069        RangeValueIterator.Element propsResult = new RangeValueIterator.Element();
1070          while(propsIter.next(propsResult)){
1071            set.add(propsResult.start);
1072        }
1073
1074        /* add code points with hardcoded properties, plus the ones following them */
1075
1076        /* add for u_isblank() */
1077        set.add(TAB);
1078        set.add(TAB+1);
1079
1080        /* add for IS_THAT_CONTROL_SPACE() */
1081        set.add(CR+1); /* range TAB..CR */
1082        set.add(0x1c);
1083        set.add(0x1f+1);
1084        set.add(NL);
1085        set.add(NL+1);
1086
1087        /* add for u_isIDIgnorable() what was not added above */
1088        set.add(DEL); /* range DEL..NBSP-1, NBSP added below */
1089        set.add(HAIRSP);
1090        set.add(RLM+1);
1091        set.add(INHSWAP);
1092        set.add(NOMDIG+1);
1093        set.add(ZWNBSP);
1094        set.add(ZWNBSP+1);
1095
1096        /* add no-break spaces for u_isWhitespace() what was not added above */
1097        set.add(NBSP);
1098        set.add(NBSP+1);
1099        set.add(FIGURESP);
1100        set.add(FIGURESP+1);
1101        set.add(NNBSP);
1102        set.add(NNBSP+1);
1103
1104        /* add for u_charDigitValue() */
1105        // TODO remove when UCharacter.getHanNumericValue() is changed to just return
1106        // Unicode numeric values 
1107        set.add(0x3007);
1108        set.add(0x3008);
1109        set.add(0x4e00);
1110        set.add(0x4e01);
1111        set.add(0x4e8c);
1112        set.add(0x4e8d);
1113        set.add(0x4e09);
1114        set.add(0x4e0a);
1115        set.add(0x56db);
1116        set.add(0x56dc);
1117        set.add(0x4e94);
1118        set.add(0x4e95);
1119        set.add(0x516d);
1120        set.add(0x516e);
1121        set.add(0x4e03);
1122        set.add(0x4e04);
1123        set.add(0x516b);
1124        set.add(0x516c);
1125        set.add(0x4e5d);
1126        set.add(0x4e5e);
1127
1128        /* add for u_digit() */
1129        set.add(U_a);
1130        set.add(U_z+1);
1131        set.add(U_A);
1132        set.add(U_Z+1);
1133        set.add(U_FW_a);
1134        set.add(U_FW_z+1);
1135        set.add(U_FW_A);
1136        set.add(U_FW_Z+1);
1137
1138        /* add for u_isxdigit() */
1139        set.add(U_f+1);
1140        set.add(U_F+1);
1141        set.add(U_FW_f+1);
1142        set.add(U_FW_F+1);
1143
1144        /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
1145        set.add(WJ); /* range WJ..NOMDIG */
1146        set.add(0xfff0);
1147        set.add(0xfffb+1);
1148        set.add(0xe0000);
1149        set.add(0xe0fff+1);
1150
1151        /* add for UCHAR_GRAPHEME_BASE and others */
1152        set.add(CGJ);
1153        set.add(CGJ+1);
1154
1155        return set; // for chaining
1156    }
1157
1158    public void upropsvec_addPropertyStarts(UnicodeSet set) {
1159        /* add the start code point of each same-value range of the properties vectors trie */
1160        if(m_additionalColumnsCount_>0) {
1161            /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
1162            TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_);
1163            RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();
1164            while(propsVectorsIter.next(propsVectorsResult)){
1165                set.add(propsVectorsResult.start);
1166            }
1167        }
1168    }
1169
1170/*----------------------------------------------------------------
1171 * Inclusions list
1172 *----------------------------------------------------------------*/
1173
1174    /*
1175     * Return a set of characters for property enumeration.
1176     * The set implicitly contains 0x110000 as well, which is one more than the highest
1177     * Unicode code point.
1178     *
1179     * This set is used as an ordered list - its code points are ordered, and
1180     * consecutive code points (in Unicode code point order) in the set define a range.
1181     * For each two consecutive characters (start, limit) in the set,
1182     * all of the UCD/normalization and related properties for
1183     * all code points start..limit-1 are all the same,
1184     * except for character names and ISO comments.
1185     *
1186     * All Unicode code points U+0000..U+10ffff are covered by these ranges.
1187     * The ranges define a partition of the Unicode code space.
1188     * ICU uses the inclusions set to enumerate properties for generating
1189     * UnicodeSets containing all code points that have a certain property value.
1190     *
1191     * The Inclusion List is generated from the UCD. It is generated
1192     * by enumerating the data tries, and code points for hardcoded properties
1193     * are added as well.
1194     *
1195     * --------------------------------------------------------------------------
1196     *
1197     * The following are ideas for getting properties-unique code point ranges,
1198     * with possible optimizations beyond the current implementation.
1199     * These optimizations would require more code and be more fragile.
1200     * The current implementation generates one single list (set) for all properties.
1201     *
1202     * To enumerate properties efficiently, one needs to know ranges of
1203     * repetitive values, so that the value of only each start code point
1204     * can be applied to the whole range.
1205     * This information is in principle available in the uprops.icu/unorm.icu data.
1206     *
1207     * There are two obstacles:
1208     *
1209     * 1. Some properties are computed from multiple data structures,
1210     *    making it necessary to get repetitive ranges by intersecting
1211     *    ranges from multiple tries.
1212     *
1213     * 2. It is not economical to write code for getting repetitive ranges
1214     *    that are precise for each of some 50 properties.
1215     *
1216     * Compromise ideas:
1217     *
1218     * - Get ranges per trie, not per individual property.
1219     *   Each range contains the same values for a whole group of properties.
1220     *   This would generate currently five range sets, two for uprops.icu tries
1221     *   and three for unorm.icu tries.
1222     *
1223     * - Combine sets of ranges for multiple tries to get sufficient sets
1224     *   for properties, e.g., the uprops.icu main and auxiliary tries
1225     *   for all non-normalization properties.
1226     *
1227     * Ideas for representing ranges and combining them:
1228     *
1229     * - A UnicodeSet could hold just the start code points of ranges.
1230     *   Multiple sets are easily combined by or-ing them together.
1231     *
1232     * - Alternatively, a UnicodeSet could hold each even-numbered range.
1233     *   All ranges could be enumerated by using each start code point
1234     *   (for the even-numbered ranges) as well as each limit (end+1) code point
1235     *   (for the odd-numbered ranges).
1236     *   It should be possible to combine two such sets by xor-ing them,
1237     *   but no more than two.
1238     *
1239     * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays,
1240     * but the first one is certainly simpler and applicable for combining more than
1241     * two range sets.
1242     *
1243     * It is possible to combine all range sets for all uprops/unorm tries into one
1244     * set that can be used for all properties.
1245     * As an optimization, there could be less-combined range sets for certain
1246     * groups of properties.
1247     * The relationship of which less-combined range set to use for which property
1248     * depends on the implementation of the properties and must be hardcoded
1249     * - somewhat error-prone and higher maintenance but can be tested easily
1250     * by building property sets "the simple way" in test code.
1251     *
1252     * ---
1253     *
1254     * Do not use a UnicodeSet pattern because that causes infinite recursion;
1255     * UnicodeSet depends on the inclusions set.
1256     *
1257     * ---
1258     *
1259     * getInclusions() is commented out starting 2005-feb-12 because
1260     * UnicodeSet now calls the uxyz_addPropertyStarts() directly,
1261     * and only for the relevant property source.
1262     */
1263    /*
1264    public UnicodeSet getInclusions() {
1265        UnicodeSet set = new UnicodeSet();
1266        NormalizerImpl.addPropertyStarts(set);
1267        addPropertyStarts(set);
1268        return set;
1269    }
1270    */
1271}
1272
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags