KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > java > lang > ConditionalSpecialCasing


1 /*
2  * @(#)ConditionalSpecialCasing.java 1.4 04/09/14
3  *
4  * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
5  * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
6  */

7
8 package java.lang;
9
10 import java.text.BreakIterator JavaDoc;
11 import java.util.HashSet JavaDoc;
12 import java.util.Hashtable JavaDoc;
13 import java.util.Iterator JavaDoc;
14 import java.util.Locale JavaDoc;
15 import sun.text.Normalizer;
16
17
18 /**
19  * This is a utility class for <code>String.toLowerCase()</code> and
20  * <code>String.toUpperCase()</code>, that handles special casing with
21  * conditions. In other words, it handles the mappings with conditions
22  * that are defined in
23  * <a HREF="http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt">Special
24  * Casing Properties</a> file.
25  * <p>
26  * Note that the unconditional case mappings (including 1:M mappings)
27  * are handled in <code>Character.toLower/UpperCase()</code>.
28  */

29 final class ConditionalSpecialCasing {
30
31     // context conditions.
32
final static int FINAL_CASED = 1;
33     final static int AFTER_SOFT_DOTTED = 2;
34     final static int MORE_ABOVE = 3;
35     final static int AFTER_I = 4;
36     final static int NOT_BEFORE_DOT = 5;
37
38     // combining class definitions
39
final static int COMBINING_CLASS_ABOVE = 230;
40
41     // Special case mapping entries
42
static Entry[] entry = {
43     //# ================================================================================
44
//# Conditional mappings
45
//# ================================================================================
46
new Entry(0x03A3, new char[]{0x03C2}, new char[]{0x03A3}, null, FINAL_CASED), // # GREEK CAPITAL LETTER SIGMA
47

48     //# ================================================================================
49
//# Locale-sensitive mappings
50
//# ================================================================================
51
//# Lithuanian
52
new Entry(0x0307, new char[]{0x0307}, new char[]{}, "lt", AFTER_SOFT_DOTTED), // # COMBINING DOT ABOVE
53
new Entry(0x0049, new char[]{0x0069, 0x0307}, new char[]{0x0049}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I
54
new Entry(0x004A, new char[]{0x006A, 0x0307}, new char[]{0x004A}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER J
55
new Entry(0x012E, new char[]{0x012F, 0x0307}, new char[]{0x012E}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I WITH OGONEK
56
new Entry(0x00CC, new char[]{0x0069, 0x0307, 0x0300}, new char[]{0x00CC}, "lt", 0), // # LATIN CAPITAL LETTER I WITH GRAVE
57
new Entry(0x00CD, new char[]{0x0069, 0x0307, 0x0301}, new char[]{0x00CD}, "lt", 0), // # LATIN CAPITAL LETTER I WITH ACUTE
58
new Entry(0x0128, new char[]{0x0069, 0x0307, 0x0303}, new char[]{0x0128}, "lt", 0), // # LATIN CAPITAL LETTER I WITH TILDE
59

60     //# ================================================================================
61
//# Turkish and Azeri
62
// new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
63
// new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "az", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
64
new Entry(0x0307, new char[]{}, new char[]{0x0307}, "tr", AFTER_I), // # COMBINING DOT ABOVE
65
new Entry(0x0307, new char[]{}, new char[]{0x0307}, "az", AFTER_I), // # COMBINING DOT ABOVE
66
new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "tr", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
67
new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "az", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
68
new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN SMALL LETTER I
69
new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "az", 0) // # LATIN SMALL LETTER I
70
};
71
72     // A hash table that contains the above entries
73
static Hashtable JavaDoc entryTable = new Hashtable JavaDoc();
74     static {
75     // create hashtable from the entry
76
for (int i = 0; i < entry.length; i ++) {
77         Entry cur = entry[i];
78         Integer JavaDoc cp = new Integer JavaDoc(cur.getCodePoint());
79         HashSet JavaDoc set = (HashSet JavaDoc)entryTable.get(cp);
80         if (set == null) {
81         set = new HashSet JavaDoc();
82         }
83         set.add(cur);
84         entryTable.put(cp, set);
85     }
86     }
87     
88     static int toLowerCaseEx(String JavaDoc src, int index, Locale JavaDoc locale) {
89         char[] result = lookUpTable(src, index, locale, true);
90
91     if (result != null) {
92         if (result.length == 1) {
93         return result[0];
94         } else {
95         return Character.ERROR;
96         }
97     } else {
98         // default to Character class' one
99
return Character.toLowerCase(src.codePointAt(index));
100     }
101     }
102
103     static int toUpperCaseEx(String JavaDoc src, int index, Locale JavaDoc locale) {
104         char[] result = lookUpTable(src, index, locale, false);
105
106     if (result != null) {
107         if (result.length == 1) {
108         return result[0];
109         } else {
110         return Character.ERROR;
111         }
112     } else {
113         // default to Character class' one
114
return Character.toUpperCaseEx(src.codePointAt(index));
115     }
116     }
117
118     static char[] toLowerCaseCharArray(String JavaDoc src, int index, Locale JavaDoc locale) {
119         return lookUpTable(src, index, locale, true);
120     }
121
122     static char[] toUpperCaseCharArray(String JavaDoc src, int index, Locale JavaDoc locale) {
123         char[] result = lookUpTable(src, index, locale, false);
124     if (result != null) {
125         return result;
126     } else {
127         return Character.toUpperCaseCharArray(src.codePointAt(index));
128     }
129     }
130
131     private static char[] lookUpTable(String JavaDoc src, int index, Locale JavaDoc locale, boolean bLowerCasing) {
132     HashSet JavaDoc set = (HashSet JavaDoc)entryTable.get(new Integer JavaDoc(src.codePointAt(index)));
133
134     if (set != null) {
135         Iterator JavaDoc iter = set.iterator();
136         String JavaDoc currentLang = locale.getLanguage();
137         while (iter.hasNext()) {
138         Entry entry = (Entry)iter.next();
139         String JavaDoc conditionLang= entry.getLanguage();
140         if (((conditionLang == null) || (conditionLang.equals(currentLang))) &&
141             isConditionMet(src, index, locale, entry.getCondition())) {
142             return (bLowerCasing ? entry.getLowerCase() : entry.getUpperCase());
143         }
144         }
145     }
146
147     return null;
148     }
149
150     private static boolean isConditionMet(String JavaDoc src, int index, Locale JavaDoc locale, int condition) {
151     switch (condition) {
152     case FINAL_CASED:
153         return isFinalCased(src, index, locale);
154
155     case AFTER_SOFT_DOTTED:
156         return isAfterSoftDotted(src, index);
157
158     case MORE_ABOVE:
159         return isMoreAbove(src, index);
160
161     case AFTER_I:
162         return isAfterI(src, index);
163
164     case NOT_BEFORE_DOT:
165         return !isBeforeDot(src, index);
166
167     default:
168         return true;
169     }
170     }
171
172     /**
173      * Implements the "Final_Cased" condition
174      *
175      * Specification: Within the closest word boundaries containing C, there is a cased
176      * letter before C, and there is no cased letter after C.
177      *
178      * Regular Expression:
179      * Before C: [{cased==true}][{wordBoundary!=true}]*
180      * After C: !([{wordBoundary!=true}]*[{cased}])
181      */

182     private static boolean isFinalCased(String JavaDoc src, int index, Locale JavaDoc locale) {
183     BreakIterator JavaDoc wordBoundary = BreakIterator.getWordInstance(locale);
184     wordBoundary.setText(src);
185     int ch;
186
187     // Look for a preceding 'cased' letter
188
for (int i = index; (i >= 0) && !wordBoundary.isBoundary(i);
189         i -= Character.charCount(ch)) {
190
191         ch = src.codePointBefore(i);
192         if (isCased(ch)) {
193
194         int len = src.length();
195         // Check that there is no 'cased' letter after the index
196
for (i = index + Character.charCount(src.codePointAt(index));
197             (i < len) && !wordBoundary.isBoundary(i);
198             i += Character.charCount(ch)) {
199
200             ch = src.codePointAt(i);
201             if (isCased(ch)) {
202             return false;
203             }
204         }
205
206         return true;
207         }
208     }
209
210     return false;
211     }
212
213     /**
214      * Implements the "After_I" condition
215      *
216      * Specification: The last preceding base character was an uppercase I,
217      * and there is no intervening combining character class 230 (ABOVE).
218      *
219      * Regular Expression:
220      * Before C: [I]([{cc!=230}&{cc!=0}])*
221      */

222     private static boolean isAfterI(String JavaDoc src, int index) {
223     int ch;
224     int cc;
225
226     // Look for the last preceding base character
227
for (int i = index; i > 0; i -= Character.charCount(ch)) {
228
229         ch = src.codePointBefore(i);
230
231         if (ch == 'I') {
232         return true;
233         } else {
234         cc = Normalizer.getClass(ch);
235         if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
236             return false;
237         }
238         }
239     }
240
241     return false;
242     }
243
244     /**
245      * Implements the "After_Soft_Dotted" condition
246      *
247      * Specification: The last preceding character with combining class
248      * of zero before C was Soft_Dotted, and there is no intervening
249      * combining character class 230 (ABOVE).
250      *
251      * Regular Expression:
252      * Before C: [{Soft_Dotted==true}]([{cc!=230}&{cc!=0}])*
253      */

254     private static boolean isAfterSoftDotted(String JavaDoc src, int index) {
255     int ch;
256     int cc;
257
258     // Look for the last preceding character
259
for (int i = index; i > 0; i -= Character.charCount(ch)) {
260
261         ch = src.codePointBefore(i);
262
263         if (isSoftDotted(ch)) {
264         return true;
265         } else {
266         cc = Normalizer.getClass(ch);
267         if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
268             return false;
269         }
270         }
271     }
272
273     return false;
274     }
275
276     /**
277      * Implements the "More_Above" condition
278      *
279      * Specification: C is followed by one or more characters of combining
280      * class 230 (ABOVE) in the combining character sequence.
281      *
282      * Regular Expression:
283      * After C: [{cc!=0}]*[{cc==230}]
284      */

285     private static boolean isMoreAbove(String JavaDoc src, int index) {
286     int ch;
287     int cc;
288     int len = src.length();
289
290     // Look for a following ABOVE combining class character
291
for (int i = index + Character.charCount(src.codePointAt(index));
292         i < len; i += Character.charCount(ch)) {
293         
294         ch = src.codePointAt(i);
295         cc = Normalizer.getClass(ch);
296
297         if (cc == COMBINING_CLASS_ABOVE) {
298         return true;
299         } else if (cc == 0) {
300         return false;
301         }
302     }
303
304     return false;
305     }
306
307     /**
308      * Implements the "Before_Dot" condition
309      *
310      * Specification: C is followed by <code>U+0307 COMBINING DOT ABOVE</code>.
311      * Any sequence of characters with a combining class that is
312      * neither 0 nor 230 may intervene between the current character
313      * and the combining dot above.
314      *
315      * Regular Expression:
316      * After C: ([{cc!=230}&{cc!=0}])*[?]
317      */

318     private static boolean isBeforeDot(String JavaDoc src, int index) {
319     int ch;
320     int cc;
321     int len = src.length();
322
323     // Look for a following COMBINING DOT ABOVE
324
for (int i = index + Character.charCount(src.codePointAt(index));
325         i < len; i += Character.charCount(ch)) {
326         
327         ch = src.codePointAt(i);
328
329         if (ch == '\u0307') {
330         return true;
331         } else {
332         cc = Normalizer.getClass(ch);
333         if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
334             return false;
335         }
336         }
337     }
338
339     return false;
340     }
341
342     /**
343      * Examines whether a character is 'cased'.
344      *
345      * A character C is defined to be 'cased' if and only if at least one of
346      * following are true for C: uppercase==true, or lowercase==true, or
347      * general_category==titlecase_letter.
348      *
349      * The uppercase and lowercase property values are specified in the data
350      * file DerivedCoreProperties.txt in the Unicode Character Database.
351      */

352     private static boolean isCased(int ch) {
353     int type = Character.getType(ch);
354     if (type == Character.LOWERCASE_LETTER ||
355         type == Character.UPPERCASE_LETTER ||
356         type == Character.TITLECASE_LETTER) {
357         return true;
358     } else {
359         // Check for Other_Lowercase and Other_Uppercase
360
//
361
if ((ch >= 0x02B0) && (ch <= 0x02B8)) {
362         // MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y
363
return true;
364         } else if ((ch >= 0x02C0) && (ch <= 0x02C1)) {
365         // MODIFIER LETTER GLOTTAL STOP..MODIFIER LETTER REVERSED GLOTTAL STOP
366
return true;
367         } else if ((ch >= 0x02E0) && (ch <= 0x02E4)) {
368         // MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
369
return true;
370         } else if (ch == 0x0345) {
371         // COMBINING GREEK YPOGEGRAMMENI
372
return true;
373         } else if (ch == 0x037A) {
374         // GREEK YPOGEGRAMMENI
375
return true;
376         } else if ((ch >= 0x1D2C) && (ch <= 0x1D61)) {
377         // MODIFIER LETTER CAPITAL A..MODIFIER LETTER SMALL CHI
378
return true;
379         } else if ((ch >= 0x2160) && (ch <= 0x217F)) {
380         // ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND
381
// SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND
382
return true;
383         } else if ((ch >= 0x24B6) && (ch <= 0x24E9)) {
384         // CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z
385
// CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z
386
return true;
387         } else {
388         return false;
389         }
390     }
391     }
392
393     private static boolean isSoftDotted(int ch) {
394     switch (ch) {
395     case 0x0069: // Soft_Dotted # L& LATIN SMALL LETTER I
396
case 0x006A: // Soft_Dotted # L& LATIN SMALL LETTER J
397
case 0x012F: // Soft_Dotted # L& LATIN SMALL LETTER I WITH OGONEK
398
case 0x0268: // Soft_Dotted # L& LATIN SMALL LETTER I WITH STROKE
399
case 0x0456: // Soft_Dotted # L& CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
400
case 0x0458: // Soft_Dotted # L& CYRILLIC SMALL LETTER JE
401
case 0x1D62: // Soft_Dotted # L& LATIN SUBSCRIPT SMALL LETTER I
402
case 0x1E2D: // Soft_Dotted # L& LATIN SMALL LETTER I WITH TILDE BELOW
403
case 0x1ECB: // Soft_Dotted # L& LATIN SMALL LETTER I WITH DOT BELOW
404
case 0x2071: // Soft_Dotted # L& SUPERSCRIPT LATIN SMALL LETTER I
405
return true;
406     default:
407         return false;
408     }
409     }
410
411     /**
412      * An internal class that represents an entry in the Special Casing Properties.
413      */

414     static class Entry {
415     int ch;
416     char [] lower;
417     char [] upper;
418     String JavaDoc lang;
419     int condition;
420
421     Entry(int ch, char[] lower, char[] upper, String JavaDoc lang, int condition) {
422         this.ch = ch;
423         this.lower = lower;
424         this.upper = upper;
425         this.lang = lang;
426         this.condition = condition;
427     }
428
429     int getCodePoint() {
430         return ch;
431     }
432
433     char[] getLowerCase() {
434         return lower;
435     }
436
437     char[] getUpperCase() {
438         return upper;
439     }
440
441     String JavaDoc getLanguage() {
442         return lang;
443     }
444     
445     int getCondition() {
446         return condition;
447     }
448     }
449 }
450
Popular Tags