KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > jivesoftware > stringprep > NFKC


/**
 * Copyright (C) 2004 Free Software Foundation, Inc.
 *
 * Author: Oliver Hitz
 *
 * This file is part of GNU Libidn.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */

23
24 package org.jivesoftware.stringprep;
25
26 class NFKC {
27     /**
28      * Applies NFKC normalization to a string.
29      *
30      * @param in The string to normalize.
31      * @return An NFKC normalized string.
32      */

33     public static String JavaDoc normalizeNFKC(String JavaDoc in) {
34         StringBuilder JavaDoc out = new StringBuilder JavaDoc();
35
36         for (int i = 0; i < in.length(); i++) {
37             char code = in.charAt(i);
38
39             // In Unicode 3.0, Hangul was defined as the block from U+AC00
40
// to U+D7A3, however, since Unicode 3.2 the block extends until
41
// U+D7AF. The decomposeHangul function only decomposes until
42
// U+D7A3. Should this be changed?
43
if (code >= 0xAC00 && code <= 0xD7AF) {
44                 out.append(decomposeHangul(code));
45             } else {
46                 int index = decomposeIndex(code);
47                 if (index == -1) {
48                     out.append(code);
49                 } else {
50                     out.append(DecompositionMappings.m[index]);
51                 }
52             }
53         }
54
55         // Bring the stringbuffer into canonical order.
56
canonicalOrdering(out);
57
58         // Do the canonical composition.
59
int last_cc = 0;
60         int last_start = 0;
61
62         for (int i = 0; i < out.length(); i++) {
63             int cc = combiningClass(out.charAt(i));
64
65             if (i > 0 && (last_cc == 0 || last_cc != cc)) {
66                 // Try to combine characters
67
char a = out.charAt(last_start);
68                 char b = out.charAt(i);
69
70                 int c = compose(a, b);
71
72                 if (c != -1) {
73                     out.setCharAt(last_start, (char) c);
74                     out.deleteCharAt(i);
75                     i--;
76
77                     if (i == last_start) {
78                         last_cc = 0;
79                     } else {
80                         last_cc = combiningClass(out.charAt(i - 1));
81                     }
82                     continue;
83                 }
84             }
85
86             if (cc == 0) {
87                 last_start = i;
88             }
89
90             last_cc = cc;
91         }
92
93         return out.toString();
94     }
95
96
97     /**
98      * Returns the index inside the decomposition table, implemented
99      * using a binary search.
100      *
101      * @param c Character to look up.
102      * @return Index if found, -1 otherwise.
103      */

104     static int decomposeIndex(char c) {
105         int start = 0;
106         int end = DecompositionKeys.k.length / 2;
107
108         while (true) {
109             int half = (start + end) / 2;
110             int code = DecompositionKeys.k[half * 2];
111
112             if (c == code) {
113                 return DecompositionKeys.k[half * 2 + 1];
114             }
115             if (half == start) {
116                 // Character not found
117
return -1;
118             } else if (c > code) {
119                 start = half;
120             } else {
121                 end = half;
122             }
123         }
124     }
125
126     /**
127      * Returns the combining class of a given character.
128      *
129      * @param c The character.
130      * @return The combining class.
131      */

132     static int combiningClass(char c) {
133         int h = c >> 8;
134         int l = c & 0xff;
135
136         int i = CombiningClass.i[h];
137         if (i > -1) {
138             return CombiningClass.c[i][l];
139         } else {
140             return 0;
141         }
142     }
143
144     /**
145      * Rearranges characters in a stringbuffer in order to respect the
146      * canonical ordering properties.
147      *
148      * @param in the StringBuilder to rearrange.
149      */

150     static void canonicalOrdering(StringBuilder JavaDoc in) {
151         boolean isOrdered = false;
152
153         while (!isOrdered) {
154             isOrdered = true;
155
156             int lastCC = combiningClass(in.charAt(0));
157
158             for (int i = 0; i < in.length() - 1; i++) {
159                 int nextCC = combiningClass(in.charAt(i + 1));
160                 if (nextCC != 0 && lastCC > nextCC) {
161                     for (int j = i + 1; j > 0; j--) {
162                         if (combiningClass(in.charAt(j - 1)) <= nextCC) {
163                             break;
164                         }
165                         char t = in.charAt(j);
166                         in.setCharAt(j, in.charAt(j - 1));
167                         in.setCharAt(j - 1, t);
168                         isOrdered = false;
169                     }
170                     nextCC = lastCC;
171                 }
172                 lastCC = nextCC;
173             }
174         }
175     }
176
177     /**
178      * Returns the index inside the composition table.
179      *
180      * @param a Character to look up.
181      * @return Index if found, -1 otherwise.
182      */

183     static int composeIndex(char a) {
184         if (a >> 8 >= Composition.composePage.length) {
185             return -1;
186         }
187         int ap = Composition.composePage[a >> 8];
188         if (ap == -1) {
189             return -1;
190         }
191         return Composition.composeData[ap][a & 0xff];
192     }
193
194     /**
195      * Tries to compose two characters canonically.
196      *
197      * @param a First character.
198      * @param b Second character.
199      * @return The composed character or -1 if no composition could be
200      * found.
201      */

202     static int compose(char a, char b) {
203         int h = composeHangul(a, b);
204         if (h != -1) {
205             return h;
206         }
207
208         int ai = composeIndex(a);
209
210         if (ai >= Composition.singleFirstStart && ai < Composition.singleSecondStart) {
211             if (b == Composition.singleFirst[ai - Composition.singleFirstStart][0]) {
212                 return Composition.singleFirst[ai - Composition.singleFirstStart][1];
213             } else {
214                 return -1;
215             }
216         }
217
218         int bi = composeIndex(b);
219
220         if (bi >= Composition.singleSecondStart) {
221             if (a == Composition.singleSecond[bi - Composition.singleSecondStart][0]) {
222                 return Composition.singleSecond[bi - Composition.singleSecondStart][1];
223             } else {
224                 return -1;
225             }
226         }
227
228         if (ai >= 0 && ai < Composition.multiSecondStart &&
229                 bi >= Composition.multiSecondStart && bi < Composition.singleFirstStart) {
230             char[] f = Composition.multiFirst[ai];
231
232             if (bi - Composition.multiSecondStart < f.length) {
233                 char r = f[bi - Composition.multiSecondStart];
234                 if (r == 0) {
235                     return -1;
236                 } else {
237                     return r;
238                 }
239             }
240         }
241
242
243         return -1;
244     }
245
246     /**
247      * Entire hangul code copied from:
248      * http://www.unicode.org/unicode/reports/tr15/
249      * <p/>
250      * Several hangul specific constants
251      */

252     static final int SBase = 0xAC00;
253     static final int LBase = 0x1100;
254     static final int VBase = 0x1161;
255     static final int TBase = 0x11A7;
256     static final int LCount = 19;
257     static final int VCount = 21;
258     static final int TCount = 28;
259     static final int NCount = VCount * TCount;
260     static final int SCount = LCount * NCount;
261
262     /**
263      * Decomposes a hangul character.
264      *
265      * @param s A character to decompose.
266      * @return A string containing the hangul decomposition of the input
267      * character. If no hangul decomposition can be found, a string
268      * containing the character itself is returned.
269      */

270     static String JavaDoc decomposeHangul(char s) {
271         int SIndex = s - SBase;
272         if (SIndex < 0 || SIndex >= SCount) {
273             return String.valueOf(s);
274         }
275         StringBuilder JavaDoc result = new StringBuilder JavaDoc();
276         int L = LBase + SIndex / NCount;
277         int V = VBase + (SIndex % NCount) / TCount;
278         int T = TBase + SIndex % TCount;
279         result.append((char) L);
280         result.append((char) V);
281         if (T != TBase) result.append((char) T);
282         return result.toString();
283     }
284
285     /**
286      * Composes two hangul characters.
287      *
288      * @param a First character.
289      * @param b Second character.
290      * @return Returns the composed character or -1 if the two
291      * characters cannot be composed.
292      */

293     static int composeHangul(char a, char b) {
294         // 1. check to see if two current characters are L and V
295
int LIndex = a - LBase;
296         if (0 <= LIndex && LIndex < LCount) {
297             int VIndex = b - VBase;
298             if (0 <= VIndex && VIndex < VCount) {
299                 // make syllable of form LV
300
return SBase + (LIndex * VCount + VIndex) * TCount;
301             }
302         }
303
304         // 2. check to see if two current characters are LV and T
305
int SIndex = a - SBase;
306         if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0) {
307             int TIndex = b - TBase;
308             if (0 <= TIndex && TIndex <= TCount) {
309                 // make syllable of form LVT
310
return a + TIndex;
311             }
312         }
313         return -1;
314     }
315
316 }
Popular Tags