KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > sf > saxon > codenorm > Normalizer


1 package net.sf.saxon.codenorm;
2
3 import net.sf.saxon.om.XMLChar;
4
/**
 * Implements Unicode Normalization Forms C, D, KC, KD.<br>
 * Copyright (c) 1991-2005 Unicode, Inc.
 * For terms of use, see http://www.unicode.org/terms_of_use.html
 * For documentation, see UAX#15.<br>
 * The Unicode Consortium makes no expressed or implied warranty of any
 * kind, and assumes no liability for errors or omissions.
 * No liability is assumed for incidental and consequential damages
 * in connection with or arising out of the use of the information here.
 * @author Mark Davis
 * Updates for supplementary code points: Vladimir Weinstein & Markus Scherer
 * Modified to remove dependency on ICU code: Michael Kay
 */

19 public class Normalizer {
20
21     /**
22      * Create a normalizer for a given form.
23      */

24     public Normalizer(byte form) {
25         this.form = form;
26         if (data == null) {
27             data = UnicodeDataParser.build(); // load 1st time
28
}
29     }
30
31     /**
32     * Masks for the form selector
33     */

34     static final byte
35         COMPATIBILITY_MASK = 1,
36         COMPOSITION_MASK = 2;
37
38     /**
39     * Normalization Form Selector
40     */

41     public static final byte
42         D = 0 ,
43         C = COMPOSITION_MASK,
44         KD = COMPATIBILITY_MASK,
45         KC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
46
47     /**
48     * Normalizes text according to the chosen form,
49     * replacing contents of the target buffer.
50     * @param source the original text, unnormalized
51     * @param target the resulting normalized text
52     */

53     public StringBuffer JavaDoc normalize(String JavaDoc source, StringBuffer JavaDoc target) {
54
55         // First decompose the source into target,
56
// then compose if the form requires.
57

58         if (source.length() != 0) {
59             internalDecompose(source, target);
60             if ((form & COMPOSITION_MASK) != 0) {
61                 internalCompose(target);
62             }
63         }
64         return target;
65     }
66
67     /**
68     * Normalizes text according to the chosen form
69     * @param source the original text, unnormalized
70     * @return target the resulting normalized text
71     */

72     public String JavaDoc normalize(String JavaDoc source) {
73         return normalize(source, new StringBuffer JavaDoc(source.length()+8)).toString();
74     }
75
76     // ======================================
77
// PRIVATES
78
// ======================================
79

80     /**
81      * The current form.
82      */

83     private byte form;
84
85     /**
86     * Decomposes text, either canonical or compatibility,
87     * replacing contents of the target buffer.
88 // * @param form the normalization form. If COMPATIBILITY_MASK
89 // * bit is on in this byte, then selects the recursive
90 // * compatibility decomposition, otherwise selects
91 // * the recursive canonical decomposition.
92     * @param source the original text, unnormalized
93     * @param target the resulting normalized text
94     */

95     private void internalDecompose(String JavaDoc source, StringBuffer JavaDoc target) {
96         StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(8);
97         boolean canonical = (form & COMPATIBILITY_MASK) == 0;
98         int ch32;
99         //for (int i = 0; i < source.length(); i += (ch32<65536 ? 1 : 2)) {
100
for (int i = 0; i < source.length();) {
101             buffer.setLength(0);
102             //ch32 = UTF16.charAt(source, i);
103
ch32 = source.charAt(i++);
104             if (XMLChar.isHighSurrogate(ch32)) {
105                 char low = source.charAt(i++);
106                 ch32 = XMLChar.supplemental((char)ch32, low);
107             }
108             data.getRecursiveDecomposition(canonical, ch32, buffer);
109
110             // add all of the characters in the decomposition.
111
// (may be just the original character, if there was
112
// no decomposition mapping)
113

114             int ch;
115             //for (int j = 0; j < buffer.length(); j += (ch<65536 ? 1 : 2)) {
116
for (int j = 0; j < buffer.length();) {
117                 //ch = UTF16.charAt(buffer, j);
118
ch = buffer.charAt(j++);
119                 if (XMLChar.isHighSurrogate(ch32)) {
120                     char low = buffer.charAt(j++);
121                     ch = XMLChar.supplemental((char)ch, low);
122                 }
123                 int chClass = data.getCanonicalClass(ch);
124                 int k = target.length(); // insertion point
125
if (chClass != 0) {
126
127                     // bubble-sort combining marks as necessary
128

129                     int ch2;
130                     while (k > 0) {
131                         ch2 = target.charAt(k-1);
132                         if (XMLChar.isSurrogate(ch2)) {
133                             k--;
134                             char high = buffer.charAt(k-1);
135                             ch2 = XMLChar.supplemental(high, (char)ch2);
136                         }
137                         if (data.getCanonicalClass(ch2) <= chClass) break;
138                         k--;
139                     }
140 // for (; k > 0; k -= (ch2<65536 ? 1 : 2)) {
141
// ch2 = UTF16.charAt(target, k-1);
142
// if (data.getCanonicalClass(ch2) <= chClass) break;
143
// }
144
}
145                 if (ch < 65536) {
146                     target.insert(k, (char)ch);
147                 } else {
148                     String JavaDoc s = "" + XMLChar.highSurrogate(ch) + XMLChar.lowSurrogate(ch);
149                     target.insert(k, s);
150                 }
151                 //target.insert(k, UTF16.valueOf(ch));
152
}
153         }
154     }
155
156     /**
157     * Composes text in place. Target must already
158     * have been decomposed.
159     * @param target input: decomposed text.
160     * output: the resulting normalized text.
161     */

162     private void internalCompose(StringBuffer JavaDoc target) {
163
164         int starterPos = 0;
165         //int starterCh = UTF16.charAt(target,0);
166
//int compPos = (starterCh<65536 ? 1 : 2); // length of last composition
167
int starterCh = target.charAt(0);
168         int compPos = 1;
169         if (XMLChar.isHighSurrogate(starterCh)) {
170             starterCh = XMLChar.supplemental((char)starterCh, target.charAt(1));
171             compPos++;
172         }
173         int lastClass = data.getCanonicalClass(starterCh);
174         if (lastClass != 0) lastClass = 256; // fix for strings staring with a combining mark
175
int oldLen = target.length();
176
177         // Loop on the decomposed characters, combining where possible
178

179         int ch;
180         //for (int decompPos = compPos; decompPos < target.length(); decompPos += (ch<65536 ? 1 : 2)) {
181
for (int decompPos = compPos; decompPos < target.length();) {
182             ch = target.charAt(decompPos++);
183             if (XMLChar.isHighSurrogate(ch)) {
184                 ch = XMLChar.supplemental((char)ch, target.charAt(decompPos++));
185             }
186             //ch = UTF16.charAt(target, decompPos);
187
int chClass = data.getCanonicalClass(ch);
188             int composite = data.getPairwiseComposition(starterCh, ch);
189             if (composite != NormalizerData.NOT_COMPOSITE && (lastClass < chClass || lastClass == 0)) {
190                 setCharAt(target, starterPos, composite);
191                 // we know that we will only be replacing non-supplementaries by non-supplementaries
192
// so we don't have to adjust the decompPos
193
starterCh = composite;
194             } else {
195                 if (chClass == 0) {
196                     starterPos = compPos;
197                     starterCh = ch;
198                 }
199                 lastClass = chClass;
200                 setCharAt(target, compPos, ch);
201                 if (target.length() != oldLen) { // MAY HAVE TO ADJUST!
202
decompPos += target.length() - oldLen;
203                     oldLen = target.length();
204                 }
205                 compPos += (ch<65536 ? 1 : 2);
206             }
207         }
208         target.setLength(compPos);
209     }
210
211     /**
212      * Set the 32-bit character at a particular 16-bit offset in a string buffer,
213      * replacing the previous character at that position, and taking account of the
214      * fact that either, both, or neither of the characters might be a surrogate pair.
215      */

216
217     private static void setCharAt(StringBuffer JavaDoc target, int offset, int ch32) {
218         if (ch32 < 65536) {
219             if (XMLChar.isHighSurrogate(target.charAt(offset))) {
220                 target.setCharAt(offset, (char)ch32);
221                 target.deleteCharAt(offset+1);
222             } else {
223                 target.setCharAt(offset, (char)ch32);
224             }
225         } else {
226             if (XMLChar.isHighSurrogate(target.charAt(offset))) {
227                 target.setCharAt(offset, XMLChar.highSurrogate(ch32));
228                 target.setCharAt(offset+1, XMLChar.lowSurrogate(ch32));
229             } else {
230                 target.setCharAt(offset, XMLChar.highSurrogate(ch32));
231                 target.insert(offset+1, XMLChar.lowSurrogate(ch32));
232             }
233         }
234     }
235
236     /**
237     * Contains normalization data from the Unicode Character Database.
238     * use false for the minimal set, true for the real set.
239     */

240     private static NormalizerData data = null;
241
242     /**
243     * Just accessible for testing.
244     */

245     boolean getExcluded (char ch) {
246         return data.getExcluded(ch);
247     }
248
249     /**
250     * Just accessible for testing.
251     */

252     String JavaDoc getRawDecompositionMapping (char ch) {
253         return data.getRawDecompositionMapping(ch);
254     }
255 }
Popular Tags