KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > sf > saxon > codenorm > UnicodeDataGenerator


1 package net.sf.saxon.codenorm;
2
3 import net.sf.saxon.om.FastStringBuffer;
4
5 import java.io.*;
6 import java.util.ArrayList JavaDoc;
7 import java.util.Iterator JavaDoc;
8 import java.util.List JavaDoc;
9
/**
 * This class reads the Unicode character database, extracts information needed
 * to perform unicode normalization, and writes this information out in the form of the
 * Java "source" module UnicodeData.java. This class is therefore executed (via its main()
 * method) at the time Saxon is built - it only needs to be rerun when the Unicode data tables
 * have changed.
 * <p>
 * The class is derived from the sample program NormalizerData.java published by the
 * Unicode consortium. That code has been modified so that instead of building the run-time
 * data structures directly, they are written to a Java "source" module, which is then
 * compiled. Also, the ability to construct a condensed version of the data tables has been
 * removed.
 * <p>
 * The generator only uses classes from java.io, java.util and java.lang, so it can be
 * compiled and run on its own.
 * <p>
 * Copyright (c) 1991-2005 Unicode, Inc.
 * For terms of use, see http://www.unicode.org/terms_of_use.html
 * For documentation, see UAX#15.<br>
 * @author Mark Davis
 * @author Michael Kay: Saxon modifications.
 */
class UnicodeDataGenerator {
    static final String copyright = "Copyright \u00a9 1998-1999 Unicode, Inc.";

    /**
     * Testing flag: when true, progress and diagnostic messages are written to System.out.
     */
    private static final boolean DEBUG = false;

    /**
     * Directory containing the Unicode character database files; assigned by main().
     */
    private static String dir;

    /** Names of the two input files, resolved relative to {@link #dir}. */
    private static final String UNICODE_DATA = "UnicodeData.txt";
    private static final String COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt";

    // The tables below are parallel lists filled while reading the input files.
    // Keys and list entries are Integer code points; decomposition values are Strings.
    // The constructor arguments are only initial capacities.

    private static final List canonicalClassKeys = new ArrayList(30000);
    private static final List canonicalClassValues = new ArrayList(30000);

    private static final List decompositionKeys = new ArrayList(6000);
    private static final List decompositionValues = new ArrayList(6000);

    private static final List exclusionList = new ArrayList(200);
    private static final List compatibilityList = new ArrayList(8000);

    /**
     * Not instantiable: the class only has static members.
     */
    private UnicodeDataGenerator() {
    }

    /**
     * Reads both input files and fills the static tables. An I/O failure is reported on
     * System.err and otherwise ignored, so the tables may be empty or incomplete afterwards;
     * main() uses {@link #load()} directly so that it can stop on such a failure.
     */
    static void build() {
        try {
            load();
        } catch (IOException e) {
            System.err.println("Can't load data file." + e + ", " + e.getMessage());
        }
    }

    /**
     * Reads the composition exclusions and then the Unicode data file into the static tables.
     *
     * @throws IOException if either file cannot be opened or read
     */
    private static void load() throws IOException {
        readExclusionList();
        buildDecompositionTables();
    }

    // =============================================================
    // Building Decomposition Tables
    // =============================================================

    /**
     * Removes a trailing comment (everything from the first '#') and surrounding whitespace.
     *
     * @param line a raw line from one of the data files
     * @return the remaining content, possibly empty
     */
    private static String stripComment(String line) {
        int comment = line.indexOf('#');
        if (comment != -1) {
            line = line.substring(0, comment);
        }
        return line.trim();
    }

    /**
     * Reads the composition exclusion file and appends each listed code point to
     * {@link #exclusionList}. The code point is the first whitespace-delimited token of each
     * non-blank, non-comment line and may have any number of hex digits.
     *
     * @throws IOException if the file cannot be opened or read
     * @throws IllegalStateException if a line does not start with a hex number
     */
    private static void readExclusionList() throws IOException {
        if (DEBUG) {
            System.out.println("Reading Exclusions");
        }
        BufferedReader in = new BufferedReader(
                new FileReader(new File(dir, COMPOSITION_EXCLUSIONS)), 5 * 1024);
        try {
            String line;
            while ((line = in.readLine()) != null) {
                line = stripComment(line);
                if (line.length() == 0) {
                    continue; // blank or comment-only line
                }
                int z = 0;
                while (z < line.length() && !Character.isWhitespace(line.charAt(z))) {
                    z++;
                }
                try {
                    exclusionList.add(new Integer(Integer.parseInt(line.substring(0, z), 16)));
                } catch (NumberFormatException e) {
                    throw new IllegalStateException("Bad hex value in line:\n" + line);
                }
            }
        } finally {
            in.close(); // release the file even when a line is rejected
        }
    }

    /**
     * Reads the Unicode data file and fills the canonical class, decomposition and
     * compatibility tables, one input line at a time.
     *
     * @throws IOException if the file cannot be opened or read
     * @throws IllegalStateException if a line is malformed
     */
    private static void buildDecompositionTables() throws IOException {
        if (DEBUG) {
            System.out.println("Reading Unicode Character Database");
        }
        BufferedReader in = new BufferedReader(
                new FileReader(new File(dir, UNICODE_DATA)), 64 * 1024);
        try {
            int counter = 0;
            String line;
            while ((line = in.readLine()) != null) {
                line = stripComment(line);
                if (line.length() == 0) {
                    continue;
                }
                if (DEBUG) {
                    counter++;
                    if ((counter & 0xFF) == 0) {
                        System.out.println("At: " + line);
                    }
                }
                processDataLine(line);
            }
        } finally {
            in.close(); // release the file even when a line is rejected
        }
        if (DEBUG) {
            System.out.println("Done reading Unicode Character Database");
        }
        // The original sample program added the algorithmic Hangul decompositions at this
        // point; this generator does not write them to the output.
    }

    /**
     * Extracts the code point, canonical combining class and decomposition from one line of
     * the Unicode data file and stores them in the static tables.
     * <p>
     * Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN ... GRAVE;;;00E0;
     *
     * @param line a non-empty line with comments already removed
     * @throws IllegalStateException if the line has too few fields or a non-numeric
     *         code point or canonical class
     */
    private static void processDataLine(String line) {
        // Fields used: 0 = code point (hex), 3 = canonical class (decimal), 5 = decomposition.
        String[] fields = line.split(";", -1);
        if (fields.length < 6) {
            throw new IllegalStateException("Too few fields in line:\n" + line);
        }

        int value;
        try {
            value = Integer.parseInt(fields[0], 16);
        } catch (NumberFormatException e) {
            throw new IllegalStateException("Bad hex value in line:\n" + line);
        }
        if (DEBUG && value == 0x00C0) {
            System.out.println("debug: " + line);
        }

        int cc;
        try {
            cc = Integer.parseInt(fields[3]);
        } catch (NumberFormatException e) {
            throw new IllegalStateException("Bad canonical class in line:\n" + line);
        }
        // check consistency: canonical classes must be from 0 to 255
        if (cc != (cc & 0xFF)) {
            System.err.println("Bad canonical class at: " + line);
        }
        canonicalClassKeys.add(new Integer(value));
        canonicalClassValues.add(new Integer(cc));

        String segment = fields[5];
        if (segment.length() == 0) {
            return; // no decomposition for this character
        }

        // A decomposition that starts with a <tag> is a compatibility decomposition.
        boolean compat = segment.charAt(0) == '<';
        if (compat) {
            compatibilityList.add(new Integer(value));
        }
        String decomp = fromHex(segment);

        // check consistency: all canon decomps must be singles or pairs!
        if (decomp.length() < 1 || (decomp.length() > 2 && !compat)) {
            System.err.println("Bad decomp at: " + line);
        }

        decompositionKeys.add(new Integer(value));
        decompositionValues.add(decomp);
        // The original sample program also built the composition-pair table here;
        // this generator does not.
    }

    /**
     * Utility: Parses a sequence of hex Unicode characters separated by spaces. Each number
     * may have any number of hex digits, and any tag in angle brackets is skipped.
     * <p>
     * NOTE(review): each parsed value is cast to char, so a code point above U+FFFF is
     * truncated to its low 16 bits. This is kept as-is because the code that reads the
     * generated tables is not visible here - confirm before changing.
     *
     * @param source the decomposition field, for example "0041 0300" or "&lt;compat&gt; 0020"
     * @return a string with one char per parsed number
     * @throws IllegalArgumentException if the text contains anything other than hex numbers,
     *         spaces and tags in angle brackets
     */
    public static String fromHex(String source) {
        StringBuffer result = new StringBuffer(5);
        for (int i = 0; i < source.length(); ++i) {
            char c = source.charAt(i);
            switch (c) {
                case ' ':
                    break; // separator: ignore
                case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7':
                case '8': case '9': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
                case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': {
                    int z = source.indexOf(' ', i);
                    if (z < 0) {
                        z = source.length();
                    }
                    try {
                        result.append((char) Integer.parseInt(source.substring(i, z), 16));
                    } catch (NumberFormatException e) {
                        throw new IllegalArgumentException("Bad hex value in " + source);
                    }
                    i = z; // skip rest of number; the loop increment then skips the space
                    break;
                }
                case '<': {
                    int j = source.indexOf('>', i); // skip <...>
                    if (j > 0) {
                        i = j;
                        break;
                    }
                    // an unterminated tag is an error
                    throw new IllegalArgumentException("Bad hex value in " + source);
                }
                default:
                    throw new IllegalArgumentException("Bad hex value in " + source);
            }
        }
        return result.toString();
    }

    /**
     * Utility: Supplies a zero-padded, four-digit upper-case hex representation of a
     * character (without a 0x or backslash-u prefix).
     *
     * @param i the character to format
     * @return four hex digits
     */
    public static String hex(char i) {
        String result = Integer.toString(i, 16).toUpperCase();
        return "0000".substring(result.length(), 4) + result;
    }

    /**
     * Utility: Supplies the four-digit hex representation of every char of a string,
     * joined by a separator.
     *
     * @param s the string to format
     * @param sep the text placed between consecutive hex values
     * @return the joined hex values, empty if s is empty
     */
    public static String hex(String s, String sep) {
        StringBuffer result = new StringBuffer(20);
        for (int i = 0; i < s.length(); ++i) {
            if (i != 0) {
                result.append(sep);
            }
            result.append(hex(s.charAt(i)));
        }
        return result.toString();
    }

    /**
     * Generate the Java output from the data structure: the source text of a class
     * UnicodeData with one String array per table.
     *
     * @param o the stream that receives the generated source
     */
    private static void generateJava(PrintStream o) {
        o.println("package net.sf.saxon.codenorm;");
        o.println("");
        o.println("//This module was generated by running net.sf.saxon.codenorm.UnicodeDataGenerator");
        o.println("//*** DO NOT EDIT! ***");
        o.println("//The strange format of this file is carefully chosen to avoid breaking Java compiler limits");
        o.println("");
        o.println("public class UnicodeData {");

        // Output the canonical class table
        o.println("public static final String[] canonicalClassKeys = {");
        printArray(o, canonicalClassKeys.iterator());
        o.println("};");
        o.println("public static final String[] canonicalClassValues = {");
        printArray(o, canonicalClassValues.iterator());
        o.println("};");

        // Output the decomposition values (not including Hangul algorithmic decompositions)
        o.println("public static final String[] decompositionKeys = {");
        printArray(o, decompositionKeys.iterator());
        o.println("};");
        o.println("public static final String[] decompositionValues = {");
        printStringArray(o, decompositionValues.iterator());
        o.println("};");

        // Output the composition exclusions
        o.println("public static final String[] exclusionList = {");
        printArray(o, exclusionList.iterator());
        o.println("};");

        // Output the compatibility list
        o.println("public static final String[] compatibilityList = {");
        printArray(o, compatibilityList.iterator());
        o.println("};");

        o.println("}");
    }

    /**
     * Output an array of integer values. Each output line is one string literal holding
     * comma-separated numbers in base-32 notation: 19 numbers on the first line and 20 on
     * each following full line. A literal that is followed by another one keeps the
     * separating comma before its closing quote. Nothing is written for an empty iterator.
     *
     * @param o the stream that receives the literals
     * @param iter supplies the Integer values to write
     */
    private static void printArray(PrintStream o, Iterator iter) {
        if (!iter.hasNext()) {
            return;
        }
        int count = 0;
        StringBuffer buff = new StringBuffer(120);
        buff.append('"');
        while (true) {
            if (++count == 20) {
                // close the current literal and start a new one on the next line
                count = 0;
                buff.append("\",");
                o.println(buff.toString());
                buff.setLength(0);
                buff.append('"');
            }
            int next = ((Integer) iter.next()).intValue();
            buff.append(Integer.toString(next, 32)); // values are written in base-32 notation
            if (iter.hasNext()) {
                buff.append(",");
            } else {
                buff.append("\"");
                o.println(buff.toString());
                return;
            }
        }
    }

    /**
     * Output an array of string values, each as its own string literal (using a
     * backslash-u escape where appropriate), separated by ", " and broken into lines
     * in the same rhythm as {@link #printArray}. Nothing is written for an empty iterator.
     *
     * @param o the stream that receives the literals
     * @param iter supplies the String values to write
     */
    private static void printStringArray(PrintStream o, Iterator iter) {
        if (!iter.hasNext()) {
            return;
        }
        int count = 0;
        StringBuffer buff = new StringBuffer(120);
        while (true) {
            if (++count == 20) {
                count = 0;
                o.println(buff.toString());
                buff.setLength(0);
            }
            String next = (String) iter.next();
            appendJavaString(next, buff);
            if (iter.hasNext()) {
                buff.append(", ");
            } else {
                o.println(buff.toString());
                return;
            }
        }
    }

    /**
     * Appends a value to the buffer as a quoted Java string literal. Backslash and double
     * quote are escaped with a backslash, chars from 33 to 126 are copied unchanged, and
     * every other char (including space) is written as a backslash-u escape with four
     * lower-case hex digits.
     *
     * @param value the string to write
     * @param buff the buffer that receives the literal
     */
    private static void appendJavaString(String value, StringBuffer buff) {
        final String digits = "0123456789abcdef";
        buff.append('"');
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            if (c == '\\') {
                buff.append("\\\\");
            } else if (c == '"') {
                buff.append("\\\"");
            } else if (c > 32 && c < 127) {
                buff.append(c);
            } else {
                buff.append("\\u");
                // most significant hex digit first
                buff.append(digits.charAt((c >> 12) & 0xf));
                buff.append(digits.charAt((c >> 8) & 0xf));
                buff.append(digits.charAt((c >> 4) & 0xf));
                buff.append(digits.charAt(c & 0xf));
            }
        }
        buff.append('"');
    }

    /**
     * Main program. Run this program to regenerate the Java module UnicodeData.java against
     * revised data from the Unicode character database.
     * <p>
     * Usage: java UnicodeDataGenerator dir UnicodeData.java
     * <p>
     * where dir is the directory containing the files UnicodeData.txt and
     * CompositionExclusions.txt from the Unicode character database, and the second argument
     * is the file to write.
     * <p>
     * With the wrong number of arguments the usage text is printed and the JVM exits with
     * status 2. The output file is only created after both input files have been read.
     *
     * @param args the input directory and the output file name
     * @throws Exception if an input file cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: java UnicodeDataGenerator dir UnicodeData.java");
            System.err.println("where dir is the directory containing the files UnicodeData.txt and" +
                    " CompositionExclusions.txt from the Unicode character database");
            System.exit(2);
        }
        dir = args[0];
        load(); // not build(): a read failure must stop the run, not produce empty tables
        PrintStream o = new PrintStream(
                new BufferedOutputStream(new FileOutputStream(new File(args[1]))));
        try {
            generateJava(o);
            // PrintStream never throws on write errors; checkError() flushes and reports them
            if (o.checkError()) {
                throw new IOException("Error writing " + args[1]);
            }
        } finally {
            o.close();
        }
    }
}
443
Popular Tags