UnicodeDataGenerator


1   package net.sf.saxon.codenorm;
2   
3   import net.sf.saxon.om.FastStringBuffer;
4   
5   import java.io.*;
6   import java.util.ArrayList  ;
7   import java.util.Iterator  ;
8   import java.util.List  ;
9   
10  /**
11   * This class reads the Unicode character database, extracts information needed
12   * to perform unicode normalization, and writes this information out in the form of the
13   * Java "source" module UnicodeData.java. This class is therefore executed (via its main()
14   * method) at the time Saxon is built - it only needs to be rerun when the Unicode data tables
15   * have changed.
16   * <p>
17   * The class is derived from the sample program NormalizerData.java published by the
18   * Unicode consortium. That code has been modified so that instead of building the run-time
19   * data structures directly, they are written to a Java "source" module, which is then
20   * compiled. Also, the ability to construct a condensed version of the data tables has been
21   * removed.
22   * <p>
23   * Copyright (c) 1991-2005 Unicode, Inc.
24   * For terms of use, see http://www.unicode.org/terms_of_use.html
25   * For documentation, see UAX#15.<br>
26   * @author Mark Davis
27   * @author Michael Kay: Saxon modifications.
28   */
class UnicodeDataGenerator {

    /**
     * Copyright notice of the Unicode sample program this class is derived from.
     * The copyright sign is written as an escape so the literal survives any source encoding.
     */
    static final String copyright = "Copyright \u00a9 1998-1999 Unicode, Inc.";

    /**
     * Testing flag: when true, progress and diagnostic messages are written to System.out.
     */
    private static final boolean DEBUG = false;

    /** Name of the main Unicode character database file, looked up inside {@link #dir}. */
    private static final String UNICODE_DATA = "UnicodeData.txt";

    /** Name of the composition exclusions file, looked up inside {@link #dir}. */
    private static final String COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt";

    /** Digits used when writing backslash-u escapes into the generated source. */
    private static final String HEX_DIGITS = "0123456789abcdef";

    /** Directory holding the Unicode data files; set by {@link #main} before loading. */
    private static String dir;

    // The tables below are parallel lists filled while reading the data files.
    // Keys and list members are Integer code points; decompositionValues holds Strings.
    // Hangul syllable decompositions are algorithmic and are not stored here.

    private static final List canonicalClassKeys = new ArrayList(30000);
    private static final List canonicalClassValues = new ArrayList(30000);

    private static final List decompositionKeys = new ArrayList(6000);
    private static final List decompositionValues = new ArrayList(6000);

    private static final List exclusionList = new ArrayList(200);
    private static final List compatibilityList = new ArrayList(8000);

    /** Not instantiable: the class only has static members. */
    private UnicodeDataGenerator() {
    }

    /**
     * Reads both data files from {@link #dir} into the static tables.
     * <p>
     * An I/O failure is reported on System.err and not propagated, so callers of this
     * method keep the non-throwing behaviour they have always had. {@link #main} does not
     * use this method because it must stop when the data cannot be read.
     */
    static void build() {
        try {
            loadTables();
        } catch (IOException e) {
            System.err.println("Can't load data file." + e + ", " + e.getMessage());
        }
    }

    /**
     * Empties the static tables and refills them from the two data files.
     *
     * @throws IOException if either data file cannot be opened or read
     */
    private static void loadTables() throws IOException {
        // start from a clean slate so that loading twice does not duplicate every entry
        canonicalClassKeys.clear();
        canonicalClassValues.clear();
        decompositionKeys.clear();
        decompositionValues.clear();
        exclusionList.clear();
        compatibilityList.clear();

        readExclusionList();
        buildDecompositionTables();
    }

// =============================================================
// Building Decomposition Tables
// =============================================================

    /**
     * Removes a trailing '#' comment from a data line and trims surrounding whitespace.
     *
     * @param line a raw line read from a data file
     * @return the significant part of the line; empty if the line carries no data
     */
    private static String stripComment(String line) {
        int hash = line.indexOf('#');
        if (hash >= 0) {
            line = line.substring(0, hash);
        }
        return line.trim();
    }

    /**
     * Reads the composition exclusion file and records each listed code point in
     * {@link #exclusionList}. The code point is the first whitespace-delimited token of a
     * line and may have any number of hex digits.
     *
     * @throws IOException if the file cannot be opened or read
     * @throws IllegalStateException if a data line does not start with a hex number
     */
    private static void readExclusionList() throws IOException {
        if (DEBUG) {
            System.out.println("Reading Exclusions");
        }
        BufferedReader in = new BufferedReader(new FileReader(dir + '/' + COMPOSITION_EXCLUSIONS), 5 * 1024);
        try {
            String line;
            while ((line = in.readLine()) != null) {
                line = stripComment(line);
                if (line.length() == 0) {
                    continue;                                   // comment-only or blank line
                }
                int tokenEnd = 0;
                while (tokenEnd < line.length() && !Character.isWhitespace(line.charAt(tokenEnd))) {
                    tokenEnd++;
                }
                try {
                    exclusionList.add(new Integer(Integer.parseInt(line.substring(0, tokenEnd), 16)));
                } catch (NumberFormatException e) {
                    throw new IllegalStateException("Bad hex value in line:\n" + line);
                }
            }
        } finally {
            in.close();                                         // also runs when parsing fails
        }
    }

    /**
     * Reads the Unicode character database file and fills the canonical class,
     * decomposition and compatibility tables.
     *
     * @throws IOException if the file cannot be opened or read
     * @throws IllegalStateException if a data line is malformed
     */
    private static void buildDecompositionTables() throws IOException {
        if (DEBUG) {
            System.out.println("Reading Unicode Character Database");
        }
        BufferedReader in = new BufferedReader(new FileReader(dir + '/' + UNICODE_DATA), 64 * 1024);
        try {
            int counter = 0;
            String line;
            while ((line = in.readLine()) != null) {
                line = stripComment(line);
                if (line.length() == 0) {
                    continue;
                }
                if (DEBUG) {
                    counter++;
                    if ((counter & 0xFF) == 0) {
                        System.out.println("At: " + line);
                    }
                }
                parseUnicodeDataLine(line);
            }
        } finally {
            in.close();                                         // also runs when parsing fails
        }
        if (DEBUG) {
            System.out.println("Done reading Unicode Character Database");
        }
    }

    /**
     * Extracts the code point, canonical combining class and decomposition mapping from
     * one semicolon-separated database line and appends them to the static tables.
     * <p>
     * Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN ... GRAVE;;;00E0;
     *
     * @param line a non-empty data line with comments already removed
     * @throws IllegalStateException if the line has too few fields or a numeric field is invalid
     */
    private static void parseUnicodeDataLine(String line) {
        // limit -1 keeps empty trailing fields, so field positions are stable
        String[] fields = line.split(";", -1);
        if (fields.length < 6) {
            throw new IllegalStateException("Too few fields in line:\n" + line);
        }

        int value;
        try {
            value = Integer.parseInt(fields[0], 16);            // field 0: code point
        } catch (NumberFormatException e) {
            throw new IllegalStateException("Bad hex value in line:\n" + line);
        }
        if (DEBUG && value == 0x00C0) {
            System.out.println("debug: " + line);
        }

        int cc;
        try {
            cc = Integer.parseInt(fields[3]);                   // field 3: canonical class (decimal)
        } catch (NumberFormatException e) {
            throw new IllegalStateException("Bad canonical class in line:\n" + line);
        }

        // check consistency: canonical classes must be from 0 to 255
        if (cc != (cc & 0xFF)) {
            System.err.println("Bad canonical class at: " + line);
        }
        canonicalClassKeys.add(new Integer(value));
        canonicalClassValues.add(new Integer(cc));

        String segment = fields[5];                             // field 5: decomposition mapping
        if (segment.length() == 0) {
            return;
        }

        // a leading <tag> marks a compatibility (as opposed to canonical) decomposition
        boolean compat = segment.charAt(0) == '<';
        if (compat) {
            compatibilityList.add(new Integer(value));
        }
        String decomp = fromHex(segment);

        // check consistency: all canonical decompositions must be singles or pairs.
        // Count code points, not UTF-16 units, so supplementary characters are not misreported.
        int length = countCodePoints(decomp);
        if (length < 1 || (length > 2 && !compat)) {
            System.err.println("Bad decomp at: " + line);
        }

        decompositionKeys.add(new Integer(value));
        decompositionValues.add(decomp);
    }

    /**
     * Counts the Unicode code points in a UTF-16 string, treating a high surrogate that is
     * directly followed by a low surrogate as one code point.
     *
     * @param s the string to measure
     * @return the number of code points in s
     */
    private static int countCodePoints(String s) {
        int count = 0;
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c >= 0xD800 && c <= 0xDBFF && i + 1 < s.length()) {
                char next = s.charAt(i + 1);
                if (next >= 0xDC00 && next <= 0xDFFF) {
                    i++;                                        // skip the low half of the pair
                }
            }
            count++;
        }
        return count;
    }

    /**
     * Utility: parses a sequence of hex Unicode code points separated by spaces. Each
     * number may have any number of hex digits; a leading or embedded tag in angle
     * brackets is skipped. Code points above FFFF are returned as surrogate pairs.
     *
     * @param source the text to parse, for example "0041 0300" or "&lt;compat&gt; 0020 0308"
     * @return the characters denoted by the hex numbers, in order
     * @throws IllegalArgumentException if source contains anything other than hex numbers,
     *         spaces and bracketed tags, or a number greater than 10FFFF
     */
    public static String fromHex(String source) {
        StringBuffer result = new StringBuffer(8);
        int length = source.length();
        int i = 0;
        while (i < length) {
            char c = source.charAt(i);
            if (c == ' ') {
                i++;                                            // separator: ignore
            } else if (c == '<') {
                int close = source.indexOf('>', i);             // skip <...>
                if (close < 0) {
                    throw new IllegalArgumentException("Bad hex value in " + source);
                }
                i = close + 1;
            } else if (isHexDigit(c)) {
                int end = source.indexOf(' ', i);
                if (end < 0) {
                    end = length;
                }
                int codePoint;
                try {
                    codePoint = Integer.parseInt(source.substring(i, end), 16);
                } catch (NumberFormatException e) {
                    throw new IllegalArgumentException("Bad hex value in " + source);
                }
                if (codePoint > 0x10FFFF) {
                    throw new IllegalArgumentException("Bad hex value in " + source);
                }
                appendUtf16(result, codePoint);
                i = end;                                        // continue at the separator
            } else {
                throw new IllegalArgumentException("Bad hex value in " + source);
            }
        }
        return result.toString();
    }

    /**
     * Tests whether a character is one of 0-9, A-F or a-f.
     */
    private static boolean isHexDigit(char c) {
        return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
    }

    /**
     * Appends a code point to a buffer in UTF-16: one char for values up to FFFF,
     * otherwise a high surrogate followed by a low surrogate.
     *
     * @param buff the buffer to append to
     * @param codePoint a value in the range 0 to 10FFFF
     */
    private static void appendUtf16(StringBuffer buff, int codePoint) {
        if (codePoint < 0x10000) {
            buff.append((char) codePoint);
        } else {
            int offset = codePoint - 0x10000;
            buff.append((char) (0xD800 + (offset >> 10)));
            buff.append((char) (0xDC00 + (offset & 0x3FF)));
        }
    }

    /**
     * Utility: supplies the four-digit, zero-padded, upper-case hex representation of a
     * UTF-16 character (without any 0x or backslash-u prefix).
     *
     * @param i the character to format
     * @return exactly four hex digits
     */
    public static String hex(char i) {
        String result = Integer.toString(i, 16).toUpperCase();
        return "0000".substring(result.length(), 4) + result;
    }

    /**
     * Utility: supplies the four-digit hex representation of every UTF-16 character of a
     * string, with a separator between consecutive characters.
     *
     * @param s the string whose characters are to be formatted
     * @param sep the text placed between two formatted characters
     * @return the formatted characters joined by sep; empty if s is empty
     */
    public static String hex(String s, String sep) {
        StringBuffer result = new StringBuffer(20);
        for (int i = 0; i < s.length(); ++i) {
            if (i != 0) {
                result.append(sep);
            }
            result.append(hex(s.charAt(i)));
        }
        return result.toString();
    }

    /**
     * Generates the Java source of the UnicodeData class from the static tables.
     *
     * @param o the stream the source text is written to
     */
    private static void generateJava(PrintStream o) {
        o.println("package net.sf.saxon.codenorm;");
        o.println("");
        o.println("//This module was generated by running net.sf.saxon.codenorm.UnicodeDataGenerator");
        o.println("//*** DO NOT EDIT! ***");
        o.println("//The strange format of this file is carefully chosen to avoid breaking Java compiler limits");
        o.println("");
        o.println("public class UnicodeData {");

        // Output the canonical class table
        printIntTable(o, "canonicalClassKeys", canonicalClassKeys);
        printIntTable(o, "canonicalClassValues", canonicalClassValues);

        // Output the decomposition values (not including Hangul algorithmic decompositions)
        printIntTable(o, "decompositionKeys", decompositionKeys);
        o.println("public static final String[] decompositionValues = {");
        printStringArray(o, decompositionValues.iterator());
        o.println("};");

        // Output the composition exclusions
        printIntTable(o, "exclusionList", exclusionList);

        // Output the compatibility list
        printIntTable(o, "compatibilityList", compatibilityList);

        o.println("}");
    }

    /**
     * Writes one String[] field declaration whose initializer holds a list of integers.
     *
     * @param o the stream to write to
     * @param name the name of the generated field
     * @param values the Integer values to write
     */
    private static void printIntTable(PrintStream o, String name, List values) {
        o.println("public static final String[] " + name + " = {");
        printArray(o, values.iterator());
        o.println("};");
    }

    /**
     * Outputs integer values as a series of string literals, one literal per line. Inside
     * a literal the values are written in base 32 and separated by commas. A new literal
     * is started on every twentieth pass of the loop, so the first literal holds 19 values
     * and each later full literal holds 20; every literal except the last ends with a
     * comma inside the quotes. Nothing is written for an empty iterator.
     *
     * @param o the stream to write to
     * @param iter supplies the Integer values
     */
    private static void printArray(PrintStream o, Iterator iter) {
        if (!iter.hasNext()) {
            return;
        }
        int count = 0;
        StringBuffer buff = new StringBuffer(120);
        buff.append('"');
        while (true) {
            if (++count == 20) {
                count = 0;
                buff.append("\",");
                o.println(buff.toString());
                buff.setLength(0);
                buff.append('"');
            }
            int next = ((Integer) iter.next()).intValue();
            buff.append(Integer.toString(next, 32));    // values are written in base-32 notation
            if (iter.hasNext()) {
                buff.append(",");
            } else {
                buff.append("\"");
                o.println(buff.toString());
                return;
            }
        }
    }

    /**
     * Outputs string values as comma-separated Java string literals, breaking the line on
     * every twentieth pass of the loop. Nothing is written for an empty iterator.
     *
     * @param o the stream to write to
     * @param iter supplies the String values
     */
    private static void printStringArray(PrintStream o, Iterator iter) {
        if (!iter.hasNext()) {
            return;
        }
        int count = 0;
        StringBuffer buff = new StringBuffer(120);
        while (true) {
            if (++count == 20) {
                count = 0;
                o.println(buff.toString());
                buff.setLength(0);
            }
            String next = (String) iter.next();
            appendJavaString(next, buff);
            if (iter.hasNext()) {
                buff.append(", ");
            } else {
                o.println(buff.toString());
                return;
            }
        }
    }

    /**
     * Appends a value as a double-quoted Java string literal. Backslash and double quote
     * are backslash-escaped, printable ASCII above the space character is copied, and
     * every other character is written as a four-digit backslash-u escape.
     * <p>
     * Carriage return and line feed get the escapes backslash-r and backslash-n instead:
     * the compiler translates backslash-u escapes before it recognises string literals, so
     * a backslash-u escape for a line terminator would end the line inside the literal.
     *
     * @param value the string to write
     * @param buff the buffer the literal is appended to
     */
    private static void appendJavaString(String value, StringBuffer buff) {
        buff.append('"');
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            if (c == '\\') {
                buff.append("\\\\");
            } else if (c == '"') {
                buff.append("\\\"");
            } else if (c == '\n') {
                buff.append("\\n");
            } else if (c == '\r') {
                buff.append("\\r");
            } else if (c > 32 && c < 127) {
                buff.append(c);
            } else {
                buff.append("\\u");
                buff.append(HEX_DIGITS.charAt((c >> 12) & 0xf));
                buff.append(HEX_DIGITS.charAt((c >> 8) & 0xf));
                buff.append(HEX_DIGITS.charAt((c >> 4) & 0xf));
                buff.append(HEX_DIGITS.charAt(c & 0xf));
            }
        }
        buff.append('"');
    }

    /**
     * Main program. Run this program to regenerate the Java module UnicodeData.java against revised data
     * from the Unicode character database.
     * <p>
     * Usage: java UnicodeDataGenerator dir UnicodeData.java
     * <p>
     * where dir is the directory containing the files UnicodeData.txt and CompositionExclusions.txt from the
     * Unicode character database, and UnicodeData.java is the file to be written.
     * <p>
     * The data files are read completely before the output file is opened, so an existing
     * output file is left untouched when the data cannot be loaded.
     *
     * @param args the data directory followed by the output file name
     * @throws IllegalArgumentException if the number of arguments is not two
     * @throws IOException if a data file cannot be read or the output file cannot be written
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: java UnicodeDataGenerator dir UnicodeData.java");
            System.err.println("where dir is the directory containing the files UnicodeData.txt and" +
                    " CompositionExclusions.txt from the Unicode character database");
            throw new IllegalArgumentException("Expected 2 arguments but found " + args.length);
        }
        dir = args[0];
        loadTables();
        PrintStream o = new PrintStream(new BufferedOutputStream(new FileOutputStream(new File(args[1]))));
        try {
            generateJava(o);
            // PrintStream never throws on write errors; it has to be asked
            if (o.checkError()) {
                throw new IOException("Failed to write " + args[1]);
            }
        } finally {
            o.close();
        }
    }
}
443
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags