KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > sf > saxon > codenorm > UnicodeDataParser


package net.sf.saxon.codenorm;

import java.util.ArrayList;
import java.util.BitSet;
import java.util.StringTokenizer;

7 /**
8  * This class reads the data compiled into class UnicodeData, and builds hash tables
9  * that can be used by the Unicode normalization routines. This operation is performed
10  * once only, the first time normalization is attempted after Saxon is loaded.
11  */

12
13 class UnicodeDataParser {
14
15     // This class is never instantiated
16
private UnicodeDataParser(){}
17
18     /**
19      * Called exactly once by NormalizerData to build the static data
20      */

21
22     static NormalizerData build() {
23         IntHashtable canonicalClass = new IntHashtable(0, 400);
24         IntStringHashtable decompose = new IntStringHashtable(null, 18000);
25         IntHashtable compose = new IntHashtable(NormalizerData.NOT_COMPOSITE, 15000);
26         BitSet JavaDoc isCompatibility = new BitSet JavaDoc(128000);
27         BitSet JavaDoc isExcluded = new BitSet JavaDoc(128000);
28
29         readExclusionList(isExcluded);
30         readCompatibilityList(isCompatibility);
31         readCanonicalClassTable(canonicalClass);
32         readDecompositionTable(decompose, compose, isExcluded, isCompatibility);
33
34         return new NormalizerData(canonicalClass, decompose, compose,
35               isCompatibility, isExcluded);
36     }
37
38     /**
39      * Reads exclusion list and stores the data
40      */

41
42     private static void readExclusionList(BitSet JavaDoc isExcluded) {
43         for (int i=0; i<UnicodeData.exclusionList.length; i++) {
44             String JavaDoc s = UnicodeData.exclusionList[i];
45             StringTokenizer JavaDoc st = new StringTokenizer JavaDoc(s, ",");
46             while (st.hasMoreTokens()) {
47                 String JavaDoc tok = st.nextToken();
48                 int value = Integer.parseInt(tok, 32);
49                 isExcluded.set(value);
50             }
51         }
52     }
53
54     /**
55      * Reads exclusion list and stores the data
56      */

57
58     private static void readCompatibilityList(BitSet JavaDoc isCompatible) {
59         for (int i=0; i<UnicodeData.compatibilityList.length; i++) {
60             String JavaDoc s = UnicodeData.compatibilityList[i];
61             StringTokenizer JavaDoc st = new StringTokenizer JavaDoc(s, ",");
62             while (st.hasMoreTokens()) {
63                 String JavaDoc tok = st.nextToken();
64                 int value = Integer.parseInt(tok, 32);
65                 isCompatible.set(value);
66             }
67         }
68     }
69
70     /**
71      * Read canonical class table (mapping from character codes to their canonical class)
72      */

73
74     private static void readCanonicalClassTable(IntHashtable canonicalClasses) {
75         ArrayList JavaDoc keys = new ArrayList JavaDoc(5000);
76         for (int i=0; i<UnicodeData.canonicalClassKeys.length; i++) {
77             String JavaDoc s = UnicodeData.canonicalClassKeys[i];
78             StringTokenizer JavaDoc st = new StringTokenizer JavaDoc(s, ",");
79             while (st.hasMoreTokens()) {
80                 String JavaDoc tok = st.nextToken();
81                 int value = Integer.parseInt(tok, 32);
82                 keys.add(new Integer JavaDoc(value));
83             }
84         }
85         int k = 0;
86         for (int i=0; i<UnicodeData.canonicalClassValues.length; i++) {
87             String JavaDoc s = UnicodeData.canonicalClassValues[i];
88             StringTokenizer JavaDoc st = new StringTokenizer JavaDoc(s, ",");
89             while (st.hasMoreTokens()) {
90                 String JavaDoc tok = st.nextToken();
91                 int clss = Integer.parseInt(tok, 32);
92                 canonicalClasses.put(((Integer JavaDoc)keys.get(k++)).intValue(), clss);
93             }
94         }
95     }
96
97     /**
98      * Read canonical class table (mapping from character codes to their canonical class)
99      */

100
101     private static void readDecompositionTable(IntStringHashtable decompose, IntHashtable compose,
102                                                BitSet JavaDoc isExcluded, BitSet JavaDoc isCompatibility) {
103         int k = 0;
104         for (int i=0; i<UnicodeData.decompositionKeys.length; i++) {
105             String JavaDoc s = UnicodeData.decompositionKeys[i];
106             StringTokenizer JavaDoc st = new StringTokenizer JavaDoc(s, ",");
107             while (st.hasMoreTokens()) {
108                 String JavaDoc tok = st.nextToken();
109                 int key = Integer.parseInt(tok, 32);
110                 String JavaDoc value = UnicodeData.decompositionValues[k++];
111                 decompose.put(key, value);
112                                 // only compositions are canonical pairs
113
// skip if script exclusion
114

115                 if (!isCompatibility.get(key) && !isExcluded.get(key)) {
116                     char first = '\u0000';
117                     char second = value.charAt(0);
118                     if (value.length() > 1) {
119                         first = second;
120                         second = value.charAt(1);
121                     }
122
123                     // store composition pair in single integer
124

125                     int pair = (first << 16) | second;
126                     compose.put(pair, key);
127                 }
128             }
129         }
130
131         // Add algorithmic Hangul decompositions
132
// This fragment code is copied from the normalization code published by Unicode consortium.
133
// See module net.sf.saxon.codenorm.Normalizer for applicable copyright information.
134

135         for (int SIndex = 0; SIndex < SCount; ++SIndex) {
136             int TIndex = SIndex % TCount;
137             char first, second;
138             if (TIndex != 0) { // triple
139
first = (char)(SBase + SIndex - TIndex);
140                 second = (char)(TBase + TIndex);
141             } else {
142                 first = (char)(LBase + SIndex / NCount);
143                 second = (char)(VBase + (SIndex % NCount) / TCount);
144             }
145             int pair = (first << 16) | second;
146             int key = SIndex + SBase;
147             decompose.put(key, String.valueOf(first) + second);
148             compose.put(pair, key);
149         }
150     }
151
152     /**
153      * Hangul composition constants
154      */

155     private static final int
156         SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
157         LCount = 19, VCount = 21, TCount = 28,
158         NCount = VCount * TCount, // 588
159
SCount = LCount * NCount; // 11172
160

161     // end of Unicode consortium code
162

163 }
164
//
// The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
// you may not use this file except in compliance with the License. You may obtain a copy of the
// License at http://www.mozilla.org/MPL/
//
// Software distributed under the License is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the License for the specific language governing rights and limitations under the License.
//
// The Original Code is: all this file.
//
// The Initial Developer of the Original Code is Michael H. Kay.
//
// The code for generating Hangul decompositions is Copyright (C) Unicode, Inc. All Rights Reserved.
// See statement below.
//
// Contributor(s): none.
//

// * Copyright (c) 1991-2005 Unicode, Inc.
// * For terms of use, see http://www.unicode.org/terms_of_use.html
// * For documentation, see UAX#15.<br>
// * The Unicode Consortium makes no expressed or implied warranty of any
// * kind, and assumes no liability for errors or omissions.
// * No liability is assumed for incidental and consequential damages
// * in connection with or arising out of the use of the information here.
Popular Tags