UnicodeDataParser


1   package net.sf.saxon.codenorm;
2   
3   import java.util.ArrayList  ;
4   import java.util.BitSet  ;
5   import java.util.StringTokenizer  ;
6   
7   /**
8    * This class reads the data compiled into class UnicodeData, and builds hash tables
9    * that can be used by the Unicode normalization routines. This operation is performed
10   * once only, the first time normalization is attempted after Saxon is loaded.
11   */
12  
13  class UnicodeDataParser {
14  
15      // This class is never instantiated
16      private UnicodeDataParser(){}
17  
18      /**
19       * Called exactly once by NormalizerData to build the static data
20       */
21  
22      static NormalizerData build() {
23          IntHashtable canonicalClass = new IntHashtable(0, 400);
24          IntStringHashtable decompose = new IntStringHashtable(null, 18000);
25          IntHashtable compose = new IntHashtable(NormalizerData.NOT_COMPOSITE, 15000);
26          BitSet   isCompatibility = new BitSet  (128000);
27          BitSet   isExcluded = new BitSet  (128000);
28  
29          readExclusionList(isExcluded);
30          readCompatibilityList(isCompatibility);
31          readCanonicalClassTable(canonicalClass);
32          readDecompositionTable(decompose, compose, isExcluded, isCompatibility);
33  
34          return new NormalizerData(canonicalClass, decompose, compose,
35                isCompatibility, isExcluded);
36      }
37  
38      /**
39       * Reads exclusion list and stores the data
40       */
41  
42      private static void readExclusionList(BitSet   isExcluded) {
43          for (int i=0; i<UnicodeData.exclusionList.length; i++) {
44              String   s = UnicodeData.exclusionList[i];
45              StringTokenizer   st = new StringTokenizer  (s, ",");
46              while (st.hasMoreTokens()) {
47                  String   tok = st.nextToken();
48                  int value = Integer.parseInt(tok, 32);
49                  isExcluded.set(value);
50              }
51          }
52      }
53  
54      /**
55       * Reads exclusion list and stores the data
56       */
57  
58      private static void readCompatibilityList(BitSet   isCompatible) {
59          for (int i=0; i<UnicodeData.compatibilityList.length; i++) {
60              String   s = UnicodeData.compatibilityList[i];
61              StringTokenizer   st = new StringTokenizer  (s, ",");
62              while (st.hasMoreTokens()) {
63                  String   tok = st.nextToken();
64                  int value = Integer.parseInt(tok, 32);
65                  isCompatible.set(value);
66              }
67          }
68      }
69  
70      /**
71       * Read canonical class table (mapping from character codes to their canonical class)
72       */
73  
74      private static void readCanonicalClassTable(IntHashtable canonicalClasses) {
75          ArrayList   keys = new ArrayList  (5000);
76          for (int i=0; i<UnicodeData.canonicalClassKeys.length; i++) {
77              String   s = UnicodeData.canonicalClassKeys[i];
78              StringTokenizer   st = new StringTokenizer  (s, ",");
79              while (st.hasMoreTokens()) {
80                  String   tok = st.nextToken();
81                  int value = Integer.parseInt(tok, 32);
82                  keys.add(new Integer  (value));
83              }
84          }
85          int k = 0;
86          for (int i=0; i<UnicodeData.canonicalClassValues.length; i++) {
87              String   s = UnicodeData.canonicalClassValues[i];
88              StringTokenizer   st = new StringTokenizer  (s, ",");
89              while (st.hasMoreTokens()) {
90                  String   tok = st.nextToken();
91                  int clss = Integer.parseInt(tok, 32);
92                  canonicalClasses.put(((Integer  )keys.get(k++)).intValue(), clss);
93              }
94          }
95      }
96  
97      /**
98       * Read canonical class table (mapping from character codes to their canonical class)
99       */
100 
101     private static void readDecompositionTable(IntStringHashtable decompose, IntHashtable compose,
102                                                BitSet   isExcluded, BitSet   isCompatibility) {
103         int k = 0;
104         for (int i=0; i<UnicodeData.decompositionKeys.length; i++) {
105             String   s = UnicodeData.decompositionKeys[i];
106             StringTokenizer   st = new StringTokenizer  (s, ",");
107             while (st.hasMoreTokens()) {
108                 String   tok = st.nextToken();
109                 int key = Integer.parseInt(tok, 32);
110                 String   value = UnicodeData.decompositionValues[k++];
111                 decompose.put(key, value);
112                                 // only compositions are canonical pairs
113                 // skip if script exclusion
114 
115                 if (!isCompatibility.get(key) && !isExcluded.get(key)) {
116                     char first = '\u0000';
117                     char second = value.charAt(0);
118                     if (value.length() > 1) {
119                         first = second;
120                         second = value.charAt(1);
121                     }
122 
123                     // store composition pair in single integer
124 
125                     int pair = (first << 16) | second;
126                     compose.put(pair, key);
127                 }
128             }
129         }
130 
131         // Add algorithmic Hangul decompositions
132         // This fragment code is copied from the normalization code published by Unicode consortium.
133         // See module net.sf.saxon.codenorm.Normalizer for applicable copyright information.
134 
135         for (int SIndex = 0; SIndex < SCount; ++SIndex) {
136             int TIndex = SIndex % TCount;
137             char first, second;
138             if (TIndex != 0) { // triple
139                 first = (char)(SBase + SIndex - TIndex);
140                 second = (char)(TBase + TIndex);
141             } else {
142                 first = (char)(LBase + SIndex / NCount);
143                 second = (char)(VBase + (SIndex % NCount) / TCount);
144             }
145             int pair = (first << 16) | second;
146             int key = SIndex + SBase;
147             decompose.put(key, String.valueOf(first) + second);
148             compose.put(pair, key);
149         }
150     }
151 
152     /**
153      * Hangul composition constants
154      */
155     private static final int
156         SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
157         LCount = 19, VCount = 21, TCount = 28,
158         NCount = VCount * TCount,   // 588
159         SCount = LCount * NCount;   // 11172
160 
161     // end of Unicode consortium code
162 
163 }
164 
165 //
166 // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
167 // you may not use this file except in compliance with the License. You may obtain a copy of the
168 // License at http://www.mozilla.org/MPL/
169 //
170 // Software distributed under the License is distributed on an "AS IS" basis,
171 // WITHOUT WARRANTY OF ANY KIND, either express or implied.
172 // See the License for the specific language governing rights and limitations under the License.
173 //
174 // The Original Code is: all this file.
175 //
176 // The Initial Developer of the Original Code is Michael H. Kay.
177 //
178 // The code for generating Hangul decompositions is Copyright (C) Unicode, Inc. All Rights Reserved.
179 // See statement below.
180 //
181 // Contributor(s): none.
182 //
183 
184 // * Copyright (c) 1991-2005 Unicode, Inc.
185 // * For terms of use, see http://www.unicode.org/terms_of_use.html
186 // * For documentation, see UAX#15.
187 // * The Unicode Consortium makes no expressed or implied warranty of any
188 // * kind, and assumes no liability for errors or omissions.
189 // * No liability is assumed for incidental and consequential damages
190 // * in connection with or arising out of the use of the information here.
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags