1 package net.sf.saxon.codenorm; 2 3 import java.util.ArrayList ; 4 import java.util.BitSet ; 5 import java.util.StringTokenizer ; 6 7 12 13 class UnicodeDataParser { 14 15 private UnicodeDataParser(){} 17 18 21 22 static NormalizerData build() { 23 IntHashtable canonicalClass = new IntHashtable(0, 400); 24 IntStringHashtable decompose = new IntStringHashtable(null, 18000); 25 IntHashtable compose = new IntHashtable(NormalizerData.NOT_COMPOSITE, 15000); 26 BitSet isCompatibility = new BitSet (128000); 27 BitSet isExcluded = new BitSet (128000); 28 29 readExclusionList(isExcluded); 30 readCompatibilityList(isCompatibility); 31 readCanonicalClassTable(canonicalClass); 32 readDecompositionTable(decompose, compose, isExcluded, isCompatibility); 33 34 return new NormalizerData(canonicalClass, decompose, compose, 35 isCompatibility, isExcluded); 36 } 37 38 41 42 private static void readExclusionList(BitSet isExcluded) { 43 for (int i=0; i<UnicodeData.exclusionList.length; i++) { 44 String s = UnicodeData.exclusionList[i]; 45 StringTokenizer st = new StringTokenizer (s, ","); 46 while (st.hasMoreTokens()) { 47 String tok = st.nextToken(); 48 int value = Integer.parseInt(tok, 32); 49 isExcluded.set(value); 50 } 51 } 52 } 53 54 57 58 private static void readCompatibilityList(BitSet isCompatible) { 59 for (int i=0; i<UnicodeData.compatibilityList.length; i++) { 60 String s = UnicodeData.compatibilityList[i]; 61 StringTokenizer st = new StringTokenizer (s, ","); 62 while (st.hasMoreTokens()) { 63 String tok = st.nextToken(); 64 int value = Integer.parseInt(tok, 32); 65 isCompatible.set(value); 66 } 67 } 68 } 69 70 73 74 private static void readCanonicalClassTable(IntHashtable canonicalClasses) { 75 ArrayList keys = new ArrayList (5000); 76 for (int i=0; i<UnicodeData.canonicalClassKeys.length; i++) { 77 String s = UnicodeData.canonicalClassKeys[i]; 78 StringTokenizer st = new StringTokenizer (s, ","); 79 while (st.hasMoreTokens()) { 80 String tok = st.nextToken(); 81 int value = Integer.parseInt(tok, 32); 82 keys.add(new Integer (value)); 83 } 84 } 85 int k = 0; 86 for (int i=0; i<UnicodeData.canonicalClassValues.length; i++) { 87 String s = UnicodeData.canonicalClassValues[i]; 88 StringTokenizer st = new StringTokenizer (s, ","); 89 while (st.hasMoreTokens()) { 90 String tok = st.nextToken(); 91 int clss = Integer.parseInt(tok, 32); 92 canonicalClasses.put(((Integer )keys.get(k++)).intValue(), clss); 93 } 94 } 95 } 96 97 100 101 private static void readDecompositionTable(IntStringHashtable decompose, IntHashtable compose, 102 BitSet isExcluded, BitSet isCompatibility) { 103 int k = 0; 104 for (int i=0; i<UnicodeData.decompositionKeys.length; i++) { 105 String s = UnicodeData.decompositionKeys[i]; 106 StringTokenizer st = new StringTokenizer (s, ","); 107 while (st.hasMoreTokens()) { 108 String tok = st.nextToken(); 109 int key = Integer.parseInt(tok, 32); 110 String value = UnicodeData.decompositionValues[k++]; 111 decompose.put(key, value); 112 115 if (!isCompatibility.get(key) && !isExcluded.get(key)) { 116 char first = '\u0000'; 117 char second = value.charAt(0); 118 if (value.length() > 1) { 119 first = second; 120 second = value.charAt(1); 121 } 122 123 125 int pair = (first << 16) | second; 126 compose.put(pair, key); 127 } 128 } 129 } 130 131 135 for (int SIndex = 0; SIndex < SCount; ++SIndex) { 136 int TIndex = SIndex % TCount; 137 char first, second; 138 if (TIndex != 0) { first = (char)(SBase + SIndex - TIndex); 140 second = (char)(TBase + TIndex); 141 } else { 142 first = (char)(LBase + SIndex / NCount); 143 second = (char)(VBase + (SIndex % NCount) / TCount); 144 } 145 int pair = (first << 16) | second; 146 int key = SIndex + SBase; 147 decompose.put(key, String.valueOf(first) + second); 148 compose.put(pair, key); 149 } 150 } 151 152 155 private static final int 156 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7, 157 LCount = 19, VCount = 21, TCount = 28, 158 NCount = VCount * TCount, SCount = LCount * NCount; 161 163 } 164 165 184 | Popular Tags |