1 7 package com.ibm.icu.text; 8 9 import com.ibm.icu.lang.*; 10 import java.util.*; 11 import com.ibm.icu.impl.NormalizerImpl; 12 import com.ibm.icu.impl.USerializedSet; 13 import com.ibm.icu.impl.Utility; 14 15 38 39 public final class CanonicalIterator { 40 45 public CanonicalIterator(String source) { 46 setSource(source); 47 } 48 49 54 public String getSource() { 55 return source; 56 } 57 58 62 public void reset() { 63 done = false; 64 for (int i = 0; i < current.length; ++i) { 65 current[i] = 0; 66 } 67 } 68 69 76 public String next() { 77 if (done) return null; 78 79 81 buffer.setLength(0); for (int i = 0; i < pieces.length; ++i) { 83 buffer.append(pieces[i][current[i]]); 84 } 85 String result = buffer.toString(); 86 87 89 for (int i = current.length - 1; ; --i) { 90 if (i < 0) { 91 done = true; 92 break; 93 } 94 current[i]++; 95 if (current[i] < pieces[i].length) break; current[i] = 0; 97 } 98 return result; 99 } 100 101 107 public void setSource(String newSource) { 108 source = Normalizer.normalize(newSource, Normalizer.NFD); 109 done = false; 110 111 if (newSource.length() == 0) { 113 pieces = new String [1][]; 114 current = new int[1]; 115 pieces[0] = new String []{""}; 116 return; 117 } 118 119 List segmentList = new ArrayList(); 121 int cp; 122 int start = 0; 123 124 127 int i = UTF16.findOffsetFromCodePoint(source, 1); 128 129 for (; i < source.length(); i += UTF16.getCharCount(cp)) { 130 cp = UTF16.charAt(source, i); 131 if (NormalizerImpl.isCanonSafeStart(cp)) { 132 segmentList.add(source.substring(start, i)); start = i; 134 } 135 } 136 segmentList.add(source.substring(start, i)); 138 pieces = new String [segmentList.size()][]; 140 current = new int[segmentList.size()]; 141 for (i = 0; i < pieces.length; ++i) { 142 if (PROGRESS) System.out.println("SEGMENT"); 143 pieces[i] = getEquivalents((String ) segmentList.get(i)); 144 } 145 } 146 147 156 public static void permute(String source, boolean skipZeros, Set output) { 157 160 if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) { 164 output.add(source); 165 return; 166 } 167 168 Set subpermute = new HashSet(); 170 int cp; 171 for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { 172 cp = UTF16.charAt(source, i); 173 174 if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) { 178 continue; 180 } 181 182 subpermute.clear(); 184 permute(source.substring(0,i) 185 + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute); 186 187 String chStr = UTF16.valueOf(source, i); 189 Iterator it = subpermute.iterator(); 190 while (it.hasNext()) { 191 String piece = chStr + (String ) it.next(); 192 output.add(piece); 194 } 195 } 196 } 197 198 200 208 218 219 221 private static boolean PROGRESS = false; private static boolean SKIP_ZEROS = true; 225 226 private String source; 228 private boolean done; 229 private String [][] pieces; 230 private int[] current; 231 235 private transient StringBuffer buffer = new StringBuffer (); 237 238 239 private String [] getEquivalents(String segment) { 241 Set result = new HashSet(); 242 Set basic = getEquivalents2(segment); 243 Set permutations = new HashSet(); 244 245 Iterator it = basic.iterator(); 249 while (it.hasNext()) { 250 String item = (String ) it.next(); 251 permutations.clear(); 252 permute(item, SKIP_ZEROS, permutations); 253 Iterator it2 = permutations.iterator(); 254 while (it2.hasNext()) { 255 String possible = (String ) it2.next(); 256 257 261 if (Normalizer.compare(possible, segment,0)==0) { 262 263 if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible)); 264 result.add(possible); 265 266 } else { 267 if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible)); 268 } 269 } 270 } 271 272 String [] finalResult = new String [result.size()]; 274 result.toArray(finalResult); 275 return finalResult; 276 } 277 278 279 private Set getEquivalents2(String segment) { 280 281 Set result = new HashSet(); 282 283 if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment)); 284 285 result.add(segment); 286 StringBuffer workingBuffer = new StringBuffer (); 287 288 int cp=0; 290 int[] range = new int[2]; 291 for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) { 292 293 cp = UTF16.charAt(segment, i); 295 USerializedSet starts = new USerializedSet(); 296 297 if (!NormalizerImpl.getCanonStartSet(cp, starts)) { 298 continue; 299 } 300 int j=0; 301 int rangeCount = starts.countRanges(); 303 for(j = 0; j < rangeCount; ++j) { 304 starts.getRange(j, range); 305 int end=range[1]; 306 for (int cp2 = range[0]; cp2 <= end; ++cp2) { 307 Set remainder = extract(cp2, segment, i, workingBuffer); 308 if (remainder == null) continue; 309 310 String prefix= segment.substring(0,i); 312 prefix += UTF16.valueOf(cp2); 313 Iterator iter = remainder.iterator(); 315 while (iter.hasNext()) { 316 String item = (String ) iter.next(); 317 String toAdd = new String (prefix); 318 toAdd += item; 319 result.add(toAdd); 320 } 322 } 323 } 324 } 325 return result; 326 362 } 363 364 369 private Set extract(int comp, String segment, int segmentPos, StringBuffer buffer) { 370 if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp)) 371 + ", " + Utility.hex(segment.substring(segmentPos))); 372 373 String decomp = Normalizer.normalize(comp, Normalizer.NFD); 375 376 boolean ok = false; 378 int cp; 379 int decompPos = 0; 380 int decompCp = UTF16.charAt(decomp,0); 381 decompPos += UTF16.getCharCount(decompCp); buffer.setLength(0); 385 for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) { 386 cp = UTF16.charAt(segment, i); 387 if (cp == decompCp) { if (PROGRESS) System.out.println(" matches: " + Utility.hex(UTF16.valueOf(cp))); 389 if (decompPos == decomp.length()) { buffer.append(segment.substring(i + UTF16.getCharCount(cp))); ok = true; 392 break; 393 } 394 decompCp = UTF16.charAt(decomp, decompPos); 395 decompPos += UTF16.getCharCount(decompCp); 396 } else { 398 if (PROGRESS) System.out.println(" buffer: " + Utility.hex(UTF16.valueOf(cp))); 399 UTF16.append(buffer, cp); 401 412 } 413 } 414 if (!ok) return null; if (PROGRESS) System.out.println("Matches"); 416 if (buffer.length() == 0) return SET_WITH_NULL_STRING; String remainder = buffer.toString(); 418 419 425 426 if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null; 427 428 return getEquivalents2(remainder); 430 } 431 432 439 440 443 private static final UnicodeSet EMPTY = new UnicodeSet(); private static final Set SET_WITH_NULL_STRING = new HashSet(); static { 446 SET_WITH_NULL_STRING.add(""); 447 } 448 449 452 460 530 531 } 532 | Popular Tags |