KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > CanonicalIterator


1 /*
2  *******************************************************************************
3  * Copyright (C) 1996-2006, International Business Machines Corporation and *
4  * others. All Rights Reserved. *
5  *******************************************************************************
6  */

7 package com.ibm.icu.text;
8
9 import com.ibm.icu.lang.*;
10 import java.util.*;
11 import com.ibm.icu.impl.NormalizerImpl;
12 import com.ibm.icu.impl.USerializedSet;
13 import com.ibm.icu.impl.Utility;
14
15 /**
16  * This class allows one to iterate through all the strings that are canonically equivalent to a given
17  * string. For example, here are some sample results:
18  * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
19  * <pre>
20  1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
21  2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
22  3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
23  4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
24  5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
25  6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
26  7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
27  8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
28  9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA}
29 10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE}
30 11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA}
31 12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE}
32  *</pre>
33  *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
34  * since it has not been optimized for that situation.
35  * @author M. Davis
36  * @stable ICU 2.4
37  */

38
39 public final class CanonicalIterator {
40     /**
41      * Construct a CanonicalIterator object
42      * @param source string to get results for
43      * @stable ICU 2.4
44      */

45     public CanonicalIterator(String JavaDoc source) {
46         setSource(source);
47     }
48
49     /**
50      * Gets the NFD form of the current source we are iterating over.
51      * @return gets the source: NOTE: it is the NFD form of the source originally passed in
52      * @stable ICU 2.4
53      */

54     public String JavaDoc getSource() {
55       return source;
56     }
57
58     /**
59      * Resets the iterator so that one can start again from the beginning.
60      * @stable ICU 2.4
61      */

62     public void reset() {
63         done = false;
64         for (int i = 0; i < current.length; ++i) {
65             current[i] = 0;
66         }
67     }
68
69     /**
70      * Get the next canonically equivalent string.
71      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
72      * @return the next string that is canonically equivalent. The value null is returned when
73      * the iteration is done.
74      * @stable ICU 2.4
75      */

76     public String JavaDoc next() {
77         if (done) return null;
78
79         // construct return value
80

81         buffer.setLength(0); // delete old contents
82
for (int i = 0; i < pieces.length; ++i) {
83             buffer.append(pieces[i][current[i]]);
84         }
85         String JavaDoc result = buffer.toString();
86
87         // find next value for next time
88

89         for (int i = current.length - 1; ; --i) {
90             if (i < 0) {
91                 done = true;
92                 break;
93             }
94             current[i]++;
95             if (current[i] < pieces[i].length) break; // got sequence
96
current[i] = 0;
97         }
98         return result;
99     }
100
101     /**
102      * Set a new source for this iterator. Allows object reuse.
103      * @param newSource the source string to iterate against. This allows the same iterator to be used
104      * while changing the source string, saving object creation.
105      * @stable ICU 2.4
106      */

107     public void setSource(String JavaDoc newSource) {
108         source = Normalizer.normalize(newSource, Normalizer.NFD);
109         done = false;
110
111         // catch degenerate case
112
if (newSource.length() == 0) {
113             pieces = new String JavaDoc[1][];
114             current = new int[1];
115             pieces[0] = new String JavaDoc[]{""};
116             return;
117         }
118
119         // find the segments
120
List segmentList = new ArrayList();
121         int cp;
122         int start = 0;
123
124         // i should be the end of the first code point
125
// break up the string into segements
126

127         int i = UTF16.findOffsetFromCodePoint(source, 1);
128
129         for (; i < source.length(); i += UTF16.getCharCount(cp)) {
130             cp = UTF16.charAt(source, i);
131             if (NormalizerImpl.isCanonSafeStart(cp)) {
132                 segmentList.add(source.substring(start, i)); // add up to i
133
start = i;
134             }
135         }
136         segmentList.add(source.substring(start, i)); // add last one
137

138         // allocate the arrays, and find the strings that are CE to each segment
139
pieces = new String JavaDoc[segmentList.size()][];
140         current = new int[segmentList.size()];
141         for (i = 0; i < pieces.length; ++i) {
142             if (PROGRESS) System.out.println("SEGMENT");
143             pieces[i] = getEquivalents((String JavaDoc) segmentList.get(i));
144         }
145     }
146
147     /**
148      * Simple implementation of permutation.
149      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
150      * @param source the string to find permutations for
151      * @param skipZeros set to true to skip characters with canonical combining class zero
152      * @param output the set to add the results to
153      * @internal
154      * @deprecated This API is ICU internal only.
155      */

156     public static void permute(String JavaDoc source, boolean skipZeros, Set output) {
157         // TODO: optimize
158
//if (PROGRESS) System.out.println("Permute: " + source);
159

160         // optimization:
161
// if zero or one character, just return a set with it
162
// we check for length < 2 to keep from counting code points all the time
163
if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
164             output.add(source);
165             return;
166         }
167
168         // otherwise iterate through the string, and recursively permute all the other characters
169
Set subpermute = new HashSet();
170         int cp;
171         for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
172             cp = UTF16.charAt(source, i);
173
174             // optimization:
175
// if the character is canonical combining class zero,
176
// don't permute it
177
if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) {
178                 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
179
continue;
180             }
181
182             // see what the permutations of the characters before and after this one are
183
subpermute.clear();
184             permute(source.substring(0,i)
185                 + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);
186
187             // prefix this character to all of them
188
String JavaDoc chStr = UTF16.valueOf(source, i);
189             Iterator it = subpermute.iterator();
190             while (it.hasNext()) {
191                 String JavaDoc piece = chStr + (String JavaDoc) it.next();
192                 //if (PROGRESS) System.out.println(" Piece: " + piece);
193
output.add(piece);
194             }
195         }
196     }
197
198     // FOR TESTING
199

200     /*
201      *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
202      *@internal
203      *
204     public static UnicodeSet getSafeStart() {
205         return (UnicodeSet) SAFE_START.clone();
206     }
207     */

208     /*
209      *@return the set of characters whose decompositions start with the given character
210      *@internal
211      *
212     public static UnicodeSet getStarts(int cp) {
213         UnicodeSet result = AT_START.get(cp);
214         if (result == null) result = EMPTY;
215         return (UnicodeSet) result.clone();
216     }
217     */

218
219     // ===================== PRIVATES ==============================
220

221     // debug
222
private static boolean PROGRESS = false; // debug progress
223
//private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
224
private static boolean SKIP_ZEROS = true;
225
226     // fields
227
private String JavaDoc source;
228     private boolean done;
229     private String JavaDoc[][] pieces;
230     private int[] current;
231     // Note: C will need two more fields, since arrays there don't have lengths
232
// int pieces_length;
233
// int[] pieces_lengths;
234

235     // transient fields
236
private transient StringBuffer JavaDoc buffer = new StringBuffer JavaDoc();
237
238
239     // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
240
private String JavaDoc[] getEquivalents(String JavaDoc segment) {
241         Set result = new HashSet();
242         Set basic = getEquivalents2(segment);
243         Set permutations = new HashSet();
244
245         // now get all the permutations
246
// add only the ones that are canonically equivalent
247
// TODO: optimize by not permuting any class zero.
248
Iterator it = basic.iterator();
249         while (it.hasNext()) {
250             String JavaDoc item = (String JavaDoc) it.next();
251             permutations.clear();
252             permute(item, SKIP_ZEROS, permutations);
253             Iterator it2 = permutations.iterator();
254             while (it2.hasNext()) {
255                 String JavaDoc possible = (String JavaDoc) it2.next();
256
257 /*
258                 String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
259                 if (attempt.equals(segment)) {
260 */

261                 if (Normalizer.compare(possible, segment,0)==0) {
262
263                     if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible));
264                     result.add(possible);
265
266                 } else {
267                     if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible));
268                 }
269             }
270         }
271
272         // convert into a String[] to clean up storage
273
String JavaDoc[] finalResult = new String JavaDoc[result.size()];
274         result.toArray(finalResult);
275         return finalResult;
276     }
277
278
279     private Set getEquivalents2(String JavaDoc segment) {
280
281         Set result = new HashSet();
282
283         if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment));
284
285         result.add(segment);
286         StringBuffer JavaDoc workingBuffer = new StringBuffer JavaDoc();
287
288         // cycle through all the characters
289
int cp=0;
290         int[] range = new int[2];
291         for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
292
293             // see if any character is at the start of some decomposition
294
cp = UTF16.charAt(segment, i);
295             USerializedSet starts = new USerializedSet();
296
297             if (!NormalizerImpl.getCanonStartSet(cp, starts)) {
298               continue;
299             }
300             int j=0;
301             // if so, see which decompositions match
302
int rangeCount = starts.countRanges();
303             for(j = 0; j < rangeCount; ++j) {
304                 starts.getRange(j, range);
305                 int end=range[1];
306                 for (int cp2 = range[0]; cp2 <= end; ++cp2) {
307                     Set remainder = extract(cp2, segment, i, workingBuffer);
308                     if (remainder == null) continue;
309     
310                     // there were some matches, so add all the possibilities to the set.
311
String JavaDoc prefix= segment.substring(0,i);
312                     prefix += UTF16.valueOf(cp2);
313                     //int el = -1;
314
Iterator iter = remainder.iterator();
315                     while (iter.hasNext()) {
316                         String JavaDoc item = (String JavaDoc) iter.next();
317                         String JavaDoc toAdd = new String JavaDoc(prefix);
318                         toAdd += item;
319                         result.add(toAdd);
320                         //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));
321
}
322                 }
323             }
324         }
325         return result;
326         /*
327         Set result = new HashSet();
328         if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
329         result.add(segment);
330         StringBuffer workingBuffer = new StringBuffer();
331
332         // cycle through all the characters
333         int cp;
334
335         for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
336             // see if any character is at the start of some decomposition
337             cp = UTF16.charAt(segment, i);
338             NormalizerImpl.getCanonStartSet(c,fillSet)
339             UnicodeSet starts = AT_START.get(cp);
340             if (starts == null) continue;
341             UnicodeSetIterator usi = new UnicodeSetIterator(starts);
342             // if so, see which decompositions match
343             while (usi.next()) {
344                 int cp2 = usi.codepoint;
345                 // we know that there are no strings in it
346                 // so we don't have to check CharacterIterator.IS_STRING
347                 Set remainder = extract(cp2, segment, i, workingBuffer);
348                 if (remainder == null) continue;
349
350                 // there were some matches, so add all the possibilities to the set.
351                 String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
352                 Iterator it = remainder.iterator();
353                 while (it.hasNext()) {
354                     String item = (String) it.next();
355                     if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
356                     result.add(prefix + item);
357                 }
358             }
359         }
360         return result;
361         */

362     }
363
364     /**
365      * See if the decomposition of cp2 is at segment starting at segmentPos
366      * (with canonical rearrangment!)
367      * If so, take the remainder, and return the equivalents
368      */

369     private Set extract(int comp, String JavaDoc segment, int segmentPos, StringBuffer JavaDoc buffer) {
370         if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
371             + ", " + Utility.hex(segment.substring(segmentPos)));
372
373         //String decomp = Normalizer.normalize(UTF16.valueOf(comp), Normalizer.DECOMP, 0);
374
String JavaDoc decomp = Normalizer.normalize(comp, Normalizer.NFD);
375
376         // See if it matches the start of segment (at segmentPos)
377
boolean ok = false;
378         int cp;
379         int decompPos = 0;
380         int decompCp = UTF16.charAt(decomp,0);
381         decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
382
//int decompClass = getClass(decompCp);
383
buffer.setLength(0); // initialize working buffer, shared among callees
384

385         for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {
386             cp = UTF16.charAt(segment, i);
387             if (cp == decompCp) { // if equal, eat another cp from decomp
388
if (PROGRESS) System.out.println(" matches: " + Utility.hex(UTF16.valueOf(cp)));
389                 if (decompPos == decomp.length()) { // done, have all decomp characters!
390
buffer.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
391
ok = true;
392                     break;
393                 }
394                 decompCp = UTF16.charAt(decomp, decompPos);
395                 decompPos += UTF16.getCharCount(decompCp);
396                 //decompClass = getClass(decompCp);
397
} else {
398                 if (PROGRESS) System.out.println(" buffer: " + Utility.hex(UTF16.valueOf(cp)));
399                 // brute force approach
400
UTF16.append(buffer, cp);
401                 /* TODO: optimize
402                 // since we know that the classes are monotonically increasing, after zero
403                 // e.g. 0 5 7 9 0 3
404                 // we can do an optimization
405                 // there are only a few cases that work: zero, less, same, greater
406                 // if both classes are the same, we fail
407                 // if the decomp class < the segment class, we fail
408
409                 segClass = getClass(cp);
410                 if (decompClass <= segClass) return null;
411                 */

412             }
413         }
414         if (!ok) return null; // we failed, characters left over
415
if (PROGRESS) System.out.println("Matches");
416         if (buffer.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder
417
String JavaDoc remainder = buffer.toString();
418
419         // brute force approach
420
// to check to make sure result is canonically equivalent
421
/*
422         String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
423         if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
424         */

425
426         if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;
427
428         // get the remaining combinations
429
return getEquivalents2(remainder);
430     }
431
432     /*
433     // TODO: fix once we have a codepoint interface to get the canonical combining class
434     // TODO: Need public access to canonical combining class in UCharacter!
435     private static int getClass(int cp) {
436         return Normalizer.getClass((char)cp);
437     }
438     */

439
440    // ================= BUILDER =========================
441
// TODO: Flatten this data so it doesn't have to be reconstructed each time!
442

443     private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
444
private static final Set SET_WITH_NULL_STRING = new HashSet(); // constant, don't change
445
static {
446         SET_WITH_NULL_STRING.add("");
447     }
448
449   // private static UnicodeSet SAFE_START = new UnicodeSet();
450
// private static CharMap AT_START = new CharMap();
451

452         // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
453
// Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
454
// private static int LAST_UNICODE = 0x10FFFF;
455
/*
456     static {
457         buildData();
458     }
459     */

460     /*
461     private static void buildData() {
462
463         if (PROGRESS) System.out.println("Getting Safe Start");
464         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
465             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
466             int cc = UCharacter.getCombiningClass(cp);
467             if (cc == 0) SAFE_START.add(cp);
468             // will fix to be really safe below
469         }
470         if (PROGRESS) System.out.println();
471
472         if (PROGRESS) System.out.println("Getting Containment");
473         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
474             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
475
476             if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
477
478             //String istr = UTF16.valueOf(cp);
479             String decomp = Normalizer.normalize(cp, Normalizer.NFD);
480             //if (decomp.equals(istr)) continue;
481
482             // add each character in the decomposition to canBeIn
483
484             int component;
485             for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) {
486                 component = UTF16.charAt(decomp, i);
487                 if (i == 0) {
488                     AT_START.add(component, cp);
489                 } else if (UCharacter.getCombiningClass(component) == 0) {
490                     SAFE_START.remove(component);
491                 }
492             }
493         }
494         if (PROGRESS) System.out.println();
495     }
496         // the following is just for a map from characters to a set of characters
497
498     private static class CharMap {
499         Map storage = new HashMap();
500         MutableInt probe = new MutableInt();
501         boolean converted = false;
502
503         public void add(int cp, int whatItIsIn) {
504             UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
505             if (result == null) {
506                 result = new UnicodeSet();
507                 storage.put(probe, result);
508             }
509             result.add(whatItIsIn);
510         }
511
512         public UnicodeSet get(int cp) {
513             return (UnicodeSet) storage.get(probe.set(cp));
514         }
515     }
516
517     private static class MutableInt {
518         public int contents;
519         public int hashCode() { return contents; }
520         public boolean equals(Object other) {
521             return ((MutableInt)other).contents == contents;
522         }
523         // allows chaining
524         public MutableInt set(int contents) {
525             this.contents = contents;
526             return this;
527         }
528     }
529     */

530
531 }
532
Popular Tags