CanonicalIterator


1   /*
2    *******************************************************************************
3    * Copyright (C) 1996-2006, International Business Machines Corporation and    *
4    * others. All Rights Reserved.                                                *
5    *******************************************************************************
6    */
7   package com.ibm.icu.text;
8   
9   import com.ibm.icu.lang.*;
10  import java.util.*;
11  import com.ibm.icu.impl.NormalizerImpl;
12  import com.ibm.icu.impl.USerializedSet;
13  import com.ibm.icu.impl.Utility;
14  
15  /**
16   * This class allows one to iterate through all the strings that are canonically equivalent to a given
17   * string. For example, here are some sample results:
18   * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
19   * <pre>
20   1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
21   2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
22   3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
23   4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
24   5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
25   6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
26   7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
27   8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
28   9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA}
29  10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE}
30  11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA}
31  12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE}
32   *</pre>
33   *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
34   * since it has not been optimized for that situation.
35   * @author M. Davis
36   * @stable ICU 2.4
37   */
38  
39  public final class CanonicalIterator {
40      /**
41       * Construct a CanonicalIterator object
42       * @param source string to get results for
43       * @stable ICU 2.4
44       */
45      public CanonicalIterator(String   source) {
46          setSource(source);
47      }
48  
49      /**
50       * Gets the NFD form of the current source we are iterating over.
51       * @return gets the source: NOTE: it is the NFD form of the source originally passed in
52       * @stable ICU 2.4
53       */
54      public String   getSource() {
55        return source;
56      }
57  
58      /**
59       * Resets the iterator so that one can start again from the beginning.
60       * @stable ICU 2.4
61       */
62      public void reset() {
63          done = false;
64          for (int i = 0; i < current.length; ++i) {
65              current[i] = 0;
66          }
67      }
68  
69      /**
70       * Get the next canonically equivalent string.
71       * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
72       * @return the next string that is canonically equivalent. The value null is returned when
73       * the iteration is done.
74       * @stable ICU 2.4
75       */
76      public String   next() {
77          if (done) return null;
78  
79          // construct return value
80  
81          buffer.setLength(0); // delete old contents
82          for (int i = 0; i < pieces.length; ++i) {
83              buffer.append(pieces[i][current[i]]);
84          }
85          String   result = buffer.toString();
86  
87          // find next value for next time
88  
89          for (int i = current.length - 1; ; --i) {
90              if (i < 0) {
91                  done = true;
92                  break;
93              }
94              current[i]++;
95              if (current[i] < pieces[i].length) break; // got sequence
96              current[i] = 0;
97          }
98          return result;
99      }
100 
101     /**
102      * Set a new source for this iterator. Allows object reuse.
103      * @param newSource the source string to iterate against. This allows the same iterator to be used
104      * while changing the source string, saving object creation.
105      * @stable ICU 2.4
106      */
107     public void setSource(String   newSource) {
108         source = Normalizer.normalize(newSource, Normalizer.NFD);
109         done = false;
110 
111         // catch degenerate case
112         if (newSource.length() == 0) {
113             pieces = new String  [1][];
114             current = new int[1];
115             pieces[0] = new String  []{""};
116             return;
117         }
118 
119         // find the segments
120         List segmentList = new ArrayList();
121         int cp;
122         int start = 0;
123 
124         // i should be the end of the first code point
125         // break up the string into segements
126 
127         int i = UTF16.findOffsetFromCodePoint(source, 1);
128 
129         for (; i < source.length(); i += UTF16.getCharCount(cp)) {
130             cp = UTF16.charAt(source, i);
131             if (NormalizerImpl.isCanonSafeStart(cp)) {
132                 segmentList.add(source.substring(start, i)); // add up to i
133                 start = i;
134             }
135         }
136         segmentList.add(source.substring(start, i)); // add last one
137 
138         // allocate the arrays, and find the strings that are CE to each segment
139         pieces = new String  [segmentList.size()][];
140         current = new int[segmentList.size()];
141         for (i = 0; i < pieces.length; ++i) {
142             if (PROGRESS) System.out.println("SEGMENT");
143             pieces[i] = getEquivalents((String  ) segmentList.get(i));
144         }
145     }
146 
147     /**
148      * Simple implementation of permutation.
149      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
150      * @param source the string to find permutations for
151      * @param skipZeros set to true to skip characters with canonical combining class zero
152      * @param output the set to add the results to
153      * @internal
154      * @deprecated This API is ICU internal only.
155      */
156     public static void permute(String   source, boolean skipZeros, Set output) {
157         // TODO: optimize
158         //if (PROGRESS) System.out.println("Permute: " + source);
159 
160         // optimization:
161         // if zero or one character, just return a set with it
162         // we check for length < 2 to keep from counting code points all the time
163         if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
164             output.add(source);
165             return;
166         }
167 
168         // otherwise iterate through the string, and recursively permute all the other characters
169         Set subpermute = new HashSet();
170         int cp;
171         for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
172             cp = UTF16.charAt(source, i);
173 
174             // optimization:
175             // if the character is canonical combining class zero,
176             // don't permute it
177             if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) {
178                 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
179                 continue;
180             }
181 
182             // see what the permutations of the characters before and after this one are
183             subpermute.clear();
184             permute(source.substring(0,i)
185                 + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);
186 
187             // prefix this character to all of them
188             String   chStr = UTF16.valueOf(source, i);
189             Iterator it = subpermute.iterator();
190             while (it.hasNext()) {
191                 String   piece = chStr + (String  ) it.next();
192                 //if (PROGRESS) System.out.println("  Piece: " + piece);
193                 output.add(piece);
194             }
195         }
196     }
197 
198     // FOR TESTING
199 
200     /*
201      *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
202      *@internal
203      *
204     public static UnicodeSet getSafeStart() {
205         return (UnicodeSet) SAFE_START.clone();
206     }
207     */
208     /*
209      *@return the set of characters whose decompositions start with the given character
210      *@internal
211      *
212     public static UnicodeSet getStarts(int cp) {
213         UnicodeSet result = AT_START.get(cp);
214         if (result == null) result = EMPTY;
215         return (UnicodeSet) result.clone();
216     }
217     */
218 
219     // ===================== PRIVATES ==============================
220 
221     // debug
222     private static boolean PROGRESS = false; // debug progress
223     //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
224     private static boolean SKIP_ZEROS = true;
225 
226     // fields
227     private String   source;
228     private boolean done;
229     private String  [][] pieces;
230     private int[] current;
231     // Note: C will need two more fields, since arrays there don't have lengths
232     // int pieces_length;
233     // int[] pieces_lengths;
234 
235     // transient fields
236     private transient StringBuffer   buffer = new StringBuffer  ();
237 
238 
239     // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
240     private String  [] getEquivalents(String   segment) {
241         Set result = new HashSet();
242         Set basic = getEquivalents2(segment);
243         Set permutations = new HashSet();
244 
245         // now get all the permutations
246         // add only the ones that are canonically equivalent
247         // TODO: optimize by not permuting any class zero.
248         Iterator it = basic.iterator();
249         while (it.hasNext()) {
250             String   item = (String  ) it.next();
251             permutations.clear();
252             permute(item, SKIP_ZEROS, permutations);
253             Iterator it2 = permutations.iterator();
254             while (it2.hasNext()) {
255                 String   possible = (String  ) it2.next();
256 
257 /*
258                 String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
259                 if (attempt.equals(segment)) {
260 */
261                 if (Normalizer.compare(possible, segment,0)==0) {
262 
263                     if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible));
264                     result.add(possible);
265 
266                 } else {
267                     if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible));
268                 }
269             }
270         }
271 
272         // convert into a String[] to clean up storage
273         String  [] finalResult = new String  [result.size()];
274         result.toArray(finalResult);
275         return finalResult;
276     }
277 
278 
279     private Set getEquivalents2(String   segment) {
280 
281         Set result = new HashSet();
282 
283         if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment));
284 
285         result.add(segment);
286         StringBuffer   workingBuffer = new StringBuffer  ();
287 
288         // cycle through all the characters
289         int cp=0;
290         int[] range = new int[2];
291         for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
292 
293             // see if any character is at the start of some decomposition
294             cp = UTF16.charAt(segment, i);
295             USerializedSet starts = new USerializedSet();
296 
297             if (!NormalizerImpl.getCanonStartSet(cp, starts)) {
298               continue;
299             }
300             int j=0;
301             // if so, see which decompositions match
302             int rangeCount = starts.countRanges();
303             for(j = 0; j < rangeCount; ++j) {
304                 starts.getRange(j, range);
305                 int end=range[1];
306                 for (int cp2 = range[0]; cp2 <= end; ++cp2) {
307                     Set remainder = extract(cp2, segment, i, workingBuffer);
308                     if (remainder == null) continue;
309     
310                     // there were some matches, so add all the possibilities to the set.
311                     String   prefix= segment.substring(0,i);
312                     prefix += UTF16.valueOf(cp2);
313                     //int el = -1;
314                     Iterator iter = remainder.iterator();
315                     while (iter.hasNext()) {
316                         String   item = (String  ) iter.next();
317                         String   toAdd = new String  (prefix);
318                         toAdd += item;
319                         result.add(toAdd);
320                         //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));
321                     }
322                 }
323             }
324         }
325         return result;
326         /*
327         Set result = new HashSet();
328         if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
329         result.add(segment);
330         StringBuffer workingBuffer = new StringBuffer();
331 
332         // cycle through all the characters
333         int cp;
334 
335         for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
336             // see if any character is at the start of some decomposition
337             cp = UTF16.charAt(segment, i);
338             NormalizerImpl.getCanonStartSet(c,fillSet)
339             UnicodeSet starts = AT_START.get(cp);
340             if (starts == null) continue;
341             UnicodeSetIterator usi = new UnicodeSetIterator(starts);
342             // if so, see which decompositions match
343             while (usi.next()) {
344                 int cp2 = usi.codepoint;
345                 // we know that there are no strings in it
346                 // so we don't have to check CharacterIterator.IS_STRING
347                 Set remainder = extract(cp2, segment, i, workingBuffer);
348                 if (remainder == null) continue;
349 
350                 // there were some matches, so add all the possibilities to the set.
351                 String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
352                 Iterator it = remainder.iterator();
353                 while (it.hasNext()) {
354                     String item = (String) it.next();
355                     if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
356                     result.add(prefix + item);
357                 }
358             }
359         }
360         return result;
361         */
362     }
363 
364     /**
365      * See if the decomposition of cp2 is at segment starting at segmentPos
366      * (with canonical rearrangment!)
367      * If so, take the remainder, and return the equivalents
368      */
369     private Set extract(int comp, String   segment, int segmentPos, StringBuffer   buffer) {
370         if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
371             + ", " + Utility.hex(segment.substring(segmentPos)));
372 
373         //String decomp = Normalizer.normalize(UTF16.valueOf(comp), Normalizer.DECOMP, 0);
374         String   decomp = Normalizer.normalize(comp, Normalizer.NFD);
375 
376         // See if it matches the start of segment (at segmentPos)
377         boolean ok = false;
378         int cp;
379         int decompPos = 0;
380         int decompCp = UTF16.charAt(decomp,0);
381         decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
382         //int decompClass = getClass(decompCp);
383         buffer.setLength(0); // initialize working buffer, shared among callees
384 
385         for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {
386             cp = UTF16.charAt(segment, i);
387             if (cp == decompCp) { // if equal, eat another cp from decomp
388                 if (PROGRESS) System.out.println("  matches: " + Utility.hex(UTF16.valueOf(cp)));
389                 if (decompPos == decomp.length()) { // done, have all decomp characters!
390                     buffer.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
391                     ok = true;
392                     break;
393                 }
394                 decompCp = UTF16.charAt(decomp, decompPos);
395                 decompPos += UTF16.getCharCount(decompCp);
396                 //decompClass = getClass(decompCp);
397             } else {
398                 if (PROGRESS) System.out.println("  buffer: " + Utility.hex(UTF16.valueOf(cp)));
399                 // brute force approach
400                 UTF16.append(buffer, cp);
401                 /* TODO: optimize
402                 // since we know that the classes are monotonically increasing, after zero
403                 // e.g. 0 5 7 9 0 3
404                 // we can do an optimization
405                 // there are only a few cases that work: zero, less, same, greater
406                 // if both classes are the same, we fail
407                 // if the decomp class < the segment class, we fail
408 
409                 segClass = getClass(cp);
410                 if (decompClass <= segClass) return null;
411                 */
412             }
413         }
414         if (!ok) return null; // we failed, characters left over
415         if (PROGRESS) System.out.println("Matches");
416         if (buffer.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder
417         String   remainder = buffer.toString();
418 
419         // brute force approach
420         // to check to make sure result is canonically equivalent
421         /*
422         String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
423         if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
424         */
425 
426         if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;
427 
428         // get the remaining combinations
429         return getEquivalents2(remainder);
430     }
431 
432     /*
433     // TODO: fix once we have a codepoint interface to get the canonical combining class
434     // TODO: Need public access to canonical combining class in UCharacter!
435     private static int getClass(int cp) {
436         return Normalizer.getClass((char)cp);
437     }
438     */
439 
440    // ================= BUILDER =========================
441     // TODO: Flatten this data so it doesn't have to be reconstructed each time!
442 
443     private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
444     private static final Set SET_WITH_NULL_STRING = new HashSet(); // constant, don't change
445     static {
446         SET_WITH_NULL_STRING.add("");
447     }
448 
449   //  private static UnicodeSet SAFE_START = new UnicodeSet();
450   //  private static CharMap AT_START = new CharMap();
451 
452         // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
453         // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
454   //  private static int LAST_UNICODE = 0x10FFFF;
455     /*
456     static {
457         buildData();
458     }
459     */
460     /*
461     private static void buildData() {
462 
463         if (PROGRESS) System.out.println("Getting Safe Start");
464         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
465             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
466             int cc = UCharacter.getCombiningClass(cp);
467             if (cc == 0) SAFE_START.add(cp);
468             // will fix to be really safe below
469         }
470         if (PROGRESS) System.out.println();
471 
472         if (PROGRESS) System.out.println("Getting Containment");
473         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
474             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
475 
476             if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
477 
478             //String istr = UTF16.valueOf(cp);
479             String decomp = Normalizer.normalize(cp, Normalizer.NFD);
480             //if (decomp.equals(istr)) continue;
481 
482             // add each character in the decomposition to canBeIn
483 
484             int component;
485             for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) {
486                 component = UTF16.charAt(decomp, i);
487                 if (i == 0) {
488                     AT_START.add(component, cp);
489                 } else if (UCharacter.getCombiningClass(component) == 0) {
490                     SAFE_START.remove(component);
491                 }
492             }
493         }
494         if (PROGRESS) System.out.println();
495     }
496         // the following is just for a map from characters to a set of characters
497 
498     private static class CharMap {
499         Map storage = new HashMap();
500         MutableInt probe = new MutableInt();
501         boolean converted = false;
502 
503         public void add(int cp, int whatItIsIn) {
504             UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
505             if (result == null) {
506                 result = new UnicodeSet();
507                 storage.put(probe, result);
508             }
509             result.add(whatItIsIn);
510         }
511 
512         public UnicodeSet get(int cp) {
513             return (UnicodeSet) storage.get(probe.set(cp));
514         }
515     }
516 
517     private static class MutableInt {
518         public int contents;
519         public int hashCode() { return contents; }
520         public boolean equals(Object other) {
521             return ((MutableInt)other).contents == contents;
522         }
523         // allows chaining
524         public MutableInt set(int contents) {
525             this.contents = contents;
526             return this;
527         }
528     }
529     */
530 
531 }
532
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags