KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > CompoundTransliterator


/*
 *******************************************************************************
 * Copyright (C) 1996-2005, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */

package com.ibm.icu.text;

import com.ibm.icu.impl.Utility;
import com.ibm.icu.impl.UtilityExtensions;
import java.util.Vector;

12 /**
13  * A transliterator that is composed of two or more other
14  * transliterator objects linked together. For example, if one
15  * transliterator transliterates from script A to script B, and
16  * another transliterates from script B to script C, the two may be
17  * combined to form a new transliterator from A to C.
18  *
19  * <p>Composed transliterators may not behave as expected. For
20  * example, inverses may not combine to form the identity
21  * transliterator. See the class documentation for {@link
22  * Transliterator} for details.
23  *
24  * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
25  *
26  * @author Alan Liu
27  * @internal
28  */

29 class CompoundTransliterator extends Transliterator {
30
31     private Transliterator[] trans;
32
33     private int numAnonymousRBTs = 0;
34
35     private static final String JavaDoc COPYRIGHT =
36         "\u00A9 IBM Corporation 1999-2001. All rights reserved.";
37
38     /**
39      * Constructs a new compound transliterator given an array of
40      * transliterators. The array of transliterators may be of any
41      * length, including zero or one, however, useful compound
42      * transliterators have at least two components.
43      * @param transliterators array of <code>Transliterator</code>
44      * objects
45      * @param filter the filter. Any character for which
46      * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
47      * altered by this transliterator. If <tt>filter</tt> is
48      * <tt>null</tt> then no filtering is applied.
49      * @internal
50      */

51     public CompoundTransliterator(Transliterator[] transliterators,
52                                   UnicodeFilter filter) {
53         super(joinIDs(transliterators), filter);
54         trans = new Transliterator[transliterators.length];
55         System.arraycopy(transliterators, 0, trans, 0, trans.length);
56         computeMaximumContextLength();
57     }
58
59     /**
60      * Constructs a new compound transliterator given an array of
61      * transliterators. The array of transliterators may be of any
62      * length, including zero or one, however, useful compound
63      * transliterators have at least two components.
64      * @param transliterators array of <code>Transliterator</code>
65      * objects
66      * @internal
67      */

68     public CompoundTransliterator(Transliterator[] transliterators) {
69         this(transliterators, null);
70     }
71
72     /**
73      * Constructs a new compound transliterator.
74      * @param ID compound ID
75      * @param direction either Transliterator.FORWARD or Transliterator.REVERSE
76      * @param filter a global filter for this compound transliterator
77      * or null
78      * @internal
79      */

80     public CompoundTransliterator(String JavaDoc ID, int direction,
81                                   UnicodeFilter filter) {
82         super(ID, filter);
83         init(ID, direction, true);
84     }
85
86     /**
87      * Constructs a new compound transliterator with no filter.
88      * @param ID compound ID
89      * @param direction either Transliterator.FORWARD or Transliterator.REVERSE
90      * @internal
91      */

92     public CompoundTransliterator(String JavaDoc ID, int direction) {
93         this(ID, direction, null);
94     }
95
96     /**
97      * Constructs a new forward compound transliterator with no filter.
98      * @param ID compound ID
99      * @internal
100      */

101     public CompoundTransliterator(String JavaDoc ID) {
102         this(ID, FORWARD, null);
103     }
104
105     /**
106      * Package private constructor for Transliterator from a vector of
107      * transliterators. The caller is responsible for fixing up the
108      * ID.
109      */

110     CompoundTransliterator(Vector JavaDoc list) {
111         this(list, 0);
112     }
113
114     CompoundTransliterator(Vector JavaDoc list, int numAnonymousRBTs) {
115         super("", null);
116         trans = null;
117         init(list, FORWARD, false);
118         this.numAnonymousRBTs = numAnonymousRBTs;
119         // assume caller will fixup ID
120
}
121
122     /**
123      * Finish constructing a transliterator: only to be called by
124      * constructors. Before calling init(), set trans and filter to NULL.
125      * @param id the id containing ';'-separated entries
126      * @param direction either FORWARD or REVERSE
127      * @param idSplitPoint the index into id at which the
128      * splitTrans should be inserted, if there is one, or
129      * -1 if there is none.
130      * @param splitTrans a transliterator to be inserted
131      * before the entry at offset idSplitPoint in the id string. May be
132      * NULL to insert no entry.
133      * @param fixReverseID if TRUE, then reconstruct the ID of reverse
134      * entries by calling getID() of component entries. Some constructors
135      * do not require this because they apply a facade ID anyway.
136      */

137     private void init(String JavaDoc id,
138                       int direction,
139                       boolean fixReverseID) {
140         // assert(trans == 0);
141

142         Vector JavaDoc list = new Vector JavaDoc();
143         UnicodeSet[] compoundFilter = new UnicodeSet[1];
144         StringBuffer JavaDoc regenID = new StringBuffer JavaDoc();
145         if (!TransliteratorIDParser.parseCompoundID(id, direction,
146                  regenID, list, compoundFilter)) {
147             throw new IllegalArgumentException JavaDoc("Invalid ID " + id);
148         }
149
150         TransliteratorIDParser.instantiateList(list);
151
152         init(list, direction, fixReverseID);
153
154         if (compoundFilter[0] != null) {
155             setFilter(compoundFilter[0]);
156         }
157     }
158
159     /**
160      * Finish constructing a transliterator: only to be called by
161      * constructors. Before calling init(), set trans and filter to NULL.
162      * @param list a vector of transliterator objects to be adopted. It
163      * should NOT be empty. The list should be in declared order. That
164      * is, it should be in the FORWARD order; if direction is REVERSE then
165      * the list order will be reversed.
166      * @param direction either FORWARD or REVERSE
167      * @param fixReverseID if TRUE, then reconstruct the ID of reverse
168      * entries by calling getID() of component entries. Some constructors
169      * do not require this because they apply a facade ID anyway.
170      */

171     private void init(Vector JavaDoc list,
172                       int direction,
173                       boolean fixReverseID) {
174         // assert(trans == 0);
175

176         // Allocate array
177
int count = list.size();
178         trans = new Transliterator[count];
179
180         // Move the transliterators from the vector into an array.
181
// Reverse the order if necessary.
182
int i;
183         for (i=0; i<count; ++i) {
184             int j = (direction == FORWARD) ? i : count - 1 - i;
185             trans[i] = (Transliterator) list.elementAt(j);
186         }
187
188         // If the direction is UTRANS_REVERSE then we may need to fix the
189
// ID.
190
if (direction == REVERSE && fixReverseID) {
191             StringBuffer JavaDoc newID = new StringBuffer JavaDoc();
192             for (i=0; i<count; ++i) {
193                 if (i > 0) {
194                     newID.append(ID_DELIM);
195                 }
196                 newID.append(trans[i].getID());
197             }
198             setID(newID.toString());
199         }
200
201         computeMaximumContextLength();
202     }
203
204     /**
205      * Return the IDs of the given list of transliterators, concatenated
206      * with ';' delimiting them. Equivalent to the perlish expression
207      * join(';', map($_.getID(), transliterators).
208      */

209     private static String JavaDoc joinIDs(Transliterator[] transliterators) {
210         StringBuffer JavaDoc id = new StringBuffer JavaDoc();
211         for (int i=0; i<transliterators.length; ++i) {
212             if (i > 0) {
213                 id.append(';');
214             }
215             id.append(transliterators[i].getID());
216         }
217         return id.toString();
218     }
219
220     /**
221      * Returns the number of transliterators in this chain.
222      * @return number of transliterators in this chain.
223      * @internal
224      */

225     public int getCount() {
226         return trans.length;
227     }
228
229     /**
230      * Returns the transliterator at the given index in this chain.
231      * @param index index into chain, from 0 to <code>getCount() - 1</code>
232      * @return transliterator at the given index
233      * @internal
234      */

235     public Transliterator getTransliterator(int index) {
236         return trans[index];
237     }
238
239     /**
240      * Append c to buf, unless buf is empty or buf already ends in c.
241      */

242     private static void _smartAppend(StringBuffer JavaDoc buf, char c) {
243         if (buf.length() != 0 &&
244             buf.charAt(buf.length() - 1) != c) {
245             buf.append(c);
246         }
247     }
248
249     /**
250      * Override Transliterator:
251      * Create a rule string that can be passed to createFromRules()
252      * to recreate this transliterator.
253      * @param escapeUnprintable if TRUE then convert unprintable
254      * character to their hex escape representations, \\uxxxx or
255      * \\Uxxxxxxxx. Unprintable characters are those other than
256      * U+000A, U+0020..U+007E.
257      * @return the rule string
258      * @internal
259      */

260     public String JavaDoc toRules(boolean escapeUnprintable) {
261         // We do NOT call toRules() on our component transliterators, in
262
// general. If we have several rule-based transliterators, this
263
// yields a concatenation of the rules -- not what we want. We do
264
// handle compound RBT transliterators specially -- those for which
265
// compoundRBTIndex >= 0. For the transliterator at compoundRBTIndex,
266
// we do call toRules() recursively.
267
StringBuffer JavaDoc rulesSource = new StringBuffer JavaDoc();
268         if (numAnonymousRBTs >= 1 && getFilter() != null) {
269             // If we are a compound RBT and if we have a global
270
// filter, then emit it at the top.
271
rulesSource.append("::").append(getFilter().toPattern(escapeUnprintable)).append(ID_DELIM);
272         }
273         for (int i=0; i<trans.length; ++i) {
274             String JavaDoc rule;
275
276             // Anonymous RuleBasedTransliterators (inline rules and
277
// ::BEGIN/::END blocks) are given IDs that begin with
278
// "%Pass": use toRules() to write all the rules to the output
279
// (and insert "::Null;" if we have two in a row)
280
if (trans[i].getID().startsWith("%Pass")) {
281                 rule = trans[i].toRules(escapeUnprintable);
282                 if (numAnonymousRBTs > 1 && i > 0 && trans[i - 1].getID().startsWith("%Pass"))
283                     rule = "::Null;" + rule;
284
285             // we also use toRules() on CompoundTransliterators (which we
286
// check for by looking for a semicolon in the ID)-- this gets
287
// the list of their child transliterators output in the right
288
// format
289
} else if (trans[i].getID().indexOf(';') >= 0) {
290                 rule = trans[i].toRules(escapeUnprintable);
291
292             // for everything else, use baseToRules()
293
} else {
294                 rule = trans[i].baseToRules(escapeUnprintable);
295             }
296             _smartAppend(rulesSource, '\n');
297             rulesSource.append(rule);
298             _smartAppend(rulesSource, ID_DELIM);
299         }
300         return rulesSource.toString();
301     }
302
303     /**
304      * Return the set of all characters that may be modified by this
305      * Transliterator, ignoring the effect of our filter.
306      * @internal
307      */

308     protected UnicodeSet handleGetSourceSet() {
309         UnicodeSet set = new UnicodeSet();
310         for (int i=0; i<trans.length; ++i) {
311             set.addAll(trans[i].getSourceSet());
312             // Take the example of Hiragana-Latin. This is really
313
// Hiragana-Katakana; Katakana-Latin. The source set of
314
// these two is roughly [:Hiragana:] and [:Katakana:].
315
// But the source set for the entire transliterator is
316
// actually [:Hiragana:] ONLY -- that is, the first
317
// non-empty source set.
318

319             // This is a heuristic, and not 100% reliable.
320
if (!set.isEmpty()) {
321                 break;
322             }
323         }
324         return set;
325     }
326
327     /**
328      * Returns the set of all characters that may be generated as
329      * replacement text by this transliterator.
330      * @internal
331      */

332     public UnicodeSet getTargetSet() {
333         UnicodeSet set = new UnicodeSet();
334         for (int i=0; i<trans.length; ++i) {
335             // This is a heuristic, and not 100% reliable.
336
set.addAll(trans[i].getTargetSet());
337         }
338         return set;
339     }
340
341     /**
342      * Implements {@link Transliterator#handleTransliterate}.
343      * @internal
344      */

345     protected void handleTransliterate(Replaceable text,
346                                        Position index, boolean incremental) {
347         /* Call each transliterator with the same start value and
348          * initial cursor index, but with the limit index as modified
349          * by preceding transliterators. The cursor index must be
350          * reset for each transliterator to give each a chance to
351          * transliterate the text. The initial cursor index is known
352          * to still point to the same place after each transliterator
353          * is called because each transliterator will not change the
354          * text between start and the initial value of cursor.
355          *
356          * IMPORTANT: After the first transliterator, each subsequent
357          * transliterator only gets to transliterate text committed by
358          * preceding transliterators; that is, the cursor (output
359          * value) of transliterator i becomes the limit (input value)
360          * of transliterator i+1. Finally, the overall limit is fixed
361          * up before we return.
362          *
363          * Assumptions we make here:
364          * (1) contextStart <= start <= limit <= contextLimit <= text.length()
365          * (2) start <= start' <= limit' ;cursor doesn't move back
366          * (3) start <= limit' ;text before cursor unchanged
367          * - start' is the value of start after calling handleKT
368          * - limit' is the value of limit after calling handleKT
369          */

370
371         /**
372          * Example: 3 transliterators. This example illustrates the
373          * mechanics we need to implement. C, S, and L are the contextStart,
374          * start, and limit. gl is the globalLimit. contextLimit is
375          * equal to limit throughout.
376          *
377          * 1. h-u, changes hex to Unicode
378          *
379          * 4 7 a d 0 4 7 a
380          * abc/u0061/u => abca/u
381          * C S L C S L gl=f->a
382          *
383          * 2. upup, changes "x" to "XX"
384          *
385          * 4 7 a 4 7 a
386          * abca/u => abcAA/u
387          * C SL C S
388          * L gl=a->b
389          * 3. u-h, changes Unicode to hex
390          *
391          * 4 7 a 4 7 a d 0 3
392          * abcAA/u => abc/u0041/u0041/u
393          * C S L C S
394          * L gl=b->15
395          * 4. return
396          *
397          * 4 7 a d 0 3
398          * abc/u0041/u0041/u
399          * C S L
400          */

401
402         if (trans.length < 1) {
403             index.start = index.limit;
404             return; // Short circuit for empty compound transliterators
405
}
406
407         // compoundLimit is the limit value for the entire compound
408
// operation. We overwrite index.limit with the previous
409
// index.start. After each transliteration, we update
410
// compoundLimit for insertions or deletions that have happened.
411
int compoundLimit = index.limit;
412
413         // compoundStart is the start for the entire compound
414
// operation.
415
int compoundStart = index.start;
416
417         int delta = 0; // delta in length
418

419         StringBuffer JavaDoc log = null;
420         if (DEBUG) {
421             log = new StringBuffer JavaDoc("CompoundTransliterator{" + getID() +
422                                    (incremental ? "}i: IN=" : "}: IN="));
423             UtilityExtensions.formatInput(log, text, index);
424             System.out.println(Utility.escape(log.toString()));
425         }
426
427         // Give each transliterator a crack at the run of characters.
428
// See comments at the top of the method for more detail.
429
for (int i=0; i<trans.length; ++i) {
430             index.start = compoundStart; // Reset start
431
int limit = index.limit;
432
433             if (index.start == index.limit) {
434                 // Short circuit for empty range
435
if (DEBUG) {
436                     System.out.println("CompoundTransliterator[" + i +
437                                        ".." + (trans.length-1) +
438                                        (incremental ? "]i: " : "]: ") +
439                                        UtilityExtensions.formatInput(text, index) +
440                                        " (NOTHING TO DO)");
441                 }
442                 break;
443             }
444
445             if (DEBUG) {
446                 log.setLength(0);
447                 log.append("CompoundTransliterator[" + i + "=" +
448                            trans[i].getID() +
449                            (incremental ? "]i: " : "]: "));
450                 UtilityExtensions.formatInput(log, text, index);
451             }
452
453             trans[i].filteredTransliterate(text, index, incremental);
454
455             // In a properly written transliterator, start == limit after
456
// handleTransliterate() returns when incremental is false.
457
// Catch cases where the subclass doesn't do this, and throw
458
// an exception. (Just pinning start to limit is a bad idea,
459
// because what's probably happening is that the subclass
460
// isn't transliterating all the way to the end, and it should
461
// in non-incremental mode.)
462
if (!incremental && index.start != index.limit) {
463                 throw new RuntimeException JavaDoc("ERROR: Incomplete non-incremental transliteration by " + trans[i].getID());
464             }
465
466             if (DEBUG) {
467                 log.append(" => ");
468                 UtilityExtensions.formatInput(log, text, index);
469                 System.out.println(Utility.escape(log.toString()));
470             }
471
472             // Cumulative delta for insertions/deletions
473
delta += index.limit - limit;
474
475             if (incremental) {
476                 // In the incremental case, only allow subsequent
477
// transliterators to modify what has already been
478
// completely processed by prior transliterators. In the
479
// non-incrmental case, allow each transliterator to
480
// process the entire text.
481
index.limit = index.start;
482             }
483         }
484
485         compoundLimit += delta;
486
487         // Start is good where it is -- where the last transliterator left
488
// it. Limit needs to be put back where it was, modulo
489
// adjustments for deletions/insertions.
490
index.limit = compoundLimit;
491
492         if (DEBUG) {
493             log.setLength(0);
494             log.append("CompoundTransliterator{" + getID() +
495                        (incremental ? "}i: OUT=" : "}: OUT="));
496             UtilityExtensions.formatInput(log, text, index);
497             System.out.println(Utility.escape(log.toString()));
498         }
499     }
500
501     /**
502      * Compute and set the length of the longest context required by this transliterator.
503      * This is <em>preceding</em> context.
504      */

505     private void computeMaximumContextLength() {
506         int max = 0;
507         for (int i=0; i<trans.length; ++i) {
508             int len = trans[i].getMaximumContextLength();
509             if (len > max) {
510                 max = len;
511             }
512         }
513         setMaximumContextLength(max);
514     }
515 }
516
Popular Tags