KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > Transliterator


1 /*
2  *******************************************************************************
3  * Copyright (C) 1996-2006, International Business Machines Corporation and *
4  * others. All Rights Reserved. *
5  *******************************************************************************
6  */

7 package com.ibm.icu.text;
8
9 import com.ibm.icu.impl.ICUResourceBundle;
10 import com.ibm.icu.impl.Utility;
11 import com.ibm.icu.impl.UtilityExtensions;
12 import com.ibm.icu.util.CaseInsensitiveString;
13 import com.ibm.icu.util.ULocale;
14 import com.ibm.icu.util.UResourceBundle;
15 import com.ibm.icu.impl.UCharacterProperty;
16
17 import java.text.MessageFormat JavaDoc;
18 import java.util.Enumeration JavaDoc;
19 import java.util.Hashtable JavaDoc;
20 import java.util.Locale JavaDoc;
21 import java.util.MissingResourceException JavaDoc;
22 import java.util.Vector JavaDoc;
23
24 /**
25  * <code>Transliterator</code> is an abstract class that
26  * transliterates text from one format to another. The most common
27  * kind of transliterator is a script, or alphabet, transliterator.
28  * For example, a Russian to Latin transliterator changes Russian text
29  * written in Cyrillic characters to phonetically equivalent Latin
30  * characters. It does not <em>translate</em> Russian to English!
31  * Transliteration, unlike translation, operates on characters, without
32  * reference to the meanings of words and sentences.
33  *
34  * <p>Although script conversion is its most common use, a
35  * transliterator can actually perform a more general class of tasks.
36  * In fact, <code>Transliterator</code> defines a very general API
37  * which specifies only that a segment of the input text is replaced
38  * by new text. The particulars of this conversion are determined
39  * entirely by subclasses of <code>Transliterator</code>.
40  *
41  * <p><b>Transliterators are stateless</b>
42  *
43  * <p><code>Transliterator</code> objects are <em>stateless</em>; they
44  * retain no information between calls to
45  * <code>transliterate()</code>. As a result, threads may share
46  * transliterators without synchronizing them. This might seem to
47  * limit the complexity of the transliteration operation. In
48  * practice, subclasses perform complex transliterations by delaying
49  * the replacement of text until it is known that no other
50  * replacements are possible. In other words, although the
51  * <code>Transliterator</code> objects are stateless, the source text
52  * itself embodies all the needed information, and delayed operation
53  * allows arbitrary complexity.
54  *
55  * <p><b>Batch transliteration</b>
56  *
57  * <p>The simplest way to perform transliteration is all at once, on a
58  * string of existing text. This is referred to as <em>batch</em>
59  * transliteration. For example, given a string <code>input</code>
60  * and a transliterator <code>t</code>, the call
61  *
62  * <blockquote><code>String result = t.transliterate(input);
63  * </code></blockquote>
64  *
65  * will transliterate it and return the result. Other methods allow
66  * the client to specify a substring to be transliterated and to use
67  * {@link Replaceable} objects instead of strings, in order to
68  * preserve out-of-band information (such as text styles).
69  *
70  * <p><b>Keyboard transliteration</b>
71  *
72  * <p>Somewhat more involved is <em>keyboard</em>, or incremental
73  * transliteration. This is the transliteration of text that is
74  * arriving from some source (typically the user's keyboard) one
75  * character at a time, or in some other piecemeal fashion.
76  *
77  * <p>In keyboard transliteration, a <code>Replaceable</code> buffer
78  * stores the text. As text is inserted, as much as possible is
79  * transliterated on the fly. This means a GUI that displays the
80  * contents of the buffer may show text being modified as each new
81  * character arrives.
82  *
83  * <p>Consider the simple <code>RuleBasedTransliterator</code>:
84  *
85  * <blockquote><code>
86  * th&gt;{theta}<br>
87  * t&gt;{tau}
88  * </code></blockquote>
89  *
90  * When the user types 't', nothing will happen, since the
91  * transliterator is waiting to see if the next character is 'h'. To
92  * remedy this, we introduce the notion of a cursor, marked by a '|'
93  * in the output string:
94  *
95  * <blockquote><code>
96  * t&gt;|{tau}<br>
97  * {tau}h&gt;{theta}
98  * </code></blockquote>
99  *
100  * Now when the user types 't', tau appears, and if the next character
101  * is 'h', the tau changes to a theta. This is accomplished by
102  * maintaining a cursor position (independent of the insertion point,
103  * and invisible in the GUI) across calls to
104  * <code>transliterate()</code>. Typically, the cursor will
105  * be coincident with the insertion point, but in a case like the one
106  * above, it will precede the insertion point.
107  *
108  * <p>Keyboard transliteration methods maintain a set of three indices
109  * that are updated with each call to
110  * <code>transliterate()</code>, including the cursor, start,
111  * and limit. These indices are changed by the method, and they are
112  * passed in and out via a Position object. The <code>start</code> index
113  * marks the beginning of the substring that the transliterator will
114  * look at. It is advanced as text becomes committed (but it is not
115  * the committed index; that's the <code>cursor</code>). The
116  * <code>cursor</code> index, described above, marks the point at
117  * which the transliterator last stopped, either because it reached
118  * the end, or because it required more characters to disambiguate
119  * between possible inputs. The <code>cursor</code> can also be
120  * explicitly set by rules in a <code>RuleBasedTransliterator</code>.
121  * Any characters before the <code>cursor</code> index are frozen;
122  * future keyboard transliteration calls within this input sequence
123  * will not change them. New text is inserted at the
124  * <code>limit</code> index, which marks the end of the substring that
125  * the transliterator looks at.
126  *
127  * <p>Because keyboard transliteration assumes that more characters
128  * are to arrive, it is conservative in its operation. It only
129  * transliterates when it can do so unambiguously. Otherwise it waits
130  * for more characters to arrive. When the client code knows that no
131  * more characters are forthcoming, perhaps because the user has
132  * performed some input termination operation, then it should call
133  * <code>finishTransliteration()</code> to complete any
134  * pending transliterations.
135  *
136  * <p><b>Inverses</b>
137  *
138  * <p>Pairs of transliterators may be inverses of one another. For
139  * example, if transliterator <b>A</b> transliterates characters by
140  * incrementing their Unicode value (so "abc" -> "def"), and
141  * transliterator <b>B</b> decrements character values, then <b>A</b>
142  * is an inverse of <b>B</b> and vice versa. If we compose <b>A</b>
143  * with <b>B</b> in a compound transliterator, the result is the
144  * identity transliterator, that is, a transliterator that does not
145  * change its input text.
146  *
147  * The <code>Transliterator</code> method <code>getInverse()</code>
148  * returns a transliterator's inverse, if one exists, or
149  * <code>null</code> otherwise. However, the result of
150  * <code>getInverse()</code> usually will <em>not</em> be a true
151  * mathematical inverse. This is because true inverse transliterators
152  * are difficult to formulate. For example, consider two
153  * transliterators: <b>AB</b>, which transliterates the character 'A'
154  * to 'B', and <b>BA</b>, which transliterates 'B' to 'A'. It might
155  * seem that these are exact inverses, since
156  *
157  * <blockquote>"A" x <b>AB</b> -> "B"<br>
158  * "B" x <b>BA</b> -> "A"</blockquote>
159  *
160  * where 'x' represents transliteration. However,
161  *
162  * <blockquote>"ABCD" x <b>AB</b> -> "BBCD"<br>
163  * "BBCD" x <b>BA</b> -> "AACD"</blockquote>
164  *
165  * so <b>AB</b> composed with <b>BA</b> is not the
166  * identity. Nonetheless, <b>BA</b> may be usefully considered to be
167  * <b>AB</b>'s inverse, and it is on this basis that
168  * <b>AB</b><code>.getInverse()</code> could legitimately return
169  * <b>BA</b>.
170  *
171  * <p><b>IDs and display names</b>
172  *
173  * <p>A transliterator is designated by a short identifier string or
174  * <em>ID</em>. IDs follow the format <em>source-destination</em>,
175  * where <em>source</em> describes the entity being replaced, and
176  * <em>destination</em> describes the entity replacing
177  * <em>source</em>. The entities may be the names of scripts,
178  * particular sequences of characters, or whatever else it is that the
179  * transliterator converts to or from. For example, a transliterator
180  * from Russian to Latin might be named "Russian-Latin". A
181  * transliterator from keyboard escape sequences to Latin-1 characters
182  * might be named "KeyboardEscape-Latin1". By convention, system
183  * entity names are in English, with the initial letters of words
184  * capitalized; user entity names may follow any format so long as
185  * they do not contain dashes.
186  *
187  * <p>In addition to programmatic IDs, transliterator objects have
188  * display names for presentation in user interfaces, returned by
189  * {@link #getDisplayName}.
190  *
191  * <p><b>Factory methods and registration</b>
192  *
193  * <p>In general, client code should use the factory method
194  * <code>getInstance()</code> to obtain an instance of a
195  * transliterator given its ID. Valid IDs may be enumerated using
196  * <code>getAvailableIDs()</code>. Since transliterators are
197  * stateless, multiple calls to <code>getInstance()</code> with the
198  * same ID will return the same object.
199  *
200  * <p>In addition to the system transliterators registered at startup,
201  * user transliterators may be registered by calling
202  * <code>registerInstance()</code> at run time. To register a
203  * transliterator subclass without instantiating it (until it is
204  * needed), users may call <code>registerClass()</code>.
205  *
206  * <p><b>Composed transliterators</b>
207  *
208  * <p>In addition to built-in system transliterators like
209  * "Latin-Greek", there are also built-in <em>composed</em>
210  * transliterators. These are implemented by composing two or more
211  * component transliterators. For example, if we have scripts "A",
212  * "B", "C", and "D", and we want to transliterate between all pairs
213  * of them, then we need to write 12 transliterators: "A-B", "A-C",
214  * "A-D", "B-A",..., "D-A", "D-B", "D-C". If it is possible to
215  * convert all scripts to an intermediate script "M", then instead of
216  * writing 12 rule sets, we only need to write 8: "A~M", "B~M", "C~M",
217  * "D~M", "M~A", "M~B", "M~C", "M~D". (This might not seem like a big
218  * win, but it's really 2<em>n</em> vs. <em>n</em><sup>2</sup> -
219  * <em>n</em>, so as <em>n</em> gets larger the gain becomes
220  * significant. With 9 scripts, it's 18 vs. 72 rule sets, a big
221  * difference.) Note the use of "~" rather than "-" for the script
222  * separator here; this indicates that the given transliterator is
223  * intended to be composed with others, rather than be used as is.
224  *
225  * <p>Composed transliterators can be instantiated as usual. For
226  * example, the system transliterator "Devanagari-Gujarati" is a
227  * composed transliterator built internally as
228  * "Devanagari~InterIndic;InterIndic~Gujarati". When this
229  * transliterator is instantiated, it appears externally to be a
230  * standard transliterator (e.g., getID() returns
231  * "Devanagari-Gujarati").
232  *
233  * <p><b>Subclassing</b>
234  *
235  * <p>Subclasses must implement the abstract method
236  * <code>handleTransliterate()</code>. <p>Subclasses should override
237  * the <code>transliterate()</code> method taking a
238  * <code>Replaceable</code> and the <code>transliterate()</code>
239  * method taking a <code>String</code> and <code>StringBuffer</code>
240  * if the performance of these methods can be improved over the
241  * performance obtained by the default implementations in this class.
242  *
243  * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
244  *
245  * @author Alan Liu
246  * @stable ICU 2.0
247  */

248 public abstract class Transliterator {
249     /**
250      * Direction constant indicating the forward direction in a transliterator,
251      * e.g., the forward rules of a RuleBasedTransliterator. An "A-B"
252      * transliterator transliterates A to B when operating in the forward
253      * direction, and B to A when operating in the reverse direction.
254      * @stable ICU 2.0
255      */

256     public static final int FORWARD = 0;
257
258     /**
259      * Direction constant indicating the reverse direction in a transliterator,
260      * e.g., the reverse rules of a RuleBasedTransliterator. An "A-B"
261      * transliterator transliterates A to B when operating in the forward
262      * direction, and B to A when operating in the reverse direction.
263      * @stable ICU 2.0
264      */

265     public static final int REVERSE = 1;
266
267     /**
268      * Position structure for incremental transliteration. This data
269      * structure defines two substrings of the text being
270      * transliterated. The first region, [contextStart,
271      * contextLimit), defines what characters the transliterator will
272      * read as context. The second region, [start, limit), defines
273      * what characters will actually be transliterated. The second
274      * region should be a subset of the first.
275      *
276      * <p>After a transliteration operation, some of the indices in this
277      * structure will be modified. See the field descriptions for
278      * details.
279      *
280      * <p>contextStart <= start <= limit <= contextLimit
281      *
282      * <p>Note: All index values in this structure must be at code point
283      * boundaries. That is, none of them may occur between two code units
284      * of a surrogate pair. If any index does split a surrogate pair,
285      * results are unspecified.
286      * @stable ICU 2.0
287      */

288     public static class Position {
289
290         /**
291          * Beginning index, inclusive, of the context to be considered for
292          * a transliteration operation. The transliterator will ignore
293          * anything before this index. INPUT/OUTPUT parameter: This parameter
294          * is updated by a transliteration operation to reflect the maximum
295          * amount of antecontext needed by a transliterator.
296          * @stable ICU 2.0
297          */

298         public int contextStart;
299
300         /**
301          * Ending index, exclusive, of the context to be considered for a
302          * transliteration operation. The transliterator will ignore
303          * anything at or after this index. INPUT/OUTPUT parameter: This
304          * parameter is updated to reflect changes in the length of the
305          * text, but points to the same logical position in the text.
306          * @stable ICU 2.0
307          */

308         public int contextLimit;
309
310         /**
311          * Beginning index, inclusive, of the text to be transliteratd.
312          * INPUT/OUTPUT parameter: This parameter is advanced past
313          * characters that have already been transliterated by a
314          * transliteration operation.
315          * @stable ICU 2.0
316          */

317         public int start;
318
319         /**
320          * Ending index, exclusive, of the text to be transliteratd.
321          * INPUT/OUTPUT parameter: This parameter is updated to reflect
322          * changes in the length of the text, but points to the same
323          * logical position in the text.
324          * @stable ICU 2.0
325          */

326         public int limit;
327
328         /**
329          * Constructs a Position object with start, limit,
330          * contextStart, and contextLimit all equal to zero.
331          * @stable ICU 2.0
332          */

333         public Position() {
334             this(0, 0, 0, 0);
335         }
336
337         /**
338          * Constructs a Position object with the given start,
339          * contextStart, and contextLimit. The limit is set to the
340          * contextLimit.
341          * @stable ICU 2.0
342          */

343         public Position(int contextStart, int contextLimit, int start) {
344             this(contextStart, contextLimit, start, contextLimit);
345         }
346
347         /**
348          * Constructs a Position object with the given start, limit,
349          * contextStart, and contextLimit.
350          * @stable ICU 2.0
351          */

352         public Position(int contextStart, int contextLimit,
353                         int start, int limit) {
354             this.contextStart = contextStart;
355             this.contextLimit = contextLimit;
356             this.start = start;
357             this.limit = limit;
358         }
359
360         /**
361          * Constructs a Position object that is a copy of another.
362          * @stable ICU 2.6
363          */

364         public Position(Position pos) {
365             set(pos);
366         }
367
368         /**
369          * Copies the indices of this position from another.
370          * @stable ICU 2.6
371          */

372         public void set(Position pos) {
373             contextStart = pos.contextStart;
374             contextLimit = pos.contextLimit;
375             start = pos.start;
376             limit = pos.limit;
377         }
378
379         /**
380          * Returns true if this Position is equal to the given object.
381          * @stable ICU 2.6
382          */

383         public boolean equals(Object JavaDoc obj) {
384             if (obj instanceof Position) {
385                 Position pos = (Position) obj;
386                 return contextStart == pos.contextStart &&
387                     contextLimit == pos.contextLimit &&
388                     start == pos.start &&
389                     limit == pos.limit;
390             }
391             return false;
392         }
393
394         /**
395          * Returns a string representation of this Position.
396          * @stable ICU 2.6
397          */

398         public String JavaDoc toString() {
399             return "[cs=" + contextStart
400                 + ", s=" + start
401                 + ", l=" + limit
402                 + ", cl=" + contextLimit
403                 + "]";
404         }
405
406         /**
407          * Check all bounds. If they are invalid, throw an exception.
408          * @param length the length of the string this object applies to
409          * @exception IllegalArgumentException if any indices are out
410          * of bounds
411          * @stable ICU 2.0
412          */

413         public final void validate(int length) {
414             if (contextStart < 0 ||
415                 start < contextStart ||
416                 limit < start ||
417                 contextLimit < limit ||
418                 length < contextLimit) {
419                 throw new IllegalArgumentException JavaDoc("Invalid Position {cs=" +
420                                                    contextStart + ", s=" +
421                                                    start + ", l=" +
422                                                    limit + ", cl=" +
423                                                    contextLimit + "}, len=" +
424                                                    length);
425             }
426         }
427     }
428
429     /**
430      * Programmatic name, e.g., "Latin-Arabic".
431      */

432     private String JavaDoc ID;
433
434     /**
435      * This transliterator's filter. Any character for which
436      * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
437      * altered by this transliterator. If <tt>filter</tt> is
438      * <tt>null</tt> then no filtering is applied.
439      */

440     private UnicodeFilter filter;
441
442     private int maximumContextLength = 0;
443
444     /**
445      * System transliterator registry.
446      */

447     private static TransliteratorRegistry registry;
448
449     private static Hashtable JavaDoc displayNameCache;
450
451     /**
452      * Prefix for resource bundle key for the display name for a
453      * transliterator. The ID is appended to this to form the key.
454      * The resource bundle value should be a String.
455      */

456     private static final String JavaDoc RB_DISPLAY_NAME_PREFIX = "%Translit%%";
457
458     /**
459      * Prefix for resource bundle key for the display name for a
460      * transliterator SCRIPT. The ID is appended to this to form the key.
461      * The resource bundle value should be a String.
462      */

463     private static final String JavaDoc RB_SCRIPT_DISPLAY_NAME_PREFIX = "%Translit%";
464
465     /**
466      * Resource bundle key for display name pattern.
467      * The resource bundle value should be a String forming a
468      * MessageFormat pattern, e.g.:
469      * "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
470      */

471     private static final String JavaDoc RB_DISPLAY_NAME_PATTERN = "TransliteratorNamePattern";
472
473     /**
474      * Delimiter between elements in a compound ID.
475      * @internal
476      */

477     static final char ID_DELIM = ';';
478
479     /**
480      * Delimiter before target in an ID.
481      * @internal
482      */

483     static final char ID_SEP = '-';
484
485     /**
486      * Delimiter before variant in an ID.
487      * @internal
488      */

489     static final char VARIANT_SEP = '/';
490
491     /**
492      * To enable debugging output in the Transliterator component, set
493      * DEBUG to true.
494      *
495      * N.B. Make sure to recompile all of the com.ibm.icu.text package
496      * after changing this. Easiest way to do this is 'ant clean
497      * core' ('ant' will NOT pick up the dependency automatically).
498      *
499      * <<This generates a lot of output.>>
500      */

501     static final boolean DEBUG = false;
502
503     private static final String JavaDoc COPYRIGHT =
504         "\u00A9 IBM Corporation 1999. All rights reserved.";
505
506     /**
507      * Default constructor.
508      * @param ID the string identifier for this transliterator
509      * @param filter the filter. Any character for which
510      * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
511      * altered by this transliterator. If <tt>filter</tt> is
512      * <tt>null</tt> then no filtering is applied.
513      * @stable ICU 2.0
514      */

protected Transliterator(String ID, UnicodeFilter filter) {
    // The ID is mandatory; fail fast instead of storing null.
    if (ID == null) {
        throw new NullPointerException();
    }
    this.ID = ID;
    // A null filter is legal and means "no filtering".
    this.filter = filter;
}
522
523     /**
524      * Transliterates a segment of a string, with optional filtering.
525      *
526      * @param text the string to be transliterated
527      * @param start the beginning index, inclusive; <code>0 <= start
528      * <= limit</code>.
529      * @param limit the ending index, exclusive; <code>start <= limit
530      * <= text.length()</code>.
531      * @return The new limit index. The text previously occupying <code>[start,
532      * limit)</code> has been transliterated, possibly to a string of a different
533      * length, at <code>[start, </code><em>new-limit</em><code>)</code>, where
534      * <em>new-limit</em> is the return value. If the input offsets are out of bounds,
535      * the returned value is -1 and the input string remains unchanged.
536      * @stable ICU 2.0
537      */

538     public final int transliterate(Replaceable text, int start, int limit) {
539         if (start < 0 ||
540             limit < start ||
541             text.length() < limit) {
542             return -1;
543         }
544
545         Position pos = new Position(start, limit, start);
546         filteredTransliterate(text, pos, false, true);
547         return pos.limit;
548     }
549
550     /**
551      * Transliterates an entire string in place. Convenience method.
552      * @param text the string to be transliterated
553      * @stable ICU 2.0
554      */

555     public final void transliterate(Replaceable text) {
556         transliterate(text, 0, text.length());
557     }
558
559     /**
560      * Transliterate an entire string and returns the result. Convenience method.
561      *
562      * @param text the string to be transliterated
563      * @return The transliterated text
564      * @stable ICU 2.0
565      */

566     public final String JavaDoc transliterate(String JavaDoc text) {
567         ReplaceableString result = new ReplaceableString(text);
568         transliterate(result);
569         return result.toString();
570     }
571
572     /**
573      * Transliterates the portion of the text buffer that can be
574      * transliterated unambiguously after new text has been inserted,
575      * typically as a result of a keyboard event. The new text in
576      * <code>insertion</code> will be inserted into <code>text</code>
577      * at <code>index.contextLimit</code>, advancing
578      * <code>index.contextLimit</code> by <code>insertion.length()</code>.
579      * Then the transliterator will try to transliterate characters of
580      * <code>text</code> between <code>index.start</code> and
581      * <code>index.contextLimit</code>. Characters before
582      * <code>index.start</code> will not be changed.
583      *
584      * <p>Upon return, values in <code>index</code> will be updated.
585      * <code>index.contextStart</code> will be advanced to the first
586      * character that future calls to this method will read.
587      * <code>index.start</code> and <code>index.contextLimit</code> will
588      * be adjusted to delimit the range of text that future calls to
589      * this method may change.
590      *
591      * <p>Typical usage of this method begins with an initial call
592      * with <code>index.contextStart</code> and <code>index.contextLimit</code>
593      * set to indicate the portion of <code>text</code> to be
594      * transliterated, and <code>index.start == index.contextStart</code>.
595      * Thereafter, <code>index</code> can be used without
596      * modification in future calls, provided that all changes to
597      * <code>text</code> are made via this method.
598      *
599      * <p>This method assumes that future calls may be made that will
600      * insert new text into the buffer. As a result, it only performs
601      * unambiguous transliterations. After the last call to this
602      * method, there may be untransliterated text that is waiting for
603      * more input to resolve an ambiguity. In order to perform these
604      * pending transliterations, clients should call {@link
605      * #finishTransliteration} after the last call to this
606      * method has been made.
607      *
608      * @param text the buffer holding transliterated and untransliterated text
609      * @param index the start and limit of the text, the position
610      * of the cursor, and the start and limit of transliteration.
611      * @param insertion text to be inserted and possibly
612      * transliterated into the translation buffer at
613      * <code>index.contextLimit</code>. If <code>null</code> then no text
614      * is inserted.
615      * @see #handleTransliterate
616      * @exception IllegalArgumentException if <code>index</code>
617      * is invalid
618      * @stable ICU 2.0
619      */

620     public final void transliterate(Replaceable text, Position index,
621                                     String JavaDoc insertion) {
622         index.validate(text.length());
623
624 // int originalStart = index.contextStart;
625
if (insertion != null) {
626             text.replace(index.limit, index.limit, insertion);
627             index.limit += insertion.length();
628             index.contextLimit += insertion.length();
629         }
630
631         if (index.limit > 0 &&
632             UTF16.isLeadSurrogate(text.charAt(index.limit - 1))) {
633             // Oops, there is a dangling lead surrogate in the buffer.
634
// This will break most transliterators, since they will
635
// assume it is part of a pair. Don't transliterate until
636
// more text comes in.
637
return;
638         }
639
640         filteredTransliterate(text, index, true, true);
641
642 // TODO
643
// This doesn't work once we add quantifier support. Need to rewrite
644
// this code to support quantifiers and 'use maximum backup <n>;'.
645
//
646
// index.contextStart = Math.max(index.start - getMaximumContextLength(),
647
// originalStart);
648
}
649
650     /**
651      * Transliterates the portion of the text buffer that can be
652      * transliterated unambiguously after a new character has been
653      * inserted, typically as a result of a keyboard event. This is a
654      * convenience method; see {@link #transliterate(Replaceable,
655      * Transliterator.Position, String)} for details.
656      * @param text the buffer holding transliterated and
657      * untransliterated text
658      * @param index the start and limit of the text, the position
659      * of the cursor, and the start and limit of transliteration.
660      * @param insertion text to be inserted and possibly
661      * transliterated into the translation buffer at
662      * <code>index.limit</code>.
663      * @see #transliterate(Replaceable, Transliterator.Position, String)
664      * @stable ICU 2.0
665      */

666     public final void transliterate(Replaceable text, Position index,
667                                     int insertion) {
668         transliterate(text, index, UTF16.valueOf(insertion));
669     }
670
671     /**
672      * Transliterates the portion of the text buffer that can be
673      * transliterated unambiguously. This is a convenience method; see
674      * {@link #transliterate(Replaceable, Transliterator.Position,
675      * String)} for details.
676      * @param text the buffer holding transliterated and
677      * untransliterated text
678      * @param index the start and limit of the text, the position
679      * of the cursor, and the start and limit of transliteration.
680      * @see #transliterate(Replaceable, Transliterator.Position, String)
681      * @stable ICU 2.0
682      */

683     public final void transliterate(Replaceable text, Position index) {
684         transliterate(text, index, null);
685     }
686
687     /**
688      * Finishes any pending transliterations that were waiting for
689      * more characters. Clients should call this method as the last
690      * call after a sequence of one or more calls to
691      * <code>transliterate()</code>.
692      * @param text the buffer holding transliterated and
693      * untransliterated text.
694      * @param index the array of indices previously passed to {@link
695      * #transliterate}
696      * @stable ICU 2.0
697      */

698     public final void finishTransliteration(Replaceable text,
699                                             Position index) {
700         index.validate(text.length());
701         filteredTransliterate(text, index, false, true);
702     }
703
704     /**
705      * Abstract method that concrete subclasses define to implement
706      * their transliteration algorithm. This method handles both
707      * incremental and non-incremental transliteration. Let
708      * <code>originalStart</code> refer to the value of
709      * <code>pos.start</code> upon entry.
710      *
711      * <ul>
712      * <li>If <code>incremental</code> is false, then this method
713      * should transliterate all characters between
714      * <code>pos.start</code> and <code>pos.limit</code>. Upon return
715      * <code>pos.start</code> must == <code> pos.limit</code>.</li>
716      *
717      * <li>If <code>incremental</code> is true, then this method
718      * should transliterate all characters between
719      * <code>pos.start</code> and <code>pos.limit</code> that can be
720      * unambiguously transliterated, regardless of future insertions
721      * of text at <code>pos.limit</code>. Upon return,
722      * <code>pos.start</code> should be in the range
723      * [<code>originalStart</code>, <code>pos.limit</code>).
724      * <code>pos.start</code> should be positioned such that
725      * characters [<code>originalStart</code>, <code>
726      * pos.start</code>) will not be changed in the future by this
727      * transliterator and characters [<code>pos.start</code>,
728      * <code>pos.limit</code>) are unchanged.</li>
729      * </ul>
730      *
731      * <p>Implementations of this method should also obey the
732      * following invariants:</p>
733      *
734      * <ul>
735      * <li> <code>pos.limit</code> and <code>pos.contextLimit</code>
736      * should be updated to reflect changes in length of the text
737      * between <code>pos.start</code> and <code>pos.limit</code>. The
738      * difference <code> pos.contextLimit - pos.limit</code> should
739      * not change.</li>
740      *
741      * <li><code>pos.contextStart</code> should not change.</li>
742      *
743      * <li>Upon return, neither <code>pos.start</code> nor
744      * <code>pos.limit</code> should be less than
745      * <code>originalStart</code>.</li>
746      *
747      * <li>Text before <code>originalStart</code> and text after
748      * <code>pos.limit</code> should not change.</li>
749      *
750      * <li>Text before <code>pos.contextStart</code> and text after
751      * <code> pos.contextLimit</code> should be ignored.</li>
752      * </ul>
753      *
754      * <p>Subclasses may safely assume that all characters in
755      * [<code>pos.start</code>, <code>pos.limit</code>) are filtered.
756      * In other words, the filter has already been applied by the time
757      * this method is called. See
758      * <code>filteredTransliterate()</code>.
759      *
760      * <p>This method is <b>not</b> for public consumption. Calling
761      * this method directly will transliterate
762      * [<code>pos.start</code>, <code>pos.limit</code>) without
763      * applying the filter. End user code should call <code>
764      * transliterate()</code> instead of this method. Subclass code
765      * should call <code>filteredTransliterate()</code> instead of
766      * this method.<p>
767      *
768      * @param text the buffer holding transliterated and
769      * untransliterated text
770      *
771      * @param pos the indices indicating the start, limit, context
772      * start, and context limit of the text.
773      *
774      * @param incremental if true, assume more text may be inserted at
775      * <code>pos.limit</code> and act accordingly. Otherwise,
776      * transliterate all text between <code>pos.start</code> and
777      * <code>pos.limit</code> and move <code>pos.start</code> up to
778      * <code>pos.limit</code>.
779      *
780      * @see #transliterate
781      * @stable ICU 2.0
782      */

783     protected abstract void handleTransliterate(Replaceable text,
784                                                 Position pos, boolean incremental);
785
786     /**
787      * Top-level transliteration method, handling filtering, incremental and
788      * non-incremental transliteration, and rollback. All transliteration
789      * public API methods eventually call this method with a rollback argument
790      * of TRUE. Other entities may call this method but rollback should be
791      * FALSE.
792      *
793      * <p>If this transliterator has a filter, break up the input text into runs
794      * of unfiltered characters. Pass each run to
795      * <subclass>.handleTransliterate().
796      *
797      * <p>In incremental mode, if rollback is TRUE, perform a special
798      * incremental procedure in which several passes are made over the input
799      * text, adding one character at a time, and committing successful
800      * transliterations as they occur. Unsuccessful transliterations are rolled
801      * back and retried with additional characters to give correct results.
802      *
803      * @param text the text to be transliterated
804      * @param index the position indices
805      * @param incremental if TRUE, then assume more characters may be inserted
806      * at index.limit, and postpone processing to accommodate future incoming
807      * characters
808      * @param rollback if TRUE and if incremental is TRUE, then perform special
809      * incremental processing, as described above, and undo partial
810      * transliterations where necessary. If incremental is FALSE then this
811      * parameter is ignored.
812      */

813     private void filteredTransliterate(Replaceable text,
814                                        Position index,
815                                        boolean incremental,
816                                        boolean rollback) {
817         // Short circuit path for transliterators with no filter in
818
// non-incremental mode.
819
if (filter == null && !rollback) {
820             handleTransliterate(text, index, incremental);
821             return;
822         }
823
824         //----------------------------------------------------------------------
825
// This method processes text in two groupings:
826
//
827
// RUNS -- A run is a contiguous group of characters which are contained
828
// in the filter for this transliterator (filter.contains(ch) == true).
829
// Text outside of runs may appear as context but it is not modified.
830
// The start and limit Position values are narrowed to each run.
831
//
832
// PASSES (incremental only) -- To make incremental mode work correctly,
833
// each run is broken up into n passes, where n is the length (in code
834
// points) of the run. Each pass contains the first n characters. If a
835
// pass is completely transliterated, it is committed, and further passes
836
// include characters after the committed text. If a pass is blocked,
837
// and does not transliterate completely, then this method rolls back
838
// the changes made during the pass, extends the pass by one code point,
839
// and tries again.
840
//----------------------------------------------------------------------
841

842         // globalLimit is the limit value for the entire operation. We
843
// set index.limit to the end of each unfiltered run before
844
// calling handleTransliterate(), so we need to maintain the real
845
// value of index.limit here. After each transliteration, we
846
// update globalLimit for insertions or deletions that have
847
// happened.
848
int globalLimit = index.limit;
849
850         // If there is a non-null filter, then break the input text up. Say the
851
// input text has the form:
852
// xxxabcxxdefxx
853
// where 'x' represents a filtered character (filter.contains('x') ==
854
// false). Then we break this up into:
855
// xxxabc xxdef xx
856
// Each pass through the loop consumes a run of filtered
857
// characters (which are ignored) and a subsequent run of
858
// unfiltered characters (which are transliterated).
859

860         StringBuffer JavaDoc log = null;
861         if (DEBUG) {
862             log = new StringBuffer JavaDoc();
863         }
864
865         for (;;) {
866
867             if (filter != null) {
868                 // Narrow the range to be transliterated to the first run
869
// of unfiltered characters at or after index.start.
870

871                 // Advance past filtered chars
872
int c;
873                 while (index.start < globalLimit &&
874                        !filter.contains(c=text.char32At(index.start))) {
875                     index.start += UTF16.getCharCount(c);
876                 }
877
878                 // Find the end of this run of unfiltered chars
879
index.limit = index.start;
880                 while (index.limit < globalLimit &&
881                        filter.contains(c=text.char32At(index.limit))) {
882                     index.limit += UTF16.getCharCount(c);
883                 }
884             }
885
886             // Check to see if the unfiltered run is empty. This only
887
// happens at the end of the string when all the remaining
888
// characters are filtered.
889
if (index.start == index.limit) {
890                 break;
891             }
892
893             // Is this run incremental? If there is additional
894
// filtered text (if limit < globalLimit) then we pass in
895
// an incremental value of FALSE to force the subclass to
896
// complete the transliteration for this run.
897
boolean isIncrementalRun =
898                 (index.limit < globalLimit ? false : incremental);
899
900             int delta;
901
902             // Implement rollback. To understand the need for rollback,
903
// consider the following transliterator:
904
//
905
// "t" is "a > A;"
906
// "u" is "A > b;"
907
// "v" is a compound of "t; NFD; u" with a filter [:Ll:]
908
//
909
// Now apply "v" to the input text "a". The result is "b". But if
910
// the transliteration is done incrementally, then the NFD holds
911
// things up after "t" has already transformed "a" to "A". When
912
// finishTransliterate() is called, "A" is _not_ processed because
913
// it gets excluded by the [:Ll:] filter, and the end result is "A"
914
// -- incorrect. The problem is that the filter is applied to a
915
// partially-transliterated result, when we only want it to apply to
916
// input text. Although this example describes a compound
917
// transliterator containing NFD and a specific filter, it can
918
// happen with any transliterator which does a partial
919
// transformation in incremental mode into characters outside its
920
// filter.
921
//
922
// To handle this, when in incremental mode we supply characters to
923
// handleTransliterate() in several passes. Each pass adds one more
924
// input character to the input text. That is, for input "ABCD", we
925
// first try "A", then "AB", then "ABC", and finally "ABCD". If at
926
// any point we block (upon return, start < limit) then we roll
927
// back. If at any point we complete the run (upon return start ==
928
// limit) then we commit that run.
929

930             if (rollback && isIncrementalRun) {
931
932                 if (DEBUG) {
933                     log.setLength(0);
934                     System.out.println("filteredTransliterate{"+getID()+"}i: IN=" +
935                                        UtilityExtensions.formatInput(text, index));
936                 }
937
938                 int runStart = index.start;
939                 int runLimit = index.limit;
940                 int runLength = runLimit - runStart;
941
942                 // Make a rollback copy at the end of the string
943
int rollbackOrigin = text.length();
944                 text.copy(runStart, runLimit, rollbackOrigin);
945
946                 // Variables reflecting the commitment of completely
947
// transliterated text. passStart is the runStart, advanced
948
// past committed text. rollbackStart is the rollbackOrigin,
949
// advanced past rollback text that corresponds to committed
950
// text.
951
int passStart = runStart;
952                 int rollbackStart = rollbackOrigin;
953
954                 // The limit for each pass; we advance by one code point with
955
// each iteration.
956
int passLimit = index.start;
957
958                 // Total length, in 16-bit code units, of uncommitted text.
959
// This is the length to be rolled back.
960
int uncommittedLength = 0;
961
962                 // Total delta (change in length) for all passes
963
int totalDelta = 0;
964
965                 // PASS MAIN LOOP -- Start with a single character, and extend
966
// the text by one character at a time. Roll back partial
967
// transliterations and commit complete transliterations.
968
for (;;) {
969                     // Length of additional code point, either one or two
970
int charLength =
971                         UTF16.getCharCount(text.char32At(passLimit));
972                     passLimit += charLength;
973                     if (passLimit > runLimit) {
974                         break;
975                     }
976                     uncommittedLength += charLength;
977
978                     index.limit = passLimit;
979
980                     if (DEBUG) {
981                         log.setLength(0);
982                         log.append("filteredTransliterate{"+getID()+"}i: ");
983                         UtilityExtensions.formatInput(log, text, index);
984                     }
985
986                     // Delegate to subclass for actual transliteration. Upon
987
// return, start will be updated to point after the
988
// transliterated text, and limit and contextLimit will be
989
// adjusted for length changes.
990
handleTransliterate(text, index, true);
991
992                     if (DEBUG) {
993                         log.append(" => ");
994                         UtilityExtensions.formatInput(log, text, index);
995                     }
996
997                     delta = index.limit - passLimit; // change in length
998

999                     // We failed to completely transliterate this pass.
1000
// Roll back the text. Indices remain unchanged; reset
1001
// them where necessary.
1002
if (index.start != index.limit) {
1003                        // Find the rollbackStart, adjusted for length changes
1004
// and the deletion of partially transliterated text.
1005
int rs = rollbackStart + delta - (index.limit - passStart);
1006
1007                        // Delete the partially transliterated text
1008
text.replace(passStart, index.limit, "");
1009
1010                        // Copy the rollback text back
1011
text.copy(rs, rs + uncommittedLength, passStart);
1012
1013                        // Restore indices to their original values
1014
index.start = passStart;
1015                        index.limit = passLimit;
1016                        index.contextLimit -= delta;
1017
1018                        if (DEBUG) {
1019                            log.append(" (ROLLBACK)");
1020                        }
1021                    }
1022
1023                    // We did completely transliterate this pass. Update the
1024
// commit indices to record how far we got. Adjust indices
1025
// for length change.
1026
else {
1027                        // Move the pass indices past the committed text.
1028
passStart = passLimit = index.start;
1029
1030                        // Adjust the rollbackStart for length changes and move
1031
// it past the committed text. All characters we've
1032
// processed to this point are committed now, so zero
1033
// out the uncommittedLength.
1034
rollbackStart += delta + uncommittedLength;
1035                        uncommittedLength = 0;
1036
1037                        // Adjust indices for length changes.
1038
runLimit += delta;
1039                        totalDelta += delta;
1040                    }
1041
1042                    if (DEBUG) {
1043                        System.out.println(Utility.escape(log.toString()));
1044                    }
1045                }
1046
1047                // Adjust overall limit and rollbackOrigin for insertions and
1048
// deletions. Don't need to worry about contextLimit because
1049
// handleTransliterate() maintains that.
1050
rollbackOrigin += totalDelta;
1051                globalLimit += totalDelta;
1052
1053                // Delete the rollback copy
1054
text.replace(rollbackOrigin, rollbackOrigin + runLength, "");
1055
1056                // Move start past committed text
1057
index.start = passStart;
1058            }
1059
1060            else {
1061                // Delegate to subclass for actual transliteration.
1062
if (DEBUG) {
1063                    log.setLength(0);
1064                    log.append("filteredTransliterate{"+getID()+"}: ");
1065                    UtilityExtensions.formatInput(log, text, index);
1066                }
1067
1068                int limit = index.limit;
1069                handleTransliterate(text, index, isIncrementalRun);
1070                delta = index.limit - limit; // change in length
1071

1072                if (DEBUG) {
1073                    log.append(" => ");
1074                    UtilityExtensions.formatInput(log, text, index);
1075                }
1076
1077                // In a properly written transliterator, start == limit after
1078
// handleTransliterate() returns when incremental is false.
1079
// Catch cases where the subclass doesn't do this, and throw
1080
// an exception. (Just pinning start to limit is a bad idea,
1081
// because what's probably happening is that the subclass
1082
// isn't transliterating all the way to the end, and it should
1083
// in non-incremental mode.)
1084
if (!isIncrementalRun && index.start != index.limit) {
1085                    throw new RuntimeException JavaDoc("ERROR: Incomplete non-incremental transliteration by " + getID());
1086                }
1087
1088                // Adjust overall limit for insertions/deletions. Don't need
1089
// to worry about contextLimit because handleTransliterate()
1090
// maintains that.
1091
globalLimit += delta;
1092
1093                if (DEBUG) {
1094                    System.out.println(Utility.escape(log.toString()));
1095                }
1096            }
1097
1098            if (filter == null || isIncrementalRun) {
1099                break;
1100            }
1101
1102            // If we did completely transliterate this
1103
// run, then repeat with the next unfiltered run.
1104
}
1105
1106        // Start is valid where it is. Limit needs to be put back where
1107
// it was, modulo adjustments for deletions/insertions.
1108
index.limit = globalLimit;
1109
1110        if (DEBUG) {
1111            System.out.println("filteredTransliterate{"+getID()+"}: OUT=" +
1112                               UtilityExtensions.formatInput(text, index));
1113        }
1114    }
1115
1116    /**
1117     * Transliterate a substring of text, as specified by index, taking filters
1118     * into account. This method is for subclasses that need to delegate to
1119     * another transliterator, such as CompoundTransliterator.
1120     * @param text the text to be transliterated
1121     * @param index the position indices
1122     * @param incremental if TRUE, then assume more characters may be inserted
1123     * at index.limit, and postpone processing to accommodate future incoming
1124     * characters
1125     * @stable ICU 2.0
1126     */

1127    public void filteredTransliterate(Replaceable text,
1128                                         Position index,
1129                                         boolean incremental) {
1130        filteredTransliterate(text, index, incremental, false);
1131    }
1132
1133    /**
1134     * Returns the length of the longest context required by this transliterator.
1135     * This is <em>preceding</em> context. The default value is zero, but
1136     * subclasses can change this by calling <code>setMaximumContextLength()</code>.
1137     * For example, if a transliterator translates "ddd" (where
1138     * d is any digit) to "555" when preceded by "(ddd)", then the preceding
1139     * context length is 5, the length of "(ddd)".
1140     *
1141     * @return The maximum number of preceding context characters this
1142     * transliterator needs to examine
1143     * @stable ICU 2.0
1144     */

1145    public final int getMaximumContextLength() {
1146        return maximumContextLength;
1147    }
1148
1149    /**
1150     * Method for subclasses to use to set the maximum context length.
1151     * @see #getMaximumContextLength
1152     * @stable ICU 2.0
1153     */

1154    protected void setMaximumContextLength(int a) {
1155        if (a < 0) {
1156            throw new IllegalArgumentException JavaDoc("Invalid context length " + a);
1157        }
1158        maximumContextLength = a;
1159    }
1160
1161    /**
1162     * Returns a programmatic identifier for this transliterator.
1163     * If this identifier is passed to <code>getInstance()</code>, it
1164     * will return this object, if it has been registered.
1165     * @see #registerClass
1166     * @see #getAvailableIDs
1167     * @stable ICU 2.0
1168     */

1169    public final String JavaDoc getID() {
1170        return ID;
1171    }
1172
1173    /**
1174     * Set the programmatic identifier for this transliterator. Only
1175     * for use by subclasses.
1176     * @stable ICU 2.0
1177     */

1178    protected final void setID(String JavaDoc id) {
1179        ID = id;
1180    }
1181
1182    /**
1183     * Returns a name for this transliterator that is appropriate for
1184     * display to the user in the default locale. See {@link
1185     * #getDisplayName(String,Locale)} for details.
1186     * @stable ICU 2.0
1187     */

1188    public final static String JavaDoc getDisplayName(String JavaDoc ID) {
1189        return getDisplayName(ID, ULocale.getDefault());
1190    }
1191
1192    /**
1193     * Returns a name for this transliterator that is appropriate for
1194     * display to the user in the given locale. This name is taken
1195     * from the locale resource data in the standard manner of the
1196     * <code>java.text</code> package.
1197     *
1198     * <p>If no localized names exist in the system resource bundles,
1199     * a name is synthesized using a localized
1200     * <code>MessageFormat</code> pattern from the resource data. The
1201     * arguments to this pattern are an integer followed by one or two
1202     * strings. The integer is the number of strings, either 1 or 2.
1203     * The strings are formed by splitting the ID for this
1204     * transliterator at the first '-'. If there is no '-', then the
1205     * entire ID forms the only string.
1206     * @param inLocale the Locale in which the display name should be
1207     * localized.
1208     * @see java.text.MessageFormat
1209     * @stable ICU 2.0
1210     */

1211    public static String JavaDoc getDisplayName(String JavaDoc id, Locale inLocale) {
1212        return getDisplayName(id, ULocale.forLocale(inLocale));
1213    }
1214
1215    /**
1216     * Returns a name for this transliterator that is appropriate for
1217     * display to the user in the given locale. This name is taken
1218     * from the locale resource data in the standard manner of the
1219     * <code>java.text</code> package.
1220     *
1221     * <p>If no localized names exist in the system resource bundles,
1222     * a name is synthesized using a localized
1223     * <code>MessageFormat</code> pattern from the resource data. The
1224     * arguments to this pattern are an integer followed by one or two
1225     * strings. The integer is the number of strings, either 1 or 2.
1226     * The strings are formed by splitting the ID for this
1227     * transliterator at the first '-'. If there is no '-', then the
1228     * entire ID forms the only string.
1229     * @param inLocale the ULocale in which the display name should be
1230     * localized.
1231     * @see java.text.MessageFormat
1232     * @draft ICU 3.2
1233     * @provisional This API might change or be removed in a future release.
1234     */

1235    public static String JavaDoc getDisplayName(String JavaDoc id, ULocale inLocale) {
1236
1237        // Resource bundle containing display name keys and the
1238
// RB_RULE_BASED_IDS array.
1239
//
1240
//If we ever integrate this with the Sun JDK, the resource bundle
1241
// root will change to sun.text.resources.LocaleElements
1242

1243        ICUResourceBundle bundle = (ICUResourceBundle)UResourceBundle.
1244            getBundleInstance(ICUResourceBundle.ICU_TRANSLIT_BASE_NAME, inLocale);
1245
1246        // Normalize the ID
1247
String JavaDoc stv[] = TransliteratorIDParser.IDtoSTV(id);
1248        if (stv == null) {
1249            // No target; malformed id
1250
return "";
1251        }
1252        String JavaDoc ID = stv[0] + '-' + stv[1];
1253        if (stv[2] != null && stv[2].length() > 0) {
1254            ID = ID + '/' + stv[2];
1255        }
1256
1257        // Use the registered display name, if any
1258
String JavaDoc n = (String JavaDoc) displayNameCache.get(new CaseInsensitiveString(ID));
1259        if (n != null) {
1260            return n;
1261        }
1262
1263        // Use display name for the entire transliterator, if it
1264
// exists.
1265
try {
1266            return bundle.getString(RB_DISPLAY_NAME_PREFIX + ID);
1267        } catch (MissingResourceException JavaDoc e) {}
1268
1269        try {
1270            // Construct the formatter first; if getString() fails
1271
// we'll exit the try block
1272
MessageFormat format = new MessageFormat(
1273                    bundle.getString(RB_DISPLAY_NAME_PATTERN));
1274            // Construct the argument array
1275
Object JavaDoc[] args = new Object JavaDoc[] { new Integer JavaDoc(2), stv[0], stv[1] };
1276
1277            // Use display names for the scripts, if they exist
1278
for (int j=1; j<=2; ++j) {
1279                try {
1280                    args[j] = bundle.getString(RB_SCRIPT_DISPLAY_NAME_PREFIX +
1281                                               (String JavaDoc) args[j]);
1282                } catch (MissingResourceException JavaDoc e) {}
1283            }
1284
1285            // Format it using the pattern in the resource
1286
return (stv[2].length() > 0) ?
1287                (format.format(args) + '/' + stv[2]) :
1288                format.format(args);
1289        } catch (MissingResourceException JavaDoc e2) {}
1290
1291        // We should not reach this point unless there is something
1292
// wrong with the build or the RB_DISPLAY_NAME_PATTERN has
1293
// been deleted from the root RB_LOCALE_ELEMENTS resource.
1294
throw new RuntimeException JavaDoc();
1295    }
1296
1297    /**
1298     * Returns the filter used by this transliterator, or <tt>null</tt>
1299     * if this transliterator uses no filter.
1300     * @stable ICU 2.0
1301     */

1302    public final UnicodeFilter getFilter() {
1303        return filter;
1304    }
1305
1306    /**
1307     * Changes the filter used by this transliterator. If the filter
1308     * is set to <tt>null</tt> then no filtering will occur.
1309     *
1310     * <p>Callers must take care if a transliterator is in use by
1311     * multiple threads. The filter should not be changed by one
1312     * thread while another thread may be transliterating.
1313     * @stable ICU 2.0
1314     */

1315    public void setFilter(UnicodeFilter filter) {
1316        this.filter = filter;
1317    }
1318
1319    /**
1320     * Returns a <code>Transliterator</code> object given its ID.
1321     * The ID must be either a system transliterator ID or a ID registered
1322     * using <code>registerClass()</code>.
1323     *
1324     * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
1325     * @return A <code>Transliterator</code> object with the given ID
1326     * @exception IllegalArgumentException if the given ID is invalid.
1327     * @stable ICU 2.0
1328     */

1329    public static final Transliterator getInstance(String JavaDoc ID) {
1330        return getInstance(ID, FORWARD);
1331    }
1332
1333    /**
1334     * Returns a <code>Transliterator</code> object given its ID.
1335     * The ID must be either a system transliterator ID or a ID registered
1336     * using <code>registerClass()</code>.
1337     *
1338     * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
1339     * @param dir either FORWARD or REVERSE. If REVERSE then the
1340     * inverse of the given ID is instantiated.
1341     * @return A <code>Transliterator</code> object with the given ID
1342     * @exception IllegalArgumentException if the given ID is invalid.
1343     * @see #registerClass
1344     * @see #getAvailableIDs
1345     * @see #getID
1346     * @stable ICU 2.0
1347     */

1348    public static Transliterator getInstance(String JavaDoc ID,
1349                                             int dir) {
1350        StringBuffer JavaDoc canonID = new StringBuffer JavaDoc();
1351        Vector JavaDoc list = new Vector JavaDoc();
1352        UnicodeSet[] globalFilter = new UnicodeSet[1];
1353        if (!TransliteratorIDParser.parseCompoundID(ID, dir, canonID, list, globalFilter)) {
1354            throw new IllegalArgumentException JavaDoc("Invalid ID " + ID);
1355        }
1356
1357        TransliteratorIDParser.instantiateList(list);
1358
1359        // assert(list.size() > 0);
1360
Transliterator t = null;
1361        if (list.size() > 1 || Utility.indexOf(canonID, ";") >= 0) {
1362            // [NOTE: If it's a compoundID, we instantiate a CompoundTransliterator even if it only
1363
// has one child transliterator. This is so that toRules() will return the right thing
1364
// (without any inactive ID), but our main ID still comes out correct. That is, if we
1365
// instantiate "(Lower);Latin-Greek;", we want the rules to come out as "::Latin-Greek;"
1366
// even though the ID is "(Lower);Latin-Greek;".
1367
t = new CompoundTransliterator(list);
1368        }
1369        else {
1370            t = (Transliterator)list.elementAt(0);
1371        }
1372
1373        t.setID(canonID.toString());
1374        if (globalFilter[0] != null) {
1375            t.setFilter(globalFilter[0]);
1376        }
1377        return t;
1378    }
1379
1380    /**
1381     * Create a transliterator from a basic ID. This is an ID
1382     * containing only the forward direction source, target, and
1383     * variant.
1384     * @param id a basic ID of the form S-T or S-T/V.
1385     * @param canonID canonical ID to apply to the result, or
1386     * null to leave the ID unchanged
1387     * @return a newly created Transliterator or null if the ID is
1388     * invalid.
1389     */

1390    static Transliterator getBasicInstance(String JavaDoc id, String JavaDoc canonID) {
1391        StringBuffer JavaDoc s = new StringBuffer JavaDoc();
1392        Transliterator t = registry.get(id, s);
1393        if (s.length() != 0) {
1394            // assert(t==0);
1395
// Instantiate an alias
1396
t = getInstance(s.toString(), FORWARD);
1397        }
1398        if (t != null && canonID != null) {
1399            t.setID(canonID);
1400        }
1401        return t;
1402    }
1403
1404    /**
1405     * Returns a <code>Transliterator</code> object constructed from
1406     * the given rule string. This will be a RuleBasedTransliterator,
1407     * if the rule string contains only rules, or a
1408     * CompoundTransliterator, if it contains ID blocks, or a
1409     * NullTransliterator, if it contains ID blocks which parse as
1410     * empty for the given direction.
1411     * @stable ICU 2.0
1412     */

1413    public static final Transliterator createFromRules(String JavaDoc ID, String JavaDoc rules, int dir) {
1414        Transliterator t = null;
1415
1416        TransliteratorParser parser = new TransliteratorParser();
1417        parser.parse(rules, dir);
1418
1419        // NOTE: The logic here matches that in TransliteratorRegistry.
1420
if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 0) {
1421            t = new NullTransliterator();
1422        }
1423        else if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 1) {
1424            t = new RuleBasedTransliterator(ID, (RuleBasedTransliterator.Data)parser.dataVector.get(0), null);
1425        }
1426        else if (parser.idBlockVector.size() == 1 && parser.dataVector.size() == 0) {
1427            // idBlock, no data -- this is an alias. The ID has
1428
// been munged from reverse into forward mode, if
1429
// necessary, so instantiate the ID in the forward
1430
// direction.
1431
if (parser.compoundFilter != null)
1432                t = getInstance(parser.compoundFilter.toPattern(false) + ";"
1433                        + (String JavaDoc)parser.idBlockVector.get(0));
1434            else
1435                t = getInstance((String JavaDoc)parser.idBlockVector.get(0));
1436
1437
1438            if (t != null) {
1439                t.setID(ID);
1440            }
1441        }
1442        else {
1443            Vector JavaDoc transliterators = new Vector JavaDoc();
1444            int passNumber = 1;
1445
1446            int limit = Math.max(parser.idBlockVector.size(), parser.dataVector.size());
1447            for (int i = 0; i < limit; i++) {
1448                if (i < parser.idBlockVector.size()) {
1449                    String JavaDoc idBlock = (String JavaDoc)parser.idBlockVector.get(i);
1450                    if (idBlock.length() > 0) {
1451                        Transliterator temp = getInstance(idBlock);
1452                        if (!(temp instanceof NullTransliterator))
1453                            transliterators.add(getInstance(idBlock));
1454                    }
1455                }
1456                if (i < parser.dataVector.size()) {
1457                    RuleBasedTransliterator.Data data = (RuleBasedTransliterator.Data)parser.dataVector.get(i);
1458                    transliterators.add(new RuleBasedTransliterator("%Pass" + passNumber++, data, null));
1459                }
1460            }
1461
1462            t = new CompoundTransliterator(transliterators, passNumber - 1);
1463            t.setID(ID);
1464            if (parser.compoundFilter != null) {
1465                t.setFilter(parser.compoundFilter);
1466            }
1467        }
1468
1469        return t;
1470    }
1471
1472    /**
1473     * Returns a rule string for this transliterator.
1474     * @param escapeUnprintable if true, then unprintable characters
1475     * will be converted to escape form backslash-'u' or
1476     * backslash-'U'.
1477     * @stable ICU 2.0
1478     */

1479    public String JavaDoc toRules(boolean escapeUnprintable) {
1480        return baseToRules(escapeUnprintable);
1481    }
1482
1483    /**
1484     * Returns a rule string for this transliterator. This is
1485     * a non-overrideable base class implementation that subclasses
1486     * may call. It simply munges the ID into the correct format,
1487     * that is, "foo" => "::foo".
1488     * @param escapeUnprintable if true, then unprintable characters
1489     * will be converted to escape form backslash-'u' or
1490     * backslash-'U'.
1491     * @stable ICU 2.0
1492     */

1493    protected final String JavaDoc baseToRules(boolean escapeUnprintable) {
1494        // The base class implementation of toRules munges the ID into
1495
// the correct format. That is: foo => ::foo
1496
// KEEP in sync with rbt_pars
1497
if (escapeUnprintable) {
1498            StringBuffer JavaDoc rulesSource = new StringBuffer JavaDoc();
1499            String JavaDoc id = getID();
1500            for (int i=0; i<id.length();) {
1501                int c = UTF16.charAt(id, i);
1502                if (!Utility.escapeUnprintable(rulesSource, c)) {
1503                    UTF16.append(rulesSource, c);
1504                }
1505                i += UTF16.getCharCount(c);
1506            }
1507            rulesSource.insert(0, "::");
1508            rulesSource.append(ID_DELIM);
1509            return rulesSource.toString();
1510        }
1511        return "::" + getID() + ID_DELIM;
1512    }
1513
1514    /**
1515     * Return the elements that make up this transliterator. For
1516     * example, if the transliterator "NFD;Jamo-Latin;Latin-Greek"
1517     * were created, the return value of this method would be an array
1518     * of the three transliterator objects that make up that
1519     * transliterator: [NFD, Jamo-Latin, Latin-Greek].
1520     *
1521     * <p>If this transliterator is not composed of other
1522     * transliterators, then this method will return an array of
1523     * length one containing a reference to this transliterator.
1524     * @return an array of one or more transliterators that make up
1525     * this transliterator
1526     * @stable ICU 3.0
1527     */

1528    public Transliterator[] getElements() {
1529        Transliterator result[];
1530        if (this instanceof CompoundTransliterator) {
1531            CompoundTransliterator cpd = (CompoundTransliterator) this;
1532            result = new Transliterator[cpd.getCount()];
1533            for (int i=0; i<result.length; ++i) {
1534                result[i] = cpd.getTransliterator(i);
1535            }
1536        } else {
1537            result = new Transliterator[] { this };
1538        }
1539        return result;
1540    }
1541
1542    /**
1543     * Returns the set of all characters that may be modified in the
1544     * input text by this Transliterator. This incorporates this
1545     * object's current filter; if the filter is changed, the return
1546     * value of this function will change. The default implementation
1547     * returns an empty set. Some subclasses may override {@link
1548     * #handleGetSourceSet} to return a more precise result. The
1549     * return result is approximate in any case and is intended for
1550     * use by tests, tools, or utilities.
1551     * @see #getTargetSet
1552     * @see #handleGetSourceSet
1553     * @stable ICU 2.2
1554     */

1555    public final UnicodeSet getSourceSet() {
1556        UnicodeSet set = handleGetSourceSet();
1557        if (filter != null) {
1558            UnicodeSet filterSet;
1559            // Most, but not all filters will be UnicodeSets. Optimize for
1560
// the high-runner case.
1561
try {
1562                filterSet = (UnicodeSet) filter;
1563            } catch (ClassCastException JavaDoc e) {
1564                filterSet = new UnicodeSet();
1565                filter.addMatchSetTo(filterSet);
1566            }
1567            set.retainAll(filterSet);
1568        }
1569        return set;
1570    }
1571
1572    /**
1573     * Framework method that returns the set of all characters that
1574     * may be modified in the input text by this Transliterator,
1575     * ignoring the effect of this object's filter. The base class
1576     * implementation returns the empty set. Subclasses that wish to
1577     * implement this should override this method.
1578     * @return the set of characters that this transliterator may
1579     * modify. The set may be modified, so subclasses should return a
1580     * newly-created object.
1581     * @see #getSourceSet
1582     * @see #getTargetSet
1583     * @stable ICU 2.2
1584     */

1585    protected UnicodeSet handleGetSourceSet() {
1586        return new UnicodeSet();
1587    }
1588
1589    /**
1590     * Returns the set of all characters that may be generated as
1591     * replacement text by this transliterator. The default
1592     * implementation returns the empty set. Some subclasses may
1593     * override this method to return a more precise result. The
1594     * return result is approximate in any case and is intended for
1595     * use by tests, tools, or utilities requiring such
1596     * meta-information.
1597     * @see #getSourceSet
1598     * @stable ICU 2.2
1599     */

1600    public UnicodeSet getTargetSet() {
1601        return new UnicodeSet();
1602    }
1603
1604    /**
1605     * Returns this transliterator's inverse. See the class
1606     * documentation for details. This implementation simply inverts
1607     * the two entities in the ID and attempts to retrieve the
1608     * resulting transliterator. That is, if <code>getID()</code>
1609     * returns "A-B", then this method will return the result of
1610     * <code>getInstance("B-A")</code>, or <code>null</code> if that
1611     * call fails.
1612     *
1613     * <p>Subclasses with knowledge of their inverse may wish to
1614     * override this method.
1615     *
1616     * @return a transliterator that is an inverse, not necessarily
1617     * exact, of this transliterator, or <code>null</code> if no such
1618     * transliterator is registered.
1619     * @see #registerClass
1620     * @stable ICU 2.0
1621     */

1622    public final Transliterator getInverse() {
1623        return getInstance(ID, REVERSE);
1624    }
1625
1626    /**
1627     * Registers a subclass of <code>Transliterator</code> with the
1628     * system. This subclass must have a public constructor taking no
1629     * arguments. When that constructor is called, the resulting
1630     * object must return the <code>ID</code> passed to this method if
1631     * its <code>getID()</code> method is called.
1632     *
1633     * @param ID the result of <code>getID()</code> for this
1634     * transliterator
1635     * @param transClass a subclass of <code>Transliterator</code>
1636     * @see #unregister
1637     * @stable ICU 2.0
1638     */

1639    public static void registerClass(String JavaDoc ID, Class JavaDoc transClass, String JavaDoc displayName) {
1640        registry.put(ID, transClass, true);
1641        if (displayName != null) {
1642            displayNameCache.put(new CaseInsensitiveString(ID), displayName);
1643        }
1644    }
1645
1646    /**
1647     * Register a factory object with the given ID. The factory
1648     * method should return a new instance of the given transliterator.
1649     * @param ID the ID of this transliterator
1650     * @param factory the factory object
1651     * @stable ICU 2.0
1652     */

1653    public static void registerFactory(String JavaDoc ID, Factory factory) {
1654        registry.put(ID, factory, true);
1655    }
1656
1657    /**
1658     * Register a Transliterator object with the given ID.
1659     * @param trans the Transliterator object
1660     * @stable ICU 2.2
1661     */

1662    public static void registerInstance(Transliterator trans) {
1663        registry.put(trans.getID(), trans, true);
1664    }
1665
1666    /**
1667     * Register a Transliterator object with the given ID.
1668     * @param trans the Transliterator object
1669     * @param visible true if the ID should be publicly visible, that is, enumerated by <code>getAvailableIDs()</code>
1670     * @internal
1671     */

1672    static void registerInstance(Transliterator trans, boolean visible) {
1673        registry.put(trans.getID(), trans, visible);
1674    }
1675
1676    /**
1677     * Register an ID as an alias of another ID. Instantiating
1678     * alias ID produces the same result as instantiating the original ID.
1679     * This is generally used to create short aliases of compound IDs.
1680     * @param aliasID The new ID being registered.
1681     * @param realID The existing ID that the new ID should be an alias of.
1682     * @draft ICU 3.4.1
1683     * @provisional This API might change or be removed in a future release.
1684     */

1685    public static void registerAlias(String JavaDoc aliasID, String JavaDoc realID) {
1686        registry.put(aliasID, realID, true);
1687    }
1688
1689    /**
1690     * Register two targets as being inverses of one another. For
1691     * example, calling registerSpecialInverse("NFC", "NFD", true) causes
1692     * Transliterator to form the following inverse relationships:
1693     *
1694     * <pre>NFC => NFD
1695     * Any-NFC => Any-NFD
1696     * NFD => NFC
1697     * Any-NFD => Any-NFC</pre>
1698     *
1699     * (Without the special inverse registration, the inverse of NFC
1700     * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but
1701     * that the presence or absence of "Any-" is preserved.
1702     *
1703     * <p>The relationship is symmetrical; registering (a, b) is
1704     * equivalent to registering (b, a).
1705     *
1706     * <p>The relevant IDs must still be registered separately as
1707     * factories or classes.
1708     *
1709     * <p>Only the targets are specified. Special inverses always
1710     * have the form Any-Target1 <=> Any-Target2. The target should
1711     * have canonical casing (the casing desired to be produced when
1712     * an inverse is formed) and should contain no whitespace or other
1713     * extraneous characters.
1714     *
1715     * @param target the target against which to register the inverse
1716     * @param inverseTarget the inverse of target, that is
1717     * Any-target.getInverse() => Any-inverseTarget
1718     * @param bidirectional if true, register the reverse relation
1719     * as well, that is, Any-inverseTarget.getInverse() => Any-target
1720     * @internal
1721     */

1722    static void registerSpecialInverse(String JavaDoc target,
1723                                       String JavaDoc inverseTarget,
1724                                       boolean bidirectional) {
1725        TransliteratorIDParser.registerSpecialInverse(target, inverseTarget, bidirectional);
1726    }
1727
1728    /**
1729     * Unregisters a transliterator or class. This may be either
1730     * a system transliterator or a user transliterator or class.
1731     *
1732     * @param ID the ID of the transliterator or class
1733     * @see #registerClass
1734     * @stable ICU 2.0
1735     */

1736    public static void unregister(String JavaDoc ID) {
1737        displayNameCache.remove(new CaseInsensitiveString(ID));
1738        registry.remove(ID);
1739    }
1740
1741    /**
1742     * Returns an enumeration over the programmatic names of registered
1743     * <code>Transliterator</code> objects. This includes both system
1744     * transliterators and user transliterators registered using
1745     * <code>registerClass()</code>. The enumerated names may be
1746     * passed to <code>getInstance()</code>.
1747     *
1748     * @return An <code>Enumeration</code> over <code>String</code> objects
1749     * @see #getInstance
1750     * @see #registerClass
1751     * @stable ICU 2.0
1752     */

1753    public static final Enumeration JavaDoc getAvailableIDs() {
1754        return registry.getAvailableIDs();
1755    }
1756
1757    /**
1758     * Returns an enumeration over the source names of registered
1759     * transliterators. Source names may be passed to
1760     * getAvailableTargets() to obtain available targets for each
1761     * source.
1762     * @stable ICU 2.0
1763     */

1764    public static final Enumeration JavaDoc getAvailableSources() {
1765        return registry.getAvailableSources();
1766    }
1767
1768    /**
1769     * Returns an enumeration over the target names of registered
1770     * transliterators having a given source name. Target names may
1771     * be passed to getAvailableVariants() to obtain available
1772     * variants for each source and target pair.
1773     * @stable ICU 2.0
1774     */

1775    public static final Enumeration JavaDoc getAvailableTargets(String JavaDoc source) {
1776        return registry.getAvailableTargets(source);
1777    }
1778
1779    /**
1780     * Returns an enumeration over the variant names of registered
1781     * transliterators having a given source name and target name.
1782     * @stable ICU 2.0
1783     */

1784    public static final Enumeration JavaDoc getAvailableVariants(String JavaDoc source,
1785                                                         String JavaDoc target) {
1786        return registry.getAvailableVariants(source, target);
1787    }
1788    private static final String JavaDoc INDEX = "index",
1789                                RB_RULE_BASED_IDS ="RuleBasedTransliteratorIDs";
1790    static {
1791        registry = new TransliteratorRegistry();
1792
1793        // The display name cache starts out empty
1794
displayNameCache = new Hashtable JavaDoc();
1795        /* The following code parses the index table located in
1796         * icu/data/translit/root.txt. The index is an n x 4 table
1797         * that follows this format:
1798         * <id>{
1799         * file{
1800         * resource{"<resource>"}
1801         * direction{"<direction>"}
1802         * }
1803         * }
1804         * <id>{
1805         * internal{
1806         * resource{"<resource>"}
1807         * direction{"<direction"}
1808         * }
1809         * }
1810         * <id>{
1811         * alias{"<getInstanceArg"}
1812         * }
1813         * <id> is the ID of the system transliterator being defined. These
1814         * are public IDs enumerated by Transliterator.getAvailableIDs(),
1815         * unless the second field is "internal".
1816         *
1817         * <resource> is a ResourceReader resource name. Currently these refer
1818         * to file names under com/ibm/text/resources. This string is passed
1819         * directly to ResourceReader, together with <encoding>.
1820         *
1821         * <direction> is either "FORWARD" or "REVERSE".
1822         *
1823         * <getInstanceArg> is a string to be passed directly to
1824         * Transliterator.getInstance(). The returned Transliterator object
1825         * then has its ID changed to <id> and is returned.
1826         *
1827         * The extra blank field on "alias" lines is to make the array square.
1828         */

1829        ICUResourceBundle bundle, transIDs, colBund;
1830        bundle = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_TRANSLIT_BASE_NAME, INDEX);
1831        transIDs = bundle.get(RB_RULE_BASED_IDS);
1832
1833        int row, maxRows;
1834        maxRows = transIDs.getSize();
1835        for (row = 0; row < maxRows; row++) {
1836            colBund = transIDs.get(row);
1837            String JavaDoc ID = colBund.getKey();
1838            ICUResourceBundle res = colBund.get(0);
1839            String JavaDoc type = res.getKey();
1840            if (type.equals("file") || type.equals("internal")) {
1841                // Rest of line is <resource>:<encoding>:<direction>
1842
// pos colon c2
1843
String JavaDoc resString = res.getString("resource");
1844                int dir;
1845                String JavaDoc direction = res.getString("direction");
1846                switch (direction.charAt(0)) {
1847                case 'F':
1848                    dir = FORWARD;
1849                    break;
1850                case 'R':
1851                    dir = REVERSE;
1852                    break;
1853                default:
1854                    throw new RuntimeException JavaDoc("Can't parse direction: " + direction);
1855                }
1856                registry.put(ID,
1857                             resString, // resource
1858
"UTF-16", // encoding
1859
dir,
1860                             !type.equals("internal"));
1861            } else if (type.equals("alias")) {
1862                //'alias'; row[2]=createInstance argument
1863
String JavaDoc resString = res.getString();
1864                registry.put(ID, resString, true);
1865            } else {
1866                // Unknown type
1867
throw new RuntimeException JavaDoc("Unknow type: " + type);
1868            }
1869        }
1870
1871        registerSpecialInverse(NullTransliterator.SHORT_ID, NullTransliterator.SHORT_ID, false);
1872
1873        // Register non-rule-based transliterators
1874
registerClass(NullTransliterator._ID,
1875                      NullTransliterator.class, null);
1876        RemoveTransliterator.register();
1877        EscapeTransliterator.register();
1878        UnescapeTransliterator.register();
1879        LowercaseTransliterator.register();
1880        UppercaseTransliterator.register();
1881        TitlecaseTransliterator.register();
1882        UnicodeNameTransliterator.register();
1883        NameUnicodeTransliterator.register();
1884        NormalizationTransliterator.register();
1885        BreakTransliterator.register();
1886        AnyTransliterator.register(); // do this last!
1887
}
1888
1889    /**
1890     * The factory interface for transliterators. Transliterator
1891     * subclasses can register factory objects for IDs using the
1892     * registerFactory() method of Transliterator. When invoked, the
1893     * factory object will be passed the ID being instantiated. This
1894     * makes it possible to register one factory method to more than
1895     * one ID, or for a factory method to parameterize its result
1896     * based on the variant.
1897     * @stable ICU 2.0
1898     */

1899    public static interface Factory {
1900        /**
1901         * Return a transliterator for the given ID.
1902         * @stable ICU 2.0
1903         */

1904        Transliterator getInstance(String JavaDoc ID);
1905    }
1906}
1907
Popular Tags