KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > TransliterationRuleSet


/*
 *******************************************************************************
 * Copyright (C) 1996-2004, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */

7 package com.ibm.icu.text;
8
9 import java.util.*;
10 import com.ibm.icu.impl.UtilityExtensions;
11
12 /**
13  * A set of rules for a <code>RuleBasedTransliterator</code>. This set encodes
14  * the transliteration in one direction from one set of characters or short
15  * strings to another. A <code>RuleBasedTransliterator</code> consists of up to
16  * two such sets, one for the forward direction, and one for the reverse.
17  *
18  * <p>A <code>TransliterationRuleSet</code> has one important operation, that of
19  * finding a matching rule at a given point in the text. This is accomplished
20  * by the <code>findMatch()</code> method.
21  *
22  * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
23  *
24  * @author Alan Liu
25  */

26 class TransliterationRuleSet {
27     /**
28      * Vector of rules, in the order added.
29      */

30     private Vector ruleVector;
31
32     /**
33      * Length of the longest preceding context
34      */

35     private int maxContextLength;
36
37     /**
38      * Sorted and indexed table of rules. This is created by freeze() from
39      * the rules in ruleVector. rules.length >= ruleVector.size(), and the
40      * references in rules[] are aliases of the references in ruleVector.
41      * A single rule in ruleVector is listed one or more times in rules[].
42      */

43     private TransliterationRule[] rules;
44
45     /**
46      * Index table. For text having a first character c, compute x = c&0xFF.
47      * Now use rules[index[x]..index[x+1]-1]. This index table is created by
48      * freeze().
49      */

50     private int[] index;
51
52     private static final String JavaDoc COPYRIGHT =
53         "\u00A9 IBM Corporation 1999-2001. All rights reserved.";
54
55     /**
56      * Construct a new empty rule set.
57      */

58     public TransliterationRuleSet() {
59         ruleVector = new Vector();
60         maxContextLength = 0;
61     }
62
63     /**
64      * Return the maximum context length.
65      * @return the length of the longest preceding context.
66      */

67     public int getMaximumContextLength() {
68         return maxContextLength;
69     }
70
71     /**
72      * Add a rule to this set. Rules are added in order, and order is
73      * significant.
74      * @param rule the rule to add
75      */

76     public void addRule(TransliterationRule rule) {
77         ruleVector.addElement(rule);
78         int len;
79         if ((len = rule.getAnteContextLength()) > maxContextLength) {
80             maxContextLength = len;
81         }
82
83         rules = null;
84     }
85
86     /**
87      * Close this rule set to further additions, check it for masked rules,
88      * and index it to optimize performance.
89      * @exception IllegalArgumentException if some rules are masked
90      */

91     public void freeze() {
92         /* Construct the rule array and index table. We reorder the
93          * rules by sorting them into 256 bins. Each bin contains all
94          * rules matching the index value for that bin. A rule
95          * matches an index value if string whose first key character
96          * has a low byte equal to the index value can match the rule.
97          *
98          * Each bin contains zero or more rules, in the same order
99          * they were found originally. However, the total rules in
100          * the bins may exceed the number in the original vector,
101          * since rules that have a variable as their first key
102          * character will generally fall into more than one bin.
103          *
104          * That is, each bin contains all rules that either have that
105          * first index value as their first key character, or have
106          * a set containing the index value as their first character.
107          */

108         int n = ruleVector.size();
109         index = new int[257]; // [sic]
110
Vector v = new Vector(2*n); // heuristic; adjust as needed
111

112         /* Precompute the index values. This saves a LOT of time.
113          */

114         int[] indexValue = new int[n];
115         for (int j=0; j<n; ++j) {
116             TransliterationRule r = (TransliterationRule) ruleVector.elementAt(j);
117             indexValue[j] = r.getIndexValue();
118         }
119         for (int x=0; x<256; ++x) {
120             index[x] = v.size();
121             for (int j=0; j<n; ++j) {
122                 if (indexValue[j] >= 0) {
123                     if (indexValue[j] == x) {
124                         v.addElement(ruleVector.elementAt(j));
125                     }
126                 } else {
127                     // If the indexValue is < 0, then the first key character is
128
// a set, and we must use the more time-consuming
129
// matchesIndexValue check. In practice this happens
130
// rarely, so we seldom tread this code path.
131
TransliterationRule r = (TransliterationRule) ruleVector.elementAt(j);
132                     if (r.matchesIndexValue(x)) {
133                         v.addElement(r);
134                     }
135                 }
136             }
137         }
138         index[256] = v.size();
139
140         /* Freeze things into an array.
141          */

142         rules = new TransliterationRule[v.size()];
143         v.copyInto(rules);
144
145         StringBuffer JavaDoc errors = null;
146
147         /* Check for masking. This is MUCH faster than our old check,
148          * which was each rule against each following rule, since we
149          * only have to check for masking within each bin now. It's
150          * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
151          * count, and n2 is the per-bin rule count. But n2<<n1, so
152          * it's a big win.
153          */

154         for (int x=0; x<256; ++x) {
155             for (int j=index[x]; j<index[x+1]-1; ++j) {
156                 TransliterationRule r1 = rules[j];
157                 for (int k=j+1; k<index[x+1]; ++k) {
158                     TransliterationRule r2 = rules[k];
159                     if (r1.masks(r2)) {
160                         if (errors == null) {
161                             errors = new StringBuffer JavaDoc();
162                         } else {
163                             errors.append("\n");
164                         }
165                         errors.append("Rule " + r1 + " masks " + r2);
166                     }
167                 }
168             }
169         }
170
171         if (errors != null) {
172             throw new IllegalArgumentException JavaDoc(errors.toString());
173         }
174     }
175
176     /**
177      * Transliterate the given text with the given UTransPosition
178      * indices. Return TRUE if the transliteration should continue
179      * or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
180      * Note that FALSE is only ever returned if isIncremental is TRUE.
181      * @param text the text to be transliterated
182      * @param pos the position indices, which will be updated
183      * @param incremental if TRUE, assume new text may be inserted
184      * at index.limit, and return FALSE if thre is a partial match.
185      * @return TRUE unless a U_PARTIAL_MATCH has been obtained,
186      * indicating that transliteration should stop until more text
187      * arrives.
188      */

189     public boolean transliterate(Replaceable text,
190                                  Transliterator.Position pos,
191                                  boolean incremental) {
192         int indexByte = text.char32At(pos.start) & 0xFF;
193         for (int i=index[indexByte]; i<index[indexByte+1]; ++i) {
194             int m = rules[i].matchAndReplace(text, pos, incremental);
195             switch (m) {
196             case UnicodeMatcher.U_MATCH:
197                 if (Transliterator.DEBUG) {
198                     System.out.println((incremental ? "Rule.i: match ":"Rule: match ") +
199                                        rules[i].toRule(true) + " => " +
200                                        UtilityExtensions.formatInput(text, pos));
201                 }
202                 return true;
203             case UnicodeMatcher.U_PARTIAL_MATCH:
204                 if (Transliterator.DEBUG) {
205                     System.out.println((incremental ? "Rule.i: partial match ":"Rule: partial match ") +
206                                        rules[i].toRule(true) + " => " +
207                                        UtilityExtensions.formatInput(text, pos));
208                 }
209                 return false;
210             }
211         }
212         // No match or partial match from any rule
213
pos.start += UTF16.getCharCount(text.char32At(pos.start));
214         if (Transliterator.DEBUG) {
215             System.out.println((incremental ? "Rule.i: no match => ":"Rule: no match => ") +
216                                UtilityExtensions.formatInput(text, pos));
217         }
218         return true;
219     }
220
221     /**
222      * Create rule strings that represents this rule set.
223      */

224     String JavaDoc toRules(boolean escapeUnprintable) {
225         int i;
226         int count = ruleVector.size();
227         StringBuffer JavaDoc ruleSource = new StringBuffer JavaDoc();
228         for (i=0; i<count; ++i) {
229             if (i != 0) {
230                 ruleSource.append('\n');
231             }
232             TransliterationRule r =
233                 (TransliterationRule) ruleVector.elementAt(i);
234             ruleSource.append(r.toRule(escapeUnprintable));
235         }
236         return ruleSource.toString();
237     }
238
239     /**
240      * Return the set of all characters that may be modified (getTarget=false)
241      * or emitted (getTarget=true) by this set.
242      */

243     UnicodeSet getSourceTargetSet(boolean getTarget) {
244         UnicodeSet set = new UnicodeSet();
245         int count = ruleVector.size();
246         for (int i=0; i<count; ++i) {
247             TransliterationRule r =
248                 (TransliterationRule) ruleVector.elementAt(i);
249             if (getTarget) {
250                 r.addTargetSetTo(set);
251             } else {
252                 r.addSourceSetTo(set);
253             }
254         }
255         return set;
256     }
257 }
258
Popular Tags