TransliterationRuleSet


1   /*
2    *******************************************************************************
3    * Copyright (C) 1996-2004, International Business Machines Corporation and    *
4    * others. All Rights Reserved.                                                *
5    *******************************************************************************
6    */
7   package com.ibm.icu.text;
8   
9   import java.util.*;
10  import com.ibm.icu.impl.UtilityExtensions;
11  
12  /**
13   * A set of rules for a <code>RuleBasedTransliterator</code>.  This set encodes
14   * the transliteration in one direction from one set of characters or short
15   * strings to another.  A <code>RuleBasedTransliterator</code> consists of up to
16   * two such sets, one for the forward direction, and one for the reverse.
17   *
18   * <p>A <code>TransliterationRuleSet</code> has one important operation, that of
19   * finding a matching rule at a given point in the text.  This is accomplished
20   * by the <code>findMatch()</code> method.
21   *
22   * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
23   *
24   * @author Alan Liu
25   */
26  class TransliterationRuleSet {
27      /**
28       * Vector of rules, in the order added.
29       */
30      private Vector ruleVector;
31  
32      /**
33       * Length of the longest preceding context
34       */
35      private int maxContextLength;
36  
37      /**
38       * Sorted and indexed table of rules.  This is created by freeze() from
39       * the rules in ruleVector.  rules.length >= ruleVector.size(), and the
40       * references in rules[] are aliases of the references in ruleVector.
41       * A single rule in ruleVector is listed one or more times in rules[].
42       */
43      private TransliterationRule[] rules;
44  
45      /**
46       * Index table.  For text having a first character c, compute x = c&0xFF.
47       * Now use rules[index[x]..index[x+1]-1].  This index table is created by
48       * freeze().
49       */
50      private int[] index;
51  
52      private static final String   COPYRIGHT =
53          "\u00A9 IBM Corporation 1999-2001. All rights reserved.";
54  
55      /**
56       * Construct a new empty rule set.
57       */
58      public TransliterationRuleSet() {
59          ruleVector = new Vector();
60          maxContextLength = 0;
61      }
62  
63      /**
64       * Return the maximum context length.
65       * @return the length of the longest preceding context.
66       */
67      public int getMaximumContextLength() {
68          return maxContextLength;
69      }
70  
71      /**
72       * Add a rule to this set.  Rules are added in order, and order is
73       * significant.
74       * @param rule the rule to add
75       */
76      public void addRule(TransliterationRule rule) {
77          ruleVector.addElement(rule);
78          int len;
79          if ((len = rule.getAnteContextLength()) > maxContextLength) {
80              maxContextLength = len;
81          }
82  
83          rules = null;
84      }
85  
86      /**
87       * Close this rule set to further additions, check it for masked rules,
88       * and index it to optimize performance.
89       * @exception IllegalArgumentException if some rules are masked
90       */
91      public void freeze() {
92          /* Construct the rule array and index table.  We reorder the
93           * rules by sorting them into 256 bins.  Each bin contains all
94           * rules matching the index value for that bin.  A rule
95           * matches an index value if string whose first key character
96           * has a low byte equal to the index value can match the rule.
97           *
98           * Each bin contains zero or more rules, in the same order
99           * they were found originally.  However, the total rules in
100          * the bins may exceed the number in the original vector,
101          * since rules that have a variable as their first key
102          * character will generally fall into more than one bin.
103          *
104          * That is, each bin contains all rules that either have that
105          * first index value as their first key character, or have
106          * a set containing the index value as their first character.
107          */
108         int n = ruleVector.size();
109         index = new int[257]; // [sic]
110         Vector v = new Vector(2*n); // heuristic; adjust as needed
111 
112         /* Precompute the index values.  This saves a LOT of time.
113          */
114         int[] indexValue = new int[n];
115         for (int j=0; j<n; ++j) {
116             TransliterationRule r = (TransliterationRule) ruleVector.elementAt(j);
117             indexValue[j] = r.getIndexValue();
118         }
119         for (int x=0; x<256; ++x) {
120             index[x] = v.size();
121             for (int j=0; j<n; ++j) {
122                 if (indexValue[j] >= 0) {
123                     if (indexValue[j] == x) {
124                         v.addElement(ruleVector.elementAt(j));
125                     }
126                 } else {
127                     // If the indexValue is < 0, then the first key character is
128                     // a set, and we must use the more time-consuming
129                     // matchesIndexValue check.  In practice this happens
130                     // rarely, so we seldom tread this code path.
131                     TransliterationRule r = (TransliterationRule) ruleVector.elementAt(j);
132                     if (r.matchesIndexValue(x)) {
133                         v.addElement(r);
134                     }
135                 }
136             }
137         }
138         index[256] = v.size();
139 
140         /* Freeze things into an array.
141          */
142         rules = new TransliterationRule[v.size()];
143         v.copyInto(rules);
144 
145         StringBuffer   errors = null;
146 
147         /* Check for masking.  This is MUCH faster than our old check,
148          * which was each rule against each following rule, since we
149          * only have to check for masking within each bin now.  It's
150          * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
151          * count, and n2 is the per-bin rule count.  But n2<<n1, so
152          * it's a big win.
153          */
154         for (int x=0; x<256; ++x) {
155             for (int j=index[x]; j<index[x+1]-1; ++j) {
156                 TransliterationRule r1 = rules[j];
157                 for (int k=j+1; k<index[x+1]; ++k) {
158                     TransliterationRule r2 = rules[k];
159                     if (r1.masks(r2)) {
160                         if (errors == null) {
161                             errors = new StringBuffer  ();
162                         } else {
163                             errors.append("\n");
164                         }
165                         errors.append("Rule " + r1 + " masks " + r2);
166                     }
167                 }
168             }
169         }
170 
171         if (errors != null) {
172             throw new IllegalArgumentException  (errors.toString());
173         }
174     }
175 
176     /**
177      * Transliterate the given text with the given UTransPosition
178      * indices.  Return TRUE if the transliteration should continue
179      * or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
180      * Note that FALSE is only ever returned if isIncremental is TRUE.
181      * @param text the text to be transliterated
182      * @param pos the position indices, which will be updated
183      * @param incremental if TRUE, assume new text may be inserted
184      * at index.limit, and return FALSE if thre is a partial match.
185      * @return TRUE unless a U_PARTIAL_MATCH has been obtained,
186      * indicating that transliteration should stop until more text
187      * arrives.
188      */
189     public boolean transliterate(Replaceable text,
190                                  Transliterator.Position pos,
191                                  boolean incremental) {
192         int indexByte = text.char32At(pos.start) & 0xFF;
193         for (int i=index[indexByte]; i<index[indexByte+1]; ++i) {
194             int m = rules[i].matchAndReplace(text, pos, incremental);
195             switch (m) {
196             case UnicodeMatcher.U_MATCH:
197                 if (Transliterator.DEBUG) {
198                     System.out.println((incremental ? "Rule.i: match ":"Rule: match ") +
199                                        rules[i].toRule(true) + " => " +
200                                        UtilityExtensions.formatInput(text, pos));
201                 }
202                 return true;
203             case UnicodeMatcher.U_PARTIAL_MATCH:
204                 if (Transliterator.DEBUG) {
205                     System.out.println((incremental ? "Rule.i: partial match ":"Rule: partial match ") +
206                                        rules[i].toRule(true) + " => " +
207                                        UtilityExtensions.formatInput(text, pos));
208                 }
209                 return false;
210             }
211         }
212         // No match or partial match from any rule
213         pos.start += UTF16.getCharCount(text.char32At(pos.start));
214         if (Transliterator.DEBUG) {
215             System.out.println((incremental ? "Rule.i: no match => ":"Rule: no match => ") +
216                                UtilityExtensions.formatInput(text, pos));
217         }
218         return true;
219     }
220 
221     /**
222      * Create rule strings that represents this rule set.
223      */
224     String   toRules(boolean escapeUnprintable) {
225         int i;
226         int count = ruleVector.size();
227         StringBuffer   ruleSource = new StringBuffer  ();
228         for (i=0; i<count; ++i) {
229             if (i != 0) {
230                 ruleSource.append('\n');
231             }
232             TransliterationRule r =
233                 (TransliterationRule) ruleVector.elementAt(i);
234             ruleSource.append(r.toRule(escapeUnprintable));
235         }
236         return ruleSource.toString();
237     }
238 
239     /**
240      * Return the set of all characters that may be modified (getTarget=false)
241      * or emitted (getTarget=true) by this set.
242      */
243     UnicodeSet getSourceTargetSet(boolean getTarget) {
244         UnicodeSet set = new UnicodeSet();
245         int count = ruleVector.size();
246         for (int i=0; i<count; ++i) {
247             TransliterationRule r =
248                 (TransliterationRule) ruleVector.elementAt(i);
249             if (getTarget) {
250                 r.addTargetSetTo(set);
251             } else {
252                 r.addSourceSetTo(set);
253             }
254         }
255         return set;
256     }
257 }
258
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags