StringMatcher


1   /*
2    *******************************************************************************
3    * Copyright (C) 2001-2004, International Business Machines Corporation and    *
4    * others. All Rights Reserved.                                                *
5    *******************************************************************************
6    */
7   package com.ibm.icu.text;
8   import com.ibm.icu.impl.Utility;
9   
10  /**
11   * An object that matches a fixed input string, implementing the
12   * UnicodeMatcher API.  This object also implements the
13   * UnicodeReplacer API, allowing it to emit the matched text as
14   * output.  Since the match text may contain flexible match elements,
15   * such as UnicodeSets, the emitted text is not the match pattern, but
16   * instead a substring of the actual matched text.  Following
17   * convention, the output text is the leftmost match seen up to this
18   * point.
19   *
20   * A StringMatcher may represent a segment, in which case it has a
21   * positive segment number.  This affects how the matcher converts
22   * itself to a pattern but does not otherwise affect its function.
23   *
24   * A StringMatcher that is not a segment should not be used as a
25   * UnicodeReplacer.
26   */
27  class StringMatcher implements UnicodeMatcher, UnicodeReplacer {
28  
29      /**
30       * The text to be matched.
31       */
32      private String   pattern;
33  
34      /**
35       * Start offset, in the match text, of the <em>rightmost</em>
36       * match.
37       */
38      private int matchStart;
39      
40      /**
41       * Limit offset, in the match text, of the <em>rightmost</em>
42       * match.
43       */
44      private int matchLimit;
45  
46      /**
47       * The segment number, 1-based, or 0 if not a segment.
48       */
49      private int segmentNumber;
50  
51      /**
52       * Context object that maps stand-ins to matcher and replacer
53       * objects.
54       */
55      private final RuleBasedTransliterator.Data data;
56  
57      /**
58       * Construct a matcher that matches the given pattern string.
59       * @param theString the pattern to be matched, possibly containing
60       * stand-ins that represent nested UnicodeMatcher objects.
61       * @param segmentNum the segment number from 1..n, or 0 if this is
62       * not a segment.
63       * @param theData context object mapping stand-ins to
64       * UnicodeMatcher objects.
65       */
66      public StringMatcher(String   theString,
67                           int segmentNum,
68                           RuleBasedTransliterator.Data theData) {
69          data = theData;
70          pattern = theString;
71          matchStart = matchLimit = -1;
72          segmentNumber = segmentNum;
73      }
74  
75      /**
76       * Construct a matcher that matches a substring of the given
77       * pattern string.
78       * @param theString the pattern to be matched, possibly containing
79       * stand-ins that represent nested UnicodeMatcher objects.
80       * @param start first character of theString to be matched
81       * @param limit index after the last character of theString to be
82       * matched.
83       * @param segmentNum the segment number from 1..n, or 0 if this is
84       * not a segment.
85       * @param theData context object mapping stand-ins to
86       * UnicodeMatcher objects.
87       */
88      public StringMatcher(String   theString,
89                           int start,
90                           int limit,
91                           int segmentNum,
92                           RuleBasedTransliterator.Data theData) {
93          this(theString.substring(start, limit), segmentNum, theData);
94      }
95  
96      /**
97       * Implement UnicodeMatcher
98       */
99      public int matches(Replaceable text,
100                        int[] offset,
101                        int limit,
102                        boolean incremental) {
103         // Note (1): We process text in 16-bit code units, rather than
104         // 32-bit code points.  This works because stand-ins are
105         // always in the BMP and because we are doing a literal match
106         // operation, which can be done 16-bits at a time.
107         int i;
108         int[] cursor = new int[] { offset[0] };
109         if (limit < cursor[0]) {
110             // Match in the reverse direction
111             for (i=pattern.length()-1; i>=0; --i) {
112                 char keyChar = pattern.charAt(i); // OK; see note (1) above
113                 UnicodeMatcher subm = data.lookupMatcher(keyChar);
114                 if (subm == null) {
115                     if (cursor[0] > limit &&
116                         keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
117                         --cursor[0];
118                     } else {
119                         return U_MISMATCH;
120                     }
121                 } else {
122                     int m =
123                         subm.matches(text, cursor, limit, incremental);
124                     if (m != U_MATCH) {
125                         return m;
126                     }
127                 }
128             }
129             // Record the match position, but adjust for a normal
130             // forward start, limit, and only if a prior match does not
131             // exist -- we want the rightmost match.
132             if (matchStart < 0) {
133                 matchStart = cursor[0]+1;
134                 matchLimit = offset[0]+1;
135             }
136         } else {
137             for (i=0; i<pattern.length(); ++i) {
138                 if (incremental && cursor[0] == limit) {
139                     // We've reached the context limit without a mismatch and
140                     // without completing our match.
141                     return U_PARTIAL_MATCH;
142                 }
143                 char keyChar = pattern.charAt(i); // OK; see note (1) above
144                 UnicodeMatcher subm = data.lookupMatcher(keyChar);
145                 if (subm == null) {
146                     // Don't need the cursor < limit check if
147                     // incremental is true (because it's done above); do need
148                     // it otherwise.
149                     if (cursor[0] < limit &&
150                         keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
151                         ++cursor[0];
152                     } else {
153                         return U_MISMATCH;
154                     }
155                 } else {
156                     int m =
157                         subm.matches(text, cursor, limit, incremental);
158                     if (m != U_MATCH) {
159                         return m;
160                     }
161                 }
162             }
163             // Record the match position
164             matchStart = offset[0];
165             matchLimit = cursor[0];
166         }
167 
168         offset[0] = cursor[0];
169         return U_MATCH;
170     }
171 
172     /**
173      * Implement UnicodeMatcher
174      */
175     public String   toPattern(boolean escapeUnprintable) {
176         StringBuffer   result = new StringBuffer  ();
177         StringBuffer   quoteBuf = new StringBuffer  ();
178         if (segmentNumber > 0) { // i.e., if this is a segment
179             result.append('(');
180         }
181         for (int i=0; i<pattern.length(); ++i) {
182             char keyChar = pattern.charAt(i); // OK; see note (1) above
183             UnicodeMatcher m = data.lookupMatcher(keyChar);
184             if (m == null) {
185                 Utility.appendToRule(result, keyChar, false, escapeUnprintable, quoteBuf);
186             } else {
187                 Utility.appendToRule(result, m.toPattern(escapeUnprintable),
188                                      true, escapeUnprintable, quoteBuf);
189             }
190         }
191         if (segmentNumber > 0) { // i.e., if this is a segment
192             result.append(')');
193         }
194         // Flush quoteBuf out to result
195         Utility.appendToRule(result, -1,
196                              true, escapeUnprintable, quoteBuf);
197         return result.toString();
198     }
199 
200     /**
201      * Implement UnicodeMatcher
202      */
203     public boolean matchesIndexValue(int v) {
204         if (pattern.length() == 0) {
205             return true;
206         }
207         int c = UTF16.charAt(pattern, 0);
208         UnicodeMatcher m = data.lookupMatcher(c);
209         return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
210     }
211 
212     /**
213      * Implementation of UnicodeMatcher API.  Union the set of all
214      * characters that may be matched by this object into the given
215      * set.
216      * @param toUnionTo the set into which to union the source characters
217      */
218     public void addMatchSetTo(UnicodeSet toUnionTo) {
219         int ch;
220         for (int i=0; i<pattern.length(); i+=UTF16.getCharCount(ch)) {
221             ch = UTF16.charAt(pattern, i);
222             UnicodeMatcher matcher = data.lookupMatcher(ch);
223             if (matcher == null) {
224                 toUnionTo.add(ch);
225             } else {
226                 matcher.addMatchSetTo(toUnionTo);
227             }
228         }
229     }
230 
231     /**
232      * UnicodeReplacer API
233      */
234     public int replace(Replaceable text,
235                        int start,
236                        int limit,
237                        int[] cursor) {
238 
239         int outLen = 0;
240 
241         // Copy segment with out-of-band data
242         int dest = limit;
243         // If there was no match, that means that a quantifier
244         // matched zero-length.  E.g., x (a)* y matched "xy".
245         if (matchStart >= 0) {
246             if (matchStart != matchLimit) {
247                 text.copy(matchStart, matchLimit, dest);
248                 outLen = matchLimit - matchStart;
249             }
250         }
251 
252         text.replace(start, limit, ""); // delete original text
253 
254         return outLen;
255     }
256 
257     /**
258      * UnicodeReplacer API
259      */
260     public String   toReplacerPattern(boolean escapeUnprintable) {
261         // assert(segmentNumber > 0);
262         StringBuffer   rule = new StringBuffer  ("$");
263         Utility.appendNumber(rule, segmentNumber, 10, 1);
264         return rule.toString();
265     }
266 
267     /**
268      * Remove any match data.  This must be called before performing a
269      * set of matches with this segment.
270      */
271     public void resetMatch() {
272         matchStart = matchLimit = -1;
273     }
274 
275     /**
276      * Union the set of all characters that may output by this object
277      * into the given set.
278      * @param toUnionTo the set into which to union the output characters
279      */
280     public void addReplacementSetTo(UnicodeSet toUnionTo) {
281         // The output of this replacer varies; it is the source text between
282         // matchStart and matchLimit.  Since this varies depending on the
283         // input text, we can't compute it here.  We can either do nothing
284         // or we can add ALL characters to the set.  It's probably more useful
285         // to do nothing.
286     }
287 }
288 
289 //eof
290
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags