KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > StringMatcher


/*
 *******************************************************************************
 * Copyright (C) 2001-2004, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */

7 package com.ibm.icu.text;
8 import com.ibm.icu.impl.Utility;
9
/**
 * An object that matches a fixed input string, implementing the
 * UnicodeMatcher API.  This object also implements the
 * UnicodeReplacer API, allowing it to emit the matched text as
 * output.  Since the match text may contain flexible match elements,
 * such as UnicodeSets, the emitted text is not the match pattern, but
 * instead a substring of the actual matched text.  Following
 * convention, the output text is the rightmost match seen up to this
 * point.
 *
 * A StringMatcher may represent a segment, in which case it has a
 * positive segment number.  This affects how the matcher converts
 * itself to a pattern but does not otherwise affect its function.
 *
 * A StringMatcher that is not a segment should not be used as a
 * UnicodeReplacer.
 */

27 class StringMatcher implements UnicodeMatcher, UnicodeReplacer {
28
29     /**
30      * The text to be matched.
31      */

32     private String JavaDoc pattern;
33
34     /**
35      * Start offset, in the match text, of the <em>rightmost</em>
36      * match.
37      */

38     private int matchStart;
39     
40     /**
41      * Limit offset, in the match text, of the <em>rightmost</em>
42      * match.
43      */

44     private int matchLimit;
45
46     /**
47      * The segment number, 1-based, or 0 if not a segment.
48      */

49     private int segmentNumber;
50
51     /**
52      * Context object that maps stand-ins to matcher and replacer
53      * objects.
54      */

55     private final RuleBasedTransliterator.Data data;
56
57     /**
58      * Construct a matcher that matches the given pattern string.
59      * @param theString the pattern to be matched, possibly containing
60      * stand-ins that represent nested UnicodeMatcher objects.
61      * @param segmentNum the segment number from 1..n, or 0 if this is
62      * not a segment.
63      * @param theData context object mapping stand-ins to
64      * UnicodeMatcher objects.
65      */

66     public StringMatcher(String JavaDoc theString,
67                          int segmentNum,
68                          RuleBasedTransliterator.Data theData) {
69         data = theData;
70         pattern = theString;
71         matchStart = matchLimit = -1;
72         segmentNumber = segmentNum;
73     }
74
75     /**
76      * Construct a matcher that matches a substring of the given
77      * pattern string.
78      * @param theString the pattern to be matched, possibly containing
79      * stand-ins that represent nested UnicodeMatcher objects.
80      * @param start first character of theString to be matched
81      * @param limit index after the last character of theString to be
82      * matched.
83      * @param segmentNum the segment number from 1..n, or 0 if this is
84      * not a segment.
85      * @param theData context object mapping stand-ins to
86      * UnicodeMatcher objects.
87      */

88     public StringMatcher(String JavaDoc theString,
89                          int start,
90                          int limit,
91                          int segmentNum,
92                          RuleBasedTransliterator.Data theData) {
93         this(theString.substring(start, limit), segmentNum, theData);
94     }
95
96     /**
97      * Implement UnicodeMatcher
98      */

99     public int matches(Replaceable text,
100                        int[] offset,
101                        int limit,
102                        boolean incremental) {
103         // Note (1): We process text in 16-bit code units, rather than
104
// 32-bit code points. This works because stand-ins are
105
// always in the BMP and because we are doing a literal match
106
// operation, which can be done 16-bits at a time.
107
int i;
108         int[] cursor = new int[] { offset[0] };
109         if (limit < cursor[0]) {
110             // Match in the reverse direction
111
for (i=pattern.length()-1; i>=0; --i) {
112                 char keyChar = pattern.charAt(i); // OK; see note (1) above
113
UnicodeMatcher subm = data.lookupMatcher(keyChar);
114                 if (subm == null) {
115                     if (cursor[0] > limit &&
116                         keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
117
--cursor[0];
118                     } else {
119                         return U_MISMATCH;
120                     }
121                 } else {
122                     int m =
123                         subm.matches(text, cursor, limit, incremental);
124                     if (m != U_MATCH) {
125                         return m;
126                     }
127                 }
128             }
129             // Record the match position, but adjust for a normal
130
// forward start, limit, and only if a prior match does not
131
// exist -- we want the rightmost match.
132
if (matchStart < 0) {
133                 matchStart = cursor[0]+1;
134                 matchLimit = offset[0]+1;
135             }
136         } else {
137             for (i=0; i<pattern.length(); ++i) {
138                 if (incremental && cursor[0] == limit) {
139                     // We've reached the context limit without a mismatch and
140
// without completing our match.
141
return U_PARTIAL_MATCH;
142                 }
143                 char keyChar = pattern.charAt(i); // OK; see note (1) above
144
UnicodeMatcher subm = data.lookupMatcher(keyChar);
145                 if (subm == null) {
146                     // Don't need the cursor < limit check if
147
// incremental is true (because it's done above); do need
148
// it otherwise.
149
if (cursor[0] < limit &&
150                         keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
151
++cursor[0];
152                     } else {
153                         return U_MISMATCH;
154                     }
155                 } else {
156                     int m =
157                         subm.matches(text, cursor, limit, incremental);
158                     if (m != U_MATCH) {
159                         return m;
160                     }
161                 }
162             }
163             // Record the match position
164
matchStart = offset[0];
165             matchLimit = cursor[0];
166         }
167
168         offset[0] = cursor[0];
169         return U_MATCH;
170     }
171
172     /**
173      * Implement UnicodeMatcher
174      */

175     public String JavaDoc toPattern(boolean escapeUnprintable) {
176         StringBuffer JavaDoc result = new StringBuffer JavaDoc();
177         StringBuffer JavaDoc quoteBuf = new StringBuffer JavaDoc();
178         if (segmentNumber > 0) { // i.e., if this is a segment
179
result.append('(');
180         }
181         for (int i=0; i<pattern.length(); ++i) {
182             char keyChar = pattern.charAt(i); // OK; see note (1) above
183
UnicodeMatcher m = data.lookupMatcher(keyChar);
184             if (m == null) {
185                 Utility.appendToRule(result, keyChar, false, escapeUnprintable, quoteBuf);
186             } else {
187                 Utility.appendToRule(result, m.toPattern(escapeUnprintable),
188                                      true, escapeUnprintable, quoteBuf);
189             }
190         }
191         if (segmentNumber > 0) { // i.e., if this is a segment
192
result.append(')');
193         }
194         // Flush quoteBuf out to result
195
Utility.appendToRule(result, -1,
196                              true, escapeUnprintable, quoteBuf);
197         return result.toString();
198     }
199
200     /**
201      * Implement UnicodeMatcher
202      */

203     public boolean matchesIndexValue(int v) {
204         if (pattern.length() == 0) {
205             return true;
206         }
207         int c = UTF16.charAt(pattern, 0);
208         UnicodeMatcher m = data.lookupMatcher(c);
209         return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
210     }
211
212     /**
213      * Implementation of UnicodeMatcher API. Union the set of all
214      * characters that may be matched by this object into the given
215      * set.
216      * @param toUnionTo the set into which to union the source characters
217      */

218     public void addMatchSetTo(UnicodeSet toUnionTo) {
219         int ch;
220         for (int i=0; i<pattern.length(); i+=UTF16.getCharCount(ch)) {
221             ch = UTF16.charAt(pattern, i);
222             UnicodeMatcher matcher = data.lookupMatcher(ch);
223             if (matcher == null) {
224                 toUnionTo.add(ch);
225             } else {
226                 matcher.addMatchSetTo(toUnionTo);
227             }
228         }
229     }
230
231     /**
232      * UnicodeReplacer API
233      */

234     public int replace(Replaceable text,
235                        int start,
236                        int limit,
237                        int[] cursor) {
238
239         int outLen = 0;
240
241         // Copy segment with out-of-band data
242
int dest = limit;
243         // If there was no match, that means that a quantifier
244
// matched zero-length. E.g., x (a)* y matched "xy".
245
if (matchStart >= 0) {
246             if (matchStart != matchLimit) {
247                 text.copy(matchStart, matchLimit, dest);
248                 outLen = matchLimit - matchStart;
249             }
250         }
251
252         text.replace(start, limit, ""); // delete original text
253

254         return outLen;
255     }
256
257     /**
258      * UnicodeReplacer API
259      */

260     public String JavaDoc toReplacerPattern(boolean escapeUnprintable) {
261         // assert(segmentNumber > 0);
262
StringBuffer JavaDoc rule = new StringBuffer JavaDoc("$");
263         Utility.appendNumber(rule, segmentNumber, 10, 1);
264         return rule.toString();
265     }
266
267     /**
268      * Remove any match data. This must be called before performing a
269      * set of matches with this segment.
270      */

271     public void resetMatch() {
272         matchStart = matchLimit = -1;
273     }
274
275     /**
276      * Union the set of all characters that may output by this object
277      * into the given set.
278      * @param toUnionTo the set into which to union the output characters
279      */

280     public void addReplacementSetTo(UnicodeSet toUnionTo) {
281         // The output of this replacer varies; it is the source text between
282
// matchStart and matchLimit. Since this varies depending on the
283
// input text, we can't compute it here. We can either do nothing
284
// or we can add ALL characters to the set. It's probably more useful
285
// to do nothing.
286
}
287 }
288
289 //eof
290
Popular Tags