KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > PatternTokenizer


1 //##header 1189099963000 FOUNDATION
2
//#ifndef FOUNDATION
3
//##/*
4
//## *******************************************************************************
5
//## * Copyright (C) 2006, Google, International Business Machines Corporation and *
6
//## * others. All Rights Reserved. *
7
//## *******************************************************************************
8
//## */
9
//##package com.ibm.icu.impl;
10
//##
11
//##import com.ibm.icu.text.UTF16;
12
//##import com.ibm.icu.text.UnicodeSet;
13
//##import com.ibm.icu.text.DateTimePatternGenerator.FormatParser;
14
//##import com.ibm.icu.text.DateTimePatternGenerator.VariableField;
15
//##
16
//##import java.util.BitSet;
17
//##import java.util.Iterator;
18
//##import java.util.List;
19
//##
20
//##/**
21
//## * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and simple syntax.
22
//## * The '' (two quotes) is treated as a single quote, inside or outside a quote
23
//## * <ul>
24
//## * <li>Any ignorable characters are ignored in parsing.</li>
25
//## * <li>Any syntax characters are broken into separate tokens</li>
26
//## * <li>Quote characters can be specified: '...', "...", and \x </li>
27
//## * <li>Other characters are treated as literals</li>
28
//## * </ul>
29
//## */
30
//##public class PatternTokenizer {
31
//## // settings used in the interpretation of the pattern
32
//## private UnicodeSet ignorableCharacters = new UnicodeSet();
33
//## private UnicodeSet syntaxCharacters = new UnicodeSet();
34
//## private UnicodeSet escapeCharacters = new UnicodeSet();
35
//## private boolean usingSlash = false;
36
//## private boolean usingQuote = false;
37
//##
38
//## // transient data, set when needed. Null it out for any changes in the above fields.
39
//## private transient UnicodeSet needingQuoteCharacters = null;
40
//##
41
//## // data about the current pattern being parsed. start gets moved as we go along.
42
//## private int start;
43
//## private int limit;
44
//## private CharSequence pattern;
45
//##
46
//## /**
//## * Returns the characters that are ignored in parsing.
//## * @return a clone of the current set, so modifying it does not affect this tokenizer
//## */
//## public UnicodeSet getIgnorableCharacters() {
47
//## return (UnicodeSet) ignorableCharacters.clone();
48
//## }
49
//## /**
50
//## * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");
51
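//## * @param ignorableCharacters the characters to ignore; the set is cloned, so later changes to the argument do not affect this tokenizer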
52
//## * @return this tokenizer, to allow call chaining
53
//## */
54
//## public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {
55
//## this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();
56
//## needingQuoteCharacters = null;
57
//## return this;
58
//## }
59
//## /**
//## * Returns the characters that are interpreted as syntax characters in parsing.
//## * @return a clone of the current set, so modifying it does not affect this tokenizer
//## */
//## public UnicodeSet getSyntaxCharacters() {
60
//## return (UnicodeSet) syntaxCharacters.clone();
61
//## }
62
//## /**
63
//## * Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")
64
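//## * @param syntaxCharacters the characters to treat as syntax; the set is cloned, so later changes to the argument do not affect this tokenizer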
65
//## * @return this tokenizer, to allow call chaining
66
//## */
67
//## public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {
68
//## this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
69
//## needingQuoteCharacters = null;
70
//## return this;
71
//## }
72
//## /**
//## * Returns the characters that quoteLiteral writes as hex escapes.
//## * @return a clone of the current set, so modifying it does not affect this tokenizer
//## */
//## public UnicodeSet getEscapeCharacters() {
73
//## return (UnicodeSet) escapeCharacters.clone();
74
//## }
75
//## /**
76
//## * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");
77
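//## * @param escapeCharacters the characters to write as hex escapes; the set is cloned, so later changes to the argument do not affect this tokenizer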
78
//## * @return this tokenizer, to allow call chaining
79
//## */
80
//## public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {
81
//## this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();
82
//## return this;
83
//## }
84
//## /**
//## * Returns whether the single quote is treated as a quotation mark ('...') by next, and used
//## * for quoting by quoteLiteral.
//## * @return the current usingQuote setting
//## */
//## public boolean isUsingQuote() {
85
//## return usingQuote;
86
//## }
87
//## /**
//## * Sets whether the single quote is treated as a quotation mark ('...') by next, and used for
//## * quoting by quoteLiteral. Also discards the cached set of characters needing quotes, which
//## * quoteLiteral rebuilds when it finds it null.
//## * @param usingQuote the new setting
//## * @return this tokenizer, to allow call chaining
//## */
//## public PatternTokenizer setUsingQuote(boolean usingQuote) {
88
//## this.usingQuote = usingQuote;
89
//## needingQuoteCharacters = null;
90
//## return this;
91
//## }
92
//## /**
//## * Returns whether a backslash escapes the character that follows it in next, and is used for
//## * escaping by quoteLiteral.
//## * @return the current usingSlash setting
//## */
//## public boolean isUsingSlash() {
93
//## return usingSlash;
94
//## }
95
//## /**
//## * Sets whether a backslash escapes the character that follows it in next, and is used for
//## * escaping by quoteLiteral. Also discards the cached set of characters needing quotes, which
//## * quoteLiteral rebuilds when it finds it null.
//## * @param usingSlash the new setting
//## * @return this tokenizer, to allow call chaining
//## */
//## public PatternTokenizer setUsingSlash(boolean usingSlash) {
96
//## this.usingSlash = usingSlash;
97
//## needingQuoteCharacters = null;
98
//## return this;
99
//## }
100
//## // public UnicodeSet getQuoteCharacters() {
101
//##// return (UnicodeSet) quoteCharacters.clone();
102
//##// }
103
//##// public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {
104
//##// this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();
105
//##// needingQuoteCharacters = null;
106
//##// return this;
107
//##// }
108
//## /**
//## * Returns the index at which parsing stops; next reads characters only below this index.
//## * @return the current limit
//## */
//## public int getLimit() {
109
//## return limit;
110
//## }
111
//## /**
//## * Sets the index at which parsing stops (exclusive). The value is stored as given; it is not
//## * checked against the length of the pattern.
//## * @param limit the new limit
//## * @return this tokenizer, to allow call chaining
//## */
//## public PatternTokenizer setLimit(int limit) {
112
//## this.limit = limit;
113
//## return this;
114
//## }
115
//## /**
//## * Returns the index at which the next call to next will begin reading. next moves this
//## * position as it consumes the pattern.
//## * @return the current start position
//## */
//## public int getStart() {
116
//## return start;
117
//## }
118
//## /**
//## * Sets the index at which the next call to next will begin reading. The value is stored as
//## * given; it is not checked against the pattern or the limit.
//## * @param start the new start position
//## * @return this tokenizer, to allow call chaining
//## */
//## public PatternTokenizer setStart(int start) {
119
//## this.start = start;
120
//## return this;
121
//## }
122
//## /**
//## * Sets the pattern to be parsed, and resets start to 0 and limit to the length of the pattern.
//## * @param pattern the text to tokenize; must not be null
//## * @return this tokenizer, to allow call chaining
//## * @throws IllegalArgumentException if pattern is null
//## */
//## public PatternTokenizer setPattern(CharSequence pattern) {
123
//## if (pattern == null) {
124
//## throw new IllegalArgumentException("Inconsistent arguments");
125
//## }
126
//## this.start = 0;
127
//## this.limit = pattern.length();
128
//## this.pattern = pattern;
129
//## return this;
130
//## }
131
//##
132
//## public static final char SINGLE_QUOTE = '\'';
133
//## public static final char BACK_SLASH = '\\';
134
//## private static int NO_QUOTE = -1, IN_QUOTE = -2;
135
//## /**
136
//## * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.
137
//## * @param string the literal text to quote
138
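//## * @return the text with each character that needs it written inside '...', after a backslash, or as a \\u / \\U hex escape, depending on the settings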
139
//## */
140
//## public String quoteLiteral(CharSequence string) {
141
//## if (needingQuoteCharacters == null) {
142
//## needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters); // .addAll(quoteCharacters)
143
//## if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);
144
//## if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);
145
//## }
146
//## StringBuffer result = new StringBuffer();
147
//## int quotedChar = NO_QUOTE;
148
//## int cp;
149
//## for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
150
//## cp = UTF16.charAt(string, i);
151
//## if (escapeCharacters.contains(cp)) {
152
//## // we may have to fix up previous characters
153
//## if (quotedChar == IN_QUOTE) {
154
//## result.append(SINGLE_QUOTE);
155
//## quotedChar = NO_QUOTE;
156
//## }
157
//## appendEscaped(result, cp);
158
//## continue;
159
//## }
160
//##
161
//## if (needingQuoteCharacters.contains(cp)) {
162
//## // if we have already started a quote
163
//## if (quotedChar == IN_QUOTE) {
164
//## UTF16.append(result, cp);
165
//## if (usingQuote && cp == SINGLE_QUOTE) { // double it
166
//## result.append(SINGLE_QUOTE);
167
//## }
168
//## continue;
169
//## }
170
//## // otherwise not already in quote
171
//## if (usingSlash) {
172
//## result.append(BACK_SLASH);
173
//## UTF16.append(result, cp);
174
//## continue;
175
//## }
176
//## if (usingQuote) {
177
//## if (cp == SINGLE_QUOTE) { // double it and continue
178
//## result.append(SINGLE_QUOTE);
179
//## result.append(SINGLE_QUOTE);
180
//## continue;
181
//## }
182
//## result.append(SINGLE_QUOTE);
183
//## UTF16.append(result, cp);
184
//## quotedChar = IN_QUOTE;
185
//## continue;
186
//## }
187
//## // we have no choice but to use \\u or \\U
188
//## appendEscaped(result, cp);
189
//## continue;
190
//## }
191
//## // otherwise cp doesn't need quoting
192
//## // we may have to fix up previous characters
193
//## if (quotedChar == IN_QUOTE) {
194
//## result.append(SINGLE_QUOTE);
195
//## quotedChar = NO_QUOTE;
196
//## }
197
//## UTF16.append(result, cp);
198
//## }
199
//## // all done.
200
//## // we may have to fix up previous characters
201
//## if (quotedChar == IN_QUOTE) {
202
//## result.append(SINGLE_QUOTE);
203
//## }
204
//## return result.toString();
205
//## }
206
//##
207
//## /**
//## * Appends a code point in escaped form: \\u followed by Utility.hex(cp,4) when cp is at most
//## * 0xFFFF, otherwise \\U followed by Utility.hex(cp,8).
//## * NOTE(review): Utility.hex presumably returns zero-padded hex of the given width - confirm.
//## * @param result the buffer to append to
//## * @param cp the code point to escape
//## */
//## private void appendEscaped(StringBuffer result, int cp) {
208
//## if (cp <= 0xFFFF) {
209
//## result.append("\\u").append(Utility.hex(cp,4));
210
//## } else {
211
//## result.append("\\U").append(Utility.hex(cp,8));
212
//## }
213
//## }
214
//##
215
//## /**
//## * Re-tokenizes the pattern from the current start position and rebuilds it: each SYNTAX token
//## * is copied as is, and every other token (including the broken-quote and broken-escape cases)
//## * is passed through quoteLiteral. The start position is restored before returning.
//## * @return the rebuilt pattern text
//## */
//## public String normalize() {
216
//## int oldStart = start;
217
//## StringBuffer result = new StringBuffer();
218
//## StringBuffer buffer = new StringBuffer();
219
//## while (true) {
220
//## buffer.setLength(0);
221
//## int status = next(buffer);
222
//## if (status == DONE) {
223
//## start = oldStart;
224
//## return result.toString();
225
//## }
226
//## if (status != SYNTAX) {
227
//## result.append(quoteLiteral(buffer));
228
//## } else {
229
//## result.append(buffer);
230
//## }
231
//## }
232
//## }
233
//##
234
//## public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;
235
//##
236
//## private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;
237
//##
238
//## /**
//## * Reads the next token of the pattern, beginning at the current start position, and appends
//## * its text to the buffer. Quotations and escapes are resolved, so the buffer receives the
//## * unquoted text: '...' quotations when usingQuote is set, \\uxxxx and \\Uxxxxxxxx hex escapes,
//## * and a backslash before any other character, which yields that character when usingSlash is
//## * set and is otherwise kept as a literal backslash. Ignorable characters outside quotations
//## * are skipped. The buffer is not cleared by this method.
//## * <p>
//## * On return, start has been moved: just past a syntax character that was returned, onto a
//## * syntax character that ended a literal, onto the offending character of a malformed hex
//## * escape, or to limit when the rest of the pattern was consumed.
//## * @param buffer receives the characters of the token
//## * @return DONE if start is already at or past limit; SYNTAX for a single syntax character;
//## * LITERAL for a run of literal characters; BROKEN_QUOTE if the pattern ends inside a quotation;
//## * BROKEN_ESCAPE if a hex escape contains a non-hex character or is cut off by the end of the
//## * pattern, or if usingSlash is set and the pattern ends right after a backslash; UNKNOWN if
//## * only ignorable characters remained
//## */
//## public int next(StringBuffer buffer) {
239
//## if (start >= limit) return DONE;
240
//## int status = UNKNOWN;
241
//## int lastQuote = UNKNOWN;
242
//## int quoteStatus = NONE;
243
//## int hexCount = 0;
244
//## int hexValue = 0;
245
//## int cp;
246
//## main:
247
//## for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
248
//## cp = UTF16.charAt(pattern, i);
249
//## // if we are in a quote, then handle it.
250
//## switch (quoteStatus) {
251
//## case SLASH_START:
252
//## switch (cp) {
253
//## case 'u':
254
//## quoteStatus = HEX;
255
//## hexCount = 4;
256
//## hexValue = 0;
257
//## continue main;
258
//## case 'U':
259
//## quoteStatus = HEX;
260
//## hexCount = 8;
261
//## hexValue = 0;
262
//## continue main;
263
//## default:
264
//## if (usingSlash) {
265
//## UTF16.append(buffer, cp);
266
//## quoteStatus = NONE;
267
//## continue main;
268
//## } else {
269
//## buffer.append(BACK_SLASH);
270
//## quoteStatus = NONE;
271
//## }
272
//## }
273
//## break; // fall through to NONE
274
//## case HEX:
275
//## // accumulate one hex digit: the raw code point is added first, then the matching case below subtracts the offset that turns it into the digit's value
//## hexValue <<= 4;
276
//## hexValue += cp;
277
//## switch (cp) {
278
//## case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
279
//## hexValue -= '0'; break;
280
//## case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
281
//## hexValue -= 'a' - 10; break;
282
//## case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
283
//## hexValue -= 'A' - 10; break;
284
//## default:
285
//## start = i;
286
//## return BROKEN_ESCAPE;
287
//## }
288
//## --hexCount;
289
//## if (hexCount == 0) {
290
//## quoteStatus = NONE;
291
//## UTF16.append(buffer, hexValue);
292
//## }
293
//## continue main;
294
//## case AFTER_QUOTE:
295
//## // see if we get another quote character
296
//## // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote
297
//## if (cp == lastQuote) {
298
//## UTF16.append(buffer, cp);
299
//## quoteStatus = NORMAL_QUOTE;
300
//## continue main;
301
//## }
302
//## quoteStatus = NONE;
303
//## break; // fall through to NONE
304
//## case START_QUOTE:
305
//## // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote
306
//## if (cp == lastQuote) {
307
//## UTF16.append(buffer, cp);
308
//## quoteStatus = NONE; // get out of quote, with no trace remaining
309
//## continue;
310
//## }
311
//## // otherwise get into quote
312
//## UTF16.append(buffer, cp);
313
//## quoteStatus = NORMAL_QUOTE;
314
//## continue main;
315
//## case NORMAL_QUOTE:
316
//## if (cp == lastQuote) {
317
//## quoteStatus = AFTER_QUOTE; // get out of quote
318
//## continue main;
319
//## }
320
//## UTF16.append(buffer, cp);
321
//## continue main;
322
//## }
323
//##
324
//## if (ignorableCharacters.contains(cp)) {
325
//## continue;
326
//## }
327
//## // do syntax characters
328
//## if (syntaxCharacters.contains(cp)) {
329
//## if (status == UNKNOWN) {
330
//## UTF16.append(buffer, cp);
331
//## start = i + UTF16.getCharCount(cp);
332
//## return SYNTAX;
333
//## } else { // LITERAL, so back up and break
334
//## start = i;
335
//## return status;
336
//## }
337
//## }
338
//## // otherwise it is a literal; keep on going
339
//## status = LITERAL;
340
//## if (cp == BACK_SLASH) {
341
//## quoteStatus = SLASH_START;
342
//## continue;
343
//## } else if (usingQuote && cp == SINGLE_QUOTE) {
344
//## lastQuote = cp;
345
//## quoteStatus = START_QUOTE;
346
//## continue;
347
//## }
348
//## // normal literals
349
//## UTF16.append(buffer, cp);
350
//## }
351
//## // handle final cleanup
352
//## start = limit;
353
//## switch (quoteStatus) {
354
//## case HEX:
355
//## status = BROKEN_ESCAPE;
356
//## break;
357
//## case SLASH_START:
358
//## if (usingSlash) {
359
//## status = BROKEN_ESCAPE;
360
//## } else {
361
//## buffer.append(BACK_SLASH);
362
//## }
363
//## break;
364
//## case START_QUOTE: case NORMAL_QUOTE:
365
//## status = BROKEN_QUOTE;
366
//## break;
367
//## }
368
//## return status;
369
//## }
370
//##
371
//##
372
//##}
373
//#endif
374
//eof
375
Popular Tags