PatternTokenizer


1   //##header 1189099963000 FOUNDATION
2   //#ifndef FOUNDATION
3   //##/*
4   //## *******************************************************************************
5   //## * Copyright (C) 2006, Google, International Business Machines Corporation and    *
6   //## * others. All Rights Reserved.                                                *
7   //## *******************************************************************************
8   //## */
9   //##package com.ibm.icu.impl;
10  //##
11  //##import com.ibm.icu.text.UTF16;
12  //##import com.ibm.icu.text.UnicodeSet;
13  //##import com.ibm.icu.text.DateTimePatternGenerator.FormatParser;
14  //##import com.ibm.icu.text.DateTimePatternGenerator.VariableField;
15  //##
16  //##import java.util.BitSet;
17  //##import java.util.Iterator;
18  //##import java.util.List;
19  //##
20  //##/**
21  //## * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and simple syntax.
22  //## * The '' (two quotes) is treated as a single quote, inside or outside a quote
23  //## * <ul>
24  //## * <li>Any ignorable characters are ignored in parsing (outside of quotes and escapes).</li>
25  //## * <li>Any syntax characters are broken into separate tokens</li>
26  //## * <li>Quote characters can be specified: '...' (when usingQuote is set) and \x (when usingSlash is set)</li>
27  //## * <li>Other characters are treated as literals</li>
28  //## * </ul>
    //## * <p>The setters return <code>this</code> so that configuration calls can be chained. Tokens are read with
    //## * {@link #next(StringBuffer)}, which advances the start position kept in this object.</p>
29  //## */
30  //##public class PatternTokenizer {
31  //##    // settings used in the interpretation of the pattern
32  //##    private UnicodeSet ignorableCharacters = new UnicodeSet();
33  //##    private UnicodeSet syntaxCharacters = new UnicodeSet();
34  //##    private UnicodeSet escapeCharacters = new UnicodeSet();
35  //##    private boolean usingSlash = false;
36  //##    private boolean usingQuote = false;
37  //##    
38  //##    // transient data, set when needed. Null it out for any changes in the above fields.
    //##    // Built lazily by quoteLiteral from syntaxCharacters, ignorableCharacters, usingSlash and usingQuote.
39  //##    private transient UnicodeSet needingQuoteCharacters = null;
40  //##    
41  //##    // data about the current pattern being parsed. start gets moved as we go along.
42  //##    private int start;
43  //##    private int limit;
44  //##    private CharSequence pattern;
45  //##    
    //##    /**
    //##     * Returns the characters that are skipped while parsing.
    //##     * @return a clone of the current set; modifying it does not affect this tokenizer
    //##     */
46  //##    public UnicodeSet getIgnorableCharacters() {
47  //##        return (UnicodeSet) ignorableCharacters.clone();
48  //##    }
49  //##    /**
50  //##     * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");
51  //##     * @param ignorableCharacters the set to use; it is cloned, so later changes made by the caller have no effect
52  //##     * @return this tokenizer, to allow chaining
53  //##     */
54  //##    public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {
55  //##        this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();
56  //##        needingQuoteCharacters = null; // cached set depends on this field; rebuilt on next quoteLiteral
57  //##        return this;
58  //##    }
    //##    /**
    //##     * Returns the characters that are returned as separate SYNTAX tokens.
    //##     * @return a clone of the current set; modifying it does not affect this tokenizer
    //##     */
59  //##    public UnicodeSet getSyntaxCharacters() {
60  //##        return (UnicodeSet) syntaxCharacters.clone();
61  //##    }
62  //##    /**
63  //##     *  Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")
64  //##     * @param syntaxCharacters the set to use; it is cloned, so later changes made by the caller have no effect
65  //##     * @return this tokenizer, to allow chaining
66  //##     */
67  //##    public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {
68  //##        this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
69  //##        needingQuoteCharacters = null; // cached set depends on this field; rebuilt on next quoteLiteral
70  //##        return this;
71  //##    }   
    //##    /**
    //##     * Returns the characters that quoteLiteral writes as hex escapes.
    //##     * @return a clone of the current set; modifying it does not affect this tokenizer
    //##     */
72  //##    public UnicodeSet getEscapeCharacters() {
73  //##        return (UnicodeSet) escapeCharacters.clone();
74  //##    }
75  //##    /**
76  //##     * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");
77  //##     * @param escapeCharacters the set to use; it is cloned, so later changes made by the caller have no effect
78  //##     * @return this tokenizer, to allow chaining
79  //##     */
80  //##    public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {
    //##        // escapeCharacters is not part of needingQuoteCharacters, so the cache stays valid here
81  //##        this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();
82  //##        return this;
83  //##    }
    //##    /**
    //##     * @return true if the single quote character is treated as a quoting character
    //##     */
84  //##    public boolean isUsingQuote() {
85  //##        return usingQuote;
86  //##    }
    //##    /**
    //##     * Turns '...' quoting on or off, both for parsing in next and for output in quoteLiteral.
    //##     * @param usingQuote true to treat the single quote as a quoting character
    //##     * @return this tokenizer, to allow chaining
    //##     */
87  //##    public PatternTokenizer setUsingQuote(boolean usingQuote) {
88  //##        this.usingQuote = usingQuote;
89  //##        needingQuoteCharacters = null; // cached set depends on this flag
90  //##        return this;
91  //##    }
    //##    /**
    //##     * @return true if a backslash quotes the single character that follows it
    //##     */
92  //##    public boolean isUsingSlash() {
93  //##        return usingSlash;
94  //##    }
    //##    /**
    //##     * Turns \x quoting on or off. The \\uxxxx and \\Uxxxxxxxx escapes are recognized by next regardless of this flag.
    //##     * @param usingSlash true to let a backslash quote the character that follows it
    //##     * @return this tokenizer, to allow chaining
    //##     */
95  //##    public PatternTokenizer setUsingSlash(boolean usingSlash) {
96  //##        this.usingSlash = usingSlash;
97  //##        needingQuoteCharacters = null; // cached set depends on this flag
98  //##        return this;
99  //##    }
100 //##    //    public UnicodeSet getQuoteCharacters() {
101 //##//  return (UnicodeSet) quoteCharacters.clone();
102 //##//  }
103 //##//  public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {
104 //##//  this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();
105 //##//  needingQuoteCharacters = null;
106 //##//  return this;
107 //##//  }
    //##    /**
    //##     * @return the index in the pattern at which parsing stops (exclusive)
    //##     */
108 //##    public int getLimit() {
109 //##        return limit;
110 //##    }
    //##    /**
    //##     * Sets the index at which parsing stops (exclusive). The value is not validated against the pattern length.
    //##     * @param limit the new end index
    //##     * @return this tokenizer, to allow chaining
    //##     */
111 //##    public PatternTokenizer setLimit(int limit) {
112 //##        this.limit = limit;
113 //##        return this;
114 //##    }
    //##    /**
    //##     * @return the index in the pattern at which the next call to next will begin
    //##     */
115 //##    public int getStart() {
116 //##        return start;
117 //##    }
    //##    /**
    //##     * Sets the index at which the next call to next will begin. The value is not validated.
    //##     * @param start the new start index
    //##     * @return this tokenizer, to allow chaining
    //##     */
118 //##    public PatternTokenizer setStart(int start) {
119 //##        this.start = start;
120 //##        return this;
121 //##    }
    //##    /**
    //##     * Sets the pattern to be parsed, and resets start to 0 and limit to the length of the pattern.
    //##     * @param pattern the text to tokenize; must not be null
    //##     * @return this tokenizer, to allow chaining
    //##     * @throws IllegalArgumentException if pattern is null
    //##     */
122 //##    public PatternTokenizer setPattern(CharSequence pattern) {
123 //##        if (pattern == null) {
124 //##            throw new IllegalArgumentException("Inconsistent arguments");
125 //##        }
126 //##        this.start = 0;
127 //##        this.limit = pattern.length();
128 //##        this.pattern = pattern;
129 //##        return this;
130 //##    }
131 //##    
132 //##    public static final char SINGLE_QUOTE = '\'';
133 //##    public static final char BACK_SLASH = '\\';
    //##    // states of quoteLiteral: NO_QUOTE = not inside an emitted '...' run, IN_QUOTE = a quote has been opened and not yet closed
134 //##    private static final int NO_QUOTE = -1, IN_QUOTE = -2;
135 //##    /**
136 //##     * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.
    //##     * <p>Each code point is handled as follows:</p>
    //##     * <ul>
    //##     * <li>A character in the escape set is written as a \\u or \\U hex escape (closing any open quote first).</li>
    //##     * <li>A character needing quoting is, in order of preference: kept inside an already open '...' run;
    //##     * preceded by a backslash if usingSlash; put in a '...' run if usingQuote (a single quote is doubled);
    //##     * otherwise written as a hex escape.</li>
    //##     * <li>Any other character is copied unchanged (closing any open quote first).</li>
    //##     * </ul>
137 //##     * @param string the literal text to quote
138 //##     * @return the quoted form of the text
139 //##     */
140 //##    public String quoteLiteral(CharSequence string) {
141 //##        if (needingQuoteCharacters == null) {
    //##            // rebuild the cached set after a settings change
142 //##            needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters); // .addAll(quoteCharacters)
143 //##            if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);
144 //##            if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);
145 //##        }
146 //##        StringBuffer result = new StringBuffer();
147 //##        int quotedChar = NO_QUOTE;
148 //##        int cp;
    //##        // iterate by code point, so the step is 1 or 2 chars depending on cp
149 //##        for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
150 //##            cp = UTF16.charAt(string, i);
151 //##            if (escapeCharacters.contains(cp)) {
152 //##                // we may have to fix up previous characters
153 //##                if (quotedChar == IN_QUOTE) {
154 //##                    result.append(SINGLE_QUOTE);
155 //##                    quotedChar = NO_QUOTE;
156 //##                }
157 //##                appendEscaped(result, cp);
158 //##                continue;
159 //##            }
160 //##            
161 //##            if (needingQuoteCharacters.contains(cp)) {
162 //##                // if we have already started a quote
163 //##                if (quotedChar == IN_QUOTE) {
164 //##                    UTF16.append(result, cp);
165 //##                    if (usingQuote && cp == SINGLE_QUOTE) { // double it
166 //##                        result.append(SINGLE_QUOTE);
167 //##                    }
168 //##                    continue;
169 //##                }
170 //##                // otherwise not already in quote
171 //##                if (usingSlash) {
172 //##                    result.append(BACK_SLASH);
173 //##                    UTF16.append(result, cp);
174 //##                    continue;
175 //##                }
176 //##                if (usingQuote) {
177 //##                    if (cp == SINGLE_QUOTE) { // double it and continue
178 //##                        result.append(SINGLE_QUOTE);
179 //##                        result.append(SINGLE_QUOTE);
180 //##                        continue;
181 //##                    }
    //##                    // open a quote; it is closed when a character not needing quoting (or the end) is reached
182 //##                    result.append(SINGLE_QUOTE);
183 //##                    UTF16.append(result, cp);
184 //##                    quotedChar = IN_QUOTE;
185 //##                    continue;
186 //##                }
187 //##                // we have no choice but to use \\u or \\U
188 //##                appendEscaped(result, cp);
189 //##                continue;
190 //##            }
191 //##            // otherwise cp doesn't need quoting
192 //##            // we may have to fix up previous characters
193 //##            if (quotedChar == IN_QUOTE) {
194 //##                result.append(SINGLE_QUOTE);
195 //##                quotedChar = NO_QUOTE;
196 //##            }
197 //##            UTF16.append(result, cp);
198 //##        }
199 //##        // all done. 
200 //##        // we may have to fix up previous characters
201 //##        if (quotedChar == IN_QUOTE) {
202 //##            result.append(SINGLE_QUOTE);
203 //##        }
204 //##        return result.toString();
205 //##    }
206 //##    
    //##    /**
    //##     * Appends cp as a hex escape: \\u followed by Utility.hex(cp,4) for code points up to 0xFFFF,
    //##     * otherwise \\U followed by Utility.hex(cp,8).
    //##     * NOTE(review): Utility.hex is presumably a zero-padded hex formatter of the given width; confirm.
    //##     */
207 //##    private void appendEscaped(StringBuffer result, int cp) {
208 //##        if (cp <= 0xFFFF) {
209 //##            result.append("\\u").append(Utility.hex(cp,4));
210 //##        } else {
211 //##            result.append("\\U").append(Utility.hex(cp,8));
212 //##        }
213 //##    }
214 //##    
    //##    /**
    //##     * Tokenizes the pattern from the current start up to the limit and rebuilds it: SYNTAX tokens are copied
    //##     * unchanged, and every other token (including broken ones) is passed through quoteLiteral.
    //##     * The start position is restored before returning.
    //##     * @return the re-quoted pattern text
    //##     */
215 //##    public String normalize() {
216 //##        int oldStart = start;
217 //##        StringBuffer result = new StringBuffer();
218 //##        StringBuffer buffer = new StringBuffer();
219 //##        while (true) {
220 //##            buffer.setLength(0); // next appends, so clear the previous token
221 //##            int status = next(buffer);
222 //##            if (status == DONE) {
223 //##                start = oldStart;
224 //##                return result.toString();
225 //##            }
226 //##            if (status != SYNTAX) {
227 //##                result.append(quoteLiteral(buffer));
228 //##            } else {
229 //##                result.append(buffer);
230 //##            }
231 //##        }
232 //##    }
233 //##    
    //##    /**
    //##     * Status values returned by next:
    //##     * DONE = start had already reached limit;
    //##     * SYNTAX = a single syntax character was appended to the buffer;
    //##     * LITERAL = a run of literal text was appended;
    //##     * BROKEN_QUOTE = the input ended inside an unterminated quote;
    //##     * BROKEN_ESCAPE = a hex escape contained a non-hex character, or the input ended inside an escape;
    //##     * UNKNOWN = the initial status, returned when only ignorable characters remained.
    //##     */
234 //##    public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;
235 //##    
    //##    // quoting states inside next:
    //##    //   AFTER_QUOTE  - just saw the closing quote of a '...' run
    //##    //   NONE         - not in a quote or escape
    //##    //   START_QUOTE  - just saw an opening quote
    //##    //   NORMAL_QUOTE - inside a '...' run
    //##    //   SLASH_START  - just saw a backslash
    //##    //   HEX          - reading the hex digits of a backslash-u or backslash-U escape
236 //##    private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;
237 //##    
    //##    /**
    //##     * Reads the next token, starting at the current start position, and appends its (unquoted) text to buffer.
    //##     * The buffer is not cleared first. The start position is advanced past the consumed text: after a SYNTAX
    //##     * token it points just after that character; when a literal is ended by a syntax character it points at
    //##     * that character; on a bad hex digit it points at the offending character; otherwise it is set to limit.
    //##     * @param buffer receives the text of the token
    //##     * @return one of DONE, SYNTAX, LITERAL, BROKEN_QUOTE, BROKEN_ESCAPE, or UNKNOWN
    //##     */
238 //##    public int next(StringBuffer buffer) {
239 //##        if (start >= limit) return DONE;
240 //##        int status = UNKNOWN;
    //##        // placeholder value; lastQuote is only compared in the quote states, which are entered after it is set
241 //##        int lastQuote = UNKNOWN;
242 //##        int quoteStatus = NONE;
243 //##        int hexCount = 0;
244 //##        int hexValue = 0;
245 //##        int cp;
246 //##        main:
247 //##            for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
248 //##                cp = UTF16.charAt(pattern, i);
249 //##                // if we are in a quote, then handle it.
250 //##                switch (quoteStatus) {
251 //##                case SLASH_START:
252 //##                    switch (cp) {
253 //##                    case 'u':
    //##                        // 4 hex digits follow
254 //##                        quoteStatus = HEX;
255 //##                        hexCount = 4;
256 //##                        hexValue = 0;
257 //##                        continue main;
258 //##                    case 'U': 
    //##                        // 8 hex digits follow
259 //##                        quoteStatus = HEX;
260 //##                        hexCount = 8;
261 //##                        hexValue = 0;
262 //##                        continue main;
263 //##                    default:
264 //##                        if (usingSlash) {
    //##                            // backslash quotes this one character
265 //##                            UTF16.append(buffer, cp);
266 //##                            quoteStatus = NONE;
267 //##                            continue main;
268 //##                        } else {
    //##                            // backslash is not special: emit it literally, then process cp as ordinary text below
269 //##                            buffer.append(BACK_SLASH);
270 //##                            quoteStatus = NONE;
271 //##                        }
272 //##                    }
273 //##                    break; // leave the switch; cp is handled below as unquoted text
274 //##                case HEX:
    //##                    // accumulate one hex digit: shift, add the raw code point, then subtract the digit's base
275 //##                    hexValue <<= 4;
276 //##                    hexValue += cp;
277 //##                    switch (cp) {
278 //##                    case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
279 //##                        hexValue -= '0'; break;
280 //##                    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
281 //##                        hexValue -= 'a' - 10; break;
282 //##                    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
283 //##                        hexValue -= 'A' - 10; break;
284 //##                    default:
    //##                        // not a hex digit: stop here, leaving start at the offending character
285 //##                        start = i;
286 //##                        return BROKEN_ESCAPE;
287 //##                    }
288 //##                    --hexCount;
289 //##                    if (hexCount == 0) {
290 //##                        quoteStatus = NONE;
291 //##                        UTF16.append(buffer, hexValue);
292 //##                    }
293 //##                    continue main;
294 //##                case AFTER_QUOTE:
295 //##                    // see if we get another quote character
296 //##                    // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote
297 //##                    if (cp == lastQuote) {
298 //##                        UTF16.append(buffer, cp);
299 //##                        quoteStatus = NORMAL_QUOTE;
300 //##                        continue main;
301 //##                    }
302 //##                    quoteStatus = NONE;
303 //##                    break; // leave the switch; cp is handled below as unquoted text
304 //##                case START_QUOTE:
305 //##                    // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote
306 //##                    if (cp == lastQuote) {
307 //##                        UTF16.append(buffer, cp);
308 //##                        quoteStatus = NONE; // get out of quote, with no trace remaining
309 //##                        continue main;
310 //##                    }
311 //##                    // otherwise get into quote
312 //##                    UTF16.append(buffer, cp);
313 //##                    quoteStatus = NORMAL_QUOTE;
314 //##                    continue main;
315 //##                case NORMAL_QUOTE: 
316 //##                    if (cp == lastQuote) {
317 //##                        quoteStatus = AFTER_QUOTE; // get out of quote
318 //##                        continue main;
319 //##                    }
320 //##                    UTF16.append(buffer, cp);
321 //##                    continue main;
322 //##                }
323 //##                
    //##                // from here on cp is unquoted text
324 //##                if (ignorableCharacters.contains(cp)) {
325 //##                    continue;
326 //##                }
327 //##                // do syntax characters
328 //##                if (syntaxCharacters.contains(cp)) {
329 //##                    if (status == UNKNOWN) {
    //##                        // nothing collected yet: the syntax character is a token by itself
330 //##                        UTF16.append(buffer, cp);
331 //##                        start = i + UTF16.getCharCount(cp);
332 //##                        return SYNTAX;
333 //##                    } else { // LITERAL, so back up and break
334 //##                        start = i;
335 //##                        return status;
336 //##                    }
337 //##                }
338 //##                // otherwise it is a literal; keep on going
339 //##                status = LITERAL;
340 //##                if (cp == BACK_SLASH) {
    //##                    // entered even when usingSlash is false, so that the hex escapes are always recognized
341 //##                    quoteStatus = SLASH_START;
342 //##                    continue;
343 //##                } else if (usingQuote && cp == SINGLE_QUOTE) {
344 //##                    lastQuote = cp;
345 //##                    quoteStatus = START_QUOTE;
346 //##                    continue;
347 //##                }
348 //##                // normal literals
349 //##                UTF16.append(buffer, cp);
350 //##            }
351 //##        // handle final cleanup
    //##        // the input ran out; report an error if we stopped in the middle of an escape or quote
352 //##        start = limit;
353 //##        switch (quoteStatus) {
354 //##        case HEX:
355 //##            status = BROKEN_ESCAPE;
356 //##            break;
357 //##        case SLASH_START:
358 //##            if (usingSlash) {
359 //##                status = BROKEN_ESCAPE;
360 //##            } else {
    //##                // a trailing backslash is just a literal backslash when slash quoting is off
361 //##                buffer.append(BACK_SLASH);
362 //##            }
363 //##            break;
364 //##        case START_QUOTE: case NORMAL_QUOTE:
365 //##            status = BROKEN_QUOTE;
366 //##            break;
367 //##        }
368 //##        return status;
369 //##    }
370 //##    
371 //##    
372 //##}
373 //#endif
374 //eof
375
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Free Books Free Magazines
Popular Tags