//##header 1189099963000 FOUNDATION
//#ifndef FOUNDATION
//##/*
//## *******************************************************************************
//## * Copyright (C) 2006, Google, International Business Machines Corporation and *
//## * others. All Rights Reserved.                                                *
//## *******************************************************************************
//## */
//##package com.ibm.icu.impl;
//##
//##import com.ibm.icu.text.UTF16;
//##import com.ibm.icu.text.UnicodeSet;
//##import com.ibm.icu.text.DateTimePatternGenerator.FormatParser;
//##import com.ibm.icu.text.DateTimePatternGenerator.VariableField;
//##
//##import java.util.BitSet;
//##import java.util.Iterator;
//##import java.util.List;
//##
//##/**
//## * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx escapes,
//## * and simple syntax.
//## * The '' (two quotes) is treated as a single quote, inside or outside a quote.
//## * <ul>
//## * <li>Any ignorable characters are ignored in parsing.</li>
//## * <li>Any syntax characters are broken into separate tokens.</li>
//## * <li>Literal text can be quoted with '...' (when {@link #setUsingQuote} is on) or escaped with \x
//## * (when {@link #setUsingSlash} is on). A backslash followed by u or U always starts a hex escape.</li>
//## * <li>Other characters are treated as literals.</li>
//## * </ul>
//## * All setters return this object so that calls can be chained.
//## */
//##public class PatternTokenizer {
//##    // settings used in the interpretation of the pattern
//##    private UnicodeSet ignorableCharacters = new UnicodeSet();
//##    private UnicodeSet syntaxCharacters = new UnicodeSet();
//##    private UnicodeSet escapeCharacters = new UnicodeSet();
//##    private boolean usingSlash = false;
//##    private boolean usingQuote = false;
//##
//##    // transient data, set when needed. Null it out for any changes in the above fields.
//##    private transient UnicodeSet needingQuoteCharacters = null;
//##
//##    // data about the current pattern being parsed. start gets moved as we go along.
//##    private int start;
//##    private int limit;
//##    private CharSequence pattern;
//##
//##    /**
//##     * Returns the characters that are skipped while parsing.
//##     * @return a copy of the ignorable set; changing it does not affect this tokenizer
//##     */
//##    public UnicodeSet getIgnorableCharacters() {
//##        return (UnicodeSet) ignorableCharacters.clone();
//##    }
//##
//##    /**
//##     * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");
//##     * @param ignorableCharacters the set to use; it is cloned, so later changes by the caller are not seen
//##     * @return this tokenizer, for chaining
//##     */
//##    public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {
//##        this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();
//##        needingQuoteCharacters = null; // cached quoting set depends on this field
//##        return this;
//##    }
//##
//##    /**
//##     * Returns the characters that are reported as separate SYNTAX tokens.
//##     * @return a copy of the syntax set; changing it does not affect this tokenizer
//##     */
//##    public UnicodeSet getSyntaxCharacters() {
//##        return (UnicodeSet) syntaxCharacters.clone();
//##    }
//##
//##    /**
//##     * Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")
//##     * @param syntaxCharacters the set to use; it is cloned, so later changes by the caller are not seen
//##     * @return this tokenizer, for chaining
//##     */
//##    public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {
//##        this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
//##        needingQuoteCharacters = null; // cached quoting set depends on this field
//##        return this;
//##    }
//##
//##    /**
//##     * Returns the characters that quoteLiteral writes as \\uxxxx or \\Uxxxxxxxx escapes.
//##     * @return a copy of the escape set; changing it does not affect this tokenizer
//##     */
//##    public UnicodeSet getEscapeCharacters() {
//##        return (UnicodeSet) escapeCharacters.clone();
//##    }
//##
//##    /**
//##     * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");
//##     * @param escapeCharacters the set to use; it is cloned, so later changes by the caller are not seen
//##     * @return this tokenizer, for chaining
//##     */
//##    public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {
//##        // escapeCharacters does not feed needingQuoteCharacters, so the cache stays valid
//##        this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();
//##        return this;
//##    }
//##
//##    /**
//##     * @return true if single-quote quoting ('...') is recognized when parsing and used when quoting
//##     */
//##    public boolean isUsingQuote() {
//##        return usingQuote;
//##    }
//##
//##    /**
//##     * Turns single-quote quoting ('...') on or off.
//##     * @param usingQuote true to recognize and produce '...' quotations
//##     * @return this tokenizer, for chaining
//##     */
//##    public PatternTokenizer setUsingQuote(boolean usingQuote) {
//##        this.usingQuote = usingQuote;
//##        needingQuoteCharacters = null; // cached quoting set depends on this field
//##        return this;
//##    }
//##
//##    /**
//##     * @return true if a backslash before an arbitrary character (\x) is treated as quoting that character
//##     */
//##    public boolean isUsingSlash() {
//##        return usingSlash;
//##    }
//##
//##    /**
//##     * Turns backslash quoting (\x) on or off.
//##     * @param usingSlash true to recognize and produce \x quotations
//##     * @return this tokenizer, for chaining
//##     */
//##    public PatternTokenizer setUsingSlash(boolean usingSlash) {
//##        this.usingSlash = usingSlash;
//##        needingQuoteCharacters = null; // cached quoting set depends on this field
//##        return this;
//##    }
//##
//##    // Disabled accessors for a configurable quote-character set, kept from the original source:
//##    //    public UnicodeSet getQuoteCharacters() {
//##    //        return (UnicodeSet) quoteCharacters.clone();
//##    //    }
//##    //    public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {
//##    //        this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();
//##    //        needingQuoteCharacters = null;
//##    //        return this;
//##    //    }
//##
//##    /**
//##     * @return the index in the pattern at which parsing stops (exclusive)
//##     */
//##    public int getLimit() {
//##        return limit;
//##    }
//##
//##    /**
//##     * Sets the index at which parsing stops. The value is stored as given; it is not checked
//##     * against the pattern length.
//##     * @param limit the exclusive end index
//##     * @return this tokenizer, for chaining
//##     */
//##    public PatternTokenizer setLimit(int limit) {
//##        this.limit = limit;
//##        return this;
//##    }
//##
//##    /**
//##     * @return the index in the pattern at which the next call to next() begins
//##     */
//##    public int getStart() {
//##        return start;
//##    }
//##
//##    /**
//##     * Sets the index at which the next call to next() begins. The value is stored as given;
//##     * it is not checked against the pattern length.
//##     * @param start the index to resume parsing from
//##     * @return this tokenizer, for chaining
//##     */
//##    public PatternTokenizer setStart(int start) {
//##        this.start = start;
//##        return this;
//##    }
//##
//##    /**
//##     * Sets the pattern to be tokenized and resets the parse range to the whole pattern
//##     * (start = 0, limit = pattern.length()).
//##     * @param pattern the text to parse; must not be null
//##     * @return this tokenizer, for chaining
//##     * @throws IllegalArgumentException if pattern is null
//##     */
//##    public PatternTokenizer setPattern(CharSequence pattern) {
//##        if (pattern == null) {
//##            throw new IllegalArgumentException("Inconsistent arguments");
//##        }
//##        this.start = 0;
//##        this.limit = pattern.length();
//##        this.pattern = pattern;
//##        return this;
//##    }
//##
//##    public static final char SINGLE_QUOTE = '\'';
//##    public static final char BACK_SLASH = '\\';
//##
//##    // states of quoteLiteral: whether the output currently has an open '...' run
//##    private static final int NO_QUOTE = -1, IN_QUOTE = -2;
//##
//##    /**
//##     * Quote a literal string, using the available settings. Thus syntax characters, quote characters,
//##     * and ignorable characters will be put into quotes.
//##     * <p>For each code point of the input, in order:
//##     * <ul>
//##     * <li>a character in the escape set is written as \\uxxxx or \\Uxxxxxxxx (closing any open quote first);</li>
//##     * <li>a character needing quoting is added to the currently open '...' run if there is one; otherwise it is
//##     * written as \x if usingSlash is set, else it opens a '...' run if usingQuote is set (a lone single quote
//##     * is written as ''), else it is written as a hex escape;</li>
//##     * <li>any other character is copied unchanged (closing any open quote first).</li>
//##     * </ul>
//##     * @param string the literal text to quote
//##     * @return the quoted form of the text
//##     */
//##    public String quoteLiteral(CharSequence string) {
//##        if (needingQuoteCharacters == null) {
//##            // build the cache lazily; the setters above null it out whenever an input changes
//##            needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters); // .addAll(quoteCharacters)
//##            if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);
//##            if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);
//##        }
//##        StringBuffer result = new StringBuffer();
//##        int quotedChar = NO_QUOTE;
//##        int cp;
//##        for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
//##            cp = UTF16.charAt(string, i);
//##            if (escapeCharacters.contains(cp)) {
//##                // we may have to fix up previous characters
//##                if (quotedChar == IN_QUOTE) {
//##                    result.append(SINGLE_QUOTE);
//##                    quotedChar = NO_QUOTE;
//##                }
//##                appendEscaped(result, cp);
//##                continue;
//##            }
//##
//##            if (needingQuoteCharacters.contains(cp)) {
//##                // if we have already started a quote
//##                if (quotedChar == IN_QUOTE) {
//##                    UTF16.append(result, cp);
//##                    if (usingQuote && cp == SINGLE_QUOTE) { // double it
//##                        result.append(SINGLE_QUOTE);
//##                    }
//##                    continue;
//##                }
//##                // otherwise not already in quote
//##                if (usingSlash) {
//##                    result.append(BACK_SLASH);
//##                    UTF16.append(result, cp);
//##                    continue;
//##                }
//##                if (usingQuote) {
//##                    if (cp == SINGLE_QUOTE) { // double it and continue
//##                        result.append(SINGLE_QUOTE);
//##                        result.append(SINGLE_QUOTE);
//##                        continue;
//##                    }
//##                    result.append(SINGLE_QUOTE);
//##                    UTF16.append(result, cp);
//##                    quotedChar = IN_QUOTE;
//##                    continue;
//##                }
//##                // we have no choice but to use \\u or \\U
//##                appendEscaped(result, cp);
//##                continue;
//##            }
//##            // otherwise cp doesn't need quoting
//##            // we may have to fix up previous characters
//##            if (quotedChar == IN_QUOTE) {
//##                result.append(SINGLE_QUOTE);
//##                quotedChar = NO_QUOTE;
//##            }
//##            UTF16.append(result, cp);
//##        }
//##        // all done.
//##        // we may have to fix up previous characters
//##        if (quotedChar == IN_QUOTE) {
//##            result.append(SINGLE_QUOTE);
//##        }
//##        return result.toString();
//##    }
//##
//##    /**
//##     * Appends cp to result as a hex escape: \\u plus 4 hex digits for values up to 0xFFFF,
//##     * otherwise \\U plus 8 hex digits.
//##     */
//##    private void appendEscaped(StringBuffer result, int cp) {
//##        if (cp <= 0xFFFF) {
//##            result.append("\\u").append(Utility.hex(cp,4));
//##        } else {
//##            result.append("\\U").append(Utility.hex(cp,8));
//##        }
//##    }
//##
//##    /**
//##     * Re-reads the pattern from the current start and rebuilds it token by token: SYNTAX tokens are
//##     * copied as they are, and every other token is re-quoted with quoteLiteral. The start position
//##     * is restored to its previous value before returning.
//##     * @return the rebuilt pattern text
//##     */
//##    public String normalize() {
//##        int oldStart = start;
//##        StringBuffer result = new StringBuffer();
//##        StringBuffer buffer = new StringBuffer();
//##        while (true) {
//##            buffer.setLength(0); // next() only appends, so clear the previous token
//##            int status = next(buffer);
//##            if (status == DONE) {
//##                start = oldStart;
//##                return result.toString();
//##            }
//##            if (status != SYNTAX) {
//##                result.append(quoteLiteral(buffer));
//##            } else {
//##                result.append(buffer);
//##            }
//##        }
//##    }
//##
//##    /** Status values returned by next(). */
//##    public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;
//##
//##    // states of the quote/escape state machine inside next()
//##    private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;
//##
//##    /**
//##     * Reads the next token of the pattern, beginning at the current start, and appends its text to
//##     * buffer (the buffer is not cleared first). The start position is advanced past what was consumed.
//##     * <ul>
//##     * <li>DONE: start has reached limit; nothing is appended.</li>
//##     * <li>SYNTAX: the first non-ignorable character is a syntax character; the token is that one character.</li>
//##     * <li>LITERAL: a run of literal text, with quotes and escapes resolved, ending before the next unquoted
//##     * syntax character or at limit.</li>
//##     * <li>BROKEN_QUOTE: limit was reached inside an unterminated '...' quote.</li>
//##     * <li>BROKEN_ESCAPE: a \\u or \\U escape has a non-hex digit, is cut off by limit, or names a value that is
//##     * not a Unicode code point; or, when usingSlash is set, the pattern ends with a lone backslash.</li>
//##     * <li>UNKNOWN: only ignorable characters remained before limit.</li>
//##     * </ul>
//##     * @param buffer receives the text of the token
//##     * @return one of DONE, SYNTAX, LITERAL, BROKEN_QUOTE, BROKEN_ESCAPE, UNKNOWN
//##     */
//##    public int next(StringBuffer buffer) {
//##        if (start >= limit) return DONE;
//##        int status = UNKNOWN;
//##        int lastQuote = UNKNOWN; // placeholder; only compared after a quote character has been stored
//##        int quoteStatus = NONE;
//##        int hexCount = 0;
//##        int hexValue = 0;
//##        int cp;
//##        main:
//##        for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
//##            cp = UTF16.charAt(pattern, i);
//##            // if we are in a quote, then handle it.
//##            switch (quoteStatus) {
//##            case SLASH_START:
//##                switch (cp) {
//##                case 'u':
//##                    quoteStatus = HEX;
//##                    hexCount = 4;
//##                    hexValue = 0;
//##                    continue main;
//##                case 'U':
//##                    quoteStatus = HEX;
//##                    hexCount = 8;
//##                    hexValue = 0;
//##                    continue main;
//##                default:
//##                    if (usingSlash) {
//##                        // \x quotes x: take it literally, whatever it is
//##                        UTF16.append(buffer, cp);
//##                        quoteStatus = NONE;
//##                        continue main;
//##                    } else {
//##                        // backslash is not a quoting character here: keep it, then treat cp normally below
//##                        buffer.append(BACK_SLASH);
//##                        quoteStatus = NONE;
//##                    }
//##                }
//##                break; // fall through to NONE
//##            case HEX:
//##                // add the raw character, then subtract the base of its digit class below
//##                hexValue <<= 4;
//##                hexValue += cp;
//##                switch (cp) {
//##                case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
//##                    hexValue -= '0'; break;
//##                case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
//##                    hexValue -= 'a' - 10; break;
//##                case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
//##                    hexValue -= 'A' - 10; break;
//##                default:
//##                    start = i;
//##                    return BROKEN_ESCAPE;
//##                }
//##                --hexCount;
//##                if (hexCount == 0) {
//##                    // Eight hex digits can spell a value above the last code point, or a negative
//##                    // one after int overflow. UTF16.append rejects those with an exception, so report
//##                    // them the same way as any other malformed escape.
//##                    if (hexValue < 0 || hexValue > UTF16.CODEPOINT_MAX_VALUE) {
//##                        start = i;
//##                        return BROKEN_ESCAPE;
//##                    }
//##                    quoteStatus = NONE;
//##                    UTF16.append(buffer, hexValue);
//##                }
//##                continue main;
//##            case AFTER_QUOTE:
//##                // see if we get another quote character
//##                // if we just ended a quote BUT the following character is the lastQuote character,
//##                // then we have a situation like '...''...', so we restart the quote
//##                if (cp == lastQuote) {
//##                    UTF16.append(buffer, cp);
//##                    quoteStatus = NORMAL_QUOTE;
//##                    continue main;
//##                }
//##                quoteStatus = NONE;
//##                break; // fall through to NONE
//##            case START_QUOTE:
//##                // if we are at the very start of a quote, and we hit another quote mark
//##                // then we emit a literal quote character and end the quote
//##                if (cp == lastQuote) {
//##                    UTF16.append(buffer, cp);
//##                    quoteStatus = NONE; // get out of quote, with no trace remaining
//##                    continue;
//##                }
//##                // otherwise get into quote
//##                UTF16.append(buffer, cp);
//##                quoteStatus = NORMAL_QUOTE;
//##                continue main;
//##            case NORMAL_QUOTE:
//##                if (cp == lastQuote) {
//##                    quoteStatus = AFTER_QUOTE; // get out of quote
//##                    continue main;
//##                }
//##                UTF16.append(buffer, cp);
//##                continue main;
//##            }
//##
//##            if (ignorableCharacters.contains(cp)) {
//##                continue;
//##            }
//##            // do syntax characters
//##            if (syntaxCharacters.contains(cp)) {
//##                if (status == UNKNOWN) {
//##                    UTF16.append(buffer, cp);
//##                    start = i + UTF16.getCharCount(cp);
//##                    return SYNTAX;
//##                } else { // LITERAL, so back up and break
//##                    start = i;
//##                    return status;
//##                }
//##            }
//##            // otherwise it is a literal; keep on going
//##            status = LITERAL;
//##            if (cp == BACK_SLASH) {
//##                // entered regardless of usingSlash, so that \\u and \\U escapes are always recognized
//##                quoteStatus = SLASH_START;
//##                continue;
//##            } else if (usingQuote && cp == SINGLE_QUOTE) {
//##                lastQuote = cp;
//##                quoteStatus = START_QUOTE;
//##                continue;
//##            }
//##            // normal literals
//##            UTF16.append(buffer, cp);
//##        }
//##        // handle final cleanup
//##        start = limit;
//##        switch (quoteStatus) {
//##        case HEX:
//##            status = BROKEN_ESCAPE;
//##            break;
//##        case SLASH_START:
//##            if (usingSlash) {
//##                status = BROKEN_ESCAPE;
//##            } else {
//##                buffer.append(BACK_SLASH);
//##            }
//##            break;
//##        case START_QUOTE: case NORMAL_QUOTE:
//##            status = BROKEN_QUOTE;
//##            break;
//##        }
//##        return status;
//##    }
//##
//##
//##}
//#endif
//eof