package net.percederberg.grammatica.parser;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;

import net.percederberg.grammatica.parser.re.RegExp;
import net.percederberg.grammatica.parser.re.Matcher;
import net.percederberg.grammatica.parser.re.RegExpException;

/**
 * A character stream tokenizer. Token patterns are registered with
 * {@link #addPattern(TokenPattern)}, after which {@link #next()}
 * reads the wrapped input and returns one token at a time. String
 * patterns are all handled by a single shared matcher, while every
 * regular expression pattern gets a matcher of its own.
 */
public class Tokenizer {

    /**
     * The case-insensitivity flag. It is handed to every regular
     * expression and to the string automaton when patterns are
     * added and matched.
     */
    protected boolean ignoreCase = false;

    /**
     * The token list flag. When set, each token read is linked to
     * the token read before it.
     */
    private boolean useTokenList = false;

    /**
     * The matcher shared by all string token patterns.
     */
    private StringTokenMatcher stringMatcher = new StringTokenMatcher();

    /**
     * The list of regular expression matchers, holding one
     * RegExpTokenMatcher per pattern in the order they were added.
     */
    private ArrayList regexpMatchers = new ArrayList();

    /**
     * The look-ahead reader wrapping the current input.
     */
    private LookAheadReader input = null;

    /**
     * The most recently read token. It is only updated while the
     * token list flag is set.
     */
    private Token previousToken = null;

    /**
     * Creates a new case-sensitive tokenizer for the specified
     * input.
     *
     * @param input          the input to read
     */
    public Tokenizer(Reader input) {
        this(input, false);
    }

    /**
     * Creates a new tokenizer for the specified input.
     *
     * @param input          the input to read
     * @param ignoreCase     the case-insensitivity flag
     */
    public Tokenizer(Reader input, boolean ignoreCase) {
        this.input = new LookAheadReader(input);
        this.ignoreCase = ignoreCase;
    }

    /**
     * Returns the token list flag.
     *
     * @return true if tokens are linked to their predecessors, or
     *         false otherwise
     *
     * @see #setUseTokenList
     */
    public boolean getUseTokenList() {
        return useTokenList;
    }

    /**
     * Sets the token list flag. While the flag is set, every token
     * read (including error and ignored tokens) has its previous
     * token set to the token read immediately before it.
     *
     * @param useTokenList   the new token list flag
     *
     * @see #getUseTokenList
     */
    public void setUseTokenList(boolean useTokenList) {
        this.useTokenList = useTokenList;
    }

    /**
     * Returns the short description of the token pattern with the
     * specified id. The string patterns are searched before the
     * regular expression patterns.
     *
     * @param id             the token pattern id
     *
     * @return the short pattern description, or
     *         null if no pattern with that id has been added
     */
    public String getPatternDescription(int id) {
        TokenPattern stringPattern = stringMatcher.getPattern(id);

        if (stringPattern != null) {
            return stringPattern.toShortString();
        }
        for (int pos = 0; pos < regexpMatchers.size(); pos++) {
            RegExpTokenMatcher candidate =
                (RegExpTokenMatcher) regexpMatchers.get(pos);

            if (candidate.getPattern().getId() == id) {
                return candidate.getPattern().toShortString();
            }
        }
        return null;
    }

    /**
     * Returns the line number currently reported by the input
     * reader.
     *
     * @return the current line number
     */
    public int getCurrentLine() {
        return input.getLineNumber();
    }

    /**
     * Returns the column number currently reported by the input
     * reader.
     *
     * @return the current column number
     */
    public int getCurrentColumn() {
        return input.getColumnNumber();
    }

    /**
     * Adds a new token pattern to the tokenizer. String patterns
     * are added to the shared string matcher, and regular
     * expression patterns get a new matcher bound to the current
     * input.
     *
     * @param pattern        the pattern to add
     *
     * @throws ParserCreationException if the regular expression
     *             could not be created, or if the pattern type is
     *             neither string nor regular expression
     */
    public void addPattern(TokenPattern pattern)
        throws ParserCreationException {

        int type = pattern.getType();

        if (type == TokenPattern.STRING_TYPE) {
            stringMatcher.addPattern(pattern);
        } else if (type == TokenPattern.REGEXP_TYPE) {
            addRegExpPattern(pattern);
        } else {
            throw new ParserCreationException(
                ParserCreationException.INVALID_TOKEN_ERROR,
                pattern.getName(),
                "pattern type " + pattern.getType() + " is undefined");
        }
    }

    /**
     * Creates a matcher for a regular expression pattern and
     * appends it to the matcher list.
     *
     * @param pattern        the regular expression pattern to add
     *
     * @throws ParserCreationException if the regular expression
     *             could not be created
     */
    private void addRegExpPattern(TokenPattern pattern)
        throws ParserCreationException {

        RegExpTokenMatcher created;

        try {
            created = new RegExpTokenMatcher(pattern, input);
        } catch (RegExpException e) {
            throw new ParserCreationException(
                ParserCreationException.INVALID_TOKEN_ERROR,
                pattern.getName(),
                "regular expression contains error(s): " + e.getMessage());
        }
        regexpMatchers.add(created);
    }

    /**
     * Resets this tokenizer for usage with another input. The
     * previous input reader is closed (errors while closing are
     * ignored), the remembered previous token is cleared and all
     * matchers are reset. The patterns added remain in place.
     *
     * @param input          the new input to read
     */
    public void reset(Reader input) {
        try {
            this.input.close();
        } catch (IOException ignored) {
            // Best effort only, the old reader is discarded anyway
        }
        this.input = new LookAheadReader(input);
        stringMatcher.reset();
        this.previousToken = null;
        for (int pos = 0; pos < regexpMatchers.size(); pos++) {
            RegExpTokenMatcher re =
                (RegExpTokenMatcher) regexpMatchers.get(pos);

            re.reset(this.input);
        }
    }

    /**
     * Finds the next token on the input, skipping all tokens whose
     * pattern has the ignore flag set. If the token list flag is
     * set, every token read is first linked to the previously read
     * one.
     *
     * @return the next token found, or
     *         null if no further token could be read
     *
     * @throws ParseException if the input could not be read or
     *             matched, or if the token read belongs to an
     *             error pattern
     */
    public Token next() throws ParseException {
        while (true) {
            Token token = nextToken();

            if (token == null) {
                return null;
            }
            if (useTokenList) {
                token.setPreviousToken(previousToken);
                previousToken = token;
            }
            if (token.getPattern().isError()) {
                throw new ParseException(
                    ParseException.INVALID_TOKEN_ERROR,
                    token.getPattern().getErrorMessage(),
                    token.getStartLine(),
                    token.getStartColumn());
            }
            if (!token.getPattern().isIgnore()) {
                return token;
            }
            // Ignored token, continue with the one after it
        }
    }

    /**
     * Reads the next token from the input without any filtering,
     * so error and ignored tokens are returned as well.
     *
     * @return the next token found, or
     *         null if nothing matched and peeking at the input
     *         gave a negative value (presumably end of input)
     *
     * @throws ParseException if nothing matched although more
     *             input is available, or if reading the input
     *             failed
     */
    private Token nextToken() throws ParseException {
        try {
            TokenMatcher found = findMatch();

            if (found == null && input.peek(0) < 0) {
                return null;
            }

            // The position must be recorded before consuming input
            int line = input.getLineNumber();
            int column = input.getColumnNumber();

            if (found == null) {
                throw new ParseException(
                    ParseException.UNEXPECTED_CHAR_ERROR,
                    input.readString(1),
                    line,
                    column);
            }

            String image = input.readString(found.getMatchedLength());

            return new Token(found.getMatchedPattern(), image, line, column);
        } catch (IOException e) {
            throw new ParseException(ParseException.IO_ERROR,
                                     e.getMessage(),
                                     -1,
                                     -1);
        }
    }

    /**
     * Runs all matchers at the current input position and selects
     * the longest match. The string matcher wins over a regular
     * expression match of the same length, and between regular
     * expressions of equal match length the one added first wins.
     *
     * @return the matcher holding the selected match, or
     *         null if no matcher matched
     *
     * @throws IOException if a matcher failed to read the input
     */
    private TokenMatcher findMatch() throws IOException {
        TokenMatcher winner = null;
        int longest = 0;

        // The string patterns are checked first
        if (stringMatcher.match(input)) {
            winner = stringMatcher;
            longest = stringMatcher.getMatchedLength();
        }

        // A regular expression only replaces a strictly shorter match
        for (int pos = 0; pos < regexpMatchers.size(); pos++) {
            RegExpTokenMatcher candidate =
                (RegExpTokenMatcher) regexpMatchers.get(pos);

            if (!candidate.match()
             || candidate.getMatchedLength() <= longest) {

                continue;
            }
            winner = candidate;
            longest = candidate.getMatchedLength();
        }
        return winner;
    }

    /**
     * Returns a string representation of this object, built from
     * the string matcher followed by every regular expression
     * matcher.
     *
     * @return a string representation of this object
     */
    public String toString() {
        StringBuffer text = new StringBuffer();

        text.append(stringMatcher);
        for (int pos = 0; pos < regexpMatchers.size(); pos++) {
            text.append(regexpMatchers.get(pos));
        }
        return text.toString();
    }


    /**
     * The common base class of the token pattern matchers.
     */
    private abstract class TokenMatcher {

        /**
         * Returns the token pattern of the latest match.
         *
         * @return the matched token pattern, or
         *         null if there is no match
         */
        public abstract TokenPattern getMatchedPattern();

        /**
         * Returns the length of the latest match.
         *
         * @return the number of characters matched
         */
        public abstract int getMatchedLength();
    }


    /**
     * A matcher for a single regular expression token pattern.
     */
    private class RegExpTokenMatcher extends TokenMatcher {

        /**
         * The token pattern handled by this matcher.
         */
        private TokenPattern pattern;

        /**
         * The regular expression created from the pattern.
         */
        private RegExp regExp;

        /**
         * The regular expression matcher bound to the input.
         */
        private Matcher matcher;

        /**
         * Creates a new matcher for a regular expression pattern,
         * using the case-insensitivity flag of the tokenizer.
         *
         * @param pattern        the token pattern
         * @param input          the input to match against
         *
         * @throws RegExpException if the regular expression could
         *             not be created
         */
        public RegExpTokenMatcher(TokenPattern pattern, LookAheadReader input)
            throws RegExpException {

            this.pattern = pattern;
            this.regExp = new RegExp(pattern.getPattern(), ignoreCase);
            this.matcher = regExp.matcher(input);
        }

        /**
         * Resets the underlying matcher for use with a new input.
         *
         * @param input          the new input to match against
         */
        public void reset(LookAheadReader input) {
            matcher.reset(input);
        }

        /**
         * Returns the token pattern handled by this matcher.
         *
         * @return the token pattern
         */
        public TokenPattern getPattern() {
            return pattern;
        }

        /**
         * Returns the token pattern if the underlying matcher
         * reports a positive match length.
         *
         * @return the token pattern, or
         *         null if the match length is zero or negative
         */
        public TokenPattern getMatchedPattern() {
            return (matcher.length() > 0) ? pattern : null;
        }

        /**
         * Returns the match length reported by the underlying
         * matcher.
         *
         * @return the length of the latest match
         */
        public int getMatchedLength() {
            return matcher.length();
        }

        /**
         * Attempts a match from the beginning of the input bound
         * to the underlying matcher.
         *
         * @return the result reported by the underlying matcher
         *
         * @throws IOException if the input could not be read
         */
        public boolean match() throws IOException {
            return matcher.matchFromBeginning();
        }

        /**
         * Returns a string representation of this matcher, with
         * the pattern and the regular expression on separate
         * lines.
         *
         * @return a string representation of this matcher
         */
        public String toString() {
            String patternText = pattern.toString();
            String regExpText = regExp.toString();

            return patternText + "\n" + regExpText + "\n";
        }
    }


    /**
     * A matcher for all the string token patterns, backed by a
     * single automaton.
     */
    private class StringTokenMatcher extends TokenMatcher {

        /**
         * The list of all string token patterns added.
         */
        private ArrayList patterns = new ArrayList();

        /**
         * The automaton that all pattern strings are added to.
         */
        private Automaton start = new Automaton();

        /**
         * The pattern of the latest match, or null for none.
         */
        private TokenPattern match = null;

        /**
         * Creates a new string token matcher without patterns.
         */
        public StringTokenMatcher() {
        }

        /**
         * Clears the latest match. The patterns are kept.
         */
        public void reset() {
            match = null;
        }

        /**
         * Returns the token pattern of the latest match.
         *
         * @return the matched token pattern, or
         *         null if there is no match
         */
        public TokenPattern getMatchedPattern() {
            return match;
        }

        /**
         * Returns the length of the latest match, taken from the
         * length of the matched pattern string.
         *
         * @return the length of the latest match, or
         *         zero (0) if there is no match
         */
        public int getMatchedLength() {
            return (match == null) ? 0 : match.getPattern().length();
        }

        /**
         * Searches the added patterns for the specified id.
         *
         * @param id             the token pattern id
         *
         * @return the first pattern added with that id, or
         *         null if there is none
         */
        public TokenPattern getPattern(int id) {
            for (int pos = 0; pos < patterns.size(); pos++) {
                TokenPattern candidate = (TokenPattern) patterns.get(pos);

                if (candidate.getId() == id) {
                    return candidate;
                }
            }
            return null;
        }

        /**
         * Adds a string token pattern to this matcher and to the
         * automaton.
         *
         * @param pattern        the pattern to add
         */
        public void addPattern(TokenPattern pattern) {
            patterns.add(pattern);
            start.addMatch(pattern.getPattern(), ignoreCase, pattern);
        }

        /**
         * Runs the automaton on the input, starting at look-ahead
         * position zero, and stores the outcome as the latest
         * match.
         *
         * @param input          the input to match against
         *
         * @return true if a pattern matched, or
         *         false otherwise
         *
         * @throws IOException if the input could not be read
         */
        public boolean match(LookAheadReader input) throws IOException {
            // Clear first, so a read failure leaves no stale match
            match = null;
            match = (TokenPattern) start.matchFrom(input, 0, ignoreCase);
            return match != null;
        }

        /**
         * Returns a string representation of this matcher, listing
         * every pattern followed by a blank line.
         *
         * @return a string representation of this matcher
         */
        public String toString() {
            StringBuffer text = new StringBuffer();

            for (int pos = 0; pos < patterns.size(); pos++) {
                text.append(patterns.get(pos));
                text.append("\n\n");
            }
            return text.toString();
        }
    }
}