1 21 22 package net.percederberg.grammatica.parser.re; 23 24 import java.io.PrintWriter ; 25 import java.io.Reader ; 26 import java.io.StringReader ; 27 import java.io.StringWriter ; 28 import java.util.ArrayList ; 29 30 import net.percederberg.grammatica.parser.LookAheadReader; 31 32 41 public class RegExp { 42 43 46 private Element element; 47 48 51 private String pattern; 52 53 56 private boolean ignoreCase; 57 58 62 private int pos; 63 64 72 public RegExp(String pattern) throws RegExpException { 73 this(pattern, false); 74 } 75 76 88 public RegExp(String pattern, boolean ignoreCase) 89 throws RegExpException { 90 91 this.pattern = pattern; 92 this.ignoreCase = ignoreCase; 93 this.pos = 0; 94 this.element = parseExpr(); 95 if (pos < pattern.length()) { 96 throw new RegExpException( 97 RegExpException.UNEXPECTED_CHARACTER, 98 pos, 99 pattern); 100 } 101 } 102 103 114 public Matcher matcher(CharBuffer str) { 115 return matcher(str.toString()); 116 } 117 118 125 public Matcher matcher(String str) { 126 return matcher(new StringReader (str)); 127 } 128 129 136 public Matcher matcher(StringBuffer str) { 137 return matcher(new StringReader (str.toString())); 138 } 139 140 149 public Matcher matcher(Reader input) { 150 if (input instanceof LookAheadReader) { 151 return matcher((LookAheadReader) input); 152 } else { 153 return matcher(new LookAheadReader(input)); 154 } 155 } 156 157 167 private Matcher matcher(LookAheadReader input) { 168 return new Matcher((Element) element.clone(), input, ignoreCase); 169 } 170 171 176 public String toString() { 177 StringWriter str; 178 179 str = new StringWriter (); 180 str.write("Regular Expression\n"); 181 str.write(" Pattern: " + pattern + "\n"); 182 str.write(" Flags:"); 183 if (ignoreCase) { 184 str.write(" caseignore"); 185 } 186 str.write("\n"); 187 str.write(" Compiled:\n"); 188 element.printTo(new PrintWriter (str), " "); 189 return str.toString(); 190 } 191 192 201 private Element parseExpr() throws RegExpException { 202 Element first; 203 Element second; 204 205 first = parseTerm(); 206 if (peekChar(0) != '|') { 207 return first; 208 } else { 209 readChar('|'); 210 second = parseExpr(); 211 return new AlternativeElement(first, second); 212 } 213 } 214 215 224 private Element parseTerm() throws RegExpException { 225 ArrayList list = new ArrayList (); 226 227 list.add(parseFact()); 228 while (true) { 229 switch (peekChar(0)) { 230 case -1: 231 case ')': 232 case ']': 233 case '{': 234 case '}': 235 case '?': 236 case '+': 237 case '|': 238 return combineElements(list); 239 default: 240 list.add(parseFact()); 241 } 242 } 243 } 244 245 254 private Element parseFact() throws RegExpException { 255 Element elem; 256 257 elem = parseAtom(); 258 switch (peekChar(0)) { 259 case '?': 260 case '*': 261 case '+': 262 case '{': 263 return parseAtomModifier(elem); 264 default: 265 return elem; 266 } 267 } 268 269 278 private Element parseAtom() throws RegExpException { 279 Element elem; 280 281 switch (peekChar(0)) { 282 case '.': 283 readChar('.'); 284 return CharacterSetElement.DOT; 285 case '(': 286 readChar('('); 287 elem = parseExpr(); 288 readChar(')'); 289 return elem; 290 case '[': 291 readChar('['); 292 elem = parseCharSet(); 293 readChar(']'); 294 return elem; 295 case -1: 296 case ')': 297 case ']': 298 case '{': 299 case '}': 300 case '?': 301 case '*': 302 case '+': 303 case '|': 304 throw new RegExpException( 305 RegExpException.UNEXPECTED_CHARACTER, 306 pos, 307 pattern); 308 default: 309 return parseChar(); 310 } 311 } 312 313 324 private Element parseAtomModifier(Element elem) throws RegExpException { 325 int min = 0; 326 int max = -1; 327 int type = RepeatElement.GREEDY; 328 int firstPos; 329 330 switch (readChar()) { 332 case '?': 333 min = 0; 334 max = 1; 335 break; 336 case '*': 337 min = 0; 338 max = -1; 339 break; 340 case '+': 341 min = 1; 342 max = -1; 343 break; 344 case '{': 345 firstPos = pos - 1; 346 min = readNumber(); 347 max = min; 348 if (peekChar(0) == ',') { 349 readChar(','); 350 max = -1; 351 if (peekChar(0) != '}') { 352 max = readNumber(); 353 } 354 } 355 readChar('}'); 356 if (max == 0 || (max > 0 && min > max)) { 357 throw new RegExpException( 358 RegExpException.INVALID_REPEAT_COUNT, 359 firstPos, 360 pattern); 361 } 362 break; 363 default: 364 throw new RegExpException( 365 RegExpException.UNEXPECTED_CHARACTER, 366 pos - 1, 367 pattern); 368 } 369 370 if (peekChar(0) == '?') { 372 readChar('?'); 373 type = RepeatElement.RELUCTANT; 374 } else if (peekChar(0) == '+') { 375 readChar('+'); 376 type = RepeatElement.POSSESSIVE; 377 } 378 379 return new RepeatElement(elem, min, max, type); 380 } 381 382 391 private Element parseCharSet() throws RegExpException { 392 CharacterSetElement charset; 393 Element elem; 394 boolean repeat = true; 395 char start; 396 char end; 397 398 if (peekChar(0) == '^') { 399 readChar('^'); 400 charset = new CharacterSetElement(true); 401 } else { 402 charset = new CharacterSetElement(false); 403 } 404 405 while (peekChar(0) > 0 && repeat) { 406 start = (char) peekChar(0); 407 switch (start) { 408 case ']': 409 repeat = false; 410 break; 411 case '\\': 412 elem = parseEscapeChar(); 413 if (elem instanceof StringElement) { 414 charset.addCharacters((StringElement) elem); 415 } else { 416 charset.addCharacterSet((CharacterSetElement) elem); 417 } 418 break; 419 default: 420 readChar(start); 421 if (peekChar(0) == '-' 422 && peekChar(1) > 0 423 && peekChar(1) != ']') { 424 425 readChar('-'); 426 end = readChar(); 427 charset.addRange(fixChar(start), fixChar(end)); 428 } else { 429 charset.addCharacter(fixChar(start)); 430 } 431 } 432 } 433 434 return charset; 435 } 436 437 446 private Element parseChar() throws RegExpException { 447 switch (peekChar(0)) { 448 case '\\': 449 return parseEscapeChar(); 450 case '^': 451 case '$': 452 throw new RegExpException( 453 RegExpException.UNSUPPORTED_SPECIAL_CHARACTER, 454 pos, 455 pattern); 456 default: 457 return new StringElement(fixChar(readChar())); 458 } 459 } 460 461 470 private Element parseEscapeChar() throws RegExpException { 471 char c; 472 String str; 473 474 readChar('\\'); 475 c = readChar(); 476 switch (c) { 477 case '0': 478 c = readChar(); 479 if (c < '0' || c > '3') { 480 throw new RegExpException( 481 RegExpException.UNSUPPORTED_ESCAPE_CHARACTER, 482 pos - 3, 483 pattern); 484 } 485 str = String.valueOf(c); 486 c = (char) peekChar(0); 487 if ('0' <= c && c <= '7') { 488 str += String.valueOf(readChar()); 489 c = (char) peekChar(0); 490 if ('0' <= c && c <= '7') { 491 str += String.valueOf(readChar()); 492 } 493 } 494 try { 495 c = (char) Integer.parseInt(str, 8); 496 return new StringElement(fixChar(c)); 497 } catch (NumberFormatException e) { 498 throw new RegExpException( 499 RegExpException.UNSUPPORTED_ESCAPE_CHARACTER, 500 pos - str.length() - 2, 501 pattern); 502 } 503 case 'x': 504 str = String.valueOf(readChar()) + 505 String.valueOf(readChar()); 506 try { 507 c = (char) Integer.parseInt(str, 16); 508 return new StringElement(fixChar(c)); 509 } catch (NumberFormatException e) { 510 throw new RegExpException( 511 RegExpException.UNSUPPORTED_ESCAPE_CHARACTER, 512 pos - str.length() - 2, 513 pattern); 514 } 515 case 'u': 516 str = String.valueOf(readChar()) + 517 String.valueOf(readChar()) + 518 String.valueOf(readChar()) + 519 String.valueOf(readChar()); 520 try { 521 c = (char) Integer.parseInt(str, 16); 522 return new StringElement(fixChar(c)); 523 } catch (NumberFormatException e) { 524 throw new RegExpException( 525 RegExpException.UNSUPPORTED_ESCAPE_CHARACTER, 526 pos - str.length() - 2, 527 pattern); 528 } 529 case 't': 530 return new StringElement('\t'); 531 case 'n': 532 return new StringElement('\n'); 533 case 'r': 534 return new StringElement('\r'); 535 case 'f': 536 return new StringElement('\f'); 537 case 'a': 538 return new StringElement('\u0007'); 539 case 'e': 540 return new StringElement('\u001B'); 541 case 'd': 542 return CharacterSetElement.DIGIT; 543 case 'D': 544 return CharacterSetElement.NON_DIGIT; 545 case 's': 546 return CharacterSetElement.WHITESPACE; 547 case 'S': 548 return CharacterSetElement.NON_WHITESPACE; 549 case 'w': 550 return CharacterSetElement.WORD; 551 case 'W': 552 return CharacterSetElement.NON_WORD; 553 default: 554 if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z')) { 555 throw new RegExpException( 556 RegExpException.UNSUPPORTED_ESCAPE_CHARACTER, 557 pos - 2, 558 pattern); 559 } 560 return new StringElement(fixChar(c)); 561 } 562 } 563 564 573 private char fixChar(char c) { 574 return ignoreCase ? Character.toLowerCase(c) : c; 575 } 576 577 587 private int readNumber() throws RegExpException { 588 StringBuffer buf = new StringBuffer (); 589 int c; 590 591 c = peekChar(0); 592 while ('0' <= c && c <= '9') { 593 buf.append(readChar()); 594 c = peekChar(0); 595 } 596 if (buf.length() <= 0) { 597 throw new RegExpException( 598 RegExpException.UNEXPECTED_CHARACTER, 599 pos, 600 pattern); 601 } 602 return Integer.parseInt(buf.toString()); 603 } 604 605 614 private char readChar() throws RegExpException { 615 int c = peekChar(0); 616 617 if (c < 0) { 618 throw new RegExpException( 619 RegExpException.UNTERMINATED_PATTERN, 620 pos, 621 pattern); 622 } else { 623 pos++; 624 return (char) c; 625 } 626 } 627 628 640 private char readChar(char c) throws RegExpException { 641 if (c != readChar()) { 642 throw new RegExpException( 643 RegExpException.UNEXPECTED_CHARACTER, 644 pos - 1, 645 pattern); 646 } 647 return c; 648 } 649 650 660 private int peekChar(int count) { 661 if (pos + count < pattern.length()) { 662 return pattern.charAt(pos + count); 663 } else { 664 return -1; 665 } 666 } 667 668 677 private Element combineElements(ArrayList list) { 678 Element prev; 679 Element elem; 680 String str; 681 int i; 682 683 prev = (Element) list.get(0); 685 for (i = 1; i < list.size(); i++) { 686 elem = (Element) list.get(i); 687 if (prev instanceof StringElement 688 && elem instanceof StringElement) { 689 690 str = ((StringElement) prev).getString() + 691 ((StringElement) elem).getString(); 692 elem = new StringElement(str); 693 list.remove(i); 694 list.set(i - 1, elem); 695 i--; 696 } 697 prev = elem; 698 } 699 700 elem = (Element) list.get(list.size() - 1); 702 for (i = list.size() - 2; i >= 0; i--) { 703 prev = (Element) list.get(i); 704 elem = new CombineElement(prev, elem); 705 } 706 return elem; 707 } 708 } 709 | Popular Tags |