| 1 package org.apache.oro.text.regex; 2 3 59 60 75 76 public final class Perl5Compiler implements PatternCompiler { 77 private static final int __WORSTCASE = 0, __NONNULL = 0x1, __SIMPLE = 0x2, 78 __SPSTART = 0x4, __TRYAGAIN = 0x8; 79 80 private static final char 81 __CASE_INSENSITIVE = 0x0001, 82 __GLOBAL = 0x0002, 83 __KEEP = 0x0004, 84 __MULTILINE = 0x0008, 85 __SINGLELINE = 0x0010, 86 __EXTENDED = 0x0020, 87 __READ_ONLY = 0x8000; 88 89 private static final String __META_CHARS = "^$.[()|?+*\\"; 90 private static final String __HEX_DIGIT = 91 "0123456789abcdef0123456789ABCDEFx"; 92 private CharStringPointer __input; 93 private boolean __sawBackreference; 94 private char[] __modifierFlags = { 0 }; 95 96 private int __numParentheses, __programSize, __cost; 101 102 private char[] __program; 105 106 121 public static final int DEFAULT_MASK = 0; 122 123 127 public static final int CASE_INSENSITIVE_MASK = __CASE_INSENSITIVE; 128 129 142 public static final int MULTILINE_MASK = __MULTILINE; 143 144 159 public static final int SINGLELINE_MASK = __SINGLELINE; 160 161 175 public static final int EXTENDED_MASK = __EXTENDED; 176 177 189 public static final int READ_ONLY_MASK = __READ_ONLY; 190 191 205 public static final String quotemeta(char[] expression) { 206 int ch; 207 StringBuffer buffer; 208 209 buffer = new StringBuffer (2*expression.length); 210 for(ch = 0; ch < expression.length; ch++) { 211 if(!OpCode._isWordCharacter(expression[ch])) 212 buffer.append('\\'); 213 buffer.append(expression[ch]); 214 } 215 216 return buffer.toString(); 217 } 218 219 233 public static final String quotemeta(String expression) { 234 return quotemeta(expression.toCharArray()); 235 } 236 237 private static boolean __isSimpleRepetitionOp(char ch) { 238 return (ch == '*' || ch == '+' || ch == '?'); 239 } 240 241 private static boolean __isComplexRepetitionOp(char[] ch, int offset) { 242 if(offset < ch.length && offset >= 0) 243 return (ch[offset] == '*' || ch[offset] == '+' || ch[offset] == '?' 244 || (ch[offset] == '{' && __parseRepetition(ch, offset))); 245 return false; 246 } 247 248 private static boolean __parseRepetition(char[] str, int offset) { 250 if(str[offset] != '{') 251 return false; 252 ++offset; 253 254 if(offset >= str.length || !Character.isDigit(str[offset])) 255 return false; 256 257 while(offset < str.length && Character.isDigit(str[offset])) 258 ++offset; 259 260 if(offset < str.length && str[offset] == ',') 261 ++offset; 262 263 while(offset < str.length && Character.isDigit(str[offset])) 264 ++offset; 265 266 if(offset >= str.length || str[offset] != '}') 267 return false; 268 269 return true; 270 } 271 272 private static int __parseHex(char[] str, int offset, int maxLength, 273 int[] scanned) 274 { 275 int val = 0, index; 276 277 scanned[0] = 0; 278 while(offset < str.length && maxLength-- > 0 && 279 (index = __HEX_DIGIT.indexOf(str[offset])) != -1) { 280 val <<= 4; 281 val |= (index & 15); 282 ++offset; 283 ++scanned[0]; 284 } 285 286 return val; 287 } 288 289 private static int __parseOctal(char[] str, int offset, int maxLength, 290 int[] scanned) 291 { 292 int val = 0, index; 293 294 scanned[0] = 0; 295 while(offset < str.length && 296 maxLength > 0 && str[offset] >= '0' && str[offset] <= '7') { 297 val <<= 3; 298 val |= (str[offset] - '0'); 299 --maxLength; 300 ++offset; 301 ++scanned[0]; 302 } 303 304 return val; 305 } 306 307 private static void __setModifierFlag(char[] flags, char ch) { 308 switch(ch) { 309 case 'i' : flags[0] |= __CASE_INSENSITIVE; return; 310 case 'g' : flags[0] |= __GLOBAL; return; 311 case 'o' : flags[0] |= __KEEP; return; 312 case 'm' : flags[0] |= __MULTILINE; return; 313 case 's' : flags[0] |= __SINGLELINE; return; 314 case 'x' : flags[0] |= __EXTENDED; return; 315 } 316 } 317 318 private void __emitCode(char code) { 320 321 if(__program != null) 322 __program[__programSize] = code; 323 324 ++__programSize; 325 } 326 327 328 private int __emitNode(char operator) { 331 int offset; 332 333 offset = __programSize; 334 335 if(__program == null) 336 __programSize+=2; 337 else { 338 __program[__programSize++] = operator; 339 __program[__programSize++] = OpCode._NULL_POINTER; 340 } 341 342 return offset; 343 } 344 345 346 private int __emitArgNode(char operator, char arg) { 349 int offset; 350 351 offset = __programSize; 352 353 if(__program== null) 354 __programSize+=3; 355 else { 356 __program[__programSize++] = operator; 357 __program[__programSize++] = OpCode._NULL_POINTER; 358 __program[__programSize++] = arg; 359 } 360 361 return offset; 362 } 363 364 365 private void __programInsertOperator(char operator, int operand) { 367 int src, dest, offset; 368 369 offset = (OpCode._opType[operator] == OpCode._CURLY ? 2 : 0); 370 371 372 if(__program== null) { 373 __programSize+=(2 + offset); 374 return; 375 } 376 377 src = __programSize; 378 __programSize+=(2 + offset); 379 dest = __programSize; 380 381 while(src > operand) { 382 --src; 383 --dest; 384 __program[dest] = __program[src]; 385 } 386 387 __program[operand++] = operator; 388 __program[operand++] = OpCode._NULL_POINTER; 389 390 while(offset-- > 0) 391 __program[operand++] = OpCode._NULL_POINTER; 392 393 } 394 395 396 397 private void __programAddTail(int current, int value) { 398 int scan, temp, offset; 399 400 if(__program== null || current == OpCode._NULL_OFFSET) 401 return; 402 403 scan = current; 404 405 while(true) { 406 temp = OpCode._getNext(__program, scan); 407 if(temp == OpCode._NULL_OFFSET) 408 break; 409 scan = temp; 410 } 411 412 if(__program[scan] == OpCode._BACK) 413 offset = scan - value; 414 else 415 offset = value - scan; 416 417 __program[scan + 1] = (char)offset; 418 } 419 420 421 private void __programAddOperatorTail(int current, int value) { 422 if(__program== null || current == OpCode._NULL_OFFSET || 423 OpCode._opType[__program[current]] != OpCode._BRANCH) 424 return; 425 __programAddTail(OpCode._getNextOperator(current), value); 426 } 427 428 429 private char __getNextChar() { 430 char ret, value; 431 432 ret = __input._postIncrement(); 433 434 while(true) { 435 value = __input._getValue(); 436 437 if(value == '(' && __input._getValueRelative(1) == '?' && 438 __input._getValueRelative(2) == '#') { 439 while(value != CharStringPointer._END_OF_STRING && value != ')') 441 value = __input._increment(); 442 __input._increment(); 443 continue; 444 } 445 446 if((__modifierFlags[0] & __EXTENDED) != 0) { 447 if(Character.isWhitespace(value)) { 448 __input._increment(); 449 continue; 450 } else if(value == '#') { 451 while(value != CharStringPointer._END_OF_STRING && value != '\n') 452 value = __input._increment(); 453 __input._increment(); 454 continue; 455 } 456 } 457 458 460 461 return ret; 462 } 463 464 } 465 466 467 private int __parseAlternation(int[] retFlags) 468 throws MalformedPatternException 469 { 470 int chain, offset, latest; 471 int flags = 0; 472 char value; 473 474 retFlags[0] = __WORSTCASE; 475 476 offset = __emitNode(OpCode._BRANCH); 477 478 chain = OpCode._NULL_OFFSET; 479 480 if(__input._getOffset() == 0) { 481 __input._setOffset(-1); 482 __getNextChar(); 483 } else { 484 __input._decrement(); 485 __getNextChar(); 486 } 487 488 value = __input._getValue(); 489 490 while(value != CharStringPointer._END_OF_STRING && 491 value != '|' && value != ')') { 492 flags &= ~__TRYAGAIN; 493 latest = __parseBranch(retFlags); 494 495 if(latest == OpCode._NULL_OFFSET) { 496 if((flags & __TRYAGAIN) != 0){ 497 value = __input._getValue(); 498 continue; 499 } 500 return OpCode._NULL_OFFSET; 501 } 502 503 retFlags[0] |= (flags & __NONNULL); 504 505 if(chain == OpCode._NULL_OFFSET) 506 retFlags[0] |= (flags & __SPSTART); 507 else { 508 ++__cost; 509 __programAddTail(chain, latest); 510 } 511 chain = latest; 512 value = __input._getValue(); 513 } 514 515 if(chain == OpCode._NULL_OFFSET) 517 __emitNode(OpCode._NOTHING); 518 519 return offset; 520 } 521 522 523 private int __parseAtom(int[] retFlags) throws MalformedPatternException { 524 boolean doDefault; 525 char value; 526 int offset, flags[] = { 0 }; 527 528 529 retFlags[0] = __WORSTCASE; 530 doDefault = false; 531 offset = OpCode._NULL_OFFSET; 532 533 tryAgain: 534 while(true) { 535 536 value = __input._getValue(); 537 538 switch(value) { 539 case '^' : 540 __getNextChar(); 541 if((__modifierFlags[0] & __MULTILINE) != 0) 544 offset = __emitNode(OpCode._MBOL); 545 else if((__modifierFlags[0] & __SINGLELINE) != 0) 546 offset = __emitNode(OpCode._SBOL); 547 else 548 offset = __emitNode(OpCode._BOL); 549 break tryAgain; 550 551 case '$': 552 __getNextChar(); 553 if((__modifierFlags[0] & __MULTILINE) != 0) 556 offset = __emitNode(OpCode._MEOL); 557 else if((__modifierFlags[0] & __SINGLELINE) != 0) 558 offset = __emitNode(OpCode._SEOL); 559 else 560 offset = __emitNode(OpCode._EOL); 561 break tryAgain; 562 563 case '.': 564 __getNextChar(); 565 if((__modifierFlags[0] & __SINGLELINE) != 0) 568 offset = __emitNode(OpCode._SANY); 569 else 570 offset = __emitNode(OpCode._ANY); 571 ++__cost; 572 retFlags[0] |= (__NONNULL | __SIMPLE); 573 break tryAgain; 574 575 case '[': 576 __input._increment(); 577 offset = __parseCharacterClass(); 578 retFlags[0] |= (__NONNULL | __SIMPLE); 579 break tryAgain; 580 581 case '(': 582 __getNextChar(); 583 offset = __parseExpression(true, flags); 584 if(offset == OpCode._NULL_OFFSET) { 585 if((flags[0] & __TRYAGAIN) != 0) 586 continue tryAgain; 587 return OpCode._NULL_OFFSET; 588 } 589 retFlags[0] |= (flags[0] & (__NONNULL | __SPSTART)); 590 break tryAgain; 591 592 case '|': 593 case ')': 594 if((flags[0] & __TRYAGAIN) != 0) { 595 retFlags[0] |= __TRYAGAIN; 596 return OpCode._NULL_OFFSET; 597 } 598 599 throw new MalformedPatternException("Error in expression at " + 600 __input._toString(__input._getOffset())); 601 603 case '?': 604 case '+': 605 case '*': 606 throw new MalformedPatternException( 607 "?+* follows nothing in expression"); 608 610 case '\\': 611 value = __input._increment(); 612 613 switch(value) { 614 case 'A' : 615 offset = __emitNode(OpCode._SBOL); 616 retFlags[0] |= __SIMPLE; 617 __getNextChar(); 618 break; 619 case 'G': 620 offset = __emitNode(OpCode._GBOL); 621 retFlags[0] |= __SIMPLE; 622 __getNextChar(); 623 break; 624 case 'Z': 625 offset = __emitNode(OpCode._SEOL); 626 retFlags[0] |= __SIMPLE; 627 __getNextChar(); 628 break; 629 case 'w': 630 offset = __emitNode(OpCode._ALNUM); 631 retFlags[0] |= (__NONNULL | __SIMPLE); 632 __getNextChar(); 633 break; 634 case 'W': 635 offset = __emitNode(OpCode._NALNUM); 636 retFlags[0] |= (__NONNULL | __SIMPLE); 637 __getNextChar(); 638 break; 639 case 'b': 640 offset = __emitNode(OpCode._BOUND); 641 retFlags[0] |= __SIMPLE; 642 __getNextChar(); 643 break; 644 case 'B': 645 offset = __emitNode(OpCode._NBOUND); 646 retFlags[0] |= __SIMPLE; 647 __getNextChar(); 648 break; 649 case 's': 650 offset = __emitNode(OpCode._SPACE); 651 retFlags[0] |= (__NONNULL | __SIMPLE); 652 __getNextChar(); 653 break; 654 case 'S': 655 offset = __emitNode(OpCode._NSPACE); 656 retFlags[0] |= (__NONNULL | __SIMPLE); 657 __getNextChar(); 658 break; 659 case 'd': 660 offset = __emitNode(OpCode._DIGIT); 661 retFlags[0] |= (__NONNULL | __SIMPLE); 662 __getNextChar(); 663 break; 664 case 'D': 665 offset = __emitNode(OpCode._NDIGIT); 666 retFlags[0] |= (__NONNULL | __SIMPLE); 667 __getNextChar(); 668 break; 669 case 'n': case 'r': case 't': case 'f': case 'e': case 'a': case 'x': 670 case 'c': case '0': 671 doDefault = true; 672 break tryAgain; 673 case '1': case '2': case '3': case '4': case '5': case '6': case '7': 674 case '8': case '9': 675 int num; 676 StringBuffer buffer = new StringBuffer (10); 677 678 num = 0; 679 value = __input._getValueRelative(num); 680 681 while(Character.isDigit(value)) { 682 buffer.append(value); 683 ++num; 684 value = __input._getValueRelative(num); 685 } 686 687 try { 688 num = Integer.parseInt(buffer.toString()); 689 } catch(NumberFormatException e) { 690 throw new MalformedPatternException( 691 "Unexpected number format exception. Please report this bug." + 692 "NumberFormatException message: " + e.getMessage()); 693 } 694 695 if(num > 9 && num >= __numParentheses) { 696 doDefault = true; 697 break tryAgain; 698 } else { 699 if(num >= __numParentheses) 701 throw new MalformedPatternException("Invalid backreference: \\" + 702 num); 703 __sawBackreference = true; 704 offset = __emitArgNode(OpCode._REF, (char)num); 705 retFlags[0] |= __NONNULL; 706 707 value = __input._getValue(); 708 while(Character.isDigit(value)) 709 value = __input._increment(); 710 711 __input._decrement(); 712 __getNextChar(); 713 } 714 break; 715 case '\0': 716 case CharStringPointer._END_OF_STRING: 717 if(__input._isAtEnd()) 718 throw new 719 MalformedPatternException("Trailing \\ in expression."); 720 default: 722 doDefault = true; 723 break tryAgain; 724 } 725 break tryAgain; 726 727 case '#': 728 if((__modifierFlags[0] & __EXTENDED) != 0) { 730 while(!__input._isAtEnd() && __input._getValue() != '\n') 731 __input._increment(); 732 if(!__input._isAtEnd()) 733 continue tryAgain; 734 } 735 default: 737 __input._increment(); 738 doDefault = true; 739 break tryAgain; 740 } } 743 744 if(doDefault) { 745 char ender; 746 int length, pOffset, maxOffset, lastOffset, numLength[]; 747 748 offset = __emitNode(OpCode._EXACTLY); 749 __emitCode((char)CharStringPointer._END_OF_STRING); 752 753 forLoop: 754 for(length = 0, pOffset = __input._getOffset() - 1, 755 maxOffset = __input._getLength(); 756 length < 127 && pOffset < maxOffset; ++length) { 757 758 lastOffset = pOffset; 759 value = __input._getValue(pOffset); 760 761 switch(value) { 762 case '^': case '$': case '.': case '[': case '(': case ')': 763 case '|': 764 break forLoop; 765 case '\\': 766 value = __input._getValue(++pOffset); 767 768 switch(value) { 769 case 'A': case 'G': case 'Z': case 'w': case 'W': case 'b': 770 case 'B': case 's': case 'S': case 'd': case 'D': 771 --pOffset; 772 break forLoop; 773 case 'n': 774 ender = '\n'; 775 ++pOffset; 776 break; 777 case 'r': 778 ender = '\r'; 779 ++pOffset; 780 break; 781 case 't': 782 ender = '\t'; 783 ++pOffset; 784 break; 785 case 'f': 786 ender = '\f'; 787 ++pOffset; 788 break; 789 case 'e': 790 ender = '\033'; 791 ++pOffset; 792 break; 793 case 'a': 794 ender = '\007'; 795 ++pOffset; 796 break; 797 case 'x': 798 numLength = new int[1]; 799 ender = (char)__parseHex(__input._array, ++pOffset, 2, numLength); 800 pOffset+=numLength[0]; 801 break; 802 case 'c': 803 ++pOffset; 804 ender = __input._getValue(pOffset++); 805 if(Character.isLowerCase(ender)) 806 ender = Character.toUpperCase(ender); 807 ender ^= 64; 808 break; 809 case '0': case '1': case '2': case'3': case '4': case '5': 810 case '6': case '7': case '8': case '9': 811 boolean doOctal = false; 812 value = __input._getValue(pOffset); 813 814 if(value == '0') 815 doOctal = true; 816 value = __input._getValue(pOffset + 1); 817 818 if(Character.isDigit(value)) { 819 int num; 820 StringBuffer buffer = new StringBuffer (10); 821 822 num = pOffset; 823 value = __input._getValue(num); 824 825 while(Character.isDigit(value)){ 826 buffer.append(value); 827 ++num; 828 value = __input._getValue(num); 829 } 830 831 try { 832 num = Integer.parseInt(buffer.toString()); 833 } catch(NumberFormatException e) { 834 throw new MalformedPatternException( 835 "Unexpected number format exception. Please report this bug." + 836 "NumberFormatException message: " + e.getMessage()); 837 } 838 839 if(!doOctal) 840 doOctal = (num >= __numParentheses); 841 } 842 843 if(doOctal) { 844 numLength = new int[1]; 845 ender = (char)__parseOctal(__input._array, pOffset, 3, numLength); 846 pOffset+=numLength[0]; 847 } else { 848 --pOffset; 849 break forLoop; 850 } 851 break; 852 853 case CharStringPointer._END_OF_STRING: 854 case '\0': 855 if(pOffset >= maxOffset) 856 throw new 857 MalformedPatternException("Trailing \\ in expression."); 858 default: 860 ender = __input._getValue(pOffset++); 861 break; 862 } break; 864 865 case '#': 866 if((__modifierFlags[0] & __EXTENDED) != 0) { 867 while(pOffset < maxOffset && __input._getValue(pOffset) != '\n') 868 ++pOffset; 869 } 870 case ' ': case '\t': case '\n': case '\r': case '\f': case '\013': 872 if((__modifierFlags[0] & __EXTENDED) != 0) { 873 ++pOffset; 874 --length; 875 continue; 876 } 877 default: 879 ender = __input._getValue(pOffset++); 880 break; 881 882 } 884 if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 && 885 Character.isUpperCase(ender)) 886 ender = Character.toLowerCase(ender); 887 888 if(pOffset < maxOffset && __isComplexRepetitionOp(__input._array, pOffset)) { 889 if(length > 0) 890 pOffset = lastOffset; 891 else { 892 ++length; 893 __emitCode(ender); 894 } 895 break; 896 } 897 898 __emitCode(ender); 899 900 901 } 903 904 __input._setOffset(pOffset - 1); 905 __getNextChar(); 906 907 if(length < 0) 908 throw new MalformedPatternException( 909 "Unexpected compilation failure. Please report this bug!"); 910 if(length > 0) 911 retFlags[0] |= __NONNULL; 912 if(length == 1) 913 retFlags[0] |= __SIMPLE; 914 if(__program!= null) 915 __program[OpCode._getOperand(offset)] = (char)length; 916 __emitCode(CharStringPointer._END_OF_STRING); 918 } 919 920 return offset; 921 } 922 923 924 private void __setCharacterClassBits(char[] bits, int offset, char deflt, 926 char ch) 927 { 928 if(__program== null || ch >= 256) 929 return; 930 ch &= 0xffff; 931 932 if(deflt == 0) { 933 bits[offset + (ch >> 4)] |= (1 << (ch & 0xf)); 934 } else { 935 bits[offset + (ch >> 4)] &= ~(1 << (ch & 0xf)); 936 } 937 } 938 939 940 private int __parseCharacterClass() throws MalformedPatternException { 941 boolean range = false, skipTest; 942 char clss, deflt, lastclss = Character.MAX_VALUE; 943 int offset, bits, numLength[] = { 0 }; 944 945 offset = __emitNode(OpCode._ANYOF); 946 947 if(__input._getValue() == '^') { 948 ++__cost; 949 __input._increment(); 950 deflt = 0; 951 } else { 952 deflt = 0xffff; 953 } 954 955 bits = __programSize; 956 for(clss = 0; clss < 16; clss++) 957 __emitCode(deflt); 958 959 clss = __input._getValue(); 960 961 if(clss == ']' || clss == '-') 962 skipTest = true; 963 else 964 skipTest = false; 965 966 while((!__input._isAtEnd() && (clss = __input._getValue()) != ']') 967 || skipTest) { 968 skipTest = false; 970 __input._increment(); 971 if(clss == '\\') { 972 clss = __input._postIncrement(); 973 974 switch(clss){ 975 case 'w': 976 for(clss = 0; clss < 256; clss++) 977 if(OpCode._isWordCharacter(clss)) 978 __setCharacterClassBits(__program, bits, deflt, clss); 979 lastclss = Character.MAX_VALUE; 980 continue; 981 case 'W': 982 for(clss = 0; clss < 256; clss++) 983 if(!OpCode._isWordCharacter(clss)) 984 __setCharacterClassBits(__program, bits, deflt, clss); 985 lastclss = Character.MAX_VALUE; 986 continue; 987 case 's': 988 for(clss = 0; clss < 256; clss++) 989 if(Character.isWhitespace(clss)) 990 __setCharacterClassBits(__program, bits, deflt, clss); 991 lastclss = Character.MAX_VALUE; 992 continue; 993 case 'S': 994 for(clss = 0; clss < 256; clss++) 995 if(!Character.isWhitespace(clss)) 996 __setCharacterClassBits(__program, bits, deflt, clss); 997 lastclss = Character.MAX_VALUE; 998 continue; 999 case 'd': 1000 for(clss = '0'; clss <= '9'; clss++) 1001 __setCharacterClassBits(__program, bits, deflt, clss); 1002 lastclss = Character.MAX_VALUE; 1003 continue; 1004 case 'D': 1005 for(clss = 0; clss < '0'; clss++) 1006 __setCharacterClassBits(__program, bits, deflt, clss); 1007 for(clss = (char)('9' + 1); clss < 256; clss++) 1008 __setCharacterClassBits(__program, bits, deflt, clss); 1009 lastclss = Character.MAX_VALUE; 1010 continue; 1011 case 'n': 1012 clss = '\n'; 1013 break; 1014 case 'r': 1015 clss = '\r'; 1016 break; 1017 case 't': 1018 clss = '\t'; 1019 break; 1020 case 'f': 1021 clss = |