1 package org.apache.oro.text.regex; 2 3 59 60 75 76 public final class Perl5Compiler implements PatternCompiler { 77 private static final int __WORSTCASE = 0, __NONNULL = 0x1, __SIMPLE = 0x2, 78 __SPSTART = 0x4, __TRYAGAIN = 0x8; 79 80 private static final char 81 __CASE_INSENSITIVE = 0x0001, 82 __GLOBAL = 0x0002, 83 __KEEP = 0x0004, 84 __MULTILINE = 0x0008, 85 __SINGLELINE = 0x0010, 86 __EXTENDED = 0x0020, 87 __READ_ONLY = 0x8000; 88 89 private static final String __META_CHARS = "^$.[()|?+*\\"; 90 private static final String __HEX_DIGIT = 91 "0123456789abcdef0123456789ABCDEFx"; 92 private CharStringPointer __input; 93 private boolean __sawBackreference; 94 private char[] __modifierFlags = { 0 }; 95 96 private int __numParentheses, __programSize, __cost; 101 102 private char[] __program; 105 106 121 public static final int DEFAULT_MASK = 0; 122 123 127 public static final int CASE_INSENSITIVE_MASK = __CASE_INSENSITIVE; 128 129 142 public static final int MULTILINE_MASK = __MULTILINE; 143 144 159 public static final int SINGLELINE_MASK = __SINGLELINE; 160 161 175 public static final int EXTENDED_MASK = __EXTENDED; 176 177 189 public static final int READ_ONLY_MASK = __READ_ONLY; 190 191 205 public static final String quotemeta(char[] expression) { 206 int ch; 207 StringBuffer buffer; 208 209 buffer = new StringBuffer (2*expression.length); 210 for(ch = 0; ch < expression.length; ch++) { 211 if(!OpCode._isWordCharacter(expression[ch])) 212 buffer.append('\\'); 213 buffer.append(expression[ch]); 214 } 215 216 return buffer.toString(); 217 } 218 219 233 public static final String quotemeta(String expression) { 234 return quotemeta(expression.toCharArray()); 235 } 236 237 private static boolean __isSimpleRepetitionOp(char ch) { 238 return (ch == '*' || ch == '+' || ch == '?'); 239 } 240 241 private static boolean __isComplexRepetitionOp(char[] ch, int offset) { 242 if(offset < ch.length && offset >= 0) 243 return (ch[offset] == '*' || ch[offset] == '+' || ch[offset] == '?' 244 || (ch[offset] == '{' && __parseRepetition(ch, offset))); 245 return false; 246 } 247 248 private static boolean __parseRepetition(char[] str, int offset) { 250 if(str[offset] != '{') 251 return false; 252 ++offset; 253 254 if(offset >= str.length || !Character.isDigit(str[offset])) 255 return false; 256 257 while(offset < str.length && Character.isDigit(str[offset])) 258 ++offset; 259 260 if(offset < str.length && str[offset] == ',') 261 ++offset; 262 263 while(offset < str.length && Character.isDigit(str[offset])) 264 ++offset; 265 266 if(offset >= str.length || str[offset] != '}') 267 return false; 268 269 return true; 270 } 271 272 private static int __parseHex(char[] str, int offset, int maxLength, 273 int[] scanned) 274 { 275 int val = 0, index; 276 277 scanned[0] = 0; 278 while(offset < str.length && maxLength-- > 0 && 279 (index = __HEX_DIGIT.indexOf(str[offset])) != -1) { 280 val <<= 4; 281 val |= (index & 15); 282 ++offset; 283 ++scanned[0]; 284 } 285 286 return val; 287 } 288 289 private static int __parseOctal(char[] str, int offset, int maxLength, 290 int[] scanned) 291 { 292 int val = 0, index; 293 294 scanned[0] = 0; 295 while(offset < str.length && 296 maxLength > 0 && str[offset] >= '0' && str[offset] <= '7') { 297 val <<= 3; 298 val |= (str[offset] - '0'); 299 --maxLength; 300 ++offset; 301 ++scanned[0]; 302 } 303 304 return val; 305 } 306 307 private static void __setModifierFlag(char[] flags, char ch) { 308 switch(ch) { 309 case 'i' : flags[0] |= __CASE_INSENSITIVE; return; 310 case 'g' : flags[0] |= __GLOBAL; return; 311 case 'o' : flags[0] |= __KEEP; return; 312 case 'm' : flags[0] |= __MULTILINE; return; 313 case 's' : flags[0] |= __SINGLELINE; return; 314 case 'x' : flags[0] |= __EXTENDED; return; 315 } 316 } 317 318 private void __emitCode(char code) { 320 321 if(__program != null) 322 __program[__programSize] = code; 323 324 ++__programSize; 325 } 326 327 328 private int __emitNode(char operator) { 331 int offset; 332 333 offset = __programSize; 334 335 if(__program == null) 336 __programSize+=2; 337 else { 338 __program[__programSize++] = operator; 339 __program[__programSize++] = OpCode._NULL_POINTER; 340 } 341 342 return offset; 343 } 344 345 346 private int __emitArgNode(char operator, char arg) { 349 int offset; 350 351 offset = __programSize; 352 353 if(__program== null) 354 __programSize+=3; 355 else { 356 __program[__programSize++] = operator; 357 __program[__programSize++] = OpCode._NULL_POINTER; 358 __program[__programSize++] = arg; 359 } 360 361 return offset; 362 } 363 364 365 private void __programInsertOperator(char operator, int operand) { 367 int src, dest, offset; 368 369 offset = (OpCode._opType[operator] == OpCode._CURLY ? 2 : 0); 370 371 372 if(__program== null) { 373 __programSize+=(2 + offset); 374 return; 375 } 376 377 src = __programSize; 378 __programSize+=(2 + offset); 379 dest = __programSize; 380 381 while(src > operand) { 382 --src; 383 --dest; 384 __program[dest] = __program[src]; 385 } 386 387 __program[operand++] = operator; 388 __program[operand++] = OpCode._NULL_POINTER; 389 390 while(offset-- > 0) 391 __program[operand++] = OpCode._NULL_POINTER; 392 393 } 394 395 396 397 private void __programAddTail(int current, int value) { 398 int scan, temp, offset; 399 400 if(__program== null || current == OpCode._NULL_OFFSET) 401 return; 402 403 scan = current; 404 405 while(true) { 406 temp = OpCode._getNext(__program, scan); 407 if(temp == OpCode._NULL_OFFSET) 408 break; 409 scan = temp; 410 } 411 412 if(__program[scan] == OpCode._BACK) 413 offset = scan - value; 414 else 415 offset = value - scan; 416 417 __program[scan + 1] = (char)offset; 418 } 419 420 421 private void __programAddOperatorTail(int current, int value) { 422 if(__program== null || current == OpCode._NULL_OFFSET || 423 OpCode._opType[__program[current]] != OpCode._BRANCH) 424 return; 425 __programAddTail(OpCode._getNextOperator(current), value); 426 } 427 428 429 private char __getNextChar() { 430 char ret, value; 431 432 ret = __input._postIncrement(); 433 434 while(true) { 435 value = __input._getValue(); 436 437 if(value == '(' && __input._getValueRelative(1) == '?' && 438 __input._getValueRelative(2) == '#') { 439 while(value != CharStringPointer._END_OF_STRING && value != ')') 441 value = __input._increment(); 442 __input._increment(); 443 continue; 444 } 445 446 if((__modifierFlags[0] & __EXTENDED) != 0) { 447 if(Character.isWhitespace(value)) { 448 __input._increment(); 449 continue; 450 } else if(value == '#') { 451 while(value != CharStringPointer._END_OF_STRING && value != '\n') 452 value = __input._increment(); 453 __input._increment(); 454 continue; 455 } 456 } 457 458 460 461 return ret; 462 } 463 464 } 465 466 467 private int __parseAlternation(int[] retFlags) 468 throws MalformedPatternException 469 { 470 int chain, offset, latest; 471 int flags = 0; 472 char value; 473 474 retFlags[0] = __WORSTCASE; 475 476 offset = __emitNode(OpCode._BRANCH); 477 478 chain = OpCode._NULL_OFFSET; 479 480 if(__input._getOffset() == 0) { 481 __input._setOffset(-1); 482 __getNextChar(); 483 } else { 484 __input._decrement(); 485 __getNextChar(); 486 } 487 488 value = __input._getValue(); 489 490 while(value != CharStringPointer._END_OF_STRING && 491 value != '|' && value != ')') { 492 flags &= ~__TRYAGAIN; 493 latest = __parseBranch(retFlags); 494 495 if(latest == OpCode._NULL_OFFSET) { 496 if((flags & __TRYAGAIN) != 0){ 497 value = __input._getValue(); 498 continue; 499 } 500 return OpCode._NULL_OFFSET; 501 } 502 503 retFlags[0] |= (flags & __NONNULL); 504 505 if(chain == OpCode._NULL_OFFSET) 506 retFlags[0] |= (flags & __SPSTART); 507 else { 508 ++__cost; 509 __programAddTail(chain, latest); 510 } 511 chain = latest; 512 value = __input._getValue(); 513 } 514 515 if(chain == OpCode._NULL_OFFSET) 517 __emitNode(OpCode._NOTHING); 518 519 return offset; 520 } 521 522 523 private int __parseAtom(int[] retFlags) throws MalformedPatternException { 524 boolean doDefault; 525 char value; 526 int offset, flags[] = { 0 }; 527 528 529 retFlags[0] = __WORSTCASE; 530 doDefault = false; 531 offset = OpCode._NULL_OFFSET; 532 533 tryAgain: 534 while(true) { 535 536 value = __input._getValue(); 537 538 switch(value) { 539 case '^' : 540 __getNextChar(); 541 if((__modifierFlags[0] & __MULTILINE) != 0) 544 offset = __emitNode(OpCode._MBOL); 545 else if((__modifierFlags[0] & __SINGLELINE) != 0) 546 offset = __emitNode(OpCode._SBOL); 547 else 548 offset = __emitNode(OpCode._BOL); 549 break tryAgain; 550 551 case '$': 552 __getNextChar(); 553 if((__modifierFlags[0] & __MULTILINE) != 0) 556 offset = __emitNode(OpCode._MEOL); 557 else if((__modifierFlags[0] & __SINGLELINE) != 0) 558 offset = __emitNode(OpCode._SEOL); 559 else 560 offset = __emitNode(OpCode._EOL); 561 break tryAgain; 562 563 case '.': 564 __getNextChar(); 565 if((__modifierFlags[0] & __SINGLELINE) != 0) 568 offset = __emitNode(OpCode._SANY); 569 else 570 offset = __emitNode(OpCode._ANY); 571 ++__cost; 572 retFlags[0] |= (__NONNULL | __SIMPLE); 573 break tryAgain; 574 575 case '[': 576 __input._increment(); 577 offset = __parseCharacterClass(); 578 retFlags[0] |= (__NONNULL | __SIMPLE); 579 break tryAgain; 580 581 case '(': 582 __getNextChar(); 583 offset = __parseExpression(true, flags); 584 if(offset == OpCode._NULL_OFFSET) { 585 if((flags[0] & __TRYAGAIN) != 0) 586 continue tryAgain; 587 return OpCode._NULL_OFFSET; 588 } 589 retFlags[0] |= (flags[0] & (__NONNULL | __SPSTART)); 590 break tryAgain; 591 592 case '|': 593 case ')': 594 if((flags[0] & __TRYAGAIN) != 0) { 595 retFlags[0] |= __TRYAGAIN; 596 return OpCode._NULL_OFFSET; 597 } 598 599 throw new MalformedPatternException("Error in expression at " + 600 __input._toString(__input._getOffset())); 601 603 case '?': 604 case '+': 605 case '*': 606 throw new MalformedPatternException( 607 "?+* follows nothing in expression"); 608 610 case '\\': 611 value = __input._increment(); 612 613 switch(value) { 614 case 'A' : 615 offset = __emitNode(OpCode._SBOL); 616 retFlags[0] |= __SIMPLE; 617 __getNextChar(); 618 break; 619 case 'G': 620 offset = __emitNode(OpCode._GBOL); 621 retFlags[0] |= __SIMPLE; 622 __getNextChar(); 623 break; 624 case 'Z': 625 offset = __emitNode(OpCode._SEOL); 626 retFlags[0] |= __SIMPLE; 627 __getNextChar(); 628 break; 629 case 'w': 630 offset = __emitNode(OpCode._ALNUM); 631 retFlags[0] |= (__NONNULL | __SIMPLE); 632 __getNextChar(); 633 break; 634 case 'W': 635 offset = __emitNode(OpCode._NALNUM); 636 retFlags[0] |= (__NONNULL | __SIMPLE); 637 __getNextChar(); 638 break; 639 case 'b': 640 offset = __emitNode(OpCode._BOUND); 641 retFlags[0] |= __SIMPLE; 642 __getNextChar(); 643 break; 644 case 'B': 645 offset = __emitNode(OpCode._NBOUND); 646 retFlags[0] |= __SIMPLE; 647 __getNextChar(); 648 break; 649 case 's': 650 offset = __emitNode(OpCode._SPACE); 651 retFlags[0] |= (__NONNULL | __SIMPLE); 652 __getNextChar(); 653 break; 654 case 'S': 655 offset = __emitNode(OpCode._NSPACE); 656 retFlags[0] |= (__NONNULL | __SIMPLE); 657 __getNextChar(); 658 break; 659 case 'd': 660 offset = __emitNode(OpCode._DIGIT); 661 retFlags[0] |= (__NONNULL | __SIMPLE); 662 __getNextChar(); 663 break; 664 case 'D': 665 offset = __emitNode(OpCode._NDIGIT); 666 retFlags[0] |= (__NONNULL | __SIMPLE); 667 __getNextChar(); 668 break; 669 case 'n': case 'r': case 't': case 'f': case 'e': case 'a': case 'x': 670 case 'c': case '0': 671 doDefault = true; 672 break tryAgain; 673 case '1': case '2': case '3': case '4': case '5': case '6': case '7': 674 case '8': case '9': 675 int num; 676 StringBuffer buffer = new StringBuffer (10); 677 678 num = 0; 679 value = __input._getValueRelative(num); 680 681 while(Character.isDigit(value)) { 682 buffer.append(value); 683 ++num; 684 value = __input._getValueRelative(num); 685 } 686 687 try { 688 num = Integer.parseInt(buffer.toString()); 689 } catch(NumberFormatException e) { 690 throw new MalformedPatternException( 691 "Unexpected number format exception. Please report this bug." + 692 "NumberFormatException message: " + e.getMessage()); 693 } 694 695 if(num > 9 && num >= __numParentheses) { 696 doDefault = true; 697 break tryAgain; 698 } else { 699 if(num >= __numParentheses) 701 throw new MalformedPatternException("Invalid backreference: \\" + 702 num); 703 __sawBackreference = true; 704 offset = __emitArgNode(OpCode._REF, (char)num); 705 retFlags[0] |= __NONNULL; 706 707 value = __input._getValue(); 708 while(Character.isDigit(value)) 709 value = __input._increment(); 710 711 __input._decrement(); 712 __getNextChar(); 713 } 714 break; 715 case '\0': 716 case CharStringPointer._END_OF_STRING: 717 if(__input._isAtEnd()) 718 throw new 719 MalformedPatternException("Trailing \\ in expression."); 720 default: 722 doDefault = true; 723 break tryAgain; 724 } 725 break tryAgain; 726 727 case '#': 728 if((__modifierFlags[0] & __EXTENDED) != 0) { 730 while(!__input._isAtEnd() && __input._getValue() != '\n') 731 __input._increment(); 732 if(!__input._isAtEnd()) 733 continue tryAgain; 734 } 735 default: 737 __input._increment(); 738 doDefault = true; 739 break tryAgain; 740 } } 743 744 if(doDefault) { 745 char ender; 746 int length, pOffset, maxOffset, lastOffset, numLength[]; 747 748 offset = __emitNode(OpCode._EXACTLY); 749 __emitCode((char)CharStringPointer._END_OF_STRING); 752 753 forLoop: 754 for(length = 0, pOffset = __input._getOffset() - 1, 755 maxOffset = __input._getLength(); 756 length < 127 && pOffset < maxOffset; ++length) { 757 758 lastOffset = pOffset; 759 value = __input._getValue(pOffset); 760 761 switch(value) { 762 case '^': case '$': case '.': case '[': case '(': case ')': 763 case '|': 764 break forLoop; 765 case '\\': 766 value = __input._getValue(++pOffset); 767 768 switch(value) { 769 case 'A': case 'G': case 'Z': case 'w': case 'W': case 'b': 770 case 'B': case 's': case 'S': case 'd': case 'D': 771 --pOffset; 772 break forLoop; 773 case 'n': 774 ender = '\n'; 775 ++pOffset; 776 break; 777 case 'r': 778 ender = '\r'; 779 ++pOffset; 780 break; 781 case 't': 782 ender = '\t'; 783 ++pOffset; 784 break; 785 case 'f': 786 ender = '\f'; 787 ++pOffset; 788 break; 789 case 'e': 790 ender = '\033'; 791 ++pOffset; 792 break; 793 case 'a': 794 ender = '\007'; 795 ++pOffset; 796 break; 797 case 'x': 798 numLength = new int[1]; 799 ender = (char)__parseHex(__input._array, ++pOffset, 2, numLength); 800 pOffset+=numLength[0]; 801 break; 802 case 'c': 803 ++pOffset; 804 ender = __input._getValue(pOffset++); 805 if(Character.isLowerCase(ender)) 806 ender = Character.toUpperCase(ender); 807 ender ^= 64; 808 break; 809 case '0': case '1': case '2': case'3': case '4': case '5': 810 case '6': case '7': case '8': case '9': 811 boolean doOctal = false; 812 value = __input._getValue(pOffset); 813 814 if(value == '0') 815 doOctal = true; 816 value = __input._getValue(pOffset + 1); 817 818 if(Character.isDigit(value)) { 819 int num; 820 StringBuffer buffer = new StringBuffer (10); 821 822 num = pOffset; 823 value = __input._getValue(num); 824 825 while(Character.isDigit(value)){ 826 buffer.append(value); 827 ++num; 828 value = __input._getValue(num); 829 } 830 831 try { 832 num = Integer.parseInt(buffer.toString()); 833 } catch(NumberFormatException e) { 834 throw new MalformedPatternException( 835 "Unexpected number format exception. Please report this bug." + 836 "NumberFormatException message: " + e.getMessage()); 837 } 838 839 if(!doOctal) 840 doOctal = (num >= __numParentheses); 841 } 842 843 if(doOctal) { 844 numLength = new int[1]; 845 ender = (char)__parseOctal(__input._array, pOffset, 3, numLength); 846 pOffset+=numLength[0]; 847 } else { 848 --pOffset; 849 break forLoop; 850 } 851 break; 852 853 case CharStringPointer._END_OF_STRING: 854 case '\0': 855 if(pOffset >= maxOffset) 856 throw new 857 MalformedPatternException("Trailing \\ in expression."); 858 default: 860 ender = __input._getValue(pOffset++); 861 break; 862 } break; 864 865 case '#': 866 if((__modifierFlags[0] & __EXTENDED) != 0) { 867 while(pOffset < maxOffset && __input._getValue(pOffset) != '\n') 868 ++pOffset; 869 } 870 case ' ': case '\t': case '\n': case '\r': case '\f': case '\013': 872 if((__modifierFlags[0] & __EXTENDED) != 0) { 873 ++pOffset; 874 --length; 875 continue; 876 } 877 default: 879 ender = __input._getValue(pOffset++); 880 break; 881 882 } 884 if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 && 885 Character.isUpperCase(ender)) 886 ender = Character.toLowerCase(ender); 887 888 if(pOffset < maxOffset && __isComplexRepetitionOp(__input._array, pOffset)) { 889 if(length > 0) 890 pOffset = lastOffset; 891 else { 892 ++length; 893 __emitCode(ender); 894 } 895 break; 896 } 897 898 __emitCode(ender); 899 900 901 } 903 904 __input._setOffset(pOffset - 1); 905 __getNextChar(); 906 907 if(length < 0) 908 throw new MalformedPatternException( 909 "Unexpected compilation failure. Please report this bug!"); 910 if(length > 0) 911 retFlags[0] |= __NONNULL; 912 if(length == 1) 913 retFlags[0] |= __SIMPLE; 914 if(__program!= null) 915 __program[OpCode._getOperand(offset)] = (char)length; 916 __emitCode(CharStringPointer._END_OF_STRING); 918 } 919 920 return offset; 921 } 922 923 924 private void __setCharacterClassBits(char[] bits, int offset, char deflt, 926 char ch) 927 { 928 if(__program== null || ch >= 256) 929 return; 930 ch &= 0xffff; 931 932 if(deflt == 0) { 933 bits[offset + (ch >> 4)] |= (1 << (ch & 0xf)); 934 } else { 935 bits[offset + (ch >> 4)] &= ~(1 << (ch & 0xf)); 936 } 937 } 938 939 940 private int __parseCharacterClass() throws MalformedPatternException { 941 boolean range = false, skipTest; 942 char clss, deflt, lastclss = Character.MAX_VALUE; 943 int offset, bits, numLength[] = { 0 }; 944 945 offset = __emitNode(OpCode._ANYOF); 946 947 if(__input._getValue() == '^') { 948 ++__cost; 949 __input._increment(); 950 deflt = 0; 951 } else { 952 deflt = 0xffff; 953 } 954 955 bits = __programSize; 956 for(clss = 0; clss < 16; clss++) 957 __emitCode(deflt); 958 959 clss = __input._getValue(); 960 961 if(clss == ']' || clss == '-') 962 skipTest = true; 963 else 964 skipTest = false; 965 966 while((!__input._isAtEnd() && (clss = __input._getValue()) != ']') 967 || skipTest) { 968 skipTest = false; 970 __input._increment(); 971 if(clss == '\\') { 972 clss = __input._postIncrement(); 973 974 switch(clss){ 975 case 'w': 976 for(clss = 0; clss < 256; clss++) 977 if(OpCode._isWordCharacter(clss)) 978 __setCharacterClassBits(__program, bits, deflt, clss); 979 lastclss = Character.MAX_VALUE; 980 continue; 981 case 'W': 982 for(clss = 0; clss < 256; clss++) 983 if(!OpCode._isWordCharacter(clss)) 984 __setCharacterClassBits(__program, bits, deflt, clss); 985 lastclss = Character.MAX_VALUE; 986 continue; 987 case 's': 988 for(clss = 0; clss < 256; clss++) 989 if(Character.isWhitespace(clss)) 990 __setCharacterClassBits(__program, bits, deflt, clss); 991 lastclss = Character.MAX_VALUE; 992 continue; 993 case 'S': 994 for(clss = 0; clss < 256; clss++) 995 if(!Character.isWhitespace(clss)) 996 __setCharacterClassBits(__program, bits, deflt, clss); 997 lastclss = Character.MAX_VALUE; 998 continue; 999 case 'd': 1000 for(clss = '0'; clss <= '9'; clss++) 1001 __setCharacterClassBits(__program, bits, deflt, clss); 1002 lastclss = Character.MAX_VALUE; 1003 continue; 1004 case 'D': 1005 for(clss = 0; clss < '0'; clss++) 1006 __setCharacterClassBits(__program, bits, deflt, clss); 1007 for(clss = (char)('9' + 1); clss < 256; clss++) 1008 __setCharacterClassBits(__program, bits, deflt, clss); 1009 lastclss = Character.MAX_VALUE; 1010 continue; 1011 case 'n': 1012 clss = '\n'; 1013 break; 1014 case 'r': 1015 clss = '\r'; 1016 break; 1017 case 't': 1018 clss = '\t'; 1019 break; 1020 case 'f': 1021 clss = '\f'; 1022 break; 1023 case 'b': 1024 clss = '\b'; 1025 break; 1026 case 'e': 1027 clss = '\033'; 1028 break; 1029 case 'a': 1030 clss = '\007'; 1031 break; 1032 case 'x': 1033 clss = (char)__parseHex(__input._array, __input._getOffset(), 2, 1034 numLength); 1035 __input._increment(numLength[0]); 1036 break; 1037 case 'c': 1038 clss = __input._postIncrement(); 1039 if(Character.isLowerCase(clss)) 1040 clss = Character.toUpperCase(clss); 1041 clss ^= 64; 1042 break; 1043 case '0': case '1': case '2': case '3': case '4': 1044 case '5': case '6': case '7': case '8': case '9': 1045 clss = (char)__parseOctal(__input._array, __input._getOffset() - 1, 1046 3, numLength); 1047 __input._increment(numLength[0] - 1); 1048 break; 1049 } 1050 } 1051 1052 if(range) { 1053 if(lastclss > clss) 1054 throw new MalformedPatternException( 1055 "Invalid [] range in expression."); 1056 range = false; 1057 } else { 1058 lastclss = clss; 1059 1060 if(__input._getValue() == '-' && 1061 __input._getOffset() + 1 < __input._getLength() && 1062 __input._getValueRelative(1) != ']') { 1063 __input._increment(); 1064 range = true; 1065 continue; 1066 } 1067 } 1068 1069 while(lastclss <= clss) { 1070 __setCharacterClassBits(__program, bits, deflt, lastclss); 1071 if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 && 1072 Character.isUpperCase(lastclss)) 1073 __setCharacterClassBits(__program, bits, deflt, 1074 Character.toLowerCase(lastclss)); 1075 1076 ++lastclss; 1077 } 1078 1079 lastclss = clss; 1080 } 1081 1082 if(__input._getValue() != ']') 1083 throw new MalformedPatternException("Unmatched [] in expression."); 1084 1085 __getNextChar(); 1086 1087 return offset; 1088 } 1089 1090 1091 private int __parseBranch(int[] retFlags) throws MalformedPatternException { 1092 boolean nestCheck = false, handleRepetition = false; 1093 int offset, next, min, max, flags[] = { 0 }; 1094 char operator, value; 1095 1096 min = 0; 1097 max = Character.MAX_VALUE; 1098 offset = __parseAtom(flags); 1099 1100 if(offset == OpCode._NULL_OFFSET) { 1101 if((flags[0] & __TRYAGAIN) != 0) 1102 retFlags[0] |= __TRYAGAIN; 1103 return OpCode._NULL_OFFSET; 1104 } 1105 1106 operator = __input._getValue(); 1107 1108 if(operator == '(' && __input._getValueRelative(1) == '?' && 1109 __input._getValueRelative(2) == '#') { 1110 while(operator != CharStringPointer._END_OF_STRING && operator != ')') 1111 operator = __input._increment(); 1112 1113 if(operator != CharStringPointer._END_OF_STRING) { 1114 __getNextChar(); 1115 operator = __input._getValue(); 1116 } 1117 } 1118 1119 if(operator == '{' && 1120 __parseRepetition(__input._array, __input._getOffset())) { 1121 int maxOffset, pos; 1122 1123 next = __input._getOffset() + 1; 1124 pos = maxOffset = __input._getLength(); 1125 1126 value = __input._getValue(next); 1127 1128 while(Character.isDigit(value) || value == ',') { 1129 if(value == ',') { 1130 if(pos != maxOffset) 1131 break; 1132 else 1133 pos = next; 1134 } 1135 ++next; 1136 value = __input._getValue(next); 1137 } 1138 1139 if(value == '}') { 1140 int num; 1141 StringBuffer buffer = new StringBuffer (10); 1142 1143 if(pos == maxOffset) 1144 pos = next; 1145 __input._increment(); 1146 1147 num = __input._getOffset(); 1148 value = __input._getValue(num); 1149 1150 while(Character.isDigit(value)) { 1151 buffer.append(value); 1152 ++num; 1153 value = __input._getValue(num); 1154 } 1155 1156 try { 1157 min = Integer.parseInt(buffer.toString()); 1158 } catch(NumberFormatException e) { 1159 throw new MalformedPatternException( 1160 "Unexpected number format exception. Please report this bug." + 1161 "NumberFormatException message: " + e.getMessage()); 1162 } 1163 1164 value = __input._getValue(pos); 1165 if(value == ',') 1166 ++pos; 1167 else 1168 pos = __input._getOffset(); 1169 1170 num = pos; 1171 buffer = new StringBuffer (10); 1172 1173 value = __input._getValue(num); 1174 1175 while(Character.isDigit(value)){ 1176 buffer.append(value); 1177 ++num; 1178 value = __input._getValue(num); 1179 } 1180 1181 try { 1182 if(num != pos) 1183 max = Integer.parseInt(buffer.toString()); 1184 } catch(NumberFormatException e) { 1185 throw new MalformedPatternException( 1186 "Unexpected number format exception. Please report this bug." + 1187 "NumberFormatException message: " + e.getMessage()); 1188 } 1189 1190 1192 if(max == 0 && __input._getValue(pos) != '0') 1193 max = Character.MAX_VALUE; 1194 __input._setOffset(next); 1195 __getNextChar(); 1196 1197 1199 nestCheck = true; 1200 handleRepetition = true; 1201 } 1202 } 1203 1204 if(!nestCheck) { 1205 handleRepetition = false; 1206 1207 if(!__isSimpleRepetitionOp(operator)) { 1208 retFlags[0] = flags[0]; 1209 return offset; 1210 } 1211 1212 __getNextChar(); 1213 1214 retFlags[0] = ((operator != '+') ? 1215 (__WORSTCASE | __SPSTART) : (__WORSTCASE | __NONNULL)); 1216 1217 if(operator == '*' && ((flags[0] & __SIMPLE) != 0)) { 1218 __programInsertOperator(OpCode._STAR, offset); 1219 __cost+=4; 1220 } else if(operator == '*') { 1221 min = 0; 1222 handleRepetition = true; 1223 } else if(operator == '+' && (flags[0] & __SIMPLE) != 0) { 1224 __programInsertOperator(OpCode._PLUS, offset); 1225 __cost+=3; 1226 } else if(operator == '+') { 1227 min = 1; 1228 handleRepetition = true; 1229 } else if(operator == '?') { 1230 min = 0; 1231 max = 1; 1232 handleRepetition = true; 1233 } 1234 } 1235 1236 if(handleRepetition) { 1237 1238 if((flags[0] & __SIMPLE) != 0){ 1240 __cost+= ((2 + __cost) / 2); 1241 __programInsertOperator(OpCode._CURLY, offset); 1242 } else { 1243 __cost += (4 + __cost); 1244 __programAddTail(offset, __emitNode(OpCode._WHILEM)); 1245 __programInsertOperator(OpCode._CURLYX, offset); 1246 __programAddTail(offset, __emitNode(OpCode._NOTHING)); 1247 } 1248 1249 if(min > 0) 1250 retFlags[0] = (__WORSTCASE | __NONNULL); 1251 1252 if(max != 0 && max < min) 1253 throw new MalformedPatternException( 1254 "Invalid interval {" + min + "," + max + "}"); 1255 1256 if(__program!= null) { 1257 __program[offset + 2] = (char)min; 1258 __program[offset + 3] = (char)max; 1259 } 1260 } 1261 1262 1263 if(__input._getValue() == '?') { 1264 __getNextChar(); 1265 __programInsertOperator(OpCode._MINMOD, offset); 1266 __programAddTail(offset, offset + 2); 1267 } 1268 1269 if(__isComplexRepetitionOp(__input._array, __input._getOffset())) 1270 throw new MalformedPatternException( 1271 "Nested repetitions *?+ in expression"); 1272 1273 return offset; 1274 } 1275 1276 1277 private int __parseExpression(boolean isParenthesized, int[] hintFlags) 1278 throws MalformedPatternException { 1279 char value, paren; 1280 int nodeOffset = OpCode._NULL_OFFSET, parenthesisNum = 0, br, ender; 1281 int[] flags = { 0 }; 1282 String modifiers = "iogmsx"; 1283 1284 1285 hintFlags[0] = __NONNULL; 1287 1288 if (isParenthesized) { 1289 paren = 1; 1290 if(__input._getValue() == '?') { 1291 __input._increment(); 1292 paren = value = __input._postIncrement(); 1293 1294 switch(value) { 1295 case ':' : 1296 case '=' : 1297 case '!' : break; 1298 case '#' : 1299 value = __input._getValue(); 1300 while(value != CharStringPointer._END_OF_STRING && value != ')') 1301 value = __input._increment(); 1302 if(value != ')') 1303 throw new MalformedPatternException( 1304 "Sequence (?#... not terminated"); 1305 __getNextChar(); 1306 hintFlags[0] = __TRYAGAIN; 1307 return OpCode._NULL_OFFSET; 1308 default : 1309 __input._decrement(); 1310 value = __input._getValue(); 1311 while(value != CharStringPointer._END_OF_STRING && 1312 modifiers.indexOf(value) != -1) { 1313 __setModifierFlag(__modifierFlags, value); 1314 value = __input._increment(); 1315 } 1316 if(value != ')') 1317 throw new MalformedPatternException( 1318 "Sequence (?" + value + "...) not recognized"); 1319 __getNextChar(); 1320 hintFlags[0] = __TRYAGAIN; 1321 return OpCode._NULL_OFFSET; 1322 } 1323 } else { 1324 parenthesisNum = __numParentheses; 1325 ++__numParentheses; 1326 nodeOffset = __emitArgNode(OpCode._OPEN, (char)parenthesisNum); 1327 } 1328 } else 1329 paren = 0; 1330 1331 br = __parseAlternation(flags); 1332 1333 if(br == OpCode._NULL_OFFSET) 1334 return OpCode._NULL_OFFSET; 1335 1336 if(nodeOffset != OpCode._NULL_OFFSET) 1337 __programAddTail(nodeOffset, br); 1338 else 1339 nodeOffset = br; 1340 1341 if((flags[0] & __NONNULL) == 0) 1342 hintFlags[0] &= ~__NONNULL; 1343 1344 hintFlags[0] |= (flags[0] & __SPSTART); 1345 1346 while(__input._getValue() == '|') { 1347 __getNextChar(); 1348 br = __parseAlternation(flags); 1349 1350 if(br == OpCode._NULL_OFFSET) 1351 return OpCode._NULL_OFFSET; 1352 1353 __programAddTail(nodeOffset, br); 1354 1355 if((flags[0] & __NONNULL) == 0) 1356 hintFlags[0] &= ~__NONNULL; 1357 1358 hintFlags[0] |= (flags[0] & __SPSTART); 1359 } 1360 1361 switch(paren) { 1362 case ':' : 1363 ender = __emitNode(OpCode._NOTHING); 1364 break; 1365 case 1: 1366 ender = __emitArgNode(OpCode._CLOSE, (char)parenthesisNum); 1367 break; 1368 case '=': 1369 case '!': 1370 ender = __emitNode(OpCode._SUCCEED); 1371 hintFlags[0] &= ~__NONNULL; 1372 break; 1373 case 0 : 1374 default : 1375 ender = __emitNode(OpCode._END); 1376 break; 1377 } 1378 1379 __programAddTail(nodeOffset, ender); 1380 1381 for(br = nodeOffset; br != OpCode._NULL_OFFSET; 1382 br = OpCode._getNext(__program, br)) 1383 __programAddOperatorTail(br, ender); 1384 1385 if(paren == '=') { 1386 __programInsertOperator(OpCode._IFMATCH, nodeOffset); 1387 __programAddTail(nodeOffset, __emitNode(OpCode._NOTHING)); 1388 } else if(paren == '!') { 1389 __programInsertOperator(OpCode._UNLESSM, nodeOffset); 1390 __programAddTail(nodeOffset, __emitNode(OpCode._NOTHING)); 1391 } 1392 1393 if(paren != 0 && (__input._isAtEnd() || __getNextChar() != ')')) { 1394 throw new MalformedPatternException("Unmatched parentheses."); 1395 } else if(paren == 0 && !__input._isAtEnd()) { 1396 if(__input._getValue() == ')') 1397 throw new MalformedPatternException("Unmatched parentheses."); 1398 else 1399 throw new MalformedPatternException( 1401 "Unreached characters at end of expression. Please report this bug!"); 1402 } 1403 1404 1405 return nodeOffset; 1406 } 1407 1408 1409 1435 public Pattern compile(char[] pattern, int options) 1436 throws MalformedPatternException { 1437 int[] flags = { 0 }; 1438 int caseInsensitive, scan; 1439 Perl5Pattern regexp; 1440 String mustString, startString; 1441 1442 int first; 1443 boolean sawOpen = false, sawPlus = false; 1444 1445 StringBuffer lastLongest, longest; 1446 int length, minLength = 0, curBack, back, backmost; 1447 1448 1449 __input = new CharStringPointer(pattern); 1450 1451 caseInsensitive = options & __CASE_INSENSITIVE; 1452 __modifierFlags[0] = (char)options; 1453 __sawBackreference = false; 1454 __numParentheses = 1; 1455 __programSize = 0; 1456 __cost = 0; 1457 __program= null; 1458 1459 __emitCode((char)0); 1460 if(__parseExpression(false, flags) == OpCode._NULL_OFFSET) { 1461 throw new MalformedPatternException("Unknown compilation error."); 1464 } 1465 1466 1468 if(__programSize >= Character.MAX_VALUE - 1) 1469 throw new MalformedPatternException("Expression is too large."); 1470 1471 1472 __program= new char[__programSize]; 1473 regexp = new Perl5Pattern(); 1474 1475 regexp._program = __program; 1476 regexp._expression = new String (pattern); 1477 1478 __input._setOffset(0); 1479 1480 __numParentheses = 1; 1481 __programSize = 0; 1482 __cost = 0; 1483 1484 __emitCode((char)0); 1485 if(__parseExpression(false, flags) == OpCode._NULL_OFFSET) { 1486 throw new MalformedPatternException("Unknown compilation error."); 1489 } 1490 1491 1493 caseInsensitive = __modifierFlags[0] & __CASE_INSENSITIVE; 1494 1495 regexp._isExpensive = (__cost >= 10); 1496 regexp._startClassOffset = OpCode._NULL_OFFSET; 1497 regexp._anchor = 0; 1498 regexp._back = -1; 1499 regexp._options = options; 1500 regexp._startString = null; 1501 regexp._mustString = null; 1502 mustString = null; 1503 startString = null; 1504 1505 scan = 1; 1506 if(__program[OpCode._getNext(__program, scan)] == OpCode._END){ 1507 boolean doItAgain; char op; 1509 1510 first = scan = OpCode._getNextOperator(scan); 1511 op = __program[first]; 1512 1513 while((op == OpCode._OPEN && (sawOpen = true)) || 1514 (op == OpCode._BRANCH && 1515 __program[OpCode._getNext(__program, first)] != OpCode._BRANCH) || 1516 op == OpCode._PLUS || op == OpCode._MINMOD || 1517 (OpCode._opType[op] == OpCode._CURLY && 1518 OpCode._getArg1(__program, first) > 0)) { 1519 if(op == OpCode._PLUS) 1520 sawPlus = true; 1521 else 1522 first+=OpCode._operandLength[op]; 1523 1524 first = OpCode._getNextOperator(first); 1525 op = __program[first]; 1526 } 1527 1528 doItAgain = true; 1529 1530 while(doItAgain) { 1531 doItAgain = false; 1532 op = __program[first]; 1533 1534 if(op == OpCode._EXACTLY) { 1535 startString = 1536 new String (__program, OpCode._getOperand(first + 1), 1537 __program[OpCode._getOperand(first)]); 1538 1539 } else if(OpCode._isInArray(op, OpCode._opLengthOne, 2)) 1540 regexp._startClassOffset = first; 1541 else if(op == OpCode._BOUND || op == OpCode._NBOUND) 1542 regexp._startClassOffset = first; 1543 else if(OpCode._opType[op] == OpCode._BOL) { 1544 regexp._anchor = Perl5Pattern._OPT_ANCH; 1545 first = OpCode._getNextOperator(first); 1546 doItAgain = true; 1547 continue; 1548 } else if(op == OpCode._STAR && 1549 OpCode._opType[__program[OpCode._getNextOperator(first)]] == 1550 OpCode._ANY && (regexp._anchor & Perl5Pattern._OPT_ANCH) != 0) 1551 { 1552 regexp._anchor = Perl5Pattern._OPT_ANCH | Perl5Pattern._OPT_IMPLICIT; 1553 first = OpCode._getNextOperator(first); 1554 doItAgain = true; 1555 continue; 1556 } 1557 } 1559 if(sawPlus && (!sawOpen || !__sawBackreference)) 1560 regexp._anchor |= Perl5Pattern._OPT_SKIP; 1561 1562 1563 1572 lastLongest = new StringBuffer (); 1573 longest = new StringBuffer (); 1574 length = 0; 1575 minLength = 0; 1576 curBack = 0; 1577 back = 0; 1578 backmost = 0; 1579 1580 while(scan > 0 && (op = __program[scan]) != OpCode._END) { 1581 1582 if(op == OpCode._BRANCH) { 1583 if(__program[OpCode._getNext(__program, scan)] == OpCode._BRANCH) { 1584 curBack = -30000; 1585 while(__program[scan] == OpCode._BRANCH) 1586 scan = OpCode._getNext(__program, scan); 1587 } else 1588 scan = OpCode._getNextOperator(scan); 1589 continue; 1590 } 1591 1592 if(op == OpCode._UNLESSM) { 1593 curBack = -30000; 1594 scan = OpCode._getNext(__program, scan); 1595 continue; 1596 } 1597 1598 if(op == OpCode._EXACTLY) { 1599 int temp; 1600 1601 first = scan; 1602 while(__program[(temp = OpCode._getNext(__program, scan))] == 1603 OpCode._CLOSE) 1604 scan = temp; 1605 1606 minLength += __program[OpCode._getOperand(first)]; 1607 1608 temp = __program[OpCode._getOperand(first)]; 1609 1610 if(curBack - back == length) { 1611 lastLongest.append(new String (__program, OpCode._getOperand(first) + 1, 1612 temp)); 1613 length += temp; 1614 curBack += temp; 1615 first = OpCode._getNext(__program, scan); 1616 } else if(temp >= (length + (curBack >= 0 ? 1 : 0))) { 1617 length = temp; 1618 lastLongest = 1619 new StringBuffer (new String (__program, 1620 OpCode._getOperand(first) + 1, temp)); 1621 back = curBack; 1622 curBack += length; 1623 first = OpCode._getNext(__program, scan); 1624 } else 1625 curBack += temp; 1626 } else if(OpCode._isInArray(op, OpCode._opLengthVaries, 0)) { 1627 curBack = -30000; 1628 length = 0; 1629 1630 if(lastLongest.length() > longest.length()) { 1631 longest = lastLongest; 1632 backmost = back; 1633 } 1634 1635 lastLongest = new StringBuffer (); 1636 1637 if(op == OpCode._PLUS && 1638 OpCode._isInArray(__program[OpCode._getNextOperator(scan)], 1639 OpCode._opLengthOne, 0)) 1640 ++minLength; 1641 else if(OpCode._opType[op] == OpCode._CURLY && 1642 OpCode._isInArray(__program[OpCode._getNextOperator(scan) + 2], 1643 OpCode._opLengthOne, 0)) 1644 minLength += OpCode._getArg1(__program, scan); 1645 } else if(OpCode._isInArray(op, OpCode._opLengthOne, 0)) { 1646 ++curBack; 1647 ++minLength; 1648 length = 0; 1649 if(lastLongest.length() > longest.length()) { 1650 longest = lastLongest; 1651 backmost = back; 1652 } 1653 lastLongest = new StringBuffer (); 1654 } 1655 1656 scan = OpCode._getNext(__program, scan); 1657 } 1659 if(lastLongest.length() + 1660 ((OpCode._opType[__program[first]] == OpCode._EOL) ? 1 : 0) > 1661 longest.length()) { 1662 longest = lastLongest; 1663 backmost = back; 1664 } else 1665 lastLongest = new StringBuffer (); 1666 1667 if(longest.length() > 0 && startString == null) { 1668 mustString = longest.toString(); 1669 if(backmost < 0) 1670 backmost = -1; 1671 regexp._back = backmost; 1672 1673 1680 } else 1681 longest = null; 1682 } 1684 1685 regexp._isCaseInsensitive = ((caseInsensitive & __CASE_INSENSITIVE) != 0); 1686 regexp._numParentheses = __numParentheses - 1; 1687 regexp._minLength = minLength; 1688 1689 if(mustString != null) { 1690 regexp._mustString = mustString.toCharArray(); 1691 regexp._mustUtility = 100; 1692 } 1693 1694 if(startString != null) 1695 regexp._startString = startString.toCharArray(); 1696 1697 return regexp; 1698 } 1699 1700 1710 public Pattern compile(char[] pattern) throws MalformedPatternException { 1711 return compile(pattern, DEFAULT_MASK); 1712 } 1713 1714 1715 1725 public Pattern compile(String pattern) throws MalformedPatternException { 1726 return compile(pattern.toCharArray(), DEFAULT_MASK); 1727 } 1728 1729 1730 1756 public Pattern compile(String pattern, int options) 1757 throws MalformedPatternException { 1758 return compile(pattern.toCharArray(), options); 1759 } 1760 1761} 1762 | Popular Tags |