1 19 20 21 package org.netbeans.modules.i18n.regexp; 22 23 28 public class Parser { 29 30 31 private String regexp; 32 33 39 private String [] tokenNames; 40 41 46 private int maxTokenLength; 47 48 58 public static TreeNodeRoot parse(String regexp) 59 throws IllegalArgumentException , ParseException { 60 return parse(regexp, null); 61 } 62 63 77 public static TreeNodeRoot parse(String regexp, String tokenNames[]) 78 throws IllegalArgumentException , ParseException { 79 Parser parser = new Parser(regexp); 80 if (tokenNames != null && tokenNames.length != 0) { 81 parser.setTokenNames(tokenNames); 82 } 83 return parser.parse(); 84 } 85 86 93 Parser(String regexp) { 94 if (regexp == null) { 95 throw new IllegalArgumentException (); 96 } 97 this.regexp = regexp; 98 } 99 100 102 private void setTokenNames(String [] tokenNames) { 103 if (tokenNames != null && tokenNames.length != 0) { 104 this.tokenNames = tokenNames; 105 maxTokenLength = tokenNames[0].length(); 106 for (int i = 1; i < tokenNames.length; i++) { 107 if (tokenNames[i].length() > maxTokenLength) { 108 maxTokenLength = tokenNames[i].length(); 109 } 110 } 111 } else { 112 this.tokenNames = null; 113 maxTokenLength = 0; 114 } 115 } 116 117 124 TreeNodeRoot parse() throws ParseException { 125 126 TreeNodeRoot result; 127 TreeNode multiRegexpNode = null; 128 129 int begin = 0; 130 int end = regexp.length(); 131 boolean initialPart = false; 132 boolean finalPart = false; 133 134 if (begin == end) { 135 return null; 136 } 137 138 139 if (regexp.charAt(0) == '^') { 140 initialPart = true; 141 begin++; 142 } 143 if ((end == begin + 1) && (regexp.charAt(begin) == '$')) { 144 finalPart = true; 145 end--; 146 } 147 148 161 162 if (begin != end) { 163 multiRegexpNode = parseMultiRegexp(begin, end); 164 165 169 if (multiRegexpNode == null) { 170 throwParseException(begin); 171 } 172 173 177 if ((multiRegexpNode.end == end - 1) 178 && (regexp.charAt(end - 1) == '$')) { 179 finalPart = true; 180 end--; 181 } 182 183 188 if (multiRegexpNode.end != end) { 189 throwParseException(begin); 190 } 191 } 192 193 String attribs = null; 194 if (initialPart || finalPart) { 195 StringBuffer buf = new StringBuffer (2); 196 if (initialPart) { 197 buf.append('^'); 198 } 199 if (finalPart) { 200 buf.append('$'); 201 } 202 attribs = buf.toString(); 203 } 204 205 result = new TreeNodeRoot(regexp, attribs); 206 if (multiRegexpNode != null) { 207 result.add(multiRegexpNode); 208 } 209 return result; 210 } 211 212 213 214 private void throwParseException(int position) throws ParseException { 215 throw new ParseException(regexp, position); 216 } 217 218 219 220 private TreeNode parseMultiRegexp(int start, int end) 221 throws ParseException { 222 if (start == end) { 223 return null; 224 } 225 226 TreeNode regexpSequenceNode = parseRegexpSequence(start, end); 227 if (regexpSequenceNode == null) { 228 return null; 229 } 230 231 java.util.List alternatives = new java.util.ArrayList (4); 232 alternatives.add(regexpSequenceNode); 233 234 while (regexpSequenceNode.end != end 235 && regexp.charAt(regexpSequenceNode.end) == '|') { 236 int from = regexpSequenceNode.end + 1; 237 regexpSequenceNode = parseRegexpSequence(from, end); 238 239 if (regexpSequenceNode == null) { 240 241 242 throwParseException(from); 243 } 244 245 alternatives.add(regexpSequenceNode); 246 }; 247 248 TreeNode result = new TreeNode(TreeNode.MULTI_REGEXP, 249 start, 250 regexpSequenceNode.end); 251 java.util.Iterator i; 252 for (i = alternatives.iterator(); i.hasNext(); ) { 253 result.add((TreeNode) i.next()); 254 } 255 return result; 256 } 257 258 259 260 private TreeNode parseRegexpSequence(int start, int end) 261 throws ParseException { 262 if (start == end) { 263 return null; 264 } 265 266 TreeNode result; 267 java.util.List sequence = null; 268 TreeNode lastChildNode = null; 269 270 int from = start; 271 while (true) { 272 TreeNode qRegexpNode = parseQRegexp(from, end); 273 274 if (qRegexpNode == null) { 275 break; 276 } 277 278 if (sequence == null) { 279 sequence = new java.util.ArrayList (4); 280 } 281 sequence.add(qRegexpNode); 282 283 284 lastChildNode = qRegexpNode; 285 286 287 if (qRegexpNode.end == end) { 288 break; 289 } 290 291 from = qRegexpNode.end; 292 } 293 294 if (sequence == null) { 295 return null; 296 } 297 298 result = new TreeNode(TreeNode.SIMPLE_REGEXP, start, lastChildNode.end); 299 java.util.Iterator i; 300 for (i = sequence.iterator(); i.hasNext(); ) { 301 result.add((TreeNode) i.next()); 302 } 303 return result; 304 } 305 306 307 308 private TreeNode parseQRegexp(int start, int end) throws ParseException { 309 if (start == end) { 310 return null; 311 } 312 313 TreeNode result; 314 315 TreeNode singleRegexpNode = parseSingleRegexp(start, end); 316 if (singleRegexpNode == null) { 317 return null; 318 } 319 320 321 if (singleRegexpNode.end == end) { 322 result = new TreeNode(TreeNode.Q_REGEXP, 323 start, 324 singleRegexpNode.end); 325 result.add(singleRegexpNode); 326 return result; 327 } 328 329 TreeNode quantifierNode = parseQuantifier(singleRegexpNode.end, end); 330 if (quantifierNode == null) { 331 result = new TreeNode(TreeNode.Q_REGEXP, 332 start, 333 singleRegexpNode.end); 334 result.add(singleRegexpNode); 335 } else { 336 result = new TreeNode(TreeNode.Q_REGEXP, 337 start, 338 quantifierNode.end); 339 result.add(singleRegexpNode); 340 result.add(quantifierNode); 341 } 342 return result; 343 } 344 345 346 347 private TreeNode parseSingleRegexp(int start, int end) 348 throws ParseException { 349 if (start == end) { 350 return null; 351 } 352 353 TreeNode result; 354 char ch = regexp.charAt(start); 355 switch (ch) { 356 case '.': 357 result = new TreeNode(TreeNode.METACHAR, 358 start, 359 start + 1, 360 new Character (ch)); 361 break; 362 363 case '[': 364 TreeNode setNode = parseSet(start, end); 365 assert setNode != null; 366 return setNode; 367 368 case '(': 369 TreeNode subexprNode = parseSubexpr(start, end); 370 assert subexprNode != null; 371 return subexprNode; 372 373 case '\\': 374 if (end == start + 1) { 375 376 377 throwParseException(end); 378 } 379 char ch2 = regexp.charAt(start + 1); 380 switch (ch2) { 381 case 'b': 382 case 'B': 383 result = new TreeNode(TreeNode.METACHAR, 384 start, 385 start + 2, 386 new Character (ch2)); 387 break; 388 389 case 'u': 390 Integer unicode = parseUnicode(start + 2, end); 391 if (unicode == null) { 392 393 394 throwParseException(start + 2); 395 } 396 result = new TreeNode(TreeNode.UNICODE_CHAR, 397 start, 398 start + 6, 399 unicode); 400 break; 401 402 default: 403 char parsedChar; 404 switch (ch2) { 405 case 't': 406 parsedChar = '\t'; 407 break; 408 409 case 'n': 410 parsedChar = '\n'; 411 break; 412 413 case 'r': 414 parsedChar = '\r'; 415 break; 416 417 case 'f': 418 parsedChar = '\f'; 419 break; 420 421 default: 422 parsedChar = ch2; 423 break; 424 } 425 result = new TreeNode(TreeNode.CHAR, 426 start, 427 start + 2, 428 new Character (parsedChar)); 429 break; 430 } 431 break; 432 433 case '{': 434 String tokenName = getTokenName(start, end); 435 if (tokenName != null) { 436 result = new TreeNode(TreeNode.TOKEN, 437 start, 438 start + tokenName.length() + 2, 439 tokenName); 440 break; 441 } 442 443 444 default: 445 if ("^$|*+?)]{}".indexOf(ch) != -1) { return null; 447 } 448 result = new TreeNode(TreeNode.CHAR, 449 start, 450 start + 1, 451 new Character (ch)); 452 break; 453 } 454 return result; 455 } 456 457 458 459 private TreeNode parseQuantifier(int start, int end) 460 throws ParseException { 461 if (start == end) { 462 return null; 463 } 464 465 TreeNode result = null; 466 char ch = regexp.charAt(start); 467 switch (ch) { 468 case '*': 469 case '+': 470 case '?': 471 result = new TreeNode(TreeNode.QUANTIFIER, 472 start, 473 start + 1, 474 new Character (ch)); 475 return result; 476 case '{': 477 break; 478 default: 479 return null; 480 } 481 482 if (end - start == 1) { 483 484 485 throwParseException(start + 1); 486 } 487 488 TreeNode numberNode1 = parseNumber(start + 1, end); 489 if (numberNode1 == null) { 490 491 492 if (getTokenName(start, end) != null) { 493 494 495 return null; 496 } 497 498 499 throwParseException(start + 1); 500 } 501 if (numberNode1.end == end) { 502 503 504 throwParseException(numberNode1.end); 505 } 506 507 switch (regexp.charAt(numberNode1.end)) { 508 case '}': 509 result = new TreeNode(TreeNode.QUANTIFIER, 510 start, 511 numberNode1.end + 1, 512 "{n}"); result.add(numberNode1); 514 return result; 515 case ',': 516 break; 517 default: 518 519 520 throwParseException(numberNode1.end); 521 } 522 523 if (numberNode1.end + 1 == end) { 524 525 526 throwParseException(numberNode1.end + 1); 527 } 528 529 if (regexp.charAt(numberNode1.end + 1) == '}') { 530 result = new TreeNode(TreeNode.QUANTIFIER, 531 start, 532 numberNode1.end + 2, 533 "{n,}"); result.add(numberNode1); 535 return result; 536 } 537 538 TreeNode numberNode2 = parseNumber(numberNode1.end + 1, end); 539 if (numberNode2 == null) { 540 541 542 throwParseException(numberNode1.end + 1); 543 } 544 if (numberNode2.end == end 545 || regexp.charAt(numberNode2.end) != '}') { 546 547 548 throwParseException(numberNode2.end); 549 } 550 551 int num1 = ((Integer ) numberNode1.getAttribs()).intValue(); 552 int num2 = ((Integer ) numberNode2.getAttribs()).intValue(); 553 if (num2 < num1) { 554 throwParseException(numberNode2.start); 555 } 556 557 result = new TreeNode(TreeNode.QUANTIFIER, 558 start, 559 numberNode2.end + 1, 560 "{n,n}"); result.add(numberNode1); 562 result.add(numberNode2); 563 return result; 564 } 565 566 567 568 private TreeNode parseNumber(int start, int end) throws ParseException { 569 if (start == end) { 570 return null; 571 } 572 573 char[] chars = regexp.substring(start, end).toCharArray(); 574 int endIndex = chars.length; 575 for (int i = 0; i < chars.length; i++) { 576 if (chars[i] < '0' || chars[i] > '9') { 577 endIndex = i; 578 break; 579 } 580 } 581 582 if (endIndex == 0) { 583 return null; 584 } else if (endIndex > 3) { 585 586 587 throwParseException(start); 588 } 589 590 int number; 591 if (endIndex == 1) { 592 number = chars[0] - '0'; 593 } else { 594 try { 595 number = Integer.parseInt(regexp.substring(start, 596 start + endIndex)); 597 } catch (NumberFormatException ex) { 598 throw new AssertionError (); } 600 } 601 602 TreeNode result = new TreeNode(TreeNode.NUMBER, 603 start, 604 start + endIndex, 605 new Integer (number)); 606 return result; 607 } 608 609 610 611 private String getTokenName(int start, int end) { 612 if (tokenNames == null) { 613 return null; 614 } 615 616 int checkAreaLength = Math.min(end - start, maxTokenLength + 2); 617 String substring = regexp.substring(start, start + checkAreaLength); 618 if (substring.charAt(0) != '{') { 619 return null; 620 } 621 int rightBoundaryIndex = substring.indexOf('}', 1); 622 if (rightBoundaryIndex == -1) { 623 return null; 624 } 625 String tokenName = substring.substring(1, rightBoundaryIndex); 626 for (int i = 0; i < tokenNames.length; i++) { 627 if (tokenName.equals(tokenNames[i])) { 628 return tokenName; 629 } 630 } 631 return null; 632 } 633 634 635 636 private Integer parseUnicode(int start, int end) throws ParseException { 637 if (start == end) { 638 return null; 639 } 640 641 if (end - start < 4) { 642 643 644 throwParseException(start); 645 } 646 647 char[] chars = regexp.substring(start, start + 4).toCharArray(); 648 for (int i = 0; i < 4; i++) { 649 char ch = chars[i]; 650 if ("01234567890abcdefABCDEF".indexOf(ch) == -1) { if (i == 0) { 652 return null; 653 } else { 654 throwParseException(start); 655 } 656 } 657 } 658 659 Integer integer; 660 try { 661 integer = Integer.valueOf(regexp.substring(start, start + 4), 16); 662 } catch (NumberFormatException ex) { 663 throw new AssertionError (); } 665 return integer; 666 } 667 668 669 670 private TreeNode parseSubexpr(int start, int end) throws ParseException { 671 if (start == end) { 672 return null; 673 } 674 675 if (regexp.charAt(start) != '(') { 676 return null; 677 } 678 if (end == start + 1) { 679 throwParseException(start + 1); 680 } 681 682 TreeNode result; 683 TreeNode multiRegexpNode = parseMultiRegexp(start + 1, end); 684 if (multiRegexpNode == null) { 685 686 687 throwParseException(start + 1); 688 } 689 if (multiRegexpNode.end == end 690 || regexp.charAt(multiRegexpNode.end) != ')') { 691 throwParseException(multiRegexpNode.end); 692 } 693 result = new TreeNode(TreeNode.SUBEXPR, start, multiRegexpNode.end + 1); 694 result.add(multiRegexpNode); 695 return result; 696 } 697 698 699 700 private TreeNode parseSet(int start, int end) throws ParseException { 701 if (start == end) { 702 return null; 703 } 704 705 if (regexp.charAt(start) != '[') { 706 return null; 707 } 708 if (end == start + 1) { 709 710 711 throwParseException(start + 1); 712 } 713 714 718 String setString = regexp.substring(start, end); 719 String specials = getSpecials(setString); 720 721 722 int endIndex = setString.indexOf(']', 1 + specials.length()); 723 if (endIndex == -1) { 724 725 726 throwParseException(start); 727 } else { 728 endIndex++; } 730 endIndex += start; 732 setString = regexp.substring(start, endIndex); 733 int setLength = setString.length(); 734 735 TreeNode result; 736 737 738 if (setLength >= 5 739 && setString.charAt(1) == ':' 740 && setString.charAt(setLength - 2) == ':') { 741 String charClassName = setString.substring(2, setLength - 2); 742 if (isPosixCharClass(charClassName)) { 743 result = new TreeNode(TreeNode.POSIX_SET, 744 start, 745 endIndex, 746 charClassName); 747 return result; 748 } else { 749 throwParseException(start + 2); 750 } 751 } 752 753 result = new TreeNode(TreeNode.SET, 754 start, 755 endIndex, 756 specials); 757 758 int from = start + 1 + specials.length(); 759 int to = endIndex - 1; 760 761 while (from != to) { 762 TreeNode rangeNode = parseRangeOrChar(from, to); 763 if (rangeNode == null) { 764 765 766 throwParseException(from); 767 } 768 result.add(rangeNode); 769 from = rangeNode.end; 770 } 771 772 return result; 773 } 774 775 776 777 private TreeNode parseRangeOrChar(int start, int end) 778 throws ParseException { 779 if (start == end) { 780 return null; 781 } 782 783 TreeNode rangeCharNode1 = parseRangeChar(start, end); 784 if (rangeCharNode1 == null) { 785 return null; 786 } 787 788 if (rangeCharNode1.end == end 789 || regexp.charAt(rangeCharNode1.end) != '-') { 790 return rangeCharNode1; 791 } 792 793 TreeNode rangeCharNode2 = parseRangeChar(rangeCharNode1.end + 1, end); 794 if (rangeCharNode2 == null) { 795 796 797 throwParseException(rangeCharNode1.end + 1); 798 } 799 800 Object charObject; 801 802 charObject = rangeCharNode1.getAttribs(); 803 int char1 = charObject instanceof Character 804 ? Character.getNumericValue( 805 ((Character ) charObject).charValue()) 806 : ((Integer ) charObject).intValue(); 807 charObject = rangeCharNode2.getAttribs(); 808 int char2 = charObject instanceof Character 809 ? Character.getNumericValue( 810 ((Character ) charObject).charValue()) 811 : ((Integer ) charObject).intValue(); 812 813 if (!(char1 < char2)) { 814 815 816 throwParseException(rangeCharNode1.end + 1); 817 } 818 819 TreeNode result = new TreeNode(TreeNode.RANGE, 820 start, 821 rangeCharNode2.end); 822 result.add(rangeCharNode1); 823 result.add(rangeCharNode2); 824 return result; 825 } 826 827 828 829 private TreeNode parseRangeChar(int start, int end) throws ParseException { 830 if (start == end) { 831 return null; 832 } 833 834 TreeNode result; 835 836 char ch = regexp.charAt(start); 837 switch (ch) { 838 case ']': 839 case '-': 840 return null; 841 842 case '\\': 843 if (end == start + 1) { 844 845 846 throwParseException(start + 1); 847 } 848 char ch2 = regexp.charAt(start + 1); 849 char parsedChar; 850 switch (ch2) { 851 case 'u': 852 Integer unicode = parseUnicode(start + 2, end); 853 if (unicode == null) { 854 855 856 throwParseException(start + 2); 857 } 858 int codeValue = unicode.intValue(); 859 assert codeValue >= 0; 860 if (codeValue <= 0x007f) { 861 862 863 throwParseException(start + 2); 864 } 865 return new TreeNode(TreeNode.UNICODE_CHAR, 866 start, 867 start + 6, 868 unicode); 869 870 case ']': 871 case '-': 872 873 877 throwParseException(start + 2); 878 879 case 't': 880 parsedChar = '\t'; 881 break; 882 883 case 'n': 884 parsedChar = '\n'; 885 break; 886 887 case 'r': 888 parsedChar = '\r'; 889 break; 890 891 case 'f': 892 parsedChar = '\f'; 893 break; 894 895 default: 896 parsedChar = ch2; 897 break; 898 } 899 result = new TreeNode(TreeNode.CHAR, 900 start, 901 start + 2, 902 new Character (parsedChar)); 903 break; 904 default: 905 result = new TreeNode(TreeNode.CHAR, 906 start, 907 start + 1, 908 new Character (ch)); 909 break; 910 } 911 return result; 912 } 913 914 915 916 private String getSpecials(String setRegexp) { 917 int index = 1; 918 int maxIndex = 3; 919 if (setRegexp.length() < 5) { 920 maxIndex = setRegexp.length() - 2; 921 } 922 StringBuffer buf = new StringBuffer (maxIndex - index + 1); 923 char ch = setRegexp.charAt(index); 924 if (ch == '^') { 925 buf.append(ch); 926 if (index == maxIndex) { 927 return buf.toString(); 928 } 929 ch = setRegexp.charAt(++index); 930 } 931 if (ch == ']') { 932 buf.append(ch); 933 if (index == maxIndex) { 934 return buf.toString(); 935 } 936 ch = setRegexp.charAt(++index); 937 } 938 if (ch == '-') { 939 buf.append(ch); 940 } 941 return buf.toString(); 942 } 943 944 945 946 private boolean isPosixCharClass(String name) { 947 948 if (name.equals("xdigit")) { return true; 950 } 951 if (name.length() != 5) { 952 return false; 953 } 954 955 String classNames = "alnum alpha blank cntrl digit graph " + "lower print punct space upper"; java.util.StringTokenizer tokenizer 958 = new java.util.StringTokenizer (classNames, " "); while (tokenizer.hasMoreTokens()) { 960 if (name.equals(tokenizer.nextToken())) { 961 return true; 962 } 963 } 964 return false; 965 } 966 967 } 968 | Popular Tags |