1 package org.apache.regexp; 2 3 59 60 import org.apache.regexp.RE; 61 import java.util.Hashtable ; 62 63 77 public class RECompiler 78 { 79 char[] instruction; int lenInstruction; 83 String pattern; int len; int idx; int parens; 89 static final int NODE_NORMAL = 0; static final int NODE_NULLABLE = 1; static final int NODE_TOPLEVEL = 2; 94 static final char ESC_MASK = 0xfff0; static final char ESC_BACKREF = 0xffff; static final char ESC_COMPLEX = 0xfffe; static final char ESC_CLASS = 0xfffd; 100 int maxBrackets = 10; static final int bracketUnbounded = -1; int brackets = 0; int[] bracketStart = null; int[] bracketEnd = null; int[] bracketMin = null; int[] bracketOpt = null; 109 static Hashtable hashPOSIX = new Hashtable (); 111 static 112 { 113 hashPOSIX.put("alnum", new Character (RE.POSIX_CLASS_ALNUM)); 114 hashPOSIX.put("alpha", new Character (RE.POSIX_CLASS_ALPHA)); 115 hashPOSIX.put("blank", new Character (RE.POSIX_CLASS_BLANK)); 116 hashPOSIX.put("cntrl", new Character (RE.POSIX_CLASS_CNTRL)); 117 hashPOSIX.put("digit", new Character (RE.POSIX_CLASS_DIGIT)); 118 hashPOSIX.put("graph", new Character (RE.POSIX_CLASS_GRAPH)); 119 hashPOSIX.put("lower", new Character (RE.POSIX_CLASS_LOWER)); 120 hashPOSIX.put("print", new Character (RE.POSIX_CLASS_PRINT)); 121 hashPOSIX.put("punct", new Character (RE.POSIX_CLASS_PUNCT)); 122 hashPOSIX.put("space", new Character (RE.POSIX_CLASS_SPACE)); 123 hashPOSIX.put("upper", new Character (RE.POSIX_CLASS_UPPER)); 124 hashPOSIX.put("xdigit", new Character (RE.POSIX_CLASS_XDIGIT)); 125 hashPOSIX.put("javastart", new Character (RE.POSIX_CLASS_JSTART)); 126 hashPOSIX.put("javapart", new Character (RE.POSIX_CLASS_JPART)); 127 } 128 129 132 public RECompiler() 133 { 134 instruction = new char[128]; 136 lenInstruction = 0; 137 } 138 139 144 void ensure(int n) 145 { 146 int curlen = instruction.length; 148 149 if (lenInstruction + n >= curlen) 151 { 152 while (lenInstruction + n >= curlen) 154 { 155 curlen *= 2; 156 } 157 158 char[] newInstruction = new char[curlen]; 160 System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction); 161 instruction = newInstruction; 162 } 163 } 164 165 169 void emit(char c) 170 { 171 ensure(1); 173 174 instruction[lenInstruction++] = c; 176 } 177 178 185 void nodeInsert(char opcode, int opdata, int insertAt) 186 { 187 ensure(RE.nodeSize); 189 190 System.arraycopy(instruction, insertAt, instruction, insertAt + RE.nodeSize, lenInstruction - insertAt); 192 instruction[insertAt + RE.offsetOpcode] = opcode; 193 instruction[insertAt + RE.offsetOpdata] = (char)opdata; 194 instruction[insertAt + RE.offsetNext] = 0; 195 lenInstruction += RE.nodeSize; 196 } 197 198 203 void setNextOfEnd(int node, int pointTo) 204 { 205 int next = instruction[node + RE.offsetNext]; 207 while ( next != 0 && node < lenInstruction ) 210 { 211 if ( node == pointTo ) { 218 pointTo = lenInstruction; 219 } 220 node += next; 221 next = instruction[node + RE.offsetNext]; 222 } 223 if ( node < lenInstruction ) { 226 instruction[node + RE.offsetNext] = (char)(short)(pointTo - node); 228 } 229 } 230 231 237 int node(char opcode, int opdata) 238 { 239 ensure(RE.nodeSize); 241 242 instruction[lenInstruction + RE.offsetOpcode] = opcode; 244 instruction[lenInstruction + RE.offsetOpdata] = (char)opdata; 245 instruction[lenInstruction + RE.offsetNext] = 0; 246 lenInstruction += RE.nodeSize; 247 248 return lenInstruction - RE.nodeSize; 250 } 251 252 253 257 void internalError() throws Error 258 { 259 throw new Error ("Internal error!"); 260 } 261 262 266 void syntaxError(String s) throws RESyntaxException 267 { 268 throw new RESyntaxException(s); 269 } 270 271 274 void allocBrackets() 275 { 276 if (bracketStart == null) 278 { 279 bracketStart = new int[maxBrackets]; 281 bracketEnd = new int[maxBrackets]; 282 bracketMin = new int[maxBrackets]; 283 bracketOpt = new int[maxBrackets]; 284 285 for (int i = 0; i < maxBrackets; i++) 287 { 288 bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1; 289 } 290 } 291 } 292 293 294 synchronized void reallocBrackets() { 295 if (bracketStart == null) { 297 allocBrackets(); 298 } 299 300 int new_size = maxBrackets * 2; 301 int[] new_bS = new int[new_size]; 302 int[] new_bE = new int[new_size]; 303 int[] new_bM = new int[new_size]; 304 int[] new_bO = new int[new_size]; 305 for (int i=brackets; i<new_size; i++) { 307 new_bS[i] = new_bE[i] = new_bM[i] = new_bO[i] = -1; 308 } 309 System.arraycopy(bracketStart,0, new_bS,0, brackets); 310 System.arraycopy(bracketEnd,0, new_bE,0, brackets); 311 System.arraycopy(bracketMin,0, new_bM,0, brackets); 312 System.arraycopy(bracketOpt,0, new_bO,0, brackets); 313 bracketStart = new_bS; 314 bracketEnd = new_bE; 315 bracketMin = new_bM; 316 bracketOpt = new_bO; 317 maxBrackets = new_size; 318 } 319 320 324 void bracket() throws RESyntaxException 325 { 326 if (idx >= len || pattern.charAt(idx++) != '{') 328 { 329 internalError(); 330 } 331 332 if (idx >= len || !Character.isDigit(pattern.charAt(idx))) 334 { 335 syntaxError("Expected digit"); 336 } 337 338 StringBuffer number = new StringBuffer (); 340 while (idx < len && Character.isDigit(pattern.charAt(idx))) 341 { 342 number.append(pattern.charAt(idx++)); 343 } 344 try 345 { 346 bracketMin[brackets] = Integer.parseInt(number.toString()); 347 } 348 catch (NumberFormatException e) 349 { 350 syntaxError("Expected valid number"); 351 } 352 353 if (idx >= len) 355 { 356 syntaxError("Expected comma or right bracket"); 357 } 358 359 if (pattern.charAt(idx) == '}') 361 { 362 idx++; 363 bracketOpt[brackets] = 0; 364 return; 365 } 366 367 if (idx >= len || pattern.charAt(idx++) != ',') 369 { 370 syntaxError("Expected comma"); 371 } 372 373 if (idx >= len) 375 { 376 syntaxError("Expected comma or right bracket"); 377 } 378 379 if (pattern.charAt(idx) == '}') 381 { 382 idx++; 383 bracketOpt[brackets] = bracketUnbounded; 384 return; 385 } 386 387 if (idx >= len || !Character.isDigit(pattern.charAt(idx))) 389 { 390 syntaxError("Expected digit"); 391 } 392 393 number.setLength(0); 395 while (idx < len && Character.isDigit(pattern.charAt(idx))) 396 { 397 number.append(pattern.charAt(idx++)); 398 } 399 try 400 { 401 bracketOpt[brackets] = Integer.parseInt(number.toString()) - bracketMin[brackets]; 402 } 403 catch (NumberFormatException e) 404 { 405 syntaxError("Expected valid number"); 406 } 407 408 if (bracketOpt[brackets] < 0) 410 { 411 syntaxError("Bad range"); 412 } 413 414 if (idx >= len || pattern.charAt(idx++) != '}') 416 { 417 syntaxError("Missing close brace"); 418 } 419 } 420 421 430 char escape() throws RESyntaxException 431 { 432 if (pattern.charAt(idx) != '\\') 434 { 435 internalError(); 436 } 437 438 if (idx + 1 == len) 440 { 441 syntaxError("Escape terminates string"); 442 } 443 444 idx += 2; 446 char escapeChar = pattern.charAt(idx - 1); 447 switch (escapeChar) 448 { 449 case RE.E_BOUND: 450 case RE.E_NBOUND: 451 return ESC_COMPLEX; 452 453 case RE.E_ALNUM: 454 case RE.E_NALNUM: 455 case RE.E_SPACE: 456 case RE.E_NSPACE: 457 case RE.E_DIGIT: 458 case RE.E_NDIGIT: 459 return ESC_CLASS; 460 461 case 'u': 462 case 'x': 463 { 464 int hexDigits = (escapeChar == 'u' ? 4 : 2); 466 467 int val = 0; 469 for ( ; idx < len && hexDigits-- > 0; idx++) 470 { 471 char c = pattern.charAt(idx); 473 474 if (c >= '0' && c <= '9') 476 { 477 val = (val << 4) + c - '0'; 479 } 480 else 481 { 482 c = Character.toLowerCase(c); 484 if (c >= 'a' && c <= 'f') 485 { 486 val = (val << 4) + (c - 'a') + 10; 488 } 489 else 490 { 491 syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar); 494 } 495 } 496 } 497 return (char)val; 498 } 499 500 case 't': 501 return '\t'; 502 503 case 'n': 504 return '\n'; 505 506 case 'r': 507 return '\r'; 508 509 case 'f': 510 return '\f'; 511 512 case '0': 513 case '1': 514 case '2': 515 case '3': 516 case '4': 517 case '5': 518 case '6': 519 case '7': 520 case '8': 521 case '9': 522 523 if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0') 525 { 526 int val = escapeChar - '0'; 528 if (idx < len && Character.isDigit(pattern.charAt(idx))) 529 { 530 val = ((val << 3) + (pattern.charAt(idx++) - '0')); 531 if (idx < len && Character.isDigit(pattern.charAt(idx))) 532 { 533 val = ((val << 3) + (pattern.charAt(idx++) - '0')); 534 } 535 } 536 return (char)val; 537 } 538 539 return ESC_BACKREF; 541 542 default: 543 544 return escapeChar; 546 } 547 } 548 549 554 int characterClass() throws RESyntaxException 555 { 556 if (pattern.charAt(idx) != '[') 558 { 559 internalError(); 560 } 561 562 if ((idx + 1) >= len || pattern.charAt(++idx) == ']') 564 { 565 syntaxError("Empty or unterminated class"); 566 } 567 568 if (idx < len && pattern.charAt(idx) == ':') 570 { 571 idx++; 573 574 int idxStart = idx; 576 while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z') 577 { 578 idx++; 579 } 580 581 if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']') 583 { 584 String charClass = pattern.substring(idxStart, idx); 586 587 Character i = (Character )hashPOSIX.get(charClass); 589 if (i != null) 590 { 591 idx += 2; 593 594 return node(RE.OP_POSIXCLASS, i.charValue()); 596 } 597 syntaxError("Invalid POSIX character class '" + charClass + "'"); 598 } 599 syntaxError("Invalid POSIX character class syntax"); 600 } 601 602 int ret = node(RE.OP_ANYOF, 0); 604 605 char CHAR_INVALID = Character.MAX_VALUE; 607 char last = CHAR_INVALID; 608 char simpleChar = 0; 609 boolean include = true; 610 boolean definingRange = false; 611 int idxFirst = idx; 612 char rangeStart = Character.MIN_VALUE; 613 char rangeEnd; 614 RERange range = new RERange(); 615 while (idx < len && pattern.charAt(idx) != ']') 616 { 617 618 switchOnCharacter: 619 620 switch (pattern.charAt(idx)) 622 { 623 case '^': 624 include = !include; 625 if (idx == idxFirst) 626 { 627 range.include(Character.MIN_VALUE, Character.MAX_VALUE, true); 628 } 629 idx++; 630 continue; 631 632 case '\\': 633 { 634 char c; 636 switch (c = escape ()) 637 { 638 case ESC_COMPLEX: 639 case ESC_BACKREF: 640 641 syntaxError("Bad character class"); 643 644 case ESC_CLASS: 645 646 if (definingRange) 648 { 649 syntaxError("Bad character class"); 650 } 651 652 switch (pattern.charAt(idx - 1)) 654 { 655 case RE.E_NSPACE: 656 case RE.E_NDIGIT: 657 case RE.E_NALNUM: 658 syntaxError("Bad character class"); 659 660 case RE.E_SPACE: 661 range.include('\t', include); 662 range.include('\r', include); 663 range.include('\f', include); 664 range.include('\n', include); 665 range.include('\b', include); 666 range.include(' ', include); 667 break; 668 669 case RE.E_ALNUM: 670 range.include('a', 'z', include); 671 range.include('A', 'Z', include); 672 range.include('_', include); 673 674 676 case RE.E_DIGIT: 677 range.include('0', '9', include); 678 break; 679 } 680 681 last = CHAR_INVALID; 683 break; 684 685 default: 686 687 simpleChar = c; 689 break switchOnCharacter; 690 } 691 } 692 continue; 693 694 case '-': 695 696 if (definingRange) 698 { 699 syntaxError("Bad class range"); 700 } 701 definingRange = true; 702 703 rangeStart = (last == CHAR_INVALID ? 0 : last); 705 706 if ((idx + 1) < len && pattern.charAt(++idx) == ']') 708 { 709 simpleChar = Character.MAX_VALUE; 710 break; 711 } 712 continue; 713 714 default: 715 simpleChar = pattern.charAt(idx++); 716 break; 717 } 718 719 if (definingRange) 721 { 722 rangeEnd = simpleChar; 724 725 if (rangeStart >= rangeEnd) 727 { 728 syntaxError("Bad character class"); 729 } 730 range.include(rangeStart, rangeEnd, include); 731 732 last = CHAR_INVALID; 734 definingRange = false; 735 } 736 else 737 { 738 if (idx >= len || pattern.charAt(idx) != '-') 740 { 741 range.include(simpleChar, include); 742 } 743 last = simpleChar; 744 } 745 } 746 747 if (idx == len) 749 { 750 syntaxError("Unterminated character class"); 751 } 752 753 idx++; 755 756 instruction[ret + RE.offsetOpdata] = (char)range.num; 758 for (int i = 0; i < range.num; i++) 759 { 760 emit((char)range.minRange[i]); 761 emit((char)range.maxRange[i]); 762 } 763 return ret; 764 } 765 766 774 int atom() throws RESyntaxException 775 { 776 int ret = node(RE.OP_ATOM, 0); 778 779 int lenAtom = 0; 781 782 784 atomLoop: 785 786 while (idx < len) 787 { 788 if ((idx + 1) < len) 790 { 791 char c = pattern.charAt(idx + 1); 792 793 if (pattern.charAt(idx) == '\\') 795 { 796 int idxEscape = idx; 797 escape(); 798 if (idx < len) 799 { 800 c = pattern.charAt(idx); 801 } 802 idx = idxEscape; 803 } 804 805 switch (c) 807 { 808 case '{': 809 case '?': 810 case '*': 811 case '+': 812 813 if (lenAtom != 0) 816 { 817 break atomLoop; 818 } 819 } 820 } 821 822 switch (pattern.charAt(idx)) 824 { 825 case ']': 826 case '^': 827 case '$': 828 case '.': 829 case '[': 830 case '(': 831 case ')': 832 case '|': 833 break atomLoop; 834 835 case '{': 836 case '?': 837 case '*': 838 case '+': 839 840 if (lenAtom == 0) 842 { 843 syntaxError("Missing operand to closure"); 845 } 846 break atomLoop; 847 848 case '\\': 849 850 { 851 int idxBeforeEscape = idx; 853 char c = escape(); 854 855 if ((c & ESC_MASK) == ESC_MASK) 857 { 858 idx = idxBeforeEscape; 860 break atomLoop; 861 } 862 863 emit(c); 865 lenAtom++; 866 } 867 break; 868 869 default: 870 871 emit(pattern.charAt(idx++)); 873 lenAtom++; 874 break; 875 } 876 } 877 878 if (lenAtom == 0) 880 { 881 internalError(); 882 } 883 884 instruction[ret + RE.offsetOpdata] = (char)lenAtom; 886 return ret; 887 } 888 889 895 int terminal(int[] flags) throws RESyntaxException 896 { 897 switch (pattern.charAt(idx)) 898 { 899 case RE.OP_EOL: 900 case RE.OP_BOL: 901 case RE.OP_ANY: 902 return node(pattern.charAt(idx++), 0); 903 904 case '[': 905 return characterClass(); 906 907 case '(': 908 return expr(flags); 909 910 case ')': 911 syntaxError("Unexpected close paren"); 912 913 case '|': 914 internalError(); 915 916 case ']': 917 syntaxError("Mismatched class"); 918 919 case 0: 920 syntaxError("Unexpected end of input"); 921 922 case '?': 923 case '+': 924 case '{': 925 case '*': 926 syntaxError("Missing operand to closure"); 927 928 case '\\': 929 { 930 int idxBeforeEscape = idx; 932 933 switch (escape()) 935 { 936 case ESC_CLASS: 937 case ESC_COMPLEX: 938 flags[0] &= ~NODE_NULLABLE; 939 return node(RE.OP_ESCAPE, pattern.charAt(idx - 1)); 940 941 case ESC_BACKREF: 942 { 943 char backreference = (char)(pattern.charAt(idx - 1) - '0'); 944 if (parens <= backreference) 945 { 946 syntaxError("Bad backreference"); 947 } 948 flags[0] |= NODE_NULLABLE; 949 return node(RE.OP_BACKREF, backreference); 950 } 951 952 default: 953 954 idx = idxBeforeEscape; 957 flags[0] &= ~NODE_NULLABLE; 958 break; 959 } 960 } 961 } 962 963 flags[0] &= ~NODE_NULLABLE; 966 return atom(); 967 } 968 969 975 int closure(int[] flags) throws RESyntaxException 976 { 977 int idxBeforeTerminal = idx; 979 980 int[] terminalFlags = { NODE_NORMAL }; 982 983 int ret = terminal(terminalFlags); 985 986 flags[0] |= terminalFlags[0]; 988 989 if (idx >= len) 991 { 992 return ret; 993 } 994 boolean greedy = true; 995 char closureType = pattern.charAt(idx); 996 switch (closureType) 997 { 998 case '?': 999 case '*': 1000 1001 flags[0] |= NODE_NULLABLE; 1003 1004 case '+': 1005 1006 idx++; 1008 1009 case '{': 1010 1011 int opcode = instruction[ret + RE.offsetOpcode]; 1013 if (opcode == RE.OP_BOL || opcode == RE.OP_EOL) 1014 { 1015 syntaxError("Bad closure operand"); 1016 } 1017 if ((terminalFlags[0] & NODE_NULLABLE) != 0) 1018 { 1019 syntaxError("Closure operand can't be nullable"); 1020 } 1021 break; 1022 } 1023 1024 if (idx < len && pattern.charAt(idx) == '?') 1026 { 1027 idx++; 1028 greedy = false; 1029 } 1030 1031 if (greedy) 1032 { 1033 switch (closureType) 1035 { 1036 case '{': 1037 { 1038 boolean found = false; 1040 int i; 1041 allocBrackets(); 1042 for (i = 0; i < brackets; i++) 1043 { 1044 if (bracketStart[i] == idx) 1045 { 1046 found = true; 1047 break; 1048 } 1049 } 1050 1051 if (!found) 1053 { 1054 if (brackets >= maxBrackets) 1055 { 1056 reallocBrackets(); 1057 } 1058 bracketStart[brackets] = idx; 1059 bracket(); 1060 bracketEnd[brackets] = idx; 1061 i = brackets++; 1062 } 1063 1064 if (bracketMin[i]-- > 0) 1066 { 1067 if (bracketMin[i] > 0 || bracketOpt[i] != 0) { 1068 idx = idxBeforeTerminal; 1070 } else { 1071 idx = bracketEnd[i]; 1073 } 1074 break; 1075 } 1076 1077 if (bracketOpt[i] == bracketUnbounded) 1079 { 1080 closureType = '*'; 1083 bracketOpt[i] = 0; 1084 idx = bracketEnd[i]; 1085 } 1086 else 1087 if (bracketOpt[i]-- > 0) 1088 { 1089 if (bracketOpt[i] > 0) 1090 { 1091 idx = idxBeforeTerminal; 1093 } else { 1094 idx = bracketEnd[i]; 1096 } 1097 closureType = '?'; 1099 } 1100 else 1101 { 1102 lenInstruction = ret; 1104 node(RE.OP_NOTHING, 0); 1105 1106 idx = bracketEnd[i]; 1108 break; 1109 } 1110 } 1111 1112 1114 case '?': 1115 case '*': 1116 1117 if (!greedy) 1118 { 1119 break; 1120 } 1121 1122 if (closureType == '?') 1123 { 1124 nodeInsert(RE.OP_BRANCH, 0, ret); setNextOfEnd(ret, node (RE.OP_BRANCH, 0)); int nothing = node (RE.OP_NOTHING, 0); setNextOfEnd(ret, nothing); setNextOfEnd(ret + RE.nodeSize, nothing); } 1131 1132 if (closureType == '*') 1133 { 1134 nodeInsert(RE.OP_BRANCH, 0, ret); setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0)); setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0)); setNextOfEnd(ret + RE.nodeSize, ret); setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); } 1142 break; 1143 1144 case '+': 1145 { 1146 int branch; 1148 branch = node(RE.OP_BRANCH, 0); setNextOfEnd(ret, branch); setNextOfEnd(node(RE.OP_GOTO, 0), ret); setNextOfEnd(branch, node(RE.OP_BRANCH, 0)); setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); } 1154 break; 1155 } 1156 } 1157 else 1158 { 1159 setNextOfEnd(ret, node(RE.OP_END, 0)); 1161 1162 switch (closureType) 1164 { 1165 case '?': 1166 nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret); 1167 break; 1168 1169 case '*': 1170 nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret); 1171 break; 1172 1173 case '+': 1174 nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret); 1175 break; 1176 } 1177 1178 setNextOfEnd(ret, lenInstruction); 1180 } 1181 return ret; 1182 } 1183 1184 1190 int branch(int[] flags) throws RESyntaxException 1191 { 1192 int node; 1194 int ret = node(RE.OP_BRANCH, 0); 1195 int chain = -1; 1196 int[] closureFlags = new int[1]; 1197 boolean nullable = true; 1198 while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')') 1199 { 1200 closureFlags[0] = NODE_NORMAL; 1202 node = closure(closureFlags); 1203 if (closureFlags[0] == NODE_NORMAL) 1204 { 1205 nullable = false; 1206 } 1207 1208 if (chain != -1) 1210 { 1211 setNextOfEnd(chain, node); 1212 } 1213 1214 chain = node; 1216 } 1217 1218 if (chain == -1) 1220 { 1221 node(RE.OP_NOTHING, 0); 1222 } 1223 1224 if (nullable) 1226 { 1227 flags[0] |= NODE_NULLABLE; 1228 } 1229 return ret; 1230 } 1231 1232 1239 int expr(int[] flags) throws RESyntaxException 1240 { 1241 int paren = -1; 1243 int ret = -1; 1244 int closeParens = parens; 1245 if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(') 1246 { 1247 if ( idx + 2 < len && pattern.charAt( idx + 1 ) == '?' && pattern.charAt( idx + 2 ) == ':' ) 1249 { 1250 paren = 2; 1251 idx += 3; 1252 ret = node( RE.OP_OPEN_CLUSTER, 0 ); 1253 } 1254 else 1255 { 1256 paren = 1; 1257 idx++; 1258 ret = node(RE.OP_OPEN, parens++); 1259 } 1260 } 1261 flags[0] &= ~NODE_TOPLEVEL; 1262 1263 int branch = branch(flags); 1265 if (ret == -1) 1266 { 1267 ret = branch; 1268 } 1269 else 1270 { 1271 setNextOfEnd(ret, branch); 1272 } 1273 1274 while (idx < len && pattern.charAt(idx) == '|') 1276 { 1277 idx++; 1278 branch = branch(flags); 1279 setNextOfEnd(ret, branch); 1280 } 1281 1282 int end; 1284 if ( paren > 0 ) 1285 { 1286 if (idx < len && pattern.charAt(idx) == ')') 1287 { 1288 idx++; 1289 } 1290 else 1291 { 1292 syntaxError("Missing close paren"); 1293 } 1294 if ( paren == 1 ) 1295 { 1296 end = node(RE.OP_CLOSE, closeParens); 1297 } 1298 else 1299 { 1300 end = node( RE.OP_CLOSE_CLUSTER, 0 ); 1301 } 1302 } 1303 else 1304 { 1305 end = node(RE.OP_END, 0); 1306 } 1307 1308 setNextOfEnd(ret, end); 1310 1311 int currentNode = ret; 1313 int nextNodeOffset = instruction[ currentNode + RE.offsetNext ]; 1314 while ( nextNodeOffset != 0 && currentNode < lenInstruction ) 1316 { 1317 if ( instruction[ currentNode + RE.offsetOpcode ] == RE.OP_BRANCH ) 1319 { 1320 setNextOfEnd( currentNode + RE.nodeSize, end ); 1321 } 1322 nextNodeOffset = instruction[ currentNode + RE.offsetNext ]; 1323 currentNode += nextNodeOffset; 1324 } 1325 1326 return ret; 1328 } 1329 1330 1340 public REProgram compile(String pattern) throws RESyntaxException 1341 { 1342 this.pattern = pattern; len = pattern.length(); idx = 0; lenInstruction = 0; parens = 1; brackets = 0; 1350 int[] flags = { NODE_TOPLEVEL }; 1352 1353 expr(flags); 1355 1356 if (idx != len) 1358 { 1359 if (pattern.charAt(idx) == ')') 1360 { 1361 syntaxError("Unmatched close paren"); 1362 } 1363 syntaxError("Unexpected input remains"); 1364 } 1365 1366 char[] ins = new char[lenInstruction]; 1368 System.arraycopy(instruction, 0, ins, 0, lenInstruction); 1369 return new REProgram(parens, ins); 1370 } 1371 1372 1375 class RERange 1376 { 1377 int size = 16; int[] minRange = new int[size]; int[] maxRange = new int[size]; int num = 0; 1382 1386 void delete(int index) 1387 { 1388 if (num == 0 || index >= num) 1390 { 1391 return; 1392 } 1393 1394 while (++index < num) 1396 { 1397 if (index - 1 >= 0) 1398 { 1399 minRange[index-1] = minRange[index]; 1400 maxRange[index-1] = maxRange[index]; 1401 } 1402 } 1403 1404 num--; 1406 } 1407 1408 1413 void merge(int min, int max) 1414 { 1415 for (int i = 0; i < num; i++) 1417 { 1418 if (min >= minRange[i] && max <= maxRange[i]) 1420 { 1421 return; 1422 } 1423 1424 else if (min <= minRange[i] && max >= maxRange[i]) 1426 { 1427 delete(i); 1428 merge(min, max); 1429 return; 1430 } 1431 1432 else if (min >= minRange[i] && min <= maxRange[i]) 1434 { 1435 delete(i); 1436 min = minRange[i]; 1437 merge(min, max); 1438 return; 1439 } 1440 1441 else if (max >= minRange[i] && max <= maxRange[i]) 1443 { 1444 delete(i); 1445 max = maxRange[i]; 1446 merge(min, max); 1447 return; 1448 } 1449 } 1450 1451 if (num >= size) 1453 { 1454 size *= 2; 1455 int[] newMin = new int[size]; 1456 int[] newMax = new int[size]; 1457 System.arraycopy(minRange, 0, newMin, 0, num); 1458 System.arraycopy(maxRange, 0, newMax, 0, num); 1459 minRange = newMin; 1460 maxRange = newMax; 1461 } 1462 minRange[num] = min; 1463 maxRange[num] = max; 1464 num++; 1465 } 1466 1467 1472 void remove(int min, int max) 1473 { 1474 for (int i = 0; i < num; i++) 1476 { 1477 if (minRange[i] >= min && maxRange[i] <= max) 1479 { 1480 delete(i); 1481 i--; 1482 return; 1483 } 1484 1485 else if (min >= minRange[i] && max <= maxRange[i]) 1487 { 1488 int minr = minRange[i]; 1489 int maxr = maxRange[i]; 1490 delete(i); 1491 if (minr < min) 1492 { 1493 merge(minr, min - 1); 1494 } 1495 if (max < maxr) 1496 { 1497 merge(max + 1, maxr); 1498 } 1499 return; 1500 } 1501 1502 else if (minRange[i] >= min && minRange[i] <= max) 1504 { 1505 minRange[i] = max + 1; 1506 return; 1507 } 1508 1509 else if (maxRange[i] >= min && maxRange[i] <= max) 1511 { 1512 maxRange[i] = min - 1; 1513 return; 1514 } 1515 } 1516 } 1517 1518 1524 void include(int min, int max, boolean include) 1525 { 1526 if (include) 1527 { 1528 merge(min, max); 1529 } 1530 else 1531 { 1532 remove(min, max); 1533 } 1534 } 1535 1536 1541 void include(char minmax, boolean include) 1542 { 1543 include(minmax, minmax, include); 1544 } 1545 } 1546} 1547 | Popular Tags |