| 1 package org.apache.regexp; 2 3 59 60 import org.apache.regexp.RE; 61 import java.util.Hashtable ; 62 63 77 public class RECompiler 78 { 79 char[] instruction; int lenInstruction; 83 String pattern; int len; int idx; int parens; 89 static final int NODE_NORMAL = 0; static final int NODE_NULLABLE = 1; static final int NODE_TOPLEVEL = 2; 94 static final char ESC_MASK = 0xfff0; static final char ESC_BACKREF = 0xffff; static final char ESC_COMPLEX = 0xfffe; static final char ESC_CLASS = 0xfffd; 100 int maxBrackets = 10; static final int bracketUnbounded = -1; int brackets = 0; int[] bracketStart = null; int[] bracketEnd = null; int[] bracketMin = null; int[] bracketOpt = null; 109 static Hashtable hashPOSIX = new Hashtable (); 111 static 112 { 113 hashPOSIX.put("alnum", new Character (RE.POSIX_CLASS_ALNUM)); 114 hashPOSIX.put("alpha", new Character (RE.POSIX_CLASS_ALPHA)); 115 hashPOSIX.put("blank", new Character (RE.POSIX_CLASS_BLANK)); 116 hashPOSIX.put("cntrl", new Character (RE.POSIX_CLASS_CNTRL)); 117 hashPOSIX.put("digit", new Character (RE.POSIX_CLASS_DIGIT)); 118 hashPOSIX.put("graph", new Character (RE.POSIX_CLASS_GRAPH)); 119 hashPOSIX.put("lower", new Character (RE.POSIX_CLASS_LOWER)); 120 hashPOSIX.put("print", new Character (RE.POSIX_CLASS_PRINT)); 121 hashPOSIX.put("punct", new Character (RE.POSIX_CLASS_PUNCT)); 122 hashPOSIX.put("space", new Character (RE.POSIX_CLASS_SPACE)); 123 hashPOSIX.put("upper", new Character (RE.POSIX_CLASS_UPPER)); 124 hashPOSIX.put("xdigit", new Character (RE.POSIX_CLASS_XDIGIT)); 125 hashPOSIX.put("javastart", new Character (RE.POSIX_CLASS_JSTART)); 126 hashPOSIX.put("javapart", new Character (RE.POSIX_CLASS_JPART)); 127 } 128 129 132 public RECompiler() 133 { 134 instruction = new char[128]; 136 lenInstruction = 0; 137 } 138 139 144 void ensure(int n) 145 { 146 int curlen = instruction.length; 148 149 if (lenInstruction + n >= curlen) 151 { 152 while (lenInstruction + n >= curlen) 154 { 155 curlen *= 2; 156 } 157 158 char[] newInstruction = new char[curlen]; 160 System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction); 161 instruction = newInstruction; 162 } 163 } 164 165 169 void emit(char c) 170 { 171 ensure(1); 173 174 instruction[lenInstruction++] = c; 176 } 177 178 185 void nodeInsert(char opcode, int opdata, int insertAt) 186 { 187 ensure(RE.nodeSize); 189 190 System.arraycopy(instruction, insertAt, instruction, insertAt + RE.nodeSize, lenInstruction - insertAt); 192 instruction[insertAt + RE.offsetOpcode] = opcode; 193 instruction[insertAt + RE.offsetOpdata] = (char)opdata; 194 instruction[insertAt + RE.offsetNext] = 0; 195 lenInstruction += RE.nodeSize; 196 } 197 198 203 void setNextOfEnd(int node, int pointTo) 204 { 205 int next = instruction[node + RE.offsetNext]; 207 while ( next != 0 && node < lenInstruction ) 210 { 211 if ( node == pointTo ) { 218 pointTo = lenInstruction; 219 } 220 node += next; 221 next = instruction[node + RE.offsetNext]; 222 } 223 if ( node < lenInstruction ) { 226 instruction[node + RE.offsetNext] = (char)(short)(pointTo - node); 228 } 229 } 230 231 237 int node(char opcode, int opdata) 238 { 239 ensure(RE.nodeSize); 241 242 instruction[lenInstruction + RE.offsetOpcode] = opcode; 244 instruction[lenInstruction + RE.offsetOpdata] = (char)opdata; 245 instruction[lenInstruction + RE.offsetNext] = 0; 246 lenInstruction += RE.nodeSize; 247 248 return lenInstruction - RE.nodeSize; 250 } 251 252 253 257 void internalError() throws Error  258 { 259 throw new Error ("Internal error!"); 260 } 261 262 266 void syntaxError(String s) throws RESyntaxException 267 { 268 throw new RESyntaxException(s); 269 } 270 271 274 void allocBrackets() 275 { 276 if (bracketStart == null) 278 { 279 bracketStart = new int[maxBrackets]; 281 bracketEnd = new int[maxBrackets]; 282 bracketMin = new int[maxBrackets]; 283 bracketOpt = new int[maxBrackets]; 284 285 for (int i = 0; i < maxBrackets; i++) 287 { 288 bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1; 289 } 290 } 291 } 292 293 294 synchronized void reallocBrackets() { 295 if (bracketStart == null) { 297 allocBrackets(); 298 } 299 300 int new_size = maxBrackets * 2; 301 int[] new_bS = new int[new_size]; 302 int[] new_bE = new int[new_size]; 303 int[] new_bM = new int[new_size]; 304 int[] new_bO = new int[new_size]; 305 for (int i=brackets; i<new_size; i++) { 307 new_bS[i] = new_bE[i] = new_bM[i] = new_bO[i] = -1; 308 } 309 System.arraycopy(bracketStart,0, new_bS,0, brackets); 310 System.arraycopy(bracketEnd,0, new_bE,0, brackets); 311 System.arraycopy(bracketMin,0, new_bM,0, brackets); 312 System.arraycopy(bracketOpt,0, new_bO,0, brackets); 313 bracketStart = new_bS; 314 bracketEnd = new_bE; 315 bracketMin = new_bM; 316 bracketOpt = new_bO; 317 maxBrackets = new_size; 318 } 319 320 324 void bracket() throws RESyntaxException 325 { 326 if (idx >= len || pattern.charAt(idx++) != '{') 328 { 329 internalError(); 330 } 331 332 if (idx >= len || !Character.isDigit(pattern.charAt(idx))) 334 { 335 syntaxError("Expected digit"); 336 } 337 338 StringBuffer number = new StringBuffer (); 340 while (idx < len && Character.isDigit(pattern.charAt(idx))) 341 { 342 number.append(pattern.charAt(idx++)); 343 } 344 try 345 { 346 bracketMin[brackets] = Integer.parseInt(number.toString()); 347 } 348 catch (NumberFormatException e) 349 { 350 syntaxError("Expected valid number"); 351 } 352 353 if (idx >= len) 355 { 356 syntaxError("Expected comma or right bracket"); 357 } 358 359 if (pattern.charAt(idx) == '}') 361 { 362 idx++; 363 bracketOpt[brackets] = 0; 364 return; 365 } 366 367 if (idx >= len || pattern.charAt(idx++) != ',') 369 { 370 syntaxError("Expected comma"); 371 } 372 373 if (idx >= len) 375 { 376 syntaxError("Expected comma or right bracket"); 377 } 378 379 if (pattern.charAt(idx) == '}') 381 { 382 idx++; 383 bracketOpt[brackets] = bracketUnbounded; 384 return; 385 } 386 387 if (idx >= len || !Character.isDigit(pattern.charAt(idx))) 389 { 390 syntaxError("Expected digit"); 391 } 392 393 number.setLength(0); 395 while (idx < len && Character.isDigit(pattern.charAt(idx))) 396 { 397 number.append(pattern.charAt(idx++)); 398 } 399 try 400 { 401 bracketOpt[brackets] = Integer.parseInt(number.toString()) - bracketMin[brackets]; 402 } 403 catch (NumberFormatException e) 404 { 405 syntaxError("Expected valid number"); 406 } 407 408 if (bracketOpt[brackets] < 0) 410 { 411 syntaxError("Bad range"); 412 } 413 414 if (idx >= len || pattern.charAt(idx++) != '}') 416 { 417 syntaxError("Missing close brace"); 418 } 419 } 420 421 430 char escape() throws RESyntaxException 431 { 432 if (pattern.charAt(idx) != '\\') 434 { 435 internalError(); 436 } 437 438 if (idx + 1 == len) 440 { 441 syntaxError("Escape terminates string"); 442 } 443 444 idx += 2; 446 char escapeChar = pattern.charAt(idx - 1); 447 switch (escapeChar) 448 { 449 case RE.E_BOUND: 450 case RE.E_NBOUND: 451 return ESC_COMPLEX; 452 453 case RE.E_ALNUM: 454 case RE.E_NALNUM: 455 case RE.E_SPACE: 456 case RE.E_NSPACE: 457 case RE.E_DIGIT: 458 case RE.E_NDIGIT: 459 return ESC_CLASS; 460 461 case 'u': 462 case 'x': 463 { 464 int hexDigits = (escapeChar == 'u' ? 4 : 2); 466 467 int val = 0; 469 for ( ; idx < len && hexDigits-- > 0; idx++) 470 { 471 char c = pattern.charAt(idx); 473 474 if (c >= '0' && c <= '9') 476 { 477 val = (val << 4) + c - '0'; 479 } 480 else 481 { 482 c = Character.toLowerCase(c); 484 if (c >= 'a' && c <= 'f') 485 { 486 val = (val << 4) + (c - 'a') + 10; 488 } 489 else 490 { 491 syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar); 494 } 495 } 496 } 497 return (char)val; 498 } 499 500 case 't': 501 return '\t'; 502 503 case 'n': 504 return '\n'; 505 506 case 'r': 507 return '\r'; 508 509 case 'f': 510 return '\f'; 511 512 case '0': 513 case '1': 514 case '2': 515 case '3': 516 case '4': 517 case '5': 518 case '6': 519 case '7': 520 case '8': 521 case '9': 522 523 if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0') 525 { 526 int val = escapeChar - '0'; 528 if (idx < len && Character.isDigit(pattern.charAt(idx))) 529 { 530 val = ((val << 3) + (pattern.charAt(idx++) - '0')); 531 if (idx < len && Character.isDigit(pattern.charAt(idx))) 532 { 533 val = ((val << 3) + (pattern.charAt(idx++) - '0')); 534 } 535 } 536 return (char)val; 537 } 538 539 return ESC_BACKREF; 541 542 default: 543 544 return escapeChar; 546 } 547 } 548 549 554 int characterClass() throws RESyntaxException 555 { 556 if (pattern.charAt(idx) != '[') 558 { 559 internalError(); 560 } 561 562 if ((idx + 1) >= len || pattern.charAt(++idx) == ']') 564 { 565 syntaxError("Empty or unterminated class"); 566 } 567 568 if (idx < len && pattern.charAt(idx) == ':') 570 { 571 idx++; 573 574 int idxStart = idx; 576 while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z') 577 { 578 idx++; 579 } 580 581 if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']') 583 { 584 String charClass = pattern.substring(idxStart, idx); 586 587 Character i = (Character )hashPOSIX.get(charClass); 589 if (i != null) 590 { 591 idx += 2; 593 594 return node(RE.OP_POSIXCLASS, i.charValue()); 596 } 597 syntaxError("Invalid POSIX character class '" + charClass + "'"); 598 } 599 syntaxError("Invalid POSIX character class syntax"); 600 } 601 602 int ret = node(RE.OP_ANYOF, 0); 604 605 char CHAR_INVALID = Character.MAX_VALUE; 607 char last = CHAR_INVALID; 608 char simpleChar = 0; 609 boolean include = true; 610 boolean definingRange = false; 611 int idxFirst = idx; 612 char rangeStart = Character.MIN_VALUE; 613 char rangeEnd; 614 RERange range = new RERange(); 615 while (idx < len && pattern.charAt(idx) != ']') 616 { 617 618 switchOnCharacter: 619 620 switch (pattern.charAt(idx)) 622 { 623 case '^': 624 include = !include; 625 if (idx == idxFirst) 626 { 627 range.include(Character.MIN_VALUE, Character.MAX_VALUE, true); 628 } 629 idx++; 630 continue; 631 632 case '\\': 633 { 634 char c; 636 switch (c = escape ()) 637 { 638 case ESC_COMPLEX: 639 case ESC_BACKREF: 640 641 syntaxError("Bad character class"); 643 644 case ESC_CLASS: 645 646 if (definingRange) 648 { 649 syntaxError("Bad character class"); 650 } 651 652 switch (pattern.charAt(idx - 1)) 654 { 655 case RE.E_NSPACE: 656 case RE.E_NDIGIT: 657 case RE.E_NALNUM: 658 syntaxError("Bad character class"); 659 660 case RE.E_SPACE: 661 range.include('\t', include); 662 range.include('\r', include); 663 range.include('\f', include); 664 range.include('\n', include); 665 range.include('\b', include); 666 range.include(' ', include); 667 break; 668 669 case RE.E_ALNUM: 670 range.include('a', 'z', include); 671 range.include('A', 'Z', include); 672 range.include('_', include); 673 674 676 case RE.E_DIGIT: 677 range.include('0', '9', include); 678 break; 679 } 680 681 last = CHAR_INVALID; 683 break; 684 685 default: 686 687 simpleChar = c; 689 break switchOnCharacter; 690 } 691 } 692 continue; 693 694 case '-': 695 696 if (definingRange) 698 { 699 syntaxError("Bad class range"); 700 } 701 definingRange = true; 702 703 rangeStart = (last == CHAR_INVALID ? 0 : last); 705 706 if ((idx + 1) < len && pattern.charAt(++idx) == ']') 708 { 709 simpleChar = Character.MAX_VALUE; 710 break; 711 } 712 continue; 713 714 default: 715 simpleChar = pattern.charAt(idx++); 716 break; 717 } 718 719 if (definingRange) 721 { 722 rangeEnd = simpleChar; 724 725 if (rangeStart >= rangeEnd) 727 { 728 syntaxError("Bad character class"); 729 } 730 range.include(rangeStart, rangeEnd, include); 731 732 last = CHAR_INVALID; 734 definingRange = false; 735 } 736 else 737 { 738 if (idx >= len || pattern.charAt(idx) != '-') 740 { 741 range.include(simpleChar, include); 742 } 743 last = simpleChar; 744 } 745 } 746 747 if (idx == len) 749 { 750 syntaxError("Unterminated character class"); 751 } 752 753 idx++; 755 756 instruction[ret + RE.offsetOpdata] = (char)range.num; 758 for (int i = 0; i < range.num; i++) 759 { 760 emit((char)range.minRange[i]); 761 emit((char)range.maxRange[i]); 762 } 763 return ret; 764 } 765 766 774 int atom() throws RESyntaxException 775 { 776 int ret = node(RE.OP_ATOM, 0); 778 779 int lenAtom = 0; 781 782 784 atomLoop: 785 786 while (idx < len) 787 { 788 if ((idx + 1) < len) 790 { 791 char c = pattern.charAt(idx + 1); 792 793 if (pattern.charAt(idx) == '\\') 795 { 796 int idxEscape = idx; 797 escape(); 798 if (idx < len) 799 { 800 c = pattern.charAt(idx); 801 } 802 idx = idxEscape; 803 } 804 805 switch (c) 807 { 808 case '{': 809 case '?': 810 case '*': 811 case '+': 812 813 if (lenAtom != 0) 816 { 817 break atomLoop; 818 } 819 } 820 } 821 822 switch (pattern.charAt(idx)) 824 { 825 case ']': 826 case '^': 827 case '$': 828 case '.': 829 case '[': 830 case '(': 831 case ')': 832 case '|': 833 break atomLoop; 834 835 case '{': 836 case '?': 837 case '*': 838 case '+': 839 840 if (lenAtom == 0) 842 { 843 syntaxError("Missing operand to closure"); 845
|