1 package org.apache.oro.text.awk; 2 3 59 60 import org.apache.oro.text.regex.*; 61 62 145 public final class AwkCompiler implements PatternCompiler { 146 public static final int DEFAULT_MASK = 0; 147 public static final int CASE_INSENSITIVE_MASK = 0x0001; 148 149 static final char _END_OF_INPUT = '\uFFFF'; 150 151 private boolean __inCharacterClass, __caseSensitive; 152 private boolean __beginAnchor, __endAnchor; 153 private char __lookahead; 154 private int __position, __bytesRead, __expressionLength; 155 private char[] __regularExpression; 156 private int __openParen, __closeParen; 157 158 public AwkCompiler() { } 159 160 private static boolean __isMetachar(char token) { 161 return (token == '*' || token == '?' || token == '+' || 162 token == '[' || token == ']' || token == '(' || 163 token == ')' || token == '|' || token == '.'); 165 } 166 167 static boolean _isWordCharacter(char token) { 168 return ((token >= 'a' && token <= 'z') || 169 (token >= 'A' && token <= 'Z') || 170 (token >= '0' && token <= '9') || 171 (token == '_')); 172 } 173 174 static boolean _isLowerCase(char token){ 175 return (token >= 'a' && token <= 'z'); 176 } 177 178 static boolean _isUpperCase(char token){ 179 return (token >= 'A' && token <= 'Z'); 180 } 181 182 static char _toggleCase(char token){ 183 if(_isUpperCase(token)) 184 return (char)(token + 32); 185 else if(_isLowerCase(token)) 186 return (char)(token - 32); 187 188 return token; 189 } 190 191 192 private void __match(char token) throws MalformedPatternException { 193 if(token == __lookahead){ 194 if(__bytesRead < __expressionLength) 195 __lookahead = __regularExpression[__bytesRead++]; 196 else 197 __lookahead = _END_OF_INPUT; 198 } 199 else 200 throw new MalformedPatternException("token: " + token + 201 " does not match lookahead: " + 202 __lookahead + " at position: " + 203 __bytesRead); 204 } 205 206 private void __putback() { 207 if(__lookahead != _END_OF_INPUT) 208 --__bytesRead; 209 __lookahead = __regularExpression[__bytesRead - 1]; 210 } 211 212 private SyntaxNode __regex() throws MalformedPatternException { 213 SyntaxNode left; 214 215 left = __branch(); 216 217 if(__lookahead == '|') { 218 __match('|'); 219 return (new OrNode(left, __regex())); 220 } 221 222 return left; 223 } 224 225 226 private SyntaxNode __branch() throws MalformedPatternException { 227 CatNode current; 228 SyntaxNode left, root; 229 230 left = __piece(); 231 232 if(__lookahead == ')'){ 233 if(__openParen > __closeParen) 234 return left; 235 else 236 throw 237 new MalformedPatternException("Parse error: close parenthesis" 238 + " without matching open parenthesis at position " + __bytesRead); 239 } else if(__lookahead == '|' || __lookahead == _END_OF_INPUT) 240 return left; 241 242 root = current = new CatNode(); 243 current._left = left; 244 245 while(true) { 246 left = __piece(); 247 248 if(__lookahead == ')'){ 249 if(__openParen > __closeParen){ 250 current._right = left; 251 break; 252 } 253 else 254 throw 255 new MalformedPatternException("Parse error: close parenthesis" 256 + " without matching open parenthesis at position " + __bytesRead); 257 } else if(__lookahead == '|' || __lookahead == _END_OF_INPUT){ 258 current._right = left; 259 break; 260 } 261 262 current._right = new CatNode(); 263 current = (CatNode)current._right; 264 current._left = left; 265 } 266 267 return root; 268 } 269 270 271 private SyntaxNode __piece() throws MalformedPatternException { 272 SyntaxNode left; 273 274 left = __atom(); 275 276 switch(__lookahead){ 277 case '+' : __match('+'); return (new PlusNode(left)); 278 case '?' : __match('?'); return (new QuestionNode(left)); 279 case '*' : __match('*'); return (new StarNode(left)); 280 case '{' : return __repetition(left); 281 } 282 283 return left; 284 } 285 286 private int __parseUnsignedInteger(int radix, int minDigits, int maxDigits) 288 throws MalformedPatternException { 289 int num, digits = 0; 290 StringBuffer buf; 291 292 buf = new StringBuffer (4); 294 295 while(Character.digit(__lookahead, radix) != -1 && digits < maxDigits){ 296 buf.append((char)__lookahead); 297 __match(__lookahead); 298 ++digits; 299 } 300 301 if(digits < minDigits || digits > maxDigits) 302 throw 303 new MalformedPatternException( 304 "Parse error: unexpected number of digits at position " + __bytesRead); 305 306 try { 307 num = Integer.parseInt(buf.toString(), radix); 308 } catch(NumberFormatException e) { 309 throw 310 new MalformedPatternException("Parse error: numeric value at " + 311 "position " + __bytesRead + " is invalid"); 312 } 313 314 return num; 315 } 316 317 private SyntaxNode __repetition(SyntaxNode atom) 318 throws MalformedPatternException { 319 int min, max, startPosition[]; 320 StringBuffer minBuffer, maxBuffer; 321 SyntaxNode root = null; 322 CatNode catNode; 323 324 __match('{'); 325 326 min = __parseUnsignedInteger(10, 1, Integer.MAX_VALUE); 327 startPosition = new int[1]; 328 startPosition[0] = __position; 329 330 if(__lookahead == '}'){ 331 __match('}'); 333 334 if(min == 0) 335 throw 336 new MalformedPatternException( 337 "Parse error: Superfluous interval specified at position " + 338 __bytesRead + ". Number of occurences was set to zero."); 339 340 if(min == 1) 341 return atom; 342 343 root = catNode = new CatNode(); 344 catNode._left = atom; 345 346 while(--min > 1) { 347 atom = atom._clone(startPosition); 348 349 catNode._right = new CatNode(); 350 catNode = (CatNode)catNode._right; 351 catNode._left = atom; 352 } 353 354 catNode._right = atom._clone(startPosition); 355 } else if(__lookahead == ','){ 356 __match(','); 357 358 if(__lookahead == '}') { 359 __match('}'); 361 362 if(min == 0) 363 return new StarNode(atom); 364 365 if(min == 1) 366 return new PlusNode(atom); 367 368 root = catNode = new CatNode(); 369 catNode._left = atom; 370 371 while(--min > 0) { 372 atom = atom._clone(startPosition); 373 374 catNode._right = new CatNode(); 375 catNode = (CatNode)catNode._right; 376 catNode._left = atom; 377 } 378 379 catNode._right = new StarNode(atom._clone(startPosition)); 380 } else { 381 max = __parseUnsignedInteger(10, 1, Integer.MAX_VALUE); 383 __match('}'); 384 385 if(max < min) 386 throw 387 new MalformedPatternException("Parse error: invalid interval; " 388 + max + " is less than " + min + " at position " + __bytesRead); 389 if(max == 0) 390 throw 391 new MalformedPatternException( 392 "Parse error: Superfluous interval specified at position " + 393 __bytesRead + ". Number of occurences was set to zero."); 394 395 if(min == 0) { 396 if(max == 1) 397 return new QuestionNode(atom); 398 399 root = catNode = new CatNode(); 400 atom = new QuestionNode(atom); 401 catNode._left = atom; 402 403 while(--max > 1) { 404 atom = atom._clone(startPosition); 405 406 catNode._right = new CatNode(); 407 catNode = (CatNode)catNode._right; 408 catNode._left = atom; 409 } 410 411 catNode._right = atom._clone(startPosition); 412 } else if(min == max) { 413 if(min == 1) 414 return atom; 415 416 root = catNode = new CatNode(); 417 catNode._left = atom; 418 419 while(--min > 1) { 420 atom = atom._clone(startPosition); 421 422 catNode._right = new CatNode(); 423 catNode = (CatNode)catNode._right; 424 catNode._left = atom; 425 } 426 427 catNode._right = atom._clone(startPosition); 428 } else { 429 int count; 430 431 root = catNode = new CatNode(); 432 catNode._left = atom; 433 434 for(count=1; count < min; count++) { 435 atom = atom._clone(startPosition); 436 437 catNode._right = new CatNode(); 438 catNode = (CatNode)catNode._right; 439 catNode._left = atom; 440 } 441 442 atom = new QuestionNode(atom._clone(startPosition)); 443 444 count = max-min; 445 446 if(count == 1) 447 catNode._right = atom; 448 else { 449 catNode._right = new CatNode(); 450 catNode = (CatNode)catNode._right; 451 catNode._left = atom; 452 453 while(--count > 1) { 454 atom = atom._clone(startPosition); 455 456 catNode._right = new CatNode(); 457 catNode = (CatNode)catNode._right; 458 catNode._left = atom; 459 } 460 461 catNode._right = atom._clone(startPosition); 462 } 463 } 464 } 465 } else 466 throw 467 new MalformedPatternException("Parse error: unexpected character " + 468 __lookahead + " in interval at position " + __bytesRead); 469 __position = startPosition[0]; 470 return root; 471 } 472 473 474 private SyntaxNode __backslashToken() throws MalformedPatternException { 475 SyntaxNode current; 476 char token; 477 int number; 478 479 __match('\\'); 480 481 if(__lookahead == 'x'){ 482 __match('x'); 483 current = _newTokenNode((char)__parseUnsignedInteger(16, 2, 2), 485 __position++); 486 } else if(__lookahead == 'c') { 487 __match('c'); 488 token = Character.toUpperCase(__lookahead); 490 token = (char)(token > 63 ? token - 64 : token + 64); 491 current = new TokenNode(token, __position++); 492 __match(__lookahead); 493 } else if(__lookahead >= '0' && __lookahead <= '9') { 494 __match(__lookahead); 495 496 if(__lookahead >= '0' && __lookahead <= '9'){ 497 __putback(); 500 number = __parseUnsignedInteger(10, 2, 3); 501 number = Integer.parseInt(Integer.toString(number), 8); 502 current = _newTokenNode((char)number, __position++); 503 } else { 504 __putback(); 506 if(__lookahead == '0'){ 507 __match('0'); 509 current = new TokenNode('\0', __position++); 510 } else { 511 number = Character.digit(__lookahead, 10); 513 current = _newTokenNode(__lookahead, __position++); 514 __match(__lookahead); 515 } 516 } 517 } else if(__lookahead == 'b') { 518 current = new TokenNode('\b', __position++); 523 528 __match('b'); 529 } else { 534 CharacterClassNode characterSet; 535 token = __lookahead; 536 537 switch(__lookahead){ 538 case 'n' : token = '\n'; break; 539 case 'r' : token = '\r'; break; 540 case 't' : token = '\t'; break; 541 case 'f' : token = '\f'; break; 542 } 543 544 switch(token) { 545 case 'd' : 546 characterSet = new CharacterClassNode(__position++); 547 characterSet._addTokenRange('0', '9'); 548 current = characterSet; 549 break; 550 case 'D' : 551 characterSet = new NegativeCharacterClassNode(__position++); 552 characterSet._addTokenRange('0', '9'); 553 current = characterSet; 554 break; 555 case 'w' : 556 characterSet = new CharacterClassNode(__position++); 557 characterSet._addTokenRange('0', '9'); 558 characterSet._addTokenRange('a', 'z'); 559 characterSet._addTokenRange('A', 'Z'); 560 characterSet._addToken('_'); 561 current = characterSet; 562 break; 563 case 'W' : 564 characterSet = new NegativeCharacterClassNode(__position++); 565 characterSet._addTokenRange('0', '9'); 566 characterSet._addTokenRange('a', 'z'); 567 characterSet._addTokenRange('A', 'Z'); 568 characterSet._addToken('_'); 569 current = characterSet; 570 break; 571 case 's' : 572 characterSet = new CharacterClassNode(__position++); 573 characterSet._addToken(' '); 574 characterSet._addToken('\f'); 575 characterSet._addToken('\n'); 576 characterSet._addToken('\r'); 577 characterSet._addToken('\t'); 578 current = characterSet; 579 break; 580 case 'S' : 581 characterSet = new NegativeCharacterClassNode(__position++); 582 characterSet._addToken(' '); 583 characterSet._addToken('\f'); 584 characterSet._addToken('\n'); 585 characterSet._addToken('\r'); 586 characterSet._addToken('\t'); 587 current = characterSet; 588 break; 589 default : current = _newTokenNode(token, __position++); break; 590 } 591 592 __match(__lookahead); 593 } 594 595 return current; 596 } 597 598 private SyntaxNode __atom() throws MalformedPatternException { 599 SyntaxNode current; 600 601 if(__lookahead == '(') { 602 __match('('); 603 ++__openParen; 604 current = __regex(); 605 __match(')'); 606 ++__closeParen; 607 } else if(__lookahead == '[') 608 current = __characterClass(); 609 else if(__lookahead == '.') { 610 CharacterClassNode characterSet; 611 612 __match('.'); 613 characterSet = new NegativeCharacterClassNode(__position++); 614 characterSet._addToken('\n'); 615 current = characterSet; 616 } else if(__lookahead == '\\') { 617 current = __backslashToken(); 618 } else if(!__isMetachar(__lookahead)) { 627 current = _newTokenNode(__lookahead, __position++); 628 __match(__lookahead); 629 } else 630 throw 631 new MalformedPatternException("Parse error: unexpected character " + 632 __lookahead + " at position " + __bytesRead); 633 634 return current; 635 } 636 637 638 private SyntaxNode __characterClass() throws MalformedPatternException { 639 char lastToken, token; 640 SyntaxNode node; 641 CharacterClassNode current; 642 643 __match('['); 644 __inCharacterClass = true; 645 646 if(__lookahead == '^'){ 647 __match('^'); 648 current = new NegativeCharacterClassNode(__position++); 649 } else 650 current = new CharacterClassNode(__position++); 651 652 while(__lookahead != ']' && __lookahead != _END_OF_INPUT) { 653 654 if(__lookahead == '\\'){ 655 node = __backslashToken(); 656 --__position; 657 658 if(node instanceof TokenNode){ 661 lastToken = ((TokenNode)node)._token; 662 current._addToken(lastToken); 663 if(!__caseSensitive) 664 current._addToken(_toggleCase(lastToken)); 665 } else { 666 CharacterClassNode slash; 667 slash = (CharacterClassNode)node; 668 for(token=0; token < LeafNode._NUM_TOKENS; token++){ 672 if(slash._matches(token)) 673 current._addToken(token); 674 } 675 676 continue; 681 } 682 } else { 683 lastToken = __lookahead; 684 current._addToken(__lookahead); 685 if(!__caseSensitive) 686 current._addToken(_toggleCase(__lookahead)); 687 __match(__lookahead); 688 } 689 690 if(__lookahead == '-'){ 698 __match('-'); 699 if(__lookahead == ']'){ 700 current._addToken('-'); 701 break; 702 } else if(__lookahead == '\\') { 703 node = __backslashToken(); 704 --__position; 705 if(node instanceof TokenNode) 706 token = ((TokenNode)node)._token; 707 else 708 throw new MalformedPatternException( 709 "Parse error: invalid range specified at position " + __bytesRead); 710 } else { 711 token = __lookahead; 712 __match(__lookahead); 713 } 714 715 if(token < lastToken) 716 throw new MalformedPatternException( 717 "Parse error: invalid range specified at position " + __bytesRead); 718 current._addTokenRange(lastToken + 1, token); 719 if(!__caseSensitive) 720 current._addTokenRange(_toggleCase((char)(lastToken + 1)), 721 _toggleCase(token)); 722 } 723 } 724 725 __match(']'); 726 __inCharacterClass = false; 727 return current; 728 } 729 730 731 SyntaxNode _newTokenNode(char token, int position){ 732 if(!__inCharacterClass && !__caseSensitive && 733 (_isUpperCase(token) || _isLowerCase(token))){ 734 CharacterClassNode node = new CharacterClassNode(position); 735 node._addToken(token); 736 node._addToken(_toggleCase(token)); 737 return node; 738 } 739 740 return new TokenNode(token, position); 741 } 742 743 744 SyntaxTree _parse(char[] expression) throws MalformedPatternException { 745 SyntaxTree tree; 746 747 __openParen = __closeParen = 0; 748 __regularExpression = expression; 749 __bytesRead = 0; 750 __expressionLength = expression.length; 751 __inCharacterClass = false; 752 753 __position = 0; 754 __match(__lookahead); 756 if(__lookahead == '^') { 757 __beginAnchor = true; 758 __match(__lookahead); 759 } 760 761 if(__expressionLength > 0 && expression[__expressionLength - 1] == '$') { 762 --__expressionLength; 763 __endAnchor = true; 764 } 765 766 if(__expressionLength > 1 || (__expressionLength == 1 && !__beginAnchor)) { 767 CatNode root; 768 root = new CatNode(); 769 root._left = __regex(); 770 root._right = 772 new TokenNode((char)LeafNode._END_MARKER_TOKEN, __position++); 773 tree = new SyntaxTree(root, __position); 774 } else 775 tree = new 776 SyntaxTree(new TokenNode((char)LeafNode._END_MARKER_TOKEN, 0), 1); 777 778 tree._computeFollowPositions(); 779 780 return tree; 781 } 782 783 784 798 public Pattern compile(char[] pattern, int options) 799 throws MalformedPatternException 800 { 801 SyntaxTree tree; 802 AwkPattern regexp; 803 804 __beginAnchor = __endAnchor = false; 805 __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0); 806 tree = _parse(pattern); 807 regexp = new AwkPattern(new String (pattern), tree); 808 regexp._options = options; 809 regexp._hasBeginAnchor = __beginAnchor; 810 regexp._hasEndAnchor = __endAnchor; 811 812 return regexp; 813 } 814 815 816 830 public Pattern compile(String pattern, int options) 831 throws MalformedPatternException 832 { 833 SyntaxTree tree; 834 AwkPattern regexp; 835 836 __beginAnchor = __endAnchor = false; 837 __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0); 838 tree = _parse(pattern.toCharArray()); 839 regexp = new AwkPattern(pattern, tree); 840 regexp._options = options; 841 regexp._hasBeginAnchor = __beginAnchor; 842 regexp._hasEndAnchor = __endAnchor; 843 844 return regexp; 845 } 846 847 857 public Pattern compile(char[] pattern) throws MalformedPatternException { 858 return compile(pattern, DEFAULT_MASK); 859 } 860 861 862 872 public Pattern compile(String pattern) throws MalformedPatternException { 873 return compile(pattern, DEFAULT_MASK); 874 } 875 876 } 877 | Popular Tags |