1 package net.sf.saxon.expr; 2 import net.sf.saxon.functions.NormalizeSpace; 3 import net.sf.saxon.trans.StaticError; 4 5 import java.util.ArrayList ; 6 import java.util.List ; 7 8 14 15 16 public final class Tokenizer { 17 18 public int getState() { 19 return state; 20 } 21 22 public void setState(int state) { 23 this.state = state; 24 if (state==DEFAULT_STATE) { 25 precedingToken = Token.UNKNOWN; 27 currentToken = Token.UNKNOWN; 28 } else if (state==OPERATOR_STATE) { 29 precedingToken = Token.RPAR; 30 currentToken = Token.RPAR; 31 } 32 } 33 34 private int state = DEFAULT_STATE; 35 37 40 public static final int DEFAULT_STATE = 0; 41 42 45 public static final int BARE_NAME_STATE = 1; 46 47 50 public static final int SEQUENCE_TYPE_STATE = 2; 51 54 55 public static final int OPERATOR_STATE = 3; 56 57 60 public int startLineNumber; 61 64 public int currentToken = Token.EOF; 65 68 public String currentTokenValue = null; 69 72 public int currentTokenStartOffset = 0; 73 76 private int nextToken = Token.EOF; 77 80 private String nextTokenValue = null; 81 84 private int nextTokenStartOffset = 0; 85 88 public String input; 89 92 public int inputOffset = 0; 93 96 private int inputLength; 97 100 private int lineNumber = 1; 101 104 private int nextLineNumber = 1; 105 106 110 111 private List newlineOffsets = null; 112 113 116 private int precedingToken = Token.UNKNOWN; 117 118 119 122 126 137 public void tokenize(String input, int start, int end, int lineNumber) throws StaticError { 138 nextToken = Token.EOF; 139 nextTokenValue = null; 140 nextTokenStartOffset = 0; 141 inputOffset = start; 142 this.input = input; 143 this.startLineNumber = lineNumber; 144 this.lineNumber = lineNumber; 145 this.nextLineNumber = lineNumber; 146 if (end==-1) { 147 this.inputLength = input.length(); 148 } else { 149 this.inputLength = end; 150 } 151 152 158 lookAhead(); 159 next(); 160 } 161 162 169 175 176 public void next() throws StaticError { 177 precedingToken = currentToken; 178 currentToken = nextToken; 179 currentTokenValue = nextTokenValue; 180 if (currentTokenValue==null) { 181 currentTokenValue=""; 182 } 183 currentTokenStartOffset = nextTokenStartOffset; 184 lineNumber = nextLineNumber; 185 186 188 switch (currentToken) { 189 case Token.NAME: 190 int optype = getBinaryOp(currentTokenValue); 191 if (optype!=Token.UNKNOWN && !followsOperator()) { 192 currentToken = optype; 193 } 194 break; 195 case Token.LT: 196 if (followsOperator()) { 197 currentToken = Token.TAG; 198 } 199 break; 200 case Token.STAR: 201 if (!followsOperator()) { 202 currentToken = Token.MULT; 203 } 204 break; 205 } 206 207 if (currentToken == Token.TAG || currentToken == Token.RCURLY) { 208 return; 212 } 213 214 lookAhead(); 215 216 if (currentToken == Token.NAME) { 217 if (state == BARE_NAME_STATE) { 218 return; 219 } 220 switch (nextToken) { 221 case Token.LPAR: 222 int op = getBinaryOp(currentTokenValue); 223 if (op == Token.UNKNOWN) { 224 currentToken = getFunctionType(currentTokenValue); 225 lookAhead(); } else { 227 currentToken = op; 228 } 229 break; 230 231 case Token.LCURLY: 232 if (!(state == SEQUENCE_TYPE_STATE)) { 233 currentToken = Token.KEYWORD_CURLY; 234 lookAhead(); } 236 break; 237 238 case Token.COLONCOLON: 239 lookAhead(); 240 currentToken = Token.AXIS; 241 break; 242 243 case Token.COLONSTAR: 244 lookAhead(); 245 currentToken = Token.PREFIX; 246 break; 247 248 case Token.DOLLAR: 249 if (currentTokenValue=="for") { 250 currentToken = Token.FOR; 251 } else if (currentTokenValue=="some") { 252 currentToken = Token.SOME; 253 } else if (currentTokenValue=="every") { 254 currentToken = Token.EVERY; 255 } else if (currentTokenValue=="let") { 256 currentToken = Token.LET; 257 } 258 break; 259 260 case Token.NAME: 261 int candidate = -1; 262 if (currentTokenValue.equals("element")) { 263 candidate = Token.ELEMENT_QNAME; 264 } else if (currentTokenValue.equals("attribute")) { 265 candidate = Token.ATTRIBUTE_QNAME; 266 } else if (currentTokenValue.equals("processing-instruction")) { 267 candidate = Token.PI_QNAME; 268 } 269 if (candidate != -1) { 270 274 String qname = nextTokenValue; 275 String saveTokenValue = currentTokenValue; 276 int savePosition = inputOffset; 277 lookAhead(); 278 if (nextToken == Token.LCURLY) { 279 currentToken = candidate; 280 currentTokenValue = qname; 281 lookAhead(); 282 return; 283 } else { 284 currentToken = Token.NAME; 287 currentTokenValue = saveTokenValue; 288 inputOffset = savePosition; 289 nextToken = Token.NAME; 290 nextTokenValue = qname; 291 } 292 293 } 294 String composite = currentTokenValue + ' ' + nextTokenValue; 295 Integer val = (Integer )Token.doubleKeywords.get(composite); 296 if (val==null) { 297 break; 298 } else { 299 currentToken = val.intValue(); 300 currentTokenValue = composite; 301 lookAhead(); 302 return; 303 } 304 default: 305 } 307 } 308 } 309 310 313 314 public void treatCurrentAsOperator() { 315 switch (currentToken) { 316 case Token.NAME: 317 int optype = getBinaryOp(currentTokenValue); 318 if (optype!=Token.UNKNOWN) { 319 currentToken = optype; 320 } 321 break; 322 case Token.STAR: 323 currentToken = Token.MULT; 324 break; 325 } 326 } 327 328 335 public void lookAhead() throws StaticError { 336 precedingToken = nextToken; 337 nextTokenValue = null; 338 nextTokenStartOffset = inputOffset; 339 for (;;) { 340 if (inputOffset >= inputLength) { 341 nextToken = Token.EOF; 342 return; 343 } 344 char c = input.charAt(inputOffset++); 345 switch (c) { 346 case '/': 347 if (inputOffset < inputLength 348 && input.charAt(inputOffset) == '/') { 349 inputOffset++; 350 nextToken = Token.SLSL; 351 return; 352 } 353 nextToken = Token.SLASH; 354 return; 355 case ':': 356 if (inputOffset < inputLength) { 357 if (input.charAt(inputOffset) == ':') { 358 inputOffset++; 359 nextToken = Token.COLONCOLON; 360 return; 361 } else if (input.charAt(inputOffset) == '=') { 362 nextToken = Token.ASSIGN; 363 inputOffset++; 364 return; 365 } 366 } 367 throw new StaticError("Unexpected colon at start of token"); 368 case '@': 369 nextToken = Token.AT; 370 return; 371 case '?': 372 nextToken = Token.QMARK; 373 return; 374 case '[': 375 nextToken = Token.LSQB; 376 return; 377 case ']': 378 nextToken = Token.RSQB; 379 return; 380 case '{': 381 nextToken = Token.LCURLY; 382 return; 383 case '}': 384 nextToken = Token.RCURLY; 385 return; 386 case ';': 387 nextToken = Token.SEMICOLON; 388 state = DEFAULT_STATE; 389 return; 390 case '(': 391 if (inputOffset < inputLength && input.charAt(inputOffset) == '#') { 392 inputOffset++; 393 int pragmaStart = inputOffset; 394 int nestingDepth = 1; 395 while (nestingDepth > 0 && inputOffset < (inputLength-1)) { 396 if (input.charAt(inputOffset) == '\n') { 397 incrementLineNumber(); 398 } else if (input.charAt(inputOffset) == '#' && 399 input.charAt(inputOffset+1) == ')') { 400 nestingDepth--; 401 inputOffset++; 402 } else if (input.charAt(inputOffset) == '(' && 403 input.charAt(inputOffset+1) == '#') { 404 nestingDepth++; 405 inputOffset++; 406 } 407 inputOffset++; 408 } 409 if (nestingDepth > 0) { 410 throw new StaticError("Unclosed XPath comment"); 411 } 412 nextToken = Token.PRAGMA; 413 nextTokenValue = input.substring(pragmaStart, inputOffset-2 ); 414 return; 415 } 416 if (inputOffset < inputLength && input.charAt(inputOffset) == ':') { 417 inputOffset++; 420 int nestingDepth = 1; 421 while (nestingDepth > 0 && inputOffset < (inputLength-1)) { 422 if (input.charAt(inputOffset) == '\n') { 423 incrementLineNumber(); 424 } else if (input.charAt(inputOffset) == ':' && 425 input.charAt(inputOffset+1) == ')') { 426 if (input.charAt(inputOffset-2) == '(' && 427 input.charAt(inputOffset-1) == ':') { 428 throw new StaticError("Empty XPath comments are not allowed"); 429 } 430 nestingDepth--; 431 inputOffset++; 432 } else if (input.charAt(inputOffset) == '(' && 433 input.charAt(inputOffset+1) == ':') { 434 nestingDepth++; 435 inputOffset++; 436 } 437 inputOffset++; 438 } 439 if (nestingDepth > 0) { 440 throw new StaticError("Unclosed XPath comment"); 441 } 442 lookAhead(); 443 } else { 444 nextToken = Token.LPAR; 445 } 446 return; 447 case ')': 448 nextToken = Token.RPAR; 449 return; 450 case '+': 451 nextToken = Token.PLUS; 452 return; 453 case '-': 454 nextToken = Token.MINUS; return; 456 case '=': 457 nextToken = Token.EQUALS; 458 return; 459 case '!': 460 if (inputOffset < inputLength 461 && input.charAt(inputOffset) == '=') { 462 inputOffset++; 463 nextToken = Token.NE; 464 return; 465 } 466 throw new StaticError("'!' without '='"); 467 case '*': 468 if (inputOffset < inputLength 471 && input.charAt(inputOffset) == ':') { 472 inputOffset++; 473 nextToken = Token.SUFFIX; 474 if (inputOffset < inputLength) { 477 char ahead = input.charAt(inputOffset); 478 if (" \r\t\n".indexOf(ahead) >= 0) { 479 throw new StaticError("Whitespace is not allowed after '*:'"); 480 } 481 } 482 return; 483 } 484 nextToken = Token.STAR; 485 return; 489 case ',': 490 nextToken = Token.COMMA; 491 return; 492 case '$': 493 nextToken = Token.DOLLAR; 494 return; 495 case '|': 496 nextToken = Token.UNION; 497 return; 498 case '<': 499 if (inputOffset < inputLength 500 && input.charAt(inputOffset) == '=') { 501 inputOffset++; 502 nextToken = Token.LE; 503 return; 504 } 505 if (inputOffset < inputLength 506 && input.charAt(inputOffset) == '<') { 507 inputOffset++; 508 nextToken = Token.PRECEDES; 509 return; 510 } 511 nextToken = Token.LT; 512 return; 513 case '>': 514 if (inputOffset < inputLength 515 && input.charAt(inputOffset) == '=') { 516 inputOffset++; 517 nextToken = Token.GE; 518 return; 519 } 520 if (inputOffset < inputLength 521 && input.charAt(inputOffset) == '>') { 522 inputOffset++; 523 nextToken = Token.FOLLOWS; 524 return; 525 } 526 nextToken = Token.GT; 527 return; 528 case '.': 529 if (inputOffset < inputLength 530 && input.charAt(inputOffset) == '.') { 531 inputOffset++; 532 nextToken = Token.DOTDOT; 533 return; 534 } 535 if (inputOffset == inputLength 536 || input.charAt(inputOffset) < '0' 537 || input.charAt(inputOffset) > '9') { 538 nextToken = Token.DOT; 539 return; 540 } 541 case '0': 543 case '1': 544 case '2': 545 case '3': 546 case '4': 547 case '5': 548 case '6': 549 case '7': 550 case '8': 551 case '9': 552 boolean allowE = true; 557 boolean allowSign = false; 558 boolean allowDot = true; 559 boolean endOfNum = false; 560 numloop: 561 while (!endOfNum) { 562 switch (c) { 563 case '0': case '1': case '2': case '3': case '4': 564 case '5': case '6': case '7': case '8': case '9': 565 allowSign = false; 566 break; 567 case '.': 568 if (allowDot) { 569 allowDot = false; 570 allowSign = false; 571 } else { 572 inputOffset--; 573 break numloop; 574 } 575 break; 576 case 'E': case 'e': 577 if (allowE) { 578 allowSign = true; 579 allowE = false; 580 } else { 581 inputOffset--; 582 break numloop; 583 } 584 break; 585 case '+': case '-': 586 if (allowSign) { 587 allowSign = false; 588 } else { 589 inputOffset--; 590 break numloop; 591 } 592 break; 593 default: 594 if (('a' <= c && c <= 'z') || c>127) { 595 throw new StaticError("Separator needed after numeric literal"); 597 } 598 inputOffset--; 599 break numloop; 600 } 601 if (inputOffset >= inputLength) break; 602 c = input.charAt(inputOffset++); 603 } 604 nextTokenValue = input.substring(nextTokenStartOffset, inputOffset); 605 nextToken = Token.NUMBER; 606 return; 607 case '"': 608 case '\'': 609 nextTokenValue = ""; 610 while (true) { 611 inputOffset = input.indexOf(c, inputOffset); 612 if (inputOffset < 0) { 613 inputOffset = nextTokenStartOffset + 1; 614 throw new StaticError("Unmatched quote in expression"); 615 } 616 nextTokenValue += input.substring(nextTokenStartOffset + 1, inputOffset++); 617 if (inputOffset < inputLength && input.charAt(inputOffset) == c) { 619 nextTokenValue += c; 620 nextTokenStartOffset = inputOffset; 621 inputOffset++; 622 } else { 623 break; 624 } 625 } 626 627 if (nextTokenValue.indexOf('\n') >= 0) { 629 for (int i = 0; i<nextTokenValue.length(); i++) { 630 if (nextTokenValue.charAt(i) == '\n') { 631 lineNumber++; 632 if (newlineOffsets==null) { 633 newlineOffsets = new ArrayList (20); 634 } 635 newlineOffsets.add(new Integer (nextTokenStartOffset+i)); 636 } 637 } 638 } 639 nextTokenValue = nextTokenValue.intern(); 640 nextToken = Token.STRING_LITERAL; 641 return; 642 case '\n': 643 incrementLineNumber(); 644 case ' ': 646 case '\t': 647 case '\r': 648 nextTokenStartOffset = inputOffset; 649 break; 650 default: 651 if (c < 0x80 && !Character.isLetter(c)) { 652 throw new StaticError("Invalid character '" + c + "' in expression"); 653 } 654 655 case '_': 656 loop: 657 for (;inputOffset < inputLength; inputOffset++) { 658 c = input.charAt(inputOffset); 659 switch (c) { 660 case ':': 661 if (inputOffset+1 < inputLength) { 662 char nc = input.charAt(inputOffset+1); 663 if (nc == ':') { 664 nextTokenValue = input.substring(nextTokenStartOffset, 665 inputOffset).intern(); 666 nextToken = Token.AXIS; 667 inputOffset+=2; 668 return; 669 } else if (nc == '*') { 670 nextTokenValue = input.substring(nextTokenStartOffset, 671 inputOffset).intern(); 672 nextToken = Token.PREFIX; 673 inputOffset+=2; 674 return; 675 } else if (nc == '=') { 676 nextTokenValue = input.substring(nextTokenStartOffset, 678 inputOffset).intern(); 679 nextToken = Token.NAME; 680 return; 681 } 682 } 683 break; 684 case '.': 685 case '-': 686 case '_': 687 break; 688 689 default: 690 if (c < 0x80 && !Character.isLetterOrDigit(c)) 691 break loop; 692 break; 693 } 694 } 695 nextTokenValue = input.substring(nextTokenStartOffset, 696 inputOffset).intern(); 697 nextToken = Token.NAME; 698 return; 699 } 700 } 701 } 702 703 710 711 private static int getBinaryOp(String s) { 712 switch(s.length()) { 713 case 2: 714 if (s=="or") return Token.OR; 715 if (s=="is") return Token.IS; 716 if (s=="to") return Token.TO; 717 if (s=="in") return Token.IN; 718 if (s=="eq") return Token.FEQ; 719 if (s=="ne") return Token.FNE; 720 if (s=="gt") return Token.FGT; 721 if (s=="ge") return Token.FGE; 722 if (s=="lt") return Token.FLT; 723 if (s=="le") return Token.FLE; 724 break; 725 case 3: 726 if (s=="and") return Token.AND; 727 if (s=="div") return Token.DIV; 728 if (s=="mod") return Token.MOD; 729 break; 730 case 4: 731 if (s=="idiv") return Token.IDIV; 732 if (s=="then") return Token.THEN; 733 if (s=="else") return Token.ELSE; 734 if (s=="case") return Token.CASE; 735 break; 736 case 5: 737 if (s=="where") return Token.WHERE; 738 if (s=="union") return Token.UNION; 739 break; 740 case 6: 741 if (s=="except") return Token.EXCEPT; 742 if (s=="return") return Token.RETURN; 743 break; 744 case 7: 745 if (s=="default") return Token.DEFAULT; 746 case 9: 747 if (s=="intersect") return Token.INTERSECT; 748 if (s=="satisfies") return Token.SATISFIES; 749 break; 750 } 751 return Token.UNKNOWN; 752 } 753 754 761 762 private static int getFunctionType(String s) { 763 switch(s.length()) { 764 case 2: 765 if (s=="if") return Token.IF; 766 break; 767 case 4: 768 if (s=="node") return Token.NODEKIND; 769 if (s=="item") return Token.NODEKIND; 770 if (s=="text") return Token.NODEKIND; 771 if (s=="void") return Token.NODEKIND; break; 773 case 7: 774 if (s=="element") return Token.NODEKIND; 775 if (s=="comment") return Token.NODEKIND; 776 break; 777 case 9: 778 if (s=="attribute") return Token.NODEKIND; 779 if (s=="namespace") return Token.NODEKIND; 780 break; 781 case 10: 782 if (s=="typeswitch") return Token.TYPESWITCH; 783 break; 784 default: 785 if (s=="document-node") return Token.NODEKIND; 786 if (s=="empty-sequence") return Token.NODEKIND; 787 if (s=="schema-element") return Token.NODEKIND; 788 if (s=="schema-attribute") return Token.NODEKIND; 789 if (s=="processing-instruction") return Token.NODEKIND; 790 791 break; 792 } 793 return Token.FUNCTION; 794 } 795 796 800 801 private boolean followsOperator() { 802 return precedingToken <= Token.LAST_OPERATOR; 803 } 804 805 812 813 public char nextChar() throws StringIndexOutOfBoundsException { 814 char c = input.charAt(inputOffset++); 815 if (c=='\n') { 817 incrementLineNumber(); 818 lineNumber++; 819 } 820 return c; 821 } 822 823 829 830 848 851 852 private void incrementLineNumber() { 853 nextLineNumber++; 854 if (newlineOffsets==null) { 855 newlineOffsets = new ArrayList (20); 856 } 857 newlineOffsets.add(new Integer (inputOffset-1)); 858 } 859 860 863 864 public void unreadChar() { 865 if (input.charAt(--inputOffset) == '\n') { 866 nextLineNumber--; 867 lineNumber--; 868 if (newlineOffsets != null) { 869 newlineOffsets.remove(newlineOffsets.size()-1); 870 } 871 } 872 } 873 874 877 878 public String recentText() { 879 if (inputOffset > inputLength) { 880 inputOffset = inputLength; 881 } 882 if (inputOffset < 34) { 883 return input.substring(0, inputOffset); 884 } else { 885 return NormalizeSpace.normalize( 886 "..." + input.substring(inputOffset-30, inputOffset)).toString(); 887 } 888 } 889 890 893 894 public int getLineNumber() { 895 return lineNumber; 896 } 897 898 901 902 public int getColumnNumber() { 903 return (int)(getLineAndColumn(currentTokenStartOffset)&0x7fffffff); 904 } 905 906 919 920 926 927 public long getLineAndColumn(int offset) { 928 if (newlineOffsets==null) { 929 return ((long)startLineNumber) << 32 | (long)offset; 930 } 931 for (int line=newlineOffsets.size()-1; line>=0; line--) { 932 int nloffset = ((Integer )newlineOffsets.get(line)).intValue(); 933 if (offset > nloffset) { 934 return ((long)(line+startLineNumber+1)<<32) | ((long)(offset - nloffset)); 935 } 936 } 937 return ((long)startLineNumber) << 32 | (long)(offset+1); 938 } 939 940 public int getLineNumber(int offset) { 941 return (int)((getLineAndColumn(offset))>>32); 942 } 943 944 public int getColumnNumber(int offset) { 945 return (int)((getLineAndColumn(offset))&0x7fffffff); 946 } 947 948 } 949 950 982 983 | Popular Tags |