1 13 14 package org.netbeans.lib.xml.lexer; 15 16 import org.netbeans.api.xml.lexer.XMLTokenId; 17 import org.netbeans.api.lexer.Token; 18 import org.netbeans.spi.lexer.Lexer; 19 import org.netbeans.spi.lexer.LexerInput; 20 import org.netbeans.spi.lexer.LexerRestartInfo; 21 import org.netbeans.spi.lexer.TokenFactory; 22 23 33 34 public class XMLLexer implements Lexer<XMLTokenId> { 35 private LexerInput input; 36 37 private TokenFactory<XMLTokenId> tokenFactory; 38 39 public Object state() { 40 Integer encoded = (subState << 020) + (this.state << 010) + (subInternalDTD ? 1 : 0); 41 return encoded; 42 } 43 44 private void loadState(final Object state) { 45 if (state == null) { 46 subState = INIT; 47 this.state = INIT; 48 subInternalDTD = false; 49 } else { 50 int encoded = ((Integer ) state).intValue(); 51 52 subState = (encoded & 0xff0000) >> 020; 53 this.state = (encoded & 0xff00) >> 010; 54 subInternalDTD = encoded % 2 == 1; 55 } 56 } 57 58 64 protected int state = INIT; 65 66 72 protected int subState = INIT; 73 74 79 protected boolean subInternalDTD = false; 80 81 82 public static final int INIT = 0; 83 84 88 89 private static final int ISI_TEXT = 1; private static final int ISI_ERROR = 2; private static final int ISA_LT = 3; private static final int ISA_SLASH = 4; private static final int ISI_ENDTAG = 5; private static final int ISP_ENDTAG_X = 6; private static final int ISP_ENDTAG_WS = 7; private static final int ISI_TAG = 8; private static final int ISP_TAG_X = 9; private static final int ISP_TAG_WS = 10; private static final int ISI_ARG = 11; private static final int ISP_ARG_X = 12; private static final int ISP_ARG_WS = 13; private static final int ISP_EQ = 14; private static final int ISP_EQ_WS = 15; private static final int ISI_VAL_APOS = 17; private static final int ISI_VAL_QUOT = 18; private static final int ISA_SGML_ESCAPE = 19; private static final int ISA_SGML_DASH = 20; private static final int ISI_XML_COMMENT = 21; private static final int ISA_XML_COMMENT_DASH = 22; private static final int ISI_XML_COMMENT_WS = 23; private static final int ISI_SGML_DECL = 24; 112 private static final int ISA_SGML_DECL_DASH = 25; 113 private static final int ISA_REF = 28; private static final int ISI_REF_NAME = 29; private static final int ISA_REF_HASH = 30; private static final int ISI_REF_DEC = 31; private static final int ISA_REF_X = 32; private static final int ISI_REF_HEX = 33; 122 123 private static final int ISI_PI = 35; private static final int ISI_PI_TARGET = 36; private static final int ISP_PI_TARGET_WS = 37; private static final int ISI_PI_CONTENT = 38; private static final int ISA_PI_CONTENT_QMARK = 39; private static final int ISP_PI_CONTENT_QMARK = 40; 130 private static final int ISA_LTEXBR = 41; 132 private static final int ISA_LTEXBRC = 42; 133 private static final int ISA_LTEXBRCD = 43; 134 private static final int ISA_LTEXBRCDA = 44; 135 private static final int ISA_LTEXBRCDAT = 45; 136 private static final int ISA_LTEXBRCDATA = 46; 137 private static final int ISI_CDATA = 47; 138 private static final int ISA_CDATA_BR = 48; 139 private static final int ISA_CDATA_BRBR = 49; 140 141 private static final int ISI_DECL_CHARS = 50; 143 private static final int ISI_DECL_STRING = 51; 144 private static final int ISP_DECL_CHARS = 52; 145 private static final int ISP_DECL_STRING = 53; 146 147 private static final int ISA_INIT_BR = 54; 149 150 public XMLLexer(LexerRestartInfo<XMLTokenId> info) { 151 this.input = info.input(); 152 this.tokenFactory = info.tokenFactory(); 153 loadState(info.state()); 154 } 155 156 private final boolean isAZ( int ch ) { 157 return( (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ); 158 } 159 160 170 171 private final boolean isWS( int ch ) { 172 return Character.isWhitespace(ch); 173 } 176 177 private void enterInternalDTD() { 178 subInternalDTD = true; 179 } 180 181 private void leaveInternalDTD() { 182 subInternalDTD = false; 183 } 184 185 private boolean isInternalDTD() { 186 return subInternalDTD; 187 } 188 189 public Token<XMLTokenId> nextToken() { 190 191 int actChar; 192 while(true) { 193 actChar = input.read(); 194 195 if (actChar == LexerInput.EOF){ 196 197 if (input.readLength() == 0){ 198 return null; 199 } 200 201 input.backup(1); 202 break; 203 } 204 205 switch( state ) { 206 case INIT: switch( actChar ) { 208 case '<': 209 state = ISA_LT; 210 break; 211 case '&': 212 if (isInternalDTD() == false) { 213 state = ISA_REF; 214 subState = ISI_TEXT; 215 } else { 216 state = ISI_TEXT; 217 } 218 break; 219 case '%': 220 if (isInternalDTD()) { 221 state = ISA_REF; 222 subState = INIT; 223 } else { 224 state = ISI_TEXT; 225 } 226 break; 227 case ']': 228 if (isInternalDTD()) { 229 state = ISA_INIT_BR; 230 } else { 231 state = ISI_TEXT; 232 } 233 break; 234 default: 235 state = ISI_TEXT; 236 break; 237 } 238 239 break; 240 241 case ISI_TEXT: switch( actChar ) { 243 case '<': 244 state = INIT; 245 input.backup(1); 246 return token(XMLTokenId.TEXT); 247 case '&': 248 if (isInternalDTD() == false) { 249 state = INIT; 250 input.backup(1); 251 return token(XMLTokenId.TEXT); 252 } 253 break; 254 case '%': 255 if (isInternalDTD()) { 256 state = INIT; 257 input.backup(1); 258 return token(XMLTokenId.TEXT); 259 } 260 break; 261 case ']': 262 if (isInternalDTD()) { 263 state = ISA_INIT_BR; 264 } 265 break; 266 } 267 break; 268 269 case ISI_ERROR: state = INIT; 271 return token(XMLTokenId.ERROR); 272 273 case ISA_LT: 275 if( UnicodeClasses.isXMLNameStartChar( actChar ) && isInternalDTD() == false) { 276 state = ISI_TAG; 277 break; 278 } 279 switch( actChar ) { 280 case '/': state = ISA_SLASH; 282 break; 283 case '!': 284 state = ISA_SGML_ESCAPE; 285 break; 286 case '?': 287 state = ISI_PI; 288 return token(XMLTokenId.PI_START); 289 default: 290 state = ISI_TEXT; continue; } 293 break; 294 295 case ISI_PI: 296 if ( UnicodeClasses.isXMLNameStartChar( actChar )) { 297 state = ISI_PI_TARGET; 298 break; 299 } 300 state = ISI_ERROR; 301 break; 302 303 case ISI_PI_TARGET: 304 if ( UnicodeClasses.isXMLNameChar( actChar )) break; 305 if (isWS( actChar )) { 306 state = ISP_PI_TARGET_WS; 307 input.backup(1); 308 return token(XMLTokenId.PI_TARGET); 309 } 310 state = ISI_ERROR; 311 break; 312 313 case ISP_PI_TARGET_WS: 314 if (isWS( actChar)) break; 315 state = ISI_PI_CONTENT; 316 input.backup(1); 317 return token(XMLTokenId.WS); 318 319 case ISI_PI_CONTENT: 320 if (actChar != '?') break; state = ISP_PI_CONTENT_QMARK; 322 input.backup(1); 323 return token(XMLTokenId.PI_CONTENT); 325 case ISP_PI_CONTENT_QMARK: 326 if (actChar != '?') throw new IllegalStateException ("'?' expected in ISP_PI_CONTENT_QMARK"); 327 state = ISA_PI_CONTENT_QMARK; 328 break; 329 330 case ISA_PI_CONTENT_QMARK: 331 if (actChar != '>') { 332 state = ISI_PI_CONTENT; 333 break; 334 } 335 state = INIT; 336 return token(XMLTokenId.PI_END); 337 338 case ISA_SLASH: 340 if( UnicodeClasses.isXMLNameStartChar( actChar )){ 341 state = ISI_ENDTAG; 342 break; 343 } 344 switch( actChar ) { 345 case ' ': 346 state = ISI_TEXT; 347 continue; 348 case '\n': 349 state = ISI_TEXT; 350 continue; 351 case '\r': 352 state = ISI_TEXT; 353 continue; 354 default: state = ISI_TEXT; 356 continue; } 358 360 case ISI_ENDTAG: if( UnicodeClasses.isXMLNameChar( actChar )){ 362 break; } 364 365 state = ISP_ENDTAG_X; 366 input.backup(1); 367 return token(XMLTokenId.TAG); 368 369 370 case ISP_ENDTAG_X: if( isWS( actChar ) ) { 372 state = ISP_ENDTAG_WS; 373 break; 374 } 375 switch( actChar ) { 376 case '>': state = INIT; 378 return token(XMLTokenId.TAG); 379 default: 380 state = ISI_ERROR; 381 continue; } 383 385 case ISP_ENDTAG_WS: if( isWS( actChar ) ) break; state = ISP_ENDTAG_X; 388 input.backup(1); 389 return token(XMLTokenId.WS); 390 391 392 case ISI_TAG: if( UnicodeClasses.isXMLNameChar( actChar ) ) break; state = ISP_TAG_X; 395 input.backup(1); 396 return token(XMLTokenId.TAG); 397 398 case ISP_TAG_X: if( isWS( actChar ) ) { 400 state = ISP_TAG_WS; 401 break; 402 } 403 if( UnicodeClasses.isXMLNameStartChar( actChar ) ) { 404 state = ISI_ARG; 405 break; 406 } 407 switch( actChar ) { 408 case '/': 409 break; 410 case '?': break; 412 case '>': 413 state = INIT; 414 return token(XMLTokenId.TAG); 415 default: 416 state = ISI_ERROR; 417 continue; 418 } 419 break; 420 421 422 case ISP_TAG_WS: if( isWS( actChar ) ) break; state = ISP_TAG_X; 426 input.backup(1); 427 return token(XMLTokenId.WS); 428 429 case ISI_ARG: if( UnicodeClasses.isXMLNameChar( actChar ) ) break; state = ISP_ARG_X; 432 input.backup(1); 433 return token(XMLTokenId.ARGUMENT); 434 435 case ISP_ARG_X: 436 if( isWS( actChar ) ) { 437 state = ISP_ARG_WS; 438 break; 439 } 440 switch( actChar ) { 441 case '=': 442 state = ISP_EQ; 443 return token(XMLTokenId.OPERATOR); 444 default: 445 state = ISI_ERROR; 446 continue; 447 } 448 450 case ISP_ARG_WS: 451 if( isWS( actChar ) ) break; state = ISP_ARG_X; 453 input.backup(1); 454 return token(XMLTokenId.WS); 455 456 case ISP_EQ: 457 if( isWS( actChar ) ) { 458 state = ISP_EQ_WS; 459 break; 460 } 461 switch( actChar ) { 462 case '\'': 463 state = ISI_VAL_APOS; 464 break; 465 case '"': 466 state = ISI_VAL_QUOT; 467 break; 468 default: 469 state = ISI_ERROR; 470 continue; 471 } 472 break; 473 474 case ISP_EQ_WS: 475 if( isWS( actChar ) ) break; state = ISP_EQ; 477 input.backup(1); 478 return token(XMLTokenId.WS); 479 480 case ISI_VAL_APOS: 481 switch( actChar ) { 482 case '\'': 483 state = ISP_TAG_X; 484 return token(XMLTokenId.VALUE); 485 case '&': 486 if(input.readLength() == 1) { 487 subState = state; 488 state = ISA_REF; 489 break; 490 } else { 491 input.backup(1); 492 return token(XMLTokenId.VALUE); 493 } 494 } 495 break; 497 case ISI_VAL_QUOT: 498 switch( actChar ) { 499 case '"': 500 state = ISP_TAG_X; 501 return token(XMLTokenId.VALUE); 502 case '&': 503 if(input.readLength() == 1) { 504 subState = state; 505 state = ISA_REF; 506 break; 507 } else { 508 input.backup(1); 509 return token(XMLTokenId.VALUE); 510 } 511 } 512 break; 514 515 case ISA_SGML_ESCAPE: if (actChar == '[') { 517 state = ISA_LTEXBR; 518 break; 519 } else if( isAZ(actChar) ) { 520 state = ISI_SGML_DECL; 521 break; 522 } 523 switch( actChar ) { 524 case '-': 525 state = ISA_SGML_DASH; 526 break; 527 default: 528 state = ISI_TEXT; 529 continue; 530 } 531 break; 532 533 case ISA_LTEXBR: 534 if (actChar == 'C') { 535 state = ISA_LTEXBRC; 536 break; 537 } else { 538 state = ISI_TEXT; 539 continue; 540 } 541 542 case ISA_LTEXBRC: 543 if (actChar == 'D') { 544 state = ISA_LTEXBRCD; 545 break; 546 } else { 547 state = ISI_TEXT; 548 continue; 549 } 550 551 case ISA_LTEXBRCD: 552 if (actChar == 'A') { 553 state = ISA_LTEXBRCDA; 554 break; 555 } else { 556 state = ISI_TEXT; 557 continue; 558 } 559 560 case ISA_LTEXBRCDA: 561 if (actChar == 'T') { 562 state = ISA_LTEXBRCDAT; 563 break; 564 } else { 565 state = ISI_TEXT; 566 continue; 567 } 568 569 case ISA_LTEXBRCDAT: 570 if (actChar == 'A') { 571 state = ISA_LTEXBRCDATA; 572 break; 573 } else { 574 state = ISI_TEXT; 575 continue; 576 } 577 578 case ISA_LTEXBRCDATA: 579 if (actChar == '[') { 580 state = ISI_CDATA; 581 break; 582 } else { 583 state = ISI_TEXT; 584 continue; 585 } 586 587 case ISI_CDATA: 588 if (actChar == ']') { 589 state = ISA_CDATA_BR; 590 break; 591 } 592 593 case ISA_CDATA_BR: 594 if (actChar == ']') { 595 state = ISA_CDATA_BRBR; 596 break; 597 } else { 598 state = ISI_CDATA; 599 break; 600 } 601 602 case ISA_CDATA_BRBR: 603 if (actChar == '>') { 604 state = ISI_TEXT; return token(XMLTokenId.CDATA_SECTION); 606 } else if (actChar == ']') { 607 break; 609 } else { 610 state = ISI_CDATA; 611 break; 612 } 613 614 615 case ISA_SGML_DASH: switch( actChar ) { 617 case '-': 618 state = ISI_XML_COMMENT; 619 break; 620 default: 621 state=ISI_ERROR; 622 continue; 623 } 624 break; 625 626 case ISI_XML_COMMENT: switch( actChar ) { 628 case '-': 629 state = ISA_XML_COMMENT_DASH; 630 break; 631 case '\n': 635 return token(XMLTokenId.BLOCK_COMMENT); 638 } 639 break; 640 641 case ISA_XML_COMMENT_DASH: 642 switch( actChar ) { 643 case '-': 644 state = ISI_XML_COMMENT_WS; 645 break; 646 default: 647 state = ISI_XML_COMMENT; 648 continue; 649 } 650 break; 651 652 case ISI_XML_COMMENT_WS: if( isWS( actChar ) ) break; switch( actChar ) { 655 case '>': 656 state = INIT; 657 return token(XMLTokenId.BLOCK_COMMENT); 658 default: 659 state = ISI_ERROR; 660 input.backup(1); 661 return token(XMLTokenId.BLOCK_COMMENT); 662 } 663 664 case ISP_DECL_STRING: 665 if (actChar != '"') throw new IllegalStateException ("Unexpected " + actChar); 666 state = ISI_DECL_STRING; 667 break; 668 669 case ISI_DECL_STRING: 670 if ( actChar == '"') { 671 state = ISI_SGML_DECL; 672 return token(XMLTokenId.VALUE); 673 } 674 break; 675 676 case ISP_DECL_CHARS: 677 if (actChar != '\'') throw new IllegalStateException ("Unexpected " + actChar); 678 state = ISI_DECL_CHARS; 679 break; 680 681 case ISI_DECL_CHARS: 682 if ( actChar == '\'') { 683 state = ISI_SGML_DECL; 684 return token(XMLTokenId.VALUE); 685 } 686 break; 687 688 case ISI_SGML_DECL: 689 switch( actChar ) { 690 case '"': 691 state = ISP_DECL_STRING; 692 input.backup(1); 693 return token(XMLTokenId.DECLARATION); 694 case '\'': 695 state = ISP_DECL_CHARS; 696 input.backup(1); 697 return token(XMLTokenId.DECLARATION); 698 case '[': 699 state = INIT; 700 enterInternalDTD(); 701 return token(XMLTokenId.DECLARATION); 702 case '>': 703 state = INIT; 704 return token(XMLTokenId.DECLARATION); 705 } 706 break; 707 708 case ISA_INIT_BR: 709 if (isWS(actChar)) break; 710 if (actChar == '>') { 711 state = INIT; 712 leaveInternalDTD(); 713 return token(XMLTokenId.DECLARATION); 714 } else { 715 state = INIT; 716 input.backup(1); 717 return token(XMLTokenId.ERROR); 718 } 719 720 case ISA_SGML_DECL_DASH: 721 if( actChar == '-' ) { 722 state = ISI_ERROR; 723 break; 724 } else { 725 if(isWS(actChar)){ 726 state = ISI_ERROR; 727 continue; 728 } else { 729 state = ISI_SGML_DECL; 730 continue; 731 } 732 } 733 734 case ISA_REF: 735 if( UnicodeClasses.isXMLNameStartChar( actChar ) ) { 736 state = ISI_REF_NAME; 737 break; 738 } 739 if( actChar == '#') { 740 state = ISA_REF_HASH; 741 break; 742 } 743 state = subState; 744 continue; 745 746 case ISI_REF_NAME: 747 if( UnicodeClasses.isXMLNameChar( actChar ) ) break; 748 if( actChar != ';' ) input.backup(1); 749 state = subState; 750 return token(XMLTokenId.CHARACTER); 751 752 case ISA_REF_HASH: 753 if( actChar >= '0' && actChar <= '9' ) { 754 state = ISI_REF_DEC; 755 break; 756 } 757 if( actChar == 'x' || actChar == 'X' ) { 758 state = ISA_REF_X; 759 break; 760 } 761 if( isAZ( actChar ) ) { 762 state = subState; 763 return token(XMLTokenId.ERROR); 764 } 765 state = subState; 766 continue; 767 768 case ISI_REF_DEC: 769 if( actChar >= '0' && actChar <= '9' ) break; 770 if( actChar != ';' ) input.backup(1); 771 state = subState; 772 return token(XMLTokenId.CHARACTER); 773 774 case ISA_REF_X: 775 if (isHex(actChar)) { 776 state = ISI_REF_HEX; 777 break; 778 } 779 state = subState; 780 input.backup(1); 781 return token(XMLTokenId.ERROR); 783 case ISI_REF_HEX: 784 if (isHex(actChar)) break; 785 if (actChar != ';' ) input.backup(1); 786 state = subState; 787 return token(XMLTokenId.CHARACTER); 788 } 789 } 791 switch( state ) { 792 case INIT: 793 case ISI_TEXT: 794 case ISA_LT: 795 case ISA_SLASH: 796 case ISA_SGML_ESCAPE: 797 case ISA_SGML_DASH: 798 return token(XMLTokenId.TEXT); 799 800 case ISA_REF: 801 case ISA_REF_HASH: 802 if( subState == ISI_TEXT ) return token(XMLTokenId.TEXT); 803 else return token(XMLTokenId.VALUE); 804 805 case ISI_XML_COMMENT: 806 case ISA_XML_COMMENT_DASH: 807 case ISI_XML_COMMENT_WS: 808 return token(XMLTokenId.BLOCK_COMMENT); 809 810 case ISI_TAG: 811 case ISI_ENDTAG: 812 return token(XMLTokenId.TAG); 813 814 case ISI_ARG: 815 return token(XMLTokenId.ARGUMENT); 816 817 case ISI_ERROR: 818 return token(XMLTokenId.ERROR); 819 820 case ISP_ARG_WS: 821 case ISP_TAG_WS: 822 case ISP_ENDTAG_WS: 823 case ISP_EQ_WS: 824 return token(XMLTokenId.WS); 825 826 case ISP_ARG_X: 827 case ISP_TAG_X: 828 case ISP_ENDTAG_X: 829 case ISP_EQ: 830 return token(XMLTokenId.WS); 831 832 case ISI_VAL_APOS: 833 case ISI_VAL_QUOT: 834 case ISI_DECL_CHARS: 835 case ISI_DECL_STRING: 836 return token(XMLTokenId.VALUE); 837 838 case ISI_SGML_DECL: 839 case ISA_SGML_DECL_DASH: 840 case ISP_DECL_STRING: 841 case ISP_DECL_CHARS: 842 return token(XMLTokenId.DECLARATION); 843 844 case ISI_REF_NAME: 845 case ISI_REF_DEC: 846 case ISA_REF_X: 847 case ISI_REF_HEX: 848 return token(XMLTokenId.CHARACTER); 849 850 case ISI_PI: 851 return token(XMLTokenId.PI_START); 852 case ISI_PI_TARGET: 853 return token(XMLTokenId.PI_TARGET); 854 case ISP_PI_TARGET_WS: 855 return token(XMLTokenId.WS); 856 case ISI_PI_CONTENT: 857 return token(XMLTokenId.PI_CONTENT); 858 case ISA_PI_CONTENT_QMARK: 859 case ISP_PI_CONTENT_QMARK: 860 return token(XMLTokenId.PI_END); 862 863 case ISA_LTEXBR: 864 case ISA_LTEXBRC: 865 case ISA_LTEXBRCD: 866 case ISA_LTEXBRCDA: 867 case ISA_LTEXBRCDAT: 868 case ISA_LTEXBRCDATA: 869 return token(XMLTokenId.TEXT); 870 871 case ISI_CDATA: 872 case ISA_CDATA_BR: 873 case ISA_CDATA_BRBR: 874 return token(XMLTokenId.CDATA_SECTION); 875 876 case ISA_INIT_BR: 877 return token(XMLTokenId.TEXT); 878 879 default: 880 throw new IllegalStateException ("Last buffer does not handle state " + state + "!"); } 882 883 } 884 885 private Token<XMLTokenId> token(XMLTokenId id) { 886 Token<XMLTokenId> t = tokenFactory.createToken(id); 891 return t; 893 } 894 895 private boolean isHex(int ch) { 896 return (ch >= '0' && ch <= '9') || isAF(ch); 897 } 898 899 private boolean isAF(int ch) { 900 return( (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F') ); 901 } 902 903 public void release() { 904 } 905 906 } 907 | Popular Tags |