1 13 14 package org.netbeans.lib.html.lexer; 15 16 import java.util.logging.Level ; 17 import java.util.logging.Logger ; 18 import org.netbeans.api.html.lexer.HTMLTokenId; 19 import org.netbeans.api.lexer.Token; 20 import org.netbeans.spi.lexer.Lexer; 21 import org.netbeans.spi.lexer.LexerInput; 22 import org.netbeans.spi.lexer.LexerRestartInfo; 23 import org.netbeans.spi.lexer.TokenFactory; 24 25 34 35 public final class HTMLLexer implements Lexer<HTMLTokenId> { 36 37 private static final Logger LOGGER = Logger.getLogger(HTMLLexer.class.getName()); 38 private static final boolean LOG = Boolean.getBoolean("j2ee_lexer_debug"); 40 private static final int EOF = LexerInput.EOF; 41 42 private final LexerInput input; 43 44 private final TokenFactory<HTMLTokenId> tokenFactory; 45 46 public Object state() { 47 return lexerSubState * 1000000 + lexerState * 1000 + lexerScriptState; 48 } 49 50 51 56 private int lexerSubState = INIT; 57 private int lexerState = INIT; 58 59 60 private int lexerScriptState = INIT; 61 62 private static final int ISI_SCRIPT = 1; 65 66 private static final int INIT = 0; 68 private static final int ISI_TEXT = 1; private static final int ISI_ERROR = 2; private static final int ISA_LT = 3; private static final int ISA_SLASH = 4; private static final int ISI_ENDTAG = 5; private static final int ISP_ENDTAG_X = 6; private static final int ISP_ENDTAG_WS = 7; private static final int ISI_TAG = 8; private static final int ISP_TAG_X = 9; private static final int ISP_TAG_WS = 10; private static final int ISI_ARG = 11; private static final int ISP_ARG_X = 12; private static final int ISP_ARG_WS = 13; private static final int ISP_EQ = 14; private static final int ISP_EQ_WS = 15; private static final int ISI_VAL = 16; private static final int ISI_VAL_QUOT = 17; private static final int ISI_VAL_DQUOT = 18; private static final int ISA_SGML_ESCAPE = 19; private static final int ISA_SGML_DASH = 20; private static final int ISI_HTML_COMMENT = 21; private static final int ISA_HTML_COMMENT_DASH = 22; private static final int ISI_HTML_COMMENT_WS = 23; private static final int ISI_SGML_DECL = 24; 92 private static final int ISA_SGML_DECL_DASH = 25; 93 private static final int ISI_SGML_COMMENT = 26; 94 private static final int ISA_SGML_COMMENT_DASH = 27; 95 private static final int ISA_REF = 28; private static final int ISI_REF_NAME = 29; private static final int ISA_REF_HASH = 30; private static final int ISI_REF_DEC = 31; private static final int ISA_REF_X = 32; private static final int ISI_REF_HEX = 33; private static final int ISI_TAG_SLASH = 34; 103 public HTMLLexer(LexerRestartInfo<HTMLTokenId> info) { 104 this.input = info.input(); 105 this.tokenFactory = info.tokenFactory(); 106 if (info.state() == null) { 107 this.lexerSubState = INIT; 108 this.lexerState = INIT; 109 this.lexerScriptState = INIT; 110 } else { 111 int encoded = ((Integer ) info.state()).intValue(); 112 this.lexerSubState = encoded / 1000000; 113 int remainder = encoded % 1000000; 114 this.lexerState = remainder / 1000; 115 this.lexerScriptState = remainder % 1000; 116 } 117 } 118 119 private final boolean isAZ( int character ) { 120 return( (character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z') ); 121 } 122 123 private final boolean isName( int character ) { 124 return Character.isLetterOrDigit(character) || 125 character == '-' || character == '_' || character == '.' || character == ':'; 126 131 } 132 133 143 144 private final boolean isWS( int character ) { 145 return Character.isWhitespace(character); 146 } 149 150 public Token<HTMLTokenId> nextToken() { 151 int actChar; 152 153 while (true) { 154 actChar = input.read(); 155 156 if (actChar == EOF) { 157 if(input.readLengthEOF() == 1) { 158 return null; } else { 160 input.backup(1); break; 164 } 165 } 166 167 switch( lexerState ) { 170 case INIT: switch( actChar ) { 172 case '<': 173 lexerState = ISA_LT; 174 break; 175 case '&': 176 lexerState = ISA_REF; 177 lexerSubState = ISI_TEXT; 178 break; 179 default: 180 lexerState = ISI_TEXT; 181 break; 182 } 183 break; 184 185 case ISI_TEXT: switch( actChar ) { 187 case '<': 188 case '&': 189 lexerState = INIT; 190 input.backup(1); 191 if(input.readLength() > 0) { return token(lexerScriptState == INIT ? HTMLTokenId.TEXT : HTMLTokenId.SCRIPT); 193 } 194 break; 195 } 196 break; 197 198 case ISI_ERROR: lexerState = INIT; 200 return token(HTMLTokenId.ERROR); 201 202 case ISA_LT: if( isAZ( actChar ) ) { lexerState = ISI_TAG; 205 input.backup(1); 206 return token(HTMLTokenId.TAG_OPEN_SYMBOL); 207 } 208 switch( actChar ) { 209 case '/': lexerState = ISA_SLASH; 211 return token(HTMLTokenId.TAG_OPEN_SYMBOL); 212 case '>': lexerState = INIT; 214 return token(HTMLTokenId.TAG_CLOSE_SYMBOL); 215 case '!': 216 lexerState = ISA_SGML_ESCAPE; 217 break; 218 default: lexerState = ISI_TEXT; 220 break; 221 } 222 break; 223 224 case ISA_SLASH: if( isAZ( actChar ) ) { lexerState = ISI_ENDTAG; 227 break; 228 } 229 switch( actChar ) { 230 case '>': lexerState = INIT; 232 return token(HTMLTokenId.TAG_CLOSE_SYMBOL); 233 default: lexerState = ISI_TEXT; 235 input.backup(1); 236 break; 237 } 238 break; 239 240 case ISI_ENDTAG: if( isName( actChar ) ) break; lexerState = ISP_ENDTAG_X; 243 input.backup(1); 244 251 return token(HTMLTokenId.TAG_CLOSE); 252 253 254 case ISP_ENDTAG_X: if( isWS( actChar ) ) { 256 lexerState = ISP_ENDTAG_WS; 257 break; 258 } 259 switch( actChar ) { 260 case '>': lexerState = INIT; 262 return token(HTMLTokenId.TAG_CLOSE_SYMBOL); 263 case '<': lexerState = INIT; 265 input.backup(1); 266 break; 267 default: 268 lexerState = ISI_ERROR; 269 input.backup(1); 270 break; 271 } 272 break; 273 274 case ISP_ENDTAG_WS: if( isWS( actChar ) ) break; lexerState = ISP_ENDTAG_X; 277 input.backup(1); 278 return token(HTMLTokenId.WS); 279 280 281 case ISI_TAG: if( isName( actChar ) ) break; lexerState = ISP_TAG_X; 284 input.backup(1); 285 return token(HTMLTokenId.TAG_OPEN); 291 292 case ISP_TAG_X: if( isWS( actChar ) ) { 294 lexerState = ISP_TAG_WS; 295 break; 296 } 297 if( isAZ( actChar ) ) { 298 lexerState = ISI_ARG; 299 break; 300 } 301 switch( actChar ) { 302 case '/': 303 lexerState = ISI_TAG_SLASH; 304 break; 305 case '>': 306 lexerState = INIT; 307 return token(HTMLTokenId.TAG_CLOSE_SYMBOL); 308 case '<': 309 lexerState = INIT; 310 input.backup(1); 311 break; 312 default: 313 lexerState = ISI_ERROR; 314 input.backup(1); 315 break; 316 } 317 break; 318 319 case ISP_TAG_WS: if( isWS( actChar ) ) break; lexerState = ISP_TAG_X; 322 input.backup(1); 323 return token(HTMLTokenId.WS); 324 325 case ISI_TAG_SLASH: 326 switch( actChar ) { 327 case '>': 328 lexerState = INIT; 329 return token(HTMLTokenId.TAG_CLOSE_SYMBOL); 330 default: 331 lexerState = ISI_ERROR; 332 input.backup(1); 333 break; 334 } 335 break; 336 337 case ISI_ARG: if( isName( actChar ) ) break; lexerState = ISP_ARG_X; 340 input.backup(1); 341 return token(HTMLTokenId.ARGUMENT); 342 343 case ISP_ARG_X: 344 if( isWS( actChar ) ) { 345 lexerState = ISP_ARG_WS; 346 break; 347 } 348 if( isAZ( actChar ) ) { 349 lexerState = ISI_ARG; 350 break; 351 } 352 switch( actChar ) { 353 case '/': 354 case '>': 355 input.backup(1); 356 lexerState = ISP_TAG_X; 357 break; 358 case '<': 359 lexerState = INIT; 360 input.backup(1); 361 break; 362 case '=': 363 lexerState = ISP_EQ; 364 return token(HTMLTokenId.OPERATOR); 365 default: 366 lexerState = ISI_ERROR; 367 input.backup(1); 368 break; 369 } 370 break; 371 372 case ISP_ARG_WS: 373 if( isWS( actChar ) ) break; lexerState = ISP_ARG_X; 375 input.backup(1); 376 return token(HTMLTokenId.WS); 377 378 case ISP_EQ: 379 if( isWS( actChar ) ) { 380 lexerState = ISP_EQ_WS; 381 break; 382 } 383 switch( actChar ) { 384 case '\'': 385 lexerState = ISI_VAL_QUOT; 386 break; 387 case '"': 388 lexerState = ISI_VAL_DQUOT; 389 break; 390 case '/': 391 case '>': 392 input.backup(1); 393 lexerState = ISP_TAG_X; 394 break; 395 default: 396 lexerState = ISI_VAL; break; 398 } 399 break; 400 401 case ISP_EQ_WS: 402 if( isWS( actChar ) ) break; lexerState = ISP_EQ; 404 input.backup(1); 405 return token(HTMLTokenId.WS); 406 407 408 case ISI_VAL: 409 if( !isWS( actChar ) 410 && !(actChar == '/' || actChar == '>' || actChar == '<')) break; lexerState = ISP_TAG_X; 412 input.backup(1); 413 return token(HTMLTokenId.VALUE); 414 415 case ISI_VAL_QUOT: 416 switch( actChar ) { 417 case '\'': 418 lexerState = ISP_TAG_X; 419 return token(HTMLTokenId.VALUE); 420 case '&': 421 if( input.readLength() == 1 ) { 422 lexerSubState = lexerState; 423 lexerState = ISA_REF; 424 break; 425 } else { 426 input.backup(1); 427 return token(HTMLTokenId.VALUE); 428 } 429 } 430 break; 432 case ISI_VAL_DQUOT: 433 switch( actChar ) { 434 case '"': 435 lexerState = ISP_TAG_X; 436 return token(HTMLTokenId.VALUE); 437 case '&': 438 if( input.readLength() == 1 ) { 439 lexerSubState = lexerState; 440 lexerState = ISA_REF; 441 break; 442 } else { 443 input.backup(1); 444 return token(HTMLTokenId.VALUE); 445 } 446 } 447 break; 449 450 451 case ISA_SGML_ESCAPE: if( isAZ(actChar) ) { 453 lexerState = ISI_SGML_DECL; 454 break; 455 } 456 switch( actChar ) { 457 case '-': 458 lexerState = ISA_SGML_DASH; 459 break; 460 default: 461 lexerState = ISI_TEXT; 462 input.backup(1); 463 continue; 464 } 465 break; 466 467 case ISA_SGML_DASH: switch( actChar ) { 469 case '-': 470 lexerState = ISI_HTML_COMMENT; 471 break; 472 default: 473 lexerState = ISI_TEXT; 474 input.backup(1); 475 continue; 476 } 477 break; 478 479 case ISI_HTML_COMMENT: switch( actChar ) { 481 case '-': 482 lexerState = ISA_HTML_COMMENT_DASH; 483 break; 484 case '\n': 486 return token(HTMLTokenId.BLOCK_COMMENT); 489 } 490 break; 491 492 case ISA_HTML_COMMENT_DASH: 493 switch( actChar ) { 494 case '-': 495 lexerState = ISI_HTML_COMMENT_WS; 496 break; 497 default: 498 lexerState = ISI_HTML_COMMENT; 499 continue; 500 } 501 break; 502 503 case ISI_HTML_COMMENT_WS: if( isWS( actChar ) ) break; switch( actChar ) { 506 case '>': 507 lexerState = INIT; 508 return token(HTMLTokenId.BLOCK_COMMENT); 509 default: 510 lexerState = ISI_HTML_COMMENT; 511 input.backup(1); 512 break; 513 } 514 break; 515 516 case ISI_SGML_DECL: 517 switch( actChar ) { 518 case '>': 519 lexerState = INIT; 520 return token(HTMLTokenId.DECLARATION); 521 case '-': 522 if( input.readLength() == 1 ) { 523 lexerState = ISA_SGML_DECL_DASH; 524 break; 525 } else { 526 input.backup(1); 527 return token(HTMLTokenId.DECLARATION); 528 } 529 } 530 break; 531 532 case ISA_SGML_DECL_DASH: 533 if( actChar == '-' ) { 534 lexerState = ISI_SGML_COMMENT; 535 break; 536 } else { 537 lexerState = ISI_SGML_DECL; 538 input.backup(1); 539 continue; 540 } 541 542 case ISI_SGML_COMMENT: 543 switch( actChar ) { 544 case '-': 545 lexerState = ISA_SGML_COMMENT_DASH; 546 break; 547 } 548 break; 549 550 case ISA_SGML_COMMENT_DASH: 551 if( actChar == '-' ) { 552 lexerState = ISI_SGML_DECL; 553 return token(HTMLTokenId.SGML_COMMENT); 554 } else { 555 lexerState = ISI_SGML_COMMENT; 556 input.backup(1); 557 continue; 558 } 559 560 561 case ISA_REF: 562 if( isAZ( actChar ) ) { 563 lexerState = ISI_REF_NAME; 564 break; 565 } 566 if( actChar == '#' ) { 567 lexerState = ISA_REF_HASH; 568 break; 569 } 570 lexerState = lexerSubState; 571 input.backup(1); 572 continue; 573 574 case ISI_REF_NAME: 575 if( isName( actChar ) ) break; 576 if( actChar != ';' ) 577 input.backup(1); 578 lexerState = lexerSubState; 579 return token(HTMLTokenId.CHARACTER); 580 581 case ISA_REF_HASH: 582 if( actChar >= '0' && actChar <= '9' ) { 583 lexerState = ISI_REF_DEC; 584 break; 585 } 586 if( actChar == 'x' || actChar == 'X' ) { 587 lexerState = ISA_REF_X; 588 break; 589 } 590 if( isAZ( actChar ) ) { 591 lexerState = lexerSubState; 592 return token(HTMLTokenId.ERROR); 593 } 594 lexerState = lexerSubState; 595 input.backup(1); 596 continue; 597 598 case ISI_REF_DEC: 599 if( actChar >= '0' && actChar <= '9' ) break; 600 if( actChar != ';' ) 601 input.backup(1); 602 lexerState = lexerSubState; 603 return token(HTMLTokenId.CHARACTER); 604 605 case ISA_REF_X: 606 if( (actChar >= '0' && actChar <= '9') || 607 (actChar >= 'a' && actChar <= 'f') || 608 (actChar >= 'A' && actChar <= 'F') 609 ) { 610 lexerState = ISI_REF_HEX; 611 break; 612 } 613 lexerState = lexerSubState; 614 input.backup(1); 615 return token(HTMLTokenId.ERROR); 617 case ISI_REF_HEX: 618 if( (actChar >= '0' && actChar <= '9') || 619 (actChar >= 'a' && actChar <= 'f') || 620 (actChar >= 'A' && actChar <= 'F') 621 ) break; 622 if( actChar != ';' ) 623 input.backup(1); 624 lexerState = lexerSubState; 625 return token(HTMLTokenId.CHARACTER); 626 } 627 } 629 633 switch( lexerState ) { 634 case INIT: 635 if (input.readLength() == 0) { 636 return null; 637 } 638 break; 639 case ISI_TEXT: 640 case ISA_LT: 641 case ISA_SLASH: 642 case ISA_SGML_ESCAPE: 643 case ISA_SGML_DASH: 644 lexerState = INIT; 645 return token(lexerScriptState == INIT ? HTMLTokenId.TEXT : HTMLTokenId.SCRIPT); 646 647 case ISA_REF: 648 case ISA_REF_HASH: 649 lexerState = INIT; 650 if( lexerSubState == ISI_TEXT ) return token(lexerScriptState == INIT ? HTMLTokenId.TEXT : HTMLTokenId.SCRIPT); 651 else return token(HTMLTokenId.VALUE); 652 653 case ISI_HTML_COMMENT: 654 case ISA_HTML_COMMENT_DASH: 655 case ISI_HTML_COMMENT_WS: 656 lexerState = INIT; 657 return token(HTMLTokenId.BLOCK_COMMENT); 658 659 case ISI_TAG: 660 lexerState = INIT; 661 return token(HTMLTokenId.TAG_OPEN); 662 case ISI_ENDTAG: 663 lexerState = INIT; 664 return token(HTMLTokenId.TAG_CLOSE); 665 666 case ISI_ARG: 667 lexerState = INIT; 668 return token(HTMLTokenId.ARGUMENT); 669 670 case ISI_ERROR: 671 lexerState = INIT; 672 return token(HTMLTokenId.ERROR); 673 674 case ISP_ARG_WS: 675 case ISP_TAG_WS: 676 case ISP_ENDTAG_WS: 677 case ISP_EQ_WS: 678 lexerState = INIT; 679 return token(HTMLTokenId.WS); 680 681 case ISP_ARG_X: 682 case ISP_TAG_X: 683 case ISP_ENDTAG_X: 684 case ISP_EQ: 685 lexerState = INIT; 686 return token(HTMLTokenId.WS); 687 688 case ISI_VAL: 689 case ISI_VAL_QUOT: 690 case ISI_VAL_DQUOT: 691 lexerState = INIT; 692 return token(HTMLTokenId.VALUE); 693 694 case ISI_SGML_DECL: 695 case ISA_SGML_DECL_DASH: 696 lexerState = INIT; 697 return token(HTMLTokenId.DECLARATION); 698 699 case ISI_SGML_COMMENT: 700 case ISA_SGML_COMMENT_DASH: 701 lexerState = INIT; 702 return token(HTMLTokenId.SGML_COMMENT); 703 704 case ISI_REF_NAME: 705 case ISI_REF_DEC: 706 case ISA_REF_X: 707 case ISI_REF_HEX: 708 lexerState = INIT; 709 return token(HTMLTokenId.CHARACTER); 710 } 711 712 return null; 713 } 714 715 private Token<HTMLTokenId> token(HTMLTokenId tokenId) { 716 if(LOG) { 717 if(input.readLength() == 0) { 718 LOGGER.log(Level.INFO, "Found zero length token: "); 719 } 720 LOGGER.log(Level.INFO, "[" + this.getClass().getSimpleName() + "] token ('" + input.readText().toString() + "'; id=" + tokenId + "; state=" + state() + ")\n"); 721 } 722 return tokenFactory.createToken(tokenId); 723 } 724 725 public void release() { 726 } 727 728 } 729 | Popular Tags |