1 2 3 4 package hotsax.html.sax; 5 6 import java.io.*; 7 8 9 15 class HtmlLexer { 16 17 18 final public static int YYEOF = -1; 19 20 21 final private static int YY_BUFFERSIZE = 16384; 22 23 24 final public static int ATTRIBUTE = 2; 25 final public static int STRING = 4; 26 final public static int CDATA = 9; 27 final public static int APOSSTRING = 3; 28 final public static int YYINITIAL = 0; 29 final public static int PROCESSINGINSTRUCTION = 7; 30 final public static int ELEMENT = 1; 31 final public static int END = 5; 32 final public static int COMMENT = 6; 33 final public static int DOCTYPE = 8; 34 35 38 final private static char [] yycmap = { 39 0, 0, 0, 0, 0, 0, 0, 0, 5, 25, 4, 0, 0, 3, 0, 0, 40 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 41 5, 10, 23, 0, 0, 0, 0, 24, 0, 0, 0, 0, 0, 11, 0, 9, 42 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 8, 6, 22, 21, 43 0, 20, 1, 14, 12, 18, 1, 1, 1, 1, 1, 1, 1, 1, 1, 13, 44 17, 1, 1, 1, 15, 1, 1, 1, 1, 16, 1, 19, 0, 26, 0, 7, 45 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 46 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 47 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 49 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 51 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 52 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 53 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 54 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 55 }; 56 57 60 final private static int yy_rowMap [] = { 61 0, 27, 54, 81, 108, 135, 162, 189, 216, 243, 62 270, 297, 270, 324, 351, 378, 270, 270, 405, 432, 63 459, 270, 270, 270, 270, 270, 486, 270, 270, 297, 64 513, 270, 297, 540, 270, 297, 270, 270, 297, 567, 65 270, 594, 270, 621, 270, 648, 675, 702, 729, 270, 66 270, 270, 756, 783, 810, 837, 864, 891, 918, 945, 67 972, 999, 270, 270 68 }; 69 70 73 final private static String yy_packed0 = 74 "\3\13\1\14\1\15\3\13\1\16\22\13\1\15\1\17"+ 75 "\1\15\3\20\6\15\7\17\1\15\1\17\1\15\1\21"+ 76 "\2\15\1\20\1\15\1\22\1\23\1\24\3\25\1\15"+ 77 "\5\22\7\23\1\22\1\23\1\22\1\26\1\27\1\30"+ 78 "\1\25\1\22\3\31\1\14\1\15\23\31\1\32\1\15"+ 79 "\4\31\1\14\1\15\22\31\1\32\1\31\1\15\1\31"+ 80 "\1\15\1\33\1\15\1\14\10\15\7\33\1\15\1\33"+ 81 "\1\15\1\34\4\15\3\35\1\36\7\35\1\37\17\35"+ 82 "\3\40\1\41\21\40\1\42\5\40\3\43\1\44\1\15"+ 83 "\21\43\1\45\4\43\3\46\1\47\1\15\25\46\1\50"+ 84 "\37\0\1\15\37\0\1\51\1\52\12\0\1\53\6\0"+ 85 "\2\17\4\0\1\17\4\0\7\17\1\0\1\17\11\0"+ 86 "\3\20\23\0\1\20\2\0\2\23\4\0\1\23\4\0"+ 87 "\7\23\1\0\1\23\10\0\1\24\33\0\3\25\23\0"+ 88 "\1\25\2\0\2\33\4\0\1\33\4\0\7\33\1\0"+ 89 "\1\33\21\0\1\54\45\0\1\55\36\0\1\56\13\0"+ 90 "\1\57\1\60\6\0\1\61\35\0\1\62\32\0\1\63"+ 91 "\17\0\1\64\34\0\1\65\33\0\1\66\32\0\1\67"+ 92 "\30\0\1\70\35\0\1\71\37\0\1\72\26\0\1\73"+ 93 "\31\0\1\74\34\0\1\75\35\0\1\76\30\0\1\77"+ 94 "\33\0\1\100\7\0"; 95 96 99 final private static int yytrans [] = yy_unpack(); 100 101 102 103 final private static int YY_UNKNOWN_ERROR = 0; 104 final private static int YY_ILLEGAL_STATE = 1; 105 final private static int YY_NO_MATCH = 2; 106 final private static int YY_PUSHBACK_2BIG = 3; 107 108 109 final private static String YY_ERROR_MSG[] = { 110 "Unkown internal scanner error", 111 "Internal error: unknown state", 112 "Error: could not match input", 113 "Error: pushback value was too large" 114 }; 115 116 119 private final static byte YY_ATTRIBUTE[] = { 120 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 9, 1, 9, 1, 1, 1, 121 9, 9, 1, 1, 1, 9, 9, 9, 9, 9, 1, 9, 9, 1, 1, 9, 122 1, 1, 9, 1, 9, 9, 1, 1, 9, 0, 9, 0, 9, 0, 0, 0, 123 0, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9 124 }; 125 126 127 private java.io.Reader yy_reader; 128 129 130 private int yy_state; 131 132 133 private int yy_lexical_state = YYINITIAL; 134 135 137 private char yy_buffer[] = new char[YY_BUFFERSIZE]; 138 139 140 private int yy_markedPos; 141 142 143 private int yy_pushbackPos; 144 145 146 private int yy_currentPos; 147 148 149 private int yy_startRead; 150 151 153 private int yy_endRead; 154 155 156 private int yyline; 157 158 159 private int yychar; 160 161 165 private int yycolumn; 166 167 170 private boolean yy_atBOL = true; 171 172 173 private boolean yy_atEOF; 174 175 176 private boolean yy_eof_done; 177 178 179 private HtmlParser yyparser; 180 private StringBuffer string = new StringBuffer (); 181 private StringBuffer text = new StringBuffer (); 182 private StringBuffer attr = new StringBuffer (); 183 private StringBuffer comment = new StringBuffer (); 184 private StringBuffer pi = new StringBuffer (); 185 private StringBuffer cdata = new StringBuffer (); 186 private StringBuffer doctype = new StringBuffer (); 187 188 public void p(String s) { System.out.println(s); } 189 190 public boolean getEOF() { return yy_atEOF; } 191 192 public HtmlLexer(Reader r, HtmlParser p) 193 { 194 this(r); 195 yyparser = p; 196 } 197 198 private boolean first = false; 199 private boolean last = false; 200 201 205 public int _yylex() 206 throws IOException 207 { 208 int token; 209 if (!first) { 210 first = true; 211 212 return HtmlParser.SOF; } 214 else { 215 if (last) 216 return 0; else { 218 token = yylex(); 219 220 if (token == 0) 221 { 222 last = true; 223 return HtmlParser.EOF; } 225 else 226 return token; 227 } 228 } 229 } 230 231 void setLval(String text) 233 { 234 yyparser.yylval = new Lval(text); 235 } 236 237 void setLval(Attribute a) 238 { 239 yyparser.yylval = a; 240 } 241 242 243 244 250 HtmlLexer(java.io.Reader in) { 251 this.yy_reader = in; 252 } 253 254 260 HtmlLexer(java.io.InputStream in) { 261 this(new java.io.InputStreamReader (in)); 262 } 263 264 269 private static int [] yy_unpack() { 270 int [] trans = new int[1026]; 271 int offset = 0; 272 offset = yy_unpack(yy_packed0, offset, trans); 273 return trans; 274 } 275 276 282 private static int yy_unpack(String packed, int offset, int [] trans) { 283 int i = 0; 284 int j = offset; 285 int l = packed.length(); 286 while (i < l) { 287 int count = packed.charAt(i++); 288 int value = packed.charAt(i++); 289 value--; 290 do trans[j++] = value; while (--count > 0); 291 } 292 return j; 293 } 294 295 296 303 private int yy_advance() throws java.io.IOException { 304 305 306 if (yy_currentPos < yy_endRead) return yy_buffer[yy_currentPos++]; 307 308 309 if (yy_atEOF) return YYEOF; 310 311 312 313 314 if (yy_startRead > 0) { 315 System.arraycopy(yy_buffer, yy_startRead, 316 yy_buffer, 0, 317 yy_endRead-yy_startRead); 318 319 320 yy_endRead-= yy_startRead; 321 yy_currentPos-= yy_startRead; 322 yy_markedPos-= yy_startRead; 323 yy_pushbackPos-= yy_startRead; 324 yy_startRead = 0; 325 } 326 327 328 if (yy_currentPos >= yy_buffer.length) { 329 330 char newBuffer[] = new char[yy_currentPos*2]; 331 System.arraycopy(yy_buffer, 0, newBuffer, 0, yy_buffer.length); 332 yy_buffer = newBuffer; 333 } 334 335 336 int numRead = yy_reader.read(yy_buffer, yy_endRead, 337 yy_buffer.length-yy_endRead); 338 339 if ( numRead == -1 ) return YYEOF; 340 341 yy_endRead+= numRead; 342 343 return yy_buffer[yy_currentPos++]; 344 } 345 346 347 350 final public void yyclose() throws java.io.IOException { 351 yy_atEOF = true; 352 yy_endRead = yy_startRead; 353 354 if (yy_reader != null) 355 yy_reader.close(); 356 } 357 358 359 369 final public void yyreset(java.io.Reader reader) throws java.io.IOException { 370 yyclose(); 371 yy_reader = reader; 372 yy_atBOL = true; 373 yy_atEOF = false; 374 yy_endRead = yy_startRead = 0; 375 yy_currentPos = yy_markedPos = yy_pushbackPos = 0; 376 yyline = yychar = yycolumn = 0; 377 yy_lexical_state = YYINITIAL; 378 } 379 380 381 384 final public int yystate() { 385 return yy_lexical_state; 386 } 387 388 389 394 final public void yybegin(int newState) { 395 yy_lexical_state = newState; 396 } 397 398 399 402 final public String yytext() { 403 return new String ( yy_buffer, yy_startRead, yy_markedPos-yy_startRead ); 404 } 405 406 407 418 final public char yycharat(int pos) { 419 return yy_buffer[yy_startRead+pos]; 420 } 421 422 423 426 final public int yylength() { 427 return yy_markedPos-yy_startRead; 428 } 429 430 431 445 private void yy_ScanError(int errorCode) { 446 String message; 447 try { 448 message = YY_ERROR_MSG[errorCode]; 449 } 450 catch (ArrayIndexOutOfBoundsException e) { 451 message = YY_ERROR_MSG[YY_UNKNOWN_ERROR]; 452 } 453 454 throw new Error (message); 455 } 456 457 458 466 private void yypushback(int number) { 467 if ( number > yylength() ) 468 yy_ScanError(YY_PUSHBACK_2BIG); 469 470 yy_markedPos -= number; 471 } 472 473 474 478 private void yy_do_eof() throws java.io.IOException { 479 if (!yy_eof_done) { 480 yy_eof_done = true; 481 yyclose(); 482 } 483 } 484 485 486 493 public int yylex() throws java.io.IOException { 494 int yy_input; 495 int yy_action; 496 497 498 while (true) { 499 500 yychar+= yylength(); 501 502 boolean yy_r = false; 503 for (yy_currentPos = yy_startRead; yy_currentPos < yy_markedPos; 504 yy_currentPos++) { 505 switch (yy_buffer[yy_currentPos]) { 506 case '\u000B': 507 case '\u000C': 508 case '\u0085': 509 case '\u2028': 510 case '\u2029': 511 yyline++; 512 yy_r = false; 513 break; 514 case '\r': 515 yyline++; 516 yy_r = true; 517 break; 518 case '\n': 519 if (yy_r) 520 yy_r = false; 521 else { 522 yyline++; 523 } 524 break; 525 default: 526 yy_r = false; 527 } 528 } 529 530 if (yy_r) { 531 if ( yy_advance() == '\n' ) yyline--; 532 if ( !yy_atEOF ) yy_currentPos--; 533 } 534 535 yy_action = -1; 536 537 yy_currentPos = yy_startRead = yy_markedPos; 538 539 yy_state = yy_lexical_state; 540 541 542 yy_forAction: { 543 while (true) { 544 545 yy_input = yy_advance(); 546 547 if ( yy_input == YYEOF ) break yy_forAction; 548 549 int yy_next = yytrans[ yy_rowMap[yy_state] + yycmap[yy_input] ]; 550 if (yy_next == -1) break yy_forAction; 551 yy_state = yy_next; 552 553 int yy_attributes = YY_ATTRIBUTE[yy_state]; 554 if ( (yy_attributes & 1) > 0 ) { 555 yy_action = yy_state; 556 yy_markedPos = yy_currentPos; 557 if ( (yy_attributes & 8) > 0 ) break yy_forAction; 558 } 559 560 } 561 } 562 563 564 switch (yy_action) { 565 566 case 16: 567 { yybegin(YYINITIAL); 568 text.setLength(0); 569 return HtmlParser.TAG_START_COMPLETE; } 571 case 65: break; 572 case 20: 573 { 575 } 576 case 66: break; 577 case 21: 578 { yybegin(YYINITIAL); 579 text.setLength(0); 580 setLval(""); 581 return HtmlParser.TAG_START_COMPLETE; 582 } 583 case 67: break; 584 case 26: 585 { 586 setLval(yytext()); 587 return HtmlParser.TAG_END; 588 } 589 case 68: break; 590 case 17: 591 { 593 } 594 case 69: break; 595 case 63: 596 { yybegin(CDATA); 597 cdata.setLength(0); 598 if (text.length() > 0) 599 { 600 setLval(text.toString()); 601 return HtmlParser.TEXT; 602 } 603 } 604 case 70: break; 605 case 62: 606 { yybegin(DOCTYPE); 607 doctype.setLength(0); 608 } 609 case 71: break; 610 case 51: 611 { yybegin(COMMENT); 612 comment.setLength(0); 613 if (text.length() > 0) 614 { 615 setLval(text.toString()); 616 return HtmlParser.TEXT; 617 } 618 } 619 case 72: break; 620 case 50: 621 { yybegin(YYINITIAL); 622 setLval(cdata.toString()); 623 return HtmlParser.CDATA; 624 } 625 case 73: break; 626 case 49: 627 { yybegin(YYINITIAL); 628 setLval(comment.toString()); 629 return HtmlParser.COMMENT; 630 } 631 case 74: break; 632 case 44: 633 { yybegin(YYINITIAL); 634 setLval(pi.toString()); 635 return HtmlParser.PI; 636 } 637 case 75: break; 638 case 42: 639 { yybegin(PROCESSINGINSTRUCTION); 640 pi.setLength(0); 641 } 642 case 76: break; 643 case 40: 644 { yybegin(END); 645 if (text.length() > 0) 646 { 647 setLval(text.toString()); 648 return HtmlParser.TEXT; 649 } 650 } 651 case 77: break; 652 case 37: 653 case 38: 654 case 39: 655 { cdata.append(yytext()); } 656 case 78: break; 657 case 36: 658 { yybegin(YYINITIAL); 659 setLval(doctype.toString()); 660 return HtmlParser.DOCTYPE; 661 } 662 case 79: break; 663 case 34: 664 case 35: 665 { doctype.append(yytext()); } 666 case 80: break; 667 case 31: 668 case 32: 669 { pi.append(yytext()); } 670 case 81: break; 671 case 28: 672 case 29: 673 { comment.append(yytext()); } 674 case 82: break; 675 case 14: 676 { setLval(yytext()); 677 return HtmlParser.TAG_START; 678 } 679 case 83: break; 680 case 13: 681 { yybegin(ELEMENT); 682 if (text.length() > 0) 683 { 684 setLval(text.toString()); 685 return HtmlParser.TEXT; } 687 } 688 case 84: break; 689 case 11: 690 case 12: 691 case 30: 692 case 33: 693 { } 694 case 85: break; 695 case 10: 696 { text.append(yytext()); } 697 case 86: break; 698 case 2: 699 case 19: 700 { setLval(new Attribute(attr.toString(), yytext())); 701 return HtmlParser.ATTR; 702 } 703 case 87: break; 704 case 1: 705 case 15: 706 { yybegin(ATTRIBUTE); } 707 case 88: break; 708 case 18: 709 { attr.setLength(0); attr.append(yytext()); } 710 case 89: break; 711 case 22: 712 { string.setLength(0); yybegin(STRING); } 713 case 90: break; 714 case 23: 715 { string.setLength(0); yybegin(APOSSTRING); } 716 case 91: break; 717 case 24: 718 { string.append(yytext()); } 719 case 92: break; 720 case 25: 721 { yybegin(ATTRIBUTE); setLval(new Attribute(attr.toString(), string.toString())); 722 return HtmlParser.ATTR; } 723 case 93: break; 724 case 27: 725 { yybegin(YYINITIAL); text.setLength(0); } 726 case 94: break; 727 default: 728 if (yy_input == YYEOF && yy_startRead == yy_currentPos) { 729 yy_atEOF = true; 730 yy_do_eof(); 731 { return 0; } 732 } 733 else { 734 yy_ScanError(YY_NO_MATCH); 735 } 736 } 737 } 738 } 739 740 750 public static void main(String argv[]) { 751 for (int i = 0; i < argv.length; i++) { 752 HtmlLexer scanner = null; 753 try { 754 scanner = new HtmlLexer( new java.io.FileReader (argv[i]) ); 755 } 756 catch (java.io.FileNotFoundException e) { 757 System.out.println("File not found : \""+argv[i]+"\""); 758 System.exit(1); 759 } 760 catch (java.io.IOException e) { 761 System.out.println("Error opening file \""+argv[i]+"\""); 762 System.exit(1); 763 } 764 catch (ArrayIndexOutOfBoundsException e) { 765 System.out.println("Usage : java HtmlLexer <inputfile>"); 766 System.exit(1); 767 } 768 769 try { 770 do { 771 System.out.println(scanner.yylex()); 772 } while (!scanner.yy_atEOF); 773 774 } 775 catch (java.io.IOException e) { 776 System.out.println("An I/O error occured while scanning :"); 777 System.out.println(e); 778 System.exit(1); 779 } 780 catch (Exception e) { 781 e.printStackTrace(); 782 System.exit(1); 783 } 784 } 785 } 786 787 788 } 789 | Popular Tags |