1 27 package org.htmlparser.lexer; 28 29 import java.io.IOException ; 30 import java.io.Serializable ; 31 import java.net.MalformedURLException ; 32 import java.net.URLConnection ; 33 import java.util.Vector ; 34 35 import org.htmlparser.Node; 36 import org.htmlparser.NodeFactory; 37 import org.htmlparser.Remark; 38 import org.htmlparser.Text; 39 import org.htmlparser.Tag; 40 import org.htmlparser.http.ConnectionManager; 41 import org.htmlparser.nodes.RemarkNode; 42 import org.htmlparser.nodes.TextNode; 43 import org.htmlparser.nodes.TagNode; 44 import org.htmlparser.util.ParserException; 45 46 55 public class Lexer 56 implements 57 Serializable , 58 NodeFactory 59 { 60 63 protected Page mPage; 64 65 68 protected Cursor mCursor; 69 70 73 protected NodeFactory mFactory; 74 75 82 static protected int mDebugLineTrigger = -1; 83 84 87 public Lexer () 88 { 89 this (new Page ("")); 90 } 91 92 96 public Lexer (Page page) 97 { 98 setPage (page); 99 setCursor (new Cursor (page, 0)); 100 setNodeFactory (this); 101 } 102 103 107 public Lexer (String text) 108 { 109 this (new Page (text)); 110 } 111 112 116 public Lexer (URLConnection connection) throws ParserException 117 { 118 this (new Page (connection)); 119 } 120 121 126 public void reset () 127 { 128 getPage ().reset (); 129 setCursor (new Cursor (getPage (), 0)); 130 } 131 132 136 public Page getPage () 137 { 138 return (mPage); 139 } 140 141 145 public void setPage (Page page) 146 { 147 if (null == page) 148 throw new IllegalArgumentException ("page cannot be null"); 149 mPage = page; 151 } 152 153 157 public Cursor getCursor () 158 { 159 return (mCursor); 160 } 161 162 166 public void setCursor (Cursor cursor) 167 { 168 if (null == cursor) 169 throw new IllegalArgumentException ("cursor cannot be null"); 170 mCursor = cursor; 172 } 173 174 178 public NodeFactory getNodeFactory () 179 { 180 return (mFactory); 181 } 182 183 187 public void setNodeFactory (NodeFactory factory) 188 { 189 if (null == factory) 190 throw new IllegalArgumentException ("node factory cannot be null"); 191 mFactory = factory; 192 } 193 194 public int getPosition () 195 { 196 return (getCursor ().getPosition ()); 197 } 198 199 public void setPosition (int position) 200 { 201 getCursor ().setPosition (position); 203 } 204 205 209 public int getCurrentLineNumber () 210 { 211 return (getPage ().row (getCursor ())); 212 } 213 214 218 public String getCurrentLine () 219 { 220 return (getPage ().getLine (getCursor ())); 221 } 222 223 229 public Node nextNode () 230 throws 231 ParserException 232 { 233 return nextNode (false); 234 } 235 236 243 public Node nextNode (boolean quotesmart) 244 throws 245 ParserException 246 { 247 int start; 248 char ch; 249 Node ret; 250 251 if (-1 != mDebugLineTrigger) 253 { 254 Page page = getPage (); 255 int lineno = page.row (mCursor); 256 if (mDebugLineTrigger < lineno) 257 mDebugLineTrigger = lineno + 1; } 259 start = mCursor.getPosition (); 260 ch = mPage.getCharacter (mCursor); 261 switch (ch) 262 { 263 case Page.EOF: 264 ret = null; 265 break; 266 case '<': 267 ch = mPage.getCharacter (mCursor); 268 if (Page.EOF == ch) 269 ret = makeString (start, mCursor.getPosition ()); 270 else if ('%' == ch) 271 { 272 mCursor.retreat (); 273 ret = parseJsp (start); 274 } 275 else if ('/' == ch || '%' == ch || Character.isLetter (ch)) 276 { 277 mCursor.retreat (); 278 ret = parseTag (start); 279 } 280 else if ('!' == ch) 281 { 282 ch = mPage.getCharacter (mCursor); 283 if (Page.EOF == ch) 284 ret = makeString (start, mCursor.getPosition ()); 285 else 286 { 287 if ('>' == ch) ret = makeRemark (start, mCursor.getPosition ()); 289 else 290 { 291 mCursor.retreat (); if ('-' == ch) 293 ret = parseRemark (start, quotesmart); 294 else 295 { 296 mCursor.retreat (); ret = parseTag (start); 298 } 299 } 300 } 301 } 302 else 303 ret = parseString (start, quotesmart); 304 break; 305 default: 306 mCursor.retreat (); ret = parseString (start, quotesmart); 308 break; 309 } 310 311 return (ret); 312 } 313 314 318 protected void scanJIS (Cursor cursor) 319 throws 320 ParserException 321 { 322 boolean done; 323 char ch; 324 int state; 325 326 done = false; 327 state = 0; 328 while (!done) 329 { 330 ch = mPage.getCharacter (cursor); 331 if (Page.EOF == ch) 332 done = true; 333 else 334 switch (state) 335 { 336 case 0: 337 if (0x1b == ch) state = 1; 339 break; 340 case 1: 341 if ('(' == ch) 342 state = 2; 343 else 344 state = 0; 345 break; 346 case 2: 347 if ('J' == ch) 348 done = true; 349 else 350 state = 0; 351 break; 352 default: 353 throw new IllegalStateException ("how the fuck did we get in state " + state); 354 } 355 } 356 } 357 358 366 protected Node parseString (int start, boolean quotesmart) 367 throws 368 ParserException 369 { 370 boolean done; 371 char ch; 372 char quote; 373 374 done = false; 375 quote = 0; 376 while (!done) 377 { 378 ch = mPage.getCharacter (mCursor); 379 if (Page.EOF == ch) 380 done = true; 381 else if (0x1b == ch) { 383 ch = mPage.getCharacter (mCursor); 384 if (Page.EOF == ch) 385 done = true; 386 else if ('$' == ch) 387 { 388 ch = mPage.getCharacter (mCursor); 389 if (Page.EOF == ch) 390 done = true; 391 else if ('B' == ch) 392 scanJIS (mCursor); 393 else 394 { 395 mCursor.retreat (); 396 mCursor.retreat (); 397 } 398 } 399 else 400 mCursor.retreat (); 401 } 402 else if (quotesmart && (0 == quote) && (('\'' == ch) || ('"' == ch))) 403 quote = ch; else if (quotesmart && (0 != quote) && ('\\' == ch)) 406 { 407 ch = mPage.getCharacter (mCursor); if ((Page.EOF != ch) 409 && ('\\' != ch) && (ch != quote)) mCursor.retreat(); } 414 else if (quotesmart && (ch == quote)) 415 quote = 0; else if (quotesmart && (0 == quote) && (ch == '/')) 417 { 418 ch = mPage.getCharacter (mCursor); 421 if (Page.EOF == ch) 422 done = true; 423 else if ('/' == ch) 424 { 425 do 426 ch = mPage.getCharacter (mCursor); 427 while ((Page.EOF != ch) && ('\n' != ch)); 428 } 429 else if ('*' == ch) 430 { 431 do 432 { 433 do 434 ch = mPage.getCharacter (mCursor); 435 while ((Page.EOF != ch) && ('*' != ch)); 436 ch = mPage.getCharacter (mCursor); 437 if (ch == '*') 438 mCursor.retreat (); 439 } 440 while ((Page.EOF != ch) && ('/' != ch)); 441 } 442 else 443 mCursor.retreat (); 444 } 445 else if ((0 == quote) && ('<' == ch)) 446 { 447 ch = mPage.getCharacter (mCursor); 448 if (Page.EOF == ch) 449 done = true; 450 else if ('/' == ch || Character.isLetter (ch) || '!' == ch || '%' == ch) 452 { 453 done = true; 454 mCursor.retreat (); 455 mCursor.retreat (); 456 } 457 else 458 { 459 mCursor.retreat (); 461 } 462 } 463 } 464 465 return (makeString (start, mCursor.getPosition ())); 466 } 467 468 471 protected Node makeString (int start, int end) 472 throws 473 ParserException 474 { 475 int length; 476 Node ret; 477 478 length = end - start; 479 if (0 != length) 480 { ret = getNodeFactory ().createStringNode (this.getPage (), start, end); 482 } 483 else 484 ret = null; 485 486 return (ret); 487 } 488 489 private void whitespace (Vector attributes, int[] bookmarks) 490 { 491 if (bookmarks[1] > bookmarks[0]) 492 attributes.addElement (new PageAttribute (mPage, -1, -1, bookmarks[0], bookmarks[1], (char)0)); 493 } 494 495 private void standalone (Vector attributes, int[] bookmarks) 496 { 497 attributes.addElement (new PageAttribute (mPage, bookmarks[1], bookmarks[2], -1, -1, (char)0)); 498 } 499 500 private void empty (Vector attributes, int[] bookmarks) 501 { 502 attributes.addElement (new PageAttribute (mPage, bookmarks[1], bookmarks[2], bookmarks[2] + 1, -1, (char)0)); 503 } 504 505 private void naked (Vector attributes, int[] bookmarks) 506 { 507 attributes.addElement (new PageAttribute (mPage, bookmarks[1], bookmarks[2], bookmarks[3], bookmarks[4], (char)0)); 508 } 509 510 private void single_quote (Vector attributes, int[] bookmarks) 511 { 512 attributes.addElement (new PageAttribute (mPage, bookmarks[1], bookmarks[2], bookmarks[4] + 1, bookmarks[5], '\'')); 513 } 514 515 private void double_quote (Vector attributes, int[] bookmarks) 516 { 517 attributes.addElement (new PageAttribute (mPage, bookmarks[1], bookmarks[2], bookmarks[5] + 1, bookmarks[6], '"')); 518 } 519 520 581 protected Node parseTag (int start) 582 throws 583 ParserException 584 { 585 boolean done; 586 char ch; 587 int state; 588 int[] bookmarks; 589 Vector attributes; 590 591 done = false; 592 attributes = new Vector (); 593 state = 0; 594 bookmarks = new int[8]; 595 bookmarks[0] = mCursor.getPosition (); 596 while (!done) 597 { 598 bookmarks[state + 1] = mCursor.getPosition (); 599 ch = mPage.getCharacter (mCursor); 600 switch (state) 601 { 602 case 0: if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch)) 604 { 605 if ('<' == ch) 606 { 607 mCursor.retreat (); 609 bookmarks[state + 1] = mCursor.getPosition (); 610 } 611 whitespace (attributes, bookmarks); 612 done = true; 613 } 614 else if (!Character.isWhitespace (ch)) 615 { 616 whitespace (attributes, bookmarks); 617 state = 1; 618 } 619 break; 620 case 1: if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch)) 622 { 623 if ('<' == ch) 624 { 625 mCursor.retreat (); 627 bookmarks[state + 1] = mCursor.getPosition (); 628 } 629 standalone (attributes, bookmarks); 630 done = true; 631 } 632 else if (Character.isWhitespace (ch)) 633 { 634 bookmarks[6] = bookmarks[2]; state = 6; 638 } 639 else if ('=' == ch) 640 state = 2; 641 break; 642 case 2: if ((Page.EOF == ch) || ('>' == ch)) 644 { 645 empty (attributes, bookmarks); 646 done = true; 647 } 648 else if ('\'' == ch) 649 { 650 state = 4; 651 bookmarks[4] = bookmarks[3]; 652 } 653 else if ('"' == ch) 654 { 655 state = 5; 656 bookmarks[5] = bookmarks[3]; 657 } 658 else if (Character.isWhitespace (ch)) 659 { 660 } 664 else 665 state = 3; 666 break; 667 case 3: if ((Page.EOF == ch) || ('>' == ch)) 669 { 670 naked (attributes, bookmarks); 671 done = true; 672 } 673 else if (Character.isWhitespace (ch)) 674 { 675 naked (attributes, bookmarks); 676 bookmarks[0] = bookmarks[4]; 677 state = 0; 678 } 679 break; 680 case 4: if (Page.EOF == ch) 682 { 683 single_quote (attributes, bookmarks); 684 done = true; } 686 else if ('\'' == ch) 687 { 688 single_quote (attributes, bookmarks); 689 bookmarks[0] = bookmarks[5] + 1; 690 state = 0; 691 } 692 break; 693 case 5: if (Page.EOF == ch) 695 { 696 double_quote (attributes, bookmarks); 697 done = true; } 699 else if ('"' == ch) 700 { 701 double_quote (attributes, bookmarks); 702 bookmarks[0] = bookmarks[6] + 1; 703 state = 0; 704 } 705 break; 706 case 6: if (Page.EOF == ch) 712 { 713 standalone (attributes, bookmarks); 715 bookmarks[0]=bookmarks[6]; 716 mCursor.retreat(); 717 state=0; 718 } 719 else if (Character.isWhitespace (ch)) 720 { 721 } 723 else if ('=' == ch) { 725 bookmarks[2] = bookmarks[6]; 726 bookmarks[3] = bookmarks[7]; 727 state=2; 728 } 729 else 730 { 731 standalone (attributes, bookmarks); 737 bookmarks[0]=bookmarks[6]; 738 mCursor.retreat(); 739 state=0; 740 } 741 break; 742 default: 743 throw new IllegalStateException ("how the fuck did we get in state " + state); 744 } 745 } 746 747 return (makeTag (start, mCursor.getPosition (), attributes)); 748 } 749 750 753 protected Node makeTag (int start, int end, Vector attributes) 754 throws 755 ParserException 756 { 757 int length; 758 Node ret; 759 760 length = end - start; 761 if (0 != length) 762 { if (2 > length) 764 return (makeString (start, end)); 766 ret = getNodeFactory ().createTagNode (this.getPage (), start, end, attributes); 767 } 768 else 769 ret = null; 770 771 return (ret); 772 } 773 774 814 protected Node parseRemark (int start, boolean quotesmart) 815 throws 816 ParserException 817 { 818 boolean done; 819 char ch; 820 int state; 821 822 done = false; 823 state = 0; 824 while (!done) 825 { 826 ch = mPage.getCharacter (mCursor); 827 if (Page.EOF == ch) 828 done = true; 829 else 830 switch (state) 831 { 832 case 0: if ('>' == ch) 834 done = true; 835 if ('-' == ch) 836 state = 1; 837 else 838 return (parseString (start, quotesmart)); 839 break; 840 case 1: if ('-' == ch) 842 { 843 ch = mPage.getCharacter (mCursor); 845 if (Page.EOF == ch) 846 done = true; 847 else if ('>' == ch) 848 done = true; 849 else 850 { 851 mCursor.retreat (); 852 state = 2; 853 } 854 } 855 else 856 return (parseString (start, quotesmart)); 857 break; 858 case 2: if ('-' == ch) 860 state = 3; 861 else if (Page.EOF == ch) 862 return (parseString (start, quotesmart)); break; 864 case 3: if ('-' == ch) 866 state = 4; 867 else 868 state = 2; 869 break; 870 case 4: if ('>' == ch) 872 done = true; 873 else if (('!' == ch) || ('-' == ch) || Character.isWhitespace (ch)) 874 { 875 } 877 else 878 state = 2; 879 break; 880 default: 881 throw new IllegalStateException ("how the fuck did we get in state " + state); 882 } 883 } 884 885 return (makeRemark (start, mCursor.getPosition ())); 886 } 887 888 891 protected Node makeRemark (int start, int end) 892 throws 893 ParserException 894 { 895 int length; 896 Node ret; 897 898 length = end - start; 899 if (0 != length) 900 { if (2 > length) 902 return (makeString (start, end)); 904 ret = getNodeFactory ().createRemarkNode (this.getPage (), start, end); 905 } 906 else 907 ret = null; 908 909 return (ret); 910 } 911 912 918 protected Node parseJsp (int start) 919 throws 920 ParserException 921 { 922 boolean done; 923 char ch; 924 int state; 925 Vector attributes; 926 int code; 927 928 done = false; 929 state = 0; 930 code = 0; 931 attributes = new Vector (); 932 while (!done) 939 { 940 ch = mPage.getCharacter (mCursor); 941 switch (state) 942 { 943 case 0: switch (ch) 945 { 946 case '%': state = 1; 948 break; 949 default: 952 done = true; 953 break; 954 } 955 break; 956 case 1: switch (ch) 958 { 959 case Page.EOF: case '>': done = true; 962 break; 963 case '=': case '@': code = mCursor.getPosition (); 966 attributes.addElement (new PageAttribute (mPage, start + 1, code, -1, -1, (char)0)); 967 state = 2; 968 break; 969 default: code = mCursor.getPosition () - 1; 971 attributes.addElement (new PageAttribute (mPage, start + 1, code, -1, -1, (char)0)); 972 state = 2; 973 break; 974 } 975 break; 976 case 2: switch (ch) 978 { 979 case Page.EOF: case '>': done = true; 982 break; 983 case '\'': 984 case '"': state = ch; 986 break; 987 case '%': state = 3; 989 break; 990 default: break; 992 } 993 break; 994 case 3: 995 switch (ch) 996 { 997 case Page.EOF: done = true; 999 break; 1000 case '>': 1001 state = 4; 1002 done = true; 1003 break; 1004 default: state = 2; 1006 break; 1007 } 1008 break; 1009 case '"': 1010 switch (ch) 1011 { 1012 case Page.EOF: done = true; 1014 break; 1015 case '"': 1016 state = 2; 1017 break; 1018 default: break; 1020 } 1021 break; 1022 case '\'': 1023 switch (ch) 1024 { 1025 case Page.EOF: done = true; 1027 break; 1028 case '\'': 1029 state = 2; 1030 break; 1031 default: break; 1033 } 1034 break; 1035 default: 1036 throw new IllegalStateException ("how the fuck did we get in state " + state); 1037 } 1038 } 1039 1040 if (4 == state) { 1042 if (0 != code) 1043 { 1044 state = mCursor.getPosition () - 2; attributes.addElement (new PageAttribute (mPage, code, state, -1, -1, (char)0)); 1046 attributes.addElement (new PageAttribute (mPage, state, state + 1, -1, -1, (char)0)); 1047 } 1048 else 1049 throw new IllegalStateException ("jsp with no code!"); 1050 } 1051 else 1052 return (parseString (start, true)); 1054 return (makeTag (start, mCursor.getPosition (), attributes)); 1055 } 1056 1057 1073 public Node parseCDATA () 1074 throws 1075 ParserException 1076 { 1077 return (parseCDATA (false)); 1078 } 1079 1080 1090 public Node parseCDATA (boolean quotesmart) 1091 throws 1092 ParserException 1093 { 1094 int start; 1095 int state; 1096 boolean done; 1097 char quote; 1098 char ch; 1099 int end; 1100 1101 start = mCursor.getPosition (); 1102 state = 0; 1103 done = false; 1104 quote = 0; 1105 while (!done) 1106 { 1107 ch = mPage.getCharacter (mCursor); 1108 switch (state) 1109 { 1110 case 0: switch (ch) 1112 { 1113 case Page.EOF: 1114 done = true; 1115 break; 1116 case '\'': 1117 if (quotesmart) 1118 if (0 == quote) 1119 quote = '\''; else if ('\'' == quote) 1121 quote = 0; break; 1123 case '"': 1124 if (quotesmart) 1125 if (0 == quote) 1126 quote = '"'; else if ('"' == quote) 1128 quote = 0; break; 1130 case '\\': 1131 if (quotesmart) 1132 if (0 != quote) 1133 { 1134 ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) 1136 done = true; 1137 else if ( (ch != '\\') && (ch != quote)) 1138 mCursor.retreat (); } 1140 break; 1141 case '/': 1142 if (quotesmart) 1143 if (0 == quote) 1144 { 1145 ch = mPage.getCharacter (mCursor); 1147 if (Page.EOF == ch) 1148 done = true; 1149 else if ('/' == ch) 1150 { 1151 do 1152 ch = mPage.getCharacter (mCursor); 1153 while ((Page.EOF != ch) && ('\n' != ch)); 1154 } 1155 else if ('*' == ch) 1156 { 1157 do 1158 { 1159 do 1160 ch = mPage.getCharacter (mCursor); 1161 while ((Page.EOF != ch) && ('*' != ch)); 1162 ch = mPage.getCharacter (mCursor); 1163 if (ch == '*') 1164 mCursor.retreat (); 1165 } 1166 while ((Page.EOF != ch) && ('/' != ch)); 1167 } 1168 else 1169 mCursor.retreat (); 1170 } 1171 break; 1172 case '<': 1173 if (quotesmart) 1174 { 1175 if (0 == quote) 1176 state = 1; 1177 } 1178 else 1179 state = 1; 1180 break; 1181 default: 1182 break; 1183 } 1184 break; 1185 case 1: switch (ch) 1187 { 1188 case Page.EOF: 1189 done = true; 1190 break; 1191 case '/': 1192 state = 2; 1193 break; 1194 default: 1195 state = 0; 1196 break; 1197 } 1198 break; 1199 case 2: if (Page.EOF == ch) 1201 done = true; 1202 else if (Character.isLetter (ch)) 1203 { 1204 done = true; 1205 mCursor.retreat (); 1207 mCursor.retreat (); 1208 mCursor.retreat (); 1209 } 1210 else 1211 state = 0; 1212 break; 1213 default: 1214 throw new IllegalStateException ("how the fuck did we get in state " + state); 1215 } 1216 } 1217 end = mCursor.getPosition (); 1218 1219 return (makeString (start, end)); 1220 } 1221 1222 1226 1232 public Text createStringNode (Page page, int start, int end) 1233 { 1234 return (new TextNode (page, start, end)); 1235 } 1236 1237 1243 public Remark createRemarkNode (Page page, int start, int end) 1244 { 1245 return (new RemarkNode (page, start, end)); 1246 } 1247 1248 1259 public Tag createTagNode (Page page, int start, int end, Vector attributes) 1260 { 1261 return (new TagNode (page, start, end, attributes)); 1262 } 1263 1264 1267 public static void main (String [] args) 1268 throws 1269 MalformedURLException , 1270 IOException , 1271 ParserException 1272 { 1273 Lexer lexer; 1274 Node node; 1275 1276 if (0 >= args.length) 1277 System.out.println ("usage: java -jar htmllexer.jar <url>"); 1278 else 1279 { 1280 try 1281 { 1282 ConnectionManager manager = Page.getConnectionManager (); 1283 lexer = new Lexer (manager.openConnection (args[0])); 1284 while (null != (node = lexer.nextNode ())) 1285 System.out.println (node.toString ()); 1286 } 1287 catch (ParserException pe) 1288 { 1289 System.out.println (pe.getMessage ()); 1290 if (null != pe.getThrowable ()) 1291 System.out.println (pe.getThrowable ().getMessage ()); 1292 } 1293 } 1294 } 1295} 1296 | Popular Tags |