1 17 18 19 20 package org.apache.lenya.lucene.html; 21 22 import java.io.File ; 23 import java.io.FileInputStream ; 24 import java.io.FileNotFoundException ; 25 import java.io.IOException ; 26 import java.io.PipedReader ; 27 import java.io.PipedWriter ; 28 import java.io.Reader ; 29 30 31 34 public class HTMLParser implements HTMLParserConstants { 35 public static int SUMMARY_LENGTH = 200; 36 StringBuffer title = new StringBuffer (SUMMARY_LENGTH); 37 StringBuffer summary = new StringBuffer (SUMMARY_LENGTH * 2); 38 int length = 0; 39 boolean titleComplete = false; 40 boolean inTitle = false; 41 boolean inScript = false; 42 boolean afterTag = false; 43 boolean afterSpace = false; 44 String eol = System.getProperty("line.separator"); 45 PipedReader pipeIn = null; 46 PipedWriter pipeOut; 47 int MAX_WAIT = 1000; 48 public HTMLParserTokenManager token_source; 49 SimpleCharStream jj_input_stream; 50 public Token token; 51 public Token jj_nt; 52 private int jj_ntk; 53 private Token jj_scanpos; 54 private Token jj_lastpos; 55 private int jj_la; 56 public boolean lookingAhead = false; 57 private int jj_gen; 58 final private int[] jj_la1 = new int[13]; 59 final private int[] jj_la1_0 = { 60 0xb3e, 0xb3e, 0x1000, 0x38000, 0x2000, 0x8000, 0x10000, 0x20000, 0x3b000, 0x3b000, 0x800000, 61 0x2000000, 0x18, 62 }; 63 final private JJCalls[] jj_2_rtns = new JJCalls[2]; 64 private boolean jj_rescan = false; 65 private int jj_gc = 0; 66 private java.util.Vector jj_expentries = new java.util.Vector (); 67 private int[] jj_expentry; 68 private int jj_kind = -1; 69 private int[] jj_lasttokens = new int[100]; 70 private int jj_endpos; 71 72 78 public HTMLParser(File file) throws FileNotFoundException { 79 this(new FileInputStream (file)); 80 } 81 82 87 public HTMLParser(java.io.InputStream stream) { 88 jj_input_stream = new SimpleCharStream(stream, 1, 1); 89 token_source = new HTMLParserTokenManager(jj_input_stream); 90 token = new Token(); 91 jj_ntk = -1; 92 jj_gen = 0; 93 94 for (int i = 0; i < 13; i++) 95 jj_la1[i] = -1; 96 97 for (int i = 0; i < jj_2_rtns.length; i++) 98 jj_2_rtns[i] = new JJCalls(); 99 } 100 101 106 public HTMLParser(java.io.Reader stream) { 107 jj_input_stream = new SimpleCharStream(stream, 1, 1); 108 token_source = new HTMLParserTokenManager(jj_input_stream); 109 token = new Token(); 110 jj_ntk = -1; 111 jj_gen = 0; 112 113 for (int i = 0; i < 13; i++) 114 jj_la1[i] = -1; 115 116 for (int i = 0; i < jj_2_rtns.length; i++) 117 jj_2_rtns[i] = new JJCalls(); 118 } 119 120 125 public HTMLParser(HTMLParserTokenManager tm) { 126 token_source = tm; 127 token = new Token(); 128 jj_ntk = -1; 129 jj_gen = 0; 130 131 for (int i = 0; i < 13; i++) 132 jj_la1[i] = -1; 133 134 for (int i = 0; i < jj_2_rtns.length; i++) 135 jj_2_rtns[i] = new JJCalls(); 136 } 137 138 146 public String getTitle() throws IOException , InterruptedException { 147 if (pipeIn == null) { 148 getReader(); } 150 151 int elapsedMillis = 0; 152 153 while (true) { 154 synchronized (this) { 155 if (titleComplete || (length > SUMMARY_LENGTH)) { 156 break; 157 } 158 159 wait(10); 160 161 elapsedMillis = elapsedMillis + 10; 162 163 if (elapsedMillis > MAX_WAIT) { 164 break; 165 } 166 } 167 } 168 169 return title.toString().trim(); 170 } 171 172 180 public String getKeywords() throws IOException , InterruptedException { 181 return ""; 182 } 183 184 192 public String getSummary() throws IOException , InterruptedException { 193 System.out.println("HTMLParser().getSummary()"); 194 195 if (pipeIn == null) { 196 getReader(); } 198 199 int elapsedMillis = 0; 200 201 while (true) { 202 synchronized (this) { 203 if (summary.length() >= SUMMARY_LENGTH) { 204 break; 205 } 206 207 wait(10); 208 209 elapsedMillis = elapsedMillis + 10; 210 211 if (elapsedMillis > MAX_WAIT) { 212 break; 213 } 214 } 215 } 216 217 if (summary.length() > SUMMARY_LENGTH) { 218 summary.setLength(SUMMARY_LENGTH); 219 } 220 221 String sum = summary.toString().trim(); 222 String tit = getTitle(); 223 224 if (sum.startsWith(tit)) { 225 return sum; 226 } else { 227 return sum; 228 } 229 } 230 231 238 public Reader getReader() throws IOException { 239 if (pipeIn == null) { 240 pipeIn = new PipedReader (); 241 pipeOut = new PipedWriter (pipeIn); 242 243 Thread thread = new ParserThread(this); 244 thread.start(); } 246 247 return pipeIn; 248 } 249 250 void addToSummary(String text) { 251 if (summary.length() < SUMMARY_LENGTH) { 252 summary.append(text); 253 254 if (summary.length() >= SUMMARY_LENGTH) { 255 synchronized (this) { 256 notifyAll(); 257 } 258 } 259 } 260 } 261 262 void addToTitle(String text) { 263 title.append(text); 264 } 265 266 void addText(String text) throws IOException { 267 if (inScript) { 268 return; 269 } 270 271 if (inTitle) { 272 addToTitle(text); 273 } else { 274 addToSummary(text); 275 276 if (!titleComplete && !title.equals("")) { 278 synchronized (this) { 279 titleComplete = true; notifyAll(); 281 } 282 } 283 } 284 285 length += text.length(); 286 pipeOut.write(text); 287 288 afterSpace = false; 289 } 290 291 void addSpace() throws IOException { 292 if (inScript) { 293 return; 294 } 295 296 if (!afterSpace) { 297 if (inTitle) { 298 addToTitle(" "); 299 } else { 300 addToSummary(" "); 301 } 302 303 String space = afterTag ? eol : " "; 304 length += space.length(); 305 pipeOut.write(space); 306 afterSpace = true; 307 } 308 } 309 310 316 final public void HTMLDocument() throws ParseException, IOException { 317 Token t; 318 label_1: 319 while (true) { 320 switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) { 321 case TagName: 322 case DeclName: 323 case Comment1: 324 case Comment2: 325 case Word: 326 case Entity: 327 case Space: 328 case Punct: 329 330 break; 331 332 default: 333 jj_la1[0] = jj_gen; 334 335 break label_1; 336 } 337 338 switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) { 339 case TagName: 340 Tag(); 341 afterTag = true; 342 343 break; 344 345 case DeclName: 346 t = Decl(); 347 afterTag = true; 348 349 break; 350 351 case Comment1: 352 case Comment2: 353 CommentTag(); 354 afterTag = true; 355 356 break; 357 358 case Word: 359 t = jj_consume_token(Word); 360 addText(t.image); 361 afterTag = false; 362 363 break; 364 365 case Entity: 366 t = jj_consume_token(Entity); 367 addText(Entities.decode(t.image)); 368 afterTag = false; 369 370 break; 371 372 case Punct: 373 t = jj_consume_token(Punct); 374 addText(t.image); 375 afterTag = false; 376 377 break; 378 379 case Space: 380 jj_consume_token(Space); 381 addSpace(); 382 afterTag = false; 383 384 break; 385 386 default: 387 jj_la1[1] = jj_gen; 388 jj_consume_token(-1); 389 throw new ParseException(); 390 } 391 } 392 393 jj_consume_token(0); 394 } 395 396 402 final public void Tag() throws ParseException, IOException { 403 Token t1; 404 Token t2; 405 boolean inImg = false; 406 t1 = jj_consume_token(TagName); 407 inTitle = t1.image.equalsIgnoreCase("<title"); inImg = t1.image.equalsIgnoreCase("<img"); 410 if (inScript) { inScript = !t1.image.equalsIgnoreCase("</script"); 412 } else { 413 inScript = t1.image.equalsIgnoreCase("<script"); 414 } 415 416 label_2: 417 while (true) { 418 switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) { 419 case ArgName: 420 421 break; 422 423 default: 424 jj_la1[2] = jj_gen; 425 426 break label_2; 427 } 428 429 t1 = jj_consume_token(ArgName); 430 431 switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) { 432 case ArgEquals: 433 jj_consume_token(ArgEquals); 434 435 switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) { 436 case ArgValue: 437 case ArgQuote1: 438 case ArgQuote2: 439 t2 = ArgValue(); 440 441 if (inImg && t1.image.equalsIgnoreCase("alt") && (t2 != null)) { 442 addText("[" + t2.image + "]"); 443 } 444 445 break; 446 447 default: 448 jj_la1[3] = jj_gen; 449 } 450 451 break; 452 453 default: 454 jj_la1[4] = jj_gen; 455 } 456 } 457 458 jj_consume_token(TagEnd); 459 } 460 461 469 final public Token ArgValue() throws ParseException { 470 Token t = null; 471 472 switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) { 473 case ArgValue: 474 t = jj_consume_token(ArgValue); 475 { 476 if (true) { 477 return t; 478 } 479 } 480 481 break; 482 483 default: 484 jj_la1[5] = jj_gen; 485 486 if (jj_2_1(2)) { 487 jj_consume_token(ArgQuote1); 488 jj_consume_token(CloseQuote1); 489 490 if (true) { 491 return t; 492 } 493 } else { 494 switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) { 495 case ArgQuote1: 496 jj_consume_token(ArgQuote1); 497 t = jj_consume_token(Quote1Text); 498 jj_consume_token(CloseQuote1); 499 { 500 if (true) { 501 return t; 502 } 503 } 504 505 break; 506 507 default: 508 jj_la1[6] = jj_gen; 509 510 if (jj_2_2(2)) { 511 jj_consume_token(ArgQuote2); 512 jj_consume_token(CloseQuote2); 513 514 if (true) { 515 return t; 516 } 517 } else { 518 switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) { 519 case ArgQuote2: 520 jj_consume_token(ArgQuote2); 521 t = jj_consume_token(Quote2Text); 522 jj_consume_token(CloseQuote2); 523 { 524 if (true) { 525 return t; 526 } 527 } 528 529 break; 530 531 default: 532 jj_la1[7] = jj_gen; 533 jj_consume_token(-1); 534 throw new ParseException(); 535 } 536 } 537 } 538 } 539 } 540 541 throw new Error ("Missing return statement in function"); 542 } 543 544 552 final public Token Decl() throws ParseException { 553 Token t; 554 t = jj_consume_token(DeclName); 555 label_3: 556 while (true) { 557 switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) { 558 case ArgName: 559 case ArgEquals: 560 case ArgValue: 561 case ArgQuote1: 562 case ArgQuote2: 563 564 break; 565 566 default: 567 jj_la1[8] = jj_gen; 568 569 break label_3; 570 } 571 572 switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) { 573 case ArgName: 574 jj_consume_token(ArgName); 575 576 break; 577 578 case ArgValue: 579 case ArgQuote1: 580 case ArgQuote2: 581 ArgValue(); 582 583 break; 584 585 case ArgEquals: 586 jj_consume_token(ArgEquals); 587 588 break; 589 590 default: 591 jj_la1[9] = jj_gen; 592 jj_consume_token(-1); 593 throw new ParseException(); 594 } 595 } 596 597 jj_consume_token(TagEnd); 598 599 if (true) { 600 return t; 601 } 602 603 throw new Error ("Missing return statement in function"); 604 } 605 606 611 final public void CommentTag() throws ParseException { 612 switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) { 613 case Comment1: 614 jj_consume_token(Comment1); 615 label_4: 616 while (true) { 617 switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) { 618 case CommentText1: 619 620 break; 621 622 default: 623 jj_la1[10] = jj_gen; 624 625 break label_4; 626 } 627 628 jj_consume_token(CommentText1); 629 } 630 631 jj_consume_token(CommentEnd1); 632 633 break; 634 635 case Comment2: 636 jj_consume_token(Comment2); 637 label_5: 638 while (true) { 639 switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) { 640 case CommentText2: 641 642 break; 643 644 default: 645 jj_la1[11] = jj_gen; 646 647 break label_5; 648 } 649 650 jj_consume_token(CommentText2); 651 } 652 653 jj_consume_token(CommentEnd2); 654 655 break; 656 657 default: 658 jj_la1[12] = jj_gen; 659 jj_consume_token(-1); 660 throw new ParseException(); 661 } 662 } 663 664 final private boolean jj_2_1(int xla) { 665 jj_la = xla; 666 jj_lastpos = jj_scanpos = token; 667 668 boolean retval = !jj_3_1(); 669 jj_save(0, xla); 670 671 return retval; 672 } 673 674 final private boolean jj_2_2(int xla) { 675 jj_la = xla; 676 jj_lastpos = jj_scanpos = token; 677 678 boolean retval = !jj_3_2(); 679 jj_save(1, xla); 680 681 return retval; 682 } 683 684 final private boolean jj_3_1() { 685 if (jj_scan_token(ArgQuote1)) { 686 return true; 687 } 688 689 if ((jj_la == 0) && (jj_scanpos == jj_lastpos)) { 690 return false; 691 } 692 693 if (jj_scan_token(CloseQuote1)) { 694 return true; 695 } 696 697 if ((jj_la == 0) && (jj_scanpos == jj_lastpos)) { 698 return false; 699 } 700 701 return false; 702 } 703 704 final private boolean jj_3_2() { 705 if (jj_scan_token(ArgQuote2)) { 706 return true; 707 } 708 709 if ((jj_la == 0) && (jj_scanpos == jj_lastpos)) { 710 return false; 711 } 712 713 if (jj_scan_token(CloseQuote2)) { 714 return true; 715 } 716 717 if ((jj_la == 0) && (jj_scanpos == jj_lastpos)) { 718 return false; 719 } 720 721 return false; 722 } 723 724 729 public void ReInit(java.io.InputStream stream) { 730 jj_input_stream.ReInit(stream, 1, 1); 731 token_source.ReInit(jj_input_stream); 732 token = new Token(); 733 jj_ntk = -1; 734 jj_gen = 0; 735 736 for (int i = 0; i < 13; i++) 737 jj_la1[i] = -1; 738 739 for (int i = 0; i < jj_2_rtns.length; i++) 740 jj_2_rtns[i] = new JJCalls(); 741 } 742 743 748 public void ReInit(java.io.Reader stream) { 749 jj_input_stream.ReInit(stream, 1, 1); 750 token_source.ReInit(jj_input_stream); 751 token = new Token(); 752 jj_ntk = -1; 753 jj_gen = 0; 754 755 for (int i = 0; i < 13; i++) 756 jj_la1[i] = -1; 757 758 for (int i = 0; i < jj_2_rtns.length; i++) 759 jj_2_rtns[i] = new JJCalls(); 760 } 761 762 767 public void ReInit(HTMLParserTokenManager tm) { 768 token_source = tm; 769 token = new Token(); 770 jj_ntk = -1; 771 jj_gen = 0; 772 773 for (int i = 0; i < 13; i++) 774 jj_la1[i] = -1; 775 776 for (int i = 0; i < jj_2_rtns.length; i++) 777 jj_2_rtns[i] = new JJCalls(); 778 } 779 780 final private Token jj_consume_token(int kind) throws ParseException { 781 Token oldToken; 782 783 if ((oldToken = token).next != null) { 784 token = token.next; 785 } else { 786 token = token.next = token_source.getNextToken(); 787 } 788 789 jj_ntk = -1; 790 791 if (token.kind == kind) { 792 jj_gen++; 793 794 if (++jj_gc > 100) { 795 jj_gc = 0; 796 797 for (int i = 0; i < jj_2_rtns.length; i++) { 798 JJCalls c = jj_2_rtns[i]; 799 800 while (c != null) { 801 if (c.gen < jj_gen) { 802 c.first = null; 803 } 804 805 c = c.next; 806 } 807 } 808 } 809 810 return token; 811 } 812 813 token = oldToken; 814 jj_kind = kind; 815 throw generateParseException(); 816 } 817 818 final private boolean jj_scan_token(int kind) { 819 if (jj_scanpos == jj_lastpos) { 820 jj_la--; 821 822 if (jj_scanpos.next == null) { 823 jj_lastpos = jj_scanpos = jj_scanpos.next = token_source.getNextToken(); 824 } else { 825 jj_lastpos = jj_scanpos = jj_scanpos.next; 826 } 827 } else { 828 jj_scanpos = jj_scanpos.next; 829 } 830 831 if (jj_rescan) { 832 int i = 0; 833 Token tok = token; 834 835 while ((tok != null) && (tok != jj_scanpos)) { 836 i++; 837 tok = tok.next; 838 } 839 840 if (tok != null) { 841 jj_add_error_token(kind, i); 842 } 843 } 844 845 return (jj_scanpos.kind != kind); 846 } 847 848 853 final public Token getNextToken() { 854 if (token.next != null) { 855 token = token.next; 856 } else { 857 token = token.next = token_source.getNextToken(); 858 } 859 860 jj_ntk = -1; 861 jj_gen++; 862 863 return token; 864 } 865 866 873 final public Token getToken(int index) { 874 Token t = lookingAhead ? jj_scanpos : token; 875 876 for (int i = 0; i < index; i++) { 877 if (t.next != null) { 878 t = t.next; 879 } else { 880 t = t.next = token_source.getNextToken(); 881 } 882 } 883 884 return t; 885 } 886 887 final private int jj_ntk() { 888 if ((jj_nt = token.next) == null) { 889 return (jj_ntk = (token.next = token_source.getNextToken()).kind); 890 } else { 891 return (jj_ntk = jj_nt.kind); 892 } 893 } 894 895 private void jj_add_error_token(int kind, int pos) { 896 if (pos >= 100) { 897 return; 898 } 899 900 if (pos == (jj_endpos + 1)) { 901 jj_lasttokens[jj_endpos++] = kind; 902 } else if (jj_endpos != 0) { 903 jj_expentry = new int[jj_endpos]; 904 905 for (int i = 0; i < jj_endpos; i++) { 906 jj_expentry[i] = jj_lasttokens[i]; 907 } 908 909 boolean exists = false; 910 911 for (java.util.Enumeration elenum = jj_expentries.elements(); elenum.hasMoreElements();) { 912 int[] oldentry = (int[]) (elenum.nextElement()); 913 914 if (oldentry.length == jj_expentry.length) { 915 exists = true; 916 917 for (int i = 0; i < jj_expentry.length; i++) { 918 if (oldentry[i] != jj_expentry[i]) { 919 exists = false; 920 921 break; 922 } 923 } 924 925 if (exists) { 926 break; 927 } 928 } 929 } 930 931 if (!exists) { 932 jj_expentries.addElement(jj_expentry); 933 } 934 935 if (pos != 0) { 936 jj_lasttokens[(jj_endpos = pos) - 1] = kind; 937 } 938 } 939 } 940 941 946 final public ParseException generateParseException() { 947 jj_expentries.removeAllElements(); 948 949 boolean[] la1tokens = new boolean[27]; 950 951 for (int i = 0; i < 27; i++) { 952 la1tokens[i] = false; 953 } 954 955 if (jj_kind >= 0) { 956 la1tokens[jj_kind] = true; 957 jj_kind = -1; 958 } 959 960 for (int i = 0; i < 13; i++) { 961 if (jj_la1[i] == jj_gen) { 962 for (int j = 0; j < 32; j++) { 963 if ((jj_la1_0[i] & (1 << j)) != 0) { 964 la1tokens[j] = true; 965 } 966 } 967 } 968 } 969 970 for (int i = 0; i < 27; i++) { 971 if (la1tokens[i]) { 972 jj_expentry = new int[1]; 973 jj_expentry[0] = i; 974 jj_expentries.addElement(jj_expentry); 975 } 976 } 977 978 jj_endpos = 0; 979 jj_rescan_token(); 980 jj_add_error_token(0, 0); 981 982 int[][] exptokseq = new int[jj_expentries.size()][]; 983 984 for (int i = 0; i < jj_expentries.size(); i++) { 985 exptokseq[i] = (int[]) jj_expentries.elementAt(i); 986 } 987 988 return new ParseException(token, exptokseq, tokenImage); 989 } 990 991 994 final public void enable_tracing() { 995 } 996 997 1000 final public void disable_tracing() { 1001 } 1002 1003 final private void jj_rescan_token() { 1004 jj_rescan = true; 1005 1006 for (int i = 0; i < 2; i++) { 1007 JJCalls p = jj_2_rtns[i]; 1008 1009 do { 1010 if (p.gen > jj_gen) { 1011 jj_la = p.arg; 1012 jj_lastpos = jj_scanpos = p.first; 1013 1014 switch (i) { 1015 case 0: 1016 jj_3_1(); 1017 1018 break; 1019 1020 case 1: 1021 jj_3_2(); 1022 1023 break; 1024 } 1025 } 1026 1027 p = p.next; 1028 } while (p != null); 1029 } 1030 1031 jj_rescan = false; 1032 } 1033 1034 final private void jj_save(int index, int xla) { 1035 JJCalls p = jj_2_rtns[index]; 1036 1037 while (p.gen > jj_gen) { 1038 if (p.next == null) { 1039 p = p.next = new JJCalls(); 1040 1041 break; 1042 } 1043 1044 p = p.next; 1045 } 1046 1047 p.gen = (jj_gen + xla) - jj_la; 1048 p.first = token; 1049 p.arg = xla; 1050 } 1051 1052 static final class JJCalls { 1053 int gen; 1054 Token first; 1055 int arg; 1056 JJCalls next; 1057 } 1058} 1059 | Popular Tags |