package org.apache.lucene.demo.html;

import java.io.*;
import java.util.Locale;
import java.util.Properties;

/**
 * Tokenizes an HTML stream and exposes its title, a short summary, its
 * {@code <meta>} name/content pairs and its plain text.
 *
 * <p>The plain text is written by a background {@code ParserThread} into a
 * pipe; consumers read it through {@link #getReader()}. The accessors
 * {@link #getTitle()}, {@link #getMetaTags()} and {@link #getSummary()} start
 * that thread on demand and poll until enough input has been parsed or the
 * pipe is full.
 *
 * <p>NOTE(review): the {@code jj_*} members and the production methods
 * ({@code HTMLDocument}, {@code Tag}, ...) follow the layout of a
 * JavaCC-generated parser; presumably they are regenerated from a grammar
 * file, so confirm before hand-editing them.
 */
public class HTMLParser implements HTMLParserConstants {
  /** Maximum number of characters kept in the summary (including the trailing "..."). */
  public static int SUMMARY_LENGTH = 175;

  StringBuffer title = new StringBuffer();
  StringBuffer summary = new StringBuffer();
  Properties metaTags = new Properties();
  // Pending halves of a <meta> pair; addMetaTag() stores and clears both.
  String currentMetaTag = null;
  String currentMetaContent = null;
  // Number of characters of text/whitespace emitted through addText/addSpace.
  int length = 0;
  boolean titleComplete = false;
  boolean inTitle = false;
  boolean inHeading = false;
  boolean inMetaTag = false;
  boolean inIgnoredTag = false;
  boolean afterTag = false;
  boolean afterSpace = false;
  String eol = System.getProperty("line.separator");
  // Reading end of the text pipe; null until getReader() has been called.
  Reader pipeIn = null;
  // Writing end of the text pipe, fed by addText/addSpace/addMetaTag.
  Writer pipeOut;
  private MyPipedInputStream pipeInStream = null;
  private PipedOutputStream pipeOutStream = null;

  /** Piped input stream that can report whether its buffer is full. */
  private class MyPipedInputStream extends PipedInputStream {

    public MyPipedInputStream() {
      super();
    }

    public MyPipedInputStream(PipedOutputStream src) throws IOException {
      super(src);
    }

    /** Returns true when at least {@code PIPE_SIZE} bytes are waiting to be read. */
    public boolean full() throws IOException {
      return this.available() >= PipedInputStream.PIPE_SIZE;
    }
  }

  /**
   * Creates a parser reading from the given file.
   *
   * @param file the HTML file to parse
   * @throws FileNotFoundException if the file cannot be opened
   */
  public HTMLParser(File file) throws FileNotFoundException {
    this(new FileInputStream(file));
  }

  /**
   * Returns the trimmed document title, starting the parser thread if needed
   * and polling until the title is complete or the text pipe is full.
   *
   * @return the title collected so far, trimmed
   * @throws IOException if querying the pipe fails
   * @throws InterruptedException if the calling thread is interrupted while waiting
   */
  public String getTitle() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();                                // starts the parser thread
    while (true) {
      synchronized(this) {
        if (titleComplete || pipeInStream.full())
          break;
        wait(10);
      }
    }
    return title.toString().trim();
  }

  /**
   * Returns the {@code <meta>} name/content pairs seen so far, using the same
   * wait condition as {@link #getTitle()}.
   *
   * @return the live {@link Properties} object holding the meta tags
   * @throws IOException if querying the pipe fails
   * @throws InterruptedException if the calling thread is interrupted while waiting
   */
  public Properties getMetaTags() throws IOException,
InterruptedException {
    if (pipeIn == null)
      getReader();                                // starts the parser thread
    while (true) {
      synchronized(this) {
        if (titleComplete || pipeInStream.full())
          break;
        wait(10);
      }
    }
    return metaTags;
  }

  /**
   * Returns the trimmed summary, starting the parser thread if needed and
   * polling until the summary holds {@link #SUMMARY_LENGTH} characters or the
   * text pipe is full.
   *
   * @return the summary collected so far, trimmed
   * @throws IOException if querying the pipe fails
   * @throws InterruptedException if the calling thread is interrupted while waiting
   */
  public String getSummary() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();                                // starts the parser thread
    while (true) {
      synchronized(this) {
        if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
          break;
        wait(10);
      }
    }
    return summary.toString().trim();
  }

  /**
   * Returns a reader over the document's plain text. On the first call this
   * creates the pipe (both ends use UTF-16BE) and starts a {@code ParserThread}
   * for this parser; later calls return the same reader.
   *
   * @return the reading end of the text pipe
   * @throws IOException if the pipe cannot be created
   */
  public Reader getReader() throws IOException {
    if (pipeIn == null) {
      pipeInStream = new MyPipedInputStream();
      pipeOutStream = new PipedOutputStream(pipeInStream);
      pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
      pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");

      Thread thread = new ParserThread(this);
      thread.start();                             // start parsing
    }

    return pipeIn;
  }

  /**
   * Appends text to the summary while it is shorter than
   * {@link #SUMMARY_LENGTH}. A summary that merely repeats the title is
   * discarded; an over-long summary is cut and terminated with "...", and
   * waiting threads are notified.
   */
  void addToSummary(String text) {
    if (summary.length() < SUMMARY_LENGTH) {
      summary.append(text);
      // Cheap length comparison first, then the full comparison.
      if (summary.length() == title.length() &&
          summary.toString().equals(title.toString())) {
        summary.setLength(0);
      }
      if (summary.length() >= SUMMARY_LENGTH) {
        // Leave room for the ellipsis.
        summary.setLength(SUMMARY_LENGTH - 3);
        summary.append("...");
        synchronized(this) {
          notifyAll();
        }
      }
    }
  }

  /**
   * Routes a piece of document text to the title or the summary and writes it
   * to the text pipe. Text inside an ignored tag is dropped.
   *
   * @throws IOException if writing to the pipe fails
   */
  void addText(String text) throws IOException {
    if (inIgnoredTag)
      return;
    if (inTitle)
      title.append(text);
    else {
      // Heading text only joins a summary that has already started.
      if (!inHeading || summary.length() > 0) {
        addToSummary(text);
      }
      // The title is finished once it is non-empty and body text begins.
      // (StringBuffer does not override equals(), so the previous
      // title.equals("") test was always false.)
      // NOTE(review): for documents without a title, titleComplete is now
      // left to whoever else sets it (presumably ParserThread on completion)
      // -- confirm.
      if (!titleComplete && title.length() > 0) {
        synchronized(this) {
          titleComplete = true;                   // tell waiting threads
          notifyAll();
        }
      }
    }

    length += text.length();
    pipeOut.write(text);

    afterSpace = false;
  }

  /**
   * Stores the pending meta name/content pair, writes the content of a
   * "keywords" tag to the text pipe, and clears the pending pair.
   *
   * @throws IOException if writing to the pipe fails
   */
  void addMetaTag() throws IOException {
    metaTags.setProperty(currentMetaTag, currentMetaContent);
    if (currentMetaTag.equalsIgnoreCase("keywords")) {
      pipeOut.write(currentMetaContent);
    }
    currentMetaTag = null;
    currentMetaContent = null;
    return;
  }

  /**
   * Emits a single separator unless one was just emitted: a space for the
   * title or a non-empty summary, and either the line separator (directly
   * after a tag) or a space for the text pipe.
   *
   * @throws IOException if writing to the pipe fails
   */
  void addSpace() throws IOException {
    if (!afterSpace) {
      if (inTitle)
        title.append(" ");
      else if (summary.length() > 0)
        addToSummary(" ");

      String space = afterTag ? eol : " ";
      length += space.length();
      pipeOut.write(space);
      afterSpace = true;
    }
  }

  /**
   * Parses a whole document: a sequence of tags, declarations, comments,
   * scripts, words, entities, punctuation and whitespace, followed by the end
   * of input (token kind 0).
   */
  final public void HTMLDocument() throws ParseException, IOException {
    Token t;
    label_1:
    while (true) {
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case ScriptStart:
      case TagName:
      case DeclName:
      case Comment1:
      case Comment2:
      case Word:
      case Space:
      case Entity:
      case Punct:
        ;
        break;
      default:
        jj_la1[0] = jj_gen;
        break label_1;
      }
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case TagName:
        Tag();
        afterTag = true;
        break;
      case DeclName:
        t = Decl();
        afterTag = true;
        break;
      case Comment1:
      case Comment2:
        CommentTag();
        afterTag = true;
        break;
      case ScriptStart:
        ScriptTag();
        afterTag = true;
        break;
      case Word:
        t = jj_consume_token(Word);
        addText(t.image); afterTag = false;
        break;
      case Entity:
        t = jj_consume_token(Entity);
        addText(Entities.decode(t.image)); afterTag = false;
        break;
      case Punct:
        t = jj_consume_token(Punct);
        addText(t.image); afterTag = false;
        break;
      case Space:
        jj_consume_token(Space);
        addSpace(); afterTag = false;
        break;
      default:
        jj_la1[1] = jj_gen;
        jj_consume_token(-1);
        throw new ParseException();
      }
    }
    jj_consume_token(0);
  }

  /**
   * Parses one tag with its attributes and updates the tag-state flags.
   * An {@code alt} attribute on {@code <img>} is emitted as "[alt text]";
   * {@code name}/{@code http-equiv} and {@code content} attributes on
   * {@code <meta>} are collected and stored once both halves are present.
   */
  final public void Tag() throws ParseException, IOException {
    Token t1, t2;
    boolean inImg = false;
    t1 = jj_consume_token(TagName);
    // Locale-independent lower-casing: under e.g. a Turkish default locale
    // "<TITLE".toLowerCase() would not equal "<title".
    String tagName = t1.image.toLowerCase(Locale.ENGLISH);
    if (Tags.WS_ELEMS.contains(tagName)) {
      addSpace();
    }
    inTitle = tagName.equals("<title");           // keep track if in <TITLE>
    inHeading = tagName.startsWith("<h")          // <h1> .. <h9>
        && tagName.length() == 3
        && Character.isDigit(tagName.charAt(2));
    inMetaTag = tagName.equals("<meta");          // keep track if in <META>
    inIgnoredTag = tagName.equals("<style")       // text inside is dropped by addText
        || tagName.equals("<script");
    inImg = tagName.equals("<img");               // keep track if in <IMG>

    label_2:
    while (true) {
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case ArgName:
        ;
        break;
      default:
        jj_la1[2] = jj_gen;
        break label_2;
      }
      t1 = jj_consume_token(ArgName);
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case ArgEquals:
        jj_consume_token(ArgEquals);
        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
        case ArgValue:
        case ArgQuote1:
        case ArgQuote2:
          t2 = ArgValue();
          if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
            addText("[" + t2.image + "]");

          if (inMetaTag &&
              ( t1.image.equalsIgnoreCase("name") ||
                t1.image.equalsIgnoreCase("HTTP-EQUIV") )
              && t2 != null)
          {
            currentMetaTag = t2.image.toLowerCase(Locale.ENGLISH);
            if (currentMetaTag != null && currentMetaContent != null) {
              addMetaTag();
            }
          }
          if (inMetaTag && t1.image.equalsIgnoreCase("content") && t2 != null)
          {
            currentMetaContent = t2.image.toLowerCase(Locale.ENGLISH);
            if (currentMetaTag != null && currentMetaContent != null) {
              addMetaTag();
            }
          }
          break;
        default:
          jj_la1[3] = jj_gen;
          ;
        }
        break;
      default:
        jj_la1[4] = jj_gen;
        ;
      }
    }
    jj_consume_token(TagEnd);
  }

  /**
   * Parses an attribute value: a bare value, or a single- or double-quoted
   * string.
   *
   * @return the value token, or null for an empty quoted string
   */
  final public Token ArgValue() throws ParseException {
    Token t = null;
    // The "{if (true) return t;}" form keeps the following "break" reachable
    // for the compiler.
    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
    case ArgValue:
      t = jj_consume_token(ArgValue);
      {if (true) return t;}
      break;
    default:
      jj_la1[5] = jj_gen;
      if (jj_2_1(2)) {
        // Empty quoted value: t stays null.
        jj_consume_token(ArgQuote1);
        jj_consume_token(CloseQuote1);
        {if (true) return t;}
      } else {
        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
        case ArgQuote1:
          jj_consume_token(ArgQuote1);
          t = jj_consume_token(Quote1Text);
          jj_consume_token(CloseQuote1);
          {if (true) return t;}
          break;
        default:
          jj_la1[6] = jj_gen;
          if (jj_2_2(2)) {
            // Empty quoted value: t stays null.
            jj_consume_token(ArgQuote2);
            jj_consume_token(CloseQuote2);
            {if (true) return t;}
          } else {
            switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
            case ArgQuote2:
              jj_consume_token(ArgQuote2);
              t = jj_consume_token(Quote2Text);
              jj_consume_token(CloseQuote2);
              {if (true) return t;}
              break;
            default:
              jj_la1[7] = jj_gen;
              jj_consume_token(-1);
              throw new ParseException();
            }
          }
        }
      }
    }
    throw new Error("Missing return statement in function");
  }

  /**
   * Parses a declaration: a {@code DeclName} token, any mix of argument
   * names, values and equals signs, then the tag end.
   *
   * @return the {@code DeclName} token
   */
  final public Token Decl() throws ParseException {
    Token t;
    t = jj_consume_token(DeclName);
    label_3:
    while (true) {
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case ArgName:
      case ArgEquals:
      case ArgValue:
      case ArgQuote1:
      case ArgQuote2:
        ;
        break;
      default:
        jj_la1[8] = jj_gen;
        break label_3;
      }
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case ArgName:
        jj_consume_token(ArgName);
        break;
      case ArgValue:
      case ArgQuote1:
      case ArgQuote2:
        ArgValue();
        break;
      case ArgEquals:
        jj_consume_token(ArgEquals);
        break;
      default:
        jj_la1[9] = jj_gen;
        jj_consume_token(-1);
        throw new ParseException();
      }
    }
    jj_consume_token(TagEnd);
    {if (true) return t;}
    throw new Error("Missing return statement in function");
  }

  /** Parses and discards a comment in either of the two comment forms. */
  final public void CommentTag() throws ParseException {
    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
    case Comment1:
      jj_consume_token(Comment1);
      label_4:
      while (true) {
        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
        case CommentText1:
          ;
          break;
        default:
          jj_la1[10] = jj_gen;
          break label_4;
        }
        jj_consume_token(CommentText1);
      }
      jj_consume_token(CommentEnd1);
      break;
    case Comment2:
      jj_consume_token(Comment2);
      label_5:
      while (true) {
        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
        case CommentText2:
          ;
          break;
        default:
          jj_la1[11] = jj_gen;
          break label_5;
        }
        jj_consume_token(CommentText2);
      }
      jj_consume_token(CommentEnd2);
      break;
    default:
      jj_la1[12] = jj_gen;
      jj_consume_token(-1);
      throw new ParseException();
    }
  }

  /** Parses and discards a script block: start token, script text, end token. */
  final public void ScriptTag() throws ParseException {
    jj_consume_token(ScriptStart);
    label_6:
    while (true) {
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case ScriptText:
        ;
        break;
      default:
        jj_la1[13] = jj_gen;
        break label_6;
      }
      jj_consume_token(ScriptText);
    }
    jj_consume_token(ScriptEnd);
  }

  /** Lookahead of up to {@code xla} tokens for an empty single-quoted value. */
  final private boolean jj_2_1(int xla) {
    jj_la = xla; jj_lastpos = jj_scanpos = token;
    try { return !jj_3_1(); }
    catch(LookaheadSuccess ls) { return true; }
    finally { jj_save(0, xla); }
  }

  /** Lookahead of up to {@code xla} tokens for an empty double-quoted value. */
  final private boolean jj_2_2(int xla) {
    jj_la = xla; jj_lastpos = jj_scanpos = token;
    try { return !jj_3_2(); }
    catch(LookaheadSuccess ls) { return true; }
    finally { jj_save(1, xla); }
  }

  /** Scans ArgQuote2 followed by CloseQuote2; returns true on mismatch. */
  final private boolean jj_3_2() {
    if (jj_scan_token(ArgQuote2)) return true;
    if (jj_scan_token(CloseQuote2)) return true;
    return false;
  }

  /** Scans ArgQuote1 followed by CloseQuote1; returns true on mismatch. */
  final private boolean jj_3_1() {
    if (jj_scan_token(ArgQuote1)) return true;
    if (jj_scan_token(CloseQuote1)) return true;
    return false;
  }

  public HTMLParserTokenManager token_source;
  SimpleCharStream jj_input_stream;
  public Token token, jj_nt;
  // Kind of the next token, or -1 when it has not been looked up yet.
  private int jj_ntk;
  private Token jj_scanpos, jj_lastpos;
  private int jj_la;
  public boolean lookingAhead = false;
  private int jj_gen;
  // One slot per choice point; holds the jj_gen value at which it last failed.
  final private int[] jj_la1 = new int[14];
  // Bit masks of the token kinds expected at each choice point.
  static private int[] jj_la1_0;
  static {
    jj_la1_0();
  }
  private static void jj_la1_0() {
    jj_la1_0 = new int[] {0x347e,0x347e,0x10000,0x380000,0x20000,0x80000,0x100000,0x200000,0x3b0000,0x3b0000,0x8000000,0x20000000,0x30,0x4000,};
  }
  final private JJCalls[] jj_2_rtns = new JJCalls[2];
  private boolean jj_rescan = false;
  private int jj_gc = 0;

  /** Creates a parser over a byte stream, passing a null encoding. */
  public HTMLParser(java.io.InputStream stream) {
    this(stream, null);
  }

  /**
   * Creates a parser over a byte stream decoded with the given encoding.
   *
   * @throws RuntimeException wrapping the UnsupportedEncodingException if the
   *         encoding is not supported
   */
  public HTMLParser(java.io.InputStream stream, String encoding) {
    try { jj_input_stream = new SimpleCharStream(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
    token_source = new HTMLParserTokenManager(jj_input_stream);
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 14; i++) jj_la1[i] = -1;
    for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
  }

  /** Re-initializes the parser over a new byte stream, passing a null encoding. */
  public void ReInit(java.io.InputStream stream) {
    ReInit(stream, null);
  }

  /**
   * Re-initializes the parser over a new byte stream with the given encoding.
   *
   * @throws RuntimeException wrapping the UnsupportedEncodingException if the
   *         encoding is not supported
   */
  public void ReInit(java.io.InputStream stream, String encoding) {
    try { jj_input_stream.ReInit(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
    token_source.ReInit(jj_input_stream);
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 14; i++) jj_la1[i] = -1;
    for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
  }

  /** Creates a parser over a character stream. */
  public HTMLParser(java.io.Reader stream) {
    jj_input_stream = new SimpleCharStream(stream, 1, 1);
    token_source = new HTMLParserTokenManager(jj_input_stream);
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 14; i++) jj_la1[i] = -1;
    for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
  }

  /** Re-initializes the parser over a new character stream. */
  public void ReInit(java.io.Reader stream) {
    jj_input_stream.ReInit(stream, 1, 1);
    token_source.ReInit(jj_input_stream);
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 14; i++) jj_la1[i] = -1;
    for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
  }

  /** Creates a parser on top of an existing token manager. */
  public HTMLParser(HTMLParserTokenManager tm) {
    token_source = tm;
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 14; i++) jj_la1[i] = -1;
    for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
  }

  /** Re-initializes the parser on top of another token manager. */
  public void ReInit(HTMLParserTokenManager tm) {
    token_source = tm;
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 14; i++) jj_la1[i] = -1;
    for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
  }

  /**
   * Advances to the next token and returns it if it has the expected kind;
   * otherwise restores the previous token and throws a ParseException.
   */
  final private Token jj_consume_token(int kind) throws ParseException {
    Token oldToken;
    if ((oldToken = token).next != null) token = token.next;
    else token = token.next = token_source.getNextToken();
    jj_ntk = -1;
    if (token.kind == kind) {
      jj_gen++;
      // Every 100 consumed tokens, drop token references held by stale
      // lookahead records.
      if (++jj_gc > 100) {
        jj_gc = 0;
        for (int i = 0; i < jj_2_rtns.length; i++) {
          JJCalls c = jj_2_rtns[i];
          while (c != null) {
            if (c.gen < jj_gen) c.first = null;
            c = c.next;
          }
        }
      }
      return token;
    }
    token = oldToken;
    jj_kind = kind;
    throw generateParseException();
  }

  /** Thrown by jj_scan_token to end a lookahead once its token limit is reached. */
  static private final class LookaheadSuccess extends java.lang.Error {
    private static final long serialVersionUID = 1L;
  }
  final private LookaheadSuccess jj_ls = new LookaheadSuccess();

  /**
   * Advances the lookahead position by one token.
   *
   * @return true if the scanned token is NOT of the given kind
   */
  final private boolean jj_scan_token(int kind) {
    if (jj_scanpos == jj_lastpos) {
      jj_la--;
      if (jj_scanpos.next == null) {
        jj_lastpos = jj_scanpos = jj_scanpos.next = token_source.getNextToken();
      } else {
        jj_lastpos = jj_scanpos = jj_scanpos.next;
      }
    } else {
      jj_scanpos = jj_scanpos.next;
    }
    if (jj_rescan) {
      // Rescanning for error reporting: record the expected kind together
      // with its distance from the current token.
      int i = 0; Token tok = token;
      while (tok != null && tok != jj_scanpos) { i++; tok = tok.next; }
      if (tok != null) jj_add_error_token(kind, i);
    }
    if (jj_scanpos.kind != kind) return true;
    if (jj_la == 0 && jj_scanpos == jj_lastpos) throw jj_ls;
    return false;
  }

  /** Advances to the next token, fetching it from the token manager if needed, and returns it. */
  final public Token getNextToken() {
    if (token.next != null) token = token.next;
    else token = token.next = token_source.getNextToken();
    jj_ntk = -1;
    jj_gen++;
    return token;
  }

  /**
   * Returns the token {@code index} positions ahead of the current token (or
   * of the scan position while {@code lookingAhead} is set) without consuming
   * anything.
   */
  final public Token getToken(int index) {
    Token t = lookingAhead ? jj_scanpos : token;
    for (int i = 0; i < index; i++) {
      if (t.next != null) t = t.next;
      else t = t.next = token_source.getNextToken();
    }
    return t;
  }

  /** Looks up, caches in {@code jj_ntk} and returns the kind of the next token. */
  final private int jj_ntk() {
    if ((jj_nt=token.next) == null)
      return (jj_ntk = (token.next=token_source.getNextToken()).kind);
    else
      return (jj_ntk = jj_nt.kind);
  }

  // Expected-token sequences (int[] entries) collected for error reporting.
  private java.util.Vector jj_expentries = new java.util.Vector();
  private int[] jj_expentry;
  private int jj_kind = -1;
  private int[] jj_lasttokens = new int[100];
  private int jj_endpos;

  /**
   * Records that a token of the given kind was expected at lookahead position
   * {@code pos}; completed sequences are added to {@code jj_expentries} if not
   * already present.
   */
  private void jj_add_error_token(int kind, int pos) {
    if (pos >= 100) return;
    if (pos == jj_endpos + 1) {
      jj_lasttokens[jj_endpos++] = kind;
    } else if (jj_endpos != 0) {
      jj_expentry = new int[jj_endpos];
      for (int i = 0; i < jj_endpos; i++) {
        jj_expentry[i] = jj_lasttokens[i];
      }
      boolean exists = false;
      for (java.util.Enumeration e = jj_expentries.elements(); e.hasMoreElements();) {
        int[] oldentry = (int[])(e.nextElement());
        if (oldentry.length == jj_expentry.length) {
          exists = true;
          for (int i = 0; i < jj_expentry.length; i++) {
            if (oldentry[i] != jj_expentry[i]) {
              exists = false;
              break;
            }
          }
          if (exists) break;
        }
      }
      if (!exists) jj_expentries.addElement(jj_expentry);
      if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind;
    }
  }

  /**
   * Builds a ParseException describing the current token and the token
   * sequences that would have been accepted at this point.
   */
  public ParseException generateParseException() {
    jj_expentries.removeAllElements();
    boolean[] la1tokens = new boolean[31];
    for (int i = 0; i < 31; i++) {
      la1tokens[i] = false;
    }
    if (jj_kind >= 0) {
      la1tokens[jj_kind] = true;
      jj_kind = -1;
    }
    for (int i = 0; i < 14; i++) {
      if (jj_la1[i] == jj_gen) {
        for (int j = 0; j < 32; j++) {
          if ((jj_la1_0[i] & (1<<j)) != 0) {
            la1tokens[j] = true;
          }
        }
      }
    }
    for (int i = 0; i < 31; i++) {
      if (la1tokens[i]) {
        jj_expentry = new int[1];
        jj_expentry[0] = i;
        jj_expentries.addElement(jj_expentry);
      }
    }
    jj_endpos = 0;
    jj_rescan_token();
    jj_add_error_token(0, 0);
    int[][] exptokseq = new int[jj_expentries.size()][];
    for (int i = 0; i < jj_expentries.size(); i++) {
      exptokseq[i] = (int[])jj_expentries.elementAt(i);
    }
    return new ParseException(token, exptokseq, tokenImage);
  }

  /** Does nothing in this parser. */
  final public void enable_tracing() {
  }

  /** Does nothing in this parser. */
  final public void disable_tracing() {
  }

  /**
   * Replays the saved lookahead attempts with {@code jj_rescan} set so that
   * jj_scan_token records the expected tokens for error reporting.
   */
  final private void jj_rescan_token() {
    jj_rescan = true;
    for (int i = 0; i < 2; i++) {
      try {
        JJCalls p = jj_2_rtns[i];
        do {
          if (p.gen > jj_gen) {
            jj_la = p.arg; jj_lastpos = jj_scanpos = p.first;
            switch (i) {
              case 0: jj_3_1(); break;
              case 1: jj_3_2(); break;
            }
          }
          p = p.next;
        } while (p != null);
      } catch(LookaheadSuccess ls) { } // only signals that the scan hit its limit
    }
    jj_rescan = false;
  }

  /** Saves the outcome of lookahead routine {@code index} for a later rescan. */
  final private void jj_save(int index, int xla) {
    JJCalls p = jj_2_rtns[index];
    while (p.gen > jj_gen) {
      if (p.next == null) { p = p.next = new JJCalls(); break; }
      p = p.next;
    }
    p.gen = jj_gen + xla - jj_la; p.first = token; p.arg = xla;
  }

  /** Linked-list record of one saved lookahead attempt. */
  static final class JJCalls {
    int gen;
    Token first;
    int arg;
    JJCalls next;
  }

}