package org.htmlparser.beans;

import java.beans.PropertyChangeListener;
import java.beans.PropertyChangeSupport;
import java.io.Serializable;
import java.net.URLConnection;

import org.htmlparser.Parser;
import org.htmlparser.Text;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.Tag;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.Translate;
import org.htmlparser.visitors.NodeVisitor;

/**
 * Extracts the text of a web page into a single string.
 * <p>
 * The bean owns a {@link Parser} and visits every node of the page it is
 * pointed at (via {@link #setURL} or {@link #setConnection}), appending
 * text nodes to an internal buffer. The result is cached and exposed as the
 * bound property {@link #PROP_STRINGS_PROPERTY}. Changing any of the
 * <code>links</code>, <code>replaceNonBreakingSpaces</code> or
 * <code>collapse</code> properties re-runs the extraction if a result had
 * already been computed.
 * <p>
 * Text inside SCRIPT and STYLE elements is skipped; text inside PRE elements
 * is appended untouched (no entity decoding, no whitespace processing).
 */
public class StringBean extends NodeVisitor implements Serializable
{
    /** Property name used when the extracted text changes. */
    public static final String PROP_STRINGS_PROPERTY = "strings";

    /** Property name used when the "include links" setting changes. */
    public static final String PROP_LINKS_PROPERTY = "links";

    /** Property name used when the URL changes. */
    public static final String PROP_URL_PROPERTY = "URL";

    /** Property name used when the non-breaking space setting changes. */
    public static final String PROP_REPLACE_SPACE_PROPERTY = "replaceNonBreakingSpaces";

    /** Property name used when the whitespace collapsing setting changes. */
    public static final String PROP_COLLAPSE_PROPERTY = "collapse";

    /** Property name used when the connection changes. */
    public static final String PROP_CONNECTION_PROPERTY = "connection";

    /** The platform line separator, appended where a tag breaks the flow. */
    private static final String newline = System.getProperty ("line.separator");

    /** Length of {@link #newline}, cached for the "ends with newline" test. */
    private static final int newline_size = newline.length ();

    /** Bound property support; listeners are registered here. */
    protected PropertyChangeSupport mPropertySupport;

    /** The parser used to fetch and walk the page. */
    protected Parser mParser;

    /** The cached extraction result, or <code>null</code> if not yet computed. */
    protected String mStrings;

    /** If <code>true</code>, link targets are emitted in angle brackets. */
    protected boolean mLinks;

    /**
     * If <code>true</code>, non-breaking spaces (U+00A0) are turned into
     * ordinary spaces before the text is appended.
     */
    protected boolean mReplaceSpace;

    /**
     * If <code>true</code>, runs of whitespace are reduced to a single space
     * by {@link #collapse}.
     */
    protected boolean mCollapse;

    /** Accumulates the text while the page is being visited. */
    protected StringBuffer mBuffer;

    /** Set while between a SCRIPT start tag and its end tag. */
    protected boolean mIsScript;

    /** Set while between a PRE start tag and its end tag. */
    protected boolean mIsPre;

    /** Set while between a STYLE start tag and its end tag. */
    protected boolean mIsStyle;

    /**
     * Creates a bean with no URL, links off, non-breaking space replacement
     * on and whitespace collapsing on.
     * <p>
     * The two <code>true</code> arguments are passed straight to the
     * {@link NodeVisitor} constructor (presumably "recurse into children"
     * and "recurse self" -- confirm against NodeVisitor).
     */
    public StringBean ()
    {
        super (true, true);
        mPropertySupport = new PropertyChangeSupport (this);
        mParser = new Parser ();
        mStrings = null;
        mLinks = false;
        mReplaceSpace = true;
        mCollapse = true;
        mBuffer = new StringBuffer (4096);
        mIsScript = false;
        mIsPre = false;
        mIsStyle = false;
    }

    //
    // internals
    //

    /**
     * Tests whether a buffer currently ends with the platform line separator.
     * @param buffer The buffer to inspect.
     * @return <code>true</code> if the last characters of the buffer are
     * exactly the line separator.
     */
    private static boolean endsWithNewline (StringBuffer buffer)
    {
        int length;

        length = buffer.length ();
        return ((newline_size <= length)
            && buffer.substring (length - newline_size, length).equals (newline));
    }

    /**
     * Clears the PRE/SCRIPT/STYLE tracking flags.
     * Called before every parse the bean starts itself, so that an element
     * whose end tag was never visited in one document cannot leave its flag
     * set and distort the extraction of the next document.
     */
    private void resetParseState ()
    {
        mIsPre = false;
        mIsScript = false;
        mIsStyle = false;
    }

    /**
     * Appends a line separator to the buffer, unless the buffer is empty or
     * already ends with one.
     */
    protected void carriage_return ()
    {
        if ((0 != mBuffer.length ()) && !endsWithNewline (mBuffer))
            mBuffer.append (newline);
    }

    /**
     * Appends a string to a buffer, reducing each run of whitespace to one
     * space.
     * <p>
     * Space, tab, form feed, zero width space (U+200B), carriage return and
     * line feed count as whitespace. Whitespace is dropped entirely while
     * the buffer is empty or ends with a space or a line separator, and
     * trailing whitespace of <code>string</code> is never appended.
     * If the buffer is non-empty and does not end with a space or line
     * separator, a single space is inserted before the first non-whitespace
     * character of <code>string</code>, even when <code>string</code> does
     * not itself start with whitespace.
     * @param buffer The buffer to append to.
     * @param string The text to append.
     */
    protected void collapse (StringBuffer buffer, String string)
    {
        int chars;
        int length;
        int state; // 0: swallow whitespace, 1: space pending, 2: inside text
        char character;

        chars = string.length ();
        if (0 != chars)
        {
            length = buffer.length ();
            if ((0 == length)
                || (buffer.charAt (length - 1) == ' ')
                || endsWithNewline (buffer))
                state = 0;
            else
                state = 1;
            for (int i = 0; i < chars; i++)
            {
                character = string.charAt (i);
                switch (character)
                {
                    case '\u0020': // space
                    case '\u0009': // tab
                    case '\u000C': // form feed
                    case '\u200B': // zero width space
                    case '\r':
                    case '\n':
                        // remember a separator is owed, but only once
                        // something has been written since the last one
                        if (0 != state)
                            state = 1;
                        break;
                    default:
                        if (1 == state)
                            buffer.append (' ');
                        state = 2;
                        buffer.append (character);
                }
            }
        }
    }

    /**
     * Visits all nodes of the current parser and returns the collected text.
     * The buffer is replaced by a fresh one after a successful run; if the
     * parser throws, the buffer is left as it was at that point.
     * @return The text gathered in the buffer.
     * @exception ParserException If the parser fails while visiting.
     */
    protected String extractStrings ()
        throws
            ParserException
    {
        String ret;

        resetParseState ();
        mParser.visitAllNodesWith (this);
        ret = mBuffer.toString ();
        mBuffer = new StringBuffer (4096);

        return (ret);
    }

    /**
     * Stores a new extraction result and notifies listeners.
     * Nothing happens if a result is already cached and equals the new one.
     * @param strings The new value of the <code>strings</code> property.
     */
    protected void updateStrings (String strings)
    {
        String oldValue;

        if ((null == mStrings) || !mStrings.equals (strings))
        {
            oldValue = mStrings;
            mStrings = strings;
            mPropertySupport.firePropertyChange (PROP_STRINGS_PROPERTY, oldValue, strings);
        }
    }

    /**
     * Runs the extraction against the current URL and publishes the result.
     * <p>
     * If the parser reports an encoding change, the parser is reset and the
     * page is visited a second time from scratch. Any other parser failure
     * is published as the text of the exception. With no URL set, the cached
     * result and the buffer are simply cleared (no event is fired).
     * The buffer is always replaced by a fresh one afterwards.
     */
    protected void setStrings ()
    {
        if (null != getURL ())
        {
            try
            {
                try
                {
                    // start every document with clean PRE/SCRIPT/STYLE flags
                    resetParseState ();
                    mParser.visitAllNodesWith (this);
                    updateStrings (mBuffer.toString ());
                }
                finally
                {
                    mBuffer = new StringBuffer (4096);
                }
            }
            catch (EncodingChangeException ece)
            {
                // discard everything gathered so far and start over
                resetParseState ();
                try
                {
                    mParser.reset ();
                    mBuffer = new StringBuffer (4096);
                    mParser.visitAllNodesWith (this);
                    updateStrings (mBuffer.toString ());
                }
                catch (ParserException pe)
                {
                    updateStrings (pe.toString ());
                }
                finally
                {
                    mBuffer = new StringBuffer (4096);
                }
            }
            catch (ParserException pe)
            {
                updateStrings (pe.toString ());
            }
        }
        else
        {
            mStrings = null;
            mBuffer = new StringBuffer (4096);
        }
    }

    /**
     * Re-runs the extraction after a setting changed.
     * Only acts if a result had already been computed; the parser is pointed
     * at its current URL again before extracting.
     */
    private void resetStrings ()
    {
        if (null != mStrings)
        {
            try
            {
                mParser.setURL (getURL ());
                setStrings ();
            }
            catch (ParserException pe)
            {
                updateStrings (pe.toString ());
            }
        }
    }

    //
    // Property change support.
    //

    /**
     * Registers a listener for changes of the bound properties.
     * @param listener The listener to add.
     */
    public void addPropertyChangeListener (PropertyChangeListener listener)
    {
        mPropertySupport.addPropertyChangeListener (listener);
    }

    /**
     * Unregisters a previously added listener.
     * @param listener The listener to remove.
     */
    public void removePropertyChangeListener (PropertyChangeListener listener)
    {
        mPropertySupport.removePropertyChangeListener (listener);
    }

    //
    // Properties
    //

    /**
     * Returns the extracted text, computing it first if necessary.
     * If nothing is cached and the buffer is empty, the current URL is
     * parsed; if the buffer already holds text (the bean having been used
     * as a visitor), that text becomes the result.
     * @return The text of the page, or <code>null</code> if there is no
     * URL and nothing has been collected.
     */
    public String getStrings ()
    {
        if (null == mStrings)
        {
            if (0 == mBuffer.length ())
                setStrings ();
            else
                updateStrings (mBuffer.toString ());
        }

        return (mStrings);
    }

    /**
     * Returns whether link targets are included in the output.
     * @return The current <code>links</code> setting.
     */
    public boolean getLinks ()
    {
        return (mLinks);
    }

    /**
     * Sets whether link targets are included in the output.
     * A change fires a property change event and, if a result was already
     * cached, re-runs the extraction.
     * @param links <code>true</code> to emit each link target in angle
     * brackets.
     */
    public void setLinks (boolean links)
    {
        boolean oldValue = mLinks;
        if (oldValue != links)
        {
            mLinks = links;
            mPropertySupport.firePropertyChange (PROP_LINKS_PROPERTY, oldValue, links);
            resetStrings ();
        }
    }

    /**
     * Returns the URL the parser is currently set to.
     * @return The parser's URL, or <code>null</code> if there is no parser.
     */
    public String getURL ()
    {
        return ((null != mParser) ? mParser.getURL () : null);
    }

    /**
     * Points the bean at a new URL and extracts its text.
     * Nothing happens if the URL equals the current one. Otherwise the URL
     * and connection property change events are fired and the extraction is
     * run; a parser failure is published as the text of the exception.
     * @param url The URL of the page to extract text from.
     */
    public void setURL (String url)
    {
        String old;
        URLConnection conn;

        old = getURL ();
        conn = getConnection ();
        if (((null == old) && (null != url)) || ((null != old) && !old.equals (url)))
        {
            try
            {
                if (null == mParser)
                    mParser = new Parser (url);
                else
                    mParser.setURL (url);
                mPropertySupport.firePropertyChange (PROP_URL_PROPERTY, old, getURL ());
                mPropertySupport.firePropertyChange (PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ());
                setStrings ();
            }
            catch (ParserException pe)
            {
                updateStrings (pe.toString ());
            }
        }
    }

    /**
     * Returns whether non-breaking spaces are replaced by ordinary spaces.
     * @return The current <code>replaceNonBreakingSpaces</code> setting.
     */
    public boolean getReplaceNonBreakingSpaces ()
    {
        return (mReplaceSpace);
    }

    /**
     * Sets whether non-breaking spaces are replaced by ordinary spaces.
     * A change fires a property change event and, if a result was already
     * cached, re-runs the extraction.
     * @param replace_space <code>true</code> to replace U+00A0 with a space.
     */
    public void setReplaceNonBreakingSpaces (boolean replace_space)
    {
        boolean oldValue = mReplaceSpace;
        if (oldValue != replace_space)
        {
            mReplaceSpace = replace_space;
            mPropertySupport.firePropertyChange (PROP_REPLACE_SPACE_PROPERTY, oldValue, replace_space);
            resetStrings ();
        }
    }

    /**
     * Returns whether runs of whitespace are collapsed to a single space.
     * @return The current <code>collapse</code> setting.
     */
    public boolean getCollapse ()
    {
        return (mCollapse);
    }

    /**
     * Sets whether runs of whitespace are collapsed to a single space.
     * A change fires a property change event and, if a result was already
     * cached, re-runs the extraction.
     * @param collapse_whitespace <code>true</code> to collapse whitespace.
     */
    public void setCollapse (boolean collapse_whitespace)
    {
        boolean oldValue = mCollapse;
        if (oldValue != collapse_whitespace)
        {
            mCollapse = collapse_whitespace;
            mPropertySupport.firePropertyChange (PROP_COLLAPSE_PROPERTY, oldValue, collapse_whitespace);
            resetStrings ();
        }
    }

    /**
     * Returns the connection the parser is currently using.
     * @return The parser's connection, or <code>null</code> if there is no
     * parser.
     */
    public URLConnection getConnection ()
    {
        return ((null != mParser) ? mParser.getConnection () : null);
    }

    /**
     * Points the bean at a new connection and extracts its text.
     * Nothing happens if the connection equals the current one. Otherwise
     * the URL and connection property change events are fired and the
     * extraction is run; a parser failure is published as the text of the
     * exception.
     * @param connection The connection to read the page from.
     */
    public void setConnection (URLConnection connection)
    {
        String url;
        URLConnection conn;

        url = getURL ();
        conn = getConnection ();
        if (((null == conn) && (null != connection)) || ((null != conn) && !conn.equals (connection)))
        {
            try
            {
                if (null == mParser)
                    mParser = new Parser (connection);
                else
                    mParser.setConnection (connection);
                mPropertySupport.firePropertyChange (PROP_URL_PROPERTY, url, getURL ());
                mPropertySupport.firePropertyChange (PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ());
                setStrings ();
            }
            catch (ParserException pe)
            {
                updateStrings (pe.toString ());
            }
        }
    }

    //
    // NodeVisitor overrides
    //

    /**
     * Appends the text of a text node to the buffer.
     * Text inside SCRIPT or STYLE is ignored. Text inside PRE is appended
     * verbatim. All other text is entity-decoded, then optionally has its
     * non-breaking spaces replaced and its whitespace collapsed, according
     * to the bean's settings.
     * @param string The text node being visited.
     */
    public void visitStringNode (Text string)
    {
        if (!mIsScript && !mIsStyle)
        {
            String text = string.getText ();
            if (!mIsPre)
            {
                text = Translate.decode (text);
                if (getReplaceNonBreakingSpaces ())
                    text = text.replace ('\u00a0', ' ');
                if (getCollapse ())
                    collapse (mBuffer, text);
                else
                    mBuffer.append (text);
            }
            else
                mBuffer.append (text);
        }
    }

    /**
     * Handles a start tag.
     * For a link tag, with links enabled, the link target is appended in
     * angle brackets. PRE, SCRIPT and STYLE set their tracking flag, and a
     * tag that breaks the flow causes a line separator to be appended.
     * @param tag The tag being visited.
     */
    public void visitTag (Tag tag)
    {
        String name;

        if ((tag instanceof LinkTag) && getLinks ())
        {
            mBuffer.append ("<");
            mBuffer.append (((LinkTag)tag).getLink ());
            mBuffer.append (">");
        }
        name = tag.getTagName ();
        if (name.equalsIgnoreCase ("PRE"))
            mIsPre = true;
        else if (name.equalsIgnoreCase ("SCRIPT"))
            mIsScript = true;
        else if (name.equalsIgnoreCase ("STYLE"))
            mIsStyle = true;
        if (tag.breaksFlow ())
            carriage_return ();
    }

    /**
     * Handles an end tag by clearing the PRE, SCRIPT or STYLE tracking flag
     * that matches the tag name.
     * @param tag The end tag being visited.
     */
    public void visitEndTag (Tag tag)
    {
        String name;

        name = tag.getTagName ();
        if (name.equalsIgnoreCase ("PRE"))
            mIsPre = false;
        else if (name.equalsIgnoreCase ("SCRIPT"))
            mIsScript = false;
        else if (name.equalsIgnoreCase ("STYLE"))
            mIsStyle = false;
    }

    /**
     * Command line entry point: prints the text of the page at the URL given
     * as the first argument, or a usage message if there are no arguments.
     * @param args The command line arguments; <code>args[0]</code> is the URL.
     */
    public static void main (String [] args)
    {
        if (0 >= args.length)
            System.out.println ("Usage: java -classpath htmlparser.jar org.htmlparser.beans.StringBean <http://whatever_url>");
        else
        {
            StringBean sb = new StringBean ();
            sb.setLinks (false);
            sb.setReplaceNonBreakingSpaces (true);
            sb.setCollapse (true);
            sb.setURL (args[0]);
            System.out.println (sb.getStrings ());
        }
    }
}