StringBean


1   // HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
2   // http://sourceforge.org/projects/htmlparser
3   // Copyright (C) 2004 Derrick Oswald
4   //
5   // Revision Control Information
6   //
7   // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/StringBean.java,v $
8   // $Author: derrickoswald $
9   // $Date: 2005/03/12 12:52:19 $
10  // $Revision: 1.42 $
11  //
12  // This library is free software; you can redistribute it and/or
13  // modify it under the terms of the GNU Lesser General Public
14  // License as published by the Free Software Foundation; either
15  // version 2.1 of the License, or (at your option) any later version.
16  //
17  // This library is distributed in the hope that it will be useful,
18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  // Lesser General Public License for more details.
21  //
22  // You should have received a copy of the GNU Lesser General Public
23  // License along with this library; if not, write to the Free Software
24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  //
26  
27  package org.htmlparser.beans;
28  
29  import java.beans.PropertyChangeListener  ;
30  import java.beans.PropertyChangeSupport  ;
31  import java.io.Serializable  ;
32  import java.net.URLConnection  ;
33  
34  import org.htmlparser.Parser;
35  import org.htmlparser.Text;
36  import org.htmlparser.tags.LinkTag;
37  import org.htmlparser.Tag;
38  import org.htmlparser.util.ParserException;
39  import org.htmlparser.util.EncodingChangeException;
40  import org.htmlparser.util.Translate;
41  import org.htmlparser.visitors.NodeVisitor;
42  
43  /**
44   * Extract strings from a URL.
45   * <p>Text within &lt;SCRIPT&gt;&lt;/SCRIPT&gt; tags is removed.</p>
46   * <p>The text within &lt;PRE&gt;&lt;/PRE&gt; tags is not altered.</p>
47   * <p>The property <code>Strings</code>, which is the output property is null
48   * until a URL is set. So a typical usage is:</p>
49   * <pre>
50   *     StringBean sb = new StringBean ();
51   *     sb.setLinks (false);
52   *     sb.setReplaceNonBreakingSpaces (true);
53   *     sb.setCollapse (true);
54   *     sb.setURL ("http://www.netbeans.org"); // the HTTP is performed here
55   *     String s = sb.getStrings ();
56   * </pre>
57   * You can also use the StringBean as a NodeVisitor on your own parser,
58   * in which case you have to refetch your page if you change one of the
59   * properties because it resets the Strings property:</p>
60   * <pre>
61   *     StringBean sb = new StringBean ();
62   *     Parser parser = new Parser ("http://cbc.ca");
63   *     parser.visitAllNodesWith (sb);
64   *     String s = sb.getStrings ();
65   *     sb.setLinks (true);
66   *     parser.reset ();
67   *     parser.visitAllNodesWith (sb);
68   *     String sl = sb.getStrings ();
69   * </pre>
70   * According to Nick Burch, who contributed the patch, this is handy if you
71   * don't want StringBean to wander off and get the content itself, either
72   * because you already have it, it's not on a website etc.
73   */
74  public class StringBean extends NodeVisitor implements Serializable  
75  {
76      /**
77       * Property name in event where the URL contents changes.
78       */
79      public static final String   PROP_STRINGS_PROPERTY = "strings";
80  
81      /**
82       * Property name in event where the 'embed links' state changes.
83       */
84      public static final String   PROP_LINKS_PROPERTY = "links";
85  
86      /**
87       * Property name in event where the URL changes.
88       */
89      public static final String   PROP_URL_PROPERTY = "URL";
90  
91      /**
92       * Property name in event where the 'replace non-breaking spaces' state changes.
93       */
94      public static final String   PROP_REPLACE_SPACE_PROPERTY = "replaceNonBreakingSpaces";
95  
96      /**
97       * Property name in event where the 'collapse whitespace' state changes.
98       */
99      public static final String   PROP_COLLAPSE_PROPERTY = "collapse";
100 
101     /**
102      * Property name in event where the connection changes.
103      */
104     public static final String   PROP_CONNECTION_PROPERTY = "connection";
105 
106     /**
107      * A newline.
108      */
109     private static final String   newline = System.getProperty ("line.separator");
110 
111     /**
112      * The length of the newline.
113      */
114     private static final int newline_size = newline.length ();
115 
116     /**
117      * Bound property support.
118      */
119     protected PropertyChangeSupport   mPropertySupport;
120 
121     /**
122      * The parser used to extract strings.
123      */
124     protected Parser mParser;
125 
126     /**
127      * The strings extracted from the URL.
128      */
129     protected String   mStrings;
130 
131     /**
132      * If <code>true</code> the link URLs are embedded in the text output.
133      */
134     protected boolean mLinks;
135 
136     /**
137      * If <code>true</code> regular space characters are substituted for
138      * non-breaking spaces in the text output.
139      */
140     protected boolean mReplaceSpace;
141 
142     /**
143      * If <code>true</code> sequences of whitespace characters are replaced with a
144      * single space character.
145      */
146     protected boolean mCollapse;
147 
148     /**
149      * The buffer text is stored in while traversing the HTML.
150      */
151     protected StringBuffer   mBuffer;
152 
153     /**
154      * Set <code>true</code> when traversing a SCRIPT tag.
155      */
156     protected boolean mIsScript;
157 
158     /**
159      * Set <code>true</code> when traversing a PRE tag.
160      */
161     protected boolean mIsPre;
162 
163     /**
164      * Set <code>true</code> when traversing a STYLE tag.
165      */
166     protected boolean mIsStyle;
167 
168    /**
169      * Create a StringBean object.
170      * Default property values are set to 'do the right thing':
171      * <p><code>Links</code> is set <code>false</code> so text appears like a
172      * browser would display it, albeit without the colour or underline clues
173      * normally associated with a link.</p>
174      * <p><code>ReplaceNonBreakingSpaces</code> is set <code>true</code>, so
175      * that printing the text works, but the extra information regarding these
176      * formatting marks is available if you set it false.</p>
177      * <p><code>Collapse</code> is set <code>true</code>, so text appears
178      * compact like a browser would display it.</p>
179      */
180     public StringBean ()
181     {
182         super (true, true);
183         mPropertySupport = new PropertyChangeSupport   (this);
184         mParser = new Parser ();
185         mStrings = null;
186         mLinks = false;
187         mReplaceSpace = true;
188         mCollapse = true;
189         mBuffer = new StringBuffer   (4096);
190         mIsScript = false;
191         mIsPre = false;
192         mIsStyle = false;
193     }
194 
195     //
196     // internals
197     //
198 
199     /**
200      * Appends a newline to the buffer if there isn't one there already.
201      * Except if the buffer is empty.
202      */
203     protected void carriage_return ()
204     {
205         int length;
206 
207         length = mBuffer.length ();
208         if (   (0 != length) // why bother appending newlines to the beginning of a buffer
209         && (   (newline_size <= length) // not enough chars to hold a newline
210         && (!mBuffer.substring (length - newline_size, length).equals (newline))))
211             mBuffer.append (newline);
212     }
213 
214     /**
215      * Add the given text collapsing whitespace.
216      * Use a little finite state machine:
217      * <pre>
218      * state 0: whitepace was last emitted character
219      * state 1: in whitespace
220      * state 2: in word
221      * A whitespace character moves us to state 1 and any other character
222      * moves us to state 2, except that state 0 stays in state 0 until
223      * a non-whitespace and going from whitespace to word we emit a space
224      * before the character:
225      *    input:     whitespace   other-character
226      * state\next
227      *    0               0             2
228      *    1               1        space then 2
229      *    2               1             2
230      * </pre>
231      * @param buffer The buffer to append to.
232      * @param string The string to append.
233      */
234     protected void collapse (StringBuffer   buffer, String   string)
235     {
236         int chars;
237         int length;
238         int state;
239         char character;
240 
241         chars = string.length ();
242         if (0 != chars)
243         {
244             length = buffer.length ();
245             state = (   (0 == length)
246             || (buffer.charAt (length - 1) == ' ')
247             || ((newline_size <= length) && buffer.substring (length - newline_size, length).equals (newline))) ? 0 : 1;
248             for (int i = 0; i < chars; i++)
249             {
250                 character = string.charAt (i);
251                 switch (character)
252                 {
253                     // see HTML specification section 9.1 White space
254                     // http://www.w3.org/TR/html4/struct/text.html#h-9.1
255                     case '\u0020':
256                     case '\u0009':
257                     case '\u000C':
258                     case '\u200B':
259                     case '\r':
260                     case '\n':
261                         if (0 != state)
262                             state = 1;
263                         break;
264                     default:
265                         if (1 == state)
266                             buffer.append (' ');
267                         state = 2;
268                         buffer.append (character);
269                 }
270             }
271         }
272     }
273 
274     /**
275      * Extract the text from a page.
276      * @return The textual contents of the page.
277      */
278     protected String   extractStrings ()
279         throws
280             ParserException
281     {
282         String   ret;
283 
284         mParser.visitAllNodesWith (this);
285         ret = mBuffer.toString ();
286         mBuffer = new StringBuffer  (4096);
287 
288         return (ret);
289     }
290 
291     /**
292      * Assign the <code>Strings</code> property, firing the property change.
293      * @param strings The new value of the <code>Strings</code> property.
294      */
295     protected void updateStrings (String   strings)
296     {
297         String   oldValue;
298 
299         if ((null == mStrings) || !mStrings.equals (strings))
300         {
301             oldValue = mStrings;
302             mStrings = strings;
303             mPropertySupport.firePropertyChange (PROP_STRINGS_PROPERTY, oldValue, strings);
304         }
305     }
306 
307     /**
308      * Fetch the URL contents.
309      * Only do work if there is a valid parser with it's URL set.
310      */
311     protected void setStrings ()
312     {
313         if (null != getURL ())
314             try
315             {
316                 try
317                 {
318                     mParser.visitAllNodesWith (this);
319                     updateStrings (mBuffer.toString ());
320                 }
321                 finally
322                 {
323                     mBuffer = new StringBuffer   (4096);
324                 }
325             }
326             catch (EncodingChangeException ece)
327             {
328                 mIsPre = false;
329                 mIsScript = false;
330                 mIsStyle = false;
331                 try
332                 {   // try again with the encoding now in force
333                     mParser.reset ();
334                     mBuffer = new StringBuffer   (4096);
335                     mParser.visitAllNodesWith (this);
336                     updateStrings (mBuffer.toString ());
337                 }
338                 catch (ParserException pe)
339                 {
340                     updateStrings (pe.toString ());
341                 }
342                 finally
343                 {
344                     mBuffer = new StringBuffer   (4096);
345                 }
346              }
347             catch (ParserException pe)
348             {
349                 updateStrings (pe.toString ());
350             }
351         else
352         {
353             // reset in case this StringBean is used as a visitor
354             // on another parser, not it's own
355             mStrings = null;
356             mBuffer = new StringBuffer   (4096);
357         }
358     }
359 
360     /**
361      * Refetch the URL contents.
362      * Only need to worry if there is already a valid parser and it's
363      * been spent fetching the string contents.
364      */
365     private void resetStrings ()
366     {
367         if (null != mStrings)
368             try
369             {
370                 mParser.setURL (getURL ());
371                 setStrings ();
372             }
373             catch (ParserException pe)
374             {
375                 updateStrings (pe.toString ());
376             }
377     }
378 
379     //
380     // Property change support.
381     //
382 
383     /**
384      * Add a PropertyChangeListener to the listener list.
385      * The listener is registered for all properties.
386      * @param listener The PropertyChangeListener to be added.
387      */
388     public void addPropertyChangeListener (PropertyChangeListener   listener)
389     {
390         mPropertySupport.addPropertyChangeListener (listener);
391     }
392 
393     /**
394      * Remove a PropertyChangeListener from the listener list.
395      * This removes a PropertyChangeListener that was registered for all properties.
396      * @param listener The PropertyChangeListener to be removed.
397      */
398     public void removePropertyChangeListener (PropertyChangeListener   listener)
399     {
400         mPropertySupport.removePropertyChangeListener (listener);
401     }
402 
403     //
404     // Properties
405     //
406 
407     /**
408      * Return the textual contents of the URL.
409      * This is the primary output of the bean.
410      * @return The user visible (what would be seen in a browser) text from the URL.
411      */
412     public String   getStrings ()
413     {
414         if (null == mStrings)
415         if (0 == mBuffer.length ())
416             setStrings ();
417         else
418             updateStrings (mBuffer.toString ());
419 
420         return (mStrings);
421     }
422 
423     /**
424      * Get the current 'include links' state.
425      * @return <code>true</code> if link text is included in the text extracted
426      * from the URL, <code>false</code> otherwise.
427      */
428     public boolean getLinks ()
429     {
430         return (mLinks);
431     }
432 
433     /**
434      * Set the 'include links' state.
435      * If the setting is changed after the URL has been set, the text from the
436      * URL will be reacquired, which is possibly expensive.
437      * @param links Use <code>true</code> if link text is to be included in the
438      * text extracted from the URL, <code>false</code> otherwise.
439      */
440     public void setLinks (boolean links)
441     {
442         boolean oldValue = mLinks;
443         if (oldValue != links)
444         {
445             mLinks = links;
446             mPropertySupport.firePropertyChange (PROP_LINKS_PROPERTY, oldValue, links);
447             resetStrings ();
448         }
449     }
450 
451     /**
452      * Get the current URL.
453      * @return The URL from which text has been extracted, or <code>null</code>
454      * if this property has not been set yet.
455      */
456     public String   getURL ()
457     {
458          return ((null != mParser) ? mParser.getURL () : null);
459     }
460 
461     /**
462      * Set the URL to extract strings from.
463      * The text from the URL will be fetched, which may be expensive, so this
464      * property should be set last.
465      * @param url The URL that text should be fetched from.
466      */
467     public void setURL (String   url)
468     {
469         String   old;
470         URLConnection   conn;
471 
472         old = getURL ();
473         conn = getConnection ();
474         if (((null == old) && (null != url)) || ((null != old) && !old.equals (url)))
475         {
476             try
477             {
478                 if (null == mParser)
479                     mParser = new Parser (url);
480                 else
481                     mParser.setURL (url);
482                 mPropertySupport.firePropertyChange (PROP_URL_PROPERTY, old, getURL ());
483                 mPropertySupport.firePropertyChange (PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ());
484                 setStrings ();
485             }
486             catch (ParserException pe)
487             {
488                 updateStrings (pe.toString ());
489             }
490         }
491     }
492 
493     /**
494      * Get the current 'replace non breaking spaces' state.
495      * @return <code>true</code> if non-breaking spaces (character '&#92;u00a0',
496      * numeric character reference &amp;#160; or character entity reference &amp;nbsp;)
497      * are to be replaced with normal spaces (character '&#92;u0020').
498      */
499     public boolean getReplaceNonBreakingSpaces ()
500     {
501         return (mReplaceSpace);
502     }
503 
504     /**
505      * Set the 'replace non breaking spaces' state.
506      * If the setting is changed after the URL has been set, the text from the
507      * URL will be reacquired, which is possibly expensive.
508      * @param replace_space <code>true</code> if non-breaking spaces (character '&#92;u00a0',
509      * numeric character reference &amp;#160; or character entity reference &amp;nbsp;)
510      * are to be replaced with normal spaces (character '&#92;u0020').
511      */
512     public void setReplaceNonBreakingSpaces (boolean replace_space)
513     {
514         boolean oldValue = mReplaceSpace;
515         if (oldValue != replace_space)
516         {
517             mReplaceSpace = replace_space;
518             mPropertySupport.firePropertyChange (PROP_REPLACE_SPACE_PROPERTY, oldValue, replace_space);
519             resetStrings ();
520         }
521     }
522 
523     /**
524      * Get the current 'collapse whitespace' state.
525      * If set to <code>true</code> this emulates the operation of browsers
526      * in interpretting text where <quote>user agents should collapse input white
527      * space sequences when producing output inter-word space</quote>.
528      * See HTML specification section 9.1 White space
529      * <a HREF="http://www.w3.org/TR/html4/struct/text.html#h-9.1">
530      * http://www.w3.org/TR/html4/struct/text.html#h-9.1</a>.
531      * @return <code>true</code> if sequences of whitespace (space '&#92;u0020',
532      * tab '&#92;u0009', form feed '&#92;u000C', zero-width space '&#92;u200B',
533      * carriage-return '\r' and newline '\n') are to be replaced with a single
534      * space.
535      */
536     public boolean getCollapse ()
537     {
538         return (mCollapse);
539     }
540 
541     /**
542      * Set the current 'collapse whitespace' state.
543      * If the setting is changed after the URL has been set, the text from the
544      * URL will be reacquired, which is possibly expensive.
545      * @param collapse_whitespace If <code>true</code>, sequences of whitespace
546      * will be reduced to a single space.
547      */
548     public void setCollapse (boolean collapse_whitespace)
549     {
550         boolean oldValue = mCollapse;
551         if (oldValue != collapse_whitespace)
552         {
553             mCollapse = collapse_whitespace;
554             mPropertySupport.firePropertyChange (PROP_COLLAPSE_PROPERTY, oldValue, collapse_whitespace);
555             resetStrings ();
556         }
557     }
558 
559     /**
560      * Get the current connection.
561      * @return The connection that the parser has or <code>null</code> if it
562      * hasn't been set or the parser hasn't been constructed yet.
563      */
564     public URLConnection   getConnection ()
565     {
566         return ((null != mParser) ? mParser.getConnection () : null);
567     }
568 
569     /**
570      * Set the parser's connection.
571      * The text from the URL will be fetched, which may be expensive, so this
572      * property should be set last.
573      * @param connection New value of property Connection.
574      */
575     public void setConnection (URLConnection   connection)
576     {
577         String   url;
578         URLConnection   conn;
579 
580         url = getURL ();
581         conn = getConnection ();
582         if (((null == conn) && (null != connection)) || ((null != conn) && !conn.equals (connection)))
583         {
584             try
585             {
586                 if (null == mParser)
587                     mParser = new Parser (connection);
588                 else
589                     mParser.setConnection (connection);
590                 mPropertySupport.firePropertyChange (PROP_URL_PROPERTY, url, getURL ());
591                 mPropertySupport.firePropertyChange (PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ());
592                 setStrings ();
593             }
594             catch (ParserException pe)
595             {
596                 updateStrings (pe.toString ());
597             }
598         }
599     }
600 
601     //
602     // NodeVisitor overrides
603     //
604 
605     /**
606      * Appends the text to the output.
607      * @param string The text node.
608      */
609     public void visitStringNode (Text string)
610     {
611         if (!mIsScript && !mIsStyle)
612         {
613             String   text = string.getText ();
614             if (!mIsPre)
615             {
616                 text = Translate.decode (text);
617                 if (getReplaceNonBreakingSpaces ())
618                     text = text.replace ('\u00a0',' ');
619                 if (getCollapse ())
620                     collapse (mBuffer, text);
621                 else
622                     mBuffer.append (text);
623             }
624             else
625                 mBuffer.append (text);
626         }
627     }
628 
629     /**
630      * Appends a newline to the output if the tag breaks flow, and
631      * possibly sets the state of the PRE and SCRIPT flags.
632      */
633     public void visitTag (Tag tag)
634     {
635         String   name;
636 
637         if (tag instanceof LinkTag)
638             if (getLinks ())
639             { // appends the link as text between angle brackets to the output.
640                 mBuffer.append ("<");
641                 mBuffer.append (((LinkTag)tag).getLink ());
642                 mBuffer.append (">");
643             }
644         name = tag.getTagName ();
645         if (name.equalsIgnoreCase ("PRE"))
646             mIsPre = true;
647         else if (name.equalsIgnoreCase ("SCRIPT"))
648             mIsScript = true;
649         else if (name.equalsIgnoreCase ("STYLE"))
650             mIsStyle = true;
651         if (tag.breaksFlow ())
652             carriage_return ();
653     }
654 
655     /**
656      * Resets the state of the PRE and SCRIPT flags.
657      * @param tag The end tag to process.
658      */
659     public void visitEndTag (Tag tag)
660     {
661         String   name;
662 
663         name = tag.getTagName ();
664         if (name.equalsIgnoreCase ("PRE"))
665             mIsPre = false;
666         else if (name.equalsIgnoreCase ("SCRIPT"))
667             mIsScript = false;
668         else if (name.equalsIgnoreCase ("STYLE"))
669             mIsStyle = false;
670     }
671 
672     /**
673      * Unit test.
674      * @param args Pass arg[0] as the URL to process.
675      */
676     public static void main (String  [] args)
677     {
678         if (0 >= args.length)
679             System.out.println ("Usage: java -classpath htmlparser.jar org.htmlparser.beans.StringBean <http://whatever_url>");
680         else
681         {
682             StringBean sb = new StringBean ();
683             sb.setLinks (false);
684             sb.setReplaceNonBreakingSpaces (true);
685             sb.setCollapse (true);
686             sb.setURL (args[0]);
687             System.out.println (sb.getStrings ());
688         }
689     }
690 }
691
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags