KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > beans > StringBean


1 // HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
2
// http://sourceforge.org/projects/htmlparser
3
// Copyright (C) 2004 Derrick Oswald
4
//
5
// Revision Control Information
6
//
7
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/StringBean.java,v $
8
// $Author: derrickoswald $
9
// $Date: 2005/03/12 12:52:19 $
10
// $Revision: 1.42 $
11
//
12
// This library is free software; you can redistribute it and/or
13
// modify it under the terms of the GNU Lesser General Public
14
// License as published by the Free Software Foundation; either
15
// version 2.1 of the License, or (at your option) any later version.
16
//
17
// This library is distributed in the hope that it will be useful,
18
// but WITHOUT ANY WARRANTY; without even the implied warranty of
19
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
// Lesser General Public License for more details.
21
//
22
// You should have received a copy of the GNU Lesser General Public
23
// License along with this library; if not, write to the Free Software
24
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25
//
26

27 package org.htmlparser.beans;
28
29 import java.beans.PropertyChangeListener JavaDoc;
30 import java.beans.PropertyChangeSupport JavaDoc;
31 import java.io.Serializable JavaDoc;
32 import java.net.URLConnection JavaDoc;
33
34 import org.htmlparser.Parser;
35 import org.htmlparser.Text;
36 import org.htmlparser.tags.LinkTag;
37 import org.htmlparser.Tag;
38 import org.htmlparser.util.ParserException;
39 import org.htmlparser.util.EncodingChangeException;
40 import org.htmlparser.util.Translate;
41 import org.htmlparser.visitors.NodeVisitor;
42
/**
 * Extract strings from a URL.
 * <p>Text within &lt;SCRIPT&gt;&lt;/SCRIPT&gt; and
 * &lt;STYLE&gt;&lt;/STYLE&gt; tags is removed.</p>
 * <p>The text within &lt;PRE&gt;&lt;/PRE&gt; tags is not altered.</p>
 * <p>The property <code>Strings</code>, which is the output property, is
 * <code>null</code> until a URL is set. So a typical usage is:</p>
 * <pre>
 * StringBean sb = new StringBean ();
 * sb.setLinks (false);
 * sb.setReplaceNonBreakingSpaces (true);
 * sb.setCollapse (true);
 * sb.setURL ("http://www.netbeans.org"); // the HTTP is performed here
 * String s = sb.getStrings ();
 * </pre>
 * <p>You can also use the StringBean as a NodeVisitor on your own parser,
 * in which case you have to refetch your page if you change one of the
 * properties because it resets the Strings property:</p>
 * <pre>
 * StringBean sb = new StringBean ();
 * Parser parser = new Parser ("http://cbc.ca");
 * parser.visitAllNodesWith (sb);
 * String s = sb.getStrings ();
 * sb.setLinks (true);
 * parser.reset ();
 * parser.visitAllNodesWith (sb);
 * String sl = sb.getStrings ();
 * </pre>
 * <p>According to Nick Burch, who contributed the patch, this is handy if you
 * don't want StringBean to wander off and get the content itself, either
 * because you already have it, or because it's not on a website, etc.</p>
 */

74 public class StringBean extends NodeVisitor implements Serializable JavaDoc
75 {
76     /**
77      * Property name in event where the URL contents changes.
78      */

79     public static final String JavaDoc PROP_STRINGS_PROPERTY = "strings";
80
81     /**
82      * Property name in event where the 'embed links' state changes.
83      */

84     public static final String JavaDoc PROP_LINKS_PROPERTY = "links";
85
86     /**
87      * Property name in event where the URL changes.
88      */

89     public static final String JavaDoc PROP_URL_PROPERTY = "URL";
90
91     /**
92      * Property name in event where the 'replace non-breaking spaces' state changes.
93      */

94     public static final String JavaDoc PROP_REPLACE_SPACE_PROPERTY = "replaceNonBreakingSpaces";
95
96     /**
97      * Property name in event where the 'collapse whitespace' state changes.
98      */

99     public static final String JavaDoc PROP_COLLAPSE_PROPERTY = "collapse";
100
101     /**
102      * Property name in event where the connection changes.
103      */

104     public static final String JavaDoc PROP_CONNECTION_PROPERTY = "connection";
105
106     /**
107      * A newline.
108      */

109     private static final String JavaDoc newline = System.getProperty ("line.separator");
110
111     /**
112      * The length of the newline.
113      */

114     private static final int newline_size = newline.length ();
115
116     /**
117      * Bound property support.
118      */

119     protected PropertyChangeSupport JavaDoc mPropertySupport;
120
121     /**
122      * The parser used to extract strings.
123      */

124     protected Parser mParser;
125
126     /**
127      * The strings extracted from the URL.
128      */

129     protected String JavaDoc mStrings;
130
131     /**
132      * If <code>true</code> the link URLs are embedded in the text output.
133      */

134     protected boolean mLinks;
135
136     /**
137      * If <code>true</code> regular space characters are substituted for
138      * non-breaking spaces in the text output.
139      */

140     protected boolean mReplaceSpace;
141
142     /**
143      * If <code>true</code> sequences of whitespace characters are replaced with a
144      * single space character.
145      */

146     protected boolean mCollapse;
147
148     /**
149      * The buffer text is stored in while traversing the HTML.
150      */

151     protected StringBuffer JavaDoc mBuffer;
152
153     /**
154      * Set <code>true</code> when traversing a SCRIPT tag.
155      */

156     protected boolean mIsScript;
157
158     /**
159      * Set <code>true</code> when traversing a PRE tag.
160      */

161     protected boolean mIsPre;
162
163     /**
164      * Set <code>true</code> when traversing a STYLE tag.
165      */

166     protected boolean mIsStyle;
167
168    /**
169      * Create a StringBean object.
170      * Default property values are set to 'do the right thing':
171      * <p><code>Links</code> is set <code>false</code> so text appears like a
172      * browser would display it, albeit without the colour or underline clues
173      * normally associated with a link.</p>
174      * <p><code>ReplaceNonBreakingSpaces</code> is set <code>true</code>, so
175      * that printing the text works, but the extra information regarding these
176      * formatting marks is available if you set it false.</p>
177      * <p><code>Collapse</code> is set <code>true</code>, so text appears
178      * compact like a browser would display it.</p>
179      */

180     public StringBean ()
181     {
182         super (true, true);
183         mPropertySupport = new PropertyChangeSupport JavaDoc (this);
184         mParser = new Parser ();
185         mStrings = null;
186         mLinks = false;
187         mReplaceSpace = true;
188         mCollapse = true;
189         mBuffer = new StringBuffer JavaDoc (4096);
190         mIsScript = false;
191         mIsPre = false;
192         mIsStyle = false;
193     }
194
    //
    // internals
    //

199     /**
200      * Appends a newline to the buffer if there isn't one there already.
201      * Except if the buffer is empty.
202      */

203     protected void carriage_return ()
204     {
205         int length;
206
207         length = mBuffer.length ();
208         if ( (0 != length) // why bother appending newlines to the beginning of a buffer
209
&& ( (newline_size <= length) // not enough chars to hold a newline
210
&& (!mBuffer.substring (length - newline_size, length).equals (newline))))
211             mBuffer.append (newline);
212     }
213
214     /**
215      * Add the given text collapsing whitespace.
216      * Use a little finite state machine:
217      * <pre>
218      * state 0: whitepace was last emitted character
219      * state 1: in whitespace
220      * state 2: in word
221      * A whitespace character moves us to state 1 and any other character
222      * moves us to state 2, except that state 0 stays in state 0 until
223      * a non-whitespace and going from whitespace to word we emit a space
224      * before the character:
225      * input: whitespace other-character
226      * state\next
227      * 0 0 2
228      * 1 1 space then 2
229      * 2 1 2
230      * </pre>
231      * @param buffer The buffer to append to.
232      * @param string The string to append.
233      */

234     protected void collapse (StringBuffer JavaDoc buffer, String JavaDoc string)
235     {
236         int chars;
237         int length;
238         int state;
239         char character;
240
241         chars = string.length ();
242         if (0 != chars)
243         {
244             length = buffer.length ();
245             state = ( (0 == length)
246             || (buffer.charAt (length - 1) == ' ')
247             || ((newline_size <= length) && buffer.substring (length - newline_size, length).equals (newline))) ? 0 : 1;
248             for (int i = 0; i < chars; i++)
249             {
250                 character = string.charAt (i);
251                 switch (character)
252                 {
253                     // see HTML specification section 9.1 White space
254
// http://www.w3.org/TR/html4/struct/text.html#h-9.1
255
case '\u0020':
256                     case '\u0009':
257                     case '\u000C':
258                     case '\u200B':
259                     case '\r':
260                     case '\n':
261                         if (0 != state)
262                             state = 1;
263                         break;
264                     default:
265                         if (1 == state)
266                             buffer.append (' ');
267                         state = 2;
268                         buffer.append (character);
269                 }
270             }
271         }
272     }
273
274     /**
275      * Extract the text from a page.
276      * @return The textual contents of the page.
277      */

278     protected String JavaDoc extractStrings ()
279         throws
280             ParserException
281     {
282         String JavaDoc ret;
283
284         mParser.visitAllNodesWith (this);
285         ret = mBuffer.toString ();
286         mBuffer = new StringBuffer JavaDoc(4096);
287
288         return (ret);
289     }
290
291     /**
292      * Assign the <code>Strings</code> property, firing the property change.
293      * @param strings The new value of the <code>Strings</code> property.
294      */

295     protected void updateStrings (String JavaDoc strings)
296     {
297         String JavaDoc oldValue;
298
299         if ((null == mStrings) || !mStrings.equals (strings))
300         {
301             oldValue = mStrings;
302             mStrings = strings;
303             mPropertySupport.firePropertyChange (PROP_STRINGS_PROPERTY, oldValue, strings);
304         }
305     }
306
307     /**
308      * Fetch the URL contents.
309      * Only do work if there is a valid parser with it's URL set.
310      */

311     protected void setStrings ()
312     {
313         if (null != getURL ())
314             try
315             {
316                 try
317                 {
318                     mParser.visitAllNodesWith (this);
319                     updateStrings (mBuffer.toString ());
320                 }
321                 finally
322                 {
323                     mBuffer = new StringBuffer JavaDoc (4096);
324                 }
325             }
326             catch (EncodingChangeException ece)
327             {
328                 mIsPre = false;
329                 mIsScript = false;
330                 mIsStyle = false;
331                 try
332                 { // try again with the encoding now in force
333
mParser.reset ();
334                     mBuffer = new StringBuffer JavaDoc (4096);
335                     mParser.visitAllNodesWith (this);
336                     updateStrings (mBuffer.toString ());
337                 }
338                 catch (ParserException pe)
339                 {
340                     updateStrings (pe.toString ());
341                 }
342                 finally
343                 {
344                     mBuffer = new StringBuffer JavaDoc (4096);
345                 }
346              }
347             catch (ParserException pe)
348             {
349                 updateStrings (pe.toString ());
350             }
351         else
352         {
353             // reset in case this StringBean is used as a visitor
354
// on another parser, not it's own
355
mStrings = null;
356             mBuffer = new StringBuffer JavaDoc (4096);
357         }
358     }
359
360     /**
361      * Refetch the URL contents.
362      * Only need to worry if there is already a valid parser and it's
363      * been spent fetching the string contents.
364      */

365     private void resetStrings ()
366     {
367         if (null != mStrings)
368             try
369             {
370                 mParser.setURL (getURL ());
371                 setStrings ();
372             }
373             catch (ParserException pe)
374             {
375                 updateStrings (pe.toString ());
376             }
377     }
378
    //
    // Property change support.
    //

383     /**
384      * Add a PropertyChangeListener to the listener list.
385      * The listener is registered for all properties.
386      * @param listener The PropertyChangeListener to be added.
387      */

388     public void addPropertyChangeListener (PropertyChangeListener JavaDoc listener)
389     {
390         mPropertySupport.addPropertyChangeListener (listener);
391     }
392
393     /**
394      * Remove a PropertyChangeListener from the listener list.
395      * This removes a PropertyChangeListener that was registered for all properties.
396      * @param listener The PropertyChangeListener to be removed.
397      */

398     public void removePropertyChangeListener (PropertyChangeListener JavaDoc listener)
399     {
400         mPropertySupport.removePropertyChangeListener (listener);
401     }
402
    //
    // Properties
    //

407     /**
408      * Return the textual contents of the URL.
409      * This is the primary output of the bean.
410      * @return The user visible (what would be seen in a browser) text from the URL.
411      */

412     public String JavaDoc getStrings ()
413     {
414         if (null == mStrings)
415         if (0 == mBuffer.length ())
416             setStrings ();
417         else
418             updateStrings (mBuffer.toString ());
419
420         return (mStrings);
421     }
422
423     /**
424      * Get the current 'include links' state.
425      * @return <code>true</code> if link text is included in the text extracted
426      * from the URL, <code>false</code> otherwise.
427      */

428     public boolean getLinks ()
429     {
430         return (mLinks);
431     }
432
433     /**
434      * Set the 'include links' state.
435      * If the setting is changed after the URL has been set, the text from the
436      * URL will be reacquired, which is possibly expensive.
437      * @param links Use <code>true</code> if link text is to be included in the
438      * text extracted from the URL, <code>false</code> otherwise.
439      */

440     public void setLinks (boolean links)
441     {
442         boolean oldValue = mLinks;
443         if (oldValue != links)
444         {
445             mLinks = links;
446             mPropertySupport.firePropertyChange (PROP_LINKS_PROPERTY, oldValue, links);
447             resetStrings ();
448         }
449     }
450
451     /**
452      * Get the current URL.
453      * @return The URL from which text has been extracted, or <code>null</code>
454      * if this property has not been set yet.
455      */

456     public String JavaDoc getURL ()
457     {
458          return ((null != mParser) ? mParser.getURL () : null);
459     }
460
461     /**
462      * Set the URL to extract strings from.
463      * The text from the URL will be fetched, which may be expensive, so this
464      * property should be set last.
465      * @param url The URL that text should be fetched from.
466      */

467     public void setURL (String JavaDoc url)
468     {
469         String JavaDoc old;
470         URLConnection JavaDoc conn;
471
472         old = getURL ();
473         conn = getConnection ();
474         if (((null == old) && (null != url)) || ((null != old) && !old.equals (url)))
475         {
476             try
477             {
478                 if (null == mParser)
479                     mParser = new Parser (url);
480                 else
481                     mParser.setURL (url);
482                 mPropertySupport.firePropertyChange (PROP_URL_PROPERTY, old, getURL ());
483                 mPropertySupport.firePropertyChange (PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ());
484                 setStrings ();
485             }
486             catch (ParserException pe)
487             {
488                 updateStrings (pe.toString ());
489             }
490         }
491     }
492
493     /**
494      * Get the current 'replace non breaking spaces' state.
495      * @return <code>true</code> if non-breaking spaces (character '&#92;u00a0',
496      * numeric character reference &amp;#160; or character entity reference &amp;nbsp;)
497      * are to be replaced with normal spaces (character '&#92;u0020').
498      */

499     public boolean getReplaceNonBreakingSpaces ()
500     {
501         return (mReplaceSpace);
502     }
503
504     /**
505      * Set the 'replace non breaking spaces' state.
506      * If the setting is changed after the URL has been set, the text from the
507      * URL will be reacquired, which is possibly expensive.
508      * @param replace_space <code>true</code> if non-breaking spaces (character '&#92;u00a0',
509      * numeric character reference &amp;#160; or character entity reference &amp;nbsp;)
510      * are to be replaced with normal spaces (character '&#92;u0020').
511      */

512     public void setReplaceNonBreakingSpaces (boolean replace_space)
513     {
514         boolean oldValue = mReplaceSpace;
515         if (oldValue != replace_space)
516         {
517             mReplaceSpace = replace_space;
518             mPropertySupport.firePropertyChange (PROP_REPLACE_SPACE_PROPERTY, oldValue, replace_space);
519             resetStrings ();
520         }
521     }
522
523     /**
524      * Get the current 'collapse whitespace' state.
525      * If set to <code>true</code> this emulates the operation of browsers
526      * in interpretting text where <quote>user agents should collapse input white
527      * space sequences when producing output inter-word space</quote>.
528      * See HTML specification section 9.1 White space
529      * <a HREF="http://www.w3.org/TR/html4/struct/text.html#h-9.1">
530      * http://www.w3.org/TR/html4/struct/text.html#h-9.1</a>.
531      * @return <code>true</code> if sequences of whitespace (space '&#92;u0020',
532      * tab '&#92;u0009', form feed '&#92;u000C', zero-width space '&#92;u200B',
533      * carriage-return '\r' and newline '\n') are to be replaced with a single
534      * space.
535      */

536     public boolean getCollapse ()
537     {
538         return (mCollapse);
539     }
540
541     /**
542      * Set the current 'collapse whitespace' state.
543      * If the setting is changed after the URL has been set, the text from the
544      * URL will be reacquired, which is possibly expensive.
545      * @param collapse_whitespace If <code>true</code>, sequences of whitespace
546      * will be reduced to a single space.
547      */

548     public void setCollapse (boolean collapse_whitespace)
549     {
550         boolean oldValue = mCollapse;
551         if (oldValue != collapse_whitespace)
552         {
553             mCollapse = collapse_whitespace;
554             mPropertySupport.firePropertyChange (PROP_COLLAPSE_PROPERTY, oldValue, collapse_whitespace);
555             resetStrings ();
556         }
557     }
558
559     /**
560      * Get the current connection.
561      * @return The connection that the parser has or <code>null</code> if it
562      * hasn't been set or the parser hasn't been constructed yet.
563      */

564     public URLConnection JavaDoc getConnection ()
565     {
566         return ((null != mParser) ? mParser.getConnection () : null);
567     }
568
569     /**
570      * Set the parser's connection.
571      * The text from the URL will be fetched, which may be expensive, so this
572      * property should be set last.
573      * @param connection New value of property Connection.
574      */

575     public void setConnection (URLConnection JavaDoc connection)
576     {
577         String JavaDoc url;
578         URLConnection JavaDoc conn;
579
580         url = getURL ();
581         conn = getConnection ();
582         if (((null == conn) && (null != connection)) || ((null != conn) && !conn.equals (connection)))
583         {
584             try
585             {
586                 if (null == mParser)
587                     mParser = new Parser (connection);
588                 else
589                     mParser.setConnection (connection);
590                 mPropertySupport.firePropertyChange (PROP_URL_PROPERTY, url, getURL ());
591                 mPropertySupport.firePropertyChange (PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ());
592                 setStrings ();
593             }
594             catch (ParserException pe)
595             {
596                 updateStrings (pe.toString ());
597             }
598         }
599     }
600
    //
    // NodeVisitor overrides
    //

605     /**
606      * Appends the text to the output.
607      * @param string The text node.
608      */

609     public void visitStringNode (Text string)
610     {
611         if (!mIsScript && !mIsStyle)
612         {
613             String JavaDoc text = string.getText ();
614             if (!mIsPre)
615             {
616                 text = Translate.decode (text);
617                 if (getReplaceNonBreakingSpaces ())
618                     text = text.replace ('\u00a0',' ');
619                 if (getCollapse ())
620                     collapse (mBuffer, text);
621                 else
622                     mBuffer.append (text);
623             }
624             else
625                 mBuffer.append (text);
626         }
627     }
628
629     /**
630      * Appends a newline to the output if the tag breaks flow, and
631      * possibly sets the state of the PRE and SCRIPT flags.
632      */

633     public void visitTag (Tag tag)
634     {
635         String JavaDoc name;
636
637         if (tag instanceof LinkTag)
638             if (getLinks ())
639             { // appends the link as text between angle brackets to the output.
640
mBuffer.append ("<");
641                 mBuffer.append (((LinkTag)tag).getLink ());
642                 mBuffer.append (">");
643             }
644         name = tag.getTagName ();
645         if (name.equalsIgnoreCase ("PRE"))
646             mIsPre = true;
647         else if (name.equalsIgnoreCase ("SCRIPT"))
648             mIsScript = true;
649         else if (name.equalsIgnoreCase ("STYLE"))
650             mIsStyle = true;
651         if (tag.breaksFlow ())
652             carriage_return ();
653     }
654
655     /**
656      * Resets the state of the PRE and SCRIPT flags.
657      * @param tag The end tag to process.
658      */

659     public void visitEndTag (Tag tag)
660     {
661         String JavaDoc name;
662
663         name = tag.getTagName ();
664         if (name.equalsIgnoreCase ("PRE"))
665             mIsPre = false;
666         else if (name.equalsIgnoreCase ("SCRIPT"))
667             mIsScript = false;
668         else if (name.equalsIgnoreCase ("STYLE"))
669             mIsStyle = false;
670     }
671
672     /**
673      * Unit test.
674      * @param args Pass arg[0] as the URL to process.
675      */

676     public static void main (String JavaDoc[] args)
677     {
678         if (0 >= args.length)
679             System.out.println ("Usage: java -classpath htmlparser.jar org.htmlparser.beans.StringBean <http://whatever_url>");
680         else
681         {
682             StringBean sb = new StringBean ();
683             sb.setLinks (false);
684             sb.setReplaceNonBreakingSpaces (true);
685             sb.setCollapse (true);
686             sb.setURL (args[0]);
687             System.out.println (sb.getStrings ());
688         }
689     }
690 }
691
Popular Tags