KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > beans > FilterBean


// HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/FilterBean.java,v $
// $Author: derrickoswald $
// $Date: 2005/02/13 20:36:03 $
// $Revision: 1.1 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.beans;

import java.beans.PropertyChangeListener;
import java.beans.PropertyChangeSupport;
import java.io.Serializable;
import java.net.URLConnection;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

40 /**
41  * Extract nodes from a URL using a filter.
42  * FilterBean fb = new FilterBean ("http://cbc.ca");
43  * fb.setFilters (new NodeFilter[] { new TagNameFilter ("META") });
44  * fb.setURL ("http://cbc.ca");
45  * System.out.println (fb.getNodes ().toHtml ());
46  */

47 public class FilterBean
48     implements
49         Serializable JavaDoc
50 {
51     /**
52      * Property name in event where the URL contents changes.
53      */

54     public static final String JavaDoc PROP_NODES_PROPERTY = "nodes";
55
56     /**
57      * Property name in event where the URL contents changes.
58      */

59     public static final String JavaDoc PROP_TEXT_PROPERTY = "text";
60
61     /**
62      * Property name in event where the URL changes.
63      */

64     public static final String JavaDoc PROP_URL_PROPERTY = "URL";
65
66     /**
67      * Property name in event where the connection changes.
68      */

69     public static final String JavaDoc PROP_CONNECTION_PROPERTY = "connection";
70
71     /**
72      * Bound property support.
73      */

74     protected PropertyChangeSupport JavaDoc mPropertySupport;
75
76     /**
77      * The parser used to filter.
78      */

79     protected Parser mParser;
80
81     /**
82      * The filter set.
83      */

84     protected NodeFilter[] mFilters;
85
86     /**
87      * The nodes extracted from the URL.
88      */

89     protected NodeList mNodes;
90
91    /**
92      * Create a FilterBean object.
93      */

94     public FilterBean ()
95     {
96         mPropertySupport = new PropertyChangeSupport JavaDoc (this);
97         mParser = new Parser ();
98         mFilters = null;
99         mNodes = null;
100     }
101
102     //
103
// internals
104
//
105

106     /**
107      * Assign the <code>Nodes</code> property, firing the property change.
108      * @param nodes The new value of the <code>Nodes</code> property.
109      */

110     protected void updateNodes (NodeList nodes)
111     {
112         NodeList oldValue;
113         String JavaDoc oldText;
114         String JavaDoc newText;
115
116         if ((null == mNodes) || !mNodes.equals (nodes))
117         {
118             oldValue = mNodes;
119             if (null != oldValue)
120                 oldText = getText ();
121             else
122                 oldText = "";
123             if (null == oldText)
124                 oldText = "";
125             mNodes = nodes;
126             if (null != mNodes) // TODO: fix this null problem if StringBean finds no nodes
127
newText = getText ();
128             else
129                 newText = "";
130             if (null == newText)
131                 newText = "";
132             mPropertySupport.firePropertyChange (PROP_NODES_PROPERTY, oldValue, nodes);
133             if (!newText.equals (oldText))
134                 mPropertySupport.firePropertyChange (PROP_TEXT_PROPERTY, oldText, newText);
135         }
136     }
137
138     /**
139      * Apply each of the filters.
140      * The first filter is applied to the parser.
141      * Subsequent filters are applied to the output of the prior filter.
142      * @return A list of nodes passed through all filters.
143      * @throws ParserException If an encoding change occurs or there is some other problem.
144      */

145     protected NodeList applyFilters ()
146         throws
147         ParserException
148     {
149         NodeList ret;
150
151         ret = new NodeList ();
152
153         if (null != getFilters ())
154             for (int i = 0; i < getFilters ().length; i++)
155                 if (0 == i)
156                     ret = mParser.parse (getFilters ()[0]);
157                 else
158                     ret = ret.extractAllNodesThatMatch (getFilters ()[i]);
159
160         return (ret);
161     }
162
163     /**
164      * Fetch the URL contents and filter it.
165      * Only do work if there is a valid parser with it's URL set.
166      */

167     protected void setNodes ()
168     {
169         NodeList list;
170
171         if (null != getURL ())
172             try
173             {
174                 list = applyFilters ();
175                 updateNodes (list);
176             }
177             catch (EncodingChangeException ece)
178             {
179                 try
180                 { // try again with the encoding now in force
181
mParser.reset ();
182                     list = applyFilters ();
183                     updateNodes (list);
184                 }
185                 catch (ParserException pe)
186                 {
187                     updateNodes (new NodeList ());
188                 }
189              }
190             catch (ParserException pe)
191             {
192                 updateNodes (new NodeList ());
193             }
194     }
195
196     //
197
// Property change support.
198
//
199

200     /**
201      * Add a PropertyChangeListener to the listener list.
202      * The listener is registered for all properties.
203      * @param listener The PropertyChangeListener to be added.
204      */

205     public void addPropertyChangeListener (PropertyChangeListener JavaDoc listener)
206     {
207         mPropertySupport.addPropertyChangeListener (listener);
208     }
209
210     /**
211      * Remove a PropertyChangeListener from the listener list.
212      * This removes a PropertyChangeListener that was registered for all properties.
213      * @param listener The PropertyChangeListener to be removed.
214      */

215     public void removePropertyChangeListener (PropertyChangeListener JavaDoc listener)
216     {
217         mPropertySupport.removePropertyChangeListener (listener);
218     }
219
220     //
221
// Properties
222
//
223

224     /**
225      * Return the nodes of the URL matching the filter.
226      * This is the primary output of the bean.
227      * @return The nodes from the URL matching the current filter.
228      */

229     public NodeList getNodes ()
230     {
231         if (null == mNodes)
232             setNodes ();
233
234         return (mNodes);
235     }
236
237     /**
238      * Get the current URL.
239      * @return The URL from which text has been extracted, or <code>null</code>
240      * if this property has not been set yet.
241      */

242     public String JavaDoc getURL ()
243     {
244          return ((null != mParser) ? mParser.getURL () : null);
245     }
246
247     /**
248      * Set the URL to extract strings from.
249      * The text from the URL will be fetched, which may be expensive, so this
250      * property should be set last.
251      * @param url The URL that text should be fetched from.
252      */

253     public void setURL (String JavaDoc url)
254     {
255         String JavaDoc old;
256         URLConnection JavaDoc conn;
257
258         old = getURL ();
259         conn = getConnection ();
260         if (((null == old) && (null != url)) || ((null != old) && !old.equals (url)))
261         {
262             try
263             {
264                 if (null == mParser)
265                     mParser = new Parser (url);
266                 else
267                     mParser.setURL (url);
268                 mPropertySupport.firePropertyChange (PROP_URL_PROPERTY, old, getURL ());
269                 mPropertySupport.firePropertyChange (PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ());
270                 setNodes ();
271             }
272             catch (ParserException pe)
273             {
274                 updateNodes (new NodeList ());
275             }
276         }
277     }
278
279     /**
280      * Get the current connection.
281      * @return The connection that the parser has or <code>null</code> if it
282      * hasn't been set or the parser hasn't been constructed yet.
283      */

284     public URLConnection JavaDoc getConnection ()
285     {
286         return ((null != mParser) ? mParser.getConnection () : null);
287     }
288
289     /**
290      * Set the parser's connection.
291      * The text from the URL will be fetched, which may be expensive, so this
292      * property should be set last.
293      * @param connection New value of property Connection.
294      */

295     public void setConnection (URLConnection JavaDoc connection)
296     {
297         String JavaDoc url;
298         URLConnection JavaDoc conn;
299
300         url = getURL ();
301         conn = getConnection ();
302         if (((null == conn) && (null != connection)) || ((null != conn) && !conn.equals (connection)))
303         {
304             try
305             {
306                 if (null == mParser)
307                     mParser = new Parser (connection);
308                 else
309                     mParser.setConnection (connection);
310                 mPropertySupport.firePropertyChange (PROP_URL_PROPERTY, url, getURL ());
311                 mPropertySupport.firePropertyChange (PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ());
312                 setNodes ();
313             }
314             catch (ParserException pe)
315             {
316                 updateNodes (new NodeList ());
317             }
318         }
319     }
320
321     /**
322      * Get the current filter set.
323      * @return The current filters.
324      */

325     public NodeFilter[] getFilters ()
326     {
327         return (mFilters);
328     }
329
330     /**
331      * Set the filters for the bean.
332      * If the parser has been set, it is reset and
333      * the nodes are refetched with the new filters.
334      * @param filters The filter set to use.
335      */

336     public void setFilters (NodeFilter[] filters)
337     {
338         mFilters = filters;
339         if (null != getParser ())
340         {
341             getParser ().reset ();
342             setNodes ();
343         }
344     }
345
346     /**
347      * Get the parser used to fetch nodes.
348      * @return The parser used by the bean.
349      */

350     public Parser getParser ()
351     {
352         return (mParser);
353     }
354
355     /**
356      * Set the parser for the bean.
357      * The parser is used immediately to fetch the nodes,
358      * which for a null filter means all the nodes
359      * @param parser The parser to use.
360      */

361     public void setParser (Parser parser)
362     {
363         mParser = parser;
364         if (null != getFilters ())
365             setNodes ();
366     }
367
368     /**
369      * Convenience method to apply a {@link StringBean} to the results of filtering.
370      * This may yield duplicate or multiple text elements if the node list contains nodes from
371      * two or more levels in the same nested tag heirarchy, but if the node list
372      * contains only one tag, it provides access to the text within the node.
373      * @return The textual contents of the nodes that pass through the filter set,
374      * as collected by the StringBean.
375      */

376     public String JavaDoc getText ()
377     {
378         NodeList list;
379         StringBean sb;
380         String JavaDoc ret;
381
382         list = getNodes ();
383         if (0 != list.size ())
384         {
385             sb = new StringBean ();
386             for (int i = 0; i < list.size (); i++)
387                 list.elementAt (i).accept (sb);
388             ret = sb.getStrings ();
389         }
390         else
391             ret = "";
392         
393         return (ret);
394     }
395
396     /**
397      * Unit test.
398      * @param args Pass arg[0] as the URL to process,
399      * and optionally a node name for filtering.
400      */

401     public static void main (String JavaDoc[] args)
402     {
403         if (0 >= args.length)
404             System.out.println ("Usage: java -classpath htmlparser.jar org.htmlparser.beans.FilterBean <http://whatever_url> [node name]");
405         else
406         {
407             FilterBean fb = new FilterBean ();
408             if (1 < args.length)
409                 fb.setFilters (new NodeFilter[] { new org.htmlparser.filters.TagNameFilter (args[1]) });
410             fb.setURL (args[0]);
411             //System.out.println (fb.getNodes ().toHtml ());
412
System.out.println (fb.getText ());
413         }
414     }
415 }
416
Popular Tags