FilterTest


1   // HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
2   // http://sourceforge.org/projects/htmlparser
3   // Copyright (C) 2003 Derrick Oswald
4   //
5   // Revision Control Information
6   //
7   // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/filterTests/FilterTest.java,v $
8   // $Author: derrickoswald $
9   // $Date: 2004/07/02 00:49:29 $
10  // $Revision: 1.7 $
11  //
12  // This library is free software; you can redistribute it and/or
13  // modify it under the terms of the GNU Lesser General Public
14  // License as published by the Free Software Foundation; either
15  // version 2.1 of the License, or (at your option) any later version.
16  //
17  // This library is distributed in the hope that it will be useful,
18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  // Lesser General Public License for more details.
21  //
22  // You should have received a copy of the GNU Lesser General Public
23  // License along with this library; if not, write to the Free Software
24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  //
26  
27  package org.htmlparser.tests.filterTests;
28  
29  import org.htmlparser.Parser;
30  import org.htmlparser.Tag;
31  import org.htmlparser.filters.AndFilter;
32  import org.htmlparser.filters.CssSelectorNodeFilter;
33  import org.htmlparser.filters.HasAttributeFilter;
34  import org.htmlparser.filters.HasChildFilter;
35  import org.htmlparser.filters.NodeClassFilter;
36  import org.htmlparser.filters.NotFilter;
37  import org.htmlparser.filters.OrFilter;
38  import org.htmlparser.filters.RegexFilter;
39  import org.htmlparser.filters.StringFilter;
40  import org.htmlparser.filters.TagNameFilter;
41  import org.htmlparser.lexer.Lexer;
42  import org.htmlparser.Text;
43  import org.htmlparser.tags.BodyTag;
44  import org.htmlparser.tags.LinkTag;
45  import org.htmlparser.tests.ParserTestCase;
46  import org.htmlparser.util.NodeIterator;
47  import org.htmlparser.util.NodeList;
48  import org.htmlparser.util.ParserException;
49  
50  /**
51   * Test the operation of filters.
52   */
53  public class FilterTest extends ParserTestCase
54  {
55      static
56      {
57          System.setProperty ("org.htmlparser.tests.filterTests.FilterTest", "FilterTest");
58      }
59  
/**
 * Create a filter test case.
 * @param name The test name, handed unchanged to the superclass constructor.
 */
public FilterTest (String name)
{
    super (name);
}
64  
65      /**
66       * Test node class filtering.
67       */
68      public void testNodeClass () throws ParserException
69      {
70          String   guts;
71          String   html;
72          NodeList list;
73  
74          guts = "<body>Now is the time for all good men..</body>";
75          html = "<html>" + guts + "</html>";
76          createParser (html);
77          list = parser.extractAllNodesThatMatch (new NodeClassFilter (BodyTag.class));
78          assertEquals ("only one element", 1, list.size ());
79          assertType ("should be BodyTag", BodyTag.class, list.elementAt (0));
80          BodyTag body = (BodyTag)list.elementAt (0);
81          assertEquals ("only one child", 1, body.getChildCount ());
82          assertSuperType ("should be Text", Text.class, body.getChildren ().elementAt (0));
83          assertStringEquals("html", guts, body.toHtml ());
84      }
85  
86  
87      /**
88       * Test tag name filtering.
89       */
90      public void testTagName () throws ParserException
91      {
92          String   guts;
93          String   html;
94          NodeList list;
95  
96          guts = "<booty>Now is the time for all good men..</booty>";
97          html = "<html>" + guts + "</html>";
98          createParser (html);
99          list = parser.extractAllNodesThatMatch (new TagNameFilter ("booty"));
100         assertEquals ("only one element", 1, list.size ());
101         assertSuperType ("should be Tag", Tag.class, list.elementAt (0));
102         assertStringEquals("name", "BOOTY", ((Tag)(list.elementAt (0))).getTagName ());
103     }
104 
105     /**
106      * Test string filtering.
107      */
108     public void testString () throws ParserException
109     {
110         String   guts;
111         String   html;
112         NodeList list;
113 
114         guts = "<body>Now is the <a id=target><b>time</b></a> for all good <time>men</time>..</body>";
115         html = "<html>" + guts + "</html>";
116         createParser (html);
117         list = parser.extractAllNodesThatMatch (new StringFilter ("Time"));
118         assertEquals ("only one element", 1, list.size ());
119         assertSuperType ("should be String", Text.class, list.elementAt (0));
120         assertStringEquals("name", "time", ((Text)list.elementAt (0)).getText ());
121         // test case sensitivity
122         list = parser.extractAllNodesThatMatch (new StringFilter ("Time", true));
123         assertEquals ("should be no elements", 0, list.size ());
124     }
125 
126     /**
127      * Test child filtering.
128      */
129     public void testChild () throws ParserException
130     {
131         String   guts;
132         String   html;
133         NodeList list;
134 
135         guts = "<body>Now is the <a id=target><b>time</b></a> for all good <a HREF=http://bongo.com>men</a>..</body>";
136         html = "<html>" + guts + "</html>";
137         createParser (html);
138         list = parser.extractAllNodesThatMatch (new HasChildFilter (new TagNameFilter ("b")));
139         assertEquals ("only one element", 1, list.size ());
140         assertType ("should be LinkTag", LinkTag.class, list.elementAt (0));
141         LinkTag link = (LinkTag)list.elementAt (0);
142         assertEquals ("three children", 3, link.getChildCount ());
143         assertSuperType ("should be TagNode", Tag.class, link.getChildren ().elementAt (0));
144         Tag tag = (Tag)link.getChildren ().elementAt (0);
145         assertStringEquals("name", "B", tag.getTagName ());
146     }
147 
148     /**
149      * Test attribute filtering.
150      */
151     public void testAttribute () throws ParserException
152     {
153         String   guts;
154         String   html;
155         NodeList list;
156 
157         guts = "<body>Now is the <a id=target><b>time</b></a> for all good <a HREF=http://bongo.com>men</a>..</body>";
158         html = "<html>" + guts + "</html>";
159         createParser (html);
160         list = parser.extractAllNodesThatMatch (new HasAttributeFilter ("id"));
161         assertEquals ("only one element", 1, list.size ());
162         assertType ("should be LinkTag", LinkTag.class, list.elementAt (0));
163         LinkTag link = (LinkTag)list.elementAt (0);
164         assertEquals ("attribute value", "target", link.getAttribute ("id"));
165     }
166 
167     /**
168      * Test and filtering.
169      */
170     public void testAnd () throws ParserException
171     {
172         String   guts;
173         String   html;
174         NodeList list;
175 
176         guts = "<body>Now is the <a id=one><b>time</b></a> for all good <a id=two><b>men</b></a>..</body>";
177         html = "<html>" + guts + "</html>";
178         createParser (html);
179         list = parser.extractAllNodesThatMatch (
180             new AndFilter (
181                 new HasChildFilter (
182                     new TagNameFilter ("b")),
183                 new HasChildFilter (
184                     new StringFilter ("men")))
185                 );
186         assertEquals ("only one element", 1, list.size ());
187         assertType ("should be LinkTag", LinkTag.class, list.elementAt (0));
188         LinkTag link = (LinkTag)list.elementAt (0);
189         assertEquals ("attribute value", "two", link.getAttribute ("id"));
190     }
191 
192     /**
193      * Test or filtering.
194      */
195     public void testOr () throws ParserException
196     {
197         String   guts;
198         String   html;
199         NodeList list;
200 
201         guts = "<body>Now is the <a id=one><b>time</b></a> for <a id=two><b>all</b></a> good <a id=three><b>men</b></a>..</body>";
202         html = "<html>" + guts + "</html>";
203         createParser (html);
204         list = parser.extractAllNodesThatMatch (
205             new OrFilter (
206                 new HasChildFilter (
207                     new StringFilter ("time")),
208                 new HasChildFilter (
209                     new StringFilter ("men")))
210                 );
211         assertEquals ("two elements", 2, list.size ());
212         assertType ("should be LinkTag", LinkTag.class, list.elementAt (0));
213         LinkTag link = (LinkTag)list.elementAt (0);
214         assertEquals ("attribute value", "one", link.getAttribute ("id"));
215         assertType ("should be LinkTag", LinkTag.class, list.elementAt (1));
216         link = (LinkTag)list.elementAt (1);
217         assertEquals ("attribute value", "three", link.getAttribute ("id"));
218     }
219 
220     /**
221      * Test not filtering.
222      */
223     public void testNot () throws ParserException
224     {
225         String   guts;
226         String   html;
227         NodeList list;
228 
229         guts = "<body>Now is the <a id=one><b>time</b></a> for <a id=two><b>all</b></a> good <a id=three><b>men</b></a>..</body>";
230         html = "<html>" + guts + "</html>";
231         createParser (html);
232         list = parser.extractAllNodesThatMatch (
233             new AndFilter (
234                 new HasChildFilter (
235                     new TagNameFilter ("b")),
236                 new NotFilter (
237                     new HasChildFilter (
238                         new StringFilter ("all"))))
239                 );
240         assertEquals ("two elements", 2, list.size ());
241         assertType ("should be LinkTag", LinkTag.class, list.elementAt (0));
242         LinkTag link = (LinkTag)list.elementAt (0);
243         assertEquals ("attribute value", "one", link.getAttribute ("id"));
244         assertType ("should be LinkTag", LinkTag.class, list.elementAt (1));
245         link = (LinkTag)list.elementAt (1);
246         assertEquals ("attribute value", "three", link.getAttribute ("id"));
247     }
248 
249     public void testEscape() throws Exception  
250     {
251         assertEquals ("douchebag", CssSelectorNodeFilter.unescape ("doucheba\\g").toString ());
252     }
253 
254     public void testSelectors() throws Exception  
255     {
256         String   html = "<html><head><title>sample title</title></head><body inserterr=\"true\" yomama=\"false\"><h3 id=\"heading\">big </invalid>heading</h3><ul id=\"things\"><li><br word=\"broken\"/>&gt;moocow<li><applet/>doohickey<li class=\"last\"><b class=\"item\">final<br>item</b></ul></body></html>";
257         Lexer l;
258         Parser p;
259         CssSelectorNodeFilter it;
260         NodeIterator i;
261         int count;
262 
263         l = new Lexer (html);
264         p = new Parser (l);
265         it = new CssSelectorNodeFilter ("li + li");
266         count = 0;
267         for (i = p.extractAllNodesThatMatch (it).elements (); i.hasMoreNodes ();)
268         {
269             assertEquals ("tag name wrong", "LI", ((Tag)i.nextNode()).getTagName());
270             count++;
271         }
272         assertEquals ("wrong count", 2, count);
273     }
274 
275     /**
276      * Test regular expression matching:
277      */
278     public void testRegularExpression () throws Exception  
279     {
280         String   target =
281               "\n"
282             + "\n"
283             + "Most recently, in the Western Conference final, the Flames knocked off \n"
284             + "the San Jose Sharks, the Pacific Division champions, to become the first \n"
285             + "Canadian team to reach the Stanley Cup Championship series since 1994.";
286             
287         String   html =
288               "<html><head><title>CBC Sports Online: NHL Playoffs</title></head>"
289             + "<body><h1>CBC SPORTS ONLINE</h1>\n"
290             + "The Calgary Flames have already defeated three NHL division winners \n"
291             + "during their improbable playoff run. If they are to hoist the Stanley \n"
292             + "Cup they'll have to go through one more. <p><table ALIGN=\"Right\" width=196 CELLPADDING=0 cellspacing=0 hspace=4> <tr><td><img SRC=\"/gfx/topstory/sports/iginla_j0524.jpg\" width=194 height=194 hspace=3 border=1><br>\n"
293             + "\n"
294             + "<font SIZE=\"1\" FACE=\"verdana,arial\">\n"
295             + "Jarome Iginla skates during the Flames' practice on Monday. Calgary takes on the Tampa Bay Lightning in the Stanley Cup finals beginning Tuesday night in Tampa\n"
296             + "</font></td></tr></table>\n"
297             + "\n"
298             + "\n"
299             + "In the post-season's first round, the Flames defeated the Vancouver \n"
300             + "Canucks, the Northwest Division winners, in seven tough games. <p>\n"
301             + "\n"
302             + "In Round 2 it was the Detroit Red Wings, who not only won the Central \n"
303             + "Division, but also boasted the NHL's best overall record during the \n"
304             + "regular season, who fell to the Flames. <p>"
305             + target
306             + "<p>\n"
307             + "\n"
308             + "Up next for the Flames is the Tampa Bay Lighting -- the runaway winners \n"
309             + "of the NHL's Southeast Division and the Eastern Conference's best team \n"
310             + "during the regular season. <p>\n"
311             + "\n"
312             + "The Lighting advanced by beating the Philadelphia Flyers in the Eastern \n"
313             + "Conference final. <p>\n"
314             + "</body></html>\n";
315         Lexer lexer;
316         Parser parser;
317         RegexFilter filter;
318         NodeIterator iterator;
319         int count;
320 
321         lexer = new Lexer (html);
322         parser = new Parser (lexer);
323         filter = new RegexFilter ("(19|20)\\d\\d([- \\\\/.](0[1-9]|1[012])[- \\\\/.](0[1-9]|[12][0-9]|3[01]))?");
324         count = 0;
325         for (iterator = parser.extractAllNodesThatMatch (filter).elements (); iterator.hasMoreNodes ();)
326         {
327             assertEquals ("text wrong", target, iterator.nextNode ().toHtml ());
328             count++;
329         }
330         assertEquals ("wrong count", 1, count);
331     }
332 }
333 
334
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Free Books Free Magazines
Popular Tags