KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > tests > ParserTest


1 // HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
2
// http://sourceforge.org/projects/htmlparser
3
// Copyright (C) 2004 Somik Raha
4
//
5
// Revision Control Information
6
//
7
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/ParserTest.java,v $
8
// $Author: derrickoswald $
9
// $Date: 2005/02/13 20:36:02 $
10
// $Revision: 1.64 $
11
//
12
// This library is free software; you can redistribute it and/or
13
// modify it under the terms of the GNU Lesser General Public
14
// License as published by the Free Software Foundation; either
15
// version 2.1 of the License, or (at your option) any later version.
16
//
17
// This library is distributed in the hope that it will be useful,
18
// but WITHOUT ANY WARRANTY; without even the implied warranty of
19
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
// Lesser General Public License for more details.
21
//
22
// You should have received a copy of the GNU Lesser General Public
23
// License along with this library; if not, write to the Free Software
24
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25
//
26

27 package org.htmlparser.tests;
28
29 import java.io.BufferedInputStream JavaDoc;
30 import java.io.DataInputStream JavaDoc;
31 import java.io.File JavaDoc;
32 import java.io.FileInputStream JavaDoc;
33 import java.io.FileWriter JavaDoc;
34 import java.io.PrintWriter JavaDoc;
35 import java.net.HttpURLConnection JavaDoc;
36 import java.net.URL JavaDoc;
37 import java.util.Locale JavaDoc;
38
39 import org.htmlparser.Node;
40 import org.htmlparser.Parser;
41 import org.htmlparser.PrototypicalNodeFactory;
42 import org.htmlparser.Remark;
43 import org.htmlparser.Tag;
44 import org.htmlparser.Text;
45 import org.htmlparser.filters.NodeClassFilter;
46 import org.htmlparser.filters.TagNameFilter;
47 import org.htmlparser.lexer.InputStreamSource;
48 import org.htmlparser.tags.BodyTag;
49 import org.htmlparser.tags.ImageTag;
50 import org.htmlparser.tags.LinkTag;
51 import org.htmlparser.tags.MetaTag;
52 import org.htmlparser.util.DefaultParserFeedback;
53 import org.htmlparser.util.NodeIterator;
54 import org.htmlparser.util.NodeList;
55 import org.htmlparser.util.ParserException;
56 import org.htmlparser.util.SimpleNodeIterator;
57
58 public class ParserTest extends ParserTestCase
59 {
// Publish this test class's name as a system property when the class is loaded.
static {
    System.setProperty("org.htmlparser.tests.ParserTest", "ParserTest");
}
64
/**
 * Creates the test case.
 * @param name the name of the test, handed on to the superclass constructor.
 */
public ParserTest(String name) {
    super(name);
}
68     public void testElements() throws Exception JavaDoc {
69         StringBuffer JavaDoc hugeData = new StringBuffer JavaDoc();
70         for (int i=0;i<5001;i++) hugeData.append('a');
71         createParser(hugeData.toString());
72         int i = 0;
73         for (NodeIterator e = parser.elements();e.hasMoreNodes();)
74         {
75             node[i++] = e.nextNode();
76         }
77         assertEquals("There should be 1 node identified",1,i);
78         // Now try getting the elements again
79
// i = 0;
80
// reader.reset();
81
// reader.setLineCount(1);
82
// reader.setPosInLine(-1);
83
// for (HTMLEnumeration e = parser.elements();e.hasMoreNodes();)
84
// {
85
// node[i++] = e.nextHTMLNode();
86
// }
87
// assertEquals("There should be 1 node identified (second call to parser.elements())",1,i);
88
}
89
90     /**
91      * This testcase needs you to be online.
92      */

93     public void testElementsFromWeb() throws Exception JavaDoc {
94         Parser parser;
95         try {
96             parser = new Parser("http://www.google.com");
97         }
98         catch (Exception JavaDoc e ){
99             throw new ParserException("You must be offline! This test needs you to be connected to the internet.",e);
100         }
101
102         Node[] node = new Node[500];
103         int i = 0;
104         for (NodeIterator e = parser.elements();e.hasMoreNodes();)
105         {
106             node[i++] = e.nextNode();
107         }
108         int cnt = i;
109         parser.reset ();
110         // Now try getting the elements again
111
i = 0;
112         for (NodeIterator e = parser.elements();e.hasMoreNodes();)
113         {
114             node[i++] = e.nextNode();
115         }
116         assertEquals("There should be "+cnt+" nodes identified (second call to parser.elements())",cnt,i);
117     }
118
119     /**
120      * Test the Parser(URLConnection) constructor.
121      * This testcase needs you to be online.
122      * Based on the form at Canada Post <code>http://www.canadapost.ca/tools/pcl/bin/default-e.asp</code>:
123      * <pre>
124      * &lt;form NAME="SearchQuick" method="POST" action="cp_search_response-e.asp"
125      * onSubmit="return runSubmit();"&gt;
126      *
127      * &lt;!-- begin test hidden field code --&gt;
128      * &lt;input TYPE="Hidden" NAME="app_language" value="english"&gt;
129      *
130      * &lt;input TYPE="Hidden" NAME="app_response_start_row_number" value="1"&gt;
131      * &lt;input TYPE="Hidden" NAME="app_response_rows_max" value="9"&gt;
132      *
133      * &lt;input TYPE="Hidden" NAME="app_source" value="quick"&gt;
134      * &lt;input TYPE="Hidden" NAME="query_source" value="q"&gt;
135      *
136      * &lt;input TYPE="Hidden" NAME="name" value&gt;
137      * &lt;input TYPE="Hidden" NAME="postal_code" value&gt;
138      * &lt;input TYPE="Hidden" NAME="directory_area_name" value&gt;
139      *
140      * &lt;input TYPE="Hidden" NAME="delivery_mode" value&gt;
141      * &lt;input TYPE="Hidden" NAME="Suffix" value&gt;
142      *
143      * &lt;input TYPE="Hidden" NAME="street_direction" value&gt;
144      * &lt;input TYPE="Hidden" NAME="installation_type" value&gt;
145      * &lt;input TYPE="Hidden" NAME="delivery_number" value&gt;
146      * &lt;input TYPE="Hidden" NAME="installation_name" value&gt;
147      * &lt;input TYPE="Hidden" NAME="unit_number" value&gt;
148      *
149      * &lt;input TYPE="Hidden" NAME="app_state" value="production"&gt;
150      * &lt;!-- end test hidden field code --&gt;
151      *
152      * &lt;p&gt;
153      * &lt;table border="0" cellpadding="0" width="90%" cellspacing="0"&gt;
154      *
155      * &lt;tr&gt;
156      * &lt;td class="tbltitle"&gt; Street Number: &lt;/td&gt;
157      * &lt;td class="tbltitle"&gt; Street Name: &lt;/td&gt;
158      * &lt;td class="tbltitle"&gt; Street Type:&lt;/td&gt;
159      * &lt;/tr&gt;
160      * &lt;tr&gt;
161      *
162      * &lt;td&gt;
163      * &lt;input type="text" name="street_number" size="10" maxlength="10"&gt;
164      * &lt;/td&gt;
165      * &lt;td&gt;
166      * &lt;input type="text" name="street_name" size="30" maxlength="40"&gt;
167      * &lt;input type="hidden" name="street_type" size="30"&gt;
168      * &lt;/td&gt;
169      * &lt;td&gt;&lt;input type="text" name="test" size="10" maxlength="30"&gt;&lt;/td&gt;
170      * &lt;/tr&gt;
171      *
172      * &lt;/table&gt;
173      * &lt;p&gt;
174      * &lt;table border="0" cellpadding="0" width="90%" cellspacing="0"&gt;
175      * &lt;tr&gt;
176      * &lt;td class="tbltitle"&gt;
177      * Municipality (City, Town, etc.):
178      * &lt;/td&gt;
179      * &lt;td class="tbltitle"&gt;
180      * Province:
181      * &lt;/td&gt;
182      *
183      * &lt;/tr&gt;
184      * &lt;tr&gt;
185      * &lt;td&gt;
186      * &lt;input type="text" name="city" size="30" maxlength="30"&gt;
187      * &lt;/td&gt;
188      * &lt;td&gt;
189      * &lt;select size="1" name="prov"&gt;
190      * &lt;option selected value="NULL"&gt;Select&lt;/option&gt;&lt;option value="AB"&gt;AB - Alberta&lt;/option&gt;&lt;option value="BC"&gt;BC - British Columbia&lt;/option&gt;&lt;option value="MB"&gt;MB - Manitoba&lt;/option&gt;&lt;option value="NB"&gt;NB - New Brunswick&lt;/option&gt;&lt;option value="NL"&gt;NL - Newfoundland and Labrador&lt;/option&gt;&lt;option value="NS"&gt;NS - Nova Scotia&lt;/option&gt;&lt;option value="NT"&gt;NT - Northwest Territories&lt;/option&gt;&lt;option value="NU"&gt;NU - Nunavut&lt;/option&gt;&lt;option value="ON"&gt;ON - Ontario&lt;/option&gt;&lt;option value="PE"&gt;PE - Prince Edward Island&lt;/option&gt;&lt;option value="QC"&gt;QC - Quebec&lt;/option&gt;&lt;option value="SK"&gt;SK - Saskatchewan&lt;/option&gt;&lt;option value="YT"&gt;YT - Yukon&lt;/option&gt;
191      *
192      * &lt;/select&gt;
193      * &lt;/td&gt;
194      * &lt;/tr&gt;
195      * &lt;tr&gt;
196      * &lt;td height="10"&gt;&amp;nbsp;&lt;/td&gt;
197      * &lt;td&gt;&amp;nbsp;&lt;/td&gt;
198      * &lt;/tr&gt;
199      * &lt;tr&gt;
200      * &lt;td colspan="2" align="right" nowrap&gt;
201      * &lt;input type="image" SRC="images/bb_submit-e.gif" name="Search" border="0" WIDTH="88" HEIGHT="23"&gt;
202      * &amp;nbsp; &lt;a HREF="#" onclick="javascript:fClearAllFields();"&gt;&lt;img SRC="images/bb_clear_form-e.gif" name="Clear" border="0" WIDTH="88" HEIGHT="23"&gt;&lt;/a&gt;
203      * &lt;/td&gt;
204      * &lt;/tr&gt;
205      * &lt;/table&gt;
206      * &lt;p&gt;
207      * &lt;/form&gt;
208      * </pre>
209      * Submits the POST and verifies the returned HTML contains an expected value.
210      */

211     public void testPOST() throws Exception JavaDoc
212     { // the form data:
213
final String JavaDoc number = "2708";
214         final String JavaDoc street = "Kelly";
215         final String JavaDoc type = "Avenue";
216         final String JavaDoc city = "Ottawa";
217         final String JavaDoc province = "ON";
218         // the correct answer
219
final String JavaDoc postal_code = "K2B 7V4";
220
221         Parser parser;
222         URL JavaDoc url;
223         HttpURLConnection JavaDoc connection;
224         StringBuffer JavaDoc buffer;
225         PrintWriter JavaDoc out;
226         boolean pass;
227         NodeIterator enumeration;
228         Node node;
229         Text string;
230
231         try
232         {
233             url = new URL JavaDoc ("http://www.canadapost.ca/tools/pcl/bin/cp_search_response-e.asp");
234              connection = (HttpURLConnection JavaDoc)url.openConnection ();
235             connection.setRequestMethod ("POST");
236             connection.setRequestProperty ("Referer", "http://www.canadapost.ca/tools/pcl/bin/default-e.asp");
237             connection.setDoOutput (true);
238             connection.setDoInput (true);
239             connection.setUseCaches (false);
240             buffer = new StringBuffer JavaDoc (1024);
241             buffer.append ("app_language=");
242             buffer.append ("english");
243             buffer.append ("&");
244             buffer.append ("app_response_start_row_number=");
245             buffer.append ("1");
246             buffer.append ("&");
247             buffer.append ("app_response_rows_max=");
248             buffer.append ("9");
249             buffer.append ("&");
250             buffer.append ("app_source=");
251             buffer.append ("quick");
252             buffer.append ("&");
253             buffer.append ("query_source=");
254             buffer.append ("q");
255             buffer.append ("&");
256             buffer.append ("name=");
257             buffer.append ("&");
258             buffer.append ("postal_code=");
259             buffer.append ("&");
260             buffer.append ("directory_area_name=");
261             buffer.append ("&");
262             buffer.append ("delivery_mode=");
263             buffer.append ("&");
264             buffer.append ("Suffix=");
265             buffer.append ("&");
266             buffer.append ("street_direction=");
267             buffer.append ("&");
268             buffer.append ("installation_type=");
269             buffer.append ("&");
270             buffer.append ("delivery_number=");
271             buffer.append ("&");
272             buffer.append ("installation_name=");
273             buffer.append ("&");
274             buffer.append ("unit_numbere=");
275             buffer.append ("&");
276             buffer.append ("app_state=");
277             buffer.append ("production");
278             buffer.append ("&");
279             buffer.append ("street_number=");
280             buffer.append (number);
281             buffer.append ("&");
282             buffer.append ("street_name=");
283             buffer.append (street);
284             buffer.append ("&");
285             buffer.append ("street_type=");
286             buffer.append (type);
287             buffer.append ("&");
288             buffer.append ("test=");
289             buffer.append ("&");
290             buffer.append ("city=");
291             buffer.append (city);
292             buffer.append ("&");
293             buffer.append ("prov=");
294             buffer.append (province);
295             buffer.append ("&");
296             buffer.append ("Search=");
297             out = new PrintWriter JavaDoc (connection.getOutputStream ());
298             out.print (buffer);
299             out.close ();
300             parser = new Parser (connection);
301             parser.setNodeFactory (new PrototypicalNodeFactory (true));
302         }
303         catch (Exception JavaDoc e)
304         {
305             throw new ParserException ("You must be offline! This test needs you to be connected to the internet.", e);
306         }
307
308         pass = false;
309         for (enumeration = parser.elements (); enumeration.hasMoreNodes ();)
310         {
311             node = enumeration.nextNode ();
312             if (node instanceof Text)
313             {
314                 string = (Text)node;
315                 if (-1 != string.getText ().indexOf (postal_code))
316                     pass = true;
317             }
318         }
319         assertTrue("POST operation failed.", pass);
320     }
321
322     /**
323      * Tests the 'from file' Parser constructor.
324      */

325     public void testFile ()
326     {
327         String JavaDoc path;
328         File JavaDoc file;
329         PrintWriter JavaDoc out;
330         Parser parser;
331         Node nodes[];
332         int i;
333         NodeIterator enumeration;
334
335         path = System.getProperty ("user.dir");
336         if (!path.endsWith (File.separator))
337             path += File.separator;
338         file = new File JavaDoc (path + "delete_me.html");
339         try
340         {
341             out = new PrintWriter JavaDoc (new FileWriter JavaDoc (file));
342             out.println ("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">");
343             out.println ("<html>");
344             out.println ("<head>");
345             out.println ("<title>test</title>");
346             out.println ("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">");
347             out.println ("</head>");
348             out.println ("<body>");
349             out.println ("This is a test page ");
350             out.println ("</body>");
351             out.println ("</html>");
352             out.close ();
353             parser = new Parser (file.getAbsolutePath (), new DefaultParserFeedback(DefaultParserFeedback.QUIET));
354             parser.setNodeFactory (new PrototypicalNodeFactory (true));
355             nodes = new Node[30];
356             i = 0;
357             for (enumeration = parser.elements (); enumeration.hasMoreNodes ();)
358             {
359                 nodes[i] = enumeration.nextNode ();
360                 i++;
361             }
362             assertEquals("Expected nodes",20,i);
363         }
364         catch (Exception JavaDoc e)
365         {
366             fail (e.toString ());
367         }
368         finally
369         {
370             file.delete ();
371         }
372     }
373
374     /**
375      * Tests deleting a file held open by the parser.
376      * See bug #1005409 Input file not free by parser
377      */

378     public void testFileDelete ()
379     {
380         String JavaDoc path;
381         File JavaDoc file;
382         PrintWriter JavaDoc out;
383         Parser parser;
384         NodeIterator enumeration;
385
386         path = System.getProperty ("user.dir");
387         if (!path.endsWith (File.separator))
388             path += File.separator;
389         file = new File JavaDoc (path + "delete_me.html");
390         try
391         {
392             out = new PrintWriter JavaDoc (new FileWriter JavaDoc (file));
393             out.println ("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">");
394             out.println ("<html>");
395             out.println ("<head>");
396             out.println ("<title>test</title>");
397             out.println ("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">");
398             out.println ("</head>");
399             out.println ("<body>");
400             out.println ("This is a test page ");
401             out.println ("</body>");
402             out.println ("</html>");
403             // fill our 16K buffer on read
404
for (int i = 0; i < InputStreamSource.BUFFER_SIZE; i++)
405                 out.println ();
406             out.close ();
407             parser = new Parser (file.getAbsolutePath (), new DefaultParserFeedback(DefaultParserFeedback.QUIET));
408             parser.setNodeFactory (new PrototypicalNodeFactory (true));
409             enumeration = parser.elements ();
410             enumeration.nextNode ();
411             if (-1 != System.getProperty ("os.name").indexOf("Windows"))
412                 // linux/unix lets you delete a file even when it's open
413
assertTrue ("file deleted with more available", !file.delete ());
414             // parser.getLexer ().getPage ().close ();
415
parser = null;
416             enumeration = null;
417             System.gc ();
418             System.runFinalization ();
419             assertTrue ("file not deleted after destroy", file.delete ());
420         }
421         catch (Exception JavaDoc e)
422         {
423             fail (e.toString ());
424         }
425         finally
426         {
427             file.delete ();
428         }
429     }
430
431     /**
432      * Test with a HTTP header with a valid charset parameter.
433      * Here, ibm.co.jp is an example of a HTTP server that correctly sets the
434      * charset in the header to match the content encoding.
435      */

436     public void testHTTPCharset ()
437     {
438         Parser parser;
439         try
440         {
441             parser = new Parser("http://www.ibm.com/jp/", Parser.noFeedback);
442             assertTrue("Character set should be Shift_JIS", parser.getEncoding ().equalsIgnoreCase ("Shift_JIS"));
443         }
444         catch (ParserException e)
445         {
446             fail ("could not open http://www.ibm.com/jp/");
447         }
448     }
449
450     /**
451      * Test with a HTML header with a charset parameter not matching the HTTP header.
452      * Here, www.sony.co.jp is an example of a HTTP server that does not set the
453      * charset in the header to match the content encoding. We check that after
454      * the enumeration is created, that the charset has changed to the correct value.
455      */

456     public void testHTMLCharset ()
457     {
458         Parser parser;
459         NodeIterator enumeration;
460
461         try
462         {
463             parser = new Parser("http://www.sony.co.jp", Parser.noFeedback);
464             assertEquals("Character set by default is ISO-8859-1", "ISO-8859-1", parser.getEncoding ());
465             enumeration = parser.elements();
466             // search for the <BODY> tag
467
while (enumeration.hasMoreNodes ())
468                 if (enumeration.nextNode () instanceof BodyTag)
469                     break;
470             assertTrue("Character set should be Shift_JIS", parser.getEncoding ().equalsIgnoreCase ("Shift_JIS"));
471         }
472         catch (ParserException e)
473         {
474             fail ("could not open http://www.sony.co.jp");
475         }
476     }
477
478     /**
479      * Test the case of a charset directive different than the HTTP header.
480      * See bug #707447 META TAG - CHARSET
481      * and bug #699886 can't parse website other than iso-8859-1
482      */

483     public void testSwitchCharset () throws ParserException
484     {
485         Parser parser;
486         String JavaDoc url = "http://htmlparser.sourceforge.net/test/gb2312Charset.html";
487         int i;
488         Node[] nodes;
489
490         parser = new Parser(url);
491         parser.setNodeFactory (new PrototypicalNodeFactory (new MetaTag ()));
492         i = 0;
493         nodes = new Node[30];
494         for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
495             nodes[i++] = e.nextNode();
496         assertEquals ("Expected nodes", 23, i);
497     }
498
499     /**
500      * Test the case of a double quoted charset directive.
501      * See bug #694477.
502      * Technically, this format does not meet the HTTP/1.1
503      * specification in RFC 2068. In this case that I believe
504      * that the quotes are being inproperly generated in the
505      * header by a server-side web application.
506      * Nonetheless, it would be nice to handle this case.
507      */

508     public void testDoubleQuotedCharset () throws ParserException
509     {
510         Parser parser;
511         String JavaDoc url = "http://htmlparser.sourceforge.net/test/DoublequotedCharset.html";
512
513         parser = new Parser(url);
514         for (NodeIterator e = parser.elements();e.hasMoreNodes();)
515             e.nextNode();
516         assertTrue ("Wrong encoding", parser.getEncoding ().equals ("UTF-8"));
517     }
518
519     /**
520      * Test the case of a single quoted charset directive.
521      * See bug #694477.
522      * Technically, this format does not meet the HTTP/1.1
523      * specification in RFC 2068. In this case that I believe
524      * that the quotes are being inproperly generated in the
525      * header by a server-side web application.
526      * Nonetheless, it would be nice to handle this case.
527      */

528     public void testSingleQuotedCharset () throws ParserException
529     {
530         Parser parser;
531         String JavaDoc url = "http://htmlparser.sourceforge.net/test/SinglequotedCharset.html";
532
533         parser = new Parser(url);
534         for (NodeIterator e = parser.elements();e.hasMoreNodes();)
535             e.nextNode();
536         assertTrue ("Wrong encoding", parser.getEncoding ().equals ("UTF-8"));
537     }
538
539     // This test is commented out because the URL no longer has a comma delimited character set.
540
// Reinstate when a suitable URL is discovered, or the unit tests set up their own HTTP server.
541
// /**
542
// * Test a bogus comma delimited charset specification in the HTTP header.
543
// * See bug #722941.
544
// * A comma delimited charset in the HTTP header does not meet the HTTP/1.1
545
// * specification in RFC 2068. In this case that I believe
546
// * that some idiot has misconfigured the HTTP server, but since it's
547
// * AOL it would be nice to handle this case.
548
// */
549
// public void testCommaListCharset () throws ParserException
550
// {
551
// URL url;
552
// URLConnection connection;
553
// Parser parser;
554
// String bogus = "http://users.aol.com/geinster/rej.htm";
555
//
556
// try
557
// {
558
// url = new URL (bogus);
559
// connection = url.openConnection ();
560
// parser = new Parser (new Lexer (new Page (connection)));
561
// // must be the default
562
// assertTrue ("Wrong encoding", parser.getEncoding ().equals ("ISO-8859-1"));
563
// for (NodeIterator e = parser.elements();e.hasMoreNodes();)
564
// e.nextNode();
565
// assertTrue ("Wrong encoding", parser.getEncoding ().equals ("windows-1252"));
566
// }
567
// catch (Exception e)
568
// {
569
// fail (e.getMessage ());
570
// }
571
// }
572

573     public void testNullUrl() {
574         try
575         {
576             new Parser("http://none.existant.url.org", Parser.noFeedback);
577             assertTrue("Should have thrown an exception!",false);
578         }
579         catch (ParserException e)
580         {
581             // expected outcome
582
}
583     }
584
585     public void testURLWithSpaces() throws ParserException{
586         Parser parser;
587         String JavaDoc url = "http://htmlparser.sourceforge.net/test/This is a Test Page.html";
588
589         parser = new Parser(url);
590         parser.setNodeFactory (new PrototypicalNodeFactory (true));
591         Node node [] = new Node[30];
592         int i = 0;
593         for (NodeIterator e = parser.elements();e.hasMoreNodes();) {
594             node[i] = e.nextNode();
595             i++;
596
597         }
598         assertEquals("Expected nodes",20,i);
599     }
600
601     public void testLinkCollection() throws ParserException {
        // Parses the inline HTML fixture below (a saved Google home page) and checks
        // that collecting with a LinkTag class filter yields exactly 11 nodes, all of
        // which are LinkTag instances.
        // NOTE(review): the fixture contains 12 "<a " occurrences; the one inside the
        // document.write(...) string in the trailing <script> is presumably the one
        // that is not counted -- confirm against the parser's script handling.
602         createParser(
603         "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"><title>Google</title><style><!--\n"+
604         "body,td,a,p,.h{font-family:arial,sans-serif;} .h{font-size: 20px;} .h{color:} .q{text-decoration:none; color:#0000cc;}\n"+
605         "//--></style>\n"+
606         "<script>\n"+
607         "<!--\n"+
608         "function sf(){document.f.q.focus();}\n"+
609         "function c(p){var f=document.f;if (f.action) {f.action = 'http://'+p;f.submit();return false;}return true;}\n"+
610         "// -->\n"+
611         "</script>\n"+
612         "</head><body bgcolor=#ffffff text=#000000 link=#0000cc vlink=#551a8b alink=#ff0000 onLoad=sf()><center><table border=0 cellspacing=0 cellpadding=0><tr><td><img SRC=\"images/logo.gif\" width=276 height=110 alt=\"Google\"></td></tr></table><br>\n"+
613         "<table border=0 cellspacing=0 cellpadding=0>" +
614             "<tr>" +
615             "<td width=15>&nbsp;</td>" +
616             "<td id=0 bgcolor=#3366cc align=center width=95 nowrap>" +
617                 "<font color=#ffffff size=-1><b>Web</b></font>" +
618             "</td>" +
619             "<td width=15>&nbsp;</td>" +
620             "<td id=1 bgcolor=#efefef align=center width=95 nowrap onClick=\"return c('www.google.com/imghp');\" style=cursor:pointer;cursor:hand;><a id=1a class=q HREF=\"/imghp?hl=en&ie=UTF-8&oe=UTF-8\" onClick=\"return c('www.google.com/imghp');\"><font size=-1>Images</font></a></td><td width=15>&nbsp;</td><td id=2 bgcolor=#efefef align=center width=95 nowrap onClick=\"return c('www.google.com/grphp');\" style=cursor:pointer;cursor:hand;><a id=2a class=q HREF=\"/grphp?hl=en&ie=UTF-8&oe=UTF-8\" onClick=\"return c('www.google.com/grphp');\"><font size=-1>Groups</font></a></td><td width=15>&nbsp;</td><td id=3 bgcolor=#efefef align=center width=95 nowrap onClick=\"return c('www.google.com/dirhp');\" style=cursor:pointer;cursor:hand;><a id=3a class=q HREF=\"/dirhp?hl=en&ie=UTF-8&oe=UTF-8\" onClick=\"return c('www.google.com/dirhp');\"><font size=-1>Directory</font></a></td><td width=15>&nbsp;</td><td id=4 bgcolor=#efefef align=center width=95 nowrap onClick=\"return c('www.google.com/nwshp');\" style=cursor:pointer;cursor:hand;><a id=4a class=q HREF=\"/nwshp?hl=en&ie=UTF-8&oe=UTF-8\" onClick=\"return c('www.google.com/nwshp');\"><font size=-1><nobr>News-<font color=red>New!</font></nobr></font></a></td><td width=15>&nbsp;</td></tr><tr><td colspan=12 bgcolor=#3366cc><img width=1 height=1 alt=\"\">" +
621             "</td>" +
622             "</tr>" +
623         "</table>" +
624         "<br>" +
625         "<form action=\"/search\" name=f>" +
626             "<table cellspacing=0 cellpadding=0>" +
627             "<tr>" +
628                 "<td width=75>&nbsp;</td>" +
629                 "<td align=center>" +
630                     "<input type=hidden name=hl value=en>" +
631                     "<input type=hidden name=ie value=\"UTF-8\">" +
632                     "<input type=hidden name=oe value=\"UTF-8\">" +
633                     "<input maxLength=256 size=55 name=q value=\"\"><br>" +
634                     "<input type=submit value=\"Google Search\" name=btnG>" +
635                     "<input type=submit value=\"I'm Feeling Lucky\" name=btnI>" +
636                 "</td>" +
637                 "<td valign=top nowrap>" +
638                     "<font size=-2>&nbsp;&#8226;&nbsp;<a HREF=/advanced_search?hl=en>Advanced&nbsp;Search</a>" +
639                     "<br>&nbsp;&#8226;&nbsp;<a HREF=/preferences?hl=en>Preferences</a>" +
640                     "<br>&nbsp;&#8226;&nbsp;<a HREF=/language_tools?hl=en>Language Tools</a>" +
641                     "</font>" +
642                 "</td>" +
643             "</tr>" +
644             "</table>" +
645         "</form><br>\n"+
646         "<br><font size=-1><a HREF=\"/ads/\">Advertise&nbsp;with&nbsp;Us</a> - <a HREF=\"/services/\">Search&nbsp;Solutions</a> - <a HREF=\"/options/\">Services&nbsp;&amp;&nbsp;Tools</a> - <a HREF=/about.html>Jobs,&nbsp;Press,&nbsp;&amp;&nbsp;Help</a><span id=hp style=\"behavior:url(#default#homepage)\"></span>\n"+
647         "<script>\n"+
648         "if (!hp.isHomePage('http://www.google.com/')) {document.write(\"<p><a HREF=\"/mgyhp.html\" onClick=\"style.behavior='url(#default#homepage)';setHomePage('http://www.google.com/');\">Make Google Your Homepage!</a>\");}\n"+
649         "</script></font>\n"+
650         "<p><font size=-2>&copy;2002 Google</font><font size=-2> - Searching 3,083,324,652 web pages</font></center></body></html>\n"
651         );
652         NodeList collectionList = new NodeList();
653         NodeClassFilter filter = new NodeClassFilter (LinkTag.class);
        // Ask every node returned by the parser to add its filter matches to collectionList.
654         for (NodeIterator e = parser.elements();e.hasMoreNodes();)
655             e.nextNode().collectInto(collectionList,filter);
656         assertEquals("Size of collection vector should be 11",11,collectionList.size());
657         // All items in collection vector should be links
658
for (SimpleNodeIterator e = collectionList.elements();e.hasMoreNodes();) {
659             Node node = e.nextNode();
660             assertTrue("Only links should have been parsed",node instanceof LinkTag);
661         }
662     }
663     public void testImageCollection() throws ParserException {
        // Parses the inline HTML fixture below (a Yahoo! PageBuilder page with five
        // thumbnail images) and checks that collecting with an "IMG" tag-name filter
        // yields exactly 5 nodes, all of which are ImageTag instances.
664         createParser(
665         "<html>\n"+
666         "<head>\n"+
667             "<meta name=\"generator\" content=\"Created Using Yahoo! PageBuilder 2.60.24\">\n"+
668         "</head>\n"+
669         "<body bgcolor=\"#FFFFFF\" link=\"#0000FF\" vlink=\"#FF0000\" text=\"#000000\"\n"+
670         " onLoad=\"window.onresize=new Function('if (navigator.appVersion==\'Netscape\') history.go(0);');\">\n"+
671         "<div id=\"layer0\" style=\"position:absolute;left:218;top:40;width:240;height:26;\">\n"+
672         "<table width=240 height=26 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
673         "<td><b><font size=\"+2\"><span style=\"font-size:24\">NISHI-HONGWAN-JI</span></font></b></td>\n"+
674         "</tr></table></div>\n"+
675         "<div id=\"layer1\" style=\"position:absolute;left:75;top:88;width:542;height:83;\">\n"+
676         "<table width=542 height=83 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
677         "<td><span style=\"font-size:14\">The Nihi Hongwanj-ji temple is very traditional, very old, and very beautiful. This is the place that we stayed on our first night in Kyoto. We then attended the morning prayer ceremony, at 6:30 am. Staying here costed us 7,500 yen, which was inclusive of dinner and breakfast, and usage of the o-furo (public bath). Felt more like a luxury hotel than a temple.</span></td>\n"+
678         "</tr></table></div>\n"+
679         "<div id=\"layer2\" style=\"position:absolute;left:144;top:287;width:128;height:96;\">\n"+
680         "<table width=128 height=96 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
681         "<td><a HREF=\"nishi-hongwanji1.html\"><img height=96 width=128 SRC=\"nishi-hongwanji1-thumb.jpg\" border=0 ></a></td>\n"+
682         "</tr></table></div>\n"+
683         "<div id=\"layer3\" style=\"position:absolute;left:415;top:285;width:128;height:96;\">\n"+
684         "<table width=128 height=96 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
685         "<td><a HREF=\"nishi-hongwanji3.html\"><img height=96 width=128 SRC=\"nishi-hongwanji2-thumb.jpg\" border=0 ></a></td>\n"+
686         "</tr></table></div>\n"+
687         "<div id=\"layer4\" style=\"position:absolute;left:414;top:182;width:128;height:96;\">\n"+
688         "<table width=128 height=96 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
689         "<td><a HREF=\"higashi-hongwanji.html\"><img height=96 width=128 SRC=\"higashi-hongwanji-thumb.jpg\" border=0 ></a></td>\n"+
690         "</tr></table></div>\n"+
691         "<div id=\"layer5\" style=\"position:absolute;left:78;top:396;width:530;height:49;\">\n"+
692         "<table width=530 height=49 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
693         "<td><span style=\"font-size:14\">Click on the pictures to see the full-sized versions. The picture at the top right corner is taken in Higashi-Hongwanji. Nishi means west, and Higashi means east. These two temples are adjacent to each other and represent two different Buddhist sects.</span></td>\n"+
694         "</tr></table></div>\n"+
695         "<div id=\"layer6\" style=\"position:absolute;left:143;top:180;width:128;height:102;\">\n"+
696         "<table width=128 height=102 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
697         "<td><a HREF=\"nishi-hongwanji4.html\"><img height=102 width=128 SRC=\"nishi-hongwanji4-thumb.jpg\" border=0 ></a></td>\n"+
698         "</tr></table></div>\n"+
699         "<div id=\"layer7\" style=\"position:absolute;left:280;top:235;width:124;height:99;\">\n"+
700         "<table width=124 height=99 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
701         "<td><a HREF=\"nishi-hongwanji-lodging.html\"><img height=99 width=124 SRC=\"nishi-hongwanji-lodging-thumb.jpg\" border=0 ></a></td>\n"+
702         "</tr></table></div>\n"+
703         "</body>\n"+
704         "</html>");
705         NodeList collectionList = new NodeList();
706         TagNameFilter filter = new TagNameFilter ("IMG");
        // Ask every node returned by the parser to add its filter matches to collectionList.
707         for (NodeIterator e = parser.elements();e.hasMoreNodes();)
708             e.nextNode().collectInto(collectionList,filter);
709         assertEquals("Size of collection vector should be 5",5,collectionList.size());
710         // All items in collection vector should be images
711
for (SimpleNodeIterator e = collectionList.elements();e.hasMoreNodes();) {
712             Node node = e.nextNode();
713             assertTrue("Only images should have been parsed",node instanceof ImageTag);
714         }
715     }
716
717     /**
718      * See bug #728241 OutOfMemory error/ Infinite loop
719      */

720     public void testOutOfMemory () throws Exception JavaDoc
721     {
722         createParser (
723             "<html><head></head>\n"
724             + "<body>\n"
725             + "<table>\n"
726             + "<tr>\n"
727             + " <td><img SRC=\"foo.jpg\" alt=\"f's b\"><font\n"
728             + " size=1>blah</font>\n"
729             + "</td>\n"
730             + "</tr>\n"
731             + "</table>\n"
732             + "</body></html>\n");
733         for (NodeIterator e = parser.elements();e.hasMoreNodes();) {
734             e.nextNode();
735         }
736     }
737
738     /**
739      * See bug #729368 Embedded quote and split tag
740      */

741     public void testEmbeddedQuoteSplit () throws Exception JavaDoc
742     {
743         createParser (
744             "<html><head></head>\n"
745             + "<body>\n"
746             + "<table>\n"
747             + "<tr><td><img SRC=\"x\" alt=\"f's b\"><font\n"
748             + "size=1>blah</font></td></tr>\n"
749             + "</table>\n"
750             + "</body></html>");
751         parser.setNodeFactory (new PrototypicalNodeFactory (true));
752         int i = 0;
753         for (NodeIterator e = parser.elements();e.hasMoreNodes();)
754         {
755             Node node = e.nextNode();
756             if (10 == i)
757             {
758                 assertTrue ("not a tag", node instanceof Tag);
759                 assertTrue ("ALT attribute incorrect", ((Tag)node).getAttribute ("ALT").equals ("f's b"));
760             }
761             i++;
762         }
763         assertEquals("Expected nodes",21,i);
764     }
765
766     /**
767      * See bug #826764 ParserException occurs only when using setInputHTML() instea
768      */

769     public void testSetInputHTML () throws Exception JavaDoc
770     {
771         String JavaDoc html;
772         String JavaDoc path;
773         File JavaDoc file;
774         PrintWriter JavaDoc out;
775         Node[] nodes;
776
777         html = "<html></html>";
778         createParser (html);
779         path = System.getProperty ("user.dir");
780         if (!path.endsWith (File.separator))
781             path += File.separator;
782         file = new File JavaDoc (path + "delete_me.html");
783         try
784         {
785             out = new PrintWriter JavaDoc (new FileWriter JavaDoc (file));
786             out.print ("<html>\r\n");
787             out.print ("<head>\r\n");
788             out.print ("<!-- BEGIN TYPE -->\r\n");
789             out.print ("<!-- NAVIGATION -->\r\n");
790             out.print ("<!-- END TYPE -->\r\n");
791             out.print ("<!-- BEGIN TITLE -->\r\n");
792             out.print ("<title>Einstiegsseite</title>\r\n");
793             out.print ("<!-- END TITLE -->\r\n");
794             out.print ("</head>\r\n");
795             out.print ("<body>\r\n");
796             out.print ("<ul>\r\n");
797             out.print ("<li>\r\n");
798             out.print ("<!-- BEGIN ITEM -->\r\n");
799             out.print ("<!-- BEGIN REF -->\r\n");
800             out.print ("<a HREF=\"kapitel1/index.html\">\r\n");
801             out.print ("<!-- END REF -->\r\n");
802             out.print ("<!-- BEGIN REFTITLE -->\r\n");
803             out.print ("Kapitel 1\r\n");
804             out.print ("<!-- END REFTITLE -->\r\n");
805             out.print ("</a>\r\n");
806             out.print ("<!-- END ITEM -->\r\n");
807             out.print ("</li>\r\n");
808             out.print ("<li>\r\n");
809             out.print ("<!-- BEGIN ITEM -->\r\n");
810             out.print ("<!-- BEGIN REF -->\r\n");
811             out.print ("<a HREF=\"kapitel2/index.html\">\r\n");
812             out.print ("<!-- END REF -->\r\n");
813             out.print ("<!-- BEGIN REFTITLE -->\r\n");
814             out.print ("Kapitel 2\r\n");
815             out.print ("<!-- END REFTITLE -->\r\n");
816             out.print ("</a>\r\n");
817             out.print ("<!-- END ITEM -->\r\n");
818             out.print ("</li>\r\n");
819             out.print ("<li>\r\n");
820             out.print ("<!-- BEGIN ITEM -->\r\n");
821             out.print ("<!-- BEGIN REF -->\r\n");
822             out.print ("<a HREF=\"kapitel3/index.html\">\r\n");
823             out.print ("<!-- END REF -->\r\n");
824             out.print ("<!-- BEGIN REFTITLE -->\r\n");
825             out.print ("Kapitel 3\r\n");
826             out.print ("<!-- END REFTITLE -->\r\n");
827             out.print ("</a>\r\n");
828             out.print ("<!-- END ITEM -->\r\n");
829             out.print ("</li>\r\n");
830             out.print ("</ul>\r\n");
831             out.print ("</body>\r\n");
832             out.print ("</html>");
833             out.close ();
834             DataInputStream JavaDoc stream = new DataInputStream JavaDoc (
835                 new BufferedInputStream JavaDoc (new FileInputStream JavaDoc (file)));
836             byte[] buffer = new byte[(int)file.length ()];
837             stream.readFully (buffer);
838             html = new String JavaDoc (buffer);
839             try
840             {
841                 parser.setInputHTML (html);
842                 nodes = parser.extractAllNodesThatAre (LinkTag.class);
843             }
844             catch (ParserException e)
845             {
846                 e.printStackTrace ();
847                 nodes = new Node[0];
848             }
849             assertTrue ("node count", 3 == nodes.length);
850         }
851         catch (Exception JavaDoc e)
852         {
853             fail (e.toString ());
854         }
855         finally
856         {
857             file.delete ();
858         }
859     }
860
861     /**
862      * Test reproducing a java.lang.StackOverflowError.
863      */

864     public void testXMLTypeToString () throws Exception JavaDoc
865     {
866         String JavaDoc guts;
867         String JavaDoc output;
868                                                                                                                                                         
869         guts = "TD width=\"69\"/";
870         createParser ("<" + guts + ">");
871         parseAndAssertNodeCount (1);
872         output = node[0].toString (); // this was where StackOverflow was thrown
873
assertTrue ("bad toString()", -1 != output.indexOf (guts));
874     }
875
876     /**
877      * See bug #883664 toUpperCase on tag names and attributes depends on locale
878      */

879     public void testDifferentLocale () throws Exception JavaDoc
880     {
881         String JavaDoc html;
882         Locale JavaDoc original;
883                                                                                                                                                         
884         html = "<title>This is supposedly Turkish.</title>";
885         original = Locale.getDefault ();
886         try
887         {
888             Locale.setDefault (new Locale JavaDoc ("tr")); // turkish
889
createParser (html);
890             parseAndAssertNodeCount (1);
891             assertStringEquals ("html", html, node[0].toHtml ());
892         }
893         finally
894         {
895             Locale.setDefault (original);
896         }
897     }
898     
899     /**
900      * See bug #900128 RemarkNode.setText() does not set Text
901      */

902     public void testSetStringText () throws Exception JavaDoc
903     {
904         String JavaDoc text;
905         String JavaDoc html;
906         String JavaDoc newtext;
907         String JavaDoc newhtml;
908         Node txt;
909
910         text = "This is just text.";
911         html = "<body>" + text + "</body>";
912         newtext = "This is different text.";
913         newhtml = "<body>" + newtext + "</body>";
914         createParser (html);
915         parseAndAssertNodeCount (1);
916         assertStringEquals ("html wrong", html, node[0].toHtml ());
917         assertTrue ("wrong number of children", 1 == node[0].getChildren ().size ());
918         assertTrue ("string node expected", node[0].getChildren ().elementAt (0) instanceof Text);
919         txt = node[0].getChildren ().elementAt (0);
920         assertStringEquals ("string html wrong", text, txt.toHtml ());
921         assertStringEquals ("string contents wrong", text, txt.getText ());
922         assertTrue ("toString wrong", txt.toString ().endsWith (text));
923         txt.setText (newtext);
924         assertStringEquals ("html wrong", newhtml, node[0].toHtml ());
925         assertStringEquals ("new string html wrong", newtext, txt.toHtml ());
926         assertStringEquals ("new string contents wrong", newtext, txt.getText ());
927         assertTrue ("toString wrong", txt.toString ().endsWith (newtext));
928     }
929
930     /**
931      * See bug #900128 RemarkNode.setText() does not set Text
932      */

933     public void testSetRemarkText () throws Exception JavaDoc
934     {
935         String JavaDoc text;
936         String JavaDoc remark;
937         String JavaDoc html;
938         String JavaDoc newtext;
939         String JavaDoc newremark;
940         String JavaDoc newhtml;
941         Node rem;
942
943         text = " This is a remark. ";
944         remark = "<!--" + text + "-->";
945         html = "<body>" + remark + "</body>";
946         newtext = " This is a different remark. ";
947         newremark = "<!--" + newtext + "-->";
948         newhtml = "<body>" + newremark + "</body>";
949         createParser (html);
950         parseAndAssertNodeCount (1);
951         assertStringEquals ("html wrong", html, node[0].toHtml ());
952         assertTrue ("wrong number of children", 1 == node[0].getChildren ().size ());
953         assertTrue ("remark node expected", node[0].getChildren ().elementAt (0) instanceof Remark);
954         rem = node[0].getChildren ().elementAt (0);
955         assertStringEquals ("remark html wrong", remark, rem.toHtml ());
956         assertStringEquals ("remark contents wrong", text, rem.getText ());
957         assertTrue ("toString wrong", rem.toString ().endsWith (text));
958         rem.setText (newtext);
959         assertStringEquals ("html wrong", newhtml, node[0].toHtml ());
960         assertStringEquals ("new remark html wrong", newremark, rem.toHtml ());
961         assertStringEquals ("new remark contents wrong", newtext, rem.getText ());
962         assertTrue ("toString wrong", rem.toString ().endsWith (newtext));
963         rem.setText (newremark);
964         assertStringEquals ("html wrong", newhtml, node[0].toHtml ());
965         assertStringEquals ("new remark html wrong", newremark, rem.toHtml ());
966         assertStringEquals ("new remark contents wrong", newtext, rem.getText ());
967         assertTrue ("toString wrong", rem.toString ().endsWith (newtext));
968     }
969
970     public void testFixSpaces () throws ParserException
971     {
972         String JavaDoc url = "http://htmlparser.sourceforge.net/test/This is a Test Page.html";
973         parser = new Parser (url);
974         assertEquals("Expected","http://htmlparser.sourceforge.net/test/This%20is%20a%20Test%20Page.html", parser.getURL ());
975     }
976 }
977
Popular Tags