ParserTest


1   // HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
2   // http://sourceforge.org/projects/htmlparser
3   // Copyright (C) 2004 Somik Raha
4   //
5   // Revision Control Information
6   //
7   // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/ParserTest.java,v $
8   // $Author: derrickoswald $
9   // $Date: 2005/02/13 20:36:02 $
10  // $Revision: 1.64 $
11  //
12  // This library is free software; you can redistribute it and/or
13  // modify it under the terms of the GNU Lesser General Public
14  // License as published by the Free Software Foundation; either
15  // version 2.1 of the License, or (at your option) any later version.
16  //
17  // This library is distributed in the hope that it will be useful,
18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  // Lesser General Public License for more details.
21  //
22  // You should have received a copy of the GNU Lesser General Public
23  // License along with this library; if not, write to the Free Software
24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  //
26  
27  package org.htmlparser.tests;
28  
29  import java.io.BufferedInputStream  ;
30  import java.io.DataInputStream  ;
31  import java.io.File  ;
32  import java.io.FileInputStream  ;
33  import java.io.FileWriter  ;
34  import java.io.PrintWriter  ;
35  import java.net.HttpURLConnection  ;
36  import java.net.URL  ;
37  import java.util.Locale  ;
38  
39  import org.htmlparser.Node;
40  import org.htmlparser.Parser;
41  import org.htmlparser.PrototypicalNodeFactory;
42  import org.htmlparser.Remark;
43  import org.htmlparser.Tag;
44  import org.htmlparser.Text;
45  import org.htmlparser.filters.NodeClassFilter;
46  import org.htmlparser.filters.TagNameFilter;
47  import org.htmlparser.lexer.InputStreamSource;
48  import org.htmlparser.tags.BodyTag;
49  import org.htmlparser.tags.ImageTag;
50  import org.htmlparser.tags.LinkTag;
51  import org.htmlparser.tags.MetaTag;
52  import org.htmlparser.util.DefaultParserFeedback;
53  import org.htmlparser.util.NodeIterator;
54  import org.htmlparser.util.NodeList;
55  import org.htmlparser.util.ParserException;
56  import org.htmlparser.util.SimpleNodeIterator;
57  
58  public class ParserTest extends ParserTestCase
59  {
60      static
61      {
62          System.setProperty ("org.htmlparser.tests.ParserTest", "ParserTest");
63      }
64  
/**
 * Creates the test case.
 * @param name The name handed on to the superclass constructor.
 */
public ParserTest(String name)
{
    super(name);
}
68      public void testElements() throws Exception   {
69          StringBuffer   hugeData = new StringBuffer  ();
70          for (int i=0;i<5001;i++) hugeData.append('a');
71          createParser(hugeData.toString());
72          int i = 0;
73          for (NodeIterator e = parser.elements();e.hasMoreNodes();)
74          {
75              node[i++] = e.nextNode();
76          }
77          assertEquals("There should be 1 node identified",1,i);
78          // Now try getting the elements again
79  //      i = 0;
80  //      reader.reset();
81  //      reader.setLineCount(1);
82  //      reader.setPosInLine(-1);
83  //      for (HTMLEnumeration e = parser.elements();e.hasMoreNodes();)
84  //      {
85  //          node[i++] = e.nextHTMLNode();
86  //      }
87  //      assertEquals("There should be 1 node identified (second call to parser.elements())",1,i);
88      }
89  
90      /**
91       * This testcase needs you to be online.
92       */
93      public void testElementsFromWeb() throws Exception   {
94          Parser parser;
95          try {
96              parser = new Parser("http://www.google.com");
97          }
98          catch (Exception   e ){
99              throw new ParserException("You must be offline! This test needs you to be connected to the internet.",e);
100         }
101 
102         Node[] node = new Node[500];
103         int i = 0;
104         for (NodeIterator e = parser.elements();e.hasMoreNodes();)
105         {
106             node[i++] = e.nextNode();
107         }
108         int cnt = i;
109         parser.reset ();
110         // Now try getting the elements again
111         i = 0;
112         for (NodeIterator e = parser.elements();e.hasMoreNodes();)
113         {
114             node[i++] = e.nextNode();
115         }
116         assertEquals("There should be "+cnt+" nodes identified (second call to parser.elements())",cnt,i);
117     }
118 
119     /**
120      * Test the Parser(URLConnection) constructor.
121      * This testcase needs you to be online.
122      * Based on the form at Canada Post <code>http://www.canadapost.ca/tools/pcl/bin/default-e.asp</code>:
123      * <pre>
124      * &lt;form NAME="SearchQuick" method="POST" action="cp_search_response-e.asp"
125      *  onSubmit="return runSubmit();"&gt;
126      *
127      * &lt;!-- begin test hidden field code --&gt;
128      *   &lt;input TYPE="Hidden" NAME="app_language" value="english"&gt;
129      *
130      *   &lt;input TYPE="Hidden" NAME="app_response_start_row_number" value="1"&gt;
131      *   &lt;input TYPE="Hidden" NAME="app_response_rows_max" value="9"&gt;
132      *
133      *   &lt;input TYPE="Hidden" NAME="app_source" value="quick"&gt;
134      *   &lt;input TYPE="Hidden" NAME="query_source" value="q"&gt;
135      *
136      *   &lt;input TYPE="Hidden" NAME="name" value&gt;
137      *   &lt;input TYPE="Hidden" NAME="postal_code" value&gt;
138      *   &lt;input TYPE="Hidden" NAME="directory_area_name" value&gt;
139      *
140      *   &lt;input TYPE="Hidden" NAME="delivery_mode" value&gt;
141      *   &lt;input TYPE="Hidden" NAME="Suffix" value&gt;
142      *
143      *   &lt;input TYPE="Hidden" NAME="street_direction" value&gt;
144      *   &lt;input TYPE="Hidden" NAME="installation_type" value&gt;
145      *   &lt;input TYPE="Hidden" NAME="delivery_number" value&gt;
146      *   &lt;input TYPE="Hidden" NAME="installation_name" value&gt;
147      *   &lt;input TYPE="Hidden" NAME="unit_number" value&gt;
148      *
149      *   &lt;input TYPE="Hidden" NAME="app_state" value="production"&gt;
150      * &lt;!-- end test hidden field code --&gt;
151      *
152      * &lt;p&gt;
153      *   &lt;table border="0" cellpadding="0" width="90%" cellspacing="0"&gt;
154      *
155      *     &lt;tr&gt;
156      *       &lt;td  class="tbltitle"&gt; Street Number: &lt;/td&gt;
157      *       &lt;td class="tbltitle"&gt; Street Name: &lt;/td&gt;
158      *       &lt;td class="tbltitle"&gt; Street Type:&lt;/td&gt;
159      *     &lt;/tr&gt;
160      *     &lt;tr&gt;
161      *
162      *       &lt;td&gt;
163      *         &lt;input type="text" name="street_number" size="10" maxlength="10"&gt;
164      *       &lt;/td&gt;
165      *       &lt;td&gt;
166      *         &lt;input type="text" name="street_name" size="30" maxlength="40"&gt;
167      *         &lt;input type="hidden" name="street_type" size="30"&gt;
168      *       &lt;/td&gt;
169      *       &lt;td&gt;&lt;input type="text" name="test" size="10" maxlength="30"&gt;&lt;/td&gt;
170      *     &lt;/tr&gt;
171      *
172      *   &lt;/table&gt;
173      * &lt;p&gt;
174      *   &lt;table border="0" cellpadding="0" width="90%" cellspacing="0"&gt;
175      *     &lt;tr&gt;
176      *       &lt;td class="tbltitle"&gt;
177      *         Municipality (City, Town, etc.):
178      *       &lt;/td&gt;
179      *       &lt;td class="tbltitle"&gt;
180      *         Province:
181      *       &lt;/td&gt;
182      *
183      *     &lt;/tr&gt;
184      *     &lt;tr&gt;
185      *       &lt;td&gt;
186      *         &lt;input type="text" name="city" size="30" maxlength="30"&gt;
187      *       &lt;/td&gt;
188      *       &lt;td&gt;
189      *         &lt;select size="1" name="prov"&gt;
190      *           &lt;option selected value="NULL"&gt;Select&lt;/option&gt;&lt;option value="AB"&gt;AB - Alberta&lt;/option&gt;&lt;option value="BC"&gt;BC - British Columbia&lt;/option&gt;&lt;option value="MB"&gt;MB - Manitoba&lt;/option&gt;&lt;option value="NB"&gt;NB - New Brunswick&lt;/option&gt;&lt;option value="NL"&gt;NL - Newfoundland and Labrador&lt;/option&gt;&lt;option value="NS"&gt;NS - Nova Scotia&lt;/option&gt;&lt;option value="NT"&gt;NT - Northwest Territories&lt;/option&gt;&lt;option value="NU"&gt;NU - Nunavut&lt;/option&gt;&lt;option value="ON"&gt;ON - Ontario&lt;/option&gt;&lt;option value="PE"&gt;PE - Prince Edward Island&lt;/option&gt;&lt;option value="QC"&gt;QC - Quebec&lt;/option&gt;&lt;option value="SK"&gt;SK - Saskatchewan&lt;/option&gt;&lt;option value="YT"&gt;YT - Yukon&lt;/option&gt;
191      *
192      *         &lt;/select&gt;
193      *       &lt;/td&gt;
194      *     &lt;/tr&gt;
195      *     &lt;tr&gt;
196      *       &lt;td height="10"&gt;&amp;nbsp;&lt;/td&gt;
197      *       &lt;td&gt;&amp;nbsp;&lt;/td&gt;
198      *     &lt;/tr&gt;
199      *     &lt;tr&gt;
200      *       &lt;td colspan="2" align="right" nowrap&gt;
201      *     &lt;input type="image" SRC="images/bb_submit-e.gif" name="Search" border="0" WIDTH="88" HEIGHT="23"&gt;
202      *         &amp;nbsp; &lt;a HREF="#" onclick="javascript:fClearAllFields();"&gt;&lt;img SRC="images/bb_clear_form-e.gif" name="Clear" border="0" WIDTH="88" HEIGHT="23"&gt;&lt;/a&gt;
203      *    &lt;/td&gt;
204      *     &lt;/tr&gt;
205      *   &lt;/table&gt;
206      * &lt;p&gt;
207      * &lt;/form&gt;
208      * </pre>
209      * Submits the POST and verifies the returned HTML contains an expected value.
210      */
211     public void testPOST() throws Exception  
212     {   // the form data:
213         final String   number = "2708";
214         final String   street = "Kelly";
215         final String   type = "Avenue";
216         final String   city = "Ottawa";
217         final String   province = "ON";
218         // the correct answer
219         final String   postal_code = "K2B 7V4";
220 
221         Parser parser;
222         URL   url;
223         HttpURLConnection   connection;
224         StringBuffer   buffer;
225         PrintWriter   out;
226         boolean pass;
227         NodeIterator enumeration;
228         Node node;
229         Text string;
230 
231         try
232         {
233             url = new URL   ("http://www.canadapost.ca/tools/pcl/bin/cp_search_response-e.asp");
234              connection = (HttpURLConnection  )url.openConnection ();
235             connection.setRequestMethod ("POST");
236             connection.setRequestProperty ("Referer", "http://www.canadapost.ca/tools/pcl/bin/default-e.asp");
237             connection.setDoOutput (true);
238             connection.setDoInput (true);
239             connection.setUseCaches (false);
240             buffer = new StringBuffer   (1024);
241             buffer.append ("app_language=");
242             buffer.append ("english");
243             buffer.append ("&");
244             buffer.append ("app_response_start_row_number=");
245             buffer.append ("1");
246             buffer.append ("&");
247             buffer.append ("app_response_rows_max=");
248             buffer.append ("9");
249             buffer.append ("&");
250             buffer.append ("app_source=");
251             buffer.append ("quick");
252             buffer.append ("&");
253             buffer.append ("query_source=");
254             buffer.append ("q");
255             buffer.append ("&");
256             buffer.append ("name=");
257             buffer.append ("&");
258             buffer.append ("postal_code=");
259             buffer.append ("&");
260             buffer.append ("directory_area_name=");
261             buffer.append ("&");
262             buffer.append ("delivery_mode=");
263             buffer.append ("&");
264             buffer.append ("Suffix=");
265             buffer.append ("&");
266             buffer.append ("street_direction=");
267             buffer.append ("&");
268             buffer.append ("installation_type=");
269             buffer.append ("&");
270             buffer.append ("delivery_number=");
271             buffer.append ("&");
272             buffer.append ("installation_name=");
273             buffer.append ("&");
274             buffer.append ("unit_numbere=");
275             buffer.append ("&");
276             buffer.append ("app_state=");
277             buffer.append ("production");
278             buffer.append ("&");
279             buffer.append ("street_number=");
280             buffer.append (number);
281             buffer.append ("&");
282             buffer.append ("street_name=");
283             buffer.append (street);
284             buffer.append ("&");
285             buffer.append ("street_type=");
286             buffer.append (type);
287             buffer.append ("&");
288             buffer.append ("test=");
289             buffer.append ("&");
290             buffer.append ("city=");
291             buffer.append (city);
292             buffer.append ("&");
293             buffer.append ("prov=");
294             buffer.append (province);
295             buffer.append ("&");
296             buffer.append ("Search=");
297             out = new PrintWriter   (connection.getOutputStream ());
298             out.print (buffer);
299             out.close ();
300             parser = new Parser (connection);
301             parser.setNodeFactory (new PrototypicalNodeFactory (true));
302         }
303         catch (Exception   e)
304         {
305             throw new ParserException ("You must be offline! This test needs you to be connected to the internet.", e);
306         }
307 
308         pass = false;
309         for (enumeration = parser.elements (); enumeration.hasMoreNodes ();)
310         {
311             node = enumeration.nextNode ();
312             if (node instanceof Text)
313             {
314                 string = (Text)node;
315                 if (-1 != string.getText ().indexOf (postal_code))
316                     pass = true;
317             }
318         }
319         assertTrue("POST operation failed.", pass);
320     }
321 
322     /**
323      * Tests the 'from file' Parser constructor.
324      */
325     public void testFile ()
326     {
327         String   path;
328         File   file;
329         PrintWriter   out;
330         Parser parser;
331         Node nodes[];
332         int i;
333         NodeIterator enumeration;
334 
335         path = System.getProperty ("user.dir");
336         if (!path.endsWith (File.separator))
337             path += File.separator;
338         file = new File   (path + "delete_me.html");
339         try
340         {
341             out = new PrintWriter   (new FileWriter   (file));
342             out.println ("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">");
343             out.println ("<html>");
344             out.println ("<head>");
345             out.println ("<title>test</title>");
346             out.println ("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">");
347             out.println ("</head>");
348             out.println ("<body>");
349             out.println ("This is a test page ");
350             out.println ("</body>");
351             out.println ("</html>");
352             out.close ();
353             parser = new Parser (file.getAbsolutePath (), new DefaultParserFeedback(DefaultParserFeedback.QUIET));
354             parser.setNodeFactory (new PrototypicalNodeFactory (true));
355             nodes = new Node[30];
356             i = 0;
357             for (enumeration = parser.elements (); enumeration.hasMoreNodes ();)
358             {
359                 nodes[i] = enumeration.nextNode ();
360                 i++;
361             }
362             assertEquals("Expected nodes",20,i);
363         }
364         catch (Exception   e)
365         {
366             fail (e.toString ());
367         }
368         finally
369         {
370             file.delete ();
371         }
372     }
373 
374     /**
375      * Tests deleting a file held open by the parser.
376      * See bug #1005409 Input file not free by parser
377      */
378     public void testFileDelete ()
379     {
380         String   path;
381         File   file;
382         PrintWriter   out;
383         Parser parser;
384         NodeIterator enumeration;
385 
386         path = System.getProperty ("user.dir");
387         if (!path.endsWith (File.separator))
388             path += File.separator;
389         file = new File   (path + "delete_me.html");
390         try
391         {
392             out = new PrintWriter   (new FileWriter   (file));
393             out.println ("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">");
394             out.println ("<html>");
395             out.println ("<head>");
396             out.println ("<title>test</title>");
397             out.println ("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">");
398             out.println ("</head>");
399             out.println ("<body>");
400             out.println ("This is a test page ");
401             out.println ("</body>");
402             out.println ("</html>");
403             // fill our 16K buffer on read
404             for (int i = 0; i < InputStreamSource.BUFFER_SIZE; i++)
405                 out.println ();
406             out.close ();
407             parser = new Parser (file.getAbsolutePath (), new DefaultParserFeedback(DefaultParserFeedback.QUIET));
408             parser.setNodeFactory (new PrototypicalNodeFactory (true));
409             enumeration = parser.elements ();
410             enumeration.nextNode ();
411             if (-1 != System.getProperty ("os.name").indexOf("Windows"))
412                 // linux/unix lets you delete a file even when it's open
413                 assertTrue ("file deleted with more available", !file.delete ());
414             // parser.getLexer ().getPage ().close ();
415             parser = null;
416             enumeration = null;
417             System.gc ();
418             System.runFinalization ();
419             assertTrue ("file not deleted after destroy", file.delete ());
420         }
421         catch (Exception   e)
422         {
423             fail (e.toString ());
424         }
425         finally
426         {
427             file.delete ();
428         }
429     }
430 
431     /**
432      * Test with a HTTP header with a valid charset parameter.
433      * Here, ibm.co.jp is an example of a HTTP server that correctly sets the
434      * charset in the header to match the content encoding.
435      */
436     public void testHTTPCharset ()
437     {
438         Parser parser;
439         try
440         {
441             parser = new Parser("http://www.ibm.com/jp/", Parser.noFeedback);
442             assertTrue("Character set should be Shift_JIS", parser.getEncoding ().equalsIgnoreCase ("Shift_JIS"));
443         }
444         catch (ParserException e)
445         {
446             fail ("could not open http://www.ibm.com/jp/");
447         }
448     }
449 
450     /**
451      * Test with a HTML header with a charset parameter not matching the HTTP header.
452      * Here, www.sony.co.jp is an example of a HTTP server that does not set the
453      * charset in the header to match the content encoding. We check that after
454      * the enumeration is created, that the charset has changed to the correct value.
455      */
456     public void testHTMLCharset ()
457     {
458         Parser parser;
459         NodeIterator enumeration;
460 
461         try
462         {
463             parser = new Parser("http://www.sony.co.jp", Parser.noFeedback);
464             assertEquals("Character set by default is ISO-8859-1", "ISO-8859-1", parser.getEncoding ());
465             enumeration = parser.elements();
466             // search for the <BODY> tag
467             while (enumeration.hasMoreNodes ())
468                 if (enumeration.nextNode () instanceof BodyTag)
469                     break;
470             assertTrue("Character set should be Shift_JIS", parser.getEncoding ().equalsIgnoreCase ("Shift_JIS"));
471         }
472         catch (ParserException e)
473         {
474             fail ("could not open http://www.sony.co.jp");
475         }
476     }
477 
478     /**
479      * Test the case of a charset directive different than the HTTP header.
480      * See bug #707447 META TAG - CHARSET
481      * and bug #699886 can't parse website other than iso-8859-1
482      */
483     public void testSwitchCharset () throws ParserException
484     {
485         Parser parser;
486         String   url = "http://htmlparser.sourceforge.net/test/gb2312Charset.html";
487         int i;
488         Node[] nodes;
489 
490         parser = new Parser(url);
491         parser.setNodeFactory (new PrototypicalNodeFactory (new MetaTag ()));
492         i = 0;
493         nodes = new Node[30];
494         for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
495             nodes[i++] = e.nextNode();
496         assertEquals ("Expected nodes", 23, i);
497     }
498 
499     /**
500      * Test the case of a double quoted charset directive.
501      * See bug #694477.
502      * Technically, this format does not meet the HTTP/1.1
503      * specification in RFC 2068. In this case that I believe
504      * that the quotes are being inproperly generated in the
505      * header by a server-side web application.
506      * Nonetheless, it would be nice to handle this case.
507      */
508     public void testDoubleQuotedCharset () throws ParserException
509     {
510         Parser parser;
511         String   url = "http://htmlparser.sourceforge.net/test/DoublequotedCharset.html";
512 
513         parser = new Parser(url);
514         for (NodeIterator e = parser.elements();e.hasMoreNodes();)
515             e.nextNode();
516         assertTrue ("Wrong encoding", parser.getEncoding ().equals ("UTF-8"));
517     }
518 
519     /**
520      * Test the case of a single quoted charset directive.
521      * See bug #694477.
522      * Technically, this format does not meet the HTTP/1.1
523      * specification in RFC 2068. In this case that I believe
524      * that the quotes are being inproperly generated in the
525      * header by a server-side web application.
526      * Nonetheless, it would be nice to handle this case.
527      */
528     public void testSingleQuotedCharset () throws ParserException
529     {
530         Parser parser;
531         String   url = "http://htmlparser.sourceforge.net/test/SinglequotedCharset.html";
532 
533         parser = new Parser(url);
534         for (NodeIterator e = parser.elements();e.hasMoreNodes();)
535             e.nextNode();
536         assertTrue ("Wrong encoding", parser.getEncoding ().equals ("UTF-8"));
537     }
538 
539     // This test is commented out because the URL no longer has a comma delimited character set.
540     // Reinstate when a suitable URL is discovered, or the unit tests set up their own HTTP server.
541 //    /**
542 //     * Test a bogus comma delimited charset specification in the HTTP header.
543 //     * See bug #722941.
544 //     * A comma delimited charset in the HTTP header does not meet the HTTP/1.1
545 //     * specification in RFC 2068. In this case that I believe
546 //     * that some idiot has misconfigured the HTTP server, but since it's
547 //     * AOL it would be nice to handle this case.
548 //     */
549 //    public void testCommaListCharset () throws ParserException
550 //    {
551 //        URL url;
552 //        URLConnection connection;
553 //        Parser parser;
554 //        String bogus = "http://users.aol.com/geinster/rej.htm";
555 //
556 //        try
557 //        {
558 //            url = new URL (bogus);
559 //            connection = url.openConnection ();
560 //            parser = new Parser (new Lexer (new Page (connection)));
561 //            // must be the default
562 //            assertTrue ("Wrong encoding", parser.getEncoding ().equals ("ISO-8859-1"));
563 //            for (NodeIterator e = parser.elements();e.hasMoreNodes();)
564 //                e.nextNode();
565 //            assertTrue ("Wrong encoding", parser.getEncoding ().equals ("windows-1252"));
566 //        }
567 //        catch (Exception e)
568 //        {
569 //            fail (e.getMessage ());
570 //        }
571 //    }
572 
573     public void testNullUrl() {
574         try
575         {
576             new Parser("http://none.existant.url.org", Parser.noFeedback);
577             assertTrue("Should have thrown an exception!",false);
578         }
579         catch (ParserException e)
580         {
581             // expected outcome
582         }
583     }
584 
585     public void testURLWithSpaces() throws ParserException{
586         Parser parser;
587         String   url = "http://htmlparser.sourceforge.net/test/This is a Test Page.html";
588 
589         parser = new Parser(url);
590         parser.setNodeFactory (new PrototypicalNodeFactory (true));
591         Node node [] = new Node[30];
592         int i = 0;
593         for (NodeIterator e = parser.elements();e.hasMoreNodes();) {
594             node[i] = e.nextNode();
595             i++;
596 
597         }
598         assertEquals("Expected nodes",20,i);
599     }
600 
601     public void testLinkCollection() throws ParserException {
602         createParser(
603         "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"><title>Google</title><style><!--\n"+
604         "body,td,a,p,.h{font-family:arial,sans-serif;} .h{font-size: 20px;} .h{color:} .q{text-decoration:none; color:#0000cc;}\n"+
605         "//--></style>\n"+
606         "<script>\n"+
607         "<!--\n"+
608         "function sf(){document.f.q.focus();}\n"+
609         "function c(p){var f=document.f;if (f.action) {f.action = 'http://'+p;f.submit();return false;}return true;}\n"+
610         "// -->\n"+
611         "</script>\n"+
612         "</head><body bgcolor=#ffffff text=#000000 link=#0000cc vlink=#551a8b alink=#ff0000 onLoad=sf()><center><table border=0 cellspacing=0 cellpadding=0><tr><td><img SRC=\"images/logo.gif\" width=276 height=110 alt=\"Google\"></td></tr></table><br>\n"+
613         "<table border=0 cellspacing=0 cellpadding=0>" +
614             "<tr>" +
615             "<td width=15>&nbsp;</td>" +
616             "<td id=0 bgcolor=#3366cc align=center width=95 nowrap>" +
617                 "<font color=#ffffff size=-1><b>Web</b></font>" +
618             "</td>" +
619             "<td width=15>&nbsp;</td>" +
620             "<td id=1 bgcolor=#efefef align=center width=95 nowrap onClick=\"return c('www.google.com/imghp');\" style=cursor:pointer;cursor:hand;><a id=1a class=q HREF=\"/imghp?hl=en&ie=UTF-8&oe=UTF-8\" onClick=\"return c('www.google.com/imghp');\"><font size=-1>Images</font></a></td><td width=15>&nbsp;</td><td id=2 bgcolor=#efefef align=center width=95 nowrap onClick=\"return c('www.google.com/grphp');\" style=cursor:pointer;cursor:hand;><a id=2a class=q HREF=\"/grphp?hl=en&ie=UTF-8&oe=UTF-8\" onClick=\"return c('www.google.com/grphp');\"><font size=-1>Groups</font></a></td><td width=15>&nbsp;</td><td id=3 bgcolor=#efefef align=center width=95 nowrap onClick=\"return c('www.google.com/dirhp');\" style=cursor:pointer;cursor:hand;><a id=3a class=q HREF=\"/dirhp?hl=en&ie=UTF-8&oe=UTF-8\" onClick=\"return c('www.google.com/dirhp');\"><font size=-1>Directory</font></a></td><td width=15>&nbsp;</td><td id=4 bgcolor=#efefef align=center width=95 nowrap onClick=\"return c('www.google.com/nwshp');\" style=cursor:pointer;cursor:hand;><a id=4a class=q HREF=\"/nwshp?hl=en&ie=UTF-8&oe=UTF-8\" onClick=\"return c('www.google.com/nwshp');\"><font size=-1><nobr>News-<font  color=red>New!</font></nobr></font></a></td><td width=15>&nbsp;</td></tr><tr><td colspan=12 bgcolor=#3366cc><img width=1 height=1 alt=\"\">" +
621             "</td>" +
622             "</tr>" +
623         "</table>" +
624         "<br>" +
625         "<form action=\"/search\" name=f>" +
626             "<table cellspacing=0 cellpadding=0>" +
627             "<tr>" +
628                 "<td width=75>&nbsp;</td>" +
629                 "<td align=center>" +
630                     "<input type=hidden name=hl value=en>" +
631                     "<input type=hidden name=ie value=\"UTF-8\">" +
632                     "<input type=hidden name=oe value=\"UTF-8\">" +
633                     "<input maxLength=256 size=55 name=q value=\"\"><br>" +
634                     "<input type=submit value=\"Google Search\" name=btnG>" +
635                     "<input type=submit value=\"I'm Feeling Lucky\" name=btnI>" +
636                 "</td>" +
637                 "<td valign=top nowrap>" +
638                     "<font size=-2>&nbsp;&#8226;&nbsp;<a HREF=/advanced_search?hl=en>Advanced&nbsp;Search</a>" +
639                     "<br>&nbsp;&#8226;&nbsp;<a HREF=/preferences?hl=en>Preferences</a>" +
640                     "<br>&nbsp;&#8226;&nbsp;<a HREF=/language_tools?hl=en>Language Tools</a>" +
641                     "</font>" +
642                 "</td>" +
643             "</tr>" +
644             "</table>" +
645         "</form><br>\n"+
646         "<br><font size=-1><a HREF=\"/ads/\">Advertise&nbsp;with&nbsp;Us</a> - <a HREF=\"/services/\">Search&nbsp;Solutions</a> - <a HREF=\"/options/\">Services&nbsp;&amp;&nbsp;Tools</a> - <a HREF=/about.html>Jobs,&nbsp;Press,&nbsp;&amp;&nbsp;Help</a><span id=hp style=\"behavior:url(#default#homepage)\"></span>\n"+
647         "<script>\n"+
648         "if (!hp.isHomePage('http://www.google.com/')) {document.write(\"<p><a HREF=\"/mgyhp.html\" onClick=\"style.behavior='url(#default#homepage)';setHomePage('http://www.google.com/');\">Make Google Your Homepage!</a>\");}\n"+
649         "</script></font>\n"+
650         "<p><font size=-2>&copy;2002 Google</font><font size=-2> - Searching 3,083,324,652 web pages</font></center></body></html>\n"
651         );
652         NodeList collectionList = new NodeList();
653         NodeClassFilter filter = new NodeClassFilter (LinkTag.class);
654         for (NodeIterator e = parser.elements();e.hasMoreNodes();)
655             e.nextNode().collectInto(collectionList,filter);
656         assertEquals("Size of collection vector should be 11",11,collectionList.size());
657         // All items in collection vector should be links
658         for (SimpleNodeIterator e = collectionList.elements();e.hasMoreNodes();) {
659             Node node = e.nextNode();
660             assertTrue("Only links should have been parsed",node instanceof LinkTag);
661         }
662     }
663     public void testImageCollection() throws ParserException {
664         createParser(
665         "<html>\n"+
666         "<head>\n"+
667             "<meta name=\"generator\" content=\"Created Using Yahoo! PageBuilder 2.60.24\">\n"+
668         "</head>\n"+
669         "<body bgcolor=\"#FFFFFF\" link=\"#0000FF\" vlink=\"#FF0000\" text=\"#000000\"\n"+
670         " onLoad=\"window.onresize=new Function('if (navigator.appVersion==\'Netscape\') history.go(0);');\">\n"+
671         "<div id=\"layer0\" style=\"position:absolute;left:218;top:40;width:240;height:26;\">\n"+
672         "<table width=240 height=26 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
673         "<td><b><font size=\"+2\"><span style=\"font-size:24\">NISHI-HONGWAN-JI</span></font></b></td>\n"+
674         "</tr></table></div>\n"+
675         "<div id=\"layer1\" style=\"position:absolute;left:75;top:88;width:542;height:83;\">\n"+
676         "<table width=542 height=83 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
677         "<td><span style=\"font-size:14\">The Nihi Hongwanj-ji temple is very traditional, very old, and very beautiful. This is the place that we stayed on our first night in Kyoto. We then attended the morning prayer ceremony, at 6:30 am. Staying here costed us 7,500 yen, which was inclusive of dinner and breakfast, and usage of the o-furo (public bath). Felt more like a luxury hotel than a temple.</span></td>\n"+
678         "</tr></table></div>\n"+
679         "<div id=\"layer2\" style=\"position:absolute;left:144;top:287;width:128;height:96;\">\n"+
680         "<table width=128 height=96 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
681         "<td><a HREF=\"nishi-hongwanji1.html\"><img height=96 width=128 SRC=\"nishi-hongwanji1-thumb.jpg\" border=0 ></a></td>\n"+
682         "</tr></table></div>\n"+
683         "<div id=\"layer3\" style=\"position:absolute;left:415;top:285;width:128;height:96;\">\n"+
684         "<table width=128 height=96 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
685         "<td><a HREF=\"nishi-hongwanji3.html\"><img height=96 width=128 SRC=\"nishi-hongwanji2-thumb.jpg\" border=0 ></a></td>\n"+
686         "</tr></table></div>\n"+
687         "<div id=\"layer4\" style=\"position:absolute;left:414;top:182;width:128;height:96;\">\n"+
688         "<table width=128 height=96 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
689         "<td><a HREF=\"higashi-hongwanji.html\"><img height=96 width=128 SRC=\"higashi-hongwanji-thumb.jpg\" border=0 ></a></td>\n"+
690         "</tr></table></div>\n"+
691         "<div id=\"layer5\" style=\"position:absolute;left:78;top:396;width:530;height:49;\">\n"+
692         "<table width=530 height=49 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
693         "<td><span style=\"font-size:14\">Click on the pictures to see the full-sized versions. The picture at the top right corner is taken in Higashi-Hongwanji. Nishi means west, and Higashi means east. These two temples are adjacent to each other and represent two different Buddhist sects.</span></td>\n"+
694         "</tr></table></div>\n"+
695         "<div id=\"layer6\" style=\"position:absolute;left:143;top:180;width:128;height:102;\">\n"+
696         "<table width=128 height=102 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
697         "<td><a HREF=\"nishi-hongwanji4.html\"><img height=102 width=128 SRC=\"nishi-hongwanji4-thumb.jpg\" border=0 ></a></td>\n"+
698         "</tr></table></div>\n"+
699         "<div id=\"layer7\" style=\"position:absolute;left:280;top:235;width:124;height:99;\">\n"+
700         "<table width=124 height=99 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
701         "<td><a HREF=\"nishi-hongwanji-lodging.html\"><img height=99 width=124 SRC=\"nishi-hongwanji-lodging-thumb.jpg\" border=0 ></a></td>\n"+
702         "</tr></table></div>\n"+
703         "</body>\n"+
704         "</html>");
705         NodeList collectionList = new NodeList();
706         TagNameFilter filter = new TagNameFilter ("IMG");
707         for (NodeIterator e = parser.elements();e.hasMoreNodes();)
708             e.nextNode().collectInto(collectionList,filter);
709         assertEquals("Size of collection vector should be 5",5,collectionList.size());
710         // All items in collection vector should be links
711         for (SimpleNodeIterator e = collectionList.elements();e.hasMoreNodes();) {
712             Node node = e.nextNode();
713             assertTrue("Only images should have been parsed",node instanceof ImageTag);
714         }
715     }
716 
717     /**
718      * See bug #728241 OutOfMemory error/ Infinite loop
719      */
720     public void testOutOfMemory () throws Exception  
721     {
722         createParser (
723             "<html><head></head>\n"
724             + "<body>\n"
725             + "<table>\n"
726             + "<tr>\n"
727             + "      <td><img SRC=\"foo.jpg\" alt=\"f's b\"><font\n"
728             + " size=1>blah</font>\n"
729             + "</td>\n"
730             + "</tr>\n"
731             + "</table>\n"
732             + "</body></html>\n");
733         for (NodeIterator e = parser.elements();e.hasMoreNodes();) {
734             e.nextNode();
735         }
736     }
737 
738     /**
739      * See bug #729368 Embedded quote and split tag
740      */
741     public void testEmbeddedQuoteSplit () throws Exception  
742     {
743         createParser (
744             "<html><head></head>\n"
745             + "<body>\n"
746             + "<table>\n"
747             + "<tr><td><img SRC=\"x\" alt=\"f's b\"><font\n"
748             + "size=1>blah</font></td></tr>\n"
749             + "</table>\n"
750             + "</body></html>");
751         parser.setNodeFactory (new PrototypicalNodeFactory (true));
752         int i = 0;
753         for (NodeIterator e = parser.elements();e.hasMoreNodes();)
754         {
755             Node node = e.nextNode();
756             if (10 == i)
757             {
758                 assertTrue ("not a tag", node instanceof Tag);
759                 assertTrue ("ALT attribute incorrect", ((Tag)node).getAttribute ("ALT").equals ("f's b"));
760             }
761             i++;
762         }
763         assertEquals("Expected nodes",21,i);
764     }
765 
766     /**
767      * See bug #826764 ParserException occurs only when using setInputHTML() instea
768      */
769     public void testSetInputHTML () throws Exception  
770     {
771         String   html;
772         String   path;
773         File   file;
774         PrintWriter   out;
775         Node[] nodes;
776 
777         html = "<html></html>";
778         createParser (html);
779         path = System.getProperty ("user.dir");
780         if (!path.endsWith (File.separator))
781             path += File.separator;
782         file = new File   (path + "delete_me.html");
783         try
784         {
785             out = new PrintWriter   (new FileWriter   (file));
786             out.print ("<html>\r\n");
787             out.print ("<head>\r\n");
788             out.print ("<!-- BEGIN TYPE -->\r\n");
789             out.print ("<!-- NAVIGATION -->\r\n");
790             out.print ("<!-- END TYPE -->\r\n");
791             out.print ("<!-- BEGIN TITLE -->\r\n");
792             out.print ("<title>Einstiegsseite</title>\r\n");
793             out.print ("<!-- END TITLE -->\r\n");
794             out.print ("</head>\r\n");
795             out.print ("<body>\r\n");
796             out.print ("<ul>\r\n");
797             out.print ("<li>\r\n");
798             out.print ("<!-- BEGIN ITEM -->\r\n");
799             out.print ("<!-- BEGIN REF -->\r\n");
800             out.print ("<a HREF=\"kapitel1/index.html\">\r\n");
801             out.print ("<!-- END REF -->\r\n");
802             out.print ("<!-- BEGIN REFTITLE -->\r\n");
803             out.print ("Kapitel 1\r\n");
804             out.print ("<!-- END REFTITLE -->\r\n");
805             out.print ("</a>\r\n");
806             out.print ("<!-- END ITEM -->\r\n");
807             out.print ("</li>\r\n");
808             out.print ("<li>\r\n");
809             out.print ("<!-- BEGIN ITEM -->\r\n");
810             out.print ("<!-- BEGIN REF -->\r\n");
811             out.print ("<a HREF=\"kapitel2/index.html\">\r\n");
812             out.print ("<!-- END REF -->\r\n");
813             out.print ("<!-- BEGIN REFTITLE -->\r\n");
814             out.print ("Kapitel 2\r\n");
815             out.print ("<!-- END REFTITLE -->\r\n");
816             out.print ("</a>\r\n");
817             out.print ("<!-- END ITEM -->\r\n");
818             out.print ("</li>\r\n");
819             out.print ("<li>\r\n");
820             out.print ("<!-- BEGIN ITEM -->\r\n");
821             out.print ("<!-- BEGIN REF -->\r\n");
822             out.print ("<a HREF=\"kapitel3/index.html\">\r\n");
823             out.print ("<!-- END REF -->\r\n");
824             out.print ("<!-- BEGIN REFTITLE -->\r\n");
825             out.print ("Kapitel 3\r\n");
826             out.print ("<!-- END REFTITLE -->\r\n");
827             out.print ("</a>\r\n");
828             out.print ("<!-- END ITEM -->\r\n");
829             out.print ("</li>\r\n");
830             out.print ("</ul>\r\n");
831             out.print ("</body>\r\n");
832             out.print ("</html>");
833             out.close ();
834             DataInputStream   stream = new DataInputStream   (
835                 new BufferedInputStream   (new FileInputStream   (file)));
836             byte[] buffer = new byte[(int)file.length ()];
837             stream.readFully (buffer);
838             html = new String   (buffer);
839             try
840             {
841                 parser.setInputHTML (html);
842                 nodes = parser.extractAllNodesThatAre (LinkTag.class);
843             }
844             catch (ParserException e)
845             {
846                 e.printStackTrace ();
847                 nodes = new Node[0];
848             }
849             assertTrue ("node count", 3 == nodes.length);
850         }
851         catch (Exception   e)
852         {
853             fail (e.toString ());
854         }
855         finally
856         {
857             file.delete ();
858         }
859     }
860 
861     /**
862      * Test reproducing a java.lang.StackOverflowError.
863      */
864     public void testXMLTypeToString () throws Exception  
865     {
866         String   guts;
867         String   output;
868                                                                                                                                                         
869         guts = "TD width=\"69\"/";
870         createParser ("<" + guts + ">");
871         parseAndAssertNodeCount (1);
872         output = node[0].toString (); // this was where StackOverflow was thrown
873         assertTrue ("bad toString()", -1 != output.indexOf (guts));
874     }
875 
876     /**
877      * See bug #883664 toUpperCase on tag names and attributes depends on locale
878      */
879     public void testDifferentLocale () throws Exception  
880     {
881         String   html;
882         Locale   original;
883                                                                                                                                                         
884         html = "<title>This is supposedly Turkish.</title>";
885         original = Locale.getDefault ();
886         try
887         {
888             Locale.setDefault (new Locale   ("tr")); // turkish
889             createParser (html);
890             parseAndAssertNodeCount (1);
891             assertStringEquals ("html", html, node[0].toHtml ());
892         }
893         finally
894         {
895             Locale.setDefault (original);
896         }
897     }
898     
899     /**
900      * See bug #900128 RemarkNode.setText() does not set Text
901      */
902     public void testSetStringText () throws Exception  
903     {
904         String   text;
905         String   html;
906         String   newtext;
907         String   newhtml;
908         Node txt;
909 
910         text = "This is just text.";
911         html = "<body>" + text + "</body>";
912         newtext = "This is different text.";
913         newhtml = "<body>" + newtext + "</body>";
914         createParser (html);
915         parseAndAssertNodeCount (1);
916         assertStringEquals ("html wrong", html, node[0].toHtml ());
917         assertTrue ("wrong number of children", 1 == node[0].getChildren ().size ());
918         assertTrue ("string node expected", node[0].getChildren ().elementAt (0) instanceof Text);
919         txt = node[0].getChildren ().elementAt (0);
920         assertStringEquals ("string html wrong", text, txt.toHtml ());
921         assertStringEquals ("string contents wrong", text, txt.getText ());
922         assertTrue ("toString wrong", txt.toString ().endsWith (text));
923         txt.setText (newtext);
924         assertStringEquals ("html wrong", newhtml, node[0].toHtml ());
925         assertStringEquals ("new string html wrong", newtext, txt.toHtml ());
926         assertStringEquals ("new string contents wrong", newtext, txt.getText ());
927         assertTrue ("toString wrong", txt.toString ().endsWith (newtext));
928     }
929 
930     /**
931      * See bug #900128 RemarkNode.setText() does not set Text
932      */
933     public void testSetRemarkText () throws Exception  
934     {
935         String   text;
936         String   remark;
937         String   html;
938         String   newtext;
939         String   newremark;
940         String   newhtml;
941         Node rem;
942 
943         text = " This is a remark. ";
944         remark = "<!--" + text + "-->";
945         html = "<body>" + remark + "</body>";
946         newtext = " This is a different remark. ";
947         newremark = "<!--" + newtext + "-->";
948         newhtml = "<body>" + newremark + "</body>";
949         createParser (html);
950         parseAndAssertNodeCount (1);
951         assertStringEquals ("html wrong", html, node[0].toHtml ());
952         assertTrue ("wrong number of children", 1 == node[0].getChildren ().size ());
953         assertTrue ("remark node expected", node[0].getChildren ().elementAt (0) instanceof Remark);
954         rem = node[0].getChildren ().elementAt (0);
955         assertStringEquals ("remark html wrong", remark, rem.toHtml ());
956         assertStringEquals ("remark contents wrong", text, rem.getText ());
957         assertTrue ("toString wrong", rem.toString ().endsWith (text));
958         rem.setText (newtext);
959         assertStringEquals ("html wrong", newhtml, node[0].toHtml ());
960         assertStringEquals ("new remark html wrong", newremark, rem.toHtml ());
961         assertStringEquals ("new remark contents wrong", newtext, rem.getText ());
962         assertTrue ("toString wrong", rem.toString ().endsWith (newtext));
963         rem.setText (newremark);
964         assertStringEquals ("html wrong", newhtml, node[0].toHtml ());
965         assertStringEquals ("new remark html wrong", newremark, rem.toHtml ());
966         assertStringEquals ("new remark contents wrong", newtext, rem.getText ());
967         assertTrue ("toString wrong", rem.toString ().endsWith (newtext));
968     }
969 
970     public void testFixSpaces () throws ParserException
971     {
972         String   url = "http://htmlparser.sourceforge.net/test/This is a Test Page.html";
973         parser = new Parser (url);
974         assertEquals("Expected","http://htmlparser.sourceforge.net/test/This%20is%20a%20Test%20Page.html", parser.getURL ());
975     }
976 }
977
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags