HTMLParserTest


1   /*
2    * Copyright (c) 2002, 2005 Gargoyle Software Inc. All rights reserved.
3    *
4    * Redistribution and use in source and binary forms, with or without
5    * modification, are permitted provided that the following conditions are met:
6    *
7    * 1. Redistributions of source code must retain the above copyright notice,
8    *    this list of conditions and the following disclaimer.
9    * 2. Redistributions in binary form must reproduce the above copyright notice,
10   *    this list of conditions and the following disclaimer in the documentation
11   *    and/or other materials provided with the distribution.
12   * 3. The end-user documentation included with the redistribution, if any, must
13   *    include the following acknowledgment:
14   *
15   *       "This product includes software developed by Gargoyle Software Inc.
16   *        (http://www.GargoyleSoftware.com/)."
17   *
18   *    Alternately, this acknowledgment may appear in the software itself, if
19   *    and wherever such third-party acknowledgments normally appear.
20   * 4. The name "Gargoyle Software" must not be used to endorse or promote
21   *    products derived from this software without prior written permission.
22   *    For written permission, please contact info@GargoyleSoftware.com.
23   * 5. Products derived from this software may not be called "HtmlUnit", nor may
24   *    "HtmlUnit" appear in their name, without prior written permission of
25   *    Gargoyle Software Inc.
26   *
27   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
28   * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
29   * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GARGOYLE
30   * SOFTWARE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
31   * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
33   * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34   * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35   * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
36   * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37   */
38  package com.gargoylesoftware.htmlunit;
39  
40  import java.net.ConnectException;
41  import java.net.SocketException;
42  import java.net.URL;
43  import java.net.URLConnection;
44  import java.util.ArrayList;
45  import java.util.Arrays;
46  import java.util.Collections;
47  import java.util.List;
48  
49  import com.gargoylesoftware.htmlunit.html.HTMLParser;
50  import com.gargoylesoftware.htmlunit.html.HtmlElement;
51  import com.gargoylesoftware.htmlunit.html.HtmlNoScript;
52  import com.gargoylesoftware.htmlunit.html.HtmlPage;
53  import com.gargoylesoftware.htmlunit.html.xpath.HtmlUnitXPath;
54  
55  /**
56   * test driver for the new HTMLParser implementation
57   *
58   * @version  $Revision: 1.15 $
59   * @author <a HREF="mailto:cse@dynabean.de">Christian Sell</a>
60   */
61  public class HTMLParserTest extends WebTestCase {
62  
63      /**
64       * Create an instance
65       * @param name The name of the test
66       */
67      public HTMLParserTest( final String name ) {
68          super(name);
69      }
70  
71      /**
72       * test the new HTMLParser on a simple HTML string and use the Jaxen XPath navigator
73       * to validate results
74       * @throws Exception failure
75       */
76      public void testSimpleHTMLString() throws Exception {
77          final WebClient webClient = new WebClient();
78          final WebResponse webResponse = new StringWebResponse(
79              "<html><head><title>TITLE</title><noscript>TEST</noscript></head><body></body></html>");
80  
81          final HtmlPage page = HTMLParser.parse(webResponse, webClient.getCurrentWindow());
82  
83          HtmlUnitXPath xpath = new HtmlUnitXPath("//noscript");
84          final String stringVal = xpath.stringValueOf(page);
85  
86          assertEquals("TEST", stringVal);
87  
88          xpath = new HtmlUnitXPath("//*[./text() = 'TEST']");
89          final HtmlElement node = (HtmlElement)xpath.selectSingleNode(page);
90  
91          assertEquals(node.getTagName(), HtmlNoScript.TAG_NAME);
92      }
93  
94      /**
95       * Test when <form> inside <table> and before <tr>
96       * @throws Exception failure
97       */
98      public void testBadlyFormedHTML() throws Exception {
99          final String content
100             = "<html><head><title>first</title>"
101             + "<script>"
102             + "function test()"
103             + "{"
104             + "  alert(document.getElementById('myInput').form.id);\n"
105             + "}"
106             + "</script>"
107             + "</head>"
108             + "<body onload='test()'>"
109             + "<table>"
110             + "<form name='myForm' action='foo' id='myForm'>"
111             + "<tr><td>"
112             + "<input type='text' name='myInput' id='myInput'/>"
113             + "</td></tr>"
114             + "</form>"
115             + "</table>"
116             + "</body></html>";
117 
118         final List collectedAlerts = new ArrayList();
119         final List expectedAlerts = Arrays.asList(new String[]{"myForm"});
120         createTestPageForRealBrowserIfNeeded(content, expectedAlerts);
121 
122         loadPage(content, collectedAlerts);
123 
124         assertEquals( expectedAlerts, collectedAlerts );
125     }
126 
127     /**
128      * Test when an illegal tag is found in head as some websites do
129      * @throws Exception failure
130      */
131     public void testUnknownTagInHead() throws Exception {
132         if (true) {
133             notImplemented();
134             return;
135         }
136 
137         // Note: the <meta> tag in this test is quite important because
138         // I could adapt the TagBalancer to make it work except with this <meta http-equiv...
139         // (it worked with <meta name=...)
140         final String content
141             = "<html><head><mainA3>"
142             + "<meta http-equiv='Content-Type' content='text/html; charset=ISO-8859-1'>"
143             + "<title>first</title>"
144             + "<script>"
145             + "function test()"
146             + "{"
147             + "  alert(document.title);\n"
148             + "}"
149             + "</script>"
150             + "</head>"
151             + "<body onload='test()'>"
152             + "</body></html>";
153 
154         final List collectedAlerts = new ArrayList();
155         final List expectedAlerts = Arrays.asList(new String[]{"first"});
156         createTestPageForRealBrowserIfNeeded(content, expectedAlerts);
157 
158         final HtmlPage page = loadPage(content, collectedAlerts);
159         System.out.println(page.asXml());
160 
161         assertEquals( expectedAlerts, collectedAlerts );
162     }
163 
164     /**
165      * test the new HTMLParser by accessing the HtmlUnit home page and detecting the copyright
166      * string.
167      *
168      * @throws Exception failure
169      */
170     public static void testHtmlUnitHomePage() throws Exception {
171 
172         final URL htmlUnitSite = new URL("http://htmlunit.sourceforge.net");
173         try {
174             final URLConnection connection = htmlUnitSite.openConnection();
175             connection.connect();
176         }
177         catch (final ConnectException e) {
178             /* sf.net's flaky web servers and not being able to connect
179              * here from the shell server can cause this, doesn't mean something
180              * is broken 
181              */
182             System.out.println("Connection could not be made to " + htmlUnitSite.toExternalForm());
183             return; 
184         }
185         catch (final SocketException e) {
186             /* Some systems do not have access to the sf.net's web page.  If the connection
187              * timesout, do not fail the test
188              */
189             System.out.println("Connection could not be made to " + htmlUnitSite.toExternalForm());
190             return;
191         }
192         
193         final WebClient webClient = new WebClient();
194         final WebResponse webResponse = new HttpWebConnection(webClient).getResponse(
195                 htmlUnitSite,
196                 SubmitMethod.GET,
197                 Collections.EMPTY_LIST,
198                 Collections.EMPTY_MAP
199         );
200 
201         final HtmlPage page = HTMLParser.parse(webResponse, webClient.getCurrentWindow());
202 
203         //find the copyright string
204         HtmlUnitXPath xpath = new HtmlUnitXPath("//div[@id='footer']/div[@class='xright']");
205         final String stringVal = xpath.stringValueOf(page).trim();
206         assertEquals("\u00A9 2002-2005, Gargoyle Software Inc.", stringVal);
207 
208         //see if the Google adds were added via Javascript
209         /* google ads not on page anymore
210         xpath = new HtmlUnitXPath("//iframe[@name = 'google_ads_frame']");
211         final HtmlInlineFrame inline = (HtmlInlineFrame)xpath.selectSingleNode(page);
212 
213         assertNotNull("find Google ads", inline);
214 
215         final HtmlPage innerPage = (HtmlPage)inline.getEnclosedPage();
216         assertNotNull(innerPage);
217         */
218     }
219 }
220
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Java Books Remove Frame
Popular Tags