KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > outerj > daisy > htmlcleaner > HtmlCleanerTest


1 /*
2  * Copyright 2004 Outerthought bvba and Schaubroeck nv
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16 package org.outerj.daisy.htmlcleaner;
17
18 import junit.framework.TestCase;
19 import org.xml.sax.InputSource JavaDoc;
20
21 import java.io.InputStream JavaDoc;
22 import java.io.Reader JavaDoc;
23 import java.io.InputStreamReader JavaDoc;
24 import java.io.BufferedReader JavaDoc;
25
26 public class HtmlCleanerTest extends TestCase {
27     public void testIt() throws Exception JavaDoc {
28         HtmlCleanerFactory factory = new HtmlCleanerFactory();
29         InputSource JavaDoc is = new InputSource JavaDoc(getClass().getClassLoader().getResourceAsStream("org/outerj/daisy/htmlcleaner/cleanerconf.xml"));
30         HtmlCleanerTemplate template = factory.buildTemplate(is);
31
32         String JavaDoc result;
33         HtmlCleaner cleaner = template.newHtmlCleaner();
34         // the \u0004 in there is to check that invalid XML characters are removed
35
result = cleaner.cleanToString("<html><body>\u0004abc</body></html>");
36         assertEquals(readResource("output1.txt"), result);
37
38         cleaner = template.newHtmlCleaner();
39         result = cleaner.cleanToString("<html xmlns='abc'><body>abc<ul> </ul></body></html>");
40         assertEquals(readResource("output1.txt"), result);
41
42         cleaner = template.newHtmlCleaner();
43         result = cleaner.cleanToString("<x:html xmlns:x='abc'><x:body x:r='z'>abc</x:body></x:html>");
44         assertEquals(readResource("output1.txt"), result);
45
46         cleaner = template.newHtmlCleaner();
47         result = cleaner.cleanToString("abc");
48         assertEquals(readResource("output1.txt"), result);
49
50         cleaner = template.newHtmlCleaner();
51         result = cleaner.cleanToString("<html>abc</html>");
52         assertEquals(readResource("output1.txt"), result);
53
54         cleaner = template.newHtmlCleaner();
55         result = cleaner.cleanToString("<html><body>abc</html>");
56         assertEquals(readResource("output1.txt"), result);
57
58         // * free text in body should be embedded in <p>
59
// * two br's should be translated to new paragraph
60
cleaner = template.newHtmlCleaner();
61         result = cleaner.cleanToString("<html><body>abc<br/><br/>def</html>");
62         assertEquals(readResource("output2.txt"), result);
63
64         // * more then two br's should give same result
65
cleaner = template.newHtmlCleaner();
66         result = cleaner.cleanToString("<html><body>abc<br/><br/><br/>def</html>");
67         assertEquals(readResource("output2.txt"), result);
68
69         // * two br's are translated to new paragraph
70
// * one br remains one br
71
// * one or more br's before </p> closing tag: remove them
72
cleaner = template.newHtmlCleaner();
73         result = cleaner.cleanToString("<html><body>abc<br/><br/>def<p>xyz<br/>xyz</p><p>yes<br/></p><p>yesyes<br/><br/><br/></html>");
74         assertEquals(readResource("output3.txt"), result);
75
76         // * table inside a <p> tag
77
cleaner = template.newHtmlCleaner();
78         result = cleaner.cleanToString("<html><body><p><table><tr><td>hello!</td></tr></table></p></html>");
79         assertEquals(readResource("output4.txt"), result);
80
81         // ul inside a p tag
82
cleaner = template.newHtmlCleaner();
83         result = cleaner.cleanToString("<html><body><p><ul><li>hello!</li></ul></p></html>");
84         assertEquals(readResource("output5.txt"), result);
85
86         // ul inside a p tag with still some text around it
87
cleaner = template.newHtmlCleaner();
88         result = cleaner.cleanToString("<html><body><p>abc<ul><li>hello!</li></ul>def</p></html>");
89         assertEquals(readResource("output6.txt"), result);
90
91         // test text reflow
92
cleaner = template.newHtmlCleaner();
93         result = cleaner.cleanToString("<html><body><p>Hi, this is a text longer then 80 characters which will hence be split across multiple lines. Isn't this interesting. No it isn't. Anyhow, have I told you about that time when I invented the wheel? Well, it was a long time ago.</p></html>");
94         assertEquals(readResource("output7.txt"), result);
95
96         // test removal of not-allowed tags
97
cleaner = template.newHtmlCleaner();
98         result = cleaner.cleanToString("<html><body><p><font>abc</font></p></html>");
99         assertEquals(readResource("output1.txt"), result);
100
101         // test translation of span with styling
102
cleaner = template.newHtmlCleaner();
103         result = cleaner.cleanToString("<html><body><p><span style='color: green; font-weight:bold '>abc</span><span style='font-style:italic'>abc</span><span style='font-style:italic;font-weight:bold'>abc</span></p></html>");
104         assertEquals(readResource("output8.txt"), result);
105
106         // test img src conversion
107
cleaner = template.newHtmlCleaner();
108         result = cleaner.cleanToString("<html><body><img SRC='hi' daisysrc='daisy:123'/></body></html>");
109         assertEquals(readResource("output9.txt"), result);
110
111         cleaner = template.newHtmlCleaner();
112         result = cleaner.cleanToString("<html><body>Hi this is <strong>strong</strong> and <em>emphasized</em></body></html>");
113         assertEquals(readResource("output10.txt"), result);
114
115         cleaner = template.newHtmlCleaner();
116         result = cleaner.cleanToString("<html><body>Hi this is <strong>strong</strong><em>emphasized</em></body></html>");
117         assertEquals(readResource("output11.txt"), result);
118
119         cleaner = template.newHtmlCleaner();
120         result = cleaner.cleanToString("<html><body>Hi this is <strong>strong</strong> <em>emphasized</em></body></html>");
121         assertEquals(readResource("output12.txt"), result);
122
123         cleaner = template.newHtmlCleaner();
124         result = cleaner.cleanToString("<html><body>aaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb ccc</body></html>");
125         assertEquals(readResource("output13.txt"), result);
126
127         cleaner = template.newHtmlCleaner();
128         result = cleaner.cleanToString("<html><body>aaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb<img SRC='somewhere'/></body></html>");
129         assertEquals(readResource("output14.txt"), result);
130
131         cleaner = template.newHtmlCleaner();
132         result = cleaner.cleanToString("<html><body>test test test test test test test \n\n test test test test test test test<a HREF='http://outerthought.org'>test</a> test</body></html>");
133         assertEquals(readResource("output15.txt"), result);
134
135         cleaner = template.newHtmlCleaner();
136         result = cleaner.cleanToString("<html><body><strong>x</strong><img SRC='x.gif'/><strong>x</strong><img SRC='x.gif'/><strong>x</strong><img SRC='x.gif'/><strong>x</strong><img SRC='x.gif'/><strong>x</strong><img SRC='x.gif'/></html>");
137         assertEquals(readResource("output16.txt"), result);
138
139         cleaner = template.newHtmlCleaner();
140         result = cleaner.cleanToString("<html><body><p> a b c </p></body></html>");
141         assertEquals(readResource("output17.txt"), result);
142
143         cleaner = template.newHtmlCleaner();
144         result = cleaner.cleanToString("<html><body><p> a b c </p> </body></html>");
145         assertEquals(readResource("output17.txt"), result);
146
147         cleaner = template.newHtmlCleaner();
148         result = cleaner.cleanToString("<html><body> a b c <br/> </html>");
149         assertEquals(readResource("output17.txt"), result);
150
151         // test removal of <br> inside <td>
152
cleaner = template.newHtmlCleaner();
153         result = cleaner.cleanToString("<html><body><table><tbody><tr><td><br/></td></tr></tbody></table></html>");
154         assertEquals(readResource("output18.txt"), result);
155
156         cleaner = template.newHtmlCleaner();
157         result = cleaner.cleanToString("<html><body><table><tbody><tr><td><br/>\n</td></tr></tbody></table></html>");
158         assertEquals(readResource("output18.txt"), result);
159
160         String JavaDoc teststring = "<html><head><link rel=\"stylesheet\" type=\"text/css\" HREF=\"/daisy/resources/skins/default/css/htmlarea.css\" /></head>\n" +
161                 " <body>\n" +
162                 " \n" +
163                 " <p><strong>asfasdfa</strong></p>\n" +
164                 " \n" +
165                 " <p><strong>dfsafsa<br /></strong></p><p><strong><br />asfj aflad <span style=\"font-style: italic;\">fafjls fd<br /></span></strong></p><p><strong><span style=\"font-style: italic;\">saj lfsdj </span>lkjlkjweids<br /></strong></p>\n" +
166                 " \n" +
167                 " </body></html>";
168
169         cleaner = template.newHtmlCleaner();
170         result = cleaner.cleanToString(teststring);
171         assertEquals(readResource("output19.txt"), result);
172
173         cleaner = template.newHtmlCleaner();
174         result = cleaner.cleanToString("<html><body><p>abc<strong/></p><p><strong><em><em><em/></em></em></strong></p></body></html>");
175         assertEquals(readResource("output1.txt"), result);
176
177         cleaner = template.newHtmlCleaner();
178         result = cleaner.cleanToString("<html><body><table><tr><td> <br/></td></tr></table></body></html>");
179         assertEquals(readResource("output20.txt"), result);
180
181         cleaner = template.newHtmlCleaner();
182         result = cleaner.cleanToString("<html><body>hallo<table><tr><td>nog eens hallo</td></tr></table></body></html>");
183         assertEquals(readResource("output21.txt"), result);
184
185         cleaner = template.newHtmlCleaner();
186         result = cleaner.cleanToString("<html><body><p>hallo<table><tr><td>nog eens hallo</td></tr></table></p></body></html>");
187         assertEquals(readResource("output21.txt"), result);
188
189         cleaner = template.newHtmlCleaner();
190         result = cleaner.cleanToString("<html><body><p>hallo<table><tr><td>nog eens hallo<br/><br/>jaja<p>jan piet joris</p></td><td><table><tr><td><p>1</p>2</td></tr></table></td></tr></table></p></body></html>");
191         assertEquals(readResource("output22.txt"), result);
192
193         cleaner = template.newHtmlCleaner();
194         result = cleaner.cleanToString("<html><body><pre>each<br/>word<br/>on a new<br/>line</pre></body></html>");
195         assertEquals(readResource("output23.txt"), result);
196
197         cleaner = template.newHtmlCleaner();
198         result = cleaner.cleanToString("<html><body><h1>ab<br/></h1><h1><br/>\n</h1><h1><br/><h2><br/>cd</h2>ef</h1></body></html>");
199         assertEquals(readResource("output24.txt"), result);
200
201         cleaner = template.newHtmlCleaner();
202         result = cleaner.cleanToString("<html><body>klsaflkjdkadjfkajlfksdjakfdsfka&lt;abc&gt;lsjfladjflsafjlsjflkjaskfjlkjflksjafkdjalfsajfkjalfdlsfaj</body></html>");
203         assertEquals(readResource("output25.txt"), result);
204
205         // test link href conversion
206
cleaner = template.newHtmlCleaner();
207         result = cleaner.cleanToString("<html><body><a HREF='hi' daisyhref='daisy:123'>boe</a></body></html>");
208         assertEquals(readResource("output26.txt"), result);
209     }
210
211     String JavaDoc readResource(String JavaDoc name) throws Exception JavaDoc {
212         InputStream JavaDoc is = getClass().getClassLoader().getResourceAsStream("org/outerj/daisy/htmlcleaner/" + name);
213         Reader JavaDoc reader = new InputStreamReader JavaDoc(is, "UTF-8");
214         BufferedReader JavaDoc bufferedReader = new BufferedReader JavaDoc(reader);
215
216         StringBuffer JavaDoc buffer = new StringBuffer JavaDoc();
217         int c = bufferedReader.read();
218         while (c != -1) {
219             buffer.append((char)c);
220             c = bufferedReader.read();
221         }
222
223         return buffer.toString();
224     }
225 }
226
Popular Tags