1 16 package org.outerj.daisy.htmlcleaner; 17 18 import junit.framework.TestCase; 19 import org.xml.sax.InputSource ; 20 21 import java.io.InputStream ; 22 import java.io.Reader ; 23 import java.io.InputStreamReader ; 24 import java.io.BufferedReader ; 25 26 public class HtmlCleanerTest extends TestCase { 27 public void testIt() throws Exception { 28 HtmlCleanerFactory factory = new HtmlCleanerFactory(); 29 InputSource is = new InputSource (getClass().getClassLoader().getResourceAsStream("org/outerj/daisy/htmlcleaner/cleanerconf.xml")); 30 HtmlCleanerTemplate template = factory.buildTemplate(is); 31 32 String result; 33 HtmlCleaner cleaner = template.newHtmlCleaner(); 34 result = cleaner.cleanToString("<html><body>\u0004abc</body></html>"); 36 assertEquals(readResource("output1.txt"), result); 37 38 cleaner = template.newHtmlCleaner(); 39 result = cleaner.cleanToString("<html xmlns='abc'><body>abc<ul> </ul></body></html>"); 40 assertEquals(readResource("output1.txt"), result); 41 42 cleaner = template.newHtmlCleaner(); 43 result = cleaner.cleanToString("<x:html xmlns:x='abc'><x:body x:r='z'>abc</x:body></x:html>"); 44 assertEquals(readResource("output1.txt"), result); 45 46 cleaner = template.newHtmlCleaner(); 47 result = cleaner.cleanToString("abc"); 48 assertEquals(readResource("output1.txt"), result); 49 50 cleaner = template.newHtmlCleaner(); 51 result = cleaner.cleanToString("<html>abc</html>"); 52 assertEquals(readResource("output1.txt"), result); 53 54 cleaner = template.newHtmlCleaner(); 55 result = cleaner.cleanToString("<html><body>abc</html>"); 56 assertEquals(readResource("output1.txt"), result); 57 58 cleaner = template.newHtmlCleaner(); 61 result = cleaner.cleanToString("<html><body>abc<br/><br/>def</html>"); 62 assertEquals(readResource("output2.txt"), result); 63 64 cleaner = template.newHtmlCleaner(); 66 result = cleaner.cleanToString("<html><body>abc<br/><br/><br/>def</html>"); 67 assertEquals(readResource("output2.txt"), result); 68 69 cleaner = template.newHtmlCleaner(); 73 result = cleaner.cleanToString("<html><body>abc<br/><br/>def<p>xyz<br/>xyz</p><p>yes<br/></p><p>yesyes<br/><br/><br/></html>"); 74 assertEquals(readResource("output3.txt"), result); 75 76 cleaner = template.newHtmlCleaner(); 78 result = cleaner.cleanToString("<html><body><p><table><tr><td>hello!</td></tr></table></p></html>"); 79 assertEquals(readResource("output4.txt"), result); 80 81 cleaner = template.newHtmlCleaner(); 83 result = cleaner.cleanToString("<html><body><p><ul><li>hello!</li></ul></p></html>"); 84 assertEquals(readResource("output5.txt"), result); 85 86 cleaner = template.newHtmlCleaner(); 88 result = cleaner.cleanToString("<html><body><p>abc<ul><li>hello!</li></ul>def</p></html>"); 89 assertEquals(readResource("output6.txt"), result); 90 91 cleaner = template.newHtmlCleaner(); 93 result = cleaner.cleanToString("<html><body><p>Hi, this is a text longer then 80 characters which will hence be split across multiple lines. Isn't this interesting. No it isn't. Anyhow, have I told you about that time when I invented the wheel? Well, it was a long time ago.</p></html>"); 94 assertEquals(readResource("output7.txt"), result); 95 96 cleaner = template.newHtmlCleaner(); 98 result = cleaner.cleanToString("<html><body><p><font>abc</font></p></html>"); 99 assertEquals(readResource("output1.txt"), result); 100 101 cleaner = template.newHtmlCleaner(); 103 result = cleaner.cleanToString("<html><body><p><span style='color: green; font-weight:bold '>abc</span><span style='font-style:italic'>abc</span><span style='font-style:italic;font-weight:bold'>abc</span></p></html>"); 104 assertEquals(readResource("output8.txt"), result); 105 106 cleaner = template.newHtmlCleaner(); 108 result = cleaner.cleanToString("<html><body><img SRC='hi' daisysrc='daisy:123'/></body></html>"); 109 assertEquals(readResource("output9.txt"), result); 110 111 cleaner = template.newHtmlCleaner(); 112 result = cleaner.cleanToString("<html><body>Hi this is <strong>strong</strong> and <em>emphasized</em></body></html>"); 113 assertEquals(readResource("output10.txt"), result); 114 115 cleaner = template.newHtmlCleaner(); 116 result = cleaner.cleanToString("<html><body>Hi this is <strong>strong</strong><em>emphasized</em></body></html>"); 117 assertEquals(readResource("output11.txt"), result); 118 119 cleaner = template.newHtmlCleaner(); 120 result = cleaner.cleanToString("<html><body>Hi this is <strong>strong</strong> <em>emphasized</em></body></html>"); 121 assertEquals(readResource("output12.txt"), result); 122 123 cleaner = template.newHtmlCleaner(); 124 result = cleaner.cleanToString("<html><body>aaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb ccc</body></html>"); 125 assertEquals(readResource("output13.txt"), result); 126 127 cleaner = template.newHtmlCleaner(); 128 result = cleaner.cleanToString("<html><body>aaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb<img SRC='somewhere'/></body></html>"); 129 assertEquals(readResource("output14.txt"), result); 130 131 cleaner = template.newHtmlCleaner(); 132 result = cleaner.cleanToString("<html><body>test test test test test test test \n\n test test test test test test test<a HREF='http://outerthought.org'>test</a> test</body></html>"); 133 assertEquals(readResource("output15.txt"), result); 134 135 cleaner = template.newHtmlCleaner(); 136 result = cleaner.cleanToString("<html><body><strong>x</strong><img SRC='x.gif'/><strong>x</strong><img SRC='x.gif'/><strong>x</strong><img SRC='x.gif'/><strong>x</strong><img SRC='x.gif'/><strong>x</strong><img SRC='x.gif'/></html>"); 137 assertEquals(readResource("output16.txt"), result); 138 139 cleaner = template.newHtmlCleaner(); 140 result = cleaner.cleanToString("<html><body><p> a b c </p></body></html>"); 141 assertEquals(readResource("output17.txt"), result); 142 143 cleaner = template.newHtmlCleaner(); 144 result = cleaner.cleanToString("<html><body><p> a b c </p> </body></html>"); 145 assertEquals(readResource("output17.txt"), result); 146 147 cleaner = template.newHtmlCleaner(); 148 result = cleaner.cleanToString("<html><body> a b c <br/> </html>"); 149 assertEquals(readResource("output17.txt"), result); 150 151 cleaner = template.newHtmlCleaner(); 153 result = cleaner.cleanToString("<html><body><table><tbody><tr><td><br/></td></tr></tbody></table></html>"); 154 assertEquals(readResource("output18.txt"), result); 155 156 cleaner = template.newHtmlCleaner(); 157 result = cleaner.cleanToString("<html><body><table><tbody><tr><td><br/>\n</td></tr></tbody></table></html>"); 158 assertEquals(readResource("output18.txt"), result); 159 160 String teststring = "<html><head><link rel=\"stylesheet\" type=\"text/css\" HREF=\"/daisy/resources/skins/default/css/htmlarea.css\" /></head>\n" + 161 " <body>\n" + 162 " \n" + 163 " <p><strong>asfasdfa</strong></p>\n" + 164 " \n" + 165 " <p><strong>dfsafsa<br /></strong></p><p><strong><br />asfj aflad <span style=\"font-style: italic;\">fafjls fd<br /></span></strong></p><p><strong><span style=\"font-style: italic;\">saj lfsdj </span>lkjlkjweids<br /></strong></p>\n" + 166 " \n" + 167 " </body></html>"; 168 169 cleaner = template.newHtmlCleaner(); 170 result = cleaner.cleanToString(teststring); 171 assertEquals(readResource("output19.txt"), result); 172 173 cleaner = template.newHtmlCleaner(); 174 result = cleaner.cleanToString("<html><body><p>abc<strong/></p><p><strong><em><em><em/></em></em></strong></p></body></html>"); 175 assertEquals(readResource("output1.txt"), result); 176 177 cleaner = template.newHtmlCleaner(); 178 result = cleaner.cleanToString("<html><body><table><tr><td> <br/></td></tr></table></body></html>"); 179 assertEquals(readResource("output20.txt"), result); 180 181 cleaner = template.newHtmlCleaner(); 182 result = cleaner.cleanToString("<html><body>hallo<table><tr><td>nog eens hallo</td></tr></table></body></html>"); 183 assertEquals(readResource("output21.txt"), result); 184 185 cleaner = template.newHtmlCleaner(); 186 result = cleaner.cleanToString("<html><body><p>hallo<table><tr><td>nog eens hallo</td></tr></table></p></body></html>"); 187 assertEquals(readResource("output21.txt"), result); 188 189 cleaner = template.newHtmlCleaner(); 190 result = cleaner.cleanToString("<html><body><p>hallo<table><tr><td>nog eens hallo<br/><br/>jaja<p>jan piet joris</p></td><td><table><tr><td><p>1</p>2</td></tr></table></td></tr></table></p></body></html>"); 191 assertEquals(readResource("output22.txt"), result); 192 193 cleaner = template.newHtmlCleaner(); 194 result = cleaner.cleanToString("<html><body><pre>each<br/>word<br/>on a new<br/>line</pre></body></html>"); 195 assertEquals(readResource("output23.txt"), result); 196 197 cleaner = template.newHtmlCleaner(); 198 result = cleaner.cleanToString("<html><body><h1>ab<br/></h1><h1><br/>\n</h1><h1><br/><h2><br/>cd</h2>ef</h1></body></html>"); 199 assertEquals(readResource("output24.txt"), result); 200 201 cleaner = template.newHtmlCleaner(); 202 result = cleaner.cleanToString("<html><body>klsaflkjdkadjfkajlfksdjakfdsfka<abc>lsjfladjflsafjlsjflkjaskfjlkjflksjafkdjalfsajfkjalfdlsfaj</body></html>"); 203 assertEquals(readResource("output25.txt"), result); 204 205 cleaner = template.newHtmlCleaner(); 207 result = cleaner.cleanToString("<html><body><a HREF='hi' daisyhref='daisy:123'>boe</a></body></html>"); 208 assertEquals(readResource("output26.txt"), result); 209 } 210 211 String readResource(String name) throws Exception { 212 InputStream is = getClass().getClassLoader().getResourceAsStream("org/outerj/daisy/htmlcleaner/" + name); 213 Reader reader = new InputStreamReader (is, "UTF-8"); 214 BufferedReader bufferedReader = new BufferedReader (reader); 215 216 StringBuffer buffer = new StringBuffer (); 217 int c = bufferedReader.read(); 218 while (c != -1) { 219 buffer.append((char)c); 220 c = bufferedReader.read(); 221 } 222 223 return buffer.toString(); 224 } 225 } 226 | Popular Tags |