package com.dappit.Dapper.parser.test;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.util.Random;
import java.util.Set;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import junit.framework.TestCase;

import org.dom4j.DocumentException;
import org.dom4j.io.DOMReader;
import org.dom4j.io.HTMLWriter;
import org.dom4j.io.OutputFormat;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

import com.dappit.Dapper.Configuration;
import com.dappit.Dapper.parser.EnviromentController;
import com.dappit.Dapper.parser.MozillaParser;
import com.dappit.Dapper.parser.ParserInitializationException;
import com.dappit.Dapper.parser.profiler.SimpleMemoryProfiler;
import com.sun.org.apache.xml.internal.serialize.XMLSerializer;

/**
 * JUnit 3 tests for {@link MozillaParser}: HTML snippets are parsed into a W3C
 * {@link Document}, serialized with {@link XMLSerializer} and compared against
 * an expected XML string. Also covers repeated/threaded initialization and
 * concurrent parsing.
 */
public class TestMozillaParser extends TestCase {

    /** XML declaration that prefixes every expected serialization in this class. */
    private static final String XML_PROLOG = "<?xml version=\"1.0\"?>\n";

    /** Regex / replacement pair exercised by {@link #testStyleReplacer()}. */
    private static final String STYLE_TAG_REGEX = "<\\s*style\\s*>";
    private static final String STYLE_TAG_REPLACEMENT = "<style harmless=''> ";

    /**
     * Extracts the charset token from a meta "content" value. The capture stops
     * at the first ';' or whitespace so trailing parameters are not swallowed.
     */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*([^;\\s]+)", Pattern.CASE_INSENSITIVE);

    /** When false, {@link #parseAndCompare(String, String)} parses but skips the comparison. */
    boolean doTesting = true;

    /**
     * Number of worker-thread failures recorded by the multithreaded tests.
     * Only incremented through {@link #recordFailure()}, because {@code failed++}
     * on a volatile field is not atomic.
     */
    volatile int failed = 0;

    /**
     * Resolves the native parser library path and initializes the parser.
     * Any exception is printed and otherwise ignored, so this method never
     * throws (it is also invoked from the static initializer below).
     */
    public static void initTestingXPCOM() {
        File mozillaParserLibraryFile;
        try {
            mozillaParserLibraryFile = new File(
                    "native/bin/MozillaParser" + EnviromentController.getSharedLibraryExtension());
        } catch (Exception e1) {
            // Fall back to the Windows library name when the extension lookup fails.
            mozillaParserLibraryFile = new File("./native/bin/MozillaParser.dll");
            e1.printStackTrace();
        }

        String mozillaParserLibrary = mozillaParserLibraryFile.getAbsolutePath();
        String mozillaComponentBasePath = Configuration.getMozillaComponentsPath();
        try {
            System.out.println("Loading and initializing XPCOM from " + mozillaParserLibrary);
            MozillaParser.init(mozillaParserLibrary, mozillaComponentBasePath);
            System.out.println("done!");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    static {
        initTestingXPCOM();
    }

    /**
     * Serializes a DOM document to a string using {@link XMLSerializer}.
     *
     * @param document the document to serialize
     * @return the serialized XML text
     * @throws IOException if serialization fails
     */
    public static String serialize(Document document) throws IOException {
        StringWriter stringWriter = new StringWriter();
        XMLSerializer serializer = new XMLSerializer();
        serializer.setOutputCharStream(stringWriter);
        serializer.serialize(document);
        return stringWriter.toString();
    }

    /**
     * Parses {@code html} and, when {@link #doTesting} is set, asserts that the
     * serialized result equals {@code expectedResult}.
     *
     * @return the parsed document
     */
    private Document parseAndCompare(String html, String expectedResult) throws Exception {
        MozillaParser parser = new MozillaParser();
        Document document = parser.parse(html);
        if (doTesting) {
            assertEquals(expectedResult, serialize(document));
        }
        return document;
    }

    /** Increments {@link #failed} under a lock so concurrent failures are not lost. */
    private synchronized void recordFailure() {
        failed++;
    }

    public void testSimple1() throws Exception {
        String simple1 = "<html>Hello world!</html>";
        String expected1 = XML_PROLOG
                + "<html><body>Hello world!</body></html>";
        parseAndCompare(simple1, expected1);
    }

    public void testSimple2() throws Exception {
        String simple2 = "<html>Hello world!</html>";
        String expected1 = XML_PROLOG
                + "<html><body>Hello world!</body></html>";
        parseAndCompare(simple2, expected1);
    }

    public void testComment1() throws Exception {
        String simple2 = "<html><body><p><!-- a comment --></p> <br> Hello world!</html>";
        String expected1 = XML_PROLOG
                + "<html><body><p><!-- a comment --></p> <br/> Hello world!</body></html>";
        parseAndCompare(simple2, expected1);
    }

    public void testScriptComment1() throws Exception {
        String simple2 = "<html><body><script language=\"JavaScript\" > document.write('hell');</script> <br> Hello world!</html>";
        String expected1 = XML_PROLOG
                + "<html><body><script language=\"JavaScript\">document.write('hell');</script> <br/> Hello world!</body></html>";
        parseAndCompare(simple2, expected1);
    }

    public void testStyleContent() throws Exception {
        String simple2 = "<html><head><style > <!-- body,td,a,p,.h{font-family:arial,sans-serif} "
                + ".h{font-size:20px} "
                + " .h{color:#3366cc} "
                + " .q{color:#00c} "
                + " --></style></head><body> <br> Hello world!</html>";
        String expected1 = XML_PROLOG
                + "<html><head><style harmless=\"\"><!-- body,td,a,p,.h{font-family:arial,sans-serif} .h{font-size:20px} .h{color:#3366cc} .q{color:#00c} --></style></head><body> <br/> Hello world!</body></html>";
        parseAndCompare(simple2, expected1);
    }

    /**
     * Checks that {@code String.replaceAll} strips every occurrence of the token.
     * NOTE(review): the test name suggests the literal was originally an HTML
     * entity rather than a plain space - confirm against version control.
     */
    public void testAmpReplacer() {
        String testString = " ";
        String newString = testString.replaceAll(" ", "");
        assertEquals("", newString);

        testString = " 3 1 ";
        newString = testString.replaceAll(" ", "");
        assertEquals("31", newString);
    }

    /**
     * Checks that the bare-style-tag regex matches with optional whitespace and
     * does not match a style tag that carries attributes.
     */
    public void testStyleReplacer() {
        String[] bareStyleTags = { "< style >", "< style>", "<style>" };
        for (int i = 0; i < bareStyleTags.length; i++) {
            String replaced = bareStyleTags[i].replaceAll(STYLE_TAG_REGEX, STYLE_TAG_REPLACEMENT);
            assertEquals(STYLE_TAG_REPLACEMENT, replaced);
        }

        String withAttribute = "< style defer>";
        String untouched = withAttribute.replaceAll(STYLE_TAG_REGEX, STYLE_TAG_REPLACEMENT);
        // assertNotSame only compares references and would pass for any distinct
        // String object; compare contents instead.
        assertFalse(STYLE_TAG_REPLACEMENT.equals(untouched));
    }

    /**
     * Runs {@link #initTestingXPCOM()} on two successive threads, one second
     * apart, to check that repeated initialization from other threads returns.
     */
    public void testMultithreadedXPCOMInitialization() throws InterruptedException {
        initXpcomOnNewThread();
        Thread.sleep(1000);
        initXpcomOnNewThread();
    }

    /** Starts a thread that calls {@link #initTestingXPCOM()} and waits for it to finish. */
    private static void initXpcomOnNewThread() throws InterruptedException {
        Thread initializer = new Thread() {
            @Override
            public void run() {
                try {
                    initTestingXPCOM();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        };
        initializer.start();
        initializer.join();
    }

    /**
     * Parses a generated document containing {@code length} div elements, each
     * holding a random number.
     *
     * @param length number of div elements to generate
     * @return the parsed document
     */
    public Document parseRandomHtml(int length) throws ParserInitializationException, DocumentException {
        StringBuilder html = new StringBuilder("<html><body>");
        for (int i = 0; i < length; i++) {
            html.append("<div>").append(Math.random()).append("</div>");
        }
        html.append("</body></html>");
        MozillaParser parser = new MozillaParser();
        return parser.parse(html.toString());
    }

    /** Parses random documents on two threads at once and fails if either thread throws. */
    public void testMultithreaded1() {
        Thread thread1 = newRandomHtmlParserThread(100);
        Thread thread2 = newRandomHtmlParserThread(100);

        thread1.start();
        thread2.start();
        try {
            thread1.join();
            thread2.join();
        } catch (InterruptedException e) {
            // Restore the interrupt flag for the test runner, then report.
            Thread.currentThread().interrupt();
            fail("interrupted while waiting for parser threads");
        }
        assertEquals(0, failed);
    }

    /** Builds (but does not start) a thread that parses random HTML and records any failure. */
    private Thread newRandomHtmlParserThread(final int length) {
        return new Thread() {
            @Override
            public void run() {
                try {
                    parseRandomHtml(length);
                } catch (Throwable e) {
                    e.printStackTrace();
                    recordFailure();
                }
            }
        };
    }

    /**
     * Parses a 100-paragraph document on 50 threads concurrently and checks that
     * each serialized result equals its input. Diagnostics for a mismatch are
     * printed under a shared lock so output from different threads is not interleaved.
     */
    public void testMultithreaded2() throws InterruptedException {
        final int numThreads = 50;
        final Thread[] threadPool = new Thread[numThreads];
        final Random random = new Random(0);

        for (int i = 0; i < numThreads; i++) {
            threadPool[i] = new Thread() {
                @Override
                public void run() {
                    try {
                        double randomNumber = random.nextDouble() * 100000000.0;
                        StringBuilder htmlBuilder = new StringBuilder("<html><body>");
                        for (int p = 0; p < 100; p++) {
                            htmlBuilder.append("<p>").append(randomNumber).append("</p>");
                        }
                        htmlBuilder.append("</body></html>");
                        String expected = XML_PROLOG + htmlBuilder.toString();

                        MozillaParser parser = new MozillaParser();
                        Document document = parser.parse(htmlBuilder.toString());

                        // Tally the builder instructions; reported only when the check fails.
                        Vector<String> instructions = parser.getDomBuilderArguments();
                        int closeNodeCounter = 0;
                        int openNodeCounter = 0;
                        for (String instruction : instructions) {
                            if (instruction.equalsIgnoreCase("CloseNode")) {
                                closeNodeCounter++;
                            }
                            if (instruction.equalsIgnoreCase("OpenNode")) {
                                openNodeCounter++;
                            }
                        }

                        if (!serialize(document).equals(expected)) {
                            synchronized (threadPool) {
                                System.err.println("Html input was :" + expected);
                                System.err.println("Failed document :" + serialize(document));
                                parser.dump();
                                System.err.println("Verifying :"
                                        + document.getChildNodes().item(0).getChildNodes().item(0)
                                                .getChildNodes().item(0).getNodeName());
                                System.err.println("<p number : > : "
                                        + document.getChildNodes().item(0).getChildNodes().item(0)
                                                .getChildNodes().getLength());
                                System.err.println("OpenNode/CloseNode instructions : "
                                        + openNodeCounter + "/" + closeNodeCounter);
                                recordFailure();
                            }
                        }
                    } catch (Throwable e) {
                        e.printStackTrace();
                        recordFailure();
                    }
                }
            };
        }

        for (int i = 0; i < numThreads; i++) {
            threadPool[i].start();
        }
        for (int i = 0; i < numThreads; i++) {
            threadPool[i].join();
        }
        assertEquals(0, failed);
    }

    /**
     * Regression test: parses a truncated page fragment and writes the resulting
     * DOM through dom4j's {@link HTMLWriter}. The test passes when no exception
     * is thrown; the written output itself is not inspected.
     */
    @SuppressWarnings("unchecked")
    public void testEntityDomWriterBug() throws Exception {
        String testString =
                "<!doctype html public \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"
                + "<html>"
                + "<body>"
                + " <a HREF=\"http://us.ard.yahoo.co"
                + "m/SIG=12ku07d54/M=289534.6742909.7689533.6551553/D=yahoosrch/S"
                + "=2766679:HLSCH/Y=YAHOO/EXP=1167961934/A=2828626/R=2/SIG=10n3m6b64/*http"
                + "://mail.yahoo.com?fr=yfp-t-501\">"
                + "Mail</a> "
                + " "
                + " Welcome, "
                + "<strong>Guest</strong> [";

        Document document = new MozillaParser().parse(testString);

        ByteArrayOutputStream bs = new ByteArrayOutputStream();
        // Explicit charset so the test does not depend on the platform default.
        OutputStreamWriter oSW = new OutputStreamWriter(bs, "UTF-8");

        OutputFormat format = OutputFormat.createPrettyPrint();
        format.setXHTML(false);
        format.setExpandEmptyElements(true);
        HTMLWriter writer = new HTMLWriter(oSW, format);
        Set tags = writer.getPreformattedTags();
        tags.add("STYLE");
        writer.setPreformattedTags(tags);

        DOMReader domReader = new DOMReader();
        writer.write(domReader.read(document));
        writer.flush();
    }

    /**
     * Looks for a charset declaration in the document's meta elements.
     * A meta element is considered when its "http-equiv" or "name" attribute
     * (lower- or upper-case attribute name) equals "content-type", ignoring case,
     * and it has a non-empty "content" attribute.
     *
     * @param rootElement element whose descendant meta elements are inspected
     * @return the first charset found, or "UTF-8" when none is declared or
     *         {@code rootElement} is null
     */
    private String findEncoding(Element rootElement) {
        String encoding = "UTF-8";
        if (rootElement == null) {
            return encoding;
        }
        NodeList metas = rootElement.getElementsByTagName("meta");
        for (int m = 0; m < metas.getLength(); m++) {
            Element meta = (Element) metas.item(m);

            boolean hasHttpEquivContentType =
                    "content-type".equalsIgnoreCase(firstNonEmptyAttribute(meta, "http-equiv", "HTTP-EQUIV"));
            boolean hasNameContentType =
                    "content-type".equalsIgnoreCase(firstNonEmptyAttribute(meta, "name", "NAME"));
            String contentAttributeStr = firstNonEmptyAttribute(meta, "content", "CONTENT");

            if ((hasHttpEquivContentType || hasNameContentType) && contentAttributeStr != null) {
                Matcher mat = CHARSET_PATTERN.matcher(contentAttributeStr);
                if (mat.find()) {
                    encoding = mat.group(1);
                    break;
                }
            }
        }
        return encoding;
    }

    /**
     * Returns the value of {@code primaryName} if non-empty, otherwise the value
     * of {@code fallbackName} if non-empty, otherwise null.
     */
    private static String firstNonEmptyAttribute(Element element, String primaryName, String fallbackName) {
        String value = element.getAttribute(primaryName);
        if (value.length() > 0) {
            return value;
        }
        value = element.getAttribute(fallbackName);
        return value.length() > 0 ? value : null;
    }

    /**
     * Pretty-prints the document through dom4j's {@link HTMLWriter} into an
     * in-memory buffer, using the encoding reported by {@link #findEncoding(Element)}
     * and with no preformatted tags. The buffer is discarded.
     */
    private void printDocumentPreety(Document doc) throws IOException {
        StringWriter stringWriter = new StringWriter();
        OutputFormat format = OutputFormat.createPrettyPrint();
        format.setXHTML(false);
        format.setEncoding(findEncoding(doc.getDocumentElement()));
        format.setExpandEmptyElements(true);
        HTMLWriter writer = new HTMLWriter(stringWriter, format);
        Set tags = writer.getPreformattedTags();
        tags.clear();
        writer.setPreformattedTags(tags);
        DOMReader domReader = new DOMReader();
        writer.write(domReader.read(doc));
        writer.flush();
    }

    /**
     * Runs {@link #testSimple2()} 20000 times after starting a
     * {@link SimpleMemoryProfiler}. The test makes no assertion of its own
     * beyond those in {@code testSimple2}.
     */
    public void testMemoryLeak() throws Exception {
        SimpleMemoryProfiler memoryProfiler = new SimpleMemoryProfiler();
        memoryProfiler.start();
        for (int i = 0; i < 20000; i++) {
            testSimple2();
        }
    }
}