1 4 package com.dappit.Dapper.parser.test; 5 6 import java.io.ByteArrayOutputStream ; 7 import java.io.File ; 8 import java.io.FileInputStream ; 9 import java.io.FileNotFoundException ; 10 import java.io.IOException ; 11 import java.io.StringReader ; 12 import java.net.MalformedURLException ; 13 import java.util.Hashtable ; 14 import java.util.concurrent.ExecutorService ; 15 import java.util.concurrent.Executors ; 16 import java.util.concurrent.TimeUnit ; 17 import java.util.zip.ZipEntry ; 18 import java.util.zip.ZipInputStream ; 19 20 import org.ccil.cowan.tagsoup.Parser; 21 import org.dom4j.DocumentException; 22 import org.dom4j.io.DOMWriter; 23 import org.dom4j.io.SAXReader; 24 import org.w3c.dom.Document ; 25 26 import com.dappit.Dapper.parser.MozillaParser; 27 import com.dappit.Dapper.parser.profiler.SimpleTimeProfiler; 28 import com.dappit.Dapper.parser.test.util.ProgressLogger; 29 30 34 public class ParserComparator 35 { 36 37 private static volatile double mozillaParsingTime; 38 private static volatile double tagsoupParsingTime; 39 40 public static byte[] fileGetContentsInBytes(File file) throws FileNotFoundException , IOException { 41 FileInputStream fIS = new FileInputStream (file); 42 ByteArrayOutputStream bIS = new ByteArrayOutputStream (); 43 byte[] temp = new byte[256]; 44 int bytesRead = 0; 45 while ((bytesRead = fIS.read(temp)) != -1) { 46 bIS.write(temp,0,bytesRead); 47 } 48 fIS.close(); 49 bIS.close(); 50 51 return bIS.toByteArray(); 52 } 53 54 61 private static void compareMozillaAndTagsoup(String content) throws Exception { 62 63 SimpleTimeProfiler profiler = new SimpleTimeProfiler(); 64 65 profiler.start(); 67 MozillaParser parser = new MozillaParser(); 69 System.out.println("Mozilla Parsing..."); 70 parser.parse(content); 71 mozillaParsingTime += profiler.report("Mozilla:"); 72 73 profiler = new SimpleTimeProfiler(); 74 System.out.println("Tagsoup Parsing..."); 76 profiler.start(); 77 tagSoupParse(content); 78 tagsoupParsingTime+= profiler.report("tagsoup:"); 79 } 80 81 private static Document tagSoupParse(String content) { 82 Parser htmlParser = new Parser(); 83 84 SAXReader saxReader = new SAXReader(htmlParser); 85 saxReader.setMergeAdjacentText(true); 86 DOMWriter domWriter = new DOMWriter(); 87 try 88 { 89 return domWriter.write(saxReader.read(new StringReader (content))); 90 } 91 catch (Exception e) 92 { 93 e.printStackTrace(); 94 } 95 return null; 96 } 97 98 private static void testZippedContent() throws Exception 99 { 100 ZipInputStream zippedInputStream = new ZipInputStream (new FileInputStream ("./test.content.zip")); 101 int counter = 0; 102 int maxCount = 1000; 103 ProgressLogger progressLogger = new ProgressLogger(maxCount); 104 while (counter++ < maxCount) 105 { 106 107 ZipEntry nextZippedEntry = zippedInputStream.getNextEntry(); 108 if (nextZippedEntry == null) 109 break; 110 ByteArrayOutputStream bos = new ByteArrayOutputStream (); 111 System.out.println("Reading zipped file :" + nextZippedEntry.getName()); 112 byte[] buf = new byte[1024]; 113 int len; 114 while ((len = zippedInputStream.read(buf)) > 0) { 115 bos.write(buf, 0, len); 116 } 117 String content = new String (bos.toByteArray()); 118 bos.close(); 120 compareMozillaAndTagsoup(content); 121 122 progressLogger.incrementCount(); 123 } 124 System.out.println("Mozilla Parsing time :" + mozillaParsingTime +" sec"); 125 System.out.println("Tagsoup Parsing time :" + tagsoupParsingTime +" sec"); 126 127 } 128 129 public static class ZipFileReader { 130 131 private final String fileName; 132 private ZipInputStream zippedInputStream; 133 134 public ZipFileReader(String fileName) throws FileNotFoundException { 135 this.fileName = fileName; 136 zippedInputStream = new ZipInputStream (new FileInputStream (this.fileName)); 137 } 138 139 public synchronized String nextContent() throws Exception 140 { 141 ZipEntry nextZippedEntry = zippedInputStream.getNextEntry(); 142 if (nextZippedEntry == null) 143 return null; 144 ByteArrayOutputStream bos = new ByteArrayOutputStream (); 145 System.out.println("Reading zipped file :" + nextZippedEntry.getName()); 146 byte[] buf = new byte[1024]; 147 int len; 148 while ((len = zippedInputStream.read(buf)) > 0) { 149 bos.write(buf, 0, len); 150 } 151 String content = new String (bos.toByteArray()); 152 bos.close(); 154 return content; 155 } 156 157 } 158 159 private static void testZippedContentMultithreaded() throws Exception 160 { 161 int maxThreads = 10; 162 ExecutorService mozillThreadPool = Executors.newFixedThreadPool(maxThreads); 163 ExecutorService tagsoupThreadPool = Executors.newFixedThreadPool(maxThreads); 164 165 ZipFileReader mozillaFileReader = new ZipFileReader("./test.content.zip"); 166 ZipFileReader tagsoupFileReader = new ZipFileReader("./test.content.zip"); 167 int counter = 0; 168 int maxCount =530; 169 170 SimpleTimeProfiler mozillaProfiler = new SimpleTimeProfiler(); 171 mozillaProfiler.start(); 172 while (counter++ < maxCount) 174 { 175 mozillThreadPool.execute(new MozillaParsingThread(mozillaFileReader)); 176 } 177 mozillThreadPool.shutdown(); 178 mozillThreadPool.awaitTermination(10000, TimeUnit.SECONDS); 179 double mozillaTime = mozillaProfiler.report("Mozilla total time"); 180 181 counter = 0; 182 SimpleTimeProfiler tagsoupProfiler = new SimpleTimeProfiler(); 184 tagsoupProfiler .start(); 185 while (counter++ < maxCount) 186 { 187 tagsoupThreadPool.execute(new TagsoupParsingThread(tagsoupFileReader)); 188 } 189 tagsoupThreadPool.shutdown(); 190 tagsoupThreadPool.awaitTermination(10000, TimeUnit.SECONDS); 191 192 double tagsoupTime = tagsoupProfiler.report("Tagsoup total time"); 193 194 System.out.println("Mozilla Parsing multithreaded time :" + mozillaParsingTime +" sec"); 195 System.out.println("Tagsoup Parsing multithreaded time :" + tagsoupParsingTime +" sec"); 196 197 System.out.println("Mozilla Parsing Total time :" + mozillaTime +" sec"); 198 System.out.println("Tagsoup Parsing Total time :" + tagsoupTime +" sec"); 199 200 } 201 202 public static class MozillaParsingThread extends Thread 203 { 204 205 private final ZipFileReader mozillaFileReader; 206 private boolean synchronize; 207 private static Object SynchronizationObject = new Object (); 208 private static Hashtable <String , Document > documentHashTable = new Hashtable <String , Document >(); 209 210 213 public MozillaParsingThread(ZipFileReader tagsoupFileReader) { 214 this(tagsoupFileReader,false); 215 } 216 217 221 public MozillaParsingThread(ZipFileReader tagsoupFileReader, boolean synchronize) { 222 this.synchronize = synchronize; 223 this.mozillaFileReader = tagsoupFileReader; 224 } 225 226 public void run() 227 { 228 String content; 229 try 230 { 231 content = mozillaFileReader.nextContent(); 232 SimpleTimeProfiler profiler = new SimpleTimeProfiler(); 233 profiler.start(); 234 MozillaParser parser = new MozillaParser(); 235 org.dom4j.Document document; 236 if (this.synchronize) 237 { 238 synchronized(SynchronizationObject) 239 { 240 document = (org.dom4j.Document) parser.parse(content); 241 } 242 } 243 else 244 { 245 document = (org.dom4j.Document) parser.parse(content); 246 } 247 248 mozillaParsingTime += profiler.report("Mozilla"); 249 255 documentHashTable.put(content.hashCode()+Boolean.toString(synchronize), (Document ) document); 256 } 257 catch (Exception e) 258 { 259 e.printStackTrace(); 260 } 261 262 } 263 264 public static Hashtable <String , Document > getDocumentsHashTable(){ 265 return documentHashTable; 266 } 267 268 } 269 270 public static class TagsoupParsingThread extends Thread { 271 272 private final ZipFileReader tagsoupFileReader; 273 private final boolean synchronize; 274 private static Object SynchronizationObject = new Object (); 275 276 279 public TagsoupParsingThread(ZipFileReader tagsoupFileReader) { 280 this(tagsoupFileReader,false); 281 } 282 283 287 public TagsoupParsingThread(ZipFileReader tagsoupFileReader, boolean synchronize) { 288 this.synchronize = synchronize; 289 this.tagsoupFileReader = tagsoupFileReader; 290 } 291 292 public void run() 293 { 294 try 295 { 296 String content = tagsoupFileReader.nextContent(); 297 SimpleTimeProfiler profiler = new SimpleTimeProfiler(); 298 profiler.start(); 299 if (synchronize) 300 { 301 synchronized(SynchronizationObject) 302 { 303 tagSoupParse(content); 304 } 305 } 306 else 307 tagSoupParse(content); 308 tagsoupParsingTime += profiler.report("Tagsoup"); 309 310 } catch (Exception e) { 311 e.printStackTrace(); 313 } 314 315 316 } 317 318 } 319 320 public static void main(String [] args) throws Exception 321 { 322 TestMozillaParser.initTestingXPCOM(); 323 324 testZippedContentMultithreaded(); 326 327 330 } 350 351 355 private static void testTagsoupSynchronizedParsing() throws Exception { 356 tagsoupMultithreadedParse(true , "Tagsoup Synchronized "); 357 tagsoupMultithreadedParse(false, "Tagsoup Parallel "); 358 } 359 360 364 private static void testMozillaSynchronizedParsing() throws Exception { 365 mozillaMultithreadedParse(true , "Mozilla Synchronized "); 366 mozillaMultithreadedParse(false, "Mozilla Parallel "); 367 } 368 369 374 private static void mozillaMultithreadedParse(final boolean synchronize , String reportString) throws Exception 375 { 376 int maxThreads = 30; 377 ExecutorService mozillaThreadPool = Executors.newFixedThreadPool(maxThreads); 378 mozillaParsingTime=0; 379 ZipFileReader tagsoupFileReader = new ZipFileReader("./test.content.zip"); 380 int counter = 0; 381 int maxCount =530; 382 383 SimpleTimeProfiler mozillaProfiler = new SimpleTimeProfiler(); 385 mozillaProfiler .start(); 386 while (counter++ < maxCount) 387 { 388 mozillaThreadPool.execute(new MozillaParsingThread(tagsoupFileReader , synchronize)); 389 } 390 mozillaThreadPool.shutdown(); 391 mozillaThreadPool.awaitTermination(10000, TimeUnit.SECONDS); 392 393 double mozillaTime = mozillaProfiler.report("Tagsoup synchronized total time"); 394 395 System.out.println(reportString + " time :" + mozillaParsingTime +" sec"); 396 System.out.println(reportString + " Total time :" + mozillaTime +" sec"); 397 398 } 399 400 405 private static void tagsoupMultithreadedParse(final boolean synchronize , String reportString) throws Exception 406 { 407 int maxThreads = 10; 408 ExecutorService tagsoupThreadPool = Executors.newFixedThreadPool(maxThreads); 409 tagsoupParsingTime=0; 410 ZipFileReader tagsoupFileReader = new ZipFileReader("./test.content.zip"); 411 int counter = 0; 412 int maxCount =530; 413 414 SimpleTimeProfiler tagsoupProfiler = new SimpleTimeProfiler(); 416 tagsoupProfiler .start(); 417 while (counter++ < maxCount) 418 { 419 tagsoupThreadPool.execute(new TagsoupParsingThread(tagsoupFileReader , synchronize)); 420 } 421 tagsoupThreadPool.shutdown(); 422 tagsoupThreadPool.awaitTermination(10000, TimeUnit.SECONDS); 423 424 double tagsoupTime = tagsoupProfiler.report("Tagsoup synchronized total time"); 425 426 System.out.println(reportString + " time :" + tagsoupParsingTime +" sec"); 427 System.out.println(reportString + " Total time :" + tagsoupTime +" sec"); 428 429 } 430 431 432 } 433 | Popular Tags |