KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > dappit > Dapper > parser > test > ParserComparator


1 /**
2  *
3  */

4 package com.dappit.Dapper.parser.test;
5
6 import java.io.ByteArrayOutputStream JavaDoc;
7 import java.io.File JavaDoc;
8 import java.io.FileInputStream JavaDoc;
9 import java.io.FileNotFoundException JavaDoc;
10 import java.io.IOException JavaDoc;
11 import java.io.StringReader JavaDoc;
12 import java.net.MalformedURLException JavaDoc;
13 import java.util.Hashtable JavaDoc;
14 import java.util.concurrent.ExecutorService JavaDoc;
15 import java.util.concurrent.Executors JavaDoc;
16 import java.util.concurrent.TimeUnit JavaDoc;
17 import java.util.zip.ZipEntry JavaDoc;
18 import java.util.zip.ZipInputStream JavaDoc;
19
20 import org.ccil.cowan.tagsoup.Parser;
21 import org.dom4j.DocumentException;
22 import org.dom4j.io.DOMWriter;
23 import org.dom4j.io.SAXReader;
24 import org.w3c.dom.Document JavaDoc;
25
26 import com.dappit.Dapper.parser.MozillaParser;
27 import com.dappit.Dapper.parser.profiler.SimpleTimeProfiler;
28 import com.dappit.Dapper.parser.test.util.ProgressLogger;
29
30 /**
31  * @author Ohad Serfaty
32  *
33  */

34 public class ParserComparator
35 {
36     
37     private static volatile double mozillaParsingTime;
38     private static volatile double tagsoupParsingTime;
39
40     public static byte[] fileGetContentsInBytes(File JavaDoc file) throws FileNotFoundException JavaDoc, IOException JavaDoc {
41         FileInputStream JavaDoc fIS = new FileInputStream JavaDoc(file);
42         ByteArrayOutputStream JavaDoc bIS = new ByteArrayOutputStream JavaDoc();
43         byte[] temp = new byte[256];
44         int bytesRead = 0;
45         while ((bytesRead = fIS.read(temp)) != -1) {
46           bIS.write(temp,0,bytesRead);
47         }
48         fIS.close();
49         bIS.close();
50         
51         return bIS.toByteArray();
52       }
53
54     /**
55      * @param youTubeContent
56      * @throws DocumentException
57      * @throws NetworkErrorException
58      * @throws IOException
59      * @throws MalformedURLException
60      */

61     private static void compareMozillaAndTagsoup(String JavaDoc content) throws Exception JavaDoc {
62         
63         SimpleTimeProfiler profiler = new SimpleTimeProfiler();
64         
65         // profile mozilla :
66
profiler.start();
67 // System.out.println("Parsing content : "+ content);
68
MozillaParser parser = new MozillaParser();
69         System.out.println("Mozilla Parsing...");
70         parser.parse(content);
71         mozillaParsingTime += profiler.report("Mozilla:");
72
73         profiler = new SimpleTimeProfiler();
74         // profile tagsoup :
75
System.out.println("Tagsoup Parsing...");
76         profiler.start();
77         tagSoupParse(content);
78         tagsoupParsingTime+= profiler.report("tagsoup:");
79     }
80     
81     private static Document JavaDoc tagSoupParse(String JavaDoc content) {
82         Parser htmlParser = new Parser();
83
84         SAXReader saxReader = new SAXReader(htmlParser);
85         saxReader.setMergeAdjacentText(true);
86         DOMWriter domWriter = new DOMWriter();
87         try
88         {
89             return domWriter.write(saxReader.read(new StringReader JavaDoc(content)));
90         }
91         catch (Exception JavaDoc e)
92         {
93             e.printStackTrace();
94         }
95         return null;
96     }
97     
98     private static void testZippedContent() throws Exception JavaDoc
99     {
100         ZipInputStream JavaDoc zippedInputStream = new ZipInputStream JavaDoc(new FileInputStream JavaDoc("./test.content.zip"));
101         int counter = 0;
102         int maxCount = 1000;
103         ProgressLogger progressLogger = new ProgressLogger(maxCount);
104         while (counter++ < maxCount)
105             {
106             
107                 ZipEntry JavaDoc nextZippedEntry = zippedInputStream.getNextEntry();
108                 if (nextZippedEntry == null)
109                     break;
110                 ByteArrayOutputStream JavaDoc bos = new ByteArrayOutputStream JavaDoc();
111                 System.out.println("Reading zipped file :" + nextZippedEntry.getName());
112                 byte[] buf = new byte[1024];
113                 int len;
114                 while ((len = zippedInputStream.read(buf)) > 0) {
115                     bos.write(buf, 0, len);
116                 }
117                 String JavaDoc content = new String JavaDoc(bos.toByteArray());
118 // System.out.println("Content : "+ content);
119
bos.close();
120                 compareMozillaAndTagsoup(content);
121                 
122                 progressLogger.incrementCount();
123             }
124         System.out.println("Mozilla Parsing time :" + mozillaParsingTime +" sec");
125         System.out.println("Tagsoup Parsing time :" + tagsoupParsingTime +" sec");
126         
127     }
128     
129     public static class ZipFileReader {
130         
131         private final String JavaDoc fileName;
132         private ZipInputStream JavaDoc zippedInputStream;
133
134         public ZipFileReader(String JavaDoc fileName) throws FileNotFoundException JavaDoc{
135             this.fileName = fileName;
136             zippedInputStream = new ZipInputStream JavaDoc(new FileInputStream JavaDoc(this.fileName));
137         }
138         
139         public synchronized String JavaDoc nextContent() throws Exception JavaDoc
140         {
141             ZipEntry JavaDoc nextZippedEntry = zippedInputStream.getNextEntry();
142             if (nextZippedEntry == null)
143                 return null;
144             ByteArrayOutputStream JavaDoc bos = new ByteArrayOutputStream JavaDoc();
145             System.out.println("Reading zipped file :" + nextZippedEntry.getName());
146             byte[] buf = new byte[1024];
147             int len;
148             while ((len = zippedInputStream.read(buf)) > 0) {
149                 bos.write(buf, 0, len);
150             }
151             String JavaDoc content = new String JavaDoc(bos.toByteArray());
152 // System.out.println("Content : "+ content);
153
bos.close();
154             return content;
155         }
156         
157     }
158         
159     private static void testZippedContentMultithreaded() throws Exception JavaDoc
160     {
161         int maxThreads = 10;
162         ExecutorService JavaDoc mozillThreadPool = Executors.newFixedThreadPool(maxThreads);
163         ExecutorService JavaDoc tagsoupThreadPool = Executors.newFixedThreadPool(maxThreads);
164         
165         ZipFileReader mozillaFileReader = new ZipFileReader("./test.content.zip");
166         ZipFileReader tagsoupFileReader = new ZipFileReader("./test.content.zip");
167         int counter = 0;
168         int maxCount =530;
169         
170         SimpleTimeProfiler mozillaProfiler = new SimpleTimeProfiler();
171         mozillaProfiler.start();
172         // first have Mozilla :
173
while (counter++ < maxCount)
174         {
175             mozillThreadPool.execute(new MozillaParsingThread(mozillaFileReader));
176         }
177         mozillThreadPool.shutdown();
178         mozillThreadPool.awaitTermination(10000, TimeUnit.SECONDS);
179         double mozillaTime = mozillaProfiler.report("Mozilla total time");
180         
181         counter = 0;
182         // then have tagsoup :
183
SimpleTimeProfiler tagsoupProfiler = new SimpleTimeProfiler();
184         tagsoupProfiler .start();
185         while (counter++ < maxCount)
186         {
187             tagsoupThreadPool.execute(new TagsoupParsingThread(tagsoupFileReader));
188         }
189         tagsoupThreadPool.shutdown();
190         tagsoupThreadPool.awaitTermination(10000, TimeUnit.SECONDS);
191         
192         double tagsoupTime = tagsoupProfiler.report("Tagsoup total time");
193         
194         System.out.println("Mozilla Parsing multithreaded time :" + mozillaParsingTime +" sec");
195         System.out.println("Tagsoup Parsing multithreaded time :" + tagsoupParsingTime +" sec");
196         
197         System.out.println("Mozilla Parsing Total time :" + mozillaTime +" sec");
198         System.out.println("Tagsoup Parsing Total time :" + tagsoupTime +" sec");
199         
200     }
201     
202     public static class MozillaParsingThread extends Thread JavaDoc
203     {
204         
205         private final ZipFileReader mozillaFileReader;
206         private boolean synchronize;
207         private static Object JavaDoc SynchronizationObject = new Object JavaDoc();
208         private static Hashtable JavaDoc<String JavaDoc, Document JavaDoc> documentHashTable = new Hashtable JavaDoc<String JavaDoc, Document JavaDoc>();
209
210         /**
211          * @param tagsoupFileReader
212          */

213         public MozillaParsingThread(ZipFileReader tagsoupFileReader) {
214             this(tagsoupFileReader,false);
215         }
216
217         /**
218          * @param tagsoupFileReader2
219          * @param b
220          */

221         public MozillaParsingThread(ZipFileReader tagsoupFileReader, boolean synchronize) {
222             this.synchronize = synchronize;
223             this.mozillaFileReader = tagsoupFileReader;
224         }
225         
226         public void run()
227         {
228             String JavaDoc content;
229             try
230             {
231                 content = mozillaFileReader.nextContent();
232                 SimpleTimeProfiler profiler = new SimpleTimeProfiler();
233                 profiler.start();
234                 MozillaParser parser = new MozillaParser();
235                 org.dom4j.Document document;
236                 if (this.synchronize)
237                 {
238                     synchronized(SynchronizationObject)
239                     {
240                         document = (org.dom4j.Document) parser.parse(content);
241                     }
242                 }
243                 else
244                 {
245                     document = (org.dom4j.Document) parser.parse(content);
246                 }
247                 
248                 mozillaParsingTime += profiler.report("Mozilla");
249 // org.dom4j.Document document2 = (org.dom4j.Document) parser.parse(content);
250
// if (!document2.asXML().equals(document.asXML()))
251
// {
252
// System.err.println("------------------------->>> content not equals ????");
253
// }
254

255                 documentHashTable.put(content.hashCode()+Boolean.toString(synchronize), (Document JavaDoc) document);
256             }
257             catch (Exception JavaDoc e)
258             {
259                 e.printStackTrace();
260             }
261             
262         }
263         
264         public static Hashtable JavaDoc<String JavaDoc, Document JavaDoc> getDocumentsHashTable(){
265             return documentHashTable;
266         }
267         
268     }
269     
270     public static class TagsoupParsingThread extends Thread JavaDoc {
271         
272         private final ZipFileReader tagsoupFileReader;
273         private final boolean synchronize;
274         private static Object JavaDoc SynchronizationObject = new Object JavaDoc();
275
276         /**
277          * @param tagsoupFileReader
278          */

279         public TagsoupParsingThread(ZipFileReader tagsoupFileReader) {
280             this(tagsoupFileReader,false);
281         }
282
283         /**
284          * @param tagsoupFileReader2
285          * @param b
286          */

287         public TagsoupParsingThread(ZipFileReader tagsoupFileReader, boolean synchronize) {
288             this.synchronize = synchronize;
289             this.tagsoupFileReader = tagsoupFileReader;
290         }
291
292         public void run()
293         {
294             try
295             {
296                 String JavaDoc content = tagsoupFileReader.nextContent();
297                 SimpleTimeProfiler profiler = new SimpleTimeProfiler();
298                 profiler.start();
299                 if (synchronize)
300                 {
301                     synchronized(SynchronizationObject)
302                     {
303                         tagSoupParse(content);
304                     }
305                 }
306                 else
307                     tagSoupParse(content);
308                 tagsoupParsingTime += profiler.report("Tagsoup");
309                 
310             } catch (Exception JavaDoc e) {
311                 // TODO Auto-generated catch block
312
e.printStackTrace();
313             }
314             
315             
316         }
317         
318     }
319     
320     public static void main(String JavaDoc[] args) throws Exception JavaDoc
321     {
322         TestMozillaParser.initTestingXPCOM();
323         
324         // Scheme 1 :
325
testZippedContentMultithreaded();
326         
327         // Scheme 2 :
328
// testTagsoupSynchronizedParsing();
329

330         // Scheme 3 :
331
// testMozillaSynchronizedParsing();
332
// System.out.println( MozillaParsingThread.getDocumentsHashTable());
333
// Hashtable<String, Document> documentHashTable = MozillaParsingThread.getDocumentsHashTable();
334
// for (String contentType:documentHashTable.keySet())
335
// {
336
// if (contentType.endsWith("true"))
337
// {
338
// org.dom4j.Document synchronizedDocumentResult = (org.dom4j.Document) documentHashTable.get(contentType);
339
// System.out.println(contentType +"->" + synchronizedDocumentResult);
340
//
341
// String parralelScontent = contentType.replace("true", "false");
342
// org.dom4j.Document unsynchronizedDocumentResult = (org.dom4j.Document) documentHashTable.get(parralelScontent);
343
// System.out.println( parralelScontent+"->" +unsynchronizedDocumentResult );
344
// if (!unsynchronizedDocumentResult.asXML().equals(synchronizedDocumentResult.asXML()))
345
// System.err.println("Not Good : " + contentType);
346
// }
347
// }
348
//
349
}
350
351     /**
352      * @throws Exception
353      *
354      */

355     private static void testTagsoupSynchronizedParsing() throws Exception JavaDoc {
356         tagsoupMultithreadedParse(true , "Tagsoup Synchronized ");
357         tagsoupMultithreadedParse(false, "Tagsoup Parallel ");
358     }
359     
360     /**
361      * @throws Exception
362      *
363      */

364     private static void testMozillaSynchronizedParsing() throws Exception JavaDoc {
365         mozillaMultithreadedParse(true , "Mozilla Synchronized ");
366         mozillaMultithreadedParse(false, "Mozilla Parallel ");
367     }
368
369     /**
370      * @throws FileNotFoundException
371      * @throws Exception
372      *
373      */

374     private static void mozillaMultithreadedParse(final boolean synchronize , String JavaDoc reportString) throws Exception JavaDoc
375     {
376         int maxThreads = 30;
377         ExecutorService JavaDoc mozillaThreadPool = Executors.newFixedThreadPool(maxThreads);
378         mozillaParsingTime=0;
379         ZipFileReader tagsoupFileReader = new ZipFileReader("./test.content.zip");
380         int counter = 0;
381         int maxCount =530;
382         
383         // then have tagsoup :
384
SimpleTimeProfiler mozillaProfiler = new SimpleTimeProfiler();
385         mozillaProfiler .start();
386         while (counter++ < maxCount)
387         {
388             mozillaThreadPool.execute(new MozillaParsingThread(tagsoupFileReader , synchronize));
389         }
390         mozillaThreadPool.shutdown();
391         mozillaThreadPool.awaitTermination(10000, TimeUnit.SECONDS);
392         
393         double mozillaTime = mozillaProfiler.report("Tagsoup synchronized total time");
394         
395         System.out.println(reportString + " time :" + mozillaParsingTime +" sec");
396         System.out.println(reportString + " Total time :" + mozillaTime +" sec");
397         
398     }
399     
400     /**
401      * @throws FileNotFoundException
402      * @throws Exception
403      *
404      */

405     private static void tagsoupMultithreadedParse(final boolean synchronize , String JavaDoc reportString) throws Exception JavaDoc
406     {
407         int maxThreads = 10;
408         ExecutorService JavaDoc tagsoupThreadPool = Executors.newFixedThreadPool(maxThreads);
409         tagsoupParsingTime=0;
410         ZipFileReader tagsoupFileReader = new ZipFileReader("./test.content.zip");
411         int counter = 0;
412         int maxCount =530;
413         
414         // then have tagsoup :
415
SimpleTimeProfiler tagsoupProfiler = new SimpleTimeProfiler();
416         tagsoupProfiler .start();
417         while (counter++ < maxCount)
418         {
419             tagsoupThreadPool.execute(new TagsoupParsingThread(tagsoupFileReader , synchronize));
420         }
421         tagsoupThreadPool.shutdown();
422         tagsoupThreadPool.awaitTermination(10000, TimeUnit.SECONDS);
423         
424         double tagsoupTime = tagsoupProfiler.report("Tagsoup synchronized total time");
425         
426         System.out.println(reportString + " time :" + tagsoupParsingTime +" sec");
427         System.out.println(reportString + " Total time :" + tagsoupTime +" sec");
428         
429     }
430     
431
432 }
433
Popular Tags