KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > dappit > Dapper > parser > test > TestMozillaParser


1 /**
2  *
3  */

4 package com.dappit.Dapper.parser.test;
5
6 import java.io.ByteArrayOutputStream JavaDoc;
7 import java.io.File JavaDoc;
8 import java.io.IOException JavaDoc;
9 import java.io.OutputStreamWriter JavaDoc;
10 import java.io.StringWriter JavaDoc;
11 import java.util.Random JavaDoc;
12 import java.util.Set JavaDoc;
13 import java.util.Vector JavaDoc;
14 import java.util.regex.Matcher JavaDoc;
15 import java.util.regex.Pattern JavaDoc;
16
17 import junit.framework.TestCase;
18
19 import org.dom4j.DocumentException;
20 import org.dom4j.io.DOMReader;
21 import org.dom4j.io.HTMLWriter;
22 import org.dom4j.io.OutputFormat;
23 import org.w3c.dom.Document JavaDoc;
24 import org.w3c.dom.Element JavaDoc;
25 import org.w3c.dom.NodeList JavaDoc;
26
27 import com.dappit.Dapper.Configuration;
28 import com.dappit.Dapper.parser.EnviromentController;
29 import com.dappit.Dapper.parser.MozillaParser;
30 import com.dappit.Dapper.parser.ParserInitializationException;
31 import com.dappit.Dapper.parser.profiler.SimpleMemoryProfiler;
32 import com.sun.org.apache.xml.internal.serialize.XMLSerializer;
33
34 /**
35  * @author Ohad Serfaty
36  *
37  */

38 public class TestMozillaParser extends TestCase {
39
40     boolean doTesting = true;
41     
42     public static void initTestingXPCOM(){
43         File JavaDoc mozillaParserLibraryFile;
44         try
45         {
46             mozillaParserLibraryFile = new File JavaDoc("native/bin/MozillaParser"+ EnviromentController.getSharedLibraryExtension());
47         }
48         catch (Exception JavaDoc e1)
49         {
50             mozillaParserLibraryFile = new File JavaDoc("./native/bin/MozillaParser.dll");
51             e1.printStackTrace();
52         }
53         
54         
55         String JavaDoc mozillaParserLibrary = mozillaParserLibraryFile.getAbsolutePath();
56         String JavaDoc mozillaComponentBasePath = Configuration.getMozillaComponentsPath();
57             try
58             {
59                 System.out.println("Loading and initializing XPCOM from "+ mozillaParserLibrary);
60                 MozillaParser.init(mozillaParserLibrary , mozillaComponentBasePath);
61                 System.out.println("done!");
62             }
63             catch (Exception JavaDoc e)
64             {
65                 e.printStackTrace();
66             }
67     }
68     
69     
70     static
71     {
72         initTestingXPCOM();
73     }
74     
75     // helper function : get the string of the dom document
76

77     public static String JavaDoc serialize(Document JavaDoc document) throws IOException JavaDoc{
78         StringWriter JavaDoc stringWriter = new StringWriter JavaDoc();
79         XMLSerializer serializer = new XMLSerializer();
80         serializer.setOutputCharStream(stringWriter);
81         serializer.serialize(document);
82         
83         return stringWriter.toString();
84         
85     }
86     
87     
88     private Document JavaDoc parseAndCompare(String JavaDoc html ,String JavaDoc expectedResult) throws Exception JavaDoc{
89         //MozillaParser parser = MozillaParser.getInstance();
90
MozillaParser parser = new MozillaParser();
91          Document JavaDoc document = parser.parse(html);
92          //System.out.println(serialize(document));
93
if (doTesting)
94              assertEquals(expectedResult, serialize(document));
95          return document;
96     }
97     
98     public void testSimple1() throws Exception JavaDoc{
99          String JavaDoc simple1 = "<html>Hello world!</html>";
100          String JavaDoc expected1 = "<?xml version=\"1.0\"?>\n" +
101                 "<html><body>Hello world!</body></html>";
102          parseAndCompare(simple1 , expected1);
103     }
104     
105     
106     
107     public void testSimple2() throws Exception JavaDoc{
108          String JavaDoc simple2 = "<html>Hello world!</html>";
109          String JavaDoc expected1 = "<?xml version=\"1.0\"?>\n" +
110                 "<html><body>Hello world!</body></html>";
111          parseAndCompare(simple2 , expected1);
112     }
113
114     public void testComment1() throws Exception JavaDoc{
115          String JavaDoc simple2 = "<html><body><p><!-- a comment --></p> <br> Hello world!</html>";
116          String JavaDoc expected1 = "<?xml version=\"1.0\"?>\n"+
117                     "<html><body><p><!-- a comment --></p> <br/> Hello world!</body></html>";
118          parseAndCompare(simple2 , expected1);
119 // System.out.println(serialize(document));
120
}
121     
122     public void testScriptComment1() throws Exception JavaDoc{
123          String JavaDoc simple2 = "<html><body><script language=\"JavaScript\" > document.write('hell');</script> <br> Hello world!</html>";
124          String JavaDoc expected1 = "" +
125                 "<?xml version=\"1.0\"?>\n"+
126 "<html><body><script language=\"JavaScript\">document.write('hell');</script> <br/> Hello world!</body></html>";
127          parseAndCompare(simple2 , expected1);
128 // System.out.println(serialize(document));
129
}
130     
131     public void testStyleContent() throws Exception JavaDoc{
132          String JavaDoc simple2 = "<html><head><style > <!-- body,td,a,p,.h{font-family:arial,sans-serif} " +
133                 ".h{font-size:20px} " +
134                 " .h{color:#3366cc} " +
135                 " .q{color:#00c} " +
136                 " --></style></head><body> <br> Hello world!</html>";
137          String JavaDoc expected1 = "<?xml version=\"1.0\"?>\n"+
138                     "<html><head><style harmless=\"\">&lt;!-- body,td,a,p,.h{font-family:arial,sans-serif} .h{font-size:20px} .h{color:#3366cc} .q{color:#00c} --&gt;</style></head><body> <br/> Hello world!</body></html>";
139          parseAndCompare(simple2 , expected1);
140 // System.out.println(serialize(document));
141
}
142     
143     public void testAmpReplacer(){
144         String JavaDoc testString = "&#10;&#10;&#10;";
145         String JavaDoc newString = testString.replaceAll("&#10;", "");
146         assertEquals("" ,newString );
147         
148         testString = "&#10;3&#10;1&#10;";
149         newString = testString.replaceAll("&#10;", "");
150         assertEquals("31" ,newString );
151         
152     }
153     
154     public void testStyleReplacer()
155     {
156         String JavaDoc testString = "< style >";
157         String JavaDoc newString = testString.replaceAll("<\\s*style\\s*>", "<style harmless=''> ");
158         assertEquals("<style harmless=''> " ,newString );
159         
160         testString = "< style>";
161         newString = testString.replaceAll("<\\s*style\\s*>", "<style harmless=''> ");
162         assertEquals("<style harmless=''> " ,newString );
163         
164         testString = "<style>";
165         newString = testString.replaceAll("<\\s*style\\s*>", "<style harmless=''> ");
166         assertEquals("<style harmless=''> " ,newString );
167         
168         testString = "< style defer>";
169         newString = testString.replaceAll("<\\s*style\\s*>", "<style harmless=''> ");
170         assertNotSame("<style harmless=''> " ,newString );
171         
172     }
173     
174     public void testMultithreadedXPCOMInitialization() throws InterruptedException JavaDoc{
175         Thread JavaDoc thread1 = new Thread JavaDoc()
176         {
177             public void run(){
178                 try
179                 {
180                     initTestingXPCOM();
181                 }
182                 catch (Exception JavaDoc e)
183                 {
184                     e.printStackTrace();
185                 }
186             }
187         };
188         thread1.start();
189         thread1.join();
190         Thread.sleep(1000);
191         thread1 = new Thread JavaDoc()
192         {
193             public void run(){
194                 try
195                 {
196                     initTestingXPCOM();
197                 }
198                 catch (Exception JavaDoc e)
199                 {
200                     e.printStackTrace();
201                 }
202             }
203         };
204         thread1.start();
205         thread1.join();
206         
207     }
208     
209     public Document JavaDoc parseRandomHtml(int length) throws ParserInitializationException, DocumentException
210     {
211         String JavaDoc html = "<html><body>";
212         for (int i=0; i<length; i++)
213             html += "<div>"+Math.random()+"</div>";
214         html += "</body></html>";
215         MozillaParser parser = new MozillaParser();
216          return parser.parse(html);
217     }
218     
219     public void testMultithreaded1()
220     {
221         Thread JavaDoc thread1 = new Thread JavaDoc()
222         {
223             public void run(){
224                 try
225                 {
226                     parseRandomHtml(100);
227                 } catch (Exception JavaDoc e) {
228                     // TODO Auto-generated catch block
229
e.printStackTrace();
230                 }
231             }
232         };
233         
234         Thread JavaDoc thread2 = new Thread JavaDoc(){
235             public void run(){
236                 try
237                 {
238                     parseRandomHtml(100);
239                 } catch (Exception JavaDoc e) {
240                     // TODO Auto-generated catch block
241
e.printStackTrace();
242                 }
243             }
244         };
245         
246         thread1.start();
247         thread2.start();
248         try {
249             thread1.join();
250             thread2.join();
251         } catch (InterruptedException JavaDoc e) {
252             // TODO Auto-generated catch block
253
e.printStackTrace();
254         }
255         
256     }
257     
258     volatile int failed = 0;
259     
260     public void testMultithreaded2() throws InterruptedException JavaDoc
261     {
262         
263         int NUM_THREADS=50;
264         
265         final Thread JavaDoc[] threadPool = new Thread JavaDoc[NUM_THREADS];
266         final Random JavaDoc random = new Random JavaDoc(0);
267         
268         for (int i=0; i<NUM_THREADS; i++)
269             threadPool[i]= new Thread JavaDoc(){
270             public void run(){
271                 try {
272                     double randomNumber = random.nextDouble()*100000000.0;
273                     String JavaDoc html = "<html><body>";
274                     for (int i=0; i<100; i++)
275                         html += "<p>"+randomNumber+"</p>";
276                     html += "</body></html>";
277                     MozillaParser parser = new MozillaParser();
278                      Document JavaDoc document = parser.parse(html);
279                      Vector JavaDoc<String JavaDoc> instructions = parser.getDomBuilderArguments();
280                      int closeNodeCounter=0;
281                      int openNodeCounter=0;
282                      for (String JavaDoc instruction:instructions)
283                      {
284                          if (instruction.equalsIgnoreCase("CloseNode"))
285                              closeNodeCounter++;
286                          if (instruction.equalsIgnoreCase("OpenNode"))
287                              openNodeCounter++;
288                          
289                      }
290 // System.err.println("Close Node Counter :" + closeNodeCounter);
291
// System.err.println("Open Node Counter :" + openNodeCounter);
292

293                      if (!serialize(document).equals("<?xml version=\"1.0\"?>\n"+html))
294                      {
295                          synchronized(threadPool)
296                          {
297                              System.err.println("Html input was :" + "<?xml version=\"1.0\"?>\n"+html);
298                              System.err.println("Failed document :" + serialize(document));
299                              parser.dump();
300                              System.err.println("Verifying :" + document.getChildNodes().item(0).getChildNodes().item(0).getChildNodes().item(0).getNodeName() );
301                              System.err.println("<p number : > : " + document.getChildNodes().item(0).getChildNodes().item(0).getChildNodes().getLength());
302                              
303                              failed++;
304                          }
305                      }
306                     
307                 }
308                 catch (Throwable JavaDoc e)
309                 {
310                     e.printStackTrace();
311                     failed++;
312                 }
313             }
314         };
315         for (int i=0; i<NUM_THREADS; i++)
316             threadPool[i].start();
317         for (int i=0; i<NUM_THREADS; i++)
318             threadPool[i].join();
319         assertEquals(0, failed);
320     }
321     
322     
323     
324     @SuppressWarnings JavaDoc("unchecked")
325     public void testEntityDomWriterBug() throws Exception JavaDoc{
326         String JavaDoc testString =
327             "<!doctype html public \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"+
328             "<html>" +
329             "<body>"+
330 // "<a HREF=\"http://us.ard.yahoo.com/SIG=12ku07d54/M=289534.6742909.7689533.6551553/D=yahoosrch/S=2766679:HLSCH/Y=YAHOO/EXP=1167961934/A=2828626/R=0/SIG=10mgpruen" +
331
// "/*http://www.yahoo.com?fr=yfp-t-501\">Yahoo!</a> &nbsp; " +
332
// "<a HREF=\"http://us.ard.yahoo.com/SIG=12ku07d54/M=289534.6742909.7689533.6551553/D=yahoosrch/S=2766679:HLSCH/Y=YAHOO" +
333
// "" +
334
// "/EXP=1167961934/A=2828626/R=1/SIG=11nbq2pc6/*http://us.rd.yahoo.com/evt=31554/*http://my.yahoo.com?fr=yfp-t-501\">" +
335
// "My Yahoo!</a>" +
336
// " &nbsp;" +
337
" <a HREF=\"http://us.ard.yahoo.co" +
338             "" +
339             "m/SIG=12ku07d54/M=289534.6742909.7689533.6551553/D=yahoosrch/S" +
340             "" +
341             "=2766679:HLSCH/Y=YAHOO/EXP=1167961934/A=2828626/R=2/SIG=10n3m6b64/*http" +
342             "://mail.yahoo.com?fr=yfp-t-501\">" +
343             "Mail</a> " +
344             "&nbsp; &nbsp;" +
345             " Welcome, " +
346             "<strong>Guest</strong> [" +
347             "";
348             
349             Document JavaDoc document = new MozillaParser().parse(testString);
350             
351             ByteArrayOutputStream JavaDoc bs = new ByteArrayOutputStream JavaDoc();
352             OutputStreamWriter JavaDoc oSW = null;
353               oSW = new OutputStreamWriter JavaDoc(bs);
354               
355             OutputFormat format = OutputFormat.createPrettyPrint();
356             format.setXHTML(false);
357             format.setExpandEmptyElements(true);
358             HTMLWriter writer = new HTMLWriter(oSW, format);
359             Set JavaDoc tags = writer.getPreformattedTags();
360             tags.add("STYLE");
361             writer.setPreformattedTags(tags);
362             
363             DOMReader domReader = new DOMReader();
364             
365 // System.out.println(" dom serialization : \n "+ serialize(document));
366

367               writer.write(domReader.read(document));
368               writer.flush();
369
370               // nhaving no exception means that the test is OK.
371

372     }
373     
374     
375     // from dapper : TODO : put this in a UTIL class :
376
private String JavaDoc findEncoding(Element JavaDoc rootElement) {
377             String JavaDoc encoding = "UTF-8";
378             NodeList JavaDoc metas = rootElement.getElementsByTagName("meta");
379             for (int m = 0; m < metas.getLength(); m++) {
380               Element JavaDoc meta = (Element JavaDoc)metas.item(m);
381               // find if we have an http-equiv attribute :
382
boolean hasHttpEquivContentType = false; // guilty until proven otherwise.
383
boolean hasNameContentType = false; // guilty until proven otherwise.
384
if (meta.getAttribute("http-equiv").length()>0)
385               {
386                   hasHttpEquivContentType = meta.getAttribute("http-equiv").toLowerCase().equals("content-type");
387               }
388               else
389                   if (meta.getAttribute("HTTP-EQUIV").length()>0)
390                   {
391                       hasHttpEquivContentType = meta.getAttribute("HTTP-EQUIV").toLowerCase().equals("content-type");
392                   }
393               
394               if (meta.getAttribute("name").length()>0)
395                   hasNameContentType = meta.getAttribute("name").toLowerCase().equals("content-type");
396               else
397                   if (meta.getAttribute("NAME").length()>0)
398                       hasNameContentType = meta.getAttribute("NAME").toLowerCase().equals("content-type");
399               
400               String JavaDoc contentAttributeStr = null;
401               
402               if ( meta.getAttribute("content").length()>0)
403                   contentAttributeStr = meta.getAttribute("content") ;
404               else
405                   if ( meta.getAttribute("CONTENT").length()>0)
406                       contentAttributeStr = meta.getAttribute("CONTENT") ;
407               
408               if ( (hasHttpEquivContentType || hasNameContentType) && contentAttributeStr != null ) {
409
410                 Pattern JavaDoc pat = Pattern.compile("charset\\s?=\\s?(.+);*",Pattern.CASE_INSENSITIVE);
411                 Matcher JavaDoc mat = pat.matcher(contentAttributeStr);
412                 if (mat.find())
413                 {
414                   encoding = mat.group(1);
415                   break;
416                 }
417               }
418             }
419             
420             return encoding;
421           }
422     
423       private void printDocumentPreety(Document JavaDoc doc) throws IOException JavaDoc{
424           StringWriter JavaDoc stringWriter = new StringWriter JavaDoc();
425             OutputFormat format = OutputFormat.createPrettyPrint();
426             format.setXHTML(false);
427             format.setEncoding(findEncoding(doc.getDocumentElement()));
428             format.setExpandEmptyElements(true);
429             HTMLWriter writer = new HTMLWriter(stringWriter, format);
430             Set JavaDoc tags = writer.getPreformattedTags();
431 // tags.add("STYLE");
432
tags.clear();
433             writer.setPreformattedTags(tags);
434             DOMReader domReader = new DOMReader();
435             writer.write(domReader.read(doc));
436 // System.out.println("Document:\n" + stringWriter.toString());
437
}
438       
439 // /**
440
// * @param youTubeContent
441
// * @throws DocumentException
442
// * @throws NetworkErrorException
443
// * @throws IOException
444
// * @throws MalformedURLException
445
// */
446
// private void displayMozillaAndTagsoupDoms(Cacher cacher , String url) throws Exception {
447
// String content = null;
448
// try
449
// {
450
// System.err.println("Fetching content from :" + url);
451
// content = cacher.getCache(url);
452
// }
453
// catch (Exception e)
454
// {
455
// System.err.println("couldn't find contetn for URL:" + url +". grabbing page from net...");
456
// content = Util.urlGetContents(new URL(url));
457
// cacher.putCache(url , content);
458
// }
459
//
460
//
461
// // profile mozilla :
462
// Document document = MozillaParser.getInstance().parse(content);
463
//
464
//// System.out.println("Mozilla encoding :" + findEncoding(document.getDocumentElement()));
465
//
466
// printDocumentPreety(document);
467
//
468
// // profile tagsoup :
469
// Parser htmlParser = new Parser();
470
//
471
// SAXReader saxReader = new SAXReader(htmlParser);
472
// saxReader.setMergeAdjacentText(true);
473
// DOMWriter domWriter = new DOMWriter();
474
// document = domWriter.write(saxReader.read(new StringReader(content)));
475
//
476
//// System.out.println("Tagsoup encoding :" + findEncoding(document.getDocumentElement()));
477
//
478
// printDocumentPreety(document);
479
//
480
////// System.out.println("title :" + );
481
//// String nanaTitle = document.getDocumentElement().getChildNodes().item(0)
482
//// .getChildNodes().item(4).getTextContent();
483
//// for (int i=0; i<nanaTitle.length(); i++)
484
//// System.out.println((int)nanaTitle.charAt(i));
485
// }
486
//
487
// public void testHebrew(){
488
// char dalet = 0xD793;
489
// System.out.println(dalet);
490
// }
491
//
492
//
493
// // this onw is not a true test , just a debug check..
494
// public void testHebrewEncoding() throws Exception
495
// {
496
// Cacher contentCacher = new Cacher("ohad.dappit.com");
497
// displayMozillaAndTagsoupDoms(contentCacher, "http://www.nana.co.il");
498
// }
499
//
500
//
501
// Vector<String> contentList = new Vector<String>();
502
//
503
// public void addToContentList(Cacher cacher , String url) throws Exception{
504
// String content = null;
505
// try
506
// {
507
// System.err.println("Fetching content from :" + url);
508
// content = cacher.getCache(url);
509
// }
510
// catch (Exception e)
511
// {
512
// System.err.println("couldn't find contetn for URL:" + url +". grabbing page from net...");
513
// content = Util.urlGetContents(new URL(url));
514
// cacher.putCache(url , content);
515
// }
516
// contentList.add(content);
517
// }
518
//
519
// public void testMultithreadedPerformance() throws Exception {
520
// Cacher contentCacher = new Cacher("ohad.dappit.com");
521
// contentCacher.setCacheLifeTime(Integer.MAX_VALUE);
522
// addToContentList(contentCacher,"http://www.youtube.com/results?search_query=saddam&search=Search");
523
// addToContentList(contentCacher, "http://www.digg.com");
524
// addToContentList(contentCacher, "http://www.walla.co.il");
525
// addToContentList(contentCacher, "http://www.dappit.com");
526
// addToContentList(contentCacher, "http://www.cnn.com");
527
// addToContentList(contentCacher, "http://slashdot.org");
528
// addToContentList(contentCacher, "http://www.netdimes.org");
529
// addToContentList(contentCacher, "http://www.yahoo.com");
530
// addToContentList(contentCacher, "http://www.mozilla.org");
531
// addToContentList(contentCacher, "http://www.nana.co.il");
532
// addToContentList(contentCacher, "http://www.finance.com");
533
// addToContentList(contentCacher, "http://www.cnn.co.jp/");
534
// addToContentList(contentCacher, "http://www.techcrunch.com/");
535
// addToContentList(contentCacher, "http://freshmeat.net/");
536
//
537
// mozillaParsingTime = 0.0;
538
// tagsoupParsingTime = 0.0;
539
//
540
// System.err.println("Mozilla parsing time :" + mozillaParsingTime +" sec.");
541
// System.err.println("Tagsoup parsing time :" + tagsoupParsingTime +" sec.");
542
//
543
// MozillaParsingThread[] mozillaThreads = new MozillaParsingThread[contentList.size()];
544
// TagSoupParsingThread[] tagsoupThreads = new TagSoupParsingThread[contentList.size()];
545
//
546
// for (int i=0; i<contentList.size(); i++)
547
// {
548
// mozillaThreads[i] = new MozillaParsingThread(contentList.get(i));
549
// tagsoupThreads[i] = new TagSoupParsingThread(contentList.get(i));
550
// }
551
//
552
//
553
// // first do the tagsoup threads :
554
// for (int i=0; i<contentList.size(); i++)
555
// tagsoupThreads[i].start();
556
// for (int i=0; i<contentList.size(); i++)
557
// tagsoupThreads[i].join();
558
//
559
// // then do mizlla threads :
560
// for (int i=0; i<contentList.size(); i++)
561
// mozillaThreads[i].start();
562
//
563
// for (int i=0; i<contentList.size(); i++)
564
// mozillaThreads[i].join();
565
//
566
// System.err.println("--------------> Mozilla parsing time :" + mozillaParsingTime +" sec.");
567
// System.err.println("--------------> Tagsoup parsing time :" + tagsoupParsingTime +" sec.");
568
//
569
// // assert that mozilla parser works no worse than 1.25 the tagsoup time :
570
// assertTrue(1.25*tagsoupParsingTime > mozillaParsingTime);
571
// }
572
//
573
// class MozillaParsingThread extends Thread {
574
//
575
// private final String content;
576
//
577
// public MozillaParsingThread(String content){
578
// this.content = content;
579
// }
580
//
581
// public void run()
582
// {
583
// SimpleTimeProfiler profiler = new SimpleTimeProfiler();
584
// profiler.start();
585
// MozillaParser.getInstance().parse(content);
586
// mozillaParsingTime += profiler.report("Mozilla:");
587
// }
588
//
589
//
590
// }
591
//
592
//class TagSoupParsingThread extends Thread
593
//{
594
//
595
// private final String content;
596
//
597
// public TagSoupParsingThread(String content){
598
// this.content = content;
599
// }
600
//
601
// public void run()
602
// {
603
// SimpleTimeProfiler profiler = new SimpleTimeProfiler();
604
// profiler.start();
605
// try {
606
// tagSoupParse(content);
607
// } catch (DocumentException e) {
608
// // TODO Auto-generated catch block
609
// e.printStackTrace();
610
// }
611
// tagsoupParsingTime += profiler.report("Tagsoup:");
612
// }
613
//
614
//
615
// }
616
//
617
//
618
//
619
// public void testPerformance() throws Exception
620
// {
621
// mozillaParsingTime = 0.0;
622
// tagsoupParsingTime = 0.0;
623
//
624
// Cacher contentCacher = new Cacher("ohad.dappit.com");
625
// contentCacher.setCacheLifeTime(Integer.MAX_VALUE);
626
//
627
// compareMozillaAndTagsoup(contentCacher,"http://www.youtube.com/results?search_query=saddam&search=Search");
628
//
629
// compareMozillaAndTagsoup(contentCacher, "http://www.digg.com");
630
//
631
// compareMozillaAndTagsoup(contentCacher, "http://www.walla.co.il");
632
//
633
// compareMozillaAndTagsoup(contentCacher, "http://www.dappit.com");
634
//
635
// compareMozillaAndTagsoup(contentCacher, "http://www.cnn.com");
636
//
637
// compareMozillaAndTagsoup(contentCacher, "http://slashdot.org");
638
//
639
// compareMozillaAndTagsoup(contentCacher, "http://www.netdimes.org");
640
//
641
// compareMozillaAndTagsoup(contentCacher, "http://www.yahoo.com");
642
//
643
// compareMozillaAndTagsoup(contentCacher, "http://www.mozilla.org");
644
// compareMozillaAndTagsoup(contentCacher, "http://www.nana.co.il");
645
// compareMozillaAndTagsoup(contentCacher, "http://www.finance.com");
646
// compareMozillaAndTagsoup(contentCacher, "http://www.cnn.co.jp/");
647
// compareMozillaAndTagsoup(contentCacher, "http://www.techcrunch.com/");
648
// compareMozillaAndTagsoup(contentCacher, "http://freshmeat.net/");
649
//
650
//
651
// System.err.println("--------------> Mozilla parsing time :" + mozillaParsingTime +" sec.");
652
// System.err.println("--------------> Tagsoup parsing time :" + tagsoupParsingTime +" sec.");
653
//
654
// // assert that mozilla parser works no worse than 1.25 the tagsoup time :
655
// assertTrue(1.25*tagsoupParsingTime > mozillaParsingTime);
656
//
657
// }
658
//
659
// private Document tagSoupParse(String content) throws DocumentException{
660
// Parser htmlParser = new Parser();
661
//
662
// SAXReader saxReader = new SAXReader(htmlParser);
663
// saxReader.setMergeAdjacentText(true);
664
// DOMWriter domWriter = new DOMWriter();
665
// return domWriter.write(saxReader.read(new StringReader(content)));
666
// }
667
//
668
// public void testCrawler() throws MalformedURLException, IOException, NetworkErrorException, CacheDirectoryException, CacheWriteException, DocumentException{
669
//
670
// Cacher cacher = new Cacher();
671
// cacher.setCacheLifeTime(Integer.MAX_VALUE);
672
// for (int i=1; i<20 ; i++)
673
// {
674
// int start=10*i;
675
// String googleUrlString = "http://www.google.co.il/search?q=windows&hl=iw&lr=&start=" +start + "&sa=N";
676
// System.out.println("Fetching :" +googleUrlString);
677
// String urlContent = Util.urlGetContents( new URL(googleUrlString));
678
//
679
// Document googleDoc = tagSoupParse(urlContent);
680
//// Document googleDoc = MozillaParser.getInstance().parse(urlContent);
681
// NodeList anchors = googleDoc.getElementsByTagName("a");
682
// System.out.println("number of anchors : " + anchors.getLength());
683
// for (int j=0; j<anchors.getLength() ; j++)
684
// {
685
// Attr hrefAttribute = (Attr)anchors.item(j).getAttributes().getNamedItem("href");
686
// if (hrefAttribute!=null)
687
// {
688
// String attributeValue = hrefAttribute.getValue();
689
// if (attributeValue.startsWith("http://") && !attributeValue.endsWith(".pdf"))
690
// {
691
// System.err.println(i+":"+j+"/"+anchors.getLength()+ " : Fetching from : " + attributeValue);
692
// String urlContent2=null;
693
// try
694
// {
695
// urlContent2 = cacher.getCache(attributeValue);
696
// }
697
// catch (Exception e)
698
// {
699
// try
700
// {
701
// urlContent2 = Util.urlGetContents(new URL(attributeValue));
702
// }
703
// catch (Exception ex)
704
// {
705
// ex.printStackTrace();
706
// urlContent2 = "<html>";
707
// }
708
// cacher.putCache(attributeValue, urlContent2);
709
// }
710
//// tagSoupParse(urlContent2);
711
// MozillaParser.getInstance().parse(urlContent2);
712
// }
713
// }
714
// }
715
//
716
//
717
//
718
// }
719
//
720
//
721
// }
722
//
723
//
724
// volatile double mozillaParsingTime = 0.0;
725
// volatile double tagsoupParsingTime = 0.0;
726
//
727
// /**
728
// * @param youTubeContent
729
// * @throws DocumentException
730
// * @throws NetworkErrorException
731
// * @throws IOException
732
// * @throws MalformedURLException
733
// */
734
// private void compareMozillaAndTagsoup(Cacher cacher , String url) throws Exception {
735
// String content = null;
736
// try
737
// {
738
// System.err.println("Fetching content from :" + url);
739
// content = cacher.getCache(url);
740
// }
741
// catch (Exception e)
742
// {
743
// System.err.println("couldn't find contetn for URL:" + url +". grabbing page from net...");
744
// content = Util.urlGetContents(new URL(url));
745
// cacher.putCache(url , content);
746
// }
747
//
748
// SimpleTimeProfiler profiler = new SimpleTimeProfiler();
749
//
750
// // profile mozilla :
751
// profiler.start();
752
//// System.out.println("Parsing content : "+ content);
753
// MozillaParser.getInstance().parse(content);
754
// mozillaParsingTime += profiler.report("Mozilla:");
755
//
756
// // profile tagsoup :
757
// profiler.start();
758
// tagSoupParse(content);
759
// tagsoupParsingTime+= profiler.report("tagsoup:");
760
// }
761
//
762
//
763
// public void testXClarisWindow() throws Exception
764
// {
765
//
766
// // came across this error that crashed the parser :
767
//// ###!!! ASSERTION: unsupported leaf node type: 'Not Reached', file C:\dapper\mozilla\parser\htmlparser\java\JavaContentSink.cpp, line 782
768
//// Break: at file C:\dapper\mozilla\parser\htmlparser\java\JavaContentSink.cpp, line 782
769
//
770
// Cacher contentCacher = new Cacher("ohad.dappit.com");
771
// contentCacher.setCacheLifeTime(Integer.MAX_VALUE);
772
//
773
// compareMozillaAndTagsoup(contentCacher," http://www.sdcoe.k12.ca.us/score/cla.html");
774
// }
775
//
776
//
777
// // WARNING : THIS TEST IS NOT WORKING AUTOMATICALLY
778
// // YOU MUST CHECK THAT THE MEMORY CONSUMPTION IN NOT INCREASING MANUALLY
779
// // TODO : FIND A BETTER WAY TO HANDLE THIS
780
public void testMemoryLeak() throws Exception JavaDoc
781     {
782         SimpleMemoryProfiler memoryProfiler = new SimpleMemoryProfiler();
783         memoryProfiler.start();
784         for (int i=0; i<20000; i++)
785         {
786             testSimple2();
787         }
788         //assertTrue("Memory diff is bigger than 20MB. Please check for memory leak" , memoryProfiler.report("Total memory diff") > -100000.0);
789
}
790     
791     
792     
793     
794 }
795
Popular Tags