TestMozillaParser


1   /**
2    * 
3    */
4   package com.dappit.Dapper.parser.test;
5   
6   import java.io.ByteArrayOutputStream  ;
7   import java.io.File  ;
8   import java.io.IOException  ;
9   import java.io.OutputStreamWriter  ;
10  import java.io.StringWriter  ;
11  import java.util.Random  ;
12  import java.util.Set  ;
13  import java.util.Vector  ;
14  import java.util.regex.Matcher  ;
15  import java.util.regex.Pattern  ;
16  
17  import junit.framework.TestCase;
18  
19  import org.dom4j.DocumentException;
20  import org.dom4j.io.DOMReader;
21  import org.dom4j.io.HTMLWriter;
22  import org.dom4j.io.OutputFormat;
23  import org.w3c.dom.Document  ;
24  import org.w3c.dom.Element  ;
25  import org.w3c.dom.NodeList  ;
26  
27  import com.dappit.Dapper.Configuration;
28  import com.dappit.Dapper.parser.EnviromentController;
29  import com.dappit.Dapper.parser.MozillaParser;
30  import com.dappit.Dapper.parser.ParserInitializationException;
31  import com.dappit.Dapper.parser.profiler.SimpleMemoryProfiler;
32  import com.sun.org.apache.xml.internal.serialize.XMLSerializer;
33  
34  /**
35   * @author Ohad Serfaty
36   *
37   */
38  public class TestMozillaParser extends TestCase {
39  
40      boolean doTesting = true;
41      
42      public static void initTestingXPCOM(){
43          File   mozillaParserLibraryFile;
44          try 
45          {
46              mozillaParserLibraryFile = new File  ("native/bin/MozillaParser"+ EnviromentController.getSharedLibraryExtension());
47          }
48          catch (Exception   e1) 
49          {
50              mozillaParserLibraryFile = new File  ("./native/bin/MozillaParser.dll");
51              e1.printStackTrace();
52          }
53          
54          
55          String   mozillaParserLibrary = mozillaParserLibraryFile.getAbsolutePath();
56          String   mozillaComponentBasePath = Configuration.getMozillaComponentsPath();
57              try 
58              {
59                  System.out.println("Loading and initializing XPCOM from "+ mozillaParserLibrary);
60                  MozillaParser.init(mozillaParserLibrary , mozillaComponentBasePath);
61                  System.out.println("done!");
62              }
63              catch (Exception   e) 
64              {
65                  e.printStackTrace();
66              }   
67      }
68      
69      
70      static 
71      {
72          initTestingXPCOM();
73      }
74      
75      // helper function : get the string of the dom document
76      
77      public static String   serialize(Document   document) throws IOException  {
78          StringWriter   stringWriter = new StringWriter  ();
79          XMLSerializer serializer = new XMLSerializer();
80          serializer.setOutputCharStream(stringWriter);
81          serializer.serialize(document);
82          
83          return stringWriter.toString();
84          
85      }
86      
87      
88      private Document   parseAndCompare(String   html ,String   expectedResult) throws Exception  {
89          //MozillaParser parser = MozillaParser.getInstance();
90          MozillaParser parser = new MozillaParser();
91           Document   document = parser.parse(html);
92           //System.out.println(serialize(document));
93           if (doTesting)
94               assertEquals(expectedResult, serialize(document));
95           return document;
96      }
97      
98      public void testSimple1() throws Exception  {
99           String   simple1 = "<html>Hello world!</html>";
100          String   expected1 = "<?xml version=\"1.0\"?>\n" +
101                 "<html><body>Hello world!</body></html>";
102          parseAndCompare(simple1 , expected1);
103     }
104     
105     
106     
107     public void testSimple2() throws Exception  {
108          String   simple2 = "<html>Hello world!</html>";
109          String   expected1 = "<?xml version=\"1.0\"?>\n" +
110                 "<html><body>Hello world!</body></html>";
111          parseAndCompare(simple2 , expected1);
112     }
113 
114     public void testComment1() throws Exception  {
115          String   simple2 = "<html><body><p><!-- a comment --></p> <br> Hello world!</html>";
116          String   expected1 = "<?xml version=\"1.0\"?>\n"+
117                     "<html><body><p><!-- a comment --></p> <br/> Hello world!</body></html>";
118          parseAndCompare(simple2 , expected1);
119 //       System.out.println(serialize(document));
120     }
121     
122     public void testScriptComment1() throws Exception  {
123          String   simple2 = "<html><body><script language=\"JavaScript\" > document.write('hell');</script> <br> Hello world!</html>";
124          String   expected1 = "" +
125                 "<?xml version=\"1.0\"?>\n"+
126 "<html><body><script language=\"JavaScript\">document.write('hell');</script> <br/> Hello world!</body></html>";
127          parseAndCompare(simple2 , expected1);
128 //       System.out.println(serialize(document));
129     }
130     
131     public void testStyleContent() throws Exception  {
132          String   simple2 = "<html><head><style > <!--  body,td,a,p,.h{font-family:arial,sans-serif} " +
133                 ".h{font-size:20px} " +
134                 " .h{color:#3366cc} " +
135                 " .q{color:#00c} " +
136                 " --></style></head><body> <br> Hello world!</html>";
137          String   expected1 = "<?xml version=\"1.0\"?>\n"+
138                     "<html><head><style harmless=\"\">&lt;!--  body,td,a,p,.h{font-family:arial,sans-serif} .h{font-size:20px}  .h{color:#3366cc}  .q{color:#00c}  --&gt;</style></head><body> <br/> Hello world!</body></html>";
139          parseAndCompare(simple2 , expected1);
140 //       System.out.println(serialize(document));
141     }
142     
143     public void testAmpReplacer(){
144         String   testString = "&#10;&#10;&#10;";
145         String   newString = testString.replaceAll("&#10;", "");
146         assertEquals("" ,newString );
147         
148         testString = "&#10;3&#10;1&#10;";
149         newString = testString.replaceAll("&#10;", "");
150         assertEquals("31" ,newString );
151         
152     }
153     
154     public void testStyleReplacer()
155     {
156         String   testString = "< style >";
157         String   newString = testString.replaceAll("<\\s*style\\s*>", "<style harmless=''> ");
158         assertEquals("<style harmless=''> " ,newString );
159         
160         testString = "< style>";
161         newString = testString.replaceAll("<\\s*style\\s*>", "<style harmless=''> ");
162         assertEquals("<style harmless=''> " ,newString );
163         
164         testString = "<style>";
165         newString = testString.replaceAll("<\\s*style\\s*>", "<style harmless=''> ");
166         assertEquals("<style harmless=''> " ,newString );
167         
168         testString = "< style defer>";
169         newString = testString.replaceAll("<\\s*style\\s*>", "<style harmless=''> ");
170         assertNotSame("<style harmless=''> " ,newString );
171         
172     }
173     
174     public void testMultithreadedXPCOMInitialization() throws InterruptedException  {
175         Thread   thread1 = new Thread  ()
176         {
177             public void run(){
178                 try 
179                 {
180                     initTestingXPCOM();
181                 }
182                 catch (Exception   e) 
183                 {
184                     e.printStackTrace();
185                 }
186             }
187         };
188         thread1.start();
189         thread1.join();
190         Thread.sleep(1000);
191         thread1 = new Thread  ()
192         {
193             public void run(){
194                 try 
195                 {
196                     initTestingXPCOM();
197                 }
198                 catch (Exception   e) 
199                 {
200                     e.printStackTrace();
201                 }
202             }
203         };
204         thread1.start();
205         thread1.join();
206         
207     }
208     
209     public Document   parseRandomHtml(int length) throws ParserInitializationException, DocumentException
210     {
211         String   html = "<html><body>";
212         for (int i=0; i<length; i++)
213             html += "<div>"+Math.random()+"</div>";
214         html += "</body></html>";
215         MozillaParser parser = new MozillaParser();
216          return parser.parse(html);
217     }
218     
219     public void testMultithreaded1()
220     {
221         Thread   thread1 = new Thread  ()
222         {
223             public void run(){
224                 try 
225                 {
226                     parseRandomHtml(100);
227                 } catch (Exception   e) {
228                     // TODO Auto-generated catch block
229                     e.printStackTrace();
230                 }
231             }
232         };
233         
234         Thread   thread2 = new Thread  (){
235             public void run(){
236                 try 
237                 {
238                     parseRandomHtml(100);
239                 } catch (Exception   e) {
240                     // TODO Auto-generated catch block
241                     e.printStackTrace();
242                 }
243             }
244         };
245         
246         thread1.start();
247         thread2.start();
248         try {
249             thread1.join();
250             thread2.join();
251         } catch (InterruptedException   e) {
252             // TODO Auto-generated catch block
253             e.printStackTrace();
254         }
255         
256     }
257     
258     volatile int failed = 0;
259     
260     public void testMultithreaded2() throws InterruptedException  
261     {
262         
263         int NUM_THREADS=50;
264         
265         final Thread  [] threadPool = new Thread  [NUM_THREADS];
266         final Random   random = new Random  (0);
267         
268         for (int i=0; i<NUM_THREADS; i++)
269             threadPool[i]= new Thread  (){
270             public void run(){
271                 try {
272                     double randomNumber = random.nextDouble()*100000000.0;
273                     String   html = "<html><body>";
274                     for (int i=0; i<100; i++)
275                         html += "<p>"+randomNumber+"</p>";
276                     html += "</body></html>";
277                     MozillaParser parser = new MozillaParser();
278                      Document   document = parser.parse(html);
279                      Vector  <String  > instructions = parser.getDomBuilderArguments();
280                      int closeNodeCounter=0;
281                      int openNodeCounter=0;
282                      for (String   instruction:instructions)
283                      {
284                          if (instruction.equalsIgnoreCase("CloseNode"))
285                              closeNodeCounter++;
286                          if (instruction.equalsIgnoreCase("OpenNode"))
287                              openNodeCounter++;
288                          
289                      }
290 //                   System.err.println("Close Node Counter :" + closeNodeCounter);
291 //                   System.err.println("Open Node Counter :" + openNodeCounter);
292                      
293                      if (!serialize(document).equals("<?xml version=\"1.0\"?>\n"+html))
294                      {
295                          synchronized(threadPool)
296                          {
297                              System.err.println("Html input was  :" + "<?xml version=\"1.0\"?>\n"+html);
298                              System.err.println("Failed document :" + serialize(document));
299                              parser.dump();
300                              System.err.println("Verifying :" + document.getChildNodes().item(0).getChildNodes().item(0).getChildNodes().item(0).getNodeName() );
301                              System.err.println("<p number : > : " + document.getChildNodes().item(0).getChildNodes().item(0).getChildNodes().getLength());
302                              
303                              failed++;
304                          }
305                      }
306                     
307                 }
308                 catch (Throwable   e)
309                 {
310                     e.printStackTrace();
311                     failed++;
312                 }
313             }
314         };
315         for (int i=0; i<NUM_THREADS; i++)
316             threadPool[i].start();
317         for (int i=0; i<NUM_THREADS; i++)
318             threadPool[i].join();
319         assertEquals(0, failed);
320     }
321     
322     
323     
324     @SuppressWarnings  ("unchecked")
325     public void testEntityDomWriterBug() throws Exception  {
326         String   testString = 
327             "<!doctype html public \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"+
328             "<html>" +
329             "<body>"+
330 //          "<a HREF=\"http://us.ard.yahoo.com/SIG=12ku07d54/M=289534.6742909.7689533.6551553/D=yahoosrch/S=2766679:HLSCH/Y=YAHOO/EXP=1167961934/A=2828626/R=0/SIG=10mgpruen" +
331 //          "/*http://www.yahoo.com?fr=yfp-t-501\">Yahoo!</a> &nbsp; " +
332 //          "<a HREF=\"http://us.ard.yahoo.com/SIG=12ku07d54/M=289534.6742909.7689533.6551553/D=yahoosrch/S=2766679:HLSCH/Y=YAHOO" +
333 //          "" +
334 //          "/EXP=1167961934/A=2828626/R=1/SIG=11nbq2pc6/*http://us.rd.yahoo.com/evt=31554/*http://my.yahoo.com?fr=yfp-t-501\">" +
335 //          "My Yahoo!</a>" +
336 //          " &nbsp;" +
337             " <a HREF=\"http://us.ard.yahoo.co" +
338             "" +
339             "m/SIG=12ku07d54/M=289534.6742909.7689533.6551553/D=yahoosrch/S" +
340             "" +
341             "=2766679:HLSCH/Y=YAHOO/EXP=1167961934/A=2828626/R=2/SIG=10n3m6b64/*http" +
342             "://mail.yahoo.com?fr=yfp-t-501\">" +
343             "Mail</a> " +
344             "&nbsp; &nbsp;" +
345             " Welcome, " +
346             "<strong>Guest</strong> [" +
347             "";
348             
349             Document   document = new MozillaParser().parse(testString);
350             
351             ByteArrayOutputStream   bs = new ByteArrayOutputStream  ();
352             OutputStreamWriter   oSW = null;
353               oSW = new OutputStreamWriter  (bs);
354               
355             OutputFormat format = OutputFormat.createPrettyPrint();
356             format.setXHTML(false);
357             format.setExpandEmptyElements(true);
358             HTMLWriter writer = new HTMLWriter(oSW, format);
359             Set   tags = writer.getPreformattedTags();
360             tags.add("STYLE");
361             writer.setPreformattedTags(tags);
362             
363             DOMReader domReader = new DOMReader(); 
364             
365 //          System.out.println(" dom serialization : \n "+ serialize(document));
366             
367               writer.write(domReader.read(document));
368               writer.flush();
369 
370               // nhaving no exception means that the test is OK.
371             
372     }
373     
374     
375     // from dapper : TODO : put this in a UTIL class :
376       private String   findEncoding(Element   rootElement) {
377             String   encoding = "UTF-8";
378             NodeList   metas = rootElement.getElementsByTagName("meta");
379             for (int m = 0; m < metas.getLength(); m++) {
380               Element   meta = (Element  )metas.item(m);
381               // find if we have an http-equiv attribute :
382               boolean hasHttpEquivContentType = false;      // guilty until proven otherwise.
383               boolean hasNameContentType = false;       // guilty until proven otherwise.
384               if (meta.getAttribute("http-equiv").length()>0)
385               {
386                   hasHttpEquivContentType = meta.getAttribute("http-equiv").toLowerCase().equals("content-type");
387               }
388               else
389                   if (meta.getAttribute("HTTP-EQUIV").length()>0)
390                   {
391                       hasHttpEquivContentType = meta.getAttribute("HTTP-EQUIV").toLowerCase().equals("content-type");
392                   }
393               
394               if (meta.getAttribute("name").length()>0)
395                   hasNameContentType = meta.getAttribute("name").toLowerCase().equals("content-type");
396               else
397                   if (meta.getAttribute("NAME").length()>0)
398                       hasNameContentType = meta.getAttribute("NAME").toLowerCase().equals("content-type");
399               
400               String   contentAttributeStr = null;
401               
402               if ( meta.getAttribute("content").length()>0)
403                   contentAttributeStr = meta.getAttribute("content") ;
404               else
405                   if ( meta.getAttribute("CONTENT").length()>0)
406                       contentAttributeStr = meta.getAttribute("CONTENT") ;
407               
408               if ( (hasHttpEquivContentType || hasNameContentType)  &&  contentAttributeStr != null ) {
409 
410                 Pattern   pat = Pattern.compile("charset\\s?=\\s?(.+);*",Pattern.CASE_INSENSITIVE);
411                 Matcher   mat = pat.matcher(contentAttributeStr);
412                 if (mat.find()) 
413                 {
414                   encoding = mat.group(1);
415                   break;
416                 }
417               }
418             }
419             
420             return encoding;
421           }
422     
423       private void printDocumentPreety(Document   doc) throws IOException  {
424           StringWriter   stringWriter = new StringWriter  ();
425             OutputFormat format = OutputFormat.createPrettyPrint();
426             format.setXHTML(false);
427             format.setEncoding(findEncoding(doc.getDocumentElement()));
428             format.setExpandEmptyElements(true);
429             HTMLWriter writer = new HTMLWriter(stringWriter, format);
430             Set   tags = writer.getPreformattedTags();
431 //          tags.add("STYLE");
432             tags.clear();
433             writer.setPreformattedTags(tags);    
434             DOMReader domReader = new DOMReader(); 
435             writer.write(domReader.read(doc));
436 //          System.out.println("Document:\n" + stringWriter.toString());
437       }
438       
439 //  /**
440 //   * @param youTubeContent
441 //   * @throws DocumentException 
442 //   * @throws NetworkErrorException 
443 //   * @throws IOException 
444 //   * @throws MalformedURLException 
445 //   */
446 //  private void displayMozillaAndTagsoupDoms(Cacher cacher , String url) throws Exception {
447 //      String content = null;
448 //      try
449 //      {
450 //          System.err.println("Fetching content from :" + url);
451 //          content = cacher.getCache(url);
452 //      }
453 //      catch (Exception e)
454 //      {
455 //          System.err.println("couldn't find contetn for URL:" + url +". grabbing page from net...");
456 //          content = Util.urlGetContents(new URL(url));
457 //          cacher.putCache(url , content);
458 //      }
459 //      
460 //      
461 //      // profile mozilla :
462 //      Document document = MozillaParser.getInstance().parse(content);
463 //      
464 ////         System.out.println("Mozilla encoding :" + findEncoding(document.getDocumentElement()));
465 //      
466 //       printDocumentPreety(document);
467 //       
468 //      // profile tagsoup :
469 //      Parser htmlParser       = new Parser();
470 //       
471 //      SAXReader saxReader     = new SAXReader(htmlParser);
472 //      saxReader.setMergeAdjacentText(true);
473 //      DOMWriter domWriter     = new DOMWriter();
474 //      document                = domWriter.write(saxReader.read(new StringReader(content)));
475 //      
476 ////        System.out.println("Tagsoup encoding :" + findEncoding(document.getDocumentElement()));
477 //      
478 //      printDocumentPreety(document);
479 //   
480 //////      System.out.println("title :" + );
481 ////        String nanaTitle = document.getDocumentElement().getChildNodes().item(0) 
482 ////        .getChildNodes().item(4).getTextContent();
483 ////        for (int i=0; i<nanaTitle.length(); i++)
484 ////            System.out.println((int)nanaTitle.charAt(i));
485 //  }
486 //  
487 //  public void testHebrew(){
488 //      char dalet = 0xD793;
489 //      System.out.println(dalet);
490 //  }
491 //  
492 //  
493 //  // this onw is not a true test , just a debug check..
494 //  public void testHebrewEncoding() throws Exception 
495 //  {
496 //      Cacher contentCacher = new Cacher("ohad.dappit.com");
497 //      displayMozillaAndTagsoupDoms(contentCacher, "http://www.nana.co.il");
498 //  }
499 //  
500 //  
501 //  Vector<String> contentList = new Vector<String>();
502 //  
503 //  public void addToContentList(Cacher cacher , String url) throws Exception{
504 //      String content = null;
505 //      try
506 //      {
507 //          System.err.println("Fetching content from :" + url);
508 //          content = cacher.getCache(url);
509 //      }
510 //      catch (Exception e)
511 //      {
512 //          System.err.println("couldn't find contetn for URL:" + url +". grabbing page from net...");
513 //          content = Util.urlGetContents(new URL(url));
514 //          cacher.putCache(url , content);
515 //      }
516 //      contentList.add(content);
517 //  }
518 //  
519 //  public void testMultithreadedPerformance() throws Exception {
520 //      Cacher contentCacher = new Cacher("ohad.dappit.com");
521 //      contentCacher.setCacheLifeTime(Integer.MAX_VALUE);
522 //      addToContentList(contentCacher,"http://www.youtube.com/results?search_query=saddam&search=Search");
523 //      addToContentList(contentCacher, "http://www.digg.com");
524 //      addToContentList(contentCacher, "http://www.walla.co.il");
525 //      addToContentList(contentCacher, "http://www.dappit.com");
526 //      addToContentList(contentCacher, "http://www.cnn.com");
527 //      addToContentList(contentCacher, "http://slashdot.org");
528 //      addToContentList(contentCacher, "http://www.netdimes.org");
529 //      addToContentList(contentCacher, "http://www.yahoo.com");
530 //      addToContentList(contentCacher, "http://www.mozilla.org");
531 //      addToContentList(contentCacher, "http://www.nana.co.il");
532 //      addToContentList(contentCacher, "http://www.finance.com");
533 //      addToContentList(contentCacher, "http://www.cnn.co.jp/");
534 //      addToContentList(contentCacher, "http://www.techcrunch.com/");
535 //      addToContentList(contentCacher, "http://freshmeat.net/");
536 //      
537 //      mozillaParsingTime = 0.0;
538 //      tagsoupParsingTime = 0.0;
539 //      
540 //      System.err.println("Mozilla parsing time :" + mozillaParsingTime +" sec.");
541 //      System.err.println("Tagsoup parsing time :" + tagsoupParsingTime +" sec.");
542 //      
543 //      MozillaParsingThread[] mozillaThreads = new MozillaParsingThread[contentList.size()];
544 //      TagSoupParsingThread[] tagsoupThreads = new TagSoupParsingThread[contentList.size()];
545 //      
546 //      for (int i=0; i<contentList.size(); i++)
547 //      {
548 //          mozillaThreads[i] = new MozillaParsingThread(contentList.get(i));
549 //          tagsoupThreads[i] = new TagSoupParsingThread(contentList.get(i));
550 //      }
551 //      
552 //      
553 //      // first do the tagsoup threads :
554 //      for (int i=0; i<contentList.size(); i++)
555 //          tagsoupThreads[i].start();
556 //      for (int i=0; i<contentList.size(); i++)
557 //          tagsoupThreads[i].join();
558 //      
559 //      // then do mizlla threads :
560 //      for (int i=0; i<contentList.size(); i++)
561 //          mozillaThreads[i].start();
562 //      
563 //      for (int i=0; i<contentList.size(); i++)
564 //          mozillaThreads[i].join();
565 //      
566 //      System.err.println("--------------> Mozilla parsing time :" + mozillaParsingTime +" sec.");
567 //      System.err.println("--------------> Tagsoup parsing time :" + tagsoupParsingTime +" sec.");
568 //      
569 //      // assert that mozilla parser works no worse than 1.25 the tagsoup time :
570 //      assertTrue(1.25*tagsoupParsingTime > mozillaParsingTime);
571 //  }
572 //  
573 //  class MozillaParsingThread extends Thread {
574 //      
575 //      private final String content;
576 //
577 //      public MozillaParsingThread(String content){
578 //          this.content = content;
579 //      }
580 //      
581 //      public void run()
582 //      {
583 //          SimpleTimeProfiler profiler = new SimpleTimeProfiler();
584 //          profiler.start();
585 //          MozillaParser.getInstance().parse(content);
586 //          mozillaParsingTime += profiler.report("Mozilla:");
587 //      }
588 //      
589 //      
590 //  }
591 //  
592 //class TagSoupParsingThread extends Thread 
593 //{
594 //      
595 //      private final String content;
596 //
597 //      public TagSoupParsingThread(String content){
598 //          this.content = content;
599 //      }
600 //      
601 //      public void run()
602 //      {
603 //          SimpleTimeProfiler profiler = new SimpleTimeProfiler();
604 //          profiler.start();
605 //          try {
606 //              tagSoupParse(content);
607 //          } catch (DocumentException e) {
608 //              // TODO Auto-generated catch block
609 //              e.printStackTrace();
610 //          }
611 //          tagsoupParsingTime += profiler.report("Tagsoup:");
612 //      }
613 //      
614 //      
615 //  }
616 //  
617 //  
618 //  
619 //  public void testPerformance() throws Exception
620 //  {
621 //      mozillaParsingTime = 0.0;
622 //      tagsoupParsingTime = 0.0;
623 //      
624 //      Cacher contentCacher = new Cacher("ohad.dappit.com");
625 //      contentCacher.setCacheLifeTime(Integer.MAX_VALUE);
626 //      
627 //      compareMozillaAndTagsoup(contentCacher,"http://www.youtube.com/results?search_query=saddam&search=Search");
628 //      
629 //      compareMozillaAndTagsoup(contentCacher, "http://www.digg.com");
630 //      
631 //      compareMozillaAndTagsoup(contentCacher, "http://www.walla.co.il");
632 //      
633 //      compareMozillaAndTagsoup(contentCacher, "http://www.dappit.com");
634 //      
635 //      compareMozillaAndTagsoup(contentCacher, "http://www.cnn.com");
636 //      
637 //      compareMozillaAndTagsoup(contentCacher, "http://slashdot.org");
638 //      
639 //      compareMozillaAndTagsoup(contentCacher, "http://www.netdimes.org");
640 //      
641 //      compareMozillaAndTagsoup(contentCacher, "http://www.yahoo.com");
642 //      
643 //      compareMozillaAndTagsoup(contentCacher, "http://www.mozilla.org");
644 //      compareMozillaAndTagsoup(contentCacher, "http://www.nana.co.il");
645 //      compareMozillaAndTagsoup(contentCacher, "http://www.finance.com");
646 //      compareMozillaAndTagsoup(contentCacher, "http://www.cnn.co.jp/");
647 //      compareMozillaAndTagsoup(contentCacher, "http://www.techcrunch.com/");
648 //      compareMozillaAndTagsoup(contentCacher, "http://freshmeat.net/");
649 //      
650 //      
651 //      System.err.println("--------------> Mozilla parsing time :" + mozillaParsingTime +" sec.");
652 //      System.err.println("--------------> Tagsoup parsing time :" + tagsoupParsingTime +" sec.");
653 //      
654 //      // assert that mozilla parser works no worse than 1.25 the tagsoup time :
655 //      assertTrue(1.25*tagsoupParsingTime > mozillaParsingTime);
656 //      
657 //  }
658 //  
659 //  private Document tagSoupParse(String content) throws DocumentException{
660 //      Parser htmlParser       = new Parser();
661 //       
662 //      SAXReader saxReader     = new SAXReader(htmlParser);
663 //      saxReader.setMergeAdjacentText(true);
664 //      DOMWriter domWriter     = new DOMWriter();
665 //     return  domWriter.write(saxReader.read(new StringReader(content)));
666 //  }
667 //
668 //  public void testCrawler() throws MalformedURLException, IOException, NetworkErrorException, CacheDirectoryException, CacheWriteException, DocumentException{
669 //      
670 //      Cacher cacher = new Cacher();
671 //      cacher.setCacheLifeTime(Integer.MAX_VALUE);
672 //      for (int i=1; i<20 ; i++)
673 //      {
674 //          int start=10*i;
675 //          String googleUrlString = "http://www.google.co.il/search?q=windows&hl=iw&lr=&start=" +start +  "&sa=N";
676 //          System.out.println("Fetching :" +googleUrlString);
677 //          String urlContent = Util.urlGetContents( new URL(googleUrlString));
678 //          
679 //          Document googleDoc = tagSoupParse(urlContent);
680 ////            Document googleDoc = MozillaParser.getInstance().parse(urlContent);
681 //          NodeList anchors = googleDoc.getElementsByTagName("a");
682 //          System.out.println("number of anchors : " + anchors.getLength());
683 //          for (int j=0; j<anchors.getLength() ; j++)
684 //          {
685 //              Attr hrefAttribute = (Attr)anchors.item(j).getAttributes().getNamedItem("href");
686 //              if (hrefAttribute!=null)
687 //              {
688 //                  String attributeValue = hrefAttribute.getValue();
689 //                  if (attributeValue.startsWith("http://") && !attributeValue.endsWith(".pdf"))
690 //                  {
691 //                      System.err.println(i+":"+j+"/"+anchors.getLength()+ " : Fetching from : " + attributeValue);
692 //                      String urlContent2=null;
693 //                      try 
694 //                      {
695 //                          urlContent2 = cacher.getCache(attributeValue);
696 //                      }
697 //                      catch (Exception e)
698 //                      {
699 //                          try
700 //                          {
701 //                              urlContent2 = Util.urlGetContents(new URL(attributeValue));
702 //                          }
703 //                          catch (Exception ex) 
704 //                          {
705 //                              ex.printStackTrace();
706 //                              urlContent2 = "<html>";
707 //                          }
708 //                          cacher.putCache(attributeValue, urlContent2);
709 //                      }
710 ////                        tagSoupParse(urlContent2);
711 //                      MozillaParser.getInstance().parse(urlContent2);
712 //                  }   
713 //              }
714 //          }
715 //          
716 //          
717 //          
718 //      }
719 //      
720 //      
721 //  }
722 //  
723 //
724 //  volatile double mozillaParsingTime = 0.0;
725 //  volatile double tagsoupParsingTime = 0.0;
726 //  
727 //  /**
728 //   * @param youTubeContent
729 //   * @throws DocumentException 
730 //   * @throws NetworkErrorException 
731 //   * @throws IOException 
732 //   * @throws MalformedURLException 
733 //   */
734 //  private void compareMozillaAndTagsoup(Cacher cacher , String url) throws Exception {
735 //      String content = null;
736 //      try
737 //      {
738 //          System.err.println("Fetching content from :" + url);
739 //          content = cacher.getCache(url);
740 //      }
741 //      catch (Exception e)
742 //      {
743 //          System.err.println("couldn't find contetn for URL:" + url +". grabbing page from net...");
744 //          content = Util.urlGetContents(new URL(url));
745 //          cacher.putCache(url , content);
746 //      }
747 //      
748 //      SimpleTimeProfiler profiler = new SimpleTimeProfiler();
749 //      
750 //      // profile mozilla :
751 //      profiler.start();
752 ////        System.out.println("Parsing content : "+ content);
753 //      MozillaParser.getInstance().parse(content);
754 //      mozillaParsingTime += profiler.report("Mozilla:");
755 //      
756 //      // profile tagsoup :
757 //      profiler.start();
758 //      tagSoupParse(content);
759 //      tagsoupParsingTime+= profiler.report("tagsoup:");
760 //  }
761 //  
762 //  
763 //  public void testXClarisWindow() throws Exception
764 //  {
765 //      
766 //      // came across this error that crashed the parser :
767 ////        ###!!! ASSERTION: unsupported leaf node type: 'Not Reached', file C:\dapper\mozilla\parser\htmlparser\java\JavaContentSink.cpp, line 782
768 ////        Break: at file C:\dapper\mozilla\parser\htmlparser\java\JavaContentSink.cpp, line 782
769 //      
770 //      Cacher contentCacher = new Cacher("ohad.dappit.com");
771 //      contentCacher.setCacheLifeTime(Integer.MAX_VALUE);
772 //      
773 //      compareMozillaAndTagsoup(contentCacher," http://www.sdcoe.k12.ca.us/score/cla.html");
774 //  }
775 //  
776 //  
777 //  // WARNING : THIS TEST IS NOT WORKING AUTOMATICALLY
778 //  // YOU MUST CHECK THAT THE MEMORY CONSUMPTION IN NOT INCREASING MANUALLY
779 //  // TODO : FIND A BETTER WAY TO HANDLE THIS
780     public void testMemoryLeak() throws Exception  
781     {
782         SimpleMemoryProfiler memoryProfiler = new SimpleMemoryProfiler();
783         memoryProfiler.start();
784         for (int i=0; i<20000; i++)
785         {
786             testSimple2();
787         }
788         //assertTrue("Memory diff is bigger than 20MB. Please check for memory leak" , memoryProfiler.report("Total memory diff") > -100000.0);
789     }
790     
791     
792     
793     
794 }
795
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags