LexerTests


1   // HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
2   // http://sourceforge.org/projects/htmlparser
3   // Copyright (C) 2004 Derrick Oswald
4   //
5   // Revision Control Information
6   //
7   // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/LexerTests.java,v $
8   // $Author: derrickoswald $
9   // $Date: 2005/03/06 21:46:32 $
10  // $Revision: 1.23 $
11  //
12  // This library is free software; you can redistribute it and/or
13  // modify it under the terms of the GNU Lesser General Public
14  // License as published by the Free Software Foundation; either
15  // version 2.1 of the License, or (at your option) any later version.
16  //
17  // This library is distributed in the hope that it will be useful,
18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  // Lesser General Public License for more details.
21  //
22  // You should have received a copy of the GNU Lesser General Public
23  // License along with this library; if not, write to the Free Software
24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  //
26  
27  package org.htmlparser.tests.lexerTests;
28  
29  import java.io.IOException  ;
30  import java.net.URL  ;
31  import java.util.HashSet  ;
32  
33  import org.htmlparser.Node;
34  import org.htmlparser.Parser;
35  import org.htmlparser.Remark;
36  import org.htmlparser.Tag;
37  import org.htmlparser.Text;
38  import org.htmlparser.lexer.Lexer;
39  import org.htmlparser.tests.ParserTestCase;
40  import org.htmlparser.util.EncodingChangeException;
41  import org.htmlparser.util.NodeIterator;
42  import org.htmlparser.util.NodeList;
43  import org.htmlparser.util.ParserException;
44  
45  public class LexerTests extends ParserTestCase
46  {
47  
48      static
49      {
50          System.setProperty ("org.htmlparser.tests.lexerTests.LexerTests", "LexerTests");
51      }
52  
/**
 * Create a test case exercising the Lexer class.
 * @param name The name passed through to the superclass constructor.
 */
public LexerTests (final String name)
{
    super (name);
}
60  
61      /**
62       * Test operation without tags.
63       */
64      public void testPureText () throws ParserException
65      {
66          String   reference;
67          Lexer lexer;
68          Text node;
69  
70          reference = "Hello world";
71          lexer = new Lexer (reference);
72          node = (Text)lexer.nextNode ();
73          assertEquals ("Text contents wrong", reference, node.getText ());
74      }
75  
76      /**
77       * Test operation with Unix line endings.
78       */
79      public void testUnixEOL () throws ParserException
80      {
81          String   reference;
82          Lexer lexer;
83          Text node;
84  
85          reference = "Hello\nworld";
86          lexer = new Lexer (reference);
87          node = (Text)lexer.nextNode ();
88          assertEquals ("Text contents wrong", reference, node.getText ());
89      }
90  
91      /**
92       * Test operation with Dos line endings.
93       */
94      public void testDosEOL () throws ParserException
95      {
96          String   reference;
97          Lexer lexer;
98          Text node;
99  
100         reference = "Hello\r\nworld";
101         lexer = new Lexer (reference);
102         node = (Text)lexer.nextNode ();
103         assertEquals ("Text contents wrong", reference, node.getText ());
104         reference = "Hello\rworld";
105         lexer = new Lexer (reference);
106         node = (Text)lexer.nextNode ();
107         assertEquals ("Text contents wrong", reference, node.getText ());
108     }
109 
110     /**
111      * Test operation with line endings near the end of input.
112      */
113     public void testEOF_EOL () throws ParserException
114     {
115         String   reference;
116         Lexer lexer;
117         Text node;
118 
119         reference = "Hello world\n";
120         lexer = new Lexer (reference);
121         node = (Text)lexer.nextNode ();
122         assertEquals ("Text contents wrong", reference, node.getText ());
123         reference = "Hello world\r";
124         lexer = new Lexer (reference);
125         node = (Text)lexer.nextNode ();
126         assertEquals ("Text contents wrong", reference, node.getText ());
127         reference = "Hello world\r\n";
128         lexer = new Lexer (reference);
129         node = (Text)lexer.nextNode ();
130         assertEquals ("Text contents wrong", reference, node.getText ());
131     }
132 
133     /**
134      * Test that tags stop string nodes.
135      */
136     public void testTagStops () throws ParserException
137     {
138         String  [] references =
139         {
140             "Hello world",
141             "Hello world\n",
142             "Hello world\r\n",
143             "Hello world\r",
144 
145         };
146         String  [] suffixes =
147         {
148             "<head>",
149             "</head>",
150             "<%=head%>",
151             "<!--head-->",
152         };
153         Lexer lexer;
154         Text node;
155 
156         for (int i = 0; i < references.length; i++)
157         {
158             for (int j = 0; j < suffixes.length; j++)
159             {
160                 lexer = new Lexer (references[i] + suffixes[j]);
161                 node = (Text)lexer.nextNode ();
162                 assertEquals ("Text contents wrong", references[i], node.getText ());
163             }
164         }
165     }
166 
167     /**
168      * Test operation with only tags.
169      */
170     public void testPureTag () throws ParserException
171     {
172         String   reference;
173         String   suffix;
174         Lexer lexer;
175         Node node;
176 
177         reference = "<head>";
178         lexer = new Lexer (reference);
179         node = lexer.nextNode ();
180         assertEquals ("Tag contents wrong", reference, node.toHtml ());
181 
182         reference = "<head>";
183         suffix = "<body>";
184         lexer = new Lexer (reference + suffix);
185         node = lexer.nextNode ();
186         assertEquals ("Tag contents wrong", reference, node.toHtml ());
187         node = lexer.nextNode ();
188         assertEquals ("Tag contents wrong", suffix, node.toHtml ());
189     }
190 
191     /**
192      * Test operation with attributed tags.
193      */
194     public void testAttributedTag () throws ParserException
195     {
196         String   reference;
197         Lexer lexer;
198         Node node;
199 
200         reference = "<head lang='en_US' dir=ltr\nprofile=\"http://htmlparser.sourceforge.org/dictionary.html\">";
201         lexer = new Lexer (reference);
202         node = lexer.nextNode ();
203         assertEquals ("Tag contents wrong", reference, node.toHtml ());
204     }
205 
206     /**
207      * Test operation with comments.
208      */
209     public void testRemark () throws ParserException
210     {
211         String   reference;
212         Lexer lexer;
213         Remark node;
214         String   suffix;
215 
216         reference = "<!-- This is a comment -->";
217         lexer = new Lexer (reference);
218         node = (Remark)lexer.nextNode ();
219         assertEquals ("Tag contents wrong", reference, node.toHtml ());
220 
221         reference = "<!-- This is a comment --  >";
222         lexer = new Lexer (reference);
223         node = (Remark)lexer.nextNode ();
224         assertEquals ("Tag contents wrong", reference, node.toHtml ());
225 
226         reference = "<!-- This is a\nmultiline comment -->";
227         lexer = new Lexer (reference);
228         node = (Remark)lexer.nextNode ();
229         assertEquals ("Tag contents wrong", reference, node.toHtml ());
230 
231         suffix = "<head>";
232         reference = "<!-- This is a comment -->";
233         lexer = new Lexer (reference + suffix);
234         node = (Remark)lexer.nextNode ();
235         assertEquals ("Tag contents wrong", reference, node.toHtml ());
236 
237         reference = "<!-- This is a comment --  >";
238         lexer = new Lexer (reference + suffix);
239         node = (Remark)lexer.nextNode ();
240         assertEquals ("Tag contents wrong", reference, node.toHtml ());
241 
242         reference = "<!-- This is a\nmultiline comment -->";
243         lexer = new Lexer (reference + suffix);
244         node = (Remark)lexer.nextNode ();
245         assertEquals ("Tag contents wrong", reference, node.toHtml ());
246     }
247 
248 //    /**
249 //     * Try a real page.
250 //     */
251 //    public void testReal () throws ParserException, IOException
252 //    {
253 //        Lexer lexer;
254 //        Node node;
255 //
256 //        URL url = new URL ("http://sourceforge.net/projects/htmlparser");
257 //        lexer = new Lexer (url.openConnection ());
258 //        while (null != (node = lexer.nextNode ()))
259 //            System.out.println (node.toString ());
260 //    }
261 
262     /**
263      * Test the fidelity of the toHtml() method.
264      */
265     public void testFidelity () throws ParserException, IOException  
266     {
267         Lexer lexer;
268         Node node;
269         int position;
270         StringBuffer   buffer;
271         String   string;
272         char[] ref;
273         char[] test;
274 
275         URL   url = new URL   ("http://sourceforge.net/projects/htmlparser");
276         lexer = new Lexer (url.openConnection ());
277         position = 0;
278         buffer = new StringBuffer   (80000);
279         while (null != (node = lexer.nextNode ()))
280         {
281             string = node.toHtml ();
282             if (position != node.elementBegin ())
283                 fail ("non-contiguous" + string);
284             buffer.append (string);
285             position = node.elementEnd ();
286             if (buffer.length () != position)
287                 fail ("text length differed after encountering node " + string);
288         }
289         ref = lexer.getPage ().getText ().toCharArray ();
290         test = new char[buffer.length ()];
291         buffer.getChars (0, buffer.length (), test, 0);
292         assertEquals ("different amounts of text", ref.length, test.length);
293         for (int i = 0; i < ref.length; i++)
294             if (ref[i] != test[i])
295                 fail ("character differs at position " + i + ", expected <" + ref[i] + "> but was <" + test[i] + ">");
296     }
297 
298 //    /**
299 //     * Test the relative speed reading from a string parsing tags too.
300 //     */
301 //    public void testSpeedStringWithoutTags () throws ParserException, IOException
302 //    {
303 //        final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html";
304 //        URL url;
305 //        URLConnection connection;
306 //        Source source;
307 //        StringBuffer buffer;
308 //        int i;
309 //        String html;
310 //
311 //        long old_total;
312 //        long new_total;
313 //        long begin;
314 //        long end;
315 //        StringReader reader;
316 //        NodeReader nodes;
317 //        Parser parser;
318 //        int nodecount;
319 //        Node node;
320 //        int charcount;
321 //
322 //        url = new URL (link);
323 //        connection = url.openConnection ();
324 //        connection.connect ();
325 //        source = new Source (new Stream (connection.getInputStream ()));
326 //        buffer = new StringBuffer (350000);
327 //        while (-1 != (i = source.read ()))
328 //            buffer.append ((char)i);
329 //        source.close ();
330 //        html = buffer.toString ();
331 //        old_total = 0;
332 //        new_total = 0;
333 //        for (i = 0; i < 5; i++)
334 //        {
335 //            System.gc ();
336 //            begin = System.currentTimeMillis ();
337 //            Lexer lexer = new Lexer (html);
338 //            nodecount = 0;
339 //            while (null != (node = lexer.nextNode ()))
340 //                nodecount++;
341 //            end = System.currentTimeMillis ();
342 //            System.out.println ("     lexer: " + (end - begin) + " msec, " + nodecount + " nodes");
343 //            if (0 != i) // the first timing is way different
344 //                new_total += (end - begin);
345 //
346 //            System.gc ();
347 //            begin = System.currentTimeMillis ();
348 //            reader = new StringReader (html);
349 //            nodes =  new NodeReader (new BufferedReader (reader), 350000);
350 //            parser = new Parser (nodes, null);
351 //            nodecount = 0;
352 //            while (null != (node = nodes.readElement ()))
353 //                nodecount++;
354 //            end = System.currentTimeMillis ();
355 //            System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes");
356 //            if (0 != i) // the first timing is way different
357 //                old_total += (end - begin);
358 //        }
359 //        assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total);
360 //        System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster");
361 //    }
362 //
363 //    /**
364 //     * Test the relative speed reading from a string parsing tags too.
365 //     */
366 //    public void testSpeedStringWithTags () throws ParserException, IOException
367 //    {
368 //        final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html";
369 //        URL url;
370 //        URLConnection connection;
371 //        Source source;
372 //        StringBuffer buffer;
373 //        int i;
374 //        String html;
375 //
376 //        long old_total;
377 //        long new_total;
378 //        long begin;
379 //        long end;
380 //        StringReader reader;
381 //        NodeReader nodes;
382 //        Parser parser;
383 //        int nodecount;
384 //        Node node;
385 //        int charcount;
386 //
387 //        url = new URL (link);
388 //        connection = url.openConnection ();
389 //        connection.connect ();
390 //        source = new Source (new Stream (connection.getInputStream ()));
391 //        buffer = new StringBuffer (350000);
392 //        while (-1 != (i = source.read ()))
393 //            buffer.append ((char)i);
394 //        source.close ();
395 //        html = buffer.toString ();
396 //        old_total = 0;
397 //        new_total = 0;
398 //        for (i = 0; i < 5; i++)
399 //        {
400 //            System.gc ();
401 //            begin = System.currentTimeMillis ();
402 //            Lexer lexer = new Lexer (html);
403 //            nodecount = 0;
404 //            while (null != (node = lexer.nextNode ()))
405 //            {
406 //                nodecount++;
407 //                if (node instanceof TagNode)
408 //                    ((TagNode)node).getAttributes ();
409 //            }
410 //            end = System.currentTimeMillis ();
411 //            System.out.println ("     lexer: " + (end - begin) + " msec, " + nodecount + " nodes");
412 //            if (0 != i) // the first timing is way different
413 //                new_total += (end - begin);
414 //
415 //            System.gc ();
416 //            begin = System.currentTimeMillis ();
417 //            reader = new StringReader (html);
418 //            nodes =  new NodeReader (new BufferedReader (reader), 350000);
419 //            parser = new Parser (nodes, null);
420 //            nodecount = 0;
421 //            while (null != (node = nodes.readElement ()))
422 //            {
423 //                nodecount++;
424 //                if (node instanceof Tag)
425 //                    ((Tag)node).getAttributes ();
426 //            }
427 //            end = System.currentTimeMillis ();
428 //            System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes");
429 //            if (0 != i) // the first timing is way different
430 //                old_total += (end - begin);
431 //        }
432 //        assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total);
433 //        System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster");
434 //    }
435 //
436 //    public void testSpeedStreamWithoutTags () throws ParserException, IOException
437 //    {
438 //        final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html";
439 //        URL url;
440 //        URLConnection connection;
441 //        Source source;
442 //        StringBuffer buffer;
443 //        int i;
444 //        String html;
445 //        InputStream stream;
446 //
447 //        long old_total;
448 //        long new_total;
449 //        long begin;
450 //        long end;
451 //        InputStreamReader reader;
452 //        NodeReader nodes;
453 //        Parser parser;
454 //        int nodecount;
455 //        Node node;
456 //        int charcount;
457 //
458 //        url = new URL (link);
459 //        connection = url.openConnection ();
460 //        connection.connect ();
461 //        source = new Source (new Stream (connection.getInputStream ()));
462 //        buffer = new StringBuffer (350000);
463 //        while (-1 != (i = source.read ()))
464 //            buffer.append ((char)i);
465 //        source.close ();
466 //        html = buffer.toString ();
467 //        old_total = 0;
468 //        new_total = 0;
469 //
470 //        for (i = 0; i < 5; i++)
471 //        {
472 //
473 //            System.gc ();
474 //            begin = System.currentTimeMillis ();
475 //            stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET));
476 //            Lexer lexer = new Lexer (new Page (stream, Page.DEFAULT_CHARSET));
477 //            nodecount = 0;
478 //            while (null != (node = lexer.nextNode ()))
479 //                nodecount++;
480 //            end = System.currentTimeMillis ();
481 //            System.out.println ("     lexer: " + (end - begin) + " msec, " + nodecount + " nodes");
482 //            if (0 != i) // the first timing is way different
483 //                new_total += (end - begin);
484 //
485 //            System.gc ();
486 //            begin = System.currentTimeMillis ();
487 //            stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET));
488 //            reader = new InputStreamReader (stream);
489 //            nodes =  new NodeReader (reader, 350000);
490 //            parser = new Parser (nodes, null);
491 //            nodecount = 0;
492 //            while (null != (node = nodes.readElement ()))
493 //                nodecount++;
494 //            end = System.currentTimeMillis ();
495 //            System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes");
496 //            if (0 != i) // the first timing is way different
497 //                old_total += (end - begin);
498 //
499 //        }
500 //        assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total);
501 //        System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster");
502 //    }
503 //
504 //    public void testSpeedStreamWithTags () throws ParserException, IOException
505 //    {
506 //        final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html";
507 //        URL url;
508 //        URLConnection connection;
509 //        Source source;
510 //        StringBuffer buffer;
511 //        int i;
512 //        String html;
513 //        InputStream stream;
514 //
515 //        long old_total;
516 //        long new_total;
517 //        long begin;
518 //        long end;
519 //        InputStreamReader reader;
520 //        NodeReader nodes;
521 //        Parser parser;
522 //        int nodecount;
523 //        Node node;
524 //        int charcount;
525 //
526 //        url = new URL (link);
527 //        connection = url.openConnection ();
528 //        connection.connect ();
529 //        source = new Source (new Stream (connection.getInputStream ()));
530 //        buffer = new StringBuffer (350000);
531 //        while (-1 != (i = source.read ()))
532 //            buffer.append ((char)i);
533 //        source.close ();
534 //        html = buffer.toString ();
535 //        old_total = 0;
536 //        new_total = 0;
537 //
538 //        for (i = 0; i < 5; i++)
539 //        {
540 //
541 //            System.gc ();
542 //            begin = System.currentTimeMillis ();
543 //            stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET));
544 //            Lexer lexer = new Lexer (new Page (stream, Page.DEFAULT_CHARSET));
545 //            nodecount = 0;
546 //            while (null != (node = lexer.nextNode ()))
547 //            {
548 //                nodecount++;
549 //                if (node instanceof TagNode)
550 //                    ((TagNode)node).getAttributes ();
551 //            }
552 //            end = System.currentTimeMillis ();
553 //            System.out.println ("     lexer: " + (end - begin) + " msec, " + nodecount + " nodes");
554 //            if (0 != i) // the first timing is way different
555 //                new_total += (end - begin);
556 //
557 //            System.gc ();
558 //            begin = System.currentTimeMillis ();
559 //            stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET));
560 //            reader = new InputStreamReader (stream);
561 //            nodes =  new NodeReader (reader, 350000);
562 //            parser = new Parser (nodes, null);
563 //            nodecount = 0;
564 //            while (null != (node = nodes.readElement ()))
565 //            {
566 //                nodecount++;
567 //                if (node instanceof Tag)
568 //                    ((Tag)node).getAttributes ();
569 //            }
570 //            end = System.currentTimeMillis ();
571 //            System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes");
572 //            if (0 != i) // the first timing is way different
573 //                old_total += (end - begin);
574 //        }
575 //        assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total);
576 //        System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster");
577 //    }
578 
579 //    public static void main (String[] args) throws ParserException, IOException
580 //    {
581 //        LexerTests tests = new LexerTests ("hallow");
582 //        tests.testSpeedStreamWithTags ();
583 //    }
584 
/**
 * The tag names that checkTagNames() accepts; any other tag name
 * makes that check fail.
 */
static final HashSet mAcceptable;
static
{
    final String[] names =
    {
        "A",
        "BODY",
        "BR",
        "CENTER",
        "FONT",
        "HEAD",
        "HR",
        "HTML",
        "IMG",
        "P",
        "TABLE",
        "TD",
        "TITLE",
        "TR",
        "META",
        "STRONG",
        "FORM",
        "INPUT",
        "!DOCTYPE",
        "TBODY",
        "B",
        "DIV",
        "SCRIPT",
        "NOSCRIPT",
    };

    mAcceptable = new HashSet ();
    for (int i = 0; i < names.length; i++)
        mAcceptable.add (names[i]);
}
614 
615     /**
616      * Test case for bug #789439 Japanese page causes OutOfMemory Exception
617      * No exception is thrown in the current version of the parser,
618      * however, the problem is that ISO-2022-JP (aka JIS) encoding sometimes
619      * causes spurious tags.
620      * The root cause is characters bracketed by [esc]$B and [esc](J (contrary
621      * to what is indicated in the j_s_nightingale analysis of the problem) that
622      * sometimes have an angle bracket (&lt; or 0x3c) embedded in them. These
623      * are taken to be tags by the parser, instead of being considered strings.
624      * <p>
625      * The URL referenced has an ISO-8859-1 encoding (the default), but
626      * Japanese characters intermixed on the page with English, using the JIS
627      * encoding. We detect failure by looking for weird tag names which were
628      * not correctly handled as string nodes.
629      * <p>
630      * Here is a partial dump of the page with escape sequences:
631      * <pre>
632      * 0002420 1b 24 42 3f 79 4a 42 25 47 25 38 25 2b 25 61 43
633      * 0002440 35 44 65 43 44 1b 28 4a 20 77 69 74 68 20 43 61
634      * ..
635      * 0002720 6c 22 3e 4a 53 6b 79 1b 24 42 42 50 31 7e 25 5a
636      * 0002740 21 3c 25 38 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a 3c
637      * ..
638      * 0003060 20 69 1b 24 42 25 62 21 3c 25 49 42 50 31 7e 25
639      * 0003100 5a 21 3c 25 38 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a
640      * ..
641      * 0003220 1b 24 42 25 2d 25 3f 25 5e 25 2f 25 69 24 4e 25
642      * 0003240 5b 21 3c 25 60 25 5a 21 3c 25 38 1b 28 4a 3c 2f
643      * ..
644      * 0003320 6e 65 31 2e 70 6c 22 3e 1b 24 42 3d 60 48 77 43
645      * 0003340 66 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a 2d 2d 2d 2d
646      * ..
647      * 0004400 46 6f 72 75 6d 20 30 30 39 20 28 1b 24 42 3e 21
648      * 0004420 3c 6a 24 4b 31 4a 4a 21 44 2e 24 4a 24 49 1b 28
649      * 0004440 4a 29 3c 2f 41 3e 3c 49 4d 47 20 53 52 43 3d 22
650      * </pre>
651      * <p>
652      * The fix proposed by j_s_nightingale is implemented to swallow JIS
653      * escape sequences in the string parser.
654      * Apparently the fix won't help EUC-JP and Shift-JIS though, so this may
655      * still be a problem.
656      * It's theoretically possible that JIS encoding, or another one,
657      * could be used as attribute names or values within tags as well,
658      * but this is considered improbable and is therefore not handled in
659      * the tag parser state machine.
660      */
661     public void testJIS ()
662         throws ParserException
663     {
664         Parser parser;
665         NodeIterator iterator;
666         
667         parser = new Parser ("http://www.009.com/");
668         try
669         {
670             iterator = parser.elements ();
671             while (iterator.hasMoreNodes ())
672                 checkTagNames (iterator.nextNode ());
673         }
674         catch (EncodingChangeException ece)
675         {
676             parser.reset ();
677             iterator = parser.elements ();
678             while (iterator.hasMoreNodes ())
679                 checkTagNames (iterator.nextNode ());
680         }
681     }
682 
683     /**
684      * Check the tag name for one of the ones expected on the page.
685      * Recursively check the children.
686      */
687     public void checkTagNames (Node node)
688     {
689         Tag tag;
690         String   name;
691         NodeList children;
692         
693         if (node instanceof Tag)
694         {
695             tag = (Tag)node;
696             name = tag.getTagName ();
697             if (!mAcceptable.contains (name))
698                 fail ("unrecognized tag name \"" + name + "\"");
699             children = tag.getChildren ();
700             if (null != children)
701                 for (int i = 0; i < children.size (); i++)
702                     checkTagNames (children.elementAt (i));
703         }
704     }
705 
706     /**
707      * See bug #825820 Words conjoined
708      */
709     public void testConjoined ()
710         throws
711             ParserException
712     {
713         StringBuffer   buffer;
714         NodeIterator iterator;
715         Node node;
716         String   expected;
717 
718         expected = "The Title\nThis is the body.";
719         String   html1 = "<html><title>The Title\n</title>" +
720             "<body>This is <a HREF=\"foo.html\">the body</a>.</body></html>";
721         createParser (html1);
722         buffer = new StringBuffer   ();
723         for (iterator = parser.elements (); iterator.hasMoreNodes (); )
724         {
725             node = iterator.nextNode ();
726             String   text = node.toPlainTextString ();
727             buffer.append (text);
728         }
729         assertStringEquals ("conjoined text", expected, buffer.toString ());
730 
731         String   html2 = "<html><title>The Title</title>\n" +
732             "<body>This is <a HREF=\"foo.html\">the body</a>.</body></html>";
733         createParser (html2);
734         buffer = new StringBuffer   ();
735         for (iterator = parser.elements (); iterator.hasMoreNodes (); )
736         {
737             node = iterator.nextNode ();
738             String   text = node.toPlainTextString ();
739             buffer.append (text);
740         }
741         assertStringEquals ("conjoined text", expected, buffer.toString ());
742         
743         String   html3 = "<html><title>The Title</title>" +
744             "<body>\nThis is <a HREF=\"foo.html\">the body</a>.</body></html>";
745         createParser (html3);
746         buffer = new StringBuffer   ();
747         for (iterator = parser.elements (); iterator.hasMoreNodes (); )
748         {
749             node = iterator.nextNode ();
750             String   text = node.toPlainTextString ();
751             buffer.append (text);
752         }
753         assertStringEquals ("conjoined text", expected, buffer.toString ());
754     }
755 
756     /**
757      * Check for StackOverflow error.
758      */
759     public void testStackOverflow ()
760         throws
761             ParserException
762     {
763         NodeIterator iterator;
764         Node node;
765         String   html;
766                                                                                                                                                         
767         html = "<a href = \"http://test.com\" />";
768         createParser (html);
769         for (iterator = parser.elements (); iterator.hasMoreNodes (); )
770         {
771             node = iterator.nextNode ();
772             String   text = node.toHtml ();
773             assertStringEquals ("no overflow", html, text);
774         }
775         html = "<a HREF=\"http://test.com\"/>";
776         createParser (html);
777         for (iterator = parser.elements (); iterator.hasMoreNodes (); )
778         {
779             node = iterator.nextNode ();
780             String   text = node.toHtml ();
781             assertStringEquals ("no overflow", html, text);
782         }
783         html = "<a href = \"http://test.com\"/>";
784         createParser (html);
785         for (iterator = parser.elements (); iterator.hasMoreNodes (); )
786         {
787             node = iterator.nextNode ();
788             String   text = node.toHtml ();
789             assertStringEquals ("no overflow", html, text);
790         }
791     }
792 
793     /**
794      * See bug #880283 Character "&gt;" erroneously inserted by Lexer
795      */
796     public void testJsp () throws ParserException
797     {
798         String   html;
799         Lexer lexer;
800         Node node;
801         
802         html = "<% out.urlEncode('abc') + \"<br>\" + out.urlEncode('xyz') %>";
803         lexer = new Lexer (html);
804         node = lexer.nextNode ();
805         if (node == null)
806             fail ("too few nodes");
807         else
808             assertStringEquals ("bad html", html, node.toHtml());
809         assertNull ("too many nodes", lexer.nextNode ());
810     }
811     
812     /**
813      * See bug #899413 bug in javascript end detection.
814      */
815     public void testEscapedQuote () throws ParserException
816     {
817         String   string;
818         String   html;
819         Lexer lexer;
820         Node node;
821         
822         string = "\na='\\'';\n";
823         html = string + "</script>";
824         lexer = new Lexer (html);
825         node = lexer.nextNode (true);
826         if (node == null)
827             fail ("too few nodes");
828         else
829             assertStringEquals ("bad string", string, node.toHtml());
830         assertNotNull ("too few nodes", lexer.nextNode (true));
831         assertNull ("too many nodes", lexer.nextNode (true));
832     }
833 
834 }
835 
836
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags