KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > tests > lexerTests > LexerTests


// HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/LexerTests.java,v $
// $Author: derrickoswald $
// $Date: 2005/03/06 21:46:32 $
// $Revision: 1.23 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.tests.lexerTests;

import java.io.IOException;
import java.net.URL;
import java.util.HashSet;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.tests.ParserTestCase;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
44
45 public class LexerTests extends ParserTestCase
46 {
47
// Sets a system property keyed by this class's fully qualified name.
// NOTE(review): nothing in this file reads the property; presumably the
// surrounding test harness does - confirm before removing.
static
{
    System.setProperty ("org.htmlparser.tests.lexerTests.LexerTests", "LexerTests");
}
52
/**
 * Creates a test case for the Lexer class.
 * @param name The name of the test, handed on to the superclass constructor.
 */
public LexerTests (String name)
{
    super (name);
}
60
61     /**
62      * Test operation without tags.
63      */

64     public void testPureText () throws ParserException
65     {
66         String JavaDoc reference;
67         Lexer lexer;
68         Text node;
69
70         reference = "Hello world";
71         lexer = new Lexer (reference);
72         node = (Text)lexer.nextNode ();
73         assertEquals ("Text contents wrong", reference, node.getText ());
74     }
75
76     /**
77      * Test operation with Unix line endings.
78      */

79     public void testUnixEOL () throws ParserException
80     {
81         String JavaDoc reference;
82         Lexer lexer;
83         Text node;
84
85         reference = "Hello\nworld";
86         lexer = new Lexer (reference);
87         node = (Text)lexer.nextNode ();
88         assertEquals ("Text contents wrong", reference, node.getText ());
89     }
90
91     /**
92      * Test operation with Dos line endings.
93      */

94     public void testDosEOL () throws ParserException
95     {
96         String JavaDoc reference;
97         Lexer lexer;
98         Text node;
99
100         reference = "Hello\r\nworld";
101         lexer = new Lexer (reference);
102         node = (Text)lexer.nextNode ();
103         assertEquals ("Text contents wrong", reference, node.getText ());
104         reference = "Hello\rworld";
105         lexer = new Lexer (reference);
106         node = (Text)lexer.nextNode ();
107         assertEquals ("Text contents wrong", reference, node.getText ());
108     }
109
110     /**
111      * Test operation with line endings near the end of input.
112      */

113     public void testEOF_EOL () throws ParserException
114     {
115         String JavaDoc reference;
116         Lexer lexer;
117         Text node;
118
119         reference = "Hello world\n";
120         lexer = new Lexer (reference);
121         node = (Text)lexer.nextNode ();
122         assertEquals ("Text contents wrong", reference, node.getText ());
123         reference = "Hello world\r";
124         lexer = new Lexer (reference);
125         node = (Text)lexer.nextNode ();
126         assertEquals ("Text contents wrong", reference, node.getText ());
127         reference = "Hello world\r\n";
128         lexer = new Lexer (reference);
129         node = (Text)lexer.nextNode ();
130         assertEquals ("Text contents wrong", reference, node.getText ());
131     }
132
133     /**
134      * Test that tags stop string nodes.
135      */

136     public void testTagStops () throws ParserException
137     {
138         String JavaDoc[] references =
139         {
140             "Hello world",
141             "Hello world\n",
142             "Hello world\r\n",
143             "Hello world\r",
144
145         };
146         String JavaDoc[] suffixes =
147         {
148             "<head>",
149             "</head>",
150             "<%=head%>",
151             "<!--head-->",
152         };
153         Lexer lexer;
154         Text node;
155
156         for (int i = 0; i < references.length; i++)
157         {
158             for (int j = 0; j < suffixes.length; j++)
159             {
160                 lexer = new Lexer (references[i] + suffixes[j]);
161                 node = (Text)lexer.nextNode ();
162                 assertEquals ("Text contents wrong", references[i], node.getText ());
163             }
164         }
165     }
166
167     /**
168      * Test operation with only tags.
169      */

170     public void testPureTag () throws ParserException
171     {
172         String JavaDoc reference;
173         String JavaDoc suffix;
174         Lexer lexer;
175         Node node;
176
177         reference = "<head>";
178         lexer = new Lexer (reference);
179         node = lexer.nextNode ();
180         assertEquals ("Tag contents wrong", reference, node.toHtml ());
181
182         reference = "<head>";
183         suffix = "<body>";
184         lexer = new Lexer (reference + suffix);
185         node = lexer.nextNode ();
186         assertEquals ("Tag contents wrong", reference, node.toHtml ());
187         node = lexer.nextNode ();
188         assertEquals ("Tag contents wrong", suffix, node.toHtml ());
189     }
190
191     /**
192      * Test operation with attributed tags.
193      */

194     public void testAttributedTag () throws ParserException
195     {
196         String JavaDoc reference;
197         Lexer lexer;
198         Node node;
199
200         reference = "<head lang='en_US' dir=ltr\nprofile=\"http://htmlparser.sourceforge.org/dictionary.html\">";
201         lexer = new Lexer (reference);
202         node = lexer.nextNode ();
203         assertEquals ("Tag contents wrong", reference, node.toHtml ());
204     }
205
206     /**
207      * Test operation with comments.
208      */

209     public void testRemark () throws ParserException
210     {
211         String JavaDoc reference;
212         Lexer lexer;
213         Remark node;
214         String JavaDoc suffix;
215
216         reference = "<!-- This is a comment -->";
217         lexer = new Lexer (reference);
218         node = (Remark)lexer.nextNode ();
219         assertEquals ("Tag contents wrong", reference, node.toHtml ());
220
221         reference = "<!-- This is a comment -- >";
222         lexer = new Lexer (reference);
223         node = (Remark)lexer.nextNode ();
224         assertEquals ("Tag contents wrong", reference, node.toHtml ());
225
226         reference = "<!-- This is a\nmultiline comment -->";
227         lexer = new Lexer (reference);
228         node = (Remark)lexer.nextNode ();
229         assertEquals ("Tag contents wrong", reference, node.toHtml ());
230
231         suffix = "<head>";
232         reference = "<!-- This is a comment -->";
233         lexer = new Lexer (reference + suffix);
234         node = (Remark)lexer.nextNode ();
235         assertEquals ("Tag contents wrong", reference, node.toHtml ());
236
237         reference = "<!-- This is a comment -- >";
238         lexer = new Lexer (reference + suffix);
239         node = (Remark)lexer.nextNode ();
240         assertEquals ("Tag contents wrong", reference, node.toHtml ());
241
242         reference = "<!-- This is a\nmultiline comment -->";
243         lexer = new Lexer (reference + suffix);
244         node = (Remark)lexer.nextNode ();
245         assertEquals ("Tag contents wrong", reference, node.toHtml ());
246     }
247
//    /**
//     * Try a real page.
//     */
//    public void testReal () throws ParserException, IOException
//    {
//        Lexer lexer;
//        Node node;
//
//        URL url = new URL ("http://sourceforge.net/projects/htmlparser");
//        lexer = new Lexer (url.openConnection ());
//        while (null != (node = lexer.nextNode ()))
//            System.out.println (node.toString ());
//    }

262     /**
263      * Test the fidelity of the toHtml() method.
264      */

265     public void testFidelity () throws ParserException, IOException JavaDoc
266     {
267         Lexer lexer;
268         Node node;
269         int position;
270         StringBuffer JavaDoc buffer;
271         String JavaDoc string;
272         char[] ref;
273         char[] test;
274
275         URL JavaDoc url = new URL JavaDoc ("http://sourceforge.net/projects/htmlparser");
276         lexer = new Lexer (url.openConnection ());
277         position = 0;
278         buffer = new StringBuffer JavaDoc (80000);
279         while (null != (node = lexer.nextNode ()))
280         {
281             string = node.toHtml ();
282             if (position != node.elementBegin ())
283                 fail ("non-contiguous" + string);
284             buffer.append (string);
285             position = node.elementEnd ();
286             if (buffer.length () != position)
287                 fail ("text length differed after encountering node " + string);
288         }
289         ref = lexer.getPage ().getText ().toCharArray ();
290         test = new char[buffer.length ()];
291         buffer.getChars (0, buffer.length (), test, 0);
292         assertEquals ("different amounts of text", ref.length, test.length);
293         for (int i = 0; i < ref.length; i++)
294             if (ref[i] != test[i])
295                 fail ("character differs at position " + i + ", expected <" + ref[i] + "> but was <" + test[i] + ">");
296     }
297
298 // /**
299
// * Test the relative speed reading from a string parsing tags too.
300
// */
301
// public void testSpeedStringWithoutTags () throws ParserException, IOException
302
// {
303
// final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html";
304
// URL url;
305
// URLConnection connection;
306
// Source source;
307
// StringBuffer buffer;
308
// int i;
309
// String html;
310
//
311
// long old_total;
312
// long new_total;
313
// long begin;
314
// long end;
315
// StringReader reader;
316
// NodeReader nodes;
317
// Parser parser;
318
// int nodecount;
319
// Node node;
320
// int charcount;
321
//
322
// url = new URL (link);
323
// connection = url.openConnection ();
324
// connection.connect ();
325
// source = new Source (new Stream (connection.getInputStream ()));
326
// buffer = new StringBuffer (350000);
327
// while (-1 != (i = source.read ()))
328
// buffer.append ((char)i);
329
// source.close ();
330
// html = buffer.toString ();
331
// old_total = 0;
332
// new_total = 0;
333
// for (i = 0; i < 5; i++)
334
// {
335
// System.gc ();
336
// begin = System.currentTimeMillis ();
337
// Lexer lexer = new Lexer (html);
338
// nodecount = 0;
339
// while (null != (node = lexer.nextNode ()))
340
// nodecount++;
341
// end = System.currentTimeMillis ();
342
// System.out.println (" lexer: " + (end - begin) + " msec, " + nodecount + " nodes");
343
// if (0 != i) // the first timing is way different
344
// new_total += (end - begin);
345
//
346
// System.gc ();
347
// begin = System.currentTimeMillis ();
348
// reader = new StringReader (html);
349
// nodes = new NodeReader (new BufferedReader (reader), 350000);
350
// parser = new Parser (nodes, null);
351
// nodecount = 0;
352
// while (null != (node = nodes.readElement ()))
353
// nodecount++;
354
// end = System.currentTimeMillis ();
355
// System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes");
356
// if (0 != i) // the first timing is way different
357
// old_total += (end - begin);
358
// }
359
// assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total);
360
// System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster");
361
// }
362
//
363
// /**
364
// * Test the relative speed reading from a string parsing tags too.
365
// */
366
// public void testSpeedStringWithTags () throws ParserException, IOException
367
// {
368
// final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html";
369
// URL url;
370
// URLConnection connection;
371
// Source source;
372
// StringBuffer buffer;
373
// int i;
374
// String html;
375
//
376
// long old_total;
377
// long new_total;
378
// long begin;
379
// long end;
380
// StringReader reader;
381
// NodeReader nodes;
382
// Parser parser;
383
// int nodecount;
384
// Node node;
385
// int charcount;
386
//
387
// url = new URL (link);
388
// connection = url.openConnection ();
389
// connection.connect ();
390
// source = new Source (new Stream (connection.getInputStream ()));
391
// buffer = new StringBuffer (350000);
392
// while (-1 != (i = source.read ()))
393
// buffer.append ((char)i);
394
// source.close ();
395
// html = buffer.toString ();
396
// old_total = 0;
397
// new_total = 0;
398
// for (i = 0; i < 5; i++)
399
// {
400
// System.gc ();
401
// begin = System.currentTimeMillis ();
402
// Lexer lexer = new Lexer (html);
403
// nodecount = 0;
404
// while (null != (node = lexer.nextNode ()))
405
// {
406
// nodecount++;
407
// if (node instanceof TagNode)
408
// ((TagNode)node).getAttributes ();
409
// }
410
// end = System.currentTimeMillis ();
411
// System.out.println (" lexer: " + (end - begin) + " msec, " + nodecount + " nodes");
412
// if (0 != i) // the first timing is way different
413
// new_total += (end - begin);
414
//
415
// System.gc ();
416
// begin = System.currentTimeMillis ();
417
// reader = new StringReader (html);
418
// nodes = new NodeReader (new BufferedReader (reader), 350000);
419
// parser = new Parser (nodes, null);
420
// nodecount = 0;
421
// while (null != (node = nodes.readElement ()))
422
// {
423
// nodecount++;
424
// if (node instanceof Tag)
425
// ((Tag)node).getAttributes ();
426
// }
427
// end = System.currentTimeMillis ();
428
// System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes");
429
// if (0 != i) // the first timing is way different
430
// old_total += (end - begin);
431
// }
432
// assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total);
433
// System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster");
434
// }
435
//
436
// public void testSpeedStreamWithoutTags () throws ParserException, IOException
437
// {
438
// final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html";
439
// URL url;
440
// URLConnection connection;
441
// Source source;
442
// StringBuffer buffer;
443
// int i;
444
// String html;
445
// InputStream stream;
446
//
447
// long old_total;
448
// long new_total;
449
// long begin;
450
// long end;
451
// InputStreamReader reader;
452
// NodeReader nodes;
453
// Parser parser;
454
// int nodecount;
455
// Node node;
456
// int charcount;
457
//
458
// url = new URL (link);
459
// connection = url.openConnection ();
460
// connection.connect ();
461
// source = new Source (new Stream (connection.getInputStream ()));
462
// buffer = new StringBuffer (350000);
463
// while (-1 != (i = source.read ()))
464
// buffer.append ((char)i);
465
// source.close ();
466
// html = buffer.toString ();
467
// old_total = 0;
468
// new_total = 0;
469
//
470
// for (i = 0; i < 5; i++)
471
// {
472
//
473
// System.gc ();
474
// begin = System.currentTimeMillis ();
475
// stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET));
476
// Lexer lexer = new Lexer (new Page (stream, Page.DEFAULT_CHARSET));
477
// nodecount = 0;
478
// while (null != (node = lexer.nextNode ()))
479
// nodecount++;
480
// end = System.currentTimeMillis ();
481
// System.out.println (" lexer: " + (end - begin) + " msec, " + nodecount + " nodes");
482
// if (0 != i) // the first timing is way different
483
// new_total += (end - begin);
484
//
485
// System.gc ();
486
// begin = System.currentTimeMillis ();
487
// stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET));
488
// reader = new InputStreamReader (stream);
489
// nodes = new NodeReader (reader, 350000);
490
// parser = new Parser (nodes, null);
491
// nodecount = 0;
492
// while (null != (node = nodes.readElement ()))
493
// nodecount++;
494
// end = System.currentTimeMillis ();
495
// System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes");
496
// if (0 != i) // the first timing is way different
497
// old_total += (end - begin);
498
//
499
// }
500
// assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total);
501
// System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster");
502
// }
503
//
504
// public void testSpeedStreamWithTags () throws ParserException, IOException
505
// {
506
// final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html";
507
// URL url;
508
// URLConnection connection;
509
// Source source;
510
// StringBuffer buffer;
511
// int i;
512
// String html;
513
// InputStream stream;
514
//
515
// long old_total;
516
// long new_total;
517
// long begin;
518
// long end;
519
// InputStreamReader reader;
520
// NodeReader nodes;
521
// Parser parser;
522
// int nodecount;
523
// Node node;
524
// int charcount;
525
//
526
// url = new URL (link);
527
// connection = url.openConnection ();
528
// connection.connect ();
529
// source = new Source (new Stream (connection.getInputStream ()));
530
// buffer = new StringBuffer (350000);
531
// while (-1 != (i = source.read ()))
532
// buffer.append ((char)i);
533
// source.close ();
534
// html = buffer.toString ();
535
// old_total = 0;
536
// new_total = 0;
537
//
538
// for (i = 0; i < 5; i++)
539
// {
540
//
541
// System.gc ();
542
// begin = System.currentTimeMillis ();
543
// stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET));
544
// Lexer lexer = new Lexer (new Page (stream, Page.DEFAULT_CHARSET));
545
// nodecount = 0;
546
// while (null != (node = lexer.nextNode ()))
547
// {
548
// nodecount++;
549
// if (node instanceof TagNode)
550
// ((TagNode)node).getAttributes ();
551
// }
552
// end = System.currentTimeMillis ();
553
// System.out.println (" lexer: " + (end - begin) + " msec, " + nodecount + " nodes");
554
// if (0 != i) // the first timing is way different
555
// new_total += (end - begin);
556
//
557
// System.gc ();
558
// begin = System.currentTimeMillis ();
559
// stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET));
560
// reader = new InputStreamReader (stream);
561
// nodes = new NodeReader (reader, 350000);
562
// parser = new Parser (nodes, null);
563
// nodecount = 0;
564
// while (null != (node = nodes.readElement ()))
565
// {
566
// nodecount++;
567
// if (node instanceof Tag)
568
// ((Tag)node).getAttributes ();
569
// }
570
// end = System.currentTimeMillis ();
571
// System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes");
572
// if (0 != i) // the first timing is way different
573
// old_total += (end - begin);
574
// }
575
// assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total);
576
// System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster");
577
// }
578

//    public static void main (String[] args) throws ParserException, IOException
//    {
//        LexerTests tests = new LexerTests ("hallow");
//        tests.testSpeedStreamWithTags ();
//    }

585     static final HashSet JavaDoc mAcceptable;
586     static
587     {
588         mAcceptable = new HashSet JavaDoc ();
589         mAcceptable.add ("A");
590         mAcceptable.add ("BODY");
591         mAcceptable.add ("BR");
592         mAcceptable.add ("CENTER");
593         mAcceptable.add ("FONT");
594         mAcceptable.add ("HEAD");
595         mAcceptable.add ("HR");
596         mAcceptable.add ("HTML");
597         mAcceptable.add ("IMG");
598         mAcceptable.add ("P");
599         mAcceptable.add ("TABLE");
600         mAcceptable.add ("TD");
601         mAcceptable.add ("TITLE");
602         mAcceptable.add ("TR");
603         mAcceptable.add ("META");
604         mAcceptable.add ("STRONG");
605         mAcceptable.add ("FORM");
606         mAcceptable.add ("INPUT");
607         mAcceptable.add ("!DOCTYPE");
608         mAcceptable.add ("TBODY");
609         mAcceptable.add ("B");
610         mAcceptable.add ("DIV");
611         mAcceptable.add ("SCRIPT");
612         mAcceptable.add ("NOSCRIPT");
613     }
614
615     /**
616      * Test case for bug #789439 Japanese page causes OutOfMemory Exception
617      * No exception is thrown in the current version of the parser,
618      * however, the problem is that ISO-2022-JP (aka JIS) encoding sometimes
619      * causes spurious tags.
620      * The root cause is characters bracketed by [esc]$B and [esc](J (contrary
621      * to what is indicated in then j_s_nightingale analysis of the problem) that
622      * sometimes have an angle bracket (&lt; or 0x3c) embedded in them. These
623      * are taken to be tags by the parser, instead of being considered strings.
624      * <p>
625      * The URL refrenced has an ISO-8859-1 encoding (the default), but
626      * Japanese characters intermixed on the page with English, using the JIS
627      * encoding. We detect failure by looking for weird tag names which were
628      * not correctly handled as string nodes.
629      * <p>
630      * Here is a partial dump of the page with escape sequences:
631      * <pre>
632      * 0002420 1b 24 42 3f 79 4a 42 25 47 25 38 25 2b 25 61 43
633      * 0002440 35 44 65 43 44 1b 28 4a 20 77 69 74 68 20 43 61
634      * ..
635      * 0002720 6c 22 3e 4a 53 6b 79 1b 24 42 42 50 31 7e 25 5a
636      * 0002740 21 3c 25 38 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a 3c
637      * ..
638      * 0003060 20 69 1b 24 42 25 62 21 3c 25 49 42 50 31 7e 25
639      * 0003100 5a 21 3c 25 38 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a
640      * ..
641      * 0003220 1b 24 42 25 2d 25 3f 25 5e 25 2f 25 69 24 4e 25
642      * 0003240 5b 21 3c 25 60 25 5a 21 3c 25 38 1b 28 4a 3c 2f
643      * ..
644      * 0003320 6e 65 31 2e 70 6c 22 3e 1b 24 42 3d 60 48 77 43
645      * 0003340 66 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a 2d 2d 2d 2d
646      * ..
647      * 0004400 46 6f 72 75 6d 20 30 30 39 20 28 1b 24 42 3e 21
648      * 0004420 3c 6a 24 4b 31 4a 4a 21 44 2e 24 4a 24 49 1b 28
649      * 0004440 4a 29 3c 2f 41 3e 3c 49 4d 47 20 53 52 43 3d 22
650      * </pre>
651      * <p>
652      * The fix proposed by j_s_nightingale is implemented to swallow JIS
653      * escape sequences in the string parser.
654      * Apparently the fix won't help EUC-JP and Shift-JIS though, so this may
655      * still be a problem.
656      * It's theoretically possible that JIS encoding, or another one,
657      * could be used as attribute names or values within tags as well,
658      * but this is considered improbable and is therefore not handled in
659      * the tag parser state machine.
660      */

661     public void testJIS ()
662         throws ParserException
663     {
664         Parser parser;
665         NodeIterator iterator;
666         
667         parser = new Parser ("http://www.009.com/");
668         try
669         {
670             iterator = parser.elements ();
671             while (iterator.hasMoreNodes ())
672                 checkTagNames (iterator.nextNode ());
673         }
674         catch (EncodingChangeException ece)
675         {
676             parser.reset ();
677             iterator = parser.elements ();
678             while (iterator.hasMoreNodes ())
679                 checkTagNames (iterator.nextNode ());
680         }
681     }
682
683     /**
684      * Check the tag name for one of the ones expected on the page.
685      * Recursively check the children.
686      */

687     public void checkTagNames (Node node)
688     {
689         Tag tag;
690         String JavaDoc name;
691         NodeList children;
692         
693         if (node instanceof Tag)
694         {
695             tag = (Tag)node;
696             name = tag.getTagName ();
697             if (!mAcceptable.contains (name))
698                 fail ("unrecognized tag name \"" + name + "\"");
699             children = tag.getChildren ();
700             if (null != children)
701                 for (int i = 0; i < children.size (); i++)
702                     checkTagNames (children.elementAt (i));
703         }
704     }
705
706     /**
707      * See bug #825820 Words conjoined
708      */

709     public void testConjoined ()
710         throws
711             ParserException
712     {
713         StringBuffer JavaDoc buffer;
714         NodeIterator iterator;
715         Node node;
716         String JavaDoc expected;
717
718         expected = "The Title\nThis is the body.";
719         String JavaDoc html1 = "<html><title>The Title\n</title>" +
720             "<body>This is <a HREF=\"foo.html\">the body</a>.</body></html>";
721         createParser (html1);
722         buffer = new StringBuffer JavaDoc ();
723         for (iterator = parser.elements (); iterator.hasMoreNodes (); )
724         {
725             node = iterator.nextNode ();
726             String JavaDoc text = node.toPlainTextString ();
727             buffer.append (text);
728         }
729         assertStringEquals ("conjoined text", expected, buffer.toString ());
730
731         String JavaDoc html2 = "<html><title>The Title</title>\n" +
732             "<body>This is <a HREF=\"foo.html\">the body</a>.</body></html>";
733         createParser (html2);
734         buffer = new StringBuffer JavaDoc ();
735         for (iterator = parser.elements (); iterator.hasMoreNodes (); )
736         {
737             node = iterator.nextNode ();
738             String JavaDoc text = node.toPlainTextString ();
739             buffer.append (text);
740         }
741         assertStringEquals ("conjoined text", expected, buffer.toString ());
742         
743         String JavaDoc html3 = "<html><title>The Title</title>" +
744             "<body>\nThis is <a HREF=\"foo.html\">the body</a>.</body></html>";
745         createParser (html3);
746         buffer = new StringBuffer JavaDoc ();
747         for (iterator = parser.elements (); iterator.hasMoreNodes (); )
748         {
749             node = iterator.nextNode ();
750             String JavaDoc text = node.toPlainTextString ();
751             buffer.append (text);
752         }
753         assertStringEquals ("conjoined text", expected, buffer.toString ());
754     }
755
756     /**
757      * Check for StackOverflow error.
758      */

759     public void testStackOverflow ()
760         throws
761             ParserException
762     {
763         NodeIterator iterator;
764         Node node;
765         String JavaDoc html;
766                                                                                                                                                         
767         html = "<a href = \"http://test.com\" />";
768         createParser (html);
769         for (iterator = parser.elements (); iterator.hasMoreNodes (); )
770         {
771             node = iterator.nextNode ();
772             String JavaDoc text = node.toHtml ();
773             assertStringEquals ("no overflow", html, text);
774         }
775         html = "<a HREF=\"http://test.com\"/>";
776         createParser (html);
777         for (iterator = parser.elements (); iterator.hasMoreNodes (); )
778         {
779             node = iterator.nextNode ();
780             String JavaDoc text = node.toHtml ();
781             assertStringEquals ("no overflow", html, text);
782         }
783         html = "<a href = \"http://test.com\"/>";
784         createParser (html);
785         for (iterator = parser.elements (); iterator.hasMoreNodes (); )
786         {
787             node = iterator.nextNode ();
788             String JavaDoc text = node.toHtml ();
789             assertStringEquals ("no overflow", html, text);
790         }
791     }
792
793     /**
794      * See bug #880283 Character "&gt;" erroneously inserted by Lexer
795      */

796     public void testJsp () throws ParserException
797     {
798         String JavaDoc html;
799         Lexer lexer;
800         Node node;
801         
802         html = "<% out.urlEncode('abc') + \"<br>\" + out.urlEncode('xyz') %>";
803         lexer = new Lexer (html);
804         node = lexer.nextNode ();
805         if (node == null)
806             fail ("too few nodes");
807         else
808             assertStringEquals ("bad html", html, node.toHtml());
809         assertNull ("too many nodes", lexer.nextNode ());
810     }
811     
812     /**
813      * See bug #899413 bug in javascript end detection.
814      */

815     public void testEscapedQuote () throws ParserException
816     {
817         String JavaDoc string;
818         String JavaDoc html;
819         Lexer lexer;
820         Node node;
821         
822         string = "\na='\\'';\n";
823         html = string + "</script>";
824         lexer = new Lexer (html);
825         node = lexer.nextNode (true);
826         if (node == null)
827             fail ("too few nodes");
828         else
829             assertStringEquals ("bad string", string, node.toHtml());
830         assertNotNull ("too few nodes", lexer.nextNode (true));
831         assertNull ("too many nodes", lexer.nextNode (true));
832     }
833
834 }
835
836
Popular Tags