package org.htmlparser.tests.lexerTests;

import java.io.IOException;
import java.net.URL;
import java.util.HashSet;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.tests.ParserTestCase;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Tests for the {@link Lexer}: text, tag, remark and JSP tokenization,
 * line-ending handling, and a few parser-level regressions.
 * <p>
 * {@link #testFidelity} and {@link #testJIS} fetch live pages and therefore
 * need network access.
 */
public class LexerTests extends ParserTestCase
{
    static
    {
        System.setProperty ("org.htmlparser.tests.lexerTests.LexerTests", "LexerTests");
    }

    /**
     * Tag names that {@link #checkTagNames} accepts without failing.
     */
    static final HashSet mAcceptable;
    static
    {
        String[] names =
        {
            "A", "BODY", "BR", "CENTER", "FONT", "HEAD", "HR", "HTML",
            "IMG", "P", "TABLE", "TD", "TITLE", "TR", "META", "STRONG",
            "FORM", "INPUT", "!DOCTYPE", "TBODY", "B", "DIV", "SCRIPT",
            "NOSCRIPT",
        };

        mAcceptable = new HashSet ();
        for (int i = 0; i < names.length; i++)
        {
            mAcceptable.add (names[i]);
        }
    }

    /**
     * Creates the test case.
     * @param name The name passed on to the superclass constructor.
     */
    public LexerTests (String name)
    {
        super (name);
    }

    /**
     * Lexes <code>html</code> and asserts that the first node is a
     * {@link Text} node whose text equals <code>expected</code>.
     * @param html The input handed to the lexer.
     * @param expected The text the first node must contain.
     * @exception ParserException If the lexer fails.
     */
    private void assertFirstText (String html, String expected) throws ParserException
    {
        Lexer lexer = new Lexer (html);
        Text node = (Text)lexer.nextNode ();
        assertEquals ("Text contents wrong", expected, node.getText ());
    }

    /**
     * Lexes <code>html</code> and asserts that the first node is a
     * {@link Remark} whose HTML equals <code>expected</code>.
     * @param html The input handed to the lexer.
     * @param expected The HTML the first node must render to.
     * @exception ParserException If the lexer fails.
     */
    private void assertFirstRemark (String html, String expected) throws ParserException
    {
        Lexer lexer = new Lexer (html);
        Remark node = (Remark)lexer.nextNode ();
        assertEquals ("Tag contents wrong", expected, node.toHtml ());
    }

    /**
     * Parses <code>html</code> with the test-case parser and concatenates
     * the plain text of every top-level node.
     * @param html The page to parse.
     * @return The joined <code>toPlainTextString()</code> results.
     * @exception ParserException If parsing fails.
     */
    private String plainTextOf (String html) throws ParserException
    {
        StringBuffer buffer = new StringBuffer ();

        createParser (html);
        for (NodeIterator iterator = parser.elements (); iterator.hasMoreNodes (); )
        {
            buffer.append (iterator.nextNode ().toPlainTextString ());
        }

        return (buffer.toString ());
    }

    /**
     * Parses <code>html</code> and asserts that every top-level node renders
     * back to exactly <code>html</code>, and that at least one node was
     * produced so the check cannot pass vacuously.
     * @param html The single-tag page to round-trip.
     * @exception ParserException If parsing fails.
     */
    private void assertRoundTrip (String html) throws ParserException
    {
        int count = 0;

        createParser (html);
        for (NodeIterator iterator = parser.elements (); iterator.hasMoreNodes (); )
        {
            Node node = iterator.nextNode ();
            assertStringEquals ("no overflow", html, node.toHtml ());
            count++;
        }
        assertTrue ("no nodes parsed from " + html, 0 < count);
    }

    /**
     * Runs {@link #checkTagNames} over every top-level node of the parser.
     * @param source The parser to iterate.
     * @exception ParserException If parsing fails; this includes an
     * {@link EncodingChangeException}, which the caller handles.
     */
    private void checkAllTagNames (Parser source) throws ParserException
    {
        NodeIterator iterator = source.elements ();
        while (iterator.hasMoreNodes ())
        {
            checkTagNames (iterator.nextNode ());
        }
    }

    /**
     * Plain text with no markup comes back as one text node.
     */
    public void testPureText () throws ParserException
    {
        assertFirstText ("Hello world", "Hello world");
    }

    /**
     * Text containing a bare line feed is preserved verbatim.
     */
    public void testUnixEOL () throws ParserException
    {
        assertFirstText ("Hello\nworld", "Hello\nworld");
    }

    /**
     * Text containing CR LF, or a lone CR, is preserved verbatim.
     */
    public void testDosEOL () throws ParserException
    {
        assertFirstText ("Hello\r\nworld", "Hello\r\nworld");
        assertFirstText ("Hello\rworld", "Hello\rworld");
    }

    /**
     * A line terminator as the very last input character(s) is preserved.
     */
    public void testEOF_EOL () throws ParserException
    {
        assertFirstText ("Hello world\n", "Hello world\n");
        assertFirstText ("Hello world\r", "Hello world\r");
        assertFirstText ("Hello world\r\n", "Hello world\r\n");
    }

    /**
     * A text node ends where a following tag, end tag, JSP tag or remark
     * begins, whatever line terminator precedes it.
     */
    public void testTagStops () throws ParserException
    {
        String[] references =
        {
            "Hello world",
            "Hello world\n",
            "Hello world\r\n",
            "Hello world\r",
        };
        String[] suffixes =
        {
            "<head>",
            "</head>",
            "<%=head%>",
            "<!--head-->",
        };

        for (int i = 0; i < references.length; i++)
        {
            for (int j = 0; j < suffixes.length; j++)
            {
                assertFirstText (references[i] + suffixes[j], references[i]);
            }
        }
    }

    /**
     * A lone tag, and two adjacent tags, render back to their source text.
     */
    public void testPureTag () throws ParserException
    {
        String reference = "<head>";
        String suffix = "<body>";
        Lexer lexer;
        Node node;

        lexer = new Lexer (reference);
        node = lexer.nextNode ();
        assertEquals ("Tag contents wrong", reference, node.toHtml ());

        lexer = new Lexer (reference + suffix);
        node = lexer.nextNode ();
        assertEquals ("Tag contents wrong", reference, node.toHtml ());
        node = lexer.nextNode ();
        assertEquals ("Tag contents wrong", suffix, node.toHtml ());
    }

    /**
     * A tag with single-quoted, unquoted and double-quoted attributes, one
     * of them on a new line, renders back to its source text.
     */
    public void testAttributedTag () throws ParserException
    {
        String reference = "<head lang='en_US' dir=ltr\nprofile=\"http://htmlparser.sourceforge.org/dictionary.html\">";
        Lexer lexer = new Lexer (reference);
        Node node = lexer.nextNode ();
        assertEquals ("Tag contents wrong", reference, node.toHtml ());
    }

    /**
     * Remarks render back to their source text, both alone and when
     * followed by a tag. Covers "--&gt;" and "-- &gt;" terminators and a
     * multi-line remark.
     */
    public void testRemark () throws ParserException
    {
        String[] references =
        {
            "<!-- This is a comment -->",
            "<!-- This is a comment -- >",
            "<!-- This is a\nmultiline comment -->",
        };
        // first pass: remark alone; second pass: remark followed by a tag
        String[] suffixes =
        {
            "",
            "<head>",
        };

        for (int j = 0; j < suffixes.length; j++)
        {
            for (int i = 0; i < references.length; i++)
            {
                assertFirstRemark (references[i] + suffixes[j], references[i]);
            }
        }
    }

    /**
     * Lexes a live page and checks that the nodes are contiguous and that
     * their concatenated HTML reproduces the page text character for
     * character.
     * <p>
     * Needs network access: it opens a connection to sourceforge.net.
     */
    public void testFidelity () throws ParserException, IOException
    {
        URL url = new URL ("http://sourceforge.net/projects/htmlparser");
        Lexer lexer = new Lexer (url.openConnection ());
        StringBuffer buffer = new StringBuffer (80000);
        int position = 0;
        Node node;

        while (null != (node = lexer.nextNode ()))
        {
            String string = node.toHtml ();
            // each node must begin where the previous one ended
            if (position != node.elementBegin ())
            {
                fail ("non-contiguous" + string);
            }
            buffer.append (string);
            position = node.elementEnd ();
            // the accumulated text must match the node's end offset
            if (buffer.length () != position)
            {
                fail ("text length differed after encountering node " + string);
            }
        }

        char[] ref = lexer.getPage ().getText ().toCharArray ();
        char[] test = new char[buffer.length ()];
        buffer.getChars (0, buffer.length (), test, 0);
        assertEquals ("different amounts of text", ref.length, test.length);
        for (int i = 0; i < ref.length; i++)
        {
            if (ref[i] != test[i])
            {
                fail ("character differs at position " + i + ", expected <" + ref[i] + "> but was <" + test[i] + ">");
            }
        }
    }

    /**
     * Parses a live page and checks that every tag name found is in
     * {@link #mAcceptable}. If the parser throws an
     * {@link EncodingChangeException}, the parser is reset and the check is
     * run again.
     * <p>
     * Needs network access: it parses http://www.009.com/.
     */
    public void testJIS ()
        throws ParserException
    {
        Parser parser = new Parser ("http://www.009.com/");

        try
        {
            checkAllTagNames (parser);
        }
        catch (EncodingChangeException ece)
        {
            // start over after the encoding change
            parser.reset ();
            checkAllTagNames (parser);
        }
    }

    /**
     * Fails if <code>node</code>, or any tag nested beneath it, is a tag
     * whose name is not in {@link #mAcceptable}. Non-tag nodes are ignored.
     * @param node The node to check, recursing into its children.
     */
    public void checkTagNames (Node node)
    {
        if (!(node instanceof Tag))
        {
            return;
        }

        Tag tag = (Tag)node;
        String name = tag.getTagName ();
        if (!mAcceptable.contains (name))
        {
            fail ("unrecognized tag name \"" + name + "\"");
        }

        NodeList children = tag.getChildren ();
        if (null != children)
        {
            for (int i = 0; i < children.size (); i++)
            {
                checkTagNames (children.elementAt (i));
            }
        }
    }

    /**
     * The plain text of a page is the same wherever a single newline sits:
     * inside the title, after the title, or at the start of the body.
     */
    public void testConjoined ()
        throws
            ParserException
    {
        String expected = "The Title\nThis is the body.";
        String[] pages =
        {
            "<html><title>The Title\n</title>" +
            "<body>This is <a HREF=\"foo.html\">the body</a>.</body></html>",
            "<html><title>The Title</title>\n" +
            "<body>This is <a HREF=\"foo.html\">the body</a>.</body></html>",
            "<html><title>The Title</title>" +
            "<body>\nThis is <a HREF=\"foo.html\">the body</a>.</body></html>",
        };

        for (int i = 0; i < pages.length; i++)
        {
            assertStringEquals ("conjoined text", expected, plainTextOf (pages[i]));
        }
    }

    /**
     * Self-closing anchor tags, with and without spaces around "=" and
     * before "/&gt;", parse and render back to their source text.
     */
    public void testStackOverflow ()
        throws
            ParserException
    {
        assertRoundTrip ("<a href = \"http://test.com\" />");
        assertRoundTrip ("<a HREF=\"http://test.com\"/>");
        assertRoundTrip ("<a href = \"http://test.com\"/>");
    }

    /**
     * A JSP tag containing quoted strings and an embedded "&lt;br&gt;" is
     * returned as exactly one node that renders back to its source text.
     */
    public void testJsp () throws ParserException
    {
        String html = "<% out.urlEncode('abc') + \"<br>\" + out.urlEncode('xyz') %>";
        Lexer lexer = new Lexer (html);
        Node node = lexer.nextNode ();

        assertNotNull ("too few nodes", node);
        assertStringEquals ("bad html", html, node.toHtml ());
        assertNull ("too many nodes", lexer.nextNode ());
    }

    /**
     * Script-like text with a backslash-escaped quote inside a quoted
     * string, followed by "&lt;/script&gt;", is returned as one node; one
     * more node follows and then the input ends.
     * <p>
     * NOTE(review): all reads use <code>nextNode (true)</code>; presumably
     * the flag turns on quote-aware scanning. Confirm against
     * <code>Lexer.nextNode (boolean)</code>.
     */
    public void testEscapedQuote () throws ParserException
    {
        String string = "\na='\\'';\n";
        String html = string + "</script>";
        Lexer lexer = new Lexer (html);
        Node node = lexer.nextNode (true);

        assertNotNull ("too few nodes", node);
        assertStringEquals ("bad string", string, node.toHtml ());
        assertNotNull ("too few nodes", lexer.nextNode (true));
        assertNull ("too many nodes", lexer.nextNode (true));
    }
}