package org.htmlparser.tests.utilTests;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.lang.reflect.Field;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Random;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tests.ParserTestCase;
import org.htmlparser.util.CharacterReference;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.Translate;
import org.htmlparser.util.sort.Sort;

/**
 * Tests for {@link Translate}: decoding and encoding of character entity
 * references and numeric character references.
 *
 * <p>The reference table used by the exhaustive tests is generated at run
 * time by {@link Generate} from the W3C entity list, compiled, and loaded
 * through {@link SimpleClassLoader}.
 *
 * <p>NOTE(review): this file was recovered from an HTML listing in which the
 * character references inside string literals had been decoded. They were
 * restored from the surrounding code (skip offsets, test names); the exact
 * spelling of the hexadecimal variants (case of 'x', leading zero) should be
 * confirmed against the canonical source.
 */
public class CharacterTranslationTest
    extends
        ParserTestCase
{
    static
    {
        System.setProperty ("org.htmlparser.tests.utilTests.CharacterTranslationTest", "CharacterTranslationTest");
    }

    /** Message prefix for named entity tests. */
    private static final String ENTITY = "character entity reference";

    /** Message prefix for numeric (decimal or hexadecimal) reference tests. */
    private static final String NUMERIC = "numeric character reference";

    /** Message prefix for named entity tests lacking the terminating semicolon. */
    private static final String ENTITY_NO_SEMI = "character entity reference without a semicolon";

    /** Message prefix for numeric reference tests lacking the terminating semicolon. */
    private static final String NUMERIC_NO_SEMI = "numeric character reference without a semicolon";

    /**
     * The list of references, generated once by {@link #getReferences} and cached.
     */
    protected static CharacterReference[] mReferences;

    public CharacterTranslationTest (String name)
    {
        super (name);
    }

    /**
     * Class loader that falls back to reading <code>.class</code> files
     * from a single root directory when the system loader cannot find a class.
     */
    class SimpleClassLoader extends ClassLoader
    {
        /** Directory (with trailing separator) searched for class files. */
        String mRoot;

        /**
         * @param root The directory to load classes from; a trailing
         * file separator is added if missing.
         */
        public SimpleClassLoader (String root)
        {
            if (!root.endsWith (File.separator))
                root += File.separator;
            mRoot = root;
        }

        /**
         * Load and resolve the named class.
         * @param className The name of the class to load.
         * @return The loaded class.
         * @exception ClassNotFoundException If the class is found neither
         * by the system loader nor in the root directory.
         */
        public Class loadClass (String className)
            throws
                ClassNotFoundException
        {
            return (loadClass (className, true));
        }

        /**
         * Load the named class, trying the system loader first.
         * @param className The name of the class to load.
         * @param resolveIt If <code>true</code>, a class defined from the
         * root directory is also resolved.
         * @return The loaded class.
         * @exception ClassNotFoundException If the class file cannot be read.
         */
        public synchronized Class loadClass (String className, boolean resolveIt)
            throws
                ClassNotFoundException
        {
            Class ret;

            try
            {
                ret = super.findSystemClass (className);
            }
            catch (ClassNotFoundException e)
            {
                try
                {
                    byte[] data = readFully (mRoot + className + ".class");
                    ret = defineClass (className, data, 0, data.length);
                    if (null == ret)
                        throw new ClassFormatError ();
                    if (resolveIt)
                        resolveClass (ret);
                }
                catch (IOException ioe)
                {
                    // keep the class name and the I/O cause for diagnosis
                    throw new ClassNotFoundException (className, ioe);
                }
            }

            return (ret);
        }

        /**
         * Read an entire file into memory.
         * A single read() is not guaranteed to fill the buffer and
         * available() is only an estimate, so loop until end of file.
         * @param file The path of the file to read.
         * @return The contents of the file.
         * @exception IOException If the file cannot be opened or read.
         */
        private byte[] readFully (String file)
            throws
                IOException
        {
            FileInputStream in = new FileInputStream (file);
            try
            {
                ByteArrayOutputStream bytes = new ByteArrayOutputStream ();
                byte[] buffer = new byte[4096];
                int count;
                while (-1 != (count = in.read (buffer)))
                    bytes.write (buffer, 0, count);
                return (bytes.toByteArray ());
            }
            finally
            {
                in.close ();
            }
        }
    }

    /**
     * Generates the body of a Java array initializer of
     * {@link CharacterReference} objects from the SGML entity declarations
     * published on the W3C entities page.
     */
    public class Generate
    {
        /** The parser positioned on the W3C entities page. */
        protected Parser mParser;

        /** The platform line separator. */
        protected String nl = System.getProperty ("line.separator", "\n");

        /**
         * Create a generator reading the W3C entities page.
         * @exception ParserException If the parser cannot be created.
         */
        public Generate ()
            throws ParserException
        {
            mParser = new Parser ("http://www.w3.org/TR/REC-html40/sgml/entities.html");
        }

        /**
         * Replace the handful of character references used as markup on the
         * entities page by the characters they stand for.
         * <code>&amp;divide;</code> and <code>&amp;copy;</code> are left
         * untouched; any other reference is reported on standard output and
         * left untouched.
         * @param string The text to translate.
         * @return The translated text.
         */
        public String translate (String string)
        {
            int index;
            int amp;
            StringBuffer ret;

            ret = new StringBuffer (4096);

            index = 0;
            while ((index < string.length ()) && (-1 != (amp = string.indexOf ('&', index))))
            {
                // copy the text preceding the ampersand
                ret.append (string.substring (index, amp));
                if (string.startsWith ("&nbsp;", amp))
                {
                    ret.append (" ");
                    index = amp + 6;
                }
                else if (string.startsWith ("&lt;", amp))
                {
                    ret.append ("<");
                    index = amp + 4;
                }
                else if (string.startsWith ("&gt;", amp))
                {
                    ret.append (">");
                    index = amp + 4;
                }
                else if (string.startsWith ("&amp;", amp))
                {
                    ret.append ("&");
                    index = amp + 5;
                }
                else if (string.startsWith ("&quote;", amp))
                {
                    ret.append ("\"");
                    index = amp + 7;
                }
                else if (string.startsWith ("&divide;", amp) || string.startsWith ("&copy;", amp))
                {
                    // pass these through: emit the ampersand, the rest follows as text
                    ret.append ("&");
                    index = amp + 1;
                }
                else
                {
                    // clamp the excerpt so an ampersand near the end cannot overrun the string
                    System.out.println ("unknown special character starting with "
                        + string.substring (amp, Math.min (amp + 7, string.length ())));
                    ret.append ("&");
                    index = amp + 1;
                }
            }
            ret.append (string.substring (index));

            return (ret.toString ());
        }

        /**
         * Recursively collect the text of a node into a buffer.
         * Text nodes contribute their text, links their link text, BR and P
         * tags a line separator, other tags the text of their children.
         * Remarks are ignored; any other node is printed to standard output.
         * @param node The node to collect from.
         * @param buffer The buffer to append to.
         */
        public void gather (Node node, StringBuffer buffer)
        {
            if (node instanceof Text)
                buffer.append (((Text)node).getText ());
            else if (node instanceof LinkTag)
                buffer.append (((LinkTag)node).getLinkText ());
            else if (node instanceof Tag)
            {
                Tag tag = (Tag)node;
                String name = tag.getTagName ();
                if (name.equals ("BR") || name.equals ("P"))
                    buffer.append (nl);
                else
                {
                    NodeList children = tag.getChildren ();
                    if (null != children)
                        for (int i = 0; i < children.size (); i++)
                            gather (children.elementAt (i), buffer);
                }
            }
            else if (node instanceof Remark)
            {
                // remarks carry no entity text
            }
            else
            {
                System.out.println ();
                System.out.println (node.toString ());
            }
        }

        /**
         * Find the next space or line separator.
         * @param string The string to search.
         * @param index The index to start searching from.
         * @return The index of the first space or line separator at or after
         * <code>index</code>, or -1 if there is neither.
         */
        public int indexOfWhitespace (String string, int index)
        {
            int space = string.indexOf (" ", index);
            int cr = string.indexOf (nl, index);

            if (-1 == space)
                return (cr);
            if (-1 == cr)
                return (space);
            return (Math.min (space, cr));
        }

        /**
         * Normalize a declaration comment: drop a leading "-- " and collapse
         * each run of whitespace to a single space.
         * @param string The comment text.
         * @return The packed comment.
         */
        public String pack (String string)
        {
            int index;
            int spaces;
            StringBuffer ret;

            ret = new StringBuffer (string.length ());

            if (string.startsWith ("-- "))
                string = string.substring (3);
            index = 0;
            while ((index < string.length ()) && (-1 != (spaces = indexOfWhitespace (string, index))))
            {
                ret.append (string.substring (index, spaces));
                ret.append (" ");
                while ((spaces < string.length ()) && Character.isWhitespace (string.charAt (spaces)))
                    spaces++;
                index = spaces;
            }
            if (index < string.length ())
                ret.append (string.substring (index));

            return (ret.toString ());
        }

        /**
         * Turn a block of comment text into Java line comments.
         * NOTE(review): the delimiter literal is kept as it appears in the
         * recovered listing (a single space); the listing collapsed runs of
         * whitespace, so the original may have split on a wider run - confirm.
         * @param string The comment text.
         * @return The text as a sequence of "//" comment lines.
         */
        public String pretty (String string)
        {
            int index;
            int spaces;
            StringBuffer ret;

            ret = new StringBuffer (string.length ());

            index = 0;
            while ((index < string.length ()) && (-1 != (spaces = string.indexOf (" ", index))))
            {
                String piece = string.substring (index, spaces);
                ret.append (" // " + piece);
                if (!piece.endsWith (nl))
                    ret.append (nl);
                while ((spaces < string.length ()) && Character.isWhitespace (string.charAt (spaces)))
                    spaces++;
                index = spaces;
            }
            if (index < string.length ())
                ret.append (" // " + string.substring (index));

            return (ret.toString ());
        }

        /**
         * Left-pad a string.
         * @param string The string to pad.
         * @param character The padding character.
         * @param length The minimum length of the result.
         * @return The string, prefixed with enough padding characters to
         * reach <code>length</code>.
         */
        public String pad (String string, char character, int length)
        {
            StringBuffer ret;

            ret = new StringBuffer (Math.max (length, string.length ()));
            for (int i = string.length (); i < length; i++)
                ret.append (character);
            ret.append (string);

            return (ret.toString ());
        }

        /**
         * Convert a quoted decimal numeric character reference into a Java
         * character literal with a unicode escape.
         * @param string The text to convert, of the form "&amp;#nnn;" including
         * the double quotes.
         * @return The character literal, or the input unchanged if it does
         * not have the expected form. If the digits cannot be parsed, the
         * text between the delimiters is returned.
         */
        public String unicode (String string)
        {
            if (string.startsWith ("\"&#") && string.endsWith (";\""))
            {
                string = string.substring (3, string.length () - 2);
                try
                {
                    int code = Integer.parseInt (string);
                    string = "'\\u" + pad (Integer.toHexString (code), '0', 4) + "'";
                }
                catch (NumberFormatException e)
                {
                    e.printStackTrace ();
                }
            }

            return (string);
        }

        /**
         * Emit Java source for one SGML construct.
         * Comments become line comments; CDATA entity declarations become
         * <code>new CharacterReference (...)</code> initializers; anything
         * not understood is printed as is (what remains of it).
         * @param string The construct, from the opening "&lt;" through the
         * closing "--&gt;".
         * @param out Where to print the generated source.
         */
        public void extract (String string, PrintWriter out)
        {
            int space;
            String token;
            String code;

            // the length test protects against the degenerate "<!-->"
            if (string.startsWith ("<!--") && (7 <= string.length ()))
                out.println (pretty (string.substring (4, string.length () - 3).trim ()));
            else if (string.startsWith ("<!ENTITY"))
            {
                string = string.substring (8, string.length () - 3).trim ();
                if (-1 != (space = string.indexOf (" ")))
                {
                    token = string.substring (0, space);
                    string = string.substring (space).trim ();
                    if (string.startsWith ("CDATA"))
                    {
                        string = string.substring (5).trim ();
                        if (-1 != (space = string.indexOf (" ")))
                        {
                            code = string.substring (0, space).trim ();
                            code = unicode (code);
                            string = string.substring (space).trim ();
                            out.println (
                                " new CharacterReference (\"" + token + "\","
                                // pad so the character literals line up
                                + pad (code, ' ', code.length () + 9 - token.length ()) + "),"
                                + " // "
                                + pack (string));
                        }
                        else
                            out.println (string);
                    }
                    else
                        out.println (string);
                }
                else
                    out.println (string);
            }
            else
                out.println (string);
        }

        /**
         * Split text into SGML constructs and extract each one.
         * A construct runs from a "&lt;" to the next "--&gt;".
         * @param string The text to split.
         * @param out Where to print the generated source.
         */
        public void sgml (String string, PrintWriter out)
        {
            int index;
            int begin;
            int end;

            index = 0;
            while (-1 != (begin = string.indexOf ("<", index)))
            {
                if (-1 != (end = string.indexOf ("-->", begin)))
                {
                    extract (string.substring (begin, end + 3), out);
                    index = end + 3;
                }
                else
                    index = begin + 1;
            }
        }

        /**
         * Parse the entities page and print the generated initializers.
         * @param out Where to print the generated source.
         * @exception ParserException If the page cannot be parsed.
         */
        public void parse (PrintWriter out)
            throws
                ParserException
        {
            StringBuffer buffer = new StringBuffer (4096);

            for (NodeIterator e = mParser.elements (); e.hasMoreNodes ();)
                gather (e.nextNode (), buffer);

            sgml (translate (buffer.toString ()), out);
        }
    }

    /**
     * Get the reference table, building it on first use.
     * The table is generated as Java source in the user's home directory,
     * compiled with javac, loaded, sorted and cached in {@link #mReferences}.
     * Any failure along the way fails the current test.
     * @return The sorted list of character references.
     */
    public CharacterReference[] getReferences ()
    {
        final String class_name = "CharacterEntityReferenceList";
        String paths;
        String path;
        String source;
        PrintWriter out;
        Generate generate;
        SimpleClassLoader loader;
        Class hello;
        Field field;
        CharacterReference[] ret;

        ret = mReferences;
        if (null == ret)
        {
            paths = System.getProperty ("java.class.path");
            path = System.getProperty ("user.home");
            if (!path.endsWith (File.separator))
                path += File.separator;
            source = path + class_name + ".java";
            try
            {
                generate = new Generate ();
                out = new PrintWriter (new FileWriter (source));
                try
                {
                    out.println ("import org.htmlparser.util.CharacterReference;");
                    out.println ();
                    out.println ("/** Generated by " + this.getClass ().getName () + " **/");
                    out.println ("public class " + class_name);
                    out.println ("{");
                    out.println (" /**");
                    out.println (" * Table mapping character to entity reference.");
                    out.println (" */");
                    out.println (" public static final CharacterReference[] mCharacterReferences =");
                    out.println (" {");
                    generate.parse (out);
                    out.println (" };");
                    out.println ("}");
                }
                finally
                {
                    // release the file even when parsing fails part way
                    out.close ();
                }
                if (0 == com.sun.tools.javac.Main.compile (new String [] {"-classpath", paths, source}))
                {
                    try
                    {
                        loader = new SimpleClassLoader (path);
                        hello = loader.loadClass (class_name);
                        try
                        {
                            field = hello.getField ("mCharacterReferences");
                            // static field, so no instance is needed
                            ret = (CharacterReference[])field.get (null);
                            Sort.QuickSort (ret);
                        }
                        catch (IllegalAccessException iae)
                        {
                            fail ("references not accessible");
                        }
                        catch (NoSuchFieldException nsfe)
                        {
                            fail ("references not found");
                        }
                    }
                    catch (ClassNotFoundException cnfe)
                    {
                        fail ("couldn't load class");
                    }
                    finally
                    {
                        File classfile;

                        classfile = new File (path + class_name + ".class");
                        classfile.delete ();
                    }
                }
                else
                    fail ("couldn't compile class");
                mReferences = ret;
            }
            catch (IOException ioe)
            {
                fail ("couldn't write class");
            }
            catch (ParserException pe)
            {
                fail ("couldn't parse w3.org entities list");
            }
        }

        return (ret);
    }

    /**
     * Assert that a reference at the start of a string decodes to the division sign.
     * @param what The kind of reference, used as the failure message prefix.
     * @param reference The reference under test.
     */
    private void checkInitial (String what, String reference)
    {
        assertEquals (
            what + " at start of string doesn't work",
            "\u00f7 is the division sign.",
            Translate.decode (reference + " is the division sign."));
    }

    /**
     * Assert that a reference at the end of a string decodes to the division sign.
     * @param what The kind of reference, used as the failure message prefix.
     * @param reference The reference under test.
     */
    private void checkFinal (String what, String reference)
    {
        assertEquals (
            what + " at end of string doesn't work",
            "The division sign (\u00f7) is \u00f7",
            Translate.decode ("The division sign (\u00f7) is " + reference));
    }

    public void testInitialCharacterEntityReference ()
    {
        checkInitial (ENTITY, "&divide;");
    }

    public void testInitialNumericCharacterReference1 ()
    {
        checkInitial (NUMERIC, "&#247;");
    }

    public void testInitialNumericCharacterReference2 ()
    {
        checkInitial (NUMERIC, "&#0247;");
    }

    public void testInitialHexNumericCharacterReference1 ()
    {
        checkInitial (NUMERIC, "&#xf7;");
    }

    public void testInitialHexNumericCharacterReference2 ()
    {
        checkInitial (NUMERIC, "&#xF7;");
    }

    public void testInitialHexNumericCharacterReference3 ()
    {
        checkInitial (NUMERIC, "&#x0f7;");
    }

    public void testInitialHexNumericCharacterReference4 ()
    {
        checkInitial (NUMERIC, "&#x0F7;");
    }

    public void testInitialHexNumericCharacterReference5 ()
    {
        checkInitial (NUMERIC, "&#Xf7;");
    }

    public void testInitialHexNumericCharacterReference6 ()
    {
        checkInitial (NUMERIC, "&#XF7;");
    }

    public void testInitialHexNumericCharacterReference7 ()
    {
        checkInitial (NUMERIC, "&#X0f7;");
    }

    public void testInitialHexNumericCharacterReference8 ()
    {
        checkInitial (NUMERIC, "&#X0F7;");
    }

    public void testInitialCharacterEntityReferenceWithoutSemi ()
    {
        checkInitial (ENTITY_NO_SEMI, "&divide");
    }

    public void testInitialNumericCharacterReferenceWithoutSemi ()
    {
        checkInitial (NUMERIC_NO_SEMI, "&#247");
    }

    public void testInitialHexNumericCharacterReferenceWithoutSemi1 ()
    {
        checkInitial (NUMERIC_NO_SEMI, "&#xf7");
    }

    public void testInitialHexNumericCharacterReferenceWithoutSemi2 ()
    {
        checkInitial (NUMERIC_NO_SEMI, "&#xF7");
    }

    public void testInitialHexNumericCharacterReferenceWithoutSemi3 ()
    {
        checkInitial (NUMERIC_NO_SEMI, "&#x0f7");
    }

    public void testInitialHexNumericCharacterReferenceWithoutSemi4 ()
    {
        checkInitial (NUMERIC_NO_SEMI, "&#x0F7");
    }

    public void testInitialHexNumericCharacterReferenceWithoutSemi5 ()
    {
        checkInitial (NUMERIC_NO_SEMI, "&#Xf7");
    }

    public void testInitialHexNumericCharacterReferenceWithoutSemi6 ()
    {
        checkInitial (NUMERIC_NO_SEMI, "&#XF7");
    }

    public void testInitialHexNumericCharacterReferenceWithoutSemi7 ()
    {
        checkInitial (NUMERIC_NO_SEMI, "&#X0f7");
    }

    public void testInitialHexNumericCharacterReferenceWithoutSemi8 ()
    {
        checkInitial (NUMERIC_NO_SEMI, "&#X0F7");
    }

    public void testFinalCharacterEntityReference ()
    {
        checkFinal (ENTITY, "&divide;");
    }

    public void testFinalNumericCharacterReference ()
    {
        checkFinal (NUMERIC, "&#247;");
    }

    public void testFinalHexNumericCharacterReference1 ()
    {
        checkFinal (NUMERIC, "&#xf7;");
    }

    public void testFinalHexNumericCharacterReference2 ()
    {
        checkFinal (NUMERIC, "&#xF7;");
    }

    public void testFinalHexNumericCharacterReference3 ()
    {
        checkFinal (NUMERIC, "&#x0f7;");
    }

    public void testFinalHexNumericCharacterReference4 ()
    {
        checkFinal (NUMERIC, "&#x0F7;");
    }

    public void testFinalHexNumericCharacterReference5 ()
    {
        checkFinal (NUMERIC, "&#Xf7;");
    }

    public void testFinalHexNumericCharacterReference6 ()
    {
        checkFinal (NUMERIC, "&#XF7;");
    }

    public void testFinalHexNumericCharacterReference7 ()
    {
        checkFinal (NUMERIC, "&#X0f7;");
    }

    public void testFinalHexNumericCharacterReference8 ()
    {
        checkFinal (NUMERIC, "&#X0F7;");
    }

    public void testFinalCharacterEntityReferenceWithoutSemi ()
    {
        checkFinal (ENTITY_NO_SEMI, "&divide");
    }

    public void testFinalNumericCharacterReferenceWithoutSemi1 ()
    {
        checkFinal (NUMERIC_NO_SEMI, "&#247");
    }

    public void testFinalNumericCharacterReferenceWithoutSemi2 ()
    {
        checkFinal (NUMERIC_NO_SEMI, "&#0247");
    }

    public void testFinalHexNumericCharacterReferenceWithoutSemi1 ()
    {
        checkFinal (NUMERIC_NO_SEMI, "&#xf7");
    }

    public void testFinalHexNumericCharacterReferenceWithoutSemi2 ()
    {
        checkFinal (NUMERIC_NO_SEMI, "&#xF7");
    }

    public void testFinalHexNumericCharacterReferenceWithoutSemi3 ()
    {
        checkFinal (NUMERIC_NO_SEMI, "&#x0f7");
    }

    public void testFinalHexNumericCharacterReferenceWithoutSemi4 ()
    {
        checkFinal (NUMERIC_NO_SEMI, "&#x0F7");
    }

    public void testFinalHexNumericCharacterReferenceWithoutSemi5 ()
    {
        checkFinal (NUMERIC_NO_SEMI, "&#Xf7");
    }

    public void testFinalHexNumericCharacterReferenceWithoutSemi6 ()
    {
        checkFinal (NUMERIC_NO_SEMI, "&#XF7");
    }

    public void testFinalHexNumericCharacterReferenceWithoutSemi7 ()
    {
        checkFinal (NUMERIC_NO_SEMI, "&#X0f7");
    }

    public void testFinalHexNumericCharacterReferenceWithoutSemi8 ()
    {
        checkFinal (NUMERIC_NO_SEMI, "&#X0F7");
    }

    public void testReferencesInString ()
    {
        assertEquals (
            "character references within a string don't work",
            "Thus, the character entity reference \u00f7 is a more convenient form than \u00f7 for obtaining the division sign (\u00f7)",
            Translate.decode ("Thus, the character entity reference &divide; is a more convenient form than &#247; for obtaining the division sign (\u00f7)"));
    }

    public void testBogusCharacterEntityReference1 ()
    {
        assertEquals (
            "bogus character entity reference doesn't work",
            "The character entity reference &divode; is bogus",
            Translate.decode ("The character entity reference &divode; is bogus"));
    }

    public void testBogusCharacterEntityReference2 ()
    {
        assertEquals (
            "bogus character entity reference doesn't work",
            "The character entity reference &(divide) is bogus",
            Translate.decode ("The character entity reference &(divide) is bogus"));
    }

    public void testBogusNumericCharacterReference ()
    {
        assertEquals (
            "bogus numeric character reference doesn't work",
            "The numeric character reference &#BF7; is bogus",
            Translate.decode ("The numeric character reference &#BF7; is bogus"));
    }

    public void testBogusHexNumericCharacterReference ()
    {
        assertEquals (
            "bogus numeric character reference doesn't work",
            "The numeric character reference &#xKJ7; is bogus",
            Translate.decode ("The numeric character reference &#xKJ7; is bogus"));
    }

    public void testPoorlyTerminatedCharacterEntityReference1 ()
    {
        assertEquals (
            "poorly terminated character entity reference doesn't work",
            "The character entity reference \u00f7d should be decoded",
            Translate.decode ("The character entity reference &divided should be decoded"));
    }

    public void testPoorlyTerminatedCharacterEntityReference2 ()
    {
        assertEquals (
            "poorly terminated character entity reference doesn't work",
            "The character entity reference \u00f7<br> should be decoded",
            Translate.decode ("The character entity reference &divide<br> should be decoded"));
    }

    public void testPoorlyTerminatedNumericCharacterReference1 ()
    {
        assertEquals (
            "poorly terminated numeric character reference doesn't work",
            "The numeric character reference \u00f7pop should be decoded",
            Translate.decode ("The numeric character reference &#247pop should be decoded"));
    }

    public void testPoorlyTerminatedNumericCharacterReference2 ()
    {
        assertEquals (
            "poorly terminated numeric character reference doesn't work",
            "The numeric character reference \u00f7<br> should be decoded",
            Translate.decode ("The numeric character reference &#247<br> should be decoded"));
    }

    public void testPoorlyTerminatedNumericCharacterReference3 ()
    {
        assertEquals (
            "poorly terminated numeric character reference doesn't work",
            "The numeric character reference \u00f7xpert should be decoded",
            Translate.decode ("The numeric character reference &#xf7xpert should be decoded"));
    }

    public void testEncode ()
    {
        assertEquals (
            "encode doesn't work",
            "Character entity reference: &divide;, another: &nbsp;, numeric character reference: &#9831;.",
            Translate.encode ("Character entity reference: \u00f7, another: \u00a0, numeric character reference: \u2667."));
    }

    public void testEncodeLink ()
    {
        assertEquals (
            "encode link doesn't work",
            "&lt;a HREF=&quot;http://www.w3.org/TR/REC-html40/sgml/entities.html&quot;&gt;http://www.w3.org/TR/REC-html40/sgml/entities.html&lt;/a&gt;",
            Translate.encode ("<a HREF=\"http://www.w3.org/TR/REC-html40/sgml/entities.html\">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>"));
    }

    /**
     * Encode bytes and decode the result again, treating them as ISO-8859-1 text.
     * @param bytes The bytes to round trip.
     * @return The bytes after encoding followed by decoding.
     * @exception IOException If a stream operation fails.
     */
    public byte[] encodedecode (byte[] bytes)
        throws
            IOException
    {
        InputStream in;
        ByteArrayOutputStream out;
        PrintStream print;
        byte[] data;

        // encode
        in = new ByteArrayInputStream (bytes);
        out = new ByteArrayOutputStream ();
        print = new PrintStream (out, false, "ISO-8859-1");
        Translate.encode (in, print);
        print.flush (); // make sure nothing is held back in the wrapper
        in.close ();
        out.close ();
        data = out.toByteArray ();

        // decode
        in = new ByteArrayInputStream (data);
        out = new ByteArrayOutputStream ();
        print = new PrintStream (out, false, "ISO-8859-1");
        Translate.decode (in, print);
        print.flush ();
        in.close ();
        out.close ();
        data = out.toByteArray ();

        return (data);
    }

    /**
     * Fail the test if two byte arrays differ in content or length.
     * @param reference The expected bytes.
     * @param result The actual bytes.
     * @exception IOException If a stream operation fails.
     */
    public void check (byte[] reference, byte[] result)
        throws
            IOException
    {
        InputStream ref;
        InputStream in;
        int i;
        int i1;
        int i2;

        ref = new ByteArrayInputStream (reference);
        in = new ByteArrayInputStream (result);
        i = 0;
        do
        {
            // comparing the end-of-stream marker too catches length differences
            i1 = ref.read ();
            i2 = in.read ();
            if (i1 != i2)
                fail ("byte difference detected at offset " + i + " expected " + i1 + ", actual " + i2);
            i++;
        }
        while (-1 != i1);
        ref.close ();
        in.close ();
    }

    public void testHexNumericEncoding ()
        throws
            IOException
    {
        try
        {
            Translate.ENCODE_HEXADECIMAL = true;
            assertEquals (
                "hex value incorrect",
                "&#x5ab; is a non-existant character.",
                Translate.encode ("\u05AB is a non-existant character."));
        }
        finally
        {
            // global switch: always restore it for the other tests
            Translate.ENCODE_HEXADECIMAL = false;
        }
    }

    public void testLastCharacterEntityReference ()
        throws
            IOException
    {
        assertEquals (
            "poorly terminated numeric character reference doesn't work",
            "The character entity reference\u200cshould be decoded",
            Translate.decode ("The character entity reference&zwnjshould be decoded"));
    }

    /**
     * Round trip a live web page through encode and decode.
     * Requires network access.
     * @exception IOException If the page cannot be fetched.
     */
    public void testEncodeDecodePage () throws IOException
    {
        URL url;
        URLConnection connection;
        InputStream in;
        ByteArrayOutputStream out;
        byte[] bytes;
        byte[] result;
        int c;

        // get some bytes
        url = new URL ("http://sourceforge.net/projects/htmlparser");
        connection = url.openConnection ();
        in = connection.getInputStream ();
        try
        {
            out = new ByteArrayOutputStream ();
            while (-1 != (c = in.read ()))
                out.write (c);
        }
        finally
        {
            in.close ();
        }
        out.close ();
        bytes = out.toByteArray ();

        // run it through
        result = encodedecode (bytes);

        // check
        check (bytes, result);
    }

    /**
     * Append a character and its reference (without the semicolon).
     * @param ref The reference to append.
     * @param stimulus Receives the raw character.
     * @param response Receives "&amp;" followed by the kernel.
     */
    private void appendReference (CharacterReference ref, StringBuffer stimulus, StringBuffer response)
    {
        stimulus.append ((char)ref.getCharacter ());
        response.append ("&");
        response.append (ref.getKernel ());
    }

    /**
     * Append up to ten random printable ASCII characters.
     * Exactly ten random numbers are drawn; control characters are dropped.
     * @param random The source of randomness.
     * @param stimulus Receives the raw characters.
     * @param response Receives the expected encoding of the characters.
     */
    private void appendRandomText (Random random, StringBuffer stimulus, StringBuffer response)
    {
        for (int k = 0; k < 10; k++)
        {
            char character = (char)random.nextInt (127);
            if (character < ' ')
                continue;
            stimulus.append (character);
            switch (character)
            {
                case '&':
                    response.append ("&amp;");
                    break;
                case '"':
                    response.append ("&quot;");
                    break;
                case '<':
                    response.append ("&lt;");
                    break;
                case '>':
                    response.append ("&gt;");
                    break;
                default:
                    response.append (character);
                    break;
            }
        }
    }

    /**
     * Check that every known reference encodes and decodes correctly.
     */
    public void testEncodeDecodeAll ()
    {
        CharacterReference[] list;
        StringBuffer stimulus;
        StringBuffer response;
        String string;

        list = getReferences ();
        stimulus = new StringBuffer ();
        response = new StringBuffer ();
        for (int i = 0; i < list.length; i++)
        {
            appendReference (list[i], stimulus, response);
            response.append (";");
        }
        string = Translate.encode (stimulus.toString ());
        if (!string.equals (response.toString ()))
            fail ("encoding incorrect, expected \n\"" + response.toString () + "\", encoded \n\"" + string + "\"");
        string = Translate.decode (string);
        if (!string.equals (stimulus.toString ()))
            fail ("decoding incorrect, expected \n\"" + stimulus.toString () + "\", decoded \n\"" + string + "\", encoded \n\"" + response.toString () + "\"");
    }

    /**
     * Check encoding and decoding of random references embedded in random text.
     */
    public void testEncodeDecodeRandom ()
    {
        Random random;
        CharacterReference[] list;
        StringBuffer stimulus;
        StringBuffer response;
        String string;

        random = new Random ();
        list = getReferences ();
        stimulus = new StringBuffer ();
        response = new StringBuffer ();
        for (int i = 0; i < 1000; i++)
        {
            for (int j = 0; j < 10; j++)
            {
                // some random characters
                appendRandomText (random, stimulus, response);
                // a random reference
                appendReference (list[random.nextInt (list.length)], stimulus, response);
                response.append (";");
                // some more random characters
                appendRandomText (random, stimulus, response);
            }
            string = Translate.encode (stimulus.toString ());
            if (!string.equals (response.toString ()))
                fail ("encoding incorrect, expected \n\"" + response.toString () + "\", encoded \n\"" + string + "\"");
            string = Translate.decode (string);
            if (!string.equals (stimulus.toString ()))
                fail ("decoding incorrect, expected \n\"" + stimulus.toString () + "\", decoded \n\"" + string + "\", encoded \n\"" + response.toString () + "\"");
            stimulus.setLength (0);
            response.setLength (0);
        }
    }

    /**
     * Check decoding of random references lacking the semicolon, embedded in random text.
     * The character following each reference is chosen so that it cannot
     * extend the kernel into that of a longer reference.
     */
    public void testEncodeDecodeRandomNoSemi ()
    {
        Random random;
        CharacterReference[] list;
        StringBuffer stimulus;
        StringBuffer response;
        char character;
        int index;
        String kernel;
        ArrayList forbidden;
        boolean appended;
        String string;

        random = new Random ();
        list = getReferences ();
        stimulus = new StringBuffer ();
        response = new StringBuffer ();
        for (int i = 0; i < 1000; i++)
        {
            for (int j = 0; j < 10; j++)
            {
                // some random characters
                appendRandomText (random, stimulus, response);
                // a random reference, without the semicolon
                index = random.nextInt (list.length);
                kernel = list[index].getKernel ();
                appendReference (list[index], stimulus, response);
                // the list is sorted, so longer kernels sharing this prefix follow immediately;
                // collect the characters that would turn this kernel into one of them
                forbidden = new ArrayList ();
                for (int k = index + 1; k < list.length; k++)
                {
                    String other = list[k].getKernel ();
                    if (!other.regionMatches (0, kernel, 0, kernel.length ()))
                        break;
                    // an identical kernel has no following character to forbid
                    if (other.length () > kernel.length ())
                        forbidden.add (new Character (other.charAt (kernel.length ())));
                }
                // one random character that cannot be mistaken for part of a reference
                appended = false;
                while (!appended)
                {
                    character = (char)random.nextInt (127);
                    if ( (' ' <= character)
                        && ('&' != character)
                        && ('"' != character)
                        && ('<' != character)
                        && ('>' != character)
                        && (';' != character)
                        && !(forbidden.contains (new Character (character))))
                    {
                        stimulus.append (character);
                        response.append (character);
                        appended = true;
                    }
                }
                // some more random characters
                appendRandomText (random, stimulus, response);
            }
            string = Translate.decode (response.toString ());
            if (!string.equals (stimulus.toString ()))
                fail ("decoding incorrect:\nexpected \"" + stimulus.toString () + "\"\n decoded \"" + string + "\"\n encoded \"" + response.toString () + "\"");
            stimulus.setLength (0);
            response.setLength (0);
        }
    }
}