|                                                                                                              1
 27  package org.htmlparser.tests.utilTests;
 28
 29  import java.io.ByteArrayInputStream
  ; 30  import java.io.ByteArrayOutputStream
  ; 31  import java.io.File
  ; 32  import java.io.FileInputStream
  ; 33  import java.io.FileWriter
  ; 34  import java.io.IOException
  ; 35  import java.io.InputStream
  ; 36  import java.io.PrintStream
  ; 37  import java.io.PrintWriter
  ; 38  import java.lang.reflect.Field
  ; 39  import java.net.URL
  ; 40  import java.net.URLConnection
  ; 41  import java.util.ArrayList
  ; 42  import java.util.Random
  ; 43
 44  import org.htmlparser.Node;
 45  import org.htmlparser.Parser;
 46  import org.htmlparser.Remark;
 47  import org.htmlparser.Tag;
 48  import org.htmlparser.Text;
 49  import org.htmlparser.tags.LinkTag;
 50  import org.htmlparser.tests.ParserTestCase;
 51  import org.htmlparser.util.CharacterReference;
 52  import org.htmlparser.util.NodeIterator;
 53  import org.htmlparser.util.NodeList;
 54  import org.htmlparser.util.ParserException;
 55  import org.htmlparser.util.Translate;
 56  import org.htmlparser.util.sort.Sort;
 57
 58  public class CharacterTranslationTest
 59      extends
 60          ParserTestCase
 61  {
 62      static
 63      {
 64          System.setProperty ("org.htmlparser.tests.utilTests.CharacterTranslationTest", "CharacterTranslationTest");
 65      }
 66
 67
    /** Table built by getReferences (); that method returns it directly while it is non-null instead of regenerating it. */
 70      protected static CharacterReference[] mReferences;
 71
 72      public CharacterTranslationTest (String
  name) 73      {
 74          super (name);
 75      }
 76
 77
 80      class SimpleClassLoader extends ClassLoader
  81      {
 82
 85          String
  mRoot; 86
 87          public SimpleClassLoader (String
  root) 88          {
 89              if (!root.endsWith (File.separator))
 90                  root += File.separator;
 91              mRoot = root;
 92          }
 93
 94          public Class
  loadClass (String  className) 95              throws
 96                  ClassNotFoundException
  97          {
 98              return (loadClass (className, true));
 99          }
 100
 101         public synchronized Class
  loadClass (String  className, boolean resolveIt) 102             throws
 103                 ClassNotFoundException
  104         {
 105             byte data[];
 106             FileInputStream
  in; 107             Class
  ret; 108
 109             try
 110             {
 111                                 ret = super.findSystemClass (className);
 113             }
 114             catch (ClassNotFoundException
  e) 115             {
 116                 try
 117                 {
 118                     in = new FileInputStream
  (mRoot + className + ".class"); 119                     data = new byte[in.available ()];
 120                     in.read (data);
 121                     in.close ();
 122                     ret = defineClass (className, data, 0, data.length);
 123                     if (null == ret)
 124                         throw new ClassFormatError
  (); 125                     if (resolveIt)
 126                         resolveClass (ret);
 127                 }
 128                 catch (IOException
  ioe) 129                 {
 130                     throw new ClassNotFoundException
  (); 131                 }
 132             }
 133
 134             return (ret);
 135         }
 136     }
 137
 138
 153     public class Generate
 154     {
 155
 158         protected Parser mParser;
 159
 160         protected String
  nl = System.getProperty ("line.separator", "\n"); 161
 162
 168         public Generate ()
 169             throws ParserException
 170         {
 171             mParser = new Parser ("http://www.w3.org/TR/REC-html40/sgml/entities.html");
 172         }
 173
 174
 184         public String
  translate (String  string) 185         {
 186             int index;
 187             int amp;
 188             StringBuffer
  ret; 189
 190             ret = new StringBuffer
  (4096); 191
 192             index = 0;
 193             while ((index < string.length ()) && (-1 != (amp = string.indexOf ('&', index))))
 194             {
 195                                 ret.append (string.substring (index, amp));
 197                 if (string.startsWith (" ", amp))
 198                 {
 199                     ret.append (" ");
 200                     index = amp + 6;
 201                 }
 202                 else if (string.startsWith ("<", amp))
 203                 {
 204                     ret.append ("<");
 205                     index = amp + 4;
 206                 }
 207                 else if (string.startsWith (">", amp))
 208                 {
 209                     ret.append (">");
 210                     index = amp + 4;
 211                 }
 212                 else if (string.startsWith ("&", amp))
 213                 {
 214                     ret.append ("&");
 215                     index = amp + 5;
 216                 }
 217                 else if (string.startsWith (""e;", amp))
 218                 {
 219                     ret.append ("\"");
 220                     index = amp + 7;
 221                 }
 222                 else if (string.startsWith ("÷", amp))
 223                 {
 224                                                             ret.append ("&");
 227                     index = amp + 1;
 228                 }
 229                 else if (string.startsWith ("©", amp))
 230                 {
 231                                                             ret.append ("&");
 234                     index = amp + 1;
 235                 }
 236                 else
 237                 {
 238                     System.out.println ("unknown special character starting with " + string.substring (amp, amp + 7));
 239                     ret.append ("&");
 240                     index = amp + 1;
 241                 }
 242             }
 243             ret.append (string.substring (index));
 244
 245             return (ret.toString ());
 246         }
 247
 248         public void gather (Node node, StringBuffer
  buffer) 249         {
 250             NodeList children;
 251
 252             if (node instanceof Text)
 253             {
 254                                                 Text stringNode = (Text)node;
 257                                 buffer.append (stringNode.getText ());
 259             }
 260             else if (node instanceof LinkTag)
 261             {
 262                                                 LinkTag linkNode = (LinkTag)node;
 265                                 buffer.append (linkNode.getLinkText ());
 267             }
 268             else if (node instanceof Tag)
 269             {
 270                 String
  name = ((Tag)node).getTagName (); 271                 if (name.equals ("BR") || name.equals ("P"))
 272                     buffer.append (nl);
 273                 else
 274                 {
 275                     children = ((Tag)node).getChildren ();
 276                     if (null != children)
 277                         for (int i = 0; i < children.size (); i++)
 278                             gather (children.elementAt (i), buffer);
 279                 }
 280             }
 281             else if (node instanceof Remark)
 282             {
 283             }
 284             else
 285             {
 286                 System.out.println ();
 287                 System.out.println(node.toString());
 288             }
 289         }
 290
 291
 297         public int indexOfWhitespace (String
  string, int index) 298         {
 299             int space;
 300             int cr;
 301             int ret;
 302
 303             space = string.indexOf (" ", index);
 304             cr = string.indexOf (nl, index);
 305             if (-1 == space)
 306                 ret = cr;
 307             else if (-1 == cr)
 308                 ret = space;
 309             else
 310                 ret = Math.min (space, cr);
 311
 312             return (ret);
 313         }
 314
 315
 326         public String
  pack (String  string) 327         {
 328             int index;
 329             int spaces;
 330             StringBuffer
  ret; 331
 332             ret = new StringBuffer
  (string.length ()); 333
 334             if (string.startsWith ("-- "))
 335                 string = string.substring (3);
 336                         index = 0;
 338             while ((index < string.length ()) && (-1 != (spaces = indexOfWhitespace (string, index))))
 339             {
 340                 ret.append (string.substring (index, spaces));
 341                 ret.append (" ");
 342                 while ((spaces < string.length ()) && Character.isWhitespace (string.charAt (spaces)))
 343                     spaces++;
 344                 index = spaces;
 345             }
 346             if (index < string.length ())
 347                 ret.append (string.substring (index));
 348
 349             return (ret.toString ());
 350         }
 351
 352
 357         public String
  pretty (String  string) 358         {
 359             int index;
 360             int spaces;
 361             StringBuffer
  ret; 362
 363             ret = new StringBuffer
  (string.length ()); 364
 365                         index = 0;
 367             while ((index < string.length ()) && (-1 != (spaces = string.indexOf ("  ", index))))
 368             {
 369                 ret.append ("        // " + string.substring (index, spaces));
 370                 if (!string.substring (index, spaces).endsWith (nl))
 371                     ret.append (nl);
 372                 while ((spaces < string.length ()) && Character.isWhitespace (string.charAt (spaces)))
 373                     spaces++;
 374                 index = spaces;
 375             }
 376             if (index < string.length ())
 377                 ret.append ("        // " + string.substring (index));
 378
 379             return (ret.toString ());
 380         }
 381
 382
 389         public String
  pad (String  string, char character, int length) 390         {
 391             StringBuffer
  ret; 392
 393             ret = new StringBuffer
  (length); 394             ret.append (string);
 395             while (length > ret.length ())
 396                 ret.insert (0, character);
 397
 398             return (ret.toString ());
 399         }
 400
 401
 407         public String
  unicode (String  string) 408         {
 409             int code;
 410
 411             if (string.startsWith ("\"&#") && string.endsWith (";\""))
 412             {
 413                 string = string.substring (3, string.length () - 2);
 414                 try
 415                 {
 416                     code = Integer.parseInt (string);
 417                     string = "'\\u" + pad (Integer.toHexString (code), '0', 4) + "'";
 418                 }
 419                 catch (Exception
  e) 420                 {
 421                     e.printStackTrace ();
 422                 }
 423                 return (string);
 424             }
 425             else
 426                 return (string);
 427         }
 428
 429
 438         public void extract (String
  string, PrintWriter  out) 439         {
 440             int space;
 441             String
  token; 442             String
  code; 443
 444             if (string.startsWith ("<!--"))
 445                 out.println (pretty (string.substring (4, string.length () - 3).trim ()));
 446             else if (string.startsWith ("<!ENTITY"))
 447             {
 448                 string = string.substring (8, string.length () - 3).trim ();
 449                 if (-1 != (space = string.indexOf (" ")))
 450                 {
 451                     token = string.substring (0, space);
 452                     string = string.substring (space).trim ();
 453                     if (string.startsWith ("CDATA"))
 454                     {
 455                         string = string.substring (5).trim ();
 456                         if (-1 != (space = string.indexOf (" ")))
 457                         {
 458                             code = string.substring (0, space).trim ();
 459                             code = unicode (code);
 460                             string = string.substring (space).trim ();
 461                             out.println (
 462                                 "        new CharacterReference (\"" + token + "\","
 463                                                                 + pad (code, ' ', code.length () + 9 - token.length ()) + "),"
 465                                 + " // "
 466                                 + pack (string));
 467                         }
 468                         else
 469                             out.println (string);
 470                     }
 471                     else
 472                         out.println (string);
 473                 }
 474                 else
 475                     out.println (string);
 476             }
 477             else
 478                 out.println (string);
 479         }
 480
 481
 491         public void sgml (String
  string, PrintWriter  out) 492         {
 493             int index;
 494             int begin;
 495             int end;
 496
 497             index = 0;
 498             while (-1 != (begin = string.indexOf ("<", index)))
 499             {
 500                 if (-1 != (end = string.indexOf ("-->", begin)))
 501                 {
 502                     extract (string.substring (begin, end + 3), out);
 503                     index = end + 3;
 504                 }
 505                 else
 506                     index = begin + 1;
 507             }
 508         }
 509
 510
 514         public void parse (PrintWriter
  out) 515             throws
 516                 ParserException
 517         {
 518             Node node;
 519             StringBuffer
  buffer = new StringBuffer  (4096); 520
 521                                     for (NodeIterator e = mParser.elements (); e.hasMoreNodes ();)
 524             {
 525                 node = e.nextNode ();
 526                 gather (node, buffer);
 527             }
 528
 529             String
  text = translate (buffer.toString ()); 530             sgml (text, out);
 531         }
 532     }
 533
 534     public CharacterReference[] getReferences ()
 535     {
 536         final String
  class_name = "CharacterEntityReferenceList"; 537         String
  paths; 538         String
  path; 539         String
  source; 540         PrintWriter
  out; 541         Generate generate;
 542         SimpleClassLoader loader;
 543         Class
  hello; 544         Field
  field; 545         CharacterReference[] ret;
 546
 547         ret = mReferences;
 548         if (null == ret)
 549         {
 550             paths = System.getProperty ("java.class.path");
 551             path = System.getProperty ("user.home");
 552             if (!path.endsWith (File.separator))
 553                 path += File.separator;
 554             source = path + class_name + ".java";
 555             try
 556             {
 557                                 generate = new Generate ();
 559                 out = new PrintWriter
  (new FileWriter  (source)); 560                 out.println ("import org.htmlparser.util.CharacterReference;");
 561                 out.println ();
 562                 out.println ("/** Generated by " + this.getClass ().getName () + " **/");
 563                 out.println ("public class " + class_name);
 564                 out.println ("{");
 565                 out.println ("    /**");
 566                 out.println ("     * Table mapping character to entity reference.");
 567                 out.println ("     */");
 568                 out.println ("    public static final CharacterReference[] mCharacterReferences =");
 569                 out.println ("    {");
 570                 generate.parse (out);
 571                 out.println ("    };");
 572                 out.println ("}");
 573                 out.close ();
 574                                 if (0 == com.sun.tools.javac.Main.compile (new String
  [] {"-classpath", paths, source})) 576                 {
 577                     try
 578                     {
 579                                                 loader = new SimpleClassLoader (path);
 581                         hello = loader.loadClass (class_name);
 582                         try
 583                         {
 584                                                         field = hello.getField ("mCharacterReferences");
 586                             ret = (CharacterReference[])field.get (null);
 587                             Sort.QuickSort (ret);
 588                         }
 589                         catch (IllegalAccessException
  iae) 590                         {
 591                             fail ("references not accessible");
 592                         }
 593                         catch (NoSuchFieldException
  nsfe) 594                         {
 595                             fail ("references not found");
 596                         }
 597                     }
 598                     catch (ClassNotFoundException
  cnfe) 599                     {
 600                         fail ("couldn't load class");
 601                     }
 602                     finally
 603                     {
 604                         File
  classfile; 605
 606                         classfile = new File
  (path + class_name + ".class"); 607                         classfile.delete ();
 608                     }
 609                 }
 610                 else
 611                     fail ("couldn't compile class");
 612                 mReferences = ret;
 613             }
 614             catch (IOException
  ioe) 615             {
 616                 fail ("couldn't write class");
 617             }
 618             catch (ParserException ioe)
 619             {
 620                 fail ("couldn't parse w3.org entities list");
 621             }
 622         }
 623
 624         return (ret);
 625     }
 626
 627     public void testInitialCharacterEntityReference ()
 628     {
 629         assertEquals (
 630             "character entity reference at start of string doesn't work",
 631             "\u00f7 is the division sign.",
 632             Translate.decode ("÷ is the division sign."));
 633     }
 634
 635     public void testInitialNumericCharacterReference1 ()
 636     {
 637         assertEquals (
 638             "numeric character reference at start of string doesn't work",
 639             "\u00f7 is the division sign.",
 640             Translate.decode ("÷ is the division sign."));
 641     }
 642
 643     public void testInitialNumericCharacterReference2 ()
 644     {
 645         assertEquals (
 646             "numeric character reference at start of string doesn't work",
 647             "\u00f7 is the division sign.",
 648             Translate.decode ("÷ is the division sign."));
 649     }
 650
 651     public void testInitialHexNumericCharacterReference1 ()
 652     {
 653         assertEquals (
 654             "numeric character reference at start of string doesn't work",
 655             "\u00f7 is the division sign.",
 656             Translate.decode ("÷ is the division sign."));
 657     }
 658
 659     public void testInitialHexNumericCharacterReference2 ()
 660     {
 661         assertEquals (
 662             "numeric character reference at start of string doesn't work",
 663             "\u00f7 is the division sign.",
 664             Translate.decode ("÷ is the division sign."));
 665     }
 666
 667     public void testInitialHexNumericCharacterReference3 ()
 668     {
 669         assertEquals (
 670             "numeric character reference at start of string doesn't work",
 671             "\u00f7 is the division sign.",
 672             Translate.decode ("÷ is the division sign."));
 673     }
 674
 675     public void testInitialHexNumericCharacterReference4 ()
 676     {
 677         assertEquals (
 678             "numeric character reference at start of string doesn't work",
 679             "\u00f7 is the division sign.",
 680             Translate.decode ("÷ is the division sign."));
 681     }
 682
 683     public void testInitialHexNumericCharacterReference5 ()
 684     {
 685         assertEquals (
 686             "numeric character reference at start of string doesn't work",
 687             "\u00f7 is the division sign.",
 688             Translate.decode ("÷ is the division sign."));
 689     }
 690
 691     public void testInitialHexNumericCharacterReference6 ()
 692     {
 693         assertEquals (
 694             "numeric character reference at start of string doesn't work",
 695             "\u00f7 is the division sign.",
 696             Translate.decode ("÷ is the division sign."));
 697     }
 698
 699     public void testInitialHexNumericCharacterReference7 ()
 700     {
 701         assertEquals (
 702             "numeric character reference at start of string doesn't work",
 703             "\u00f7 is the division sign.",
 704             Translate.decode ("÷ is the division sign."));
 705     }
 706
 707     public void testInitialHexNumericCharacterReference8 ()
 708     {
 709         assertEquals (
 710             "numeric character reference at start of string doesn't work",
 711             "\u00f7 is the division sign.",
 712             Translate.decode ("÷ is the division sign."));
 713     }
 714
 715     public void testInitialCharacterEntityReferenceWithoutSemi ()
 716     {
 717         assertEquals (
 718             "character entity reference without a semicolon at start of string doesn't work",
 719             "\u00f7 is the division sign.",
 720             Translate.decode ("÷ is the division sign."));
 721     }
 722
 723     public void testInitialNumericCharacterReferenceWithoutSemi ()
 724     {
 725         assertEquals (
 726             "numeric character reference without a semicolon at start of string doesn't work",
 727             "\u00f7 is the division sign.",
 728             Translate.decode ("÷ is the division sign."));
 729     }
 730
 731     public void testInitialHexNumericCharacterReferenceWithoutSemi1 ()
 732     {
 733         assertEquals (
 734             "numeric character reference without a semicolon at start of string doesn't work",
 735             "\u00f7 is the division sign.",
 736             Translate.decode ("÷ is the division sign."));
 737     }
 738
 739     public void testInitialHexNumericCharacterReferenceWithoutSemi2 ()
 740     {
 741         assertEquals (
 742             "numeric character reference without a semicolon at start of string doesn't work",
 743             "\u00f7 is the division sign.",
 744             Translate.decode ("÷ is the division sign."));
 745     }
 746
 747     public void testInitialHexNumericCharacterReferenceWithoutSemi3 ()
 748     {
 749         assertEquals (
 750             "numeric character reference without a semicolon at start of string doesn't work",
 751             "\u00f7 is the division sign.",
 752             Translate.decode ("÷ is the division sign."));
 753     }
 754
 755     public void testInitialHexNumericCharacterReferenceWithoutSemi4 ()
 756     {
 757         assertEquals (
 758             "numeric character reference without a semicolon at start of string doesn't work",
 759             "\u00f7 is the division sign.",
 760             Translate.decode ("÷ is the division sign."));
 761     }
 762
 763     public void testInitialHexNumericCharacterReferenceWithoutSemi5 ()
 764     {
 765         assertEquals (
 766             "numeric character reference without a semicolon at start of string doesn't work",
 767             "\u00f7 is the division sign.",
 768             Translate.decode ("÷ is the division sign."));
 769     }
 770
 771     public void testInitialHexNumericCharacterReferenceWithoutSemi6 ()
 772     {
 773         assertEquals (
 774             "numeric character reference without a semicolon at start of string doesn't work",
 775             "\u00f7 is the division sign.",
 776             Translate.decode ("÷ is the division sign."));
 777     }
 778
 779     public void testInitialHexNumericCharacterReferenceWithoutSemi7 ()
 780     {
 781         assertEquals (
 782             "numeric character reference without a semicolon at start of string doesn't work",
 783             "\u00f7 is the division sign.",
 784             Translate.decode ("÷ is the division sign."));
 785     }
 786
 787     public void testInitialHexNumericCharacterReferenceWithoutSemi8 ()
 788     {
 789         assertEquals (
 790             "numeric character reference without a semicolon at start of string doesn't work",
 791             "\u00f7 is the division sign.",
 792             Translate.decode ("÷ is the division sign."));
 793     }
 794
 795     public void testFinalCharacterEntityReference ()
 796     {
 797         assertEquals (
 798             "character entity reference at end of string doesn't work",
 799             "The division sign (\u00f7) is \u00f7",
 800             Translate.decode ("The division sign (\u00f7) is ÷"));
 801     }
 802
 803     public void testFinalNumericCharacterReference ()
 804     {
 805         assertEquals (
 806             "numeric character reference at end of string doesn't work",
 807             "The division sign (\u00f7) is \u00f7",
 808             Translate.decode ("The division sign (\u00f7) is ÷"));
 809     }
 810
 811     public void testFinalHexNumericCharacterReference1 ()
 812     {
 813         assertEquals (
 814             "numeric character reference at end of string doesn't work",
 815             "The division sign (\u00f7) is \u00f7",
 816             Translate.decode ("The division sign (\u00f7) is ÷"));
 817     }
 818
 819     public void testFinalHexNumericCharacterReference2 ()
 820     {
 821         assertEquals (
 822             "numeric character reference at end of string doesn't work",
 823             "The division sign (\u00f7) is \u00f7",
 824             Translate.decode ("The division sign (\u00f7) is ÷"));
 825     }
 826
 827     public void testFinalHexNumericCharacterReference3 ()
 828     {
 829         assertEquals (
 830             "numeric character reference at end of string doesn't work",
 831             "The division sign (\u00f7) is \u00f7",
 832             Translate.decode ("The division sign (\u00f7) is ÷"));
 833     }
 834
 835     public void testFinalHexNumericCharacterReference4 ()
 836     {
 837         assertEquals (
 838             "numeric character reference at end of string doesn't work",
 839             "The division sign (\u00f7) is \u00f7",
 840             Translate.decode ("The division sign (\u00f7) is ÷"));
 841     }
 842
 843     public void testFinalHexNumericCharacterReference5 ()
 844     {
 845         assertEquals (
 846             "numeric character reference at end of string doesn't work",
 847             "The division sign (\u00f7) is \u00f7",
 848             Translate.decode ("The division sign (\u00f7) is ÷"));
 849     }
 850
 851     public void testFinalHexNumericCharacterReference6 ()
 852     {
 853         assertEquals (
 854             "numeric character reference at end of string doesn't work",
 855             "The division sign (\u00f7) is \u00f7",
 856             Translate.decode ("The division sign (\u00f7) is ÷"));
 857     }
 858
 859     public void testFinalHexNumericCharacterReference7 ()
 860     {
 861         assertEquals (
 862             "numeric character reference at end of string doesn't work",
 863             "The division sign (\u00f7) is \u00f7",
 864             Translate.decode ("The division sign (\u00f7) is ÷"));
 865     }
 866
 867     public void testFinalHexNumericCharacterReference8 ()
 868     {
 869         assertEquals (
 870             "numeric character reference at end of string doesn't work",
 871             "The division sign (\u00f7) is \u00f7",
 872             Translate.decode ("The division sign (\u00f7) is ÷"));
 873     }
 874
 875     public void testFinalCharacterEntityReferenceWithoutSemi ()
 876     {
 877         assertEquals (
 878             "character entity reference without a semicolon at end of string doesn't work",
 879             "The division sign (\u00f7) is \u00f7",
 880             Translate.decode ("The division sign (\u00f7) is ÷"));
 881     }
 882
 883     public void testFinalNumericCharacterReferenceWithoutSemi1 ()
 884     {
 885         assertEquals (
 886             "numeric character reference without a semicolon at end of string doesn't work",
 887             "The division sign (\u00f7) is \u00f7",
 888             Translate.decode ("The division sign (\u00f7) is ÷"));
 889     }
 890
 891     public void testFinalNumericCharacterReferenceWithoutSemi2 ()
 892     {
 893         assertEquals (
 894             "numeric character reference without a semicolon at end of string doesn't work",
 895             "The division sign (\u00f7) is \u00f7",
 896             Translate.decode ("The division sign (\u00f7) is ÷"));
 897     }
 898
 899     public void testFinalHexNumericCharacterReferenceWithoutSemi1 ()
 900     {
 901         assertEquals (
 902             "numeric character reference without a semicolon at end of string doesn't work",
 903             "The division sign (\u00f7) is \u00f7",
 904             Translate.decode ("The division sign (\u00f7) is ÷"));
 905     }
 906
 907     public void testFinalHexNumericCharacterReferenceWithoutSemi2 ()
 908     {
 909         assertEquals (
 910             "numeric character reference without a semicolon at end of string doesn't work",
 911             "The division sign (\u00f7) is \u00f7",
 912             Translate.decode ("The division sign (\u00f7) is ÷"));
 913     }
 914
 915     public void testFinalHexNumericCharacterReferenceWithoutSemi3 ()
 916     {
 917         assertEquals (
 918             "numeric character reference without a semicolon at end of string doesn't work",
 919             "The division sign (\u00f7) is \u00f7",
 920             Translate.decode ("The division sign (\u00f7) is ÷"));
 921     }
 922
 923     public void testFinalHexNumericCharacterReferenceWithoutSemi4 ()
 924     {
 925         assertEquals (
 926             "numeric character reference without a semicolon at end of string doesn't work",
 927             "The division sign (\u00f7) is \u00f7",
 928             Translate.decode ("The division sign (\u00f7) is ÷"));
 929     }
 930
 931     public void testFinalHexNumericCharacterReferenceWithoutSemi5 ()
 932     {
 933         assertEquals (
 934             "numeric character reference without a semicolon at end of string doesn't work",
 935             "The division sign (\u00f7) is \u00f7",
 936             Translate.decode ("The division sign (\u00f7) is ÷"));
 937     }
 938
 939     public void testFinalHexNumericCharacterReferenceWithoutSemi6 ()
 940     {
 941         assertEquals (
 942             "numeric character reference without a semicolon at end of string doesn't work",
 943             "The division sign (\u00f7) is \u00f7",
 944             Translate.decode ("The division sign (\u00f7) is ÷"));
 945     }
 946
 947     public void testFinalHexNumericCharacterReferenceWithoutSemi7 ()
 948     {
 949         assertEquals (
 950             "numeric character reference without a semicolon at end of string doesn't work",
 951             "The division sign (\u00f7) is \u00f7",
 952             Translate.decode ("The division sign (\u00f7) is ÷"));
 953     }
 954
 955     public void testFinalHexNumericCharacterReferenceWithoutSemi8 ()
 956     {
 957         assertEquals (
 958             "numeric character reference without a semicolon at end of string doesn't work",
 959             "The division sign (\u00f7) is \u00f7",
 960             Translate.decode ("The division sign (\u00f7) is ÷"));
 961     }
 962
 963     public void testReferencesInString ()
 964     {
 965         assertEquals (
 966             "character references within a string don't work",
 967             "Thus, the character entity reference \u00f7 is a more convenient form than \u00f7 for obtaining the division sign (\u00f7)",
 968             Translate.decode ("Thus, the character entity reference ÷ is a more convenient form than ÷ for obtaining the division sign (\u00f7)"));
 969     }
 970
 971     public void testBogusCharacterEntityReference1 ()
 972     {
 973         assertEquals (
 974             "bogus character entity reference doesn't work",
 975             "The character entity reference &divode; is bogus",
 976             Translate.decode ("The character entity reference &divode; is bogus"));
 977     }
 978
 979     public void testBogusCharacterEntityReference2 ()
 980     {
 981         assertEquals (
 982             "bogus character entity reference doesn't work",
 983             "The character entity reference &(divide) is bogus",
 984             Translate.decode ("The character entity reference &(divide) is bogus"));
 985     }
 986
 987     public void testBogusNumericCharacterReference ()
 988     {
 989         assertEquals (
 990             "bogus numeric character reference doesn't work",
 991             "The numeric character reference &#BF7; is bogus",
 992             Translate.decode ("The numeric character reference &#BF7; is bogus"));
 993     }
 994
 995     public void testBogusHexNumericCharacterReference ()
 996     {
 997         assertEquals (
 998             "bogus numeric character reference doesn't work",
 999             "The numeric character reference &#xKJ7; is bogus",
 1000            Translate.decode ("The numeric character reference &#xKJ7; is bogus"));
 1001    }
 1002
 1003    public void testPoorlyTerminatedCharacterEntityReference1 ()
 1004    {
 1005        assertEquals (
 1006            "poorly terminated character entity reference doesn't work",
 1007            "The character entity reference \u00f7d should be decoded",
 1008            Translate.decode ("The character entity reference ÷d should be decoded"));
 1009    }
 1010
 1011    public void testPoorlyTerminatedCharacterEntityReference2 ()
 1012    {
 1013        assertEquals (
 1014            "poorly terminated character entity reference doesn't work",
 1015            "The character entity reference \u00f7<br> should be decoded",
 1016            Translate.decode ("The character entity reference ÷<br> should be decoded"));
 1017    }
 1018
 1019    public void testPoorlyTerminatedNumericCharacterReference1 ()
 1020    {
 1021        assertEquals (
 1022            "poorly terminated numeric character reference doesn't work",
 1023            "The numeric character reference \u00f7pop should be decoded",
 1024            Translate.decode ("The numeric character reference ÷pop should be decoded"));
 1025    }
 1026
 1027    public void testPoorlyTerminatedNumericCharacterReference2 ()
 1028    {
 1029        assertEquals (
 1030            "poorly terminated numeric character reference doesn't work",
 1031            "The numeric character reference \u00f7<br> should be decoded",
 1032            Translate.decode ("The numeric character reference ÷<br> should be decoded"));
 1033    }
 1034
 1035    public void testPoorlyTerminatedNumericCharacterReference3 ()
 1036    {
 1037        assertEquals (
 1038            "poorly terminated numeric character reference doesn't work",
 1039            "The numeric character reference \u00f7xpert should be decoded",
 1040            Translate.decode ("The numeric character reference ÷xpert should be decoded"));
 1041    }
 1042
 1043    public void testEncode ()
 1044    {
 1045        assertEquals (
 1046            "encode doesn't work",
 1047            "Character entity reference: ÷, another:  , numeric character reference: ♧.",
 1048            Translate.encode ("Character entity reference: \u00f7, another: \u00a0, numeric character reference: \u2667."));
 1049    }
 1050
 1051    public void testEncodeLink ()
 1052    {
 1053        assertEquals (
 1054            "encode link doesn't work",
 1055            "<a HREF="http://www.w3.org/TR/REC-html40/sgml/entities.html">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>",
 1056            Translate.encode ("<a HREF=\"http://www.w3.org/TR/REC-html40/sgml/entities.html\">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>"));
 1057    }
 1058
 1059    public byte[] encodedecode (byte[] bytes)
 1060        throws
 1061            IOException
  1062    {
 1063        InputStream
  in; 1064        ByteArrayOutputStream
  out; 1065        byte[] data;
 1066
 1067                in = new ByteArrayInputStream
  (bytes); 1069        out = new ByteArrayOutputStream
  (); 1070        Translate.encode (in, new PrintStream
  (out, false, "ISO-8859-1")); 1071        in.close ();
 1072        out.close ();
 1073        data = out.toByteArray ();
 1074
 1075                in = new ByteArrayInputStream
  (data); 1077        out = new ByteArrayOutputStream
  (); 1078        Translate.decode (in, new PrintStream
  (out, false, "ISO-8859-1")); 1079        in.close ();
 1080        out.close ();
 1081        data = out.toByteArray ();
 1082
 1083        return (data);
 1084    }
 1085
 1086    public void check (byte[] reference, byte[] result)
 1087        throws
 1088            IOException
  1089    {
 1090        InputStream
  ref; 1091        InputStream
  in; 1092        int i;
 1093        int i1;
 1094        int i2;
 1095
 1096        ref = new ByteArrayInputStream
  (reference); 1097        in = new ByteArrayInputStream
  (result); 1098        i = 0;
 1099        do
 1100        {
 1101            i1 = ref.read ();
 1102            i2 = in.read ();
 1103            if (i1 != i2)
 1104                fail ("byte difference detected at offset " + i + " expected " + i1 + ", actual " + i2);
 1105            i++;
 1106        }
 1107        while (-1 != i1);
 1108        ref.close ();
 1109        in.close ();
 1110    }
 1111
 1112    public void testHexNumericEncoding ()
 1113        throws
 1114            IOException
  1115    {
 1116        try
 1117        {
 1118            Translate.ENCODE_HEXADECIMAL = true;
 1119            assertEquals (
 1120                "hex value incorrect",
 1121                "֫ is a non-existant character.",
 1122                Translate.encode ("\u05AB is a non-existant character."));
 1123        }
 1124        finally
 1125        {
 1126            Translate.ENCODE_HEXADECIMAL = false;
 1127        }
 1128    }
 1129
 1130    public void testLastCharacterEntityReference ()
 1131        throws
 1132            IOException
  1133    {
 1134        assertEquals (
 1135            "poorly terminated numeric character reference doesn't work",
 1136            "The character entity reference\u200cshould be decoded",
 1137            Translate.decode ("The character entity reference&zwnjshould be decoded"));
 1138    }
 1139
 1140    public void testEncodeDecodePage () throws IOException
  1141    {
 1142        URL
  url; 1143        URLConnection
  connection; 1144        InputStream
  in; 1145        ByteArrayOutputStream
  out; 1146        byte[] bytes;
 1147        byte[] result;
 1148        int c;
 1149
 1150                url = new URL
  ("http://sourceforge.net/projects/htmlparser"); 1152        connection = url.openConnection ();
 1153        in = connection.getInputStream ();
 1154        out = new ByteArrayOutputStream
  (); 1155        while (-1 != (c = in.read ()))
 1156            out.write (c);
 1157        in.close ();
 1158        out.close ();
 1159        bytes = out.toByteArray ();
 1160
 1161                result = encodedecode (bytes);
 1163
 1164                check (bytes, result);
 1166    }
 1167
 1168
 1174    public void testEncodeDecodeAll ()
 1175    {
 1176        CharacterReference[] list;
 1177        StringBuffer
  stimulus; 1178        StringBuffer
  response; 1179        CharacterReference ref;
 1180        String
  string; 1181
 1182        list = getReferences ();
 1183        stimulus = new StringBuffer
  (); 1184        response = new StringBuffer
  (); 1185        for (int i = 0; i < list.length; i++)
 1186        {
 1187            ref = list[i];
 1188            stimulus.append ((char)ref.getCharacter ());
 1189            response.append ("&");
 1190            response.append (ref.getKernel ());
 1191            response.append (";");
 1192        }
 1193        string = Translate.encode (stimulus.toString ());
 1194        if (!string.equals (response.toString ()))
 1195            fail ("encoding incorrect, expected \n\"" + response.toString () + "\", encoded \n\"" + string + "\"");
 1196        string = Translate.decode (string);
 1197        if (!string.equals (stimulus.toString ()))
 1198            fail ("decoding incorrect, expected \n\"" + stimulus.toString () + "\", decoded \n\"" + string + "\", encoded \n\"" + response.toString () + "\"");
 1199    }
 1200
 1201    public void testEncodeDecodeRandom ()
 1202    {
 1203        Random
  random; 1204        CharacterReference[] list;
 1205        StringBuffer
  stimulus; 1206        StringBuffer
  response; 1207        char character;
 1208        CharacterReference ref;
 1209        String
  string; 1210
 1211        random = new Random
  (); 1212        list = getReferences ();
 1213        stimulus = new StringBuffer
  (); 1214        response = new StringBuffer
  (); 1215        for (int i = 0; i < 1000; i++)
 1216        {
 1217            for (int j = 0; j < 10; j++)
 1218            {
 1219                                for (int k = 0; k < 10; k++)
 1221                {
 1222                    character = (char)random.nextInt (127);
 1223                    if (character >= ' ')
 1224                    {
 1225                        if ('&' == character)
 1226                        {
 1227                            stimulus.append (character);
 1228                            response.append ("&");
 1229                        }
 1230                        else if ('"' == character)
 1231                        {
 1232                            stimulus.append (character);
 1233                            response.append (""");
 1234                        }
 1235                        else if ('<' == character)
 1236                        {
 1237                            stimulus.append (character);
 1238                            response.append ("<");
 1239                        }
 1240                        else if ('>' == character)
 1241                        {
 1242                            stimulus.append (character);
 1243                            response.append (">");
 1244                        }
 1245                        else
 1246                        {
 1247                            stimulus.append (character);
 1248                            response.append (character);
 1249                        }
 1250                    }
 1251                }
 1252                ref = list[random.nextInt (list.length)];
 1253                stimulus.append ((char)ref.getCharacter ());
 1254                response.append ("&");
 1255                response.append (ref.getKernel ());
 1256                response.append (";");
 1257                                for (int k = 0; k < 10; k++)
 1259                {
 1260                    character = (char)random.nextInt (127);
 1261                    if (character >= ' ')
 1262                    {
 1263                        if ('&' == character)
 1264                        {
 1265                            stimulus.append (character);
 1266                            response.append ("&");
 1267                        }
 1268                        else if ('"' == character)
 1269                        {
 1270                            stimulus.append (character);
 1271                            response.append (""");
 1272                        }
 1273                        else if ('<' == character)
 1274                        {
 1275                            stimulus.append (character);
 1276                            response.append ("<");
 1277                        }
 1278                        else if ('>' == character)
 1279                        {
 1280                            stimulus.append (character);
 1281                            response.append (">");
 1282                        }
 1283                        else
 1284                        {
 1285                            stimulus.append (character);
 1286                            response.append (character);
 1287                        }
 1288                    }
 1289                }
 1290            }
 1291            string = Translate.encode (stimulus.toString ());
 1292            if (!string.equals (response.toString ()))
 1293                fail ("encoding incorrect, expected \n\"" + response.toString () + "\", encoded \n\"" + string + "\"");
 1294            string = Translate.decode (string);
 1295            if (!string.equals (stimulus.toString ()))
 1296                fail ("decoding incorrect, expected \n\"" + stimulus.toString () + "\", decoded \n\"" + string + "\", encoded \n\"" + response.toString () + "\"");
 1297            stimulus.setLength (0);
 1298            response.setLength (0);
 1299        }
 1300
 1301    }
 1302
 1303    public void testEncodeDecodeRandomNoSemi ()
 1304    {
 1305        Random
  random; 1306        CharacterReference[] list;
 1307        StringBuffer
  stimulus; 1308        StringBuffer
  response; 1309        char character;
 1310        int index;
 1311        CharacterReference ref;
 1312        String
  kernel; 1313        ArrayList
  forbidden; 1314        String
  string; 1315
 1316        random = new Random
  (); 1317        list = getReferences ();
 1318        stimulus = new StringBuffer
  (); 1319        response = new StringBuffer
  (); 1320        for (int i = 0; i < 1000; i++)
 1321        {
 1322            for (int j = 0; j < 10; j++)
 1323            {
 1324                                for (int k = 0; k < 10; k++)
 1326                {
 1327                    character = (char)random.nextInt (127);
 1328                    if (character >= ' ')
 1329                    {
 1330                        if ('&' == character)
 1331                        {
 1332                            stimulus.append (character);
 1333                            response.append ("&");
 1334                        }
 1335                        else if ('"' == character)
 1336                        {
 1337                            stimulus.append (character);
 1338                            response.append (""");
 1339                        }
 1340                        else if ('<' == character)
 1341                        {
 1342                            stimulus.append (character);
 1343                            response.append ("<");
 1344                        }
 1345                        else if ('>' == character)
 1346                        {
 1347                            stimulus.append (character);
 1348                            response.append (">");
 1349                        }
 1350                        else
 1351                        {
 1352                            stimulus.append (character);
 1353                            response.append (character);
 1354                        }
 1355                    }
 1356                }
 1357                index = random.nextInt (list.length);
 1358                ref = list[index];
 1359                kernel = ref.getKernel ();
 1360                stimulus.append ((char)ref.getCharacter ());
 1361                response.append ("&");
 1362                response.append (kernel);
 1363                                                                forbidden = new ArrayList
  (); 1367                for (int k = index + 1; k < list.length; k++)
 1368                    if (list[k].getKernel ().regionMatches (
 1369                        0,
 1370                        kernel,
 1371                        0,
 1372                        kernel.length ()))
 1373                        forbidden.add (new Character
  (list[k].getKernel ().charAt (kernel.length ()))); 1374                    else
 1375                        break;
 1376                do
 1377                {
 1378                    character = (char)random.nextInt (127);
 1379                    if (   (' ' <= character)
 1380                        && ('&' != character)
 1381                        && ('"' != character)
 1382                        && ('<' != character)
 1383                        && ('>' != character)
 1384                        && (';' != character)
 1385                        && !(forbidden.contains (new Character
  (character)))) 1386                    {
 1387                        stimulus.append (character);
 1388                        response.append (character);
 1389                        character = 0;
 1390                    }
 1391                    else
 1392                        character = ' ';
 1393
 1394                }
 1395                while (0 != character);
 1396                                for (int k = 0; k < 10; k++)
 1398                {
 1399                    character = (char)random.nextInt (127);
 1400                    if (character >= ' ')
 1401                    {
 1402                        if ('&' == character)
 1403                        {
 1404                            stimulus.append (character);
 1405                            response.append ("&");
 1406                        }
 1407                        else if ('"' == character)
 1408                        {
 1409                            stimulus.append (character);
 1410                            response.append (""");
 1411                        }
 1412                        else if ('<' == character)
 1413                        {
 1414                            stimulus.append (character);
 1415                            response.append ("<");
 1416                        }
 1417                        else if ('>' == character)
 1418                        {
 1419                            stimulus.append (character);
 1420                            response.append (">");
 1421                        }
 1422                        else
 1423                        {
 1424                            stimulus.append (character);
 1425                            response.append (character);
 1426                        }
 1427                    }
 1428                }
 1429            }
 1430            string = Translate.decode (response.toString ());
 1431            if (!string.equals (stimulus.toString ()))
 1432                fail ("decoding incorrect:\nexpected \"" + stimulus.toString () + "\"\n decoded \"" + string + "\"\n encoded \"" + response.toString () + "\"");
 1433            stimulus.setLength (0);
 1434            response.setLength (0);
 1435        }
 1436    }
 1437}
 1438
 1439
 1440
 1441
                                                                                                                                                                                                             |                                                                       
 
 
 
 
 
                                                                                   Popular Tags                                                                                                                                                                                              |