KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > tests > utilTests > CharacterTranslationTest


// HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests/CharacterTranslationTest.java,v $
// $Author: derrickoswald $
// $Date: 2004/07/31 16:42:32 $
// $Revision: 1.46 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//


27 package org.htmlparser.tests.utilTests;
28
29 import java.io.ByteArrayInputStream JavaDoc;
30 import java.io.ByteArrayOutputStream JavaDoc;
31 import java.io.File JavaDoc;
32 import java.io.FileInputStream JavaDoc;
33 import java.io.FileWriter JavaDoc;
34 import java.io.IOException JavaDoc;
35 import java.io.InputStream JavaDoc;
36 import java.io.PrintStream JavaDoc;
37 import java.io.PrintWriter JavaDoc;
38 import java.lang.reflect.Field JavaDoc;
39 import java.net.URL JavaDoc;
40 import java.net.URLConnection JavaDoc;
41 import java.util.ArrayList JavaDoc;
42 import java.util.Random JavaDoc;
43
44 import org.htmlparser.Node;
45 import org.htmlparser.Parser;
46 import org.htmlparser.Remark;
47 import org.htmlparser.Tag;
48 import org.htmlparser.Text;
49 import org.htmlparser.tags.LinkTag;
50 import org.htmlparser.tests.ParserTestCase;
51 import org.htmlparser.util.CharacterReference;
52 import org.htmlparser.util.NodeIterator;
53 import org.htmlparser.util.NodeList;
54 import org.htmlparser.util.ParserException;
55 import org.htmlparser.util.Translate;
56 import org.htmlparser.util.sort.Sort;
57
58 public class CharacterTranslationTest
59     extends
60         ParserTestCase
61 {
62     static
63     {
64         System.setProperty ("org.htmlparser.tests.utilTests.CharacterTranslationTest", "CharacterTranslationTest");
65     }
66
67     /**
68      * The list of references.
69      */

70     protected static CharacterReference[] mReferences;
71     
72     public CharacterTranslationTest (String JavaDoc name)
73     {
74         super (name);
75     }
76
77     /**
78      * Class loader to access the compiled character references.
79      */

80     class SimpleClassLoader extends ClassLoader JavaDoc
81     {
82         /**
83          * The class path for this class loader.
84          */

85         String JavaDoc mRoot;
86
87         public SimpleClassLoader (String JavaDoc root)
88         {
89             if (!root.endsWith (File.separator))
90                 root += File.separator;
91             mRoot = root;
92         }
93
94         public Class JavaDoc loadClass (String JavaDoc className)
95             throws
96                 ClassNotFoundException JavaDoc
97         {
98             return (loadClass (className, true));
99         }
100         
101         public synchronized Class JavaDoc loadClass (String JavaDoc className, boolean resolveIt)
102             throws
103                 ClassNotFoundException JavaDoc
104         {
105             byte data[];
106             FileInputStream JavaDoc in;
107             Class JavaDoc ret;
108             
109             try
110             {
111                 // try system class loader
112
ret = super.findSystemClass (className);
113             }
114             catch (ClassNotFoundException JavaDoc e)
115             {
116                 try
117                 {
118                     in = new FileInputStream JavaDoc (mRoot + className + ".class");
119                     data = new byte[in.available ()];
120                     in.read (data);
121                     in.close ();
122                     ret = defineClass (className, data, 0, data.length);
123                     if (null == ret)
124                         throw new ClassFormatError JavaDoc ();
125                     if (resolveIt)
126                         resolveClass (ret);
127                 }
128                 catch (IOException JavaDoc ioe)
129                 {
130                     throw new ClassNotFoundException JavaDoc ();
131                 }
132             }
133             
134             return (ret);
135         }
136     }
137
138     /**
139      * Create a character reference translation class source file.
140      * Usage:
141      * <pre>
142      * java -classpath .:lib/htmlparser.jar Generate > Translate.java
143      * </pre>
144      * Derived from HTMLStringFilter.java provided as an example with the
145      * htmlparser.jar file available at
146      * <a HREF="http://htmlparser.sourceforge.net">htmlparser.sourceforge.net</a>
147      * written by Somik Raha (
148      * <a HREF='mailto:somik@industriallogic.com?
149      * subject=htmlparser'>somik@industriallogic. com</a>
150      * <a HREF="http://industriallogic.com">http://industriallogic.com</a>).
151      * @author <a HREF='mailto:DerrickOswald@users.sourceforge.net?subject=Character Reference Translation class'>Derrick Oswald</a>
152      */

153     public class Generate
154     {
155         /**
156          * The working parser.
157          */

158         protected Parser mParser;
159
160         protected String JavaDoc nl = System.getProperty ("line.separator", "\n");
161         
162         /**
163          * Create a Generate object.
164          * Sets up the generation by creating a new <code>Parser</code> pointed
165          * at <a HREF="http://www.w3.org/TR/REC-html40/sgml/entities.html">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>
166          * with the standard scanners registered.
167          */

168         public Generate ()
169             throws ParserException
170         {
171             mParser = new Parser ("http://www.w3.org/TR/REC-html40/sgml/entities.html");
172         }
173
174         /**
175          * Translate character references.
176          * After generating the Translate class we could use it
177          * to do this job, but that would involve a bootstrap
178          * problem, so this method does the reference conversion
179          * for a very tiny subset (enough to understand the w3.org
180          * page).
181          * @param string The raw string.
182          * @return The string with character references fixed.
183          */

184         public String JavaDoc translate (String JavaDoc string)
185         {
186             int index;
187             int amp;
188             StringBuffer JavaDoc ret;
189
190             ret = new StringBuffer JavaDoc (4096);
191
192             index = 0;
193             while ((index < string.length ()) && (-1 != (amp = string.indexOf ('&', index))))
194             {
195                 // include the part before the special character
196
ret.append (string.substring (index, amp));
197                 if (string.startsWith ("&nbsp;", amp))
198                 {
199                     ret.append (" ");
200                     index = amp + 6;
201                 }
202                 else if (string.startsWith ("&lt;", amp))
203                 {
204                     ret.append ("<");
205                     index = amp + 4;
206                 }
207                 else if (string.startsWith ("&gt;", amp))
208                 {
209                     ret.append (">");
210                     index = amp + 4;
211                 }
212                 else if (string.startsWith ("&amp;", amp))
213                 {
214                     ret.append ("&");
215                     index = amp + 5;
216                 }
217                 else if (string.startsWith ("&quote;", amp))
218                 {
219                     ret.append ("\"");
220                     index = amp + 7;
221                 }
222                 else if (string.startsWith ("&divide;", amp))
223                 {
224                     //ret.append ('\u00F7');
225
//index = amp + 8;
226
ret.append ("&");
227                     index = amp + 1;
228                 }
229                 else if (string.startsWith ("&copy;", amp))
230                 {
231                     //ret.append ('\u00A9');
232
//index = amp + 6;
233
ret.append ("&");
234                     index = amp + 1;
235                 }
236                 else
237                 {
238                     System.out.println ("unknown special character starting with " + string.substring (amp, amp + 7));
239                     ret.append ("&");
240                     index = amp + 1;
241                 }
242             }
243             ret.append (string.substring (index));
244
245             return (ret.toString ());
246         }
247
248         public void gather (Node node, StringBuffer JavaDoc buffer)
249         {
250             NodeList children;
251
252             if (node instanceof Text)
253             {
254                 // Node is a plain string
255
// Cast it to an HTMLText
256
Text stringNode = (Text)node;
257                 // Retrieve the data from the object
258
buffer.append (stringNode.getText ());
259             }
260             else if (node instanceof LinkTag)
261             {
262                 // Node is a link
263
// Cast it to an HTMLLinkTag
264
LinkTag linkNode = (LinkTag)node;
265                 // Retrieve the data from the object and print it
266
buffer.append (linkNode.getLinkText ());
267             }
268             else if (node instanceof Tag)
269             {
270                 String JavaDoc name = ((Tag)node).getTagName ();
271                 if (name.equals ("BR") || name.equals ("P"))
272                     buffer.append (nl);
273                 else
274                 {
275                     children = ((Tag)node).getChildren ();
276                     if (null != children)
277                         for (int i = 0; i < children.size (); i++)
278                             gather (children.elementAt (i), buffer);
279                 }
280             }
281             else if (node instanceof Remark)
282             {
283             }
284             else
285             {
286                 System.out.println ();
287                 System.out.println(node.toString());
288             }
289         }
290
291         /**
292          * Find the lowest index of whitespace (space or newline).
293          * @param string The string to look in.
294          * @param index Where to start looking.
295          * @return -1 if there is no whitespace, the minimum index otherwise.
296          */

297         public int indexOfWhitespace (String JavaDoc string, int index)
298         {
299             int space;
300             int cr;
301             int ret;
302
303             space = string.indexOf (" ", index);
304             cr = string.indexOf (nl, index);
305             if (-1 == space)
306                 ret = cr;
307             else if (-1 == cr)
308                 ret = space;
309             else
310                 ret = Math.min (space, cr);
311
312             return (ret);
313         }
314
315         /**
316          * Rewrite the comment string.
317          * In the sgml table, the comments are of the form:
318          * <pre>
319          * -- latin capital letter I with diaeresis,
320          * U+00CF ISOlat1
321          * </pre>
322          * so we just want to make a one-liner without the spaces and newlines.
323          * @param string The raw comment.
324          * @return The single line comment.
325          */

326         public String JavaDoc pack (String JavaDoc string)
327         {
328             int index;
329             int spaces;
330             StringBuffer JavaDoc ret;
331
332             ret = new StringBuffer JavaDoc (string.length ());
333
334             if (string.startsWith ("-- "))
335                 string = string.substring (3);
336             // remove doublespaces
337
index = 0;
338             while ((index < string.length ()) && (-1 != (spaces = indexOfWhitespace (string, index))))
339             {
340                 ret.append (string.substring (index, spaces));
341                 ret.append (" ");
342                 while ((spaces < string.length ()) && Character.isWhitespace (string.charAt (spaces)))
343                     spaces++;
344                 index = spaces;
345             }
346             if (index < string.length ())
347                 ret.append (string.substring (index));
348
349             return (ret.toString ());
350         }
351
352         /**
353          * Pretty up a comment string.
354          * @param string The comment to operate on.
355          * @return The beautiful comment string.
356          */

357         public String JavaDoc pretty (String JavaDoc string)
358         {
359             int index;
360             int spaces;
361             StringBuffer JavaDoc ret;
362
363             ret = new StringBuffer JavaDoc (string.length ());
364
365             // newline instead of doublespaces
366
index = 0;
367             while ((index < string.length ()) && (-1 != (spaces = string.indexOf (" ", index))))
368             {
369                 ret.append (" // " + string.substring (index, spaces));
370                 if (!string.substring (index, spaces).endsWith (nl))
371                     ret.append (nl);
372                 while ((spaces < string.length ()) && Character.isWhitespace (string.charAt (spaces)))
373                     spaces++;
374                 index = spaces;
375             }
376             if (index < string.length ())
377                 ret.append (" // " + string.substring (index));
378
379             return (ret.toString ());
380         }
381
382         /**
383          * Pad a string on the left with the given character to the length specified.
384          * @param string The string to pad
385          * @param character The character to pad with.
386          * @param length The size to pad to.
387          * @return The padded string.
388          */

389         public String JavaDoc pad (String JavaDoc string, char character, int length)
390         {
391             StringBuffer JavaDoc ret;
392
393             ret = new StringBuffer JavaDoc (length);
394             ret.append (string);
395             while (length > ret.length ())
396                 ret.insert (0, character);
397
398             return (ret.toString ());
399         }
400
401         /**
402          * Convert the textual representation of the numeric character reference to a character.
403          * @param string The numeric character reference (in quotes).
404          * @return The character represented by the numeric character reference.
405          *
406          */

407         public String JavaDoc unicode (String JavaDoc string)
408         {
409             int code;
410
411             if (string.startsWith ("\"&#") && string.endsWith (";\""))
412             {
413                 string = string.substring (3, string.length () - 2);
414                 try
415                 {
416                     code = Integer.parseInt (string);
417                     string = "'\\u" + pad (Integer.toHexString (code), '0', 4) + "'";
418                 }
419                 catch (Exception JavaDoc e)
420                 {
421                     e.printStackTrace ();
422                 }
423                 return (string);
424             }
425             else
426                 return (string);
427         }
428
429         /**
430          * Parse the sgml declaration for character entity reference
431          * name, equivalent numeric character reference and a comment.
432          * Emit a java hash table 'put' with the name as the key, the
433          * numeric character as the value and comment the insertion
434          * with the comment.
435          * @param string The contents of the sgml declaration.
436          * @param out The sink for output.
437          */

438         public void extract (String JavaDoc string, PrintWriter JavaDoc out)
439         {
440             int space;
441             String JavaDoc token;
442             String JavaDoc code;
443
444             if (string.startsWith ("<!--"))
445                 out.println (pretty (string.substring (4, string.length () - 3).trim ()));
446             else if (string.startsWith ("<!ENTITY"))
447             {
448                 string = string.substring (8, string.length () - 3).trim ();
449                 if (-1 != (space = string.indexOf (" ")))
450                 {
451                     token = string.substring (0, space);
452                     string = string.substring (space).trim ();
453                     if (string.startsWith ("CDATA"))
454                     {
455                         string = string.substring (5).trim ();
456                         if (-1 != (space = string.indexOf (" ")))
457                         {
458                             code = string.substring (0, space).trim ();
459                             code = unicode (code);
460                             string = string.substring (space).trim ();
461                             out.println (
462                                 " new CharacterReference (\"" + token + "\","
463                                 // no token is larger than 8 characters - yet
464
+ pad (code, ' ', code.length () + 9 - token.length ()) + "),"
465                                 + " // "
466                                 + pack (string));
467                         }
468                         else
469                             out.println (string);
470                     }
471                     else
472                         out.println (string);
473                 }
474                 else
475                     out.println (string);
476             }
477             else
478                 out.println (string);
479         }
480
481         /**
482          * Extract special characters.
483          * Scan the string looking for substrings of the form:
484          * <pre>
485          * &lt;!ENTITY nbsp CDATA "&amp;#160;" -- no-break space = non-breaking space, U+00A0 ISOnum --&gt;
486          * </pre>
487          * and emit a java definition for each.
488          * @param string The raw string from w3.org.
489          * @param out The sink for output.
490          */

491         public void sgml (String JavaDoc string, PrintWriter JavaDoc out)
492         {
493             int index;
494             int begin;
495             int end;
496
497             index = 0;
498             while (-1 != (begin = string.indexOf ("<", index)))
499             {
500                 if (-1 != (end = string.indexOf ("-->", begin)))
501                 {
502                     extract (string.substring (begin, end + 3), out);
503                     index = end + 3;
504                 }
505                 else
506                     index = begin + 1;
507             }
508         }
509
510         /**
511          * Pull out text elements from the HTML.
512          * @param out The sink for output.
513          */

514         public void parse (PrintWriter JavaDoc out)
515             throws
516                 ParserException
517         {
518             Node node;
519             StringBuffer JavaDoc buffer = new StringBuffer JavaDoc (4096);
520
521             // Run through an enumeration of html elements, and pick up
522
// only those that are plain string.
523
for (NodeIterator e = mParser.elements (); e.hasMoreNodes ();)
524             {
525                 node = e.nextNode ();
526                 gather (node, buffer);
527             }
528
529             String JavaDoc text = translate (buffer.toString ());
530             sgml (text, out);
531         }
532     }
533
534     public CharacterReference[] getReferences ()
535     {
536         final String JavaDoc class_name = "CharacterEntityReferenceList";
537         String JavaDoc paths;
538         String JavaDoc path;
539         String JavaDoc source;
540         PrintWriter JavaDoc out;
541         Generate generate;
542         SimpleClassLoader loader;
543         Class JavaDoc hello;
544         Field JavaDoc field;
545         CharacterReference[] ret;
546
547         ret = mReferences;
548         if (null == ret)
549         {
550             paths = System.getProperty ("java.class.path");
551             path = System.getProperty ("user.home");
552             if (!path.endsWith (File.separator))
553                 path += File.separator;
554             source = path + class_name + ".java";
555             try
556             {
557                 // create it
558
generate = new Generate ();
559                 out = new PrintWriter JavaDoc (new FileWriter JavaDoc (source));
560                 out.println ("import org.htmlparser.util.CharacterReference;");
561                 out.println ();
562                 out.println ("/** Generated by " + this.getClass ().getName () + " **/");
563                 out.println ("public class " + class_name);
564                 out.println ("{");
565                 out.println (" /**");
566                 out.println (" * Table mapping character to entity reference.");
567                 out.println (" */");
568                 out.println (" public static final CharacterReference[] mCharacterReferences =");
569                 out.println (" {");
570                 generate.parse (out);
571                 out.println (" };");
572                 out.println ("}");
573                 out.close ();
574                 // compile it
575
if (0 == com.sun.tools.javac.Main.compile (new String JavaDoc[] {"-classpath", paths, source}))
576                 {
577                     try
578                     {
579                         // load it
580
loader = new SimpleClassLoader (path);
581                         hello = loader.loadClass (class_name);
582                         try
583                         {
584                             // get the references
585
field = hello.getField ("mCharacterReferences");
586                             ret = (CharacterReference[])field.get (null);
587                             Sort.QuickSort (ret);
588                         }
589                         catch (IllegalAccessException JavaDoc iae)
590                         {
591                             fail ("references not accessible");
592                         }
593                         catch (NoSuchFieldException JavaDoc nsfe)
594                         {
595                             fail ("references not found");
596                         }
597                     }
598                     catch (ClassNotFoundException JavaDoc cnfe)
599                     {
600                         fail ("couldn't load class");
601                     }
602                     finally
603                     {
604                         File JavaDoc classfile;
605
606                         classfile = new File JavaDoc (path + class_name + ".class");
607                         classfile.delete ();
608                     }
609                 }
610                 else
611                     fail ("couldn't compile class");
612                 mReferences = ret;
613             }
614             catch (IOException JavaDoc ioe)
615             {
616                 fail ("couldn't write class");
617             }
618             catch (ParserException ioe)
619             {
620                 fail ("couldn't parse w3.org entities list");
621             }
622         }
623         
624         return (ret);
625     }
626
627     public void testInitialCharacterEntityReference ()
628     {
629         assertEquals (
630             "character entity reference at start of string doesn't work",
631             "\u00f7 is the division sign.",
632             Translate.decode ("&divide; is the division sign."));
633     }
634
635     public void testInitialNumericCharacterReference1 ()
636     {
637         assertEquals (
638             "numeric character reference at start of string doesn't work",
639             "\u00f7 is the division sign.",
640             Translate.decode ("&#247; is the division sign."));
641     }
642
643     public void testInitialNumericCharacterReference2 ()
644     {
645         assertEquals (
646             "numeric character reference at start of string doesn't work",
647             "\u00f7 is the division sign.",
648             Translate.decode ("&#0247; is the division sign."));
649     }
650
651     public void testInitialHexNumericCharacterReference1 ()
652     {
653         assertEquals (
654             "numeric character reference at start of string doesn't work",
655             "\u00f7 is the division sign.",
656             Translate.decode ("&#xf7; is the division sign."));
657     }
658
659     public void testInitialHexNumericCharacterReference2 ()
660     {
661         assertEquals (
662             "numeric character reference at start of string doesn't work",
663             "\u00f7 is the division sign.",
664             Translate.decode ("&#xF7; is the division sign."));
665     }
666
667     public void testInitialHexNumericCharacterReference3 ()
668     {
669         assertEquals (
670             "numeric character reference at start of string doesn't work",
671             "\u00f7 is the division sign.",
672             Translate.decode ("&#x0f7; is the division sign."));
673     }
674
675     public void testInitialHexNumericCharacterReference4 ()
676     {
677         assertEquals (
678             "numeric character reference at start of string doesn't work",
679             "\u00f7 is the division sign.",
680             Translate.decode ("&#x0F7; is the division sign."));
681     }
682
683     public void testInitialHexNumericCharacterReference5 ()
684     {
685         assertEquals (
686             "numeric character reference at start of string doesn't work",
687             "\u00f7 is the division sign.",
688             Translate.decode ("&#Xf7; is the division sign."));
689     }
690
691     public void testInitialHexNumericCharacterReference6 ()
692     {
693         assertEquals (
694             "numeric character reference at start of string doesn't work",
695             "\u00f7 is the division sign.",
696             Translate.decode ("&#XF7; is the division sign."));
697     }
698
699     public void testInitialHexNumericCharacterReference7 ()
700     {
701         assertEquals (
702             "numeric character reference at start of string doesn't work",
703             "\u00f7 is the division sign.",
704             Translate.decode ("&#X0f7; is the division sign."));
705     }
706
707     public void testInitialHexNumericCharacterReference8 ()
708     {
709         assertEquals (
710             "numeric character reference at start of string doesn't work",
711             "\u00f7 is the division sign.",
712             Translate.decode ("&#X0F7; is the division sign."));
713     }
714
715     public void testInitialCharacterEntityReferenceWithoutSemi ()
716     {
717         assertEquals (
718             "character entity reference without a semicolon at start of string doesn't work",
719             "\u00f7 is the division sign.",
720             Translate.decode ("&divide is the division sign."));
721     }
722
723     public void testInitialNumericCharacterReferenceWithoutSemi ()
724     {
725         assertEquals (
726             "numeric character reference without a semicolon at start of string doesn't work",
727             "\u00f7 is the division sign.",
728             Translate.decode ("&#247 is the division sign."));
729     }
730
731     public void testInitialHexNumericCharacterReferenceWithoutSemi1 ()
732     {
733         assertEquals (
734             "numeric character reference without a semicolon at start of string doesn't work",
735             "\u00f7 is the division sign.",
736             Translate.decode ("&#xf7 is the division sign."));
737     }
738
739     public void testInitialHexNumericCharacterReferenceWithoutSemi2 ()
740     {
741         assertEquals (
742             "numeric character reference without a semicolon at start of string doesn't work",
743             "\u00f7 is the division sign.",
744             Translate.decode ("&#xF7 is the division sign."));
745     }
746
747     public void testInitialHexNumericCharacterReferenceWithoutSemi3 ()
748     {
749         assertEquals (
750             "numeric character reference without a semicolon at start of string doesn't work",
751             "\u00f7 is the division sign.",
752             Translate.decode ("&#x0f7 is the division sign."));
753     }
754
755     public void testInitialHexNumericCharacterReferenceWithoutSemi4 ()
756     {
757         assertEquals (
758             "numeric character reference without a semicolon at start of string doesn't work",
759             "\u00f7 is the division sign.",
760             Translate.decode ("&#x0F7 is the division sign."));
761     }
762
763     public void testInitialHexNumericCharacterReferenceWithoutSemi5 ()
764     {
765         assertEquals (
766             "numeric character reference without a semicolon at start of string doesn't work",
767             "\u00f7 is the division sign.",
768             Translate.decode ("&#Xf7 is the division sign."));
769     }
770
771     public void testInitialHexNumericCharacterReferenceWithoutSemi6 ()
772     {
773         assertEquals (
774             "numeric character reference without a semicolon at start of string doesn't work",
775             "\u00f7 is the division sign.",
776             Translate.decode ("&#XF7 is the division sign."));
777     }
778
779     public void testInitialHexNumericCharacterReferenceWithoutSemi7 ()
780     {
781         assertEquals (
782             "numeric character reference without a semicolon at start of string doesn't work",
783             "\u00f7 is the division sign.",
784             Translate.decode ("&#X0f7 is the division sign."));
785     }
786
787     public void testInitialHexNumericCharacterReferenceWithoutSemi8 ()
788     {
789         assertEquals (
790             "numeric character reference without a semicolon at start of string doesn't work",
791             "\u00f7 is the division sign.",
792             Translate.decode ("&#X0F7 is the division sign."));
793     }
794
795     public void testFinalCharacterEntityReference ()
796     {
797         assertEquals (
798             "character entity reference at end of string doesn't work",
799             "The division sign (\u00f7) is \u00f7",
800             Translate.decode ("The division sign (\u00f7) is &divide;"));
801     }
802
803     public void testFinalNumericCharacterReference ()
804     {
805         assertEquals (
806             "numeric character reference at end of string doesn't work",
807             "The division sign (\u00f7) is \u00f7",
808             Translate.decode ("The division sign (\u00f7) is &#247;"));
809     }
810
811     public void testFinalHexNumericCharacterReference1 ()
812     {
813         assertEquals (
814             "numeric character reference at end of string doesn't work",
815             "The division sign (\u00f7) is \u00f7",
816             Translate.decode ("The division sign (\u00f7) is &#xf7;"));
817     }
818
819     public void testFinalHexNumericCharacterReference2 ()
820     {
821         assertEquals (
822             "numeric character reference at end of string doesn't work",
823             "The division sign (\u00f7) is \u00f7",
824             Translate.decode ("The division sign (\u00f7) is &#xF7;"));
825     }
826
827     public void testFinalHexNumericCharacterReference3 ()
828     {
829         assertEquals (
830             "numeric character reference at end of string doesn't work",
831             "The division sign (\u00f7) is \u00f7",
832             Translate.decode ("The division sign (\u00f7) is &#x0f7;"));
833     }
834
835     public void testFinalHexNumericCharacterReference4 ()
836     {
837         assertEquals (
838             "numeric character reference at end of string doesn't work",
839             "The division sign (\u00f7) is \u00f7",
840             Translate.decode ("The division sign (\u00f7) is &#x0F7;"));
841     }
842
843     public void testFinalHexNumericCharacterReference5 ()
844     {
845         assertEquals (
846             "numeric character reference at end of string doesn't work",
847             "The division sign (\u00f7) is \u00f7",
848             Translate.decode ("The division sign (\u00f7) is &#Xf7;"));
849     }
850
851     public void testFinalHexNumericCharacterReference6 ()
852     {
853         assertEquals (
854             "numeric character reference at end of string doesn't work",
855             "The division sign (\u00f7) is \u00f7",
856             Translate.decode ("The division sign (\u00f7) is &#XF7;"));
857     }
858
859     public void testFinalHexNumericCharacterReference7 ()
860     {
861         assertEquals (
862             "numeric character reference at end of string doesn't work",
863             "The division sign (\u00f7) is \u00f7",
864             Translate.decode ("The division sign (\u00f7) is &#X0f7;"));
865     }
866
867     public void testFinalHexNumericCharacterReference8 ()
868     {
869         assertEquals (
870             "numeric character reference at end of string doesn't work",
871             "The division sign (\u00f7) is \u00f7",
872             Translate.decode ("The division sign (\u00f7) is &#X0F7;"));
873     }
874
875     public void testFinalCharacterEntityReferenceWithoutSemi ()
876     {
877         assertEquals (
878             "character entity reference without a semicolon at end of string doesn't work",
879             "The division sign (\u00f7) is \u00f7",
880             Translate.decode ("The division sign (\u00f7) is &divide"));
881     }
882
883     public void testFinalNumericCharacterReferenceWithoutSemi1 ()
884     {
885         assertEquals (
886             "numeric character reference without a semicolon at end of string doesn't work",
887             "The division sign (\u00f7) is \u00f7",
888             Translate.decode ("The division sign (\u00f7) is &#247"));
889     }
890
891     public void testFinalNumericCharacterReferenceWithoutSemi2 ()
892     {
893         assertEquals (
894             "numeric character reference without a semicolon at end of string doesn't work",
895             "The division sign (\u00f7) is \u00f7",
896             Translate.decode ("The division sign (\u00f7) is &#0247"));
897     }
898
899     public void testFinalHexNumericCharacterReferenceWithoutSemi1 ()
900     {
901         assertEquals (
902             "numeric character reference without a semicolon at end of string doesn't work",
903             "The division sign (\u00f7) is \u00f7",
904             Translate.decode ("The division sign (\u00f7) is &#xf7"));
905     }
906
907     public void testFinalHexNumericCharacterReferenceWithoutSemi2 ()
908     {
909         assertEquals (
910             "numeric character reference without a semicolon at end of string doesn't work",
911             "The division sign (\u00f7) is \u00f7",
912             Translate.decode ("The division sign (\u00f7) is &#xF7"));
913     }
914
915     public void testFinalHexNumericCharacterReferenceWithoutSemi3 ()
916     {
917         assertEquals (
918             "numeric character reference without a semicolon at end of string doesn't work",
919             "The division sign (\u00f7) is \u00f7",
920             Translate.decode ("The division sign (\u00f7) is &#x0f7"));
921     }
922
923     public void testFinalHexNumericCharacterReferenceWithoutSemi4 ()
924     {
925         assertEquals (
926             "numeric character reference without a semicolon at end of string doesn't work",
927             "The division sign (\u00f7) is \u00f7",
928             Translate.decode ("The division sign (\u00f7) is &#x0F7"));
929     }
930
931     public void testFinalHexNumericCharacterReferenceWithoutSemi5 ()
932     {
933         assertEquals (
934             "numeric character reference without a semicolon at end of string doesn't work",
935             "The division sign (\u00f7) is \u00f7",
936             Translate.decode ("The division sign (\u00f7) is &#Xf7"));
937     }
938
939     public void testFinalHexNumericCharacterReferenceWithoutSemi6 ()
940     {
941         assertEquals (
942             "numeric character reference without a semicolon at end of string doesn't work",
943             "The division sign (\u00f7) is \u00f7",
944             Translate.decode ("The division sign (\u00f7) is &#XF7"));
945     }
946
947     public void testFinalHexNumericCharacterReferenceWithoutSemi7 ()
948     {
949         assertEquals (
950             "numeric character reference without a semicolon at end of string doesn't work",
951             "The division sign (\u00f7) is \u00f7",
952             Translate.decode ("The division sign (\u00f7) is &#X0f7"));
953     }
954
955     public void testFinalHexNumericCharacterReferenceWithoutSemi8 ()
956     {
957         assertEquals (
958             "numeric character reference without a semicolon at end of string doesn't work",
959             "The division sign (\u00f7) is \u00f7",
960             Translate.decode ("The division sign (\u00f7) is &#X0F7"));
961     }
962
963     public void testReferencesInString ()
964     {
965         assertEquals (
966             "character references within a string don't work",
967             "Thus, the character entity reference \u00f7 is a more convenient form than \u00f7 for obtaining the division sign (\u00f7)",
968             Translate.decode ("Thus, the character entity reference &divide; is a more convenient form than &#247; for obtaining the division sign (\u00f7)"));
969     }
970
971     public void testBogusCharacterEntityReference1 ()
972     {
973         assertEquals (
974             "bogus character entity reference doesn't work",
975             "The character entity reference &divode; is bogus",
976             Translate.decode ("The character entity reference &divode; is bogus"));
977     }
978
979     public void testBogusCharacterEntityReference2 ()
980     {
981         assertEquals (
982             "bogus character entity reference doesn't work",
983             "The character entity reference &(divide) is bogus",
984             Translate.decode ("The character entity reference &(divide) is bogus"));
985     }
986
987     public void testBogusNumericCharacterReference ()
988     {
989         assertEquals (
990             "bogus numeric character reference doesn't work",
991             "The numeric character reference &#BF7; is bogus",
992             Translate.decode ("The numeric character reference &#BF7; is bogus"));
993     }
994
995     public void testBogusHexNumericCharacterReference ()
996     {
997         assertEquals (
998             "bogus numeric character reference doesn't work",
999             "The numeric character reference &#xKJ7; is bogus",
1000            Translate.decode ("The numeric character reference &#xKJ7; is bogus"));
1001    }
1002
1003    public void testPoorlyTerminatedCharacterEntityReference1 ()
1004    {
1005        assertEquals (
1006            "poorly terminated character entity reference doesn't work",
1007            "The character entity reference \u00f7d should be decoded",
1008            Translate.decode ("The character entity reference &divided should be decoded"));
1009    }
1010
1011    public void testPoorlyTerminatedCharacterEntityReference2 ()
1012    {
1013        assertEquals (
1014            "poorly terminated character entity reference doesn't work",
1015            "The character entity reference \u00f7<br> should be decoded",
1016            Translate.decode ("The character entity reference &divide<br> should be decoded"));
1017    }
1018
1019    public void testPoorlyTerminatedNumericCharacterReference1 ()
1020    {
1021        assertEquals (
1022            "poorly terminated numeric character reference doesn't work",
1023            "The numeric character reference \u00f7pop should be decoded",
1024            Translate.decode ("The numeric character reference &#xf7pop should be decoded"));
1025    }
1026
1027    public void testPoorlyTerminatedNumericCharacterReference2 ()
1028    {
1029        assertEquals (
1030            "poorly terminated numeric character reference doesn't work",
1031            "The numeric character reference \u00f7<br> should be decoded",
1032            Translate.decode ("The numeric character reference &#xf7<br> should be decoded"));
1033    }
1034
1035    public void testPoorlyTerminatedNumericCharacterReference3 ()
1036    {
1037        assertEquals (
1038            "poorly terminated numeric character reference doesn't work",
1039            "The numeric character reference \u00f7xpert should be decoded",
1040            Translate.decode ("The numeric character reference &#xf7xpert should be decoded"));
1041    }
1042
1043    public void testEncode ()
1044    {
1045        assertEquals (
1046            "encode doesn't work",
1047            "Character entity reference: &divide;, another: &nbsp;, numeric character reference: &#9831;.",
1048            Translate.encode ("Character entity reference: \u00f7, another: \u00a0, numeric character reference: \u2667."));
1049    }
1050
1051    public void testEncodeLink ()
1052    {
1053        assertEquals (
1054            "encode link doesn't work",
1055            "&lt;a HREF=&quot;http://www.w3.org/TR/REC-html40/sgml/entities.html&quot;&gt;http://www.w3.org/TR/REC-html40/sgml/entities.html&lt;/a&gt;",
1056            Translate.encode ("<a HREF=\"http://www.w3.org/TR/REC-html40/sgml/entities.html\">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>"));
1057    }
1058
1059    public byte[] encodedecode (byte[] bytes)
1060        throws
1061            IOException JavaDoc
1062    {
1063        InputStream JavaDoc in;
1064        ByteArrayOutputStream JavaDoc out;
1065        byte[] data;
1066
1067        // encode
1068
in = new ByteArrayInputStream JavaDoc (bytes);
1069        out = new ByteArrayOutputStream JavaDoc ();
1070        Translate.encode (in, new PrintStream JavaDoc (out, false, "ISO-8859-1"));
1071        in.close ();
1072        out.close ();
1073        data = out.toByteArray ();
1074
1075        // decode
1076
in = new ByteArrayInputStream JavaDoc (data);
1077        out = new ByteArrayOutputStream JavaDoc ();
1078        Translate.decode (in, new PrintStream JavaDoc (out, false, "ISO-8859-1"));
1079        in.close ();
1080        out.close ();
1081        data = out.toByteArray ();
1082
1083        return (data);
1084    }
1085
1086    public void check (byte[] reference, byte[] result)
1087        throws
1088            IOException JavaDoc
1089    {
1090        InputStream JavaDoc ref;
1091        InputStream JavaDoc in;
1092        int i;
1093        int i1;
1094        int i2;
1095
1096        ref = new ByteArrayInputStream JavaDoc (reference);
1097        in = new ByteArrayInputStream JavaDoc (result);
1098        i = 0;
1099        do
1100        {
1101            i1 = ref.read ();
1102            i2 = in.read ();
1103            if (i1 != i2)
1104                fail ("byte difference detected at offset " + i + " expected " + i1 + ", actual " + i2);
1105            i++;
1106        }
1107        while (-1 != i1);
1108        ref.close ();
1109        in.close ();
1110    }
1111
1112    public void testHexNumericEncoding ()
1113        throws
1114            IOException JavaDoc
1115    {
1116        try
1117        {
1118            Translate.ENCODE_HEXADECIMAL = true;
1119            assertEquals (
1120                "hex value incorrect",
1121                "&#x5ab; is a non-existant character.",
1122                Translate.encode ("\u05AB is a non-existant character."));
1123        }
1124        finally
1125        {
1126            Translate.ENCODE_HEXADECIMAL = false;
1127        }
1128    }
1129
1130    public void testLastCharacterEntityReference ()
1131        throws
1132            IOException JavaDoc
1133    {
1134        assertEquals (
1135            "poorly terminated numeric character reference doesn't work",
1136            "The character entity reference\u200cshould be decoded",
1137            Translate.decode ("The character entity reference&zwnjshould be decoded"));
1138    }
1139
1140    public void testEncodeDecodePage () throws IOException JavaDoc
1141    {
1142        URL JavaDoc url;
1143        URLConnection JavaDoc connection;
1144        InputStream JavaDoc in;
1145        ByteArrayOutputStream JavaDoc out;
1146        byte[] bytes;
1147        byte[] result;
1148        int c;
1149
1150        // get some bytes
1151
url = new URL JavaDoc ("http://sourceforge.net/projects/htmlparser");
1152        connection = url.openConnection ();
1153        in = connection.getInputStream ();
1154        out = new ByteArrayOutputStream JavaDoc ();
1155        while (-1 != (c = in.read ()))
1156            out.write (c);
1157        in.close ();
1158        out.close ();
1159        bytes = out.toByteArray ();
1160
1161        // run it through
1162
result = encodedecode (bytes);
1163        
1164        // check
1165
check (bytes, result);
1166    }
1167
1168    /**
1169     * Check all references read in from the w3.org site.
1170     * If this test fails but the others pass, suspect that the list of
1171     * entity references has been augmented. The updated list is in the
1172     * CharacterEntityReferenceList.java file in your home directory.
1173     */

1174    public void testEncodeDecodeAll ()
1175    {
1176        CharacterReference[] list;
1177        StringBuffer JavaDoc stimulus;
1178        StringBuffer JavaDoc response;
1179        CharacterReference ref;
1180        String JavaDoc string;
1181
1182        list = getReferences ();
1183        stimulus = new StringBuffer JavaDoc ();
1184        response = new StringBuffer JavaDoc ();
1185        for (int i = 0; i < list.length; i++)
1186        {
1187            ref = list[i];
1188            stimulus.append ((char)ref.getCharacter ());
1189            response.append ("&");
1190            response.append (ref.getKernel ());
1191            response.append (";");
1192        }
1193        string = Translate.encode (stimulus.toString ());
1194        if (!string.equals (response.toString ()))
1195            fail ("encoding incorrect, expected \n\"" + response.toString () + "\", encoded \n\"" + string + "\"");
1196        string = Translate.decode (string);
1197        if (!string.equals (stimulus.toString ()))
1198            fail ("decoding incorrect, expected \n\"" + stimulus.toString () + "\", decoded \n\"" + string + "\", encoded \n\"" + response.toString () + "\"");
1199    }
1200
1201    public void testEncodeDecodeRandom ()
1202    {
1203        Random JavaDoc random;
1204        CharacterReference[] list;
1205        StringBuffer JavaDoc stimulus;
1206        StringBuffer JavaDoc response;
1207        char character;
1208        CharacterReference ref;
1209        String JavaDoc string;
1210
1211        random = new Random JavaDoc ();
1212        list = getReferences ();
1213        stimulus = new StringBuffer JavaDoc ();
1214        response = new StringBuffer JavaDoc ();
1215        for (int i = 0; i < 1000; i++)
1216        {
1217            for (int j = 0; j < 10; j++)
1218            {
1219                // some random characters
1220
for (int k = 0; k < 10; k++)
1221                {
1222                    character = (char)random.nextInt (127);
1223                    if (character >= ' ')
1224                    {
1225                        if ('&' == character)
1226                        {
1227                            stimulus.append (character);
1228                            response.append ("&amp;");
1229                        }
1230                        else if ('"' == character)
1231                        {
1232                            stimulus.append (character);
1233                            response.append ("&quot;");
1234                        }
1235                        else if ('<' == character)
1236                        {
1237                            stimulus.append (character);
1238                            response.append ("&lt;");
1239                        }
1240                        else if ('>' == character)
1241                        {
1242                            stimulus.append (character);
1243                            response.append ("&gt;");
1244                        }
1245                        else
1246                        {
1247                            stimulus.append (character);
1248                            response.append (character);
1249                        }
1250                    }
1251                }
1252                ref = list[random.nextInt (list.length)];
1253                stimulus.append ((char)ref.getCharacter ());
1254                response.append ("&");
1255                response.append (ref.getKernel ());
1256                response.append (";");
1257                // some more random characters
1258
for (int k = 0; k < 10; k++)
1259                {
1260                    character = (char)random.nextInt (127);
1261                    if (character >= ' ')
1262                    {
1263                        if ('&' == character)
1264                        {
1265                            stimulus.append (character);
1266                            response.append ("&amp;");
1267                        }
1268                        else if ('"' == character)
1269                        {
1270                            stimulus.append (character);
1271                            response.append ("&quot;");
1272                        }
1273                        else if ('<' == character)
1274                        {
1275                            stimulus.append (character);
1276                            response.append ("&lt;");
1277                        }
1278                        else if ('>' == character)
1279                        {
1280                            stimulus.append (character);
1281                            response.append ("&gt;");
1282                        }
1283                        else
1284                        {
1285                            stimulus.append (character);
1286                            response.append (character);
1287                        }
1288                    }
1289                }
1290            }
1291            string = Translate.encode (stimulus.toString ());
1292            if (!string.equals (response.toString ()))
1293                fail ("encoding incorrect, expected \n\"" + response.toString () + "\", encoded \n\"" + string + "\"");
1294            string = Translate.decode (string);
1295            if (!string.equals (stimulus.toString ()))
1296                fail ("decoding incorrect, expected \n\"" + stimulus.toString () + "\", decoded \n\"" + string + "\", encoded \n\"" + response.toString () + "\"");
1297            stimulus.setLength (0);
1298            response.setLength (0);
1299        }
1300        
1301    }
1302
1303    public void testEncodeDecodeRandomNoSemi ()
1304    {
1305        Random JavaDoc random;
1306        CharacterReference[] list;
1307        StringBuffer JavaDoc stimulus;
1308        StringBuffer JavaDoc response;
1309        char character;
1310        int index;
1311        CharacterReference ref;
1312        String JavaDoc kernel;
1313        ArrayList JavaDoc forbidden;
1314        String JavaDoc string;
1315
1316        random = new Random JavaDoc ();
1317        list = getReferences ();
1318        stimulus = new StringBuffer JavaDoc ();
1319        response = new StringBuffer JavaDoc ();
1320        for (int i = 0; i < 1000; i++)
1321        {
1322            for (int j = 0; j < 10; j++)
1323            {
1324                // some random characters
1325
for (int k = 0; k < 10; k++)
1326                {
1327                    character = (char)random.nextInt (127);
1328                    if (character >= ' ')
1329                    {
1330                        if ('&' == character)
1331                        {
1332                            stimulus.append (character);
1333                            response.append ("&amp;");
1334                        }
1335                        else if ('"' == character)
1336                        {
1337                            stimulus.append (character);
1338                            response.append ("&quot;");
1339                        }
1340                        else if ('<' == character)
1341                        {
1342                            stimulus.append (character);
1343                            response.append ("&lt;");
1344                        }
1345                        else if ('>' == character)
1346                        {
1347                            stimulus.append (character);
1348                            response.append ("&gt;");
1349                        }
1350                        else
1351                        {
1352                            stimulus.append (character);
1353                            response.append (character);
1354                        }
1355                    }
1356                }
1357                index = random.nextInt (list.length);
1358                ref = list[index];
1359                kernel = ref.getKernel ();
1360                stimulus.append ((char)ref.getCharacter ());
1361                response.append ("&");
1362                response.append (kernel);
1363                // to be fair, we ensure that the next character isn't valid
1364
// for a different reference, i.e. &sup shouldn't be followed
1365
// by a 1, 2, 3 or e
1366
forbidden = new ArrayList JavaDoc ();
1367                for (int k = index + 1; k < list.length; k++)
1368                    if (list[k].getKernel ().regionMatches (
1369                        0,
1370                        kernel,
1371                        0,
1372                        kernel.length ()))
1373                        forbidden.add (new Character JavaDoc (list[k].getKernel ().charAt (kernel.length ())));
1374                    else
1375                        break;
1376                do
1377                {
1378                    character = (char)random.nextInt (127);
1379                    if ( (' ' <= character)
1380                        && ('&' != character)
1381                        && ('"' != character)
1382                        && ('<' != character)
1383                        && ('>' != character)
1384                        && (';' != character)
1385                        && !(forbidden.contains (new Character JavaDoc (character))))
1386                    {
1387                        stimulus.append (character);
1388                        response.append (character);
1389                        character = 0;
1390                    }
1391                    else
1392                        character = ' ';
1393                        
1394                }
1395                while (0 != character);
1396                // some more random characters
1397
for (int k = 0; k < 10; k++)
1398                {
1399                    character = (char)random.nextInt (127);
1400                    if (character >= ' ')
1401                    {
1402                        if ('&' == character)
1403                        {
1404                            stimulus.append (character);
1405                            response.append ("&amp;");
1406                        }
1407                        else if ('"' == character)
1408                        {
1409                            stimulus.append (character);
1410                            response.append ("&quot;");
1411                        }
1412                        else if ('<' == character)
1413                        {
1414                            stimulus.append (character);
1415                            response.append ("&lt;");
1416                        }
1417                        else if ('>' == character)
1418                        {
1419                            stimulus.append (character);
1420                            response.append ("&gt;");
1421                        }
1422                        else
1423                        {
1424                            stimulus.append (character);
1425                            response.append (character);
1426                        }
1427                    }
1428                }
1429            }
1430            string = Translate.decode (response.toString ());
1431            if (!string.equals (stimulus.toString ()))
1432                fail ("decoding incorrect:\nexpected \"" + stimulus.toString () + "\"\n decoded \"" + string + "\"\n encoded \"" + response.toString () + "\"");
1433            stimulus.setLength (0);
1434            response.setLength (0);
1435        }
1436    }
1437}
1438
1439
1440
1441
Popular Tags