EncodingTest


1   /* Copyright 2002-2004 Elliotte Rusty Harold
2      
3      This library is free software; you can redistribute it and/or modify
4      it under the terms of version 2.1 of the GNU Lesser General Public 
5      License as published by the Free Software Foundation.
6      
7      This library is distributed in the hope that it will be useful,
8      but WITHOUT ANY WARRANTY; without even the implied warranty of
9      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
10     GNU Lesser General Public License for more details.
11     
12     You should have received a copy of the GNU Lesser General Public
13     License along with this library; if not, write to the 
14     Free Software Foundation, Inc., 59 Temple Place, Suite 330, 
15     Boston, MA 02111-1307  USA
16     
17     You can contact Elliotte Rusty Harold by sending e-mail to
18     elharo@metalab.unc.edu. Please include the word "XOM" in the
19     subject line. The XOM home page is located at http://www.xom.nu/
20  */
21  
22  package nu.xom.tests;
23  
24  import java.io.ByteArrayInputStream  ;
25  import java.io.ByteArrayOutputStream  ;
26  import java.io.IOException  ;
27  import java.io.InputStream  ;
28  import java.io.UnsupportedEncodingException  ;
29  
30  import nu.xom.Attribute;
31  import nu.xom.Builder;
32  import nu.xom.Document;
33  import nu.xom.Element;
34  import nu.xom.ParsingException;
35  import nu.xom.Serializer;
36  
37  import com.ibm.icu.text.UTF16;
38  
39  /**
40   * <p>
41   *   Check serialization of almost all of Unicode
42   *   in a variety of encodings.
43   * </p>
44   * 
45   * @author Elliotte Rusty Harold
46   * @version 1.0
47   *
48   */
49  public class EncodingTest extends XOMTestCase {
50  
51      
52      public EncodingTest(String   name) {
53          super(name);
54      }
55  
56      
57      private Document doc;
58  
59      protected void setUp() {
60          
61          Element root = new Element("root");
62          doc = new Document(root);           
63  
64          for (int i = 0x20; i <= 0xD7FF; i++) {
65              Element data = new Element("d");
66              data.appendChild(String.valueOf(((char) i)));
67              data.addAttribute(new Attribute("c", String.valueOf(i)));
68              root.appendChild(data);
69          }
70          
71          // skip surrogates between 0xD800 and 0xDFFF
72          for (int i = 0xE000; i <= 0xFFFD; i++) {
73              Element data = new Element("d");
74              data.appendChild(String.valueOf(((char) i)));
75              data.addAttribute(new Attribute("c", String.valueOf(i)));
76              root.appendChild(data);
77          }
78  
79          // Test Plane-1 characters. These are tricky because Java 
80          // strings encode them as surrogate pairs. We'll test with
81          // the characters from 1D100 to 1D1FF (the musical symbols)
82          for (int i = 0; i < 256; i++) {
83              int u = 0x1D100 + i;
84              // algorithm from RFC 2781
85              /* int uprime = u - 0x10000;
86              int W1 = 0xD800;
87              int W2 = 0xDC00;
88              W2 = W2 | (uprime & 0x7FF );
89              W1 = W1 | (uprime & 0xFF800); */
90              Element data = new Element("d");
91              // data.appendChild( String.valueOf(((char) W1)) + ((char) W2) );
92              String   s = UTF16.valueOf(u);
93              data.appendChild( s );
94              data.addAttribute(new Attribute("c", String.valueOf(u)));
95              // data.addAttribute(new Attribute("c", String.valueOf(W1)));
96              root.appendChild(data);
97          }        
98          
99      }
100     
101     
102     protected void tearDown() {
103       doc = null;
104       System.gc();   
105     } 
106     
107     
108     public void testEUCJP() throws ParsingException, IOException   {
109         checkAll("EUC-JP");
110     } 
111 
112     
113     public void testShift_JIS() throws ParsingException, IOException   {
114         checkAll("Shift_JIS");
115     } 
116 
117 
118     public void testISO2022JP() throws ParsingException, IOException   {
119         checkAll("ISO-2022-JP");
120     } 
121 
122 
123     public void testGeneric() throws ParsingException, IOException   {
124         checkAll("Cp1252");
125     }
126     
127 
128     // Main purpose here is to test a character set whose name is 
129     // case dependent
130     public void testMacRoman() throws ParsingException, IOException   {
131         checkAll("MacRoman");
132     }
133     
134 
135     public void testBig5() throws ParsingException, IOException   {
136         checkAll("Big5");
137     } 
138 
139     public void testUSASCII() throws ParsingException, IOException   {
140         checkAll("US-ASCII");
141     }
142     
143     public void testASCII() throws ParsingException, IOException   {
144         checkAll("ASCII");
145     }
146 
147     public void testLatin1() throws ParsingException, IOException   {       
148         checkAll("ISO-8859-1");        
149     }
150 
151     public void testLatin2() throws ParsingException, IOException   {
152         checkAll("ISO-8859-2");
153     }
154     
155     public void testLatin3() throws ParsingException, IOException   {
156         checkAll("ISO-8859-3");
157     }
158     
159     public void testLatin4() throws ParsingException, IOException   {
160         checkAll("ISO-8859-4");
161     }
162     
163     public void testCyrillic() throws ParsingException, IOException   {
164         checkAll("ISO-8859-5");
165     }
166     
167     public void testArabic() throws ParsingException, IOException   {
168         checkAll("ISO-8859-6");
169     }
170     
171     public void testGreek() throws ParsingException, IOException   {
172         checkAll("ISO-8859-7");
173     }
174     
175     public void testThai() throws ParsingException, IOException   {
176         checkAll("TIS-620");
177     }
178     
179     public void testHebrew() throws ParsingException, IOException   {
180         checkAll("ISO-8859-8");
181     }
182     
183     public void testLatin5() throws ParsingException, IOException   {
184         checkAll("ISO-8859-9");
185     }
186 
187     public void testUTF8() throws ParsingException, IOException   {
188         checkAll("UTF-8");
189     }
190     
191     public void testUTF16() throws ParsingException, IOException   {
192         checkAll("UTF-16");
193     } 
194 
195     public void testUCS2() throws ParsingException, IOException   {
196         checkAll("ISO-10646-UCS-2");
197     }
198     
199     public void testEBCDIC() throws ParsingException, IOException   {
200         checkAll("Cp037");
201     }
202     
203     // These encodings are only available after Java 1.3
204     private static boolean java14OrLater = false;
205     
206     static {
207         String   version = System.getProperty("java.version");
208         String   majorVersion = version.substring(0, 3);
209         double versionNumber = Double.parseDouble(majorVersion);
210         if (versionNumber >= 1.4) java14OrLater = true; 
211     }   
212     
213     public void testLatin7() throws ParsingException, IOException   {
214         if (java14OrLater) checkAll("ISO-8859-13");
215     }
216     
217     public void testLatin9() throws ParsingException, IOException   {
218         if (java14OrLater) checkAll("ISO-8859-15");
219     } 
220 
221     public void testGB18030() throws ParsingException, IOException   {
222         if (java14OrLater) checkAll("GB18030");
223     } 
224 
225     // These encodings are not installed in all distributions by 
226     // default. They are only found currently in IBM's Java 1.4.1 VM. 
227     // They don't seem to be supported in the 1.5 alpha
228     // either.    
229     public void testUCS4() throws ParsingException, IOException   {
230         if (charsetAvailable("ISO-10646-UCS-4")) checkAll("ISO-10646-UCS-4");
231     } 
232 
233     public void testLatin6() throws ParsingException, IOException   {
234         if (charsetAvailable("ISO-8859-10")) checkAll("ISO-8859-10");
235     } 
236 
237     public void testLatin8() throws ParsingException, IOException   {
238         if (charsetAvailable("ISO-8859-14")) checkAll("ISO-8859-14");
239     }
240 
241     public void testLatin10() throws ParsingException, IOException   {
242         if (charsetAvailable("ISO-8859-16")) checkAll("ISO-8859-16");
243     }     
244         
245     
246     // Test that with an encoding XOM does not specifically support
247     // but the VM does, everything still works.
248     public void testUnsupportedEncoding() 
249       throws ParsingException, IOException   {
250         checkAll("Cp1252");
251     } 
252     
253 
254     private static boolean charsetAvailable(String   name) {
255         // hack to avoid using 1.4 classes
256         try {
257             "d".getBytes(name);
258             return true;
259         }
260         catch (UnsupportedEncodingException   ex) {
261             return false;   
262         }        
263         
264     }
265        
266     
267     private void checkAll(String   encoding) 
268       throws ParsingException, IOException   {
269         
270         Builder builder = new Builder();
271         byte[] data = null;
272         ByteArrayOutputStream   out = new ByteArrayOutputStream  (100000);    
273         // Write data into a byte array using encoding
274         Serializer serializer = new Serializer(out, encoding);
275         serializer.write(doc);
276         serializer.flush();
277         out.flush();
278         out.close();
279         data = out.toByteArray();
280         InputStream   in = new ByteArrayInputStream  (data);
281         Document reparsed = builder.build(in);
282         in.close();
283         serializer = null;
284         
285         Element reparsedRoot = reparsed.getRootElement();
286         int childCount = reparsedRoot.getChildCount();
287         for (int i = 0; i < childCount; i++) {
288             Element test = (Element) reparsedRoot.getChild(i); 
289             String   value = test.getValue();
290             int expected 
291               = Integer.parseInt(test.getAttributeValue("c"));
292             // workaround for EBCDIC bugs
293             if (expected == 133 && encoding.equalsIgnoreCase("Cp037")) {
294                 continue;
295             }
296             int actual = value.charAt(0);
297             if (value.length() > 1) {
298                 actual = UTF16.charAt(value, 0);
299             }
300             // This doesn't work for all encodings, because there are
301             // a few cases where you write a Unicode compatibility 
302             // character such as an Arabic presentation form,
303             // but read back what is essentially a different version 
304             // of the same character. That is the mapping from some
305             // legacy character sets to Unicode is not always 1-1.
306             assertEquals("Expected 0x" 
307               + Integer.toHexString(expected).toUpperCase()
308               + " but was 0x" 
309               + Integer.toHexString(actual).toUpperCase(), expected, actual);
310         } 
311         
312         in = null;
313             
314     }
315 
316     
317     private void checkSome(String   encoding) 
318       throws ParsingException, IOException   {
319         
320         Builder builder = new Builder();
321         byte[] data = null;
322         ByteArrayOutputStream   out = new ByteArrayOutputStream  (100000);    
323         // Write data into a byte array using encoding
324         Serializer serializer = new Serializer(out, encoding);
325         serializer.write(doc);
326         serializer.flush();
327         out.flush();
328         out.close();
329         data = out.toByteArray();
330         InputStream   in = new ByteArrayInputStream  (data);
331         Document reparsed = builder.build(in);
332         in.close();
333         serializer = null;
334         
335         Element reparsedRoot = reparsed.getRootElement();
336         int childCount = reparsedRoot.getChildCount();
337         for (int i = 0; i < childCount; i++) {
338             Element test = (Element) reparsedRoot.getChild(i); 
339             String   value = test.getValue();
340             int expected 
341               = Integer.parseInt(test.getAttributeValue("c"));
342             // workaround for EBCDIC bugs
343             if (expected == 133 && encoding.equalsIgnoreCase("Cp037")) {
344                 continue;
345             }
346             int actual = value.charAt(0);
347             if (value.length() > 1) {
348                 actual = UTF16.charAt(value, 0);
349             }
350             if (expected != actual) System.err.println(expected);
351         } 
352         
353         in = null;
354             
355     }
356     
357     
358 }
359
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags