KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > nu > xom > tests > EncodingTest


/* Copyright 2002-2004 Elliotte Rusty Harold
   
   This library is free software; you can redistribute it and/or modify
   it under the terms of version 2.1 of the GNU Lesser General Public
   License as published by the Free Software Foundation.
   
   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU Lesser General Public License for more details.
   
   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, write to the
   Free Software Foundation, Inc., 59 Temple Place, Suite 330,
   Boston, MA 02111-1307 USA
   
   You can contact Elliotte Rusty Harold by sending e-mail to
   elharo@metalab.unc.edu. Please include the word "XOM" in the
   subject line. The XOM home page is located at http://www.xom.nu/
*/

21
22 package nu.xom.tests;
23
24 import java.io.ByteArrayInputStream JavaDoc;
25 import java.io.ByteArrayOutputStream JavaDoc;
26 import java.io.IOException JavaDoc;
27 import java.io.InputStream JavaDoc;
28 import java.io.UnsupportedEncodingException JavaDoc;
29
30 import nu.xom.Attribute;
31 import nu.xom.Builder;
32 import nu.xom.Document;
33 import nu.xom.Element;
34 import nu.xom.ParsingException;
35 import nu.xom.Serializer;
36
37 import com.ibm.icu.text.UTF16;
38
39 /**
40  * <p>
41  * Check serialization of almost all of Unicode
42  * in a variety of encodings.
43  * </p>
44  *
45  * @author Elliotte Rusty Harold
46  * @version 1.0
47  *
48  */

49 public class EncodingTest extends XOMTestCase {
50
51     
52     public EncodingTest(String JavaDoc name) {
53         super(name);
54     }
55
56     
57     private Document doc;
58
59     protected void setUp() {
60         
61         Element root = new Element("root");
62         doc = new Document(root);
63
64         for (int i = 0x20; i <= 0xD7FF; i++) {
65             Element data = new Element("d");
66             data.appendChild(String.valueOf(((char) i)));
67             data.addAttribute(new Attribute("c", String.valueOf(i)));
68             root.appendChild(data);
69         }
70         
71         // skip surrogates between 0xD800 and 0xDFFF
72
for (int i = 0xE000; i <= 0xFFFD; i++) {
73             Element data = new Element("d");
74             data.appendChild(String.valueOf(((char) i)));
75             data.addAttribute(new Attribute("c", String.valueOf(i)));
76             root.appendChild(data);
77         }
78
79         // Test Plane-1 characters. These are tricky because Java
80
// strings encode them as surrogate pairs. We'll test with
81
// the characters from 1D100 to 1D1FF (the musical symbols)
82
for (int i = 0; i < 256; i++) {
83             int u = 0x1D100 + i;
84             // algorithm from RFC 2781
85
/* int uprime = u - 0x10000;
86             int W1 = 0xD800;
87             int W2 = 0xDC00;
88             W2 = W2 | (uprime & 0x7FF );
89             W1 = W1 | (uprime & 0xFF800); */

90             Element data = new Element("d");
91             // data.appendChild( String.valueOf(((char) W1)) + ((char) W2) );
92
String JavaDoc s = UTF16.valueOf(u);
93             data.appendChild( s );
94             data.addAttribute(new Attribute("c", String.valueOf(u)));
95             // data.addAttribute(new Attribute("c", String.valueOf(W1)));
96
root.appendChild(data);
97         }
98         
99     }
100     
101     
102     protected void tearDown() {
103       doc = null;
104       System.gc();
105     }
106     
107     
108     public void testEUCJP() throws ParsingException, IOException JavaDoc {
109         checkAll("EUC-JP");
110     }
111
112     
113     public void testShift_JIS() throws ParsingException, IOException JavaDoc {
114         checkAll("Shift_JIS");
115     }
116
117
118     public void testISO2022JP() throws ParsingException, IOException JavaDoc {
119         checkAll("ISO-2022-JP");
120     }
121
122
123     public void testGeneric() throws ParsingException, IOException JavaDoc {
124         checkAll("Cp1252");
125     }
126     
127
128     // Main purpose here is to test a character set whose name is
129
// case dependent
130
public void testMacRoman() throws ParsingException, IOException JavaDoc {
131         checkAll("MacRoman");
132     }
133     
134
135     public void testBig5() throws ParsingException, IOException JavaDoc {
136         checkAll("Big5");
137     }
138
139     public void testUSASCII() throws ParsingException, IOException JavaDoc {
140         checkAll("US-ASCII");
141     }
142     
143     public void testASCII() throws ParsingException, IOException JavaDoc {
144         checkAll("ASCII");
145     }
146
147     public void testLatin1() throws ParsingException, IOException JavaDoc {
148         checkAll("ISO-8859-1");
149     }
150
151     public void testLatin2() throws ParsingException, IOException JavaDoc {
152         checkAll("ISO-8859-2");
153     }
154     
155     public void testLatin3() throws ParsingException, IOException JavaDoc {
156         checkAll("ISO-8859-3");
157     }
158     
159     public void testLatin4() throws ParsingException, IOException JavaDoc {
160         checkAll("ISO-8859-4");
161     }
162     
163     public void testCyrillic() throws ParsingException, IOException JavaDoc {
164         checkAll("ISO-8859-5");
165     }
166     
167     public void testArabic() throws ParsingException, IOException JavaDoc {
168         checkAll("ISO-8859-6");
169     }
170     
171     public void testGreek() throws ParsingException, IOException JavaDoc {
172         checkAll("ISO-8859-7");
173     }
174     
175     public void testThai() throws ParsingException, IOException JavaDoc {
176         checkAll("TIS-620");
177     }
178     
179     public void testHebrew() throws ParsingException, IOException JavaDoc {
180         checkAll("ISO-8859-8");
181     }
182     
183     public void testLatin5() throws ParsingException, IOException JavaDoc {
184         checkAll("ISO-8859-9");
185     }
186
187     public void testUTF8() throws ParsingException, IOException JavaDoc {
188         checkAll("UTF-8");
189     }
190     
191     public void testUTF16() throws ParsingException, IOException JavaDoc {
192         checkAll("UTF-16");
193     }
194
195     public void testUCS2() throws ParsingException, IOException JavaDoc {
196         checkAll("ISO-10646-UCS-2");
197     }
198     
199     public void testEBCDIC() throws ParsingException, IOException JavaDoc {
200         checkAll("Cp037");
201     }
202     
203     // These encodings are only available after Java 1.3
204
private static boolean java14OrLater = false;
205     
206     static {
207         String JavaDoc version = System.getProperty("java.version");
208         String JavaDoc majorVersion = version.substring(0, 3);
209         double versionNumber = Double.parseDouble(majorVersion);
210         if (versionNumber >= 1.4) java14OrLater = true;
211     }
212     
213     public void testLatin7() throws ParsingException, IOException JavaDoc {
214         if (java14OrLater) checkAll("ISO-8859-13");
215     }
216     
217     public void testLatin9() throws ParsingException, IOException JavaDoc {
218         if (java14OrLater) checkAll("ISO-8859-15");
219     }
220
221     public void testGB18030() throws ParsingException, IOException JavaDoc {
222         if (java14OrLater) checkAll("GB18030");
223     }
224
225     // These encodings are not installed in all distributions by
226
// default. They are only found currently in IBM's Java 1.4.1 VM.
227
// They don't seem to be supported in the 1.5 alpha
228
// either.
229
public void testUCS4() throws ParsingException, IOException JavaDoc {
230         if (charsetAvailable("ISO-10646-UCS-4")) checkAll("ISO-10646-UCS-4");
231     }
232
233     public void testLatin6() throws ParsingException, IOException JavaDoc {
234         if (charsetAvailable("ISO-8859-10")) checkAll("ISO-8859-10");
235     }
236
237     public void testLatin8() throws ParsingException, IOException JavaDoc {
238         if (charsetAvailable("ISO-8859-14")) checkAll("ISO-8859-14");
239     }
240
241     public void testLatin10() throws ParsingException, IOException JavaDoc {
242         if (charsetAvailable("ISO-8859-16")) checkAll("ISO-8859-16");
243     }
244         
245     
246     // Test that with an encoding XOM does not specifically support
247
// but the VM does, everything still works.
248
public void testUnsupportedEncoding()
249       throws ParsingException, IOException JavaDoc {
250         checkAll("Cp1252");
251     }
252     
253
254     private static boolean charsetAvailable(String JavaDoc name) {
255         // hack to avoid using 1.4 classes
256
try {
257             "d".getBytes(name);
258             return true;
259         }
260         catch (UnsupportedEncodingException JavaDoc ex) {
261             return false;
262         }
263         
264     }
265        
266     
267     private void checkAll(String JavaDoc encoding)
268       throws ParsingException, IOException JavaDoc {
269         
270         Builder builder = new Builder();
271         byte[] data = null;
272         ByteArrayOutputStream JavaDoc out = new ByteArrayOutputStream JavaDoc(100000);
273         // Write data into a byte array using encoding
274
Serializer serializer = new Serializer(out, encoding);
275         serializer.write(doc);
276         serializer.flush();
277         out.flush();
278         out.close();
279         data = out.toByteArray();
280         InputStream JavaDoc in = new ByteArrayInputStream JavaDoc(data);
281         Document reparsed = builder.build(in);
282         in.close();
283         serializer = null;
284         
285         Element reparsedRoot = reparsed.getRootElement();
286         int childCount = reparsedRoot.getChildCount();
287         for (int i = 0; i < childCount; i++) {
288             Element test = (Element) reparsedRoot.getChild(i);
289             String JavaDoc value = test.getValue();
290             int expected
291               = Integer.parseInt(test.getAttributeValue("c"));
292             // workaround for EBCDIC bugs
293
if (expected == 133 && encoding.equalsIgnoreCase("Cp037")) {
294                 continue;
295             }
296             int actual = value.charAt(0);
297             if (value.length() > 1) {
298                 actual = UTF16.charAt(value, 0);
299             }
300             // This doesn't work for all encodings, because there are
301
// a few cases where you write a Unicode compatibility
302
// character such as an Arabic presentation form,
303
// but read back what is essentially a different version
304
// of the same character. That is the mapping from some
305
// legacy character sets to Unicode is not always 1-1.
306
assertEquals("Expected 0x"
307               + Integer.toHexString(expected).toUpperCase()
308               + " but was 0x"
309               + Integer.toHexString(actual).toUpperCase(), expected, actual);
310         }
311         
312         in = null;
313             
314     }
315
316     
317     private void checkSome(String JavaDoc encoding)
318       throws ParsingException, IOException JavaDoc {
319         
320         Builder builder = new Builder();
321         byte[] data = null;
322         ByteArrayOutputStream JavaDoc out = new ByteArrayOutputStream JavaDoc(100000);
323         // Write data into a byte array using encoding
324
Serializer serializer = new Serializer(out, encoding);
325         serializer.write(doc);
326         serializer.flush();
327         out.flush();
328         out.close();
329         data = out.toByteArray();
330         InputStream JavaDoc in = new ByteArrayInputStream JavaDoc(data);
331         Document reparsed = builder.build(in);
332         in.close();
333         serializer = null;
334         
335         Element reparsedRoot = reparsed.getRootElement();
336         int childCount = reparsedRoot.getChildCount();
337         for (int i = 0; i < childCount; i++) {
338             Element test = (Element) reparsedRoot.getChild(i);
339             String JavaDoc value = test.getValue();
340             int expected
341               = Integer.parseInt(test.getAttributeValue("c"));
342             // workaround for EBCDIC bugs
343
if (expected == 133 && encoding.equalsIgnoreCase("Cp037")) {
344                 continue;
345             }
346             int actual = value.charAt(0);
347             if (value.length() > 1) {
348                 actual = UTF16.charAt(value, 0);
349             }
350             if (expected != actual) System.err.println(expected);
351         }
352         
353         in = null;
354             
355     }
356     
357     
358 }
359
Popular Tags