KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > io > UTF8


/*
 * The Apache Software License, Version 1.1
 *
 *
 * Copyright (c) 2000 The Apache Software Foundation. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 * if any, must include the following acknowledgment:
 * "This product includes software developed by the
 * Apache Software Foundation (http://www.apache.org/)."
 * Alternately, this acknowledgment may appear in the software itself,
 * if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Xerces" and "Apache Software Foundation" must
 * not be used to endorse or promote products derived from this
 * software without prior written permission. For written
 * permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 * nor may "Apache" appear in their name, without prior written
 * permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation and was
 * originally based on software copyright (c) 1999, International
 * Business Machines, Inc., http://www.apache.org. For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

package io;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;

import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader;

67 /**
68  * This program tests the ability of the customized UTF-8 reader used in
69  * Xerces to correctly handle all valid Unicode characters. The UTF-8
70  * reader included in the JDK is also tested as a reference but this
71  * reader currently has a problem with surrogate characters (up to Java
72  * 1.3).
73  *
74  * @author Andy Clark, IBM
75  *
76  * @version $Id: UTF8.java,v 1.2 2005/01/26 08:28:45 jkjome Exp $
77  */

78 public class UTF8 {
79
80     //
81
// MAIN
82
//
83

84     /** Main program entry. */
85     public static void main(String JavaDoc[] argv) throws Exception JavaDoc {
86
87         final int BLOCK_READ_SIZE = 2048;
88
89         //
90
// Test Java reference implementation of UTF-8 decoder
91
//
92

93         System.err.println("#");
94         System.err.println("# Testing Java UTF-8 decoder");
95         System.err.println("#");
96
97         // test character by character
98
try {
99             InputStream JavaDoc stream = new UTF8Producer();
100             Reader JavaDoc reader = new InputStreamReader JavaDoc(stream, "UTF8");
101             long time = testCharByChar(reader);
102             System.err.println("PASS ("+time+" ms)");
103             reader.close();
104         }
105         catch (IOException JavaDoc e) {
106             System.err.println("FAIL: "+e.getMessage());
107         }
108         
109         // test character array
110
try {
111             InputStream JavaDoc stream = new UTF8Producer();
112             Reader JavaDoc reader = new InputStreamReader JavaDoc(stream, "UTF8");
113             long time = testCharArray(reader, BLOCK_READ_SIZE);
114             System.err.println("PASS ("+time+" ms)");
115             reader.close();
116         }
117         catch (IOException JavaDoc e) {
118             System.err.println("FAIL: "+e.getMessage());
119         }
120         
121         //
122
// Test custom implementation of UTF-8 decoder
123
//
124

125         System.err.println("#");
126         System.err.println("# Testing custom UTF-8 decoder");
127         System.err.println("#");
128
129         // test character by character
130
try {
131             InputStream JavaDoc stream = new UTF8Producer();
132             Reader JavaDoc reader = new UTF8Reader(stream);
133             long time = testCharByChar(reader);
134             System.err.println("PASS ("+time+" ms)");
135             reader.close();
136         }
137         catch (IOException JavaDoc e) {
138             System.err.println("FAIL: "+e.getMessage());
139         }
140         
141         // test character array
142
try {
143             InputStream JavaDoc stream = new UTF8Producer();
144             Reader JavaDoc reader = new UTF8Reader(stream);
145             long time = testCharArray(reader, BLOCK_READ_SIZE);
146             System.err.println("PASS ("+time+" ms)");
147             reader.close();
148         }
149         catch (IOException JavaDoc e) {
150             System.err.println("FAIL: "+e.getMessage());
151         }
152         
153     } // main(String[])
154

155     //
156
// Public static methods
157
//
158

159     /** This function tests the specified reader character by character. */
160     public static long testCharByChar(Reader JavaDoc reader) throws Exception JavaDoc {
161
162         long before = System.currentTimeMillis();
163         System.err.println("# Testing character by character");
164
165         System.err.println("testing 0x000000 -> 0x00007F");
166         for (int i = 0; i < 0x0080; i++) {
167             int c = reader.read();
168             if (c != i) {
169                 expectedChar(null, i, c);
170             }
171         }
172         System.err.println("testing 0x000080 -> 0x0007FF");
173         for (int i = 0x0080; i < 0x0800; i++) {
174             int c = reader.read();
175             if (c != i) {
176                 expectedChar(null, i, c);
177             }
178         }
179         System.err.println("testing 0x000800 -> 0x00D7FF");
180         for (int i = 0x0800; i < 0xD800; i++) {
181             int c = reader.read();
182             if (c != i) {
183                 expectedChar(null, i, c);
184             }
185         }
186         System.err.println("testing 0x00E000 -> 0x00FFFF");
187         for (int i = 0xE000; i < 0x010000; i++) {
188             int c = reader.read();
189             if (c != i) {
190                 expectedChar(null, i, c);
191             }
192         }
193         System.err.println("testing 0x010000 -> 0x110000");
194         for (int i = 0x10000; i < 0x110000; i++) {
195             // vars
196
int uuuuu = (i >> 16) & 0x001F;
197             int wwww = uuuuu - 1;
198             int zzzz = (i >> 12) & 0x000F;
199             int yyyyyy = (i >> 6) & 0x003F;
200             int xxxxxx = i & 0x003F;
201             int hs = 0xD800 | (wwww << 6) | (zzzz << 2) | (yyyyyy >> 4);
202             int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
203             // high surrogate
204
int c = reader.read();
205             if (c != hs) {
206                 expectedChar("high surrogate", hs, c);
207             }
208             // low surrogate
209
c = reader.read();
210             if (c != ls) {
211                 expectedChar("low surrogate", ls, c);
212             }
213         }
214         System.err.println("checking EOF");
215         int c = reader.read();
216         if (c != -1) {
217             extraChar(c);
218         }
219         long after = System.currentTimeMillis();
220
221         return after - before;
222
223     } // testCharByChar(Reader):long
224

225     /**
226      * This function tests the given reader by performing block character
227      * reads of the specified size.
228      */

229     public static long testCharArray(Reader JavaDoc reader, int size) throws Exception JavaDoc {
230
231         long before = System.currentTimeMillis();
232         System.err.println("# Testing character array of size "+size);
233
234         char[] ch = new char[size];
235         int count = 0;
236         int position = 0;
237
238         System.err.println("testing 0x000000 -> 0x00007F");
239         for (int i = 0; i < 0x0080; i++) {
240             if (position == count) {
241                 count = load(reader, ch);
242                 position = 0;
243             }
244             int c = ch[position++];
245             if (c != i) {
246                 expectedChar(null, i, c);
247             }
248         }
249         System.err.println("testing 0x000080 -> 0x0007FF");
250         for (int i = 0x0080; i < 0x0800; i++) {
251             if (position == count) {
252                 count = load(reader, ch);
253                 position = 0;
254             }
255             int c = ch[position++];
256             if (c != i) {
257                 expectedChar(null, i, c);
258             }
259         }
260         System.err.println("testing 0x000800 -> 0x00D7FF");
261         for (int i = 0x0800; i < 0xD800; i++) {
262             if (position == count) {
263                 count = load(reader, ch);
264                 position = 0;
265             }
266             int c = ch[position++];
267             if (c != i) {
268                 expectedChar(null, i, c);
269             }
270         }
271         System.err.println("testing 0x00E000 -> 0x00FFFF");
272         for (int i = 0xE000; i < 0x010000; i++) {
273             if (position == count) {
274                 count = load(reader, ch);
275                 position = 0;
276             }
277             int c = ch[position++];
278             if (c != i) {
279                 expectedChar(null, i, c);
280             }
281         }
282         System.err.println("testing 0x010000 -> 0x110000");
283         for (int i = 0x10000; i < 0x110000; i++) {
284             // vars
285
int uuuuu = (i >> 16) & 0x001F;
286             int wwww = uuuuu - 1;
287             int zzzz = (i >> 12) & 0x000F;
288             int yyyyyy = (i >> 6) & 0x003F;
289             int xxxxxx = i & 0x003F;
290             int hs = 0xD800 | (wwww << 6) | (zzzz << 2) | (yyyyyy >> 4);
291             int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
292             // high surrogate
293
if (position == count) {
294                 count = load(reader, ch);
295                 position = 0;
296             }
297             int c = ch[position++];
298             if (c != hs) {
299                 expectedChar("high surrogate", hs, c);
300             }
301             // low surrogate
302
if (position == count) {
303                 count = load(reader, ch);
304                 position = 0;
305             }
306             c = ch[position++];
307             if (c != ls) {
308                 expectedChar("low surrogate", ls, c);
309             }
310         }
311         System.err.println("checking EOF");
312         if (position == count) {
313             count = load(reader, ch);
314             position = 0;
315         }
316         if (count != -1) {
317             extraChar(ch[position]);
318         }
319         long after = System.currentTimeMillis();
320
321         return after - before;
322
323     } // testCharArray(Reader):long
324

325     //
326
// Private static methods
327
//
328

329     /** Loads another block of characters from the reader. */
330     private static int load(Reader JavaDoc reader, char[] ch) throws IOException JavaDoc {
331         int count = reader.read(ch, 0, ch.length);
332         return count;
333     } // load(Reader,char[]):int
334

335     /** Creates an I/O exception for expected character. */
336     private static void expectedChar(String JavaDoc prefix, int ec, int fc) throws IOException JavaDoc {
337         StringBuffer JavaDoc str = new StringBuffer JavaDoc();
338         str.append("expected ");
339         if (prefix != null) {
340             str.append(prefix);
341             str.append(' ');
342         }
343         str.append("0x");
344         str.append(Integer.toHexString(ec));
345         str.append(" but found 0x");
346         if (fc != -1) {
347             str.append(Integer.toHexString(fc));
348         }
349         else {
350             str.append("EOF");
351         }
352         String JavaDoc message = str.toString();
353         throw new IOException JavaDoc(message);
354     } // expectedChar(String,int,int)
355

356     /** Creates an I/O exception for extra character. */
357     private static void extraChar(int c) throws IOException JavaDoc {
358         StringBuffer JavaDoc str = new StringBuffer JavaDoc();
359         str.append("found extra character 0x");
360         str.append(Integer.toHexString(c));
361         String JavaDoc message = str.toString();
362         throw new IOException JavaDoc(message);
363     } // extraChar(int)
364

365     //
366
// Classes
367
//
368

369     /**
370      * This classes produces a stream of UTF-8 byte sequences for all
371      * valid Unicode characters.
372      *
373      * @author Andy Clark, IBM
374      */

375     public static class UTF8Producer
376         extends InputStream JavaDoc {
377
378         //
379
// Data
380
//
381

382         /** The current code point. */
383         private int fCodePoint;
384
385         /** The current byte of the current code point. */
386         private int fByte;
387
388         //
389
// InputStream methods
390
//
391

392         /** Reads the next character. */
393         public int read() throws IOException JavaDoc {
394
395             // UTF-8: [0xxx xxxx]
396
// Unicode: [0000 0000] [0xxx xxxx]
397
if (fCodePoint < 0x0080) {
398                 int b = fCodePoint;
399                 fCodePoint++;
400                 fByte = 0;
401                 return b;
402             }
403
404             // UTF-8: [110y yyyy] [10xx xxxx]
405
// Unicode: [0000 0yyy] [yyxx xxxx]
406
if (fCodePoint < 0x0800) {
407                 switch (fByte) {
408                     case 0: {
409                         int b = 0x00C0 | ((fCodePoint >> 6) & 0x001F);
410                         fByte++;
411                         return b;
412                     }
413                     case 1: {
414                         int b = 0x0080 | (fCodePoint & 0x003F);
415                         fCodePoint++;
416                         fByte = 0;
417                         return b;
418                     }
419                     default: {
420                         throw new RuntimeException JavaDoc("byte "+fByte+" of 2 byte UTF-8 sequence");
421                     }
422                 }
423             }
424
425             // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
426
// Unicode: [zzzz yyyy] [yyxx xxxx]*
427
if (fCodePoint < 0x10000) {
428                 switch (fByte) {
429                     case 0: {
430                         int b = 0x00E0 | ((fCodePoint >> 12) & 0x000F);
431                         fByte++;
432                         return b;
433                     }
434                     case 1: {
435                         int b = 0x0080 | ((fCodePoint >> 6) & 0x003F);
436                         fByte++;
437                         return b;
438                     }
439                     case 2: {
440                         int b = 0x0080 | (fCodePoint & 0x003F);
441                         fCodePoint++;
442                         // skip surrogate blocks
443
if (fCodePoint == 0xD800) {
444                             fCodePoint = 0xE000;
445                         }
446                         fByte = 0;
447                         return b;
448                     }
449                     default: {
450                         throw new RuntimeException JavaDoc("byte "+fByte+" of 3 byte UTF-8 sequence");
451                     }
452                 }
453             }
454
455             // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
456
// Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
457
// [1101 11yy] [yyxx xxxx] (low surrogate)
458
// * uuuuu = wwww + 1
459
// [0000 0000] [000u uuuu] [zzzz yyyy] [yyxx xxxx]
460
if (fCodePoint < 0x110000) {
461                 switch (fByte) {
462                     case 0: {
463                         int uuuuu = (fCodePoint >> 16) & 0x001F;
464                         int b = 0x00F0 | (uuuuu >> 2);
465                         fByte++;
466                         return b;
467                     }
468                     case 1: {
469                         int uuuuu = (fCodePoint >> 16) & 0x001F;
470                         int zzzz = (fCodePoint >> 12) & 0x000F;
471                         int b = 0x0080 | ((uuuuu << 4) & 0x0030) | zzzz;
472                         fByte++;
473                         return b;
474                     }
475                     case 2: {
476                         int yyyyyy = (fCodePoint >> 6) & 0x003F;
477                         int b = 0x0080 | yyyyyy;
478                         fByte++;
479                         return b;
480                     }
481                     case 3: {
482                         int xxxxxx = fCodePoint & 0x003F;
483                         int b = 0x0080 | xxxxxx;
484                         fCodePoint++;
485                         fByte = 0;
486                         return b;
487                     }
488                     default: {
489                         throw new RuntimeException JavaDoc("byte "+fByte+" of 4 byte UTF-8 sequence");
490                     }
491                 }
492             }
493             
494             // done
495
return -1;
496
497         } // read():int
498

499     } // class UTF8Producer
500

501 } // class UTF8
502
Popular Tags