KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > xerces > impl > io > UTF8Reader


1 /*
2  * Copyright 2000-2004 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16
17 package org.apache.xerces.impl.io;
18
19 import java.io.InputStream JavaDoc;
20 import java.io.IOException JavaDoc;
21 import java.io.Reader JavaDoc;
22
23 import java.util.Locale JavaDoc;
24 import org.apache.xerces.util.MessageFormatter;
25 import org.apache.xerces.impl.msg.XMLMessageFormatter;
26
27 /**
28  * <p>A UTF-8 reader.</p>
29  *
30  * @xerces.internal
31  *
32  * @author Andy Clark, IBM
33  *
34  * @version $Id: UTF8Reader.java,v 1.11 2004/10/04 22:07:41 mrglavas Exp $
35  */

36 public class UTF8Reader
37     extends Reader JavaDoc {
38
39     //
40
// Constants
41
//
42

43     /** Default byte buffer size (2048). */
44     public static final int DEFAULT_BUFFER_SIZE = 2048;
45
46     // debugging
47

48     /** Debug read. */
49     private static final boolean DEBUG_READ = false;
50
51     //
52
// Data
53
//
54

55     /** Input stream. */
56     protected InputStream JavaDoc fInputStream;
57
58     /** Byte buffer. */
59     protected byte[] fBuffer;
60
61     /** Offset into buffer. */
62     protected int fOffset;
63
64     /** Surrogate character. */
65     private int fSurrogate = -1;
66
67     // message formatter; used to produce localized
68
// exception messages
69
private MessageFormatter fFormatter = null;
70
71     //Locale to use for messages
72
private Locale JavaDoc fLocale = null;
73
74     //
75
// Constructors
76
//
77

78     /**
79      * Constructs a UTF-8 reader from the specified input stream
80      * using the default buffer size. Primarily for testing.
81      *
82      * @param inputStream The input stream.
83      */

84     public UTF8Reader(InputStream JavaDoc inputStream) {
85         this(inputStream, DEFAULT_BUFFER_SIZE, new XMLMessageFormatter(), Locale.getDefault());
86     } // <init>(InputStream, MessageFormatter)
87

88     /**
89      * Constructs a UTF-8 reader from the specified input stream
90      * using the default buffer size and the given MessageFormatter.
91      *
92      * @param inputStream The input stream.
93      * @param messageFormatter given MessageFormatter
94      * @param locale Locale to use for messages
95      */

96     public UTF8Reader(InputStream JavaDoc inputStream, MessageFormatter messageFormatter,
97             Locale JavaDoc locale) {
98         this(inputStream, DEFAULT_BUFFER_SIZE, messageFormatter, locale);
99     } // <init>(InputStream, MessageFormatter, Locale)
100

101     /**
102      * Constructs a UTF-8 reader from the specified input stream,
103      * buffer size and MessageFormatter.
104      *
105      * @param inputStream The input stream.
106      * @param size The initial buffer size.
107      * @param messageFormatter the formatter for localizing/formatting errors.
108      * @param locale the Locale to use for messages
109      */

110     public UTF8Reader(InputStream JavaDoc inputStream, int size,
111             MessageFormatter messageFormatter, Locale JavaDoc locale) {
112         fInputStream = inputStream;
113         fBuffer = new byte[size];
114         fFormatter = messageFormatter;
115         fLocale = locale;
116     } // <init>(InputStream, int, MessageFormatter, Locale)
117

118     //
119
// Reader methods
120
//
121

122     /**
123      * Read a single character. This method will block until a character is
124      * available, an I/O error occurs, or the end of the stream is reached.
125      *
126      * <p> Subclasses that intend to support efficient single-character input
127      * should override this method.
128      *
129      * @return The character read, as an integer in the range 0 to 16383
130      * (<tt>0x00-0xffff</tt>), or -1 if the end of the stream has
131      * been reached
132      *
133      * @exception IOException If an I/O error occurs
134      */

135     public int read() throws IOException JavaDoc {
136
137         // decode character
138
int c = fSurrogate;
139         if (fSurrogate == -1) {
140             // NOTE: We use the index into the buffer if there are remaining
141
// bytes from the last block read. -Ac
142
int index = 0;
143
144             // get first byte
145
int b0 = index == fOffset
146                    ? fInputStream.read() : fBuffer[index++] & 0x00FF;
147             if (b0 == -1) {
148                 return -1;
149             }
150
151             // UTF-8: [0xxx xxxx]
152
// Unicode: [0000 0000] [0xxx xxxx]
153
if (b0 < 0x80) {
154                 c = (char)b0;
155             }
156
157             // UTF-8: [110y yyyy] [10xx xxxx]
158
// Unicode: [0000 0yyy] [yyxx xxxx]
159
else if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
160                 int b1 = index == fOffset
161                        ? fInputStream.read() : fBuffer[index++] & 0x00FF;
162                 if (b1 == -1) {
163                     expectedByte(2, 2);
164                 }
165                 if ((b1 & 0xC0) != 0x80) {
166                     invalidByte(2, 2, b1);
167                 }
168                 c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
169             }
170
171             // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
172
// Unicode: [zzzz yyyy] [yyxx xxxx]
173
else if ((b0 & 0xF0) == 0xE0) {
174                 int b1 = index == fOffset
175                        ? fInputStream.read() : fBuffer[index++] & 0x00FF;
176                 if (b1 == -1) {
177                     expectedByte(2, 3);
178                 }
179                 if ((b1 & 0xC0) != 0x80
180                     || (b0 == 0xED && b1 >= 0xA0)
181                     || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
182                     invalidByte(2, 3, b1);
183                 }
184                 int b2 = index == fOffset
185                        ? fInputStream.read() : fBuffer[index++] & 0x00FF;
186                 if (b2 == -1) {
187                     expectedByte(3, 3);
188                 }
189                 if ((b2 & 0xC0) != 0x80) {
190                     invalidByte(3, 3, b2);
191                 }
192                 c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
193                     (b2 & 0x003F);
194             }
195
196             // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
197
// Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
198
// [1101 11yy] [yyxx xxxx] (low surrogate)
199
// * uuuuu = wwww + 1
200
else if ((b0 & 0xF8) == 0xF0) {
201                 int b1 = index == fOffset
202                        ? fInputStream.read() : fBuffer[index++] & 0x00FF;
203                 if (b1 == -1) {
204                     expectedByte(2, 4);
205                 }
206                 if ((b1 & 0xC0) != 0x80
207                     || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
208                     invalidByte(2, 3, b1);
209                 }
210                 int b2 = index == fOffset
211                        ? fInputStream.read() : fBuffer[index++] & 0x00FF;
212                 if (b2 == -1) {
213                     expectedByte(3, 4);
214                 }
215                 if ((b2 & 0xC0) != 0x80) {
216                     invalidByte(3, 3, b2);
217                 }
218                 int b3 = index == fOffset
219                        ? fInputStream.read() : fBuffer[index++] & 0x00FF;
220                 if (b3 == -1) {
221                     expectedByte(4, 4);
222                 }
223                 if ((b3 & 0xC0) != 0x80) {
224                     invalidByte(4, 4, b3);
225                 }
226                 int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
227                 if (uuuuu > 0x10) {
228                     invalidSurrogate(uuuuu);
229                 }
230                 int wwww = uuuuu - 1;
231                 int hs = 0xD800 |
232                          ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) |
233                          ((b2 >> 4) & 0x0003);
234                 int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
235                 c = hs;
236                 fSurrogate = ls;
237             }
238
239             // error
240
else {
241                 invalidByte(1, 1, b0);
242             }
243         }
244
245         // use surrogate
246
else {
247             fSurrogate = -1;
248         }
249
250         // return character
251
if (DEBUG_READ) {
252             System.out.println("read(): 0x"+Integer.toHexString(c));
253         }
254         return c;
255
256     } // read():int
257

258     /**
259      * Read characters into a portion of an array. This method will block
260      * until some input is available, an I/O error occurs, or the end of the
261      * stream is reached.
262      *
263      * @param ch Destination buffer
264      * @param offset Offset at which to start storing characters
265      * @param length Maximum number of characters to read
266      *
267      * @return The number of characters read, or -1 if the end of the
268      * stream has been reached
269      *
270      * @exception IOException If an I/O error occurs
271      */

272     public int read(char ch[], int offset, int length) throws IOException JavaDoc {
273
274         // handle surrogate
275
int out = offset;
276         if (fSurrogate != -1) {
277             ch[offset + 1] = (char)fSurrogate;
278             fSurrogate = -1;
279             length--;
280             out++;
281         }
282
283         // read bytes
284
int count = 0;
285         if (fOffset == 0) {
286             // adjust length to read
287
if (length > fBuffer.length) {
288                 length = fBuffer.length;
289             }
290
291             // perform read operation
292
count = fInputStream.read(fBuffer, 0, length);
293             if (count == -1) {
294                 return -1;
295             }
296             count += out - offset;
297         }
298
299         // skip read; last character was in error
300
// NOTE: Having an offset value other than zero means that there was
301
// an error in the last character read. In this case, we have
302
// skipped the read so we don't consume any bytes past the
303
// error. By signalling the error on the next block read we
304
// allow the method to return the most valid characters that
305
// it can on the previous block read. -Ac
306
else {
307             count = fOffset;
308             fOffset = 0;
309         }
310
311         // convert bytes to characters
312
final int total = count;
313         int in;
314         byte byte1;
315         final byte byte0 = 0;
316         for (in = 0; in < total; in++) {
317             byte1 = fBuffer[in];
318             if (byte1 >= byte0) {
319                 ch[out++] = (char)byte1;
320             }
321             else {
322                 break;
323             }
324         }
325         for ( ; in < total; in++) {
326             byte1 = fBuffer[in];
327
328             // UTF-8: [0xxx xxxx]
329
// Unicode: [0000 0000] [0xxx xxxx]
330
if (byte1 >= byte0) {
331                 ch[out++] = (char)byte1;
332                 continue;
333             }
334
335             // UTF-8: [110y yyyy] [10xx xxxx]
336
// Unicode: [0000 0yyy] [yyxx xxxx]
337
int b0 = byte1 & 0x0FF;
338             if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
339                 int b1 = -1;
340                 if (++in < total) {
341                     b1 = fBuffer[in] & 0x00FF;
342                 }
343                 else {
344                     b1 = fInputStream.read();
345                     if (b1 == -1) {
346                         if (out > offset) {
347                             fBuffer[0] = (byte)b0;
348                             fOffset = 1;
349                             return out - offset;
350                         }
351                         expectedByte(2, 2);
352                     }
353                     count++;
354                 }
355                 if ((b1 & 0xC0) != 0x80) {
356                     if (out > offset) {
357                         fBuffer[0] = (byte)b0;
358                         fBuffer[1] = (byte)b1;
359                         fOffset = 2;
360                         return out - offset;
361                     }
362                     invalidByte(2, 2, b1);
363                 }
364                 int c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
365                 ch[out++] = (char)c;
366                 count -= 1;
367                 continue;
368             }
369
370             // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
371
// Unicode: [zzzz yyyy] [yyxx xxxx]
372
if ((b0 & 0xF0) == 0xE0) {
373                 int b1 = -1;
374                 if (++in < total) {
375                     b1 = fBuffer[in] & 0x00FF;
376                 }
377                 else {
378                     b1 = fInputStream.read();
379                     if (b1 == -1) {
380                         if (out > offset) {
381                             fBuffer[0] = (byte)b0;
382                             fOffset = 1;
383                             return out - offset;
384                         }
385                         expectedByte(2, 3);
386                     }
387                     count++;
388                 }
389                 if ((b1 & 0xC0) != 0x80
390                     || (b0 == 0xED && b1 >= 0xA0)
391                     || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
392                     if (out > offset) {
393                         fBuffer[0] = (byte)b0;
394                         fBuffer[1] = (byte)b1;
395                         fOffset = 2;
396                         return out - offset;
397                     }
398                     invalidByte(2, 3, b1);
399                 }
400                 int b2 = -1;
401                 if (++in < total) {
402                     b2 = fBuffer[in] & 0x00FF;
403                 }
404                 else {
405                     b2 = fInputStream.read();
406                     if (b2 == -1) {
407                         if (out > offset) {
408                             fBuffer[0] = (byte)b0;
409                             fBuffer[1] = (byte)b1;
410                             fOffset = 2;
411                             return out - offset;
412                         }
413                         expectedByte(3, 3);
414                     }
415                     count++;
416                 }
417                 if ((b2 & 0xC0) != 0x80) {
418                     if (out > offset) {
419                         fBuffer[0] = (byte)b0;
420                         fBuffer[1] = (byte)b1;
421                         fBuffer[2] = (byte)b2;
422                         fOffset = 3;
423                         return out - offset;
424                     }
425                     invalidByte(3, 3, b2);
426                 }
427                 int c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
428                         (b2 & 0x003F);
429                 ch[out++] = (char)c;
430                 count -= 2;
431                 continue;
432             }
433
434             // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
435
// Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
436
// [1101 11yy] [yyxx xxxx] (low surrogate)
437
// * uuuuu = wwww + 1
438
if ((b0 & 0xF8) == 0xF0) {
439                 int b1 = -1;
440                 if (++in < total) {
441                     b1 = fBuffer[in] & 0x00FF;
442                 }
443                 else {
444                     b1 = fInputStream.read();
445                     if (b1 == -1) {
446                         if (out > offset) {
447                             fBuffer[0] = (byte)b0;
448                             fOffset = 1;
449                             return out - offset;
450                         }
451                         expectedByte(2, 4);
452                     }
453                     count++;
454                 }
455                 if ((b1 & 0xC0) != 0x80
456                     || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
457                     if (out > offset) {
458                         fBuffer[0] = (byte)b0;
459                         fBuffer[1] = (byte)b1;
460                         fOffset = 2;
461                         return out - offset;
462                     }
463                     invalidByte(2, 4, b1);
464                 }
465                 int b2 = -1;
466                 if (++in < total) {
467                     b2 = fBuffer[in] & 0x00FF;
468                 }
469                 else {
470                     b2 = fInputStream.read();
471                     if (b2 == -1) {
472                         if (out > offset) {
473                             fBuffer[0] = (byte)b0;
474                             fBuffer[1] = (byte)b1;
475                             fOffset = 2;
476                             return out - offset;
477                         }
478                         expectedByte(3, 4);
479                     }
480                     count++;
481                 }
482                 if ((b2 & 0xC0) != 0x80) {
483                     if (out > offset) {
484                         fBuffer[0] = (byte)b0;
485                         fBuffer[1] = (byte)b1;
486                         fBuffer[2] = (byte)b2;
487                         fOffset = 3;
488                         return out - offset;
489                     }
490                     invalidByte(3, 4, b2);
491                 }
492                 int b3 = -1;
493                 if (++in < total) {
494                     b3 = fBuffer[in] & 0x00FF;
495                 }
496                 else {
497                     b3 = fInputStream.read();
498                     if (b3 == -1) {
499                         if (out > offset) {
500                             fBuffer[0] = (byte)b0;
501                             fBuffer[1] = (byte)b1;
502                             fBuffer[2] = (byte)b2;
503                             fOffset = 3;
504                             return out - offset;
505                         }
506                         expectedByte(4, 4);
507                     }
508                     count++;
509                 }
510                 if ((b3 & 0xC0) != 0x80) {
511                     if (out > offset) {
512                         fBuffer[0] = (byte)b0;
513                         fBuffer[1] = (byte)b1;
514                         fBuffer[2] = (byte)b2;
515                         fBuffer[3] = (byte)b3;
516                         fOffset = 4;
517                         return out - offset;
518                     }
519                     invalidByte(4, 4, b2);
520                 }
521
522                 // decode bytes into surrogate characters
523
int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
524                 if (uuuuu > 0x10) {
525                     invalidSurrogate(uuuuu);
526                 }
527                 int wwww = uuuuu - 1;
528                 int zzzz = b1 & 0x000F;
529                 int yyyyyy = b2 & 0x003F;
530                 int xxxxxx = b3 & 0x003F;
531                 int hs = 0xD800 | ((wwww << 6) & 0x03C0) | (zzzz << 2) | (yyyyyy >> 4);
532                 int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
533
534                 // set characters
535
ch[out++] = (char)hs;
536                 ch[out++] = (char)ls;
537                 count -= 2;
538                 continue;
539             }
540
541             // error
542
if (out > offset) {
543                 fBuffer[0] = (byte)b0;
544                 fOffset = 1;
545                 return out - offset;
546             }
547             invalidByte(1, 1, b0);
548         }
549
550         // return number of characters converted
551
if (DEBUG_READ) {
552             System.out.println("read(char[],"+offset+','+length+"): count="+count);
553         }
554         return count;
555
556     } // read(char[],int,int)
557

558     /**
559      * Skip characters. This method will block until some characters are
560      * available, an I/O error occurs, or the end of the stream is reached.
561      *
562      * @param n The number of characters to skip
563      *
564      * @return The number of characters actually skipped
565      *
566      * @exception IOException If an I/O error occurs
567      */

568     public long skip(long n) throws IOException JavaDoc {
569
570         long remaining = n;
571         final char[] ch = new char[fBuffer.length];
572         do {
573             int length = ch.length < remaining ? ch.length : (int)remaining;
574             int count = read(ch, 0, length);
575             if (count > 0) {
576                 remaining -= count;
577             }
578             else {
579                 break;
580             }
581         } while (remaining > 0);
582
583         long skipped = n - remaining;
584         return skipped;
585
586     } // skip(long):long
587

588     /**
589      * Tell whether this stream is ready to be read.
590      *
591      * @return True if the next read() is guaranteed not to block for input,
592      * false otherwise. Note that returning false does not guarantee that the
593      * next read will block.
594      *
595      * @exception IOException If an I/O error occurs
596      */

597     public boolean ready() throws IOException JavaDoc {
598         return false;
599     } // ready()
600

601     /**
602      * Tell whether this stream supports the mark() operation.
603      */

604     public boolean markSupported() {
605         return false;
606     } // markSupported()
607

608     /**
609      * Mark the present position in the stream. Subsequent calls to reset()
610      * will attempt to reposition the stream to this point. Not all
611      * character-input streams support the mark() operation.
612      *
613      * @param readAheadLimit Limit on the number of characters that may be
614      * read while still preserving the mark. After
615      * reading this many characters, attempting to
616      * reset the stream may fail.
617      *
618      * @exception IOException If the stream does not support mark(),
619      * or if some other I/O error occurs
620      */

621     public void mark(int readAheadLimit) throws IOException JavaDoc {
622         throw new IOException JavaDoc(fFormatter.formatMessage(fLocale, "OperationNotSupported", new Object JavaDoc[]{"mark()", "UTF-8"}));
623     } // mark(int)
624

625     /**
626      * Reset the stream. If the stream has been marked, then attempt to
627      * reposition it at the mark. If the stream has not been marked, then
628      * attempt to reset it in some way appropriate to the particular stream,
629      * for example by repositioning it to its starting point. Not all
630      * character-input streams support the reset() operation, and some support
631      * reset() without supporting mark().
632      *
633      * @exception IOException If the stream has not been marked,
634      * or if the mark has been invalidated,
635      * or if the stream does not support reset(),
636      * or if some other I/O error occurs
637      */

638     public void reset() throws IOException JavaDoc {
639         fOffset = 0;
640         fSurrogate = -1;
641     } // reset()
642

643     /**
644      * Close the stream. Once a stream has been closed, further read(),
645      * ready(), mark(), or reset() invocations will throw an IOException.
646      * Closing a previously-closed stream, however, has no effect.
647      *
648      * @exception IOException If an I/O error occurs
649      */

650     public void close() throws IOException JavaDoc {
651         fInputStream.close();
652     } // close()
653

654     //
655
// Private methods
656
//
657

658     /** Throws an exception for expected byte. */
659     private void expectedByte(int position, int count)
660         throws MalformedByteSequenceException {
661
662         throw new MalformedByteSequenceException(fFormatter,
663             fLocale,
664             XMLMessageFormatter.XML_DOMAIN,
665             "ExpectedByte",
666             new Object JavaDoc[] {Integer.toString(position), Integer.toString(count)});
667
668     } // expectedByte(int,int)
669

670     /** Throws an exception for invalid byte. */
671     private void invalidByte(int position, int count, int c)
672         throws MalformedByteSequenceException {
673
674         throw new MalformedByteSequenceException(fFormatter,
675             fLocale,
676             XMLMessageFormatter.XML_DOMAIN,
677             "InvalidByte",
678             new Object JavaDoc [] {Integer.toString(position), Integer.toString(count)});
679
680     } // invalidByte(int,int,int)
681

682     /** Throws an exception for invalid surrogate bits. */
683     private void invalidSurrogate(int uuuuu) throws MalformedByteSequenceException {
684
685         throw new MalformedByteSequenceException(fFormatter,
686             fLocale,
687             XMLMessageFormatter.XML_DOMAIN,
688             "InvalidHighSurrogate",
689             new Object JavaDoc[] {Integer.toHexString(uuuuu)});
690
691     } // invalidSurrogate(int)
692

693 } // class UTF8Reader
Popular Tags