KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > javolution > io > Utf8StreamReader


1 /*
2  * Javolution - Java(TM) Solution for Real-Time and Embedded Systems
3  * Copyright (C) 2005 - Javolution (http://javolution.org/)
4  * All rights reserved.
5  *
6  * Permission to use, copy, modify, and distribute this software is
7  * freely granted, provided that this notice is preserved.
8  */

9 package javolution.io;
10
11 import j2me.lang.IllegalStateException;
12 import j2me.io.CharConversionException;
13
14 import java.io.IOException;
15 import java.io.InputStream;
16 import java.io.Reader;
17
18 import javolution.lang.Appendable;
19 import javolution.lang.Reusable;
20
21 /**
22  * <p> This class represents a UTF-8 stream reader.</p>
23  *
24  * <p> This reader supports surrogate <code>char</code> pairs (representing
25  * characters in the range [U+10000 .. U+10FFFF]). It can also be used
26  * to read Unicode code points (up to 31 bits) directly
27  * (ref. {@link #read()}).</p>
28  *
29  * <p> Each invocation of one of the <code>read()</code> methods may cause one
30  * or more bytes to be read from the underlying byte-input stream.
31  * To enable the efficient conversion of bytes to characters, more bytes may
32  * be read ahead from the underlying stream than are necessary to satisfy
33  * the current read operation.</p>
34  *
35  * <p> Instances of this class can be reused for different input streams
36  * and can be part of a higher level component (e.g. parser) in order
37  * to avoid dynamic buffer allocation when the input source changes.
38  * Also wrapping using a <code>java.io.BufferedReader</code> is unnecessary
39  * as instances of this class embed their own data buffers.</p>
40  *
41  * <p> Note: This reader is unsynchronized and does not test if the UTF-8
42  * encoding is well-formed (e.g. UTF-8 sequences longer than
43  * necessary to encode a character).</p>
44  *
45  * @author <a HREF="mailto:jean-marie@dautelle.com">Jean-Marie Dautelle</a>
46  * @version 2.0, December 9, 2004
47  * @see Utf8StreamWriter
48  */

49 public final class Utf8StreamReader extends Reader implements Reusable {
50
51     /**
52      * Holds the current input stream or <code>null</code> if closed.
53      */

54     private InputStream _inputStream;
55
56     /**
57      * Holds the start index.
58      */

59     private int _start;
60
61     /**
62      * Holds the end index.
63      */

64     private int _end;
65
66     /**
67      * Holds the bytes buffer.
68      */

69     private final byte[] _bytes;
70
71     /**
72      * Creates a UTF-8 reader having a byte buffer of moderate capacity (2048).
73      */

74     public Utf8StreamReader() {
75         _bytes = new byte[2048];
76     }
77
78     /**
79      * Creates a UTF-8 reader having a byte buffer of specified capacity.
80      *
81      * @param capacity the capacity of the byte buffer.
82      */

83     public Utf8StreamReader(int capacity) {
84         _bytes = new byte[capacity];
85     }
86
87     /**
88      * Sets the input stream to use for reading until this reader is closed.
89      * For example:<pre>
90      * Reader reader = new Utf8StreamReader().setInputStream(inStream);
91      * </pre> is equivalent but reads twice as fast as <pre>
92      * Reader reader = new j2me.io.InputStreamReader(inStream, "UTF-8");
93      * </pre>
94      *
95      * @param inStream the input stream.
96      * @return this UTF-8 reader.
97      * @throws IllegalStateException if this reader is being reused and
98      * it has not been {@link #close closed} or {@link #reset reset}.
99      */

100     public Utf8StreamReader setInputStream(InputStream inStream) {
101         if (_inputStream != null)
102             throw new IllegalStateException("Reader not closed or reset");
103         _inputStream = inStream;
104         return this;
105     }
106
107     /**
108      * Indicates if this stream is ready to be read.
109      *
110      * @return <code>true</code> if the next read() is guaranteed not to block
111      * for input; <code>false</code> otherwise.
112      * @throws IOException if an I/O error occurs.
113      */

114     public boolean ready() throws IOException {
115         if (_inputStream == null)
116             throw new IOException("Stream closed");
117         return ((_end - _start) > 0) || (_inputStream.available() != 0);
118     }
119
120     /**
121      * Closes and {@link #reset resets} this reader for reuse.
122      *
123      * @throws IOException if an I/O error occurs.
124      */

125     public void close() throws IOException {
126         if (_inputStream != null) {
127             _inputStream.close();
128             reset();
129         }
130     }
131
132     /**
133      * Reads a single character. This method will block until a character is
134      * available, an I/O error occurs or the end of the stream is reached.
135      *
136      * @return the 31-bits Unicode of the character read, or -1 if the end of
137      * the stream has been reached.
138      * @throws IOException if an I/O error occurs.
139      */

140     public int read() throws IOException {
141         byte b = _bytes[_start];
142         return ((b >= 0) && (_start++ < _end)) ? b : read2();
143     }
144
145     // Reads one full character, blocks if necessary.
146
private int read2() throws IOException {
147         if (_start < _end) {
148             byte b = _bytes[_start++];
149
150             // Decodes UTF-8.
151
if ((b >= 0) && (_moreBytes == 0)) {
152                 // 0xxxxxxx
153
return b;
154             } else if (((b & 0xc0) == 0x80) && (_moreBytes != 0)) {
155                 // 10xxxxxx (continuation byte)
156
_code = (_code << 6) | (b & 0x3f); // Adds 6 bits to code.
157
if (--_moreBytes == 0) {
158                     return _code;
159                 } else {
160                     return read2();
161                 }
162             } else if (((b & 0xe0) == 0xc0) && (_moreBytes == 0)) {
163                 // 110xxxxx
164
_code = b & 0x1f;
165                 _moreBytes = 1;
166                 return read2();
167             } else if (((b & 0xf0) == 0xe0) && (_moreBytes == 0)) {
168                 // 1110xxxx
169
_code = b & 0x0f;
170                 _moreBytes = 2;
171                 return read2();
172             } else if (((b & 0xf8) == 0xf0) && (_moreBytes == 0)) {
173                 // 11110xxx
174
_code = b & 0x07;
175                 _moreBytes = 3;
176                 return read2();
177             } else if (((b & 0xfc) == 0xf8) && (_moreBytes == 0)) {
178                 // 111110xx
179
_code = b & 0x03;
180                 _moreBytes = 4;
181                 return read2();
182             } else if (((b & 0xfe) == 0xfc) && (_moreBytes == 0)) {
183                 // 1111110x
184
_code = b & 0x01;
185                 _moreBytes = 5;
186                 return read2();
187             } else {
188                 throw new CharConversionException("Invalid UTF-8 Encoding");
189             }
190         } else { // No more bytes in buffer.
191
if (_inputStream == null)
192                 throw new IOException("Stream closed");
193             _start = 0;
194             _end = _inputStream.read(_bytes, 0, _bytes.length);
195             if (_end > 0) {
196                 return read2(); // Continues.
197
} else { // Done.
198
if (_moreBytes == 0) {
199                     return -1;
200                 } else { // Incomplete sequence.
201
throw new CharConversionException(
202                             "Unexpected end of stream");
203                 }
204             }
205         }
206     }
207
208     private int _code;
209
210     private int _moreBytes;
211
212     /**
213      * Reads characters into a portion of an array. This method will block
214      * until some input is available, an I/O error occurs or the end of
215      * the stream is reached.
216      *
217      * <p> Note: Characters between U+10000 and U+10FFFF are represented
218      * by surrogate pairs (two <code>char</code>).</p>
219      *
220      * @param cbuf the destination buffer.
221      * @param off the offset at which to start storing characters.
222      * @param len the maximum number of characters to read
223      * @return the number of characters read, or -1 if the end of the
224      * stream has been reached
225      * @throws IOException if an I/O error occurs.
226      */

227     public int read(char cbuf[], int off, int len) throws IOException {
228         if (_inputStream == null)
229             throw new IOException("Stream closed");
230         if (_start >= _end) { // Fills buffer.
231
_start = 0;
232             _end = _inputStream.read(_bytes, 0, _bytes.length);
233             if (_end <= 0) { // Done.
234
return _end;
235             }
236         }
237         final int off_plus_len = off + len;
238         for (int i = off; i < off_plus_len;) {
239             // assert(_start < _end)
240
byte b = _bytes[_start];
241             if ((b >= 0) && (++_start < _end)) {
242                 cbuf[i++] = (char) b; // Most common case.
243
} else if (b < 0) {
244                 if (i < off_plus_len - 1) { // Up to two 'char' can be read.
245
int code = read2();
246                     if (code < 0x10000) {
247                         cbuf[i++] = (char) code;
248                     } else if (code <= 0x10ffff) { // Surrogates.
249
cbuf[i++] = (char) (((code - 0x10000) >> 10) + 0xd800);
250                         cbuf[i++] = (char) (((code - 0x10000) & 0x3ff) + 0xdc00);
251                     } else {
252                         throw new CharConversionException("Cannot convert U+"
253                                 + Integer.toHexString(code)
254                                 + " to char (code greater than U+10FFFF)");
255                     }
256                     if (_start < _end) {
257                         continue;
258                     }
259                 }
260                 return i - off;
261             } else { // End of buffer (_start >= _end).
262
cbuf[i++] = (char) b;
263                 return i - off;
264             }
265         }
266         return len;
267     }
268
269     /**
270      * Reads characters into the specified appendable. This method will block
271      * until the end of the stream is reached.
272      *
273      * @param dest the destination buffer.
274      * @throws IOException if an I/O error occurs.
275      */

276     public void read(Appendable dest) throws IOException {
277         if (_inputStream == null)
278             throw new IOException("Stream closed");
279         while (true) {
280             if (_start >= _end) { // Fills buffer.
281
_start = 0;
282                 _end = _inputStream.read(_bytes, 0, _bytes.length);
283                 if (_end <= 0) { // Done.
284
break;
285                 }
286             }
287             byte b = _bytes[_start];
288             if (b >= 0) {
289                 dest.append((char) b); // Most common case.
290
_start++;
291             } else {
292                 int code = read2();
293                 if (code < 0x10000) {
294                     dest.append((char) code);
295                 } else if (code <= 0x10ffff) { // Surrogates.
296
dest.append((char) (((code - 0x10000) >> 10) + 0xd800));
297                     dest.append((char) (((code - 0x10000) & 0x3ff) + 0xdc00));
298                 } else {
299                     throw new CharConversionException("Cannot convert U+"
300                             + Integer.toHexString(code)
301                             + " to char (code greater than U+10FFFF)");
302                 }
303             }
304         }
305     }
306
307     // Implements Reusable.
308
public void reset() {
309         _code = 0;
310         _end = 0;
311         _inputStream = null;
312         _moreBytes = 0;
313         _start = 0;
314     }
315
316 }
Popular Tags