KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > jasper > xmlparser > UTF8Reader


1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */

17
18 package org.apache.jasper.xmlparser;
19
20 import java.io.InputStream JavaDoc;
21 import java.io.IOException JavaDoc;
22 import java.io.Reader JavaDoc;
23 import java.io.UTFDataFormatException JavaDoc;
24 import org.apache.jasper.compiler.Localizer;
25
26 /**
27  * @author Andy Clark, IBM
28  *
29  * @version $Id: UTF8Reader.java 467222 2006-10-24 03:17:11Z markt $
30  */

31 public class UTF8Reader
32     extends Reader JavaDoc {
33
34     private org.apache.commons.logging.Log log=
35         org.apache.commons.logging.LogFactory.getLog( UTF8Reader.class );
36     
37     //
38
// Constants
39
//
40

41     /** Default byte buffer size (2048). */
42     public static final int DEFAULT_BUFFER_SIZE = 2048;
43
44     // debugging
45

46     /** Debug read. */
47     private static final boolean DEBUG_READ = false;
48
49     //
50
// Data
51
//
52

53     /** Input stream. */
54     protected InputStream JavaDoc fInputStream;
55
56     /** Byte buffer. */
57     protected byte[] fBuffer;
58
59     /** Offset into buffer. */
60     protected int fOffset;
61
62     /** Surrogate character. */
63     private int fSurrogate = -1;
64
65     //
66
// Constructors
67
//
68

/**
 * Creates a UTF-8 reader that decodes the bytes of the given input
 * stream through a byte buffer of the given size.
 *
 * @param inputStream The input stream to read the encoded bytes from.
 * @param size The size, in bytes, of the byte buffer to allocate.
 */
public UTF8Reader(InputStream inputStream, int size) {
    this.fBuffer = new byte[size];
    this.fInputStream = inputStream;
}
80
81     //
82
// Reader methods
83
//
84

85     /**
86      * Read a single character. This method will block until a character is
87      * available, an I/O error occurs, or the end of the stream is reached.
88      *
89      * <p> Subclasses that intend to support efficient single-character input
90      * should override this method.
91      *
92      * @return The character read, as an integer in the range 0 to 16383
93      * (<tt>0x00-0xffff</tt>), or -1 if the end of the stream has
94      * been reached
95      *
96      * @exception IOException If an I/O error occurs
97      */

98     public int read() throws IOException JavaDoc {
99
100         // decode character
101
int c = fSurrogate;
102         if (fSurrogate == -1) {
103             // NOTE: We use the index into the buffer if there are remaining
104
// bytes from the last block read. -Ac
105
int index = 0;
106
107             // get first byte
108
int b0 = index == fOffset
109                    ? fInputStream.read() : fBuffer[index++] & 0x00FF;
110             if (b0 == -1) {
111                 return -1;
112             }
113
114             // UTF-8: [0xxx xxxx]
115
// Unicode: [0000 0000] [0xxx xxxx]
116
if (b0 < 0x80) {
117                 c = (char)b0;
118             }
119
120             // UTF-8: [110y yyyy] [10xx xxxx]
121
// Unicode: [0000 0yyy] [yyxx xxxx]
122
else if ((b0 & 0xE0) == 0xC0) {
123                 int b1 = index == fOffset
124                        ? fInputStream.read() : fBuffer[index++] & 0x00FF;
125                 if (b1 == -1) {
126                     expectedByte(2, 2);
127                 }
128                 if ((b1 & 0xC0) != 0x80) {
129                     invalidByte(2, 2, b1);
130                 }
131                 c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
132             }
133
134             // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
135
// Unicode: [zzzz yyyy] [yyxx xxxx]
136
else if ((b0 & 0xF0) == 0xE0) {
137                 int b1 = index == fOffset
138                        ? fInputStream.read() : fBuffer[index++] & 0x00FF;
139                 if (b1 == -1) {
140                     expectedByte(2, 3);
141                 }
142                 if ((b1 & 0xC0) != 0x80) {
143                     invalidByte(2, 3, b1);
144                 }
145                 int b2 = index == fOffset
146                        ? fInputStream.read() : fBuffer[index++] & 0x00FF;
147                 if (b2 == -1) {
148                     expectedByte(3, 3);
149                 }
150                 if ((b2 & 0xC0) != 0x80) {
151                     invalidByte(3, 3, b2);
152                 }
153                 c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
154                     (b2 & 0x003F);
155             }
156
157             // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
158
// Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
159
// [1101 11yy] [yyxx xxxx] (low surrogate)
160
// * uuuuu = wwww + 1
161
else if ((b0 & 0xF8) == 0xF0) {
162                 int b1 = index == fOffset
163                        ? fInputStream.read() : fBuffer[index++] & 0x00FF;
164                 if (b1 == -1) {
165                     expectedByte(2, 4);
166                 }
167                 if ((b1 & 0xC0) != 0x80) {
168                     invalidByte(2, 3, b1);
169                 }
170                 int b2 = index == fOffset
171                        ? fInputStream.read() : fBuffer[index++] & 0x00FF;
172                 if (b2 == -1) {
173                     expectedByte(3, 4);
174                 }
175                 if ((b2 & 0xC0) != 0x80) {
176                     invalidByte(3, 3, b2);
177                 }
178                 int b3 = index == fOffset
179                        ? fInputStream.read() : fBuffer[index++] & 0x00FF;
180                 if (b3 == -1) {
181                     expectedByte(4, 4);
182                 }
183                 if ((b3 & 0xC0) != 0x80) {
184                     invalidByte(4, 4, b3);
185                 }
186                 int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
187                 if (uuuuu > 0x10) {
188                     invalidSurrogate(uuuuu);
189                 }
190                 int wwww = uuuuu - 1;
191                 int hs = 0xD800 |
192                          ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) |
193                          ((b2 >> 4) & 0x0003);
194                 int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
195                 c = hs;
196                 fSurrogate = ls;
197             }
198
199             // error
200
else {
201                 invalidByte(1, 1, b0);
202             }
203         }
204
205         // use surrogate
206
else {
207             fSurrogate = -1;
208         }
209
210         // return character
211
if (DEBUG_READ) {
212             if (log.isDebugEnabled())
213                 log.debug("read(): 0x"+Integer.toHexString(c));
214         }
215         return c;
216
217     } // read():int
218

219     /**
220      * Read characters into a portion of an array. This method will block
221      * until some input is available, an I/O error occurs, or the end of the
222      * stream is reached.
223      *
224      * @param ch Destination buffer
225      * @param offset Offset at which to start storing characters
226      * @param length Maximum number of characters to read
227      *
228      * @return The number of characters read, or -1 if the end of the
229      * stream has been reached
230      *
231      * @exception IOException If an I/O error occurs
232      */

233     public int read(char ch[], int offset, int length) throws IOException JavaDoc {
234
235         // handle surrogate
236
int out = offset;
237         if (fSurrogate != -1) {
238             ch[offset + 1] = (char)fSurrogate;
239             fSurrogate = -1;
240             length--;
241             out++;
242         }
243
244         // read bytes
245
int count = 0;
246         if (fOffset == 0) {
247             // adjust length to read
248
if (length > fBuffer.length) {
249                 length = fBuffer.length;
250             }
251
252             // perform read operation
253
count = fInputStream.read(fBuffer, 0, length);
254             if (count == -1) {
255                 return -1;
256             }
257             count += out - offset;
258         }
259
260         // skip read; last character was in error
261
// NOTE: Having an offset value other than zero means that there was
262
// an error in the last character read. In this case, we have
263
// skipped the read so we don't consume any bytes past the
264
// error. By signalling the error on the next block read we
265
// allow the method to return the most valid characters that
266
// it can on the previous block read. -Ac
267
else {
268             count = fOffset;
269             fOffset = 0;
270         }
271
272         // convert bytes to characters
273
final int total = count;
274         for (int in = 0; in < total; in++) {
275             int b0 = fBuffer[in] & 0x00FF;
276
277             // UTF-8: [0xxx xxxx]
278
// Unicode: [0000 0000] [0xxx xxxx]
279
if (b0 < 0x80) {
280                 ch[out++] = (char)b0;
281                 continue;
282             }
283
284             // UTF-8: [110y yyyy] [10xx xxxx]
285
// Unicode: [0000 0yyy] [yyxx xxxx]
286
if ((b0 & 0xE0) == 0xC0) {
287                 int b1 = -1;
288                 if (++in < total) {
289                     b1 = fBuffer[in] & 0x00FF;
290                 }
291                 else {
292                     b1 = fInputStream.read();
293                     if (b1 == -1) {
294                         if (out > offset) {
295                             fBuffer[0] = (byte)b0;
296                             fOffset = 1;
297                             return out - offset;
298                         }
299                         expectedByte(2, 2);
300                     }
301                     count++;
302                 }
303                 if ((b1 & 0xC0) != 0x80) {
304                     if (out > offset) {
305                         fBuffer[0] = (byte)b0;
306                         fBuffer[1] = (byte)b1;
307                         fOffset = 2;
308                         return out - offset;
309                     }
310                     invalidByte(2, 2, b1);
311                 }
312                 int c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
313                 ch[out++] = (char)c;
314                 count -= 1;
315                 continue;
316             }
317
318             // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
319
// Unicode: [zzzz yyyy] [yyxx xxxx]
320
if ((b0 & 0xF0) == 0xE0) {
321                 int b1 = -1;
322                 if (++in < total) {
323                     b1 = fBuffer[in] & 0x00FF;
324                 }
325                 else {
326                     b1 = fInputStream.read();
327                     if (b1 == -1) {
328                         if (out > offset) {
329                             fBuffer[0] = (byte)b0;
330                             fOffset = 1;
331                             return out - offset;
332                         }
333                         expectedByte(2, 3);
334                     }
335                     count++;
336                 }
337                 if ((b1 & 0xC0) != 0x80) {
338                     if (out > offset) {
339                         fBuffer[0] = (byte)b0;
340                         fBuffer[1] = (byte)b1;
341                         fOffset = 2;
342                         return out - offset;
343                     }
344                     invalidByte(2, 3, b1);
345                 }
346                 int b2 = -1;
347                 if (++in < total) {
348                     b2 = fBuffer[in] & 0x00FF;
349                 }
350                 else {
351                     b2 = fInputStream.read();
352                     if (b2 == -1) {
353                         if (out > offset) {
354                             fBuffer[0] = (byte)b0;
355                             fBuffer[1] = (byte)b1;
356                             fOffset = 2;
357                             return out - offset;
358                         }
359                         expectedByte(3, 3);
360                     }
361                     count++;
362                 }
363                 if ((b2 & 0xC0) != 0x80) {
364                     if (out > offset) {
365                         fBuffer[0] = (byte)b0;
366                         fBuffer[1] = (byte)b1;
367                         fBuffer[2] = (byte)b2;
368                         fOffset = 3;
369                         return out - offset;
370                     }
371                     invalidByte(3, 3, b2);
372                 }
373                 int c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
374                         (b2 & 0x003F);
375                 ch[out++] = (char)c;
376                 count -= 2;
377                 continue;
378             }
379
380             // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
381
// Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
382
// [1101 11yy] [yyxx xxxx] (low surrogate)
383
// * uuuuu = wwww + 1
384
if ((b0 & 0xF8) == 0xF0) {
385                 int b1 = -1;
386                 if (++in < total) {
387                     b1 = fBuffer[in] & 0x00FF;
388                 }
389                 else {
390                     b1 = fInputStream.read();
391                     if (b1 == -1) {
392                         if (out > offset) {
393                             fBuffer[0] = (byte)b0;
394                             fOffset = 1;
395                             return out - offset;
396                         }
397                         expectedByte(2, 4);
398                     }
399                     count++;
400                 }
401                 if ((b1 & 0xC0) != 0x80) {
402                     if (out > offset) {
403                         fBuffer[0] = (byte)b0;
404                         fBuffer[1] = (byte)b1;
405                         fOffset = 2;
406                         return out - offset;
407                     }
408                     invalidByte(2, 4, b1);
409                 }
410                 int b2 = -1;
411                 if (++in < total) {
412                     b2 = fBuffer[in] & 0x00FF;
413                 }
414                 else {
415                     b2 = fInputStream.read();
416                     if (b2 == -1) {
417                         if (out > offset) {
418                             fBuffer[0] = (byte)b0;
419                             fBuffer[1] = (byte)b1;
420                             fOffset = 2;
421                             return out - offset;
422                         }
423                         expectedByte(3, 4);
424                     }
425                     count++;
426                 }
427                 if ((b2 & 0xC0) != 0x80) {
428                     if (out > offset) {
429                         fBuffer[0] = (byte)b0;
430                         fBuffer[1] = (byte)b1;
431                         fBuffer[2] = (byte)b2;
432                         fOffset = 3;
433                         return out - offset;
434                     }
435                     invalidByte(3, 4, b2);
436                 }
437                 int b3 = -1;
438                 if (++in < total) {
439                     b3 = fBuffer[in] & 0x00FF;
440                 }
441                 else {
442                     b3 = fInputStream.read();
443                     if (b3 == -1) {
444                         if (out > offset) {
445                             fBuffer[0] = (byte)b0;
446                             fBuffer[1] = (byte)b1;
447                             fBuffer[2] = (byte)b2;
448                             fOffset = 3;
449                             return out - offset;
450                         }
451                         expectedByte(4, 4);
452                     }
453                     count++;
454                 }
455                 if ((b3 & 0xC0) != 0x80) {
456                     if (out > offset) {
457                         fBuffer[0] = (byte)b0;
458                         fBuffer[1] = (byte)b1;
459                         fBuffer[2] = (byte)b2;
460                         fBuffer[3] = (byte)b3;
461                         fOffset = 4;
462                         return out - offset;
463                     }
464                     invalidByte(4, 4, b2);
465                 }
466
467                 // decode bytes into surrogate characters
468
int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
469                 if (uuuuu > 0x10) {
470                     invalidSurrogate(uuuuu);
471                 }
472                 int wwww = uuuuu - 1;
473                 int zzzz = b1 & 0x000F;
474                 int yyyyyy = b2 & 0x003F;
475                 int xxxxxx = b3 & 0x003F;
476                 int hs = 0xD800 | ((wwww << 6) & 0x03C0) | (zzzz << 2) | (yyyyyy >> 4);
477                 int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
478
479                 // set characters
480
ch[out++] = (char)hs;
481                 ch[out++] = (char)ls;
482                 count -= 2;
483                 continue;
484             }
485
486             // error
487
if (out > offset) {
488                 fBuffer[0] = (byte)b0;
489                 fOffset = 1;
490                 return out - offset;
491             }
492             invalidByte(1, 1, b0);
493         }
494
495         // return number of characters converted
496
if (DEBUG_READ) {
497             if (log.isDebugEnabled())
498                 log.debug("read(char[],"+offset+','+length+"): count="+count);
499         }
500         return count;
501
502     } // read(char[],int,int)
503

504     /**
505      * Skip characters. This method will block until some characters are
506      * available, an I/O error occurs, or the end of the stream is reached.
507      *
508      * @param n The number of characters to skip
509      *
510      * @return The number of characters actually skipped
511      *
512      * @exception IOException If an I/O error occurs
513      */

514     public long skip(long n) throws IOException JavaDoc {
515
516         long remaining = n;
517         final char[] ch = new char[fBuffer.length];
518         do {
519             int length = ch.length < remaining ? ch.length : (int)remaining;
520             int count = read(ch, 0, length);
521             if (count > 0) {
522                 remaining -= count;
523             }
524             else {
525                 break;
526             }
527         } while (remaining > 0);
528
529         long skipped = n - remaining;
530         return skipped;
531
532     } // skip(long):long
533

534     /**
535      * Tell whether this stream is ready to be read.
536      *
537      * @return True if the next read() is guaranteed not to block for input,
538      * false otherwise. Note that returning false does not guarantee that the
539      * next read will block.
540      *
541      * @exception IOException If an I/O error occurs
542      */

543     public boolean ready() throws IOException JavaDoc {
544         return false;
545     } // ready()
546

547     /**
548      * Tell whether this stream supports the mark() operation.
549      */

550     public boolean markSupported() {
551         return false;
552     } // markSupported()
553

554     /**
555      * Mark the present position in the stream. Subsequent calls to reset()
556      * will attempt to reposition the stream to this point. Not all
557      * character-input streams support the mark() operation.
558      *
559      * @param readAheadLimit Limit on the number of characters that may be
560      * read while still preserving the mark. After
561      * reading this many characters, attempting to
562      * reset the stream may fail.
563      *
564      * @exception IOException If the stream does not support mark(),
565      * or if some other I/O error occurs
566      */

567     public void mark(int readAheadLimit) throws IOException JavaDoc {
568     throw new IOException JavaDoc(
569                 Localizer.getMessage("jsp.error.xml.operationNotSupported",
570                      "mark()", "UTF-8"));
571     }
572
573     /**
574      * Reset the stream. If the stream has been marked, then attempt to
575      * reposition it at the mark. If the stream has not been marked, then
576      * attempt to reset it in some way appropriate to the particular stream,
577      * for example by repositioning it to its starting point. Not all
578      * character-input streams support the reset() operation, and some support
579      * reset() without supporting mark().
580      *
581      * @exception IOException If the stream has not been marked,
582      * or if the mark has been invalidated,
583      * or if the stream does not support reset(),
584      * or if some other I/O error occurs
585      */

586     public void reset() throws IOException JavaDoc {
587         fOffset = 0;
588         fSurrogate = -1;
589     } // reset()
590

591     /**
592      * Close the stream. Once a stream has been closed, further read(),
593      * ready(), mark(), or reset() invocations will throw an IOException.
594      * Closing a previously-closed stream, however, has no effect.
595      *
596      * @exception IOException If an I/O error occurs
597      */

598     public void close() throws IOException JavaDoc {
599         fInputStream.close();
600     } // close()
601

602     //
603
// Private methods
604
//
605

606     /** Throws an exception for expected byte. */
607     private void expectedByte(int position, int count)
608         throws UTFDataFormatException JavaDoc {
609
610         throw new UTFDataFormatException JavaDoc(
611                 Localizer.getMessage("jsp.error.xml.expectedByte",
612                      Integer.toString(position),
613                      Integer.toString(count)));
614
615     } // expectedByte(int,int,int)
616

617     /** Throws an exception for invalid byte. */
618     private void invalidByte(int position, int count, int c)
619         throws UTFDataFormatException JavaDoc {
620
621         throw new UTFDataFormatException JavaDoc(
622                 Localizer.getMessage("jsp.error.xml.invalidByte",
623                      Integer.toString(position),
624                      Integer.toString(count)));
625     } // invalidByte(int,int,int,int)
626

627     /** Throws an exception for invalid surrogate bits. */
628     private void invalidSurrogate(int uuuuu) throws UTFDataFormatException JavaDoc {
629         
630         throw new UTFDataFormatException JavaDoc(
631                 Localizer.getMessage("jsp.error.xml.invalidHighSurrogate",
632                      Integer.toHexString(uuuuu)));
633     } // invalidSurrogate(int)
634

635 } // class UTF8Reader
636
Popular Tags