InputStreamWrapper


1   /*
2   Copyright (c) 2004, Dennis M. Sosnoski.
3   All rights reserved.
4   
5   Redistribution and use in source and binary forms, with or without modification,
6   are permitted provided that the following conditions are met:
7   
8    * Redistributions of source code must retain the above copyright notice, this
9      list of conditions and the following disclaimer.
10   * Redistributions in binary form must reproduce the above copyright notice,
11     this list of conditions and the following disclaimer in the documentation
12     and/or other materials provided with the distribution.
13   * Neither the name of JiBX nor the names of its contributors may be used
14     to endorse or promote products derived from this software without specific
15     prior written permission.
16  
17  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
21  ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
24  ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28  
29  package org.jibx.runtime.impl;
30  
31  import java.io.IOException  ;
32  import java.io.InputStream  ;
33  import java.io.InputStreamReader  ;
34  import java.io.Reader  ;
35  
36  /**
37   * Wrapper for input stream that supports multiple character encodings. This is
38   * needed because the XPP3 pull parser does not support detecting the character
39   * encoding for a document based on the content of the document. If used with a
40   * common encoding this performs the conversion to characters using an inner
41   * reader class; otherwise, this creates the appropriate reader type.
42   *
43   * @author Dennis M. Sosnoski
44   * @version 1.0
45   */
46  
public class InputStreamWrapper
{
    /** Default input buffer size. */
    private static final int BUFFER_SIZE = 2048;

    /** Encoding assumed when nothing in the input says otherwise. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /** Message used for every malformed or truncated UTF-8 sequence. */
    private static final String UTF8_ERROR = "UTF-8 conversion error";

    /** Name of encoding to be used for stream. */
    private String m_encodingName;

    /** Stream for byte input (<code>null</code> if none is open). */
    private InputStream m_stream;

    /** Flag for end of stream reached. */
    private boolean m_isEnd;

    /** Buffer for input bytes. */
    private final byte[] m_buffer;

    /** Offset past end of bytes in buffer. */
    private int m_endOffset;

    /** Current offset for generating character from buffer. */
    private int m_emptyOffset;

    /** Scan position offset used for lookahead in buffer. */
    private int m_scanOffset;

    /**
     * Constructor.
     */
    public InputStreamWrapper() {
        m_buffer = new byte[BUFFER_SIZE];
    }

    /**
     * Set input stream with encoding to be defined later. If an input stream is
     * currently open when this is called the existing stream is closed, with
     * any errors ignored.
     *
     * @param ins stream for document data input
     */
    public void setInput(InputStream ins) {
        try {
            close();
        } catch (IOException ignored) {
            // deliberately empty: closing the previous stream is best-effort
        }
        m_stream = ins;
        reset();
    }

    /**
     * Set input stream with specified encoding. If an input stream is currently
     * open when this is called the existing stream is closed, with any errors
     * ignored.
     *
     * @param ins stream for document data input
     * @param enc character encoding used for input from stream
     * (<code>null</code> if to be determined from XML input)
     * @throws IOException if the encoding cannot be set
     */
    public void setInput(InputStream ins, String enc) throws IOException {
        setInput(ins);
        setEncoding(enc);
    }

    /**
     * Set encoding for stream. This call is only valid if the encoding has not
     * been set previously.
     *
     * @param enc character encoding used for input from stream
     * (<code>null</code> if to be determined from XML input)
     * @throws IOException if encoding already set
     */
    public void setEncoding(String enc) throws IOException {
        if (m_encodingName != null) {
            throw new IOException("Encoding has already been set for stream");
        }
        m_encodingName = enc;
    }

    /**
     * Get the currently open stream.
     *
     * @return wrapped stream (never <code>null</code>)
     * @throws IOException if no stream is open
     */
    private InputStream activeStream() throws IOException {
        if (m_stream == null) {
            throw new IOException("No open input stream");
        }
        return m_stream;
    }

    /**
     * Validate an (array, offset, length) triple the way the standard
     * stream classes do.
     *
     * @param size length of the target array
     * @param off starting offset in the array
     * @param len number of elements requested
     */
    private static void checkBounds(int size, int off, int len) {
        if (off < 0 || len < 0 || len > size - off) {
            throw new IndexOutOfBoundsException();
        }
    }

    /**
     * Get the total byte length of the UTF-8 sequence starting with a byte.
     *
     * @param lead first byte of sequence (sign is irrelevant)
     * @return sequence length (1 to 4), or 0 if the byte cannot start a
     * sequence
     */
    private static int utf8Length(int lead) {
        int value = lead & 0xFF;
        if (value < 0x80) {
            return 1;
        } else if ((value & 0xE0) == 0xC0) {
            return 2;
        } else if ((value & 0xF0) == 0xE0) {
            return 3;
        } else if ((value & 0xF8) == 0xF0) {
            return 4;
        } else {
            return 0;
        }
    }

    /**
     * Reads data into the buffer. Any retained data is first copied down to the
     * start of the buffer array. Next, data is read from the wrapped stream
     * into the available space in the buffer.
     *
     * @return <code>true</code> if the wrapped stream was read without hitting
     * the end of input, <code>false</code> if the end was reached
     * @throws IOException on error reading from wrapped stream, or if no
     * stream is open
     */
    private boolean fillBuffer() throws IOException {
        if (m_isEnd) {
            return false;
        }
        InputStream stream = activeStream();

        // move remaining data in buffer down to start
        int rem = m_endOffset - m_emptyOffset;
        if (rem > 0 && m_emptyOffset > 0) {
            System.arraycopy(m_buffer, m_emptyOffset, m_buffer, 0, rem);
        }
        m_emptyOffset = 0;

        // keep the end offset consistent even if the read below throws
        m_endOffset = rem;

        // read to maximum capacity of buffer
        int actual = stream.read(m_buffer, rem, m_buffer.length - rem);
        if (actual < 0) {
            m_isEnd = true;
            return false;
        }
        m_endOffset = rem + actual;
        return true;
    }

    /**
     * Reads data into the buffer to at least a minimum number of bytes. Data
     * is read from the wrapped stream until the end of the input stream is
     * reached or at least the requested number of bytes are present in the
     * buffer.
     *
     * @param min number of bytes required
     * @return <code>true</code> if buffer contains at least the required byte
     * count on return, <code>false</code> if not (including the case where
     * the request exceeds the buffer capacity)
     * @throws IOException on error reading from wrapped stream
     */
    private boolean require(int min) throws IOException {

        // a request beyond the buffer capacity can never be satisfied; without
        //  this check a full buffer would make the loop below spin forever
        if (min > m_buffer.length) {
            return false;
        }
        while (m_endOffset - m_emptyOffset < min) {
            if (!fillBuffer()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if a character is XML whitespace.
     *
     * @param chr character to be checked
     * @return <code>true</code> if whitespace, <code>false</code> if not
     */
    private boolean isWhite(int chr) {
        return chr == ' ' || chr == 0x09 || chr == 0x0A || chr == 0x0D;
    }

    /**
     * Reads a space or equals ('=') delimited token from the scan position in
     * the buffer. This treats bytes in the buffer as equivalent to characters.
     * The declaration terminator "?>" is always returned as a token of its
     * own, even when it is immediately followed by other text.
     *
     * @return token read from buffer (<code>null</code> if input ran out)
     * @throws IOException on error reading from wrapped stream
     */
    private String scanToken() throws IOException {
        boolean skipping = true;
        StringBuffer buff = new StringBuffer();
        while (require(m_scanOffset + 1)) {
            char chr = (char)m_buffer[m_scanOffset++];
            if (skipping) {
                if (!isWhite(chr)) {
                    skipping = false;
                    buff.append(chr);
                    if (chr == '=') {
                        return buff.toString();
                    }
                }
            } else if (chr == '>' && buff.length() == 1 &&
                buff.charAt(0) == '?') {

                // end of declaration, do not run on into document content
                buff.append(chr);
                return buff.toString();

            } else if (isWhite(chr) || chr == '=') {

                // leave delimiter in place for next scan
                m_scanOffset--;
                return buff.toString();

            } else {
                buff.append(chr);
            }
        }
        return null;
    }

    /**
     * Reads a quote delimited token from the scan position in the buffer. This
     * treats bytes in the buffer as equivalent to characters, and skips past
     * any leading whitespace.
     *
     * @return token read from buffer (<code>null</code> if no quoted value
     * found)
     * @throws IOException on error reading from wrapped stream
     */
    private String scanQuoted() throws IOException {
        boolean skipping = true;
        int quot = 0;
        StringBuffer buff = new StringBuffer();
        while (require(m_scanOffset + 1)) {
            char chr = (char)m_buffer[m_scanOffset++];
            if (skipping) {
                if (!isWhite(chr)) {
                    if (chr == '"' || chr == '\'') {
                        skipping = false;
                        quot = chr;
                    } else {
                        break;
                    }
                }
            } else if (chr == quot) {
                return buff.toString();
            } else {
                buff.append(chr);
            }
        }
        return null;
    }

    /**
     * Scan the XML declaration at the start of the buffer for an encoding
     * pseudo-attribute. Only called when the buffer starts with "&lt;?xm".
     *
     * @return declared encoding name, or the default if none declared
     * @throws IOException on error reading from wrapped stream
     */
    private String scanDeclaredEncoding() throws IOException {
        m_scanOffset = 2;
        String token = scanToken();
        if ("xml".equals(token)) {
            while ((token = scanToken()) != null && !"?>".equals(token)) {
                if ("encoding".equals(token)) {
                    if ("=".equals(scanToken())) {
                        token = scanQuoted();
                        if (token != null) {
                            return token;
                        }
                    }
                } else if ("=".equals(token)) {

                    // skip value of some other pseudo-attribute
                    scanQuoted();
                }
            }
        }
        return DEFAULT_ENCODING;
    }

    /**
     * Determine the encoding from the first four bytes of the buffer, reading
     * the XML declaration if one is present in a single-byte form.
     *
     * @return encoding name to be used
     * @throws IOException on error reading from wrapped stream
     */
    private String detectEncoding() throws IOException {

        // get first four bytes for initial determination
        int bom = ((m_buffer[0] & 0xFF) << 24) | ((m_buffer[1] & 0xFF) << 16) |
            ((m_buffer[2] & 0xFF) << 8) | (m_buffer[3] & 0xFF);
        if (bom == 0x3C3F786D) {

            // "<?xm": read encoding declaration with single byte characters
            return scanDeclaredEncoding();

        } else if (bom == 0x0000FEFF || bom == 0xFFFE0000 ||
            bom == 0x0000FFFE || bom == 0xFEFF0000) {

            // just use generic UCS-4 and let the libraries figure it out
            // NOTE(review): confirm this name is accepted by the target JVM
            return "UCS-4";

        } else if ((bom & 0xFFFFFF00) == 0xEFBBBF00) {

            // UTF-8 as specified by byte order mark
            return "UTF-8";

        }
        int upper = bom & 0xFFFF0000;
        if (upper == 0xFEFF0000 || bom == 0x003C003F) {

            // assume UTF-16BE for 16-bit BE
            return "UTF-16BE";

        } else if (upper == 0xFFFE0000 || bom == 0x3C003F00) {

            // assume UTF-16LE for 16-bit LE
            return "UTF-16LE";

        } else if (bom == 0x4C6FA794) {

            // just because we can, even though nobody should
            // NOTE(review): confirm this name is accepted by the target JVM
            return "EBCDIC";

        }
        return DEFAULT_ENCODING;
    }

    /**
     * Get reader for wrapped input stream. This creates and returns a reader
     * using the appropriate encoding, if necessary reading and examining the
     * first part of the stream (including the XML declaration, if present) to
     * determine the encoding.
     *
     * @return reader delivering the characters of the document
     * @throws IOException if error reading from document or creating a reader
     * for the encoding found
     */
    public Reader getReader() throws IOException {

        // check if we need to determine an encoding
        if (m_encodingName == null) {

            // try to get enough input to decide if anything other than default
            m_encodingName = DEFAULT_ENCODING;
            if (require(4)) {
                m_encodingName = detectEncoding();
            }
        }
        if (m_encodingName.equalsIgnoreCase("UTF-8")) {
            return new WrappedStreamUTF8Reader();
        } else if (m_encodingName.equalsIgnoreCase("ISO-8859-1") ||
            m_encodingName.equalsIgnoreCase("ASCII")) {
            return new WrappedStreamISO88591Reader();
        } else {
            return new InputStreamReader(new WrappedStream(), m_encodingName);
        }
    }

    /**
     * Get encoding for input document. This call may not return an accurate
     * result until after {@link #getReader} is called.
     *
     * @return character encoding for input document
     */
    public String getEncoding() {
        return m_encodingName;
    }

    /**
     * Close document input. Completes reading of document input, including
     * closing the input medium. The wrapper is returned to its initial state
     * even if closing the underlying stream fails.
     *
     * @throws IOException on error closing document
     */
    public void close() throws IOException {
        InputStream stream = m_stream;
        m_stream = null;
        reset();
        if (stream != null) {
            stream.close();
        }
    }

    /**
     * Reset to initial state for reuse.
     */
    public void reset() {
        m_isEnd = false;
        m_endOffset = 0;
        m_emptyOffset = 0;
        m_scanOffset = 0;
        m_encodingName = null;
    }

    /**
     * Stream that just uses the enclosing class to buffer input from the
     * wrapped stream.
     */
    private class WrappedStream extends InputStream
    {
        /* (non-Javadoc)
         * @see java.io.InputStream#available()
         */
        public int available() throws IOException {
            return m_endOffset - m_emptyOffset + activeStream().available();
        }

        /* (non-Javadoc)
         * @see java.io.InputStream#close()
         */
        public void close() throws IOException {
            InputStreamWrapper.this.close();
        }

        /* (non-Javadoc)
         * @see java.io.InputStream#read(byte[], int, int)
         */
        public int read(byte[] b, int off, int len) throws IOException {
            checkBounds(b.length, off, len);
            int total = 0;
            while (total < len) {
                int avail = m_endOffset - m_emptyOffset;
                if (avail == 0) {
                    if (!fillBuffer()) {
                        return total == 0 ? -1 : total;
                    }
                    continue;
                }
                int chunk = Math.min(avail, len - total);
                System.arraycopy(m_buffer, m_emptyOffset, b, off + total,
                    chunk);
                m_emptyOffset += chunk;
                total += chunk;
            }
            return total;
        }

        /* (non-Javadoc)
         * @see java.io.InputStream#skip(long)
         */
        public long skip(long n) throws IOException {
            if (n <= 0) {
                return 0;
            }
            int avail = m_endOffset - m_emptyOffset;
            if (n < avail) {
                m_emptyOffset += (int)n;
                return n;
            }

            // everything buffered is consumed before skipping in the stream
            m_emptyOffset = m_endOffset;
            long rest = n - avail;
            return rest > 0 ? avail + activeStream().skip(rest) : avail;
        }

        /* (non-Javadoc)
         * @see java.io.InputStream#read()
         */
        public int read() throws IOException {
            if (!require(1)) {
                return -1;
            }

            // mask so bytes above 0x7F are not confused with end of stream
            return m_buffer[m_emptyOffset++] & 0xFF;
        }
    }

    /**
     * Reader for input stream using UTF-8 encoding. This uses the enclosing
     * class to buffer input from the stream, interpreting it as characters on
     * demand. Four-byte sequences are delivered as surrogate pairs; malformed
     * or truncated sequences are reported with an <code>IOException</code>.
     */
    private class WrappedStreamUTF8Reader extends Reader
    {
        /** Low surrogate still to be delivered (0 if none). */
        private char m_pendingLow;

        /**
         * Decode the next code point from the buffered input.
         *
         * @return code point, or <code>-1</code> if at end of input
         * @throws IOException on read error or invalid UTF-8 data
         */
        private int nextCodePoint() throws IOException {
            if (!require(1)) {
                return -1;
            }
            int lead = m_buffer[m_emptyOffset];
            if (lead >= 0) {
                m_emptyOffset++;
                return lead;
            }

            // make sure the whole sequence is buffered, refilling as needed
            int count = utf8Length(lead);
            if (count == 0 || !require(count)) {
                throw new IOException(UTF8_ERROR);
            }

            // offset may have moved during refill, so only read it now
            int base = m_emptyOffset;
            int value = lead & (0xFF >> (count + 1));
            for (int i = 1; i < count; i++) {
                int next = m_buffer[base + i];
                if ((next & 0xC0) != 0x80) {
                    throw new IOException(UTF8_ERROR);
                }
                value = (value << 6) | (next & 0x3F);
            }
            if (value > 0x10FFFF) {
                throw new IOException(UTF8_ERROR);
            }
            m_emptyOffset = base + count;
            return value;
        }

        /* (non-Javadoc)
         * @see java.io.Reader#close()
         */
        public void close() throws IOException {
            m_pendingLow = 0;
            InputStreamWrapper.this.close();
        }

        /* (non-Javadoc)
         * @see java.io.Reader#read(char[], int, int)
         */
        public int read(char[] b, int off, int len) throws IOException {
            checkBounds(b.length, off, len);
            int start = off;
            int end = off + len;

            // deliver second half of a surrogate pair split by the last call
            if (off < end && m_pendingLow != 0) {
                b[off++] = m_pendingLow;
                m_pendingLow = 0;
            }
            byte[] buff = m_buffer;
            while (off < end) {

                // fast path for run of single-byte characters in buffer
                int empty = m_emptyOffset;
                int limit = m_endOffset;
                while (off < end && empty < limit && buff[empty] >= 0) {
                    b[off++] = (char)buff[empty++];
                }
                m_emptyOffset = empty;
                if (off == end) {
                    break;
                }

                // multi-byte character or empty buffer, use general decode
                int chr = nextCodePoint();
                if (chr < 0) {
                    int actual = off - start;
                    return actual > 0 ? actual : -1;
                } else if (chr >= 0x10000) {
                    chr -= 0x10000;
                    b[off++] = (char)(0xD800 + (chr >> 10));
                    char low = (char)(0xDC00 + (chr & 0x3FF));
                    if (off < end) {
                        b[off++] = low;
                    } else {
                        m_pendingLow = low;
                    }
                } else {
                    b[off++] = (char)chr;
                }
            }
            return len;
        }

        /* (non-Javadoc)
         * @see java.io.Reader#read()
         */
        public int read() throws IOException {
            if (m_pendingLow != 0) {
                char low = m_pendingLow;
                m_pendingLow = 0;
                return low;
            }
            int chr = nextCodePoint();
            if (chr >= 0x10000) {
                chr -= 0x10000;
                m_pendingLow = (char)(0xDC00 + (chr & 0x3FF));
                return 0xD800 + (chr >> 10);
            }
            return chr;
        }

        /* (non-Javadoc)
         * @see java.io.Reader#ready()
         */
        public boolean ready() throws IOException {
            if (m_pendingLow != 0) {
                return true;
            }
            int avail = m_endOffset - m_emptyOffset;
            if (avail <= 0) {
                return false;
            }

            // ready only if the next character is completely buffered (an
            //  invalid lead byte fails immediately, so it does not block)
            return avail >= utf8Length(m_buffer[m_emptyOffset]);
        }
    }

    /**
     * Reader for input stream using ISO8859-1 encoding. This uses the enclosing
     * class to buffer input from the stream, interpreting it as characters on
     * demand.
     */
    private class WrappedStreamISO88591Reader extends Reader
    {
        /* (non-Javadoc)
         * @see java.io.Reader#close()
         */
        public void close() throws IOException {
            InputStreamWrapper.this.close();
        }

        /* (non-Javadoc)
         * @see java.io.Reader#read(char[], int, int)
         */
        public int read(char[] b, int off, int len) throws IOException {
            checkBounds(b.length, off, len);
            int start = off;
            int end = off + len;
            byte[] buff = m_buffer;
            while (off < end) {

                // make sure there's data in buffer
                if (m_emptyOffset == m_endOffset && !fillBuffer()) {
                    int actual = off - start;
                    return actual > 0 ? actual : -1;
                }

                // convert as many bytes as are wanted and available
                int empty = m_emptyOffset;
                int limit = Math.min(m_endOffset, empty + (end - off));
                while (empty < limit) {
                    b[off++] = (char)(buff[empty++] & 0xFF);
                }
                m_emptyOffset = empty;
            }
            return len;
        }

        /* (non-Javadoc)
         * @see java.io.Reader#read()
         */
        public int read() throws IOException {
            if (!require(1)) {
                return -1;
            }
            return m_buffer[m_emptyOffset++] & 0xFF;
        }

        /* (non-Javadoc)
         * @see java.io.Reader#ready()
         */
        public boolean ready() throws IOException {
            return m_emptyOffset < m_endOffset;
        }
    }
}
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags