EncodingHeuristics


1   /* Copyright 2002, 2003, 2005 Elliotte Rusty Harold
2      
3      This library is free software; you can redistribute it and/or modify
4      it under the terms of version 2.1 of the GNU Lesser General Public 
5      License as published by the Free Software Foundation.
6      
7      This library is distributed in the hope that it will be useful,
8      but WITHOUT ANY WARRANTY; without even the implied warranty of
9      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
10     GNU Lesser General Public License for more details.
11     
12     You should have received a copy of the GNU Lesser General Public
13     License along with this library; if not, write to the 
14     Free Software Foundation, Inc., 59 Temple Place, Suite 330, 
15     Boston, MA 02111-1307  USA
16     
17     You can contact Elliotte Rusty Harold by sending e-mail to
18     elharo@metalab.unc.edu. Please include the word "XOM" in the
19     subject line. The XOM home page is located at http://www.xom.nu/
20  */
21  
22  package nu.xom.xinclude;
23  
24  import java.io.IOException  ;
25  import java.io.InputStream  ;
26  
/**
 * <p>
 * <code>EncodingHeuristics</code> reads from a stream
 * (which should be buffered and must support
 * <code>mark</code>/<code>reset</code>) and attempts to guess
 * what the encoding of the text in the stream is.
 * Byte order marks are stripped from the stream.
 * If it fails to determine the type of the encoding,
 * it returns the default UTF-8.
 * </p>
 *
 * @author Elliotte Rusty Harold
 * @version 1.0
 */
class EncodingHeuristics {

    /** Encoding reported whenever nothing better can be determined. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /**
     * Maximum number of bytes examined. This is also the read limit
     * passed to <code>mark</code>, so the method never reads more
     * than this many bytes before resetting.
     */
    private static final int LOOKAHEAD = 1024;

    // No instances allowed
    private EncodingHeuristics() {}


    /**
     * <p>
     * This utility method uses a variety of heuristics to
     * attempt to guess the encoding from the initial
     * bytes of the stream: first byte order marks, then the
     * byte pattern of a leading <code>&lt;</code> or
     * <code>&lt;?xm</code>, and finally the encoding
     * pseudo-attribute of an XML or text declaration.
     * </p>
     *
     * <p>
     * When a byte order mark is recognized it is consumed, and the
     * stream is left positioned immediately after it. In every other
     * case the stream is reset to the position it had when this
     * method was invoked. The stream's mark is overwritten by
     * this method.
     * </p>
     *
     * @param in   <code>InputStream</code> to read from; it must
     *     support <code>mark</code> and <code>reset</code>.
     * @return String  The name of the encoding; "UTF-8" if the
     *     encoding cannot be determined or reading fails.
     * @throws IOException if the stream cannot be reset back
     *      to where it was when the method was invoked.
     */
    public static String readEncodingFromStream(InputStream in)
      throws IOException {

        // This may fail if there are a lot of space
        // characters before the end of the encoding declaration
        in.mark(LOOKAHEAD);

        try {
            // Lots of things can go wrong here. If any do,
            // return "UTF-8" as the default.
            int byte1 = in.read();
            int byte2 = in.read();
            if (byte1 == 0xFE && byte2 == 0xFF) {
                // Don't reset because the byte order mark should not be
                // included per section 4.3 of the XInclude spec
                return "UnicodeBig";
            }
            else if (byte1 == 0xFF && byte2 == 0xFE) {
                // FF FE is the UTF-16LE byte order mark, but FF FE 00 00
                // is the UTF-32LE one, so the next two bytes decide.
                // Either way the byte order mark itself stays consumed.
                if (consumeTwoZeroBytes(in)) {
                    // Most Java VMs don't support this next one
                    return "UTF32LE";
                }
                return "UnicodeLittle";
            }

            /* In accordance with the Character Model,
               when the text format is a Unicode encoding, the XInclude
               processor must fail the inclusion when the text in the
               selected range is non-normalized. When transcoding
               characters to a Unicode encoding from a legacy encoding,
               a normalizing transcoder must be used. */

            int byte3 = in.read();
            // check for UTF-8 byte order mark
            if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF) {
                // Don't reset; the byte order mark is not included
                return "UTF-8";
            }

            int byte4 = in.read();
            if (byte1 == 0x00 && byte2 == 0x00
              && byte3 == 0xFE && byte4 == 0xFF) {
                // Don't reset; the byte order mark is not included
                // Most Java VMs don't support this next one
                return "UTF32BE";
            }
            else if (byte1 == 0x00 && byte2 == 0x00
              && byte3 == 0xFF && byte4 == 0xFE) {
                // NOTE: 00 00 FF FE is not the UTF-32LE byte order mark
                // (that is FF FE 00 00, handled above). This mapping is
                // retained unchanged for backward compatibility.
                return "UTF32LE";
            }

            // no byte order mark present; first character must be
            // less than sign or white space
            // Let's look for less-than signs first
            if (byte1 == 0x00 && byte2 == 0x00
              && byte3 == 0x00 && byte4 == '<') {
                in.reset();
                return "UTF32BE";
            }
            else if (byte1 == '<' && byte2 == 0x00
              && byte3 == 0x00 && byte4 == 0x00) {
                in.reset();
                return "UTF32LE";
            }
            else if (byte1 == 0x00 && byte2 == '<'
              && byte3 == 0x00 && byte4 == '?') {
                in.reset();
                return "UnicodeBigUnmarked";
            }
            else if (byte1 == '<' && byte2 == 0x00
              && byte3 == '?' && byte4 == 0x00) {
                in.reset();
                return "UnicodeLittleUnmarked";
            }
            else if (byte1 == '<' && byte2 == '?'
              && byte3 == 'x' && byte4 == 'm') {
                // ASCII compatible, must read encoding declaration.
                // 1024 bytes will be far enough to read most
                // XML declarations
                byte[] data = new byte[LOOKAHEAD];
                data[0] = (byte) byte1;
                data[1] = (byte) byte2;
                data[2] = (byte) byte3;
                data[3] = (byte) byte4;
                // A single read() may return fewer bytes than are
                // available, so keep reading until full or end of stream
                int length = 4 + readUpTo(in, data, 4, data.length - 4);
                // Use Latin-1 (ISO-8859-1) because it's ASCII compatible
                // and all byte sequences are legal Latin-1 sequences
                // so there are no encoding errors to worry about when
                // reading past the end of the XML/text declaration
                String declaration = new String(data, 0, length, "8859_1");
                in.reset();
                return findEncodingDeclaration(declaration);
            }
            else if (byte1 == 0x4C && byte2 == 0x6F
              && byte3 == 0xA7 && byte4 == 0x94) {
                // EBCDIC compatible, must read encoding declaration
                byte[] buffer = new byte[1016];
                int length = readUpTo(in, buffer, 0, buffer.length);
                in.reset();
                // Most EBCDIC encodings are compatible with Cp037 over
                // the range we care about. Decode only the bytes that
                // were actually read.
                return findEncodingDeclaration(
                  new String(buffer, 0, length, "Cp037"));
            }

        }
        catch (Exception ex) {
            // Deliberately broad: detection is best effort, so any
            // failure while sniffing falls back to the default.
            in.reset();
            return DEFAULT_ENCODING;
        }

        // no XML or text declaration present
        in.reset();
        return DEFAULT_ENCODING;

    }


    /**
     * Reads two bytes and reports whether both are zero. If they are
     * not, the stream is rewound to where it was when this method was
     * called. This replaces the stream's current mark.
     */
    private static boolean consumeTwoZeroBytes(InputStream in)
      throws IOException {

        in.mark(2);
        int first = in.read();
        int second = in.read();
        if (first == 0x00 && second == 0x00) {
            return true;
        }
        in.reset();
        return false;

    }


    /**
     * Reads until <code>max</code> bytes have been stored in
     * <code>buffer</code> starting at <code>offset</code>, or the
     * stream stops supplying bytes. Returns the number of bytes stored.
     */
    private static int readUpTo(
      InputStream in, byte[] buffer, int offset, int max)
      throws IOException {

        int total = 0;
        while (total < max) {
            int count = in.read(buffer, offset + total, max - total);
            // -1 is end of stream; 0 would otherwise loop forever
            if (count <= 0) break;
            total += count;
        }
        return total;

    }


    /**
     * Extracts the value of the <code>encoding</code> pseudo-attribute
     * from the start of a decoded document. Only the text before the
     * first <code>?&gt;</code> is searched (the whole text if there is
     * none), so that the word "encoding" appearing later in the
     * document is not mistaken for the declaration. Returns "UTF-8"
     * if the pseudo-attribute is missing or malformed.
     */
    private static String findEncodingDeclaration(String declaration) {

        int end = declaration.indexOf("?>");
        String head = (end == -1)
          ? declaration : declaration.substring(0, end);

        int keyword = head.indexOf("encoding");
        if (keyword == -1) { // no encoding pseudo-attribute
            return DEFAULT_ENCODING;
        }

        // get rid of white space before equals sign
        int position = skipWhiteSpace(head, keyword + 8);
        if (position >= head.length()
          || head.charAt(position) != '=') { // malformed
            return DEFAULT_ENCODING;
        }

        // get rid of white space after equals sign
        position = skipWhiteSpace(head, position + 1);
        if (position >= head.length()) { // malformed
            return DEFAULT_ENCODING;
        }
        char delimiter = head.charAt(position);
        if (delimiter != '\'' && delimiter != '"') { // malformed
            return DEFAULT_ENCODING;
        }

        // now positioned to read encoding name
        int close = head.indexOf(delimiter, position + 1);
        if (close == -1) { // unterminated value
            return DEFAULT_ENCODING;
        }
        return head.substring(position + 1, close);

    }


    /**
     * Returns the index of the first character at or after
     * <code>position</code> that is not a space, tab, carriage return,
     * or line feed; returns <code>text.length()</code> if there is none.
     */
    private static int skipWhiteSpace(String text, int position) {

        while (position < text.length()) {
            char c = text.charAt(position);
            if (c != ' ' && c != '\t' && c != '\r' && c != '\n') {
                break;
            }
            position++;
        }
        return position;

    }

}
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags