KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > nu > xom > xinclude > EncodingHeuristics


/* Copyright 2002, 2003, 2005 Elliotte Rusty Harold
   
   This library is free software; you can redistribute it and/or modify
   it under the terms of version 2.1 of the GNU Lesser General Public
   License as published by the Free Software Foundation.
   
   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU Lesser General Public License for more details.
   
   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, write to the
   Free Software Foundation, Inc., 59 Temple Place, Suite 330,
   Boston, MA 02111-1307 USA
   
   You can contact Elliotte Rusty Harold by sending e-mail to
   elharo@metalab.unc.edu. Please include the word "XOM" in the
   subject line. The XOM home page is located at http://www.xom.nu/
*/

21
22 package nu.xom.xinclude;
23
24 import java.io.IOException JavaDoc;
25 import java.io.InputStream JavaDoc;
26
/**
 * <p>
 * <code>EncodingHeuristics</code> reads from a stream
 * (which should be buffered) and attempts to guess
 * what the encoding of the text in the stream is.
 * Byte order marks are stripped from the stream.
 * If it fails to determine the type of the encoding,
 * it returns the default UTF-8.
 * </p>
 *
 * @author Elliotte Rusty Harold
 * @version 1.0
 */
class EncodingHeuristics {

    /** Encoding reported whenever nothing more specific can be determined. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /**
     * Maximum number of leading bytes that are examined. This is also
     * the read limit passed to <code>mark()</code>, so no more than
     * this many bytes may be consumed before the stream is reset.
     */
    private static final int LOOKAHEAD = 1024;

    // No instances allowed
    private EncodingHeuristics() {}


    /**
     * <p>
     * This utility method uses a variety of heuristics to
     * attempt to guess the encoding from the initial
     * characters.
     * </p>
     *
     * <p>
     * If the stream begins with a byte order mark, the mark is
     * consumed and the stream is left positioned immediately
     * after it. In every other case the stream is reset to
     * where it was when the method was invoked.
     * </p>
     *
     * @param in <code>InputStream</code> to read from.
     * @return String The name of the encoding; "UTF-8" if the
     *     encoding cannot be determined.
     * @throws IOException if the stream cannot be reset back
     * to where it was when the method was invoked.
     */
    public static String readEncodingFromStream(InputStream in)
      throws IOException {

        // This may fail if there are a lot of space
        // characters before the end of the encoding declaration
        in.mark(LOOKAHEAD);

        try {
            // Lots of things can go wrong here. If any do,
            // return "UTF-8" as the default.
            int byte1 = in.read();
            int byte2 = in.read();
            if (byte1 == 0xFE && byte2 == 0xFF) {
                // Don't reset because the byte order mark should not be
                // included per section 4.3 of the XInclude spec
                return "UnicodeBig";
            }
            else if (byte1 == 0xFF && byte2 == 0xFE) {
                // FF FE begins both the UTF-16LE mark and the UTF-32LE
                // mark (FF FE 00 00), so the next two bytes decide.
                return readLittleEndianMark(in);
            }

            /* In accordance with the Character Model,
               when the text format is a Unicode encoding, the XInclude
               processor must fail the inclusion when the text in the
               selected range is non-normalized. When transcoding
               characters to a Unicode encoding from a legacy encoding,
               a normalizing transcoder must be used. */

            int byte3 = in.read();
            // check for UTF-8 byte order mark
            if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF) {
                // Don't reset because the byte order mark should not be
                // included per section 4.3 of the XInclude spec
                return DEFAULT_ENCODING;
            }

            int byte4 = in.read();
            if (byte1 == 0x00
              && byte2 == 0x00 && byte3 == 0xFE && byte4 == 0xFF) {
                // Don't reset because the byte order mark should not be
                // included per section 4.3 of the XInclude spec
                // Most Java VMs don't support this next one
                return "UTF32BE";
            }
            // 00 00 FF FE is deliberately not reported as UTF32LE: it is
            // the unusual 2143 byte order, not little-endian UTF-32,
            // so it falls through to the default below.

            // no byte order mark present; first character must be
            // less than sign or white space
            // Let's look for less-than signs first
            String encoding = DEFAULT_ENCODING;
            if (byte1 == 0x00 && byte2 == 0x00
              && byte3 == 0x00 && byte4 == '<') {
                encoding = "UTF32BE";
            }
            else if (byte1 == '<' && byte2 == 0x00
              && byte3 == 0x00 && byte4 == 0x00) {
                encoding = "UTF32LE";
            }
            else if (byte1 == 0x00 && byte2 == '<'
              && byte3 == 0x00 && byte4 == '?') {
                encoding = "UnicodeBigUnmarked";
            }
            else if (byte1 == '<' && byte2 == 0x00
              && byte3 == '?' && byte4 == 0x00) {
                encoding = "UnicodeLittleUnmarked";
            }
            else if (byte1 == '<' && byte2 == '?'
              && byte3 == 'x' && byte4 == 'm') {
                // ASCII compatible, must read encoding declaration.
                // Use Latin-1 (ISO-8859-1) because it's ASCII compatible
                // and all byte sequences are legal Latin-1 sequences
                // so there are no encoding errors to worry about when
                // reading past the end of the XML/text declaration
                encoding = findEncodingDeclaration(
                  readPrefix(in, byte1, byte2, byte3, byte4, "8859_1"));
            }
            else if (byte1 == 0x4C && byte2 == 0x6F
              && byte3 == 0xA7 && byte4 == 0x94) {
                // EBCDIC compatible, must read encoding declaration.
                // Most EBCDIC encodings are compatible with Cp037 over
                // the range we care about
                encoding = findEncodingDeclaration(
                  readPrefix(in, byte1, byte2, byte3, byte4, "Cp037"));
            }

            // no byte order mark was consumed, so give every byte back
            in.reset();
            return encoding;
        }
        catch (IOException ex) {
            // Best effort only: a failed read or an unsupported helper
            // charset must not prevent the caller from trying UTF-8.
            in.reset();
            return DEFAULT_ENCODING;
        }

    }


    /**
     * Decides between UTF-16LE and UTF-32LE after the bytes FF FE have
     * been consumed. The stream is left positioned immediately after
     * whichever byte order mark was recognized.
     *
     * @param in the stream, positioned just after FF FE
     * @return "UTF32LE" if the next two bytes are both zero,
     *     "UnicodeLittle" otherwise
     * @throws IOException if the look-ahead bytes cannot be read or
     *     pushed back
     */
    private static String readLittleEndianMark(InputStream in)
      throws IOException {

        if (!in.markSupported()) {
            // Peeking would lose data; assume UTF-16, as earlier
            // versions of this method always did.
            return "UnicodeLittle";
        }
        // The original mark is no longer needed: both outcomes leave
        // the stream after the byte order mark rather than at the start.
        in.mark(2);
        int byte3 = in.read();
        int byte4 = in.read();
        if (byte3 == 0x00 && byte4 == 0x00) {
            // Most Java VMs don't support this one
            return "UTF32LE";
        }
        in.reset();
        return "UnicodeLittle";

    }


    /**
     * Reads up to <code>LOOKAHEAD</code> bytes from the start of the
     * document, beginning with the four bytes already consumed, and
     * decodes them. The read is repeated until the buffer is full or
     * the stream ends because a single <code>read()</code> call is
     * allowed to return fewer bytes than requested.
     *
     * @param in the stream, positioned after the first four bytes
     * @param byte1 first byte already read
     * @param byte2 second byte already read
     * @param byte3 third byte already read
     * @param byte4 fourth byte already read
     * @param charsetName name of the charset used to decode the bytes
     * @return the decoded start of the document
     * @throws IOException if reading fails or the charset is not
     *     supported
     */
    private static String readPrefix(InputStream in, int byte1,
      int byte2, int byte3, int byte4, String charsetName)
      throws IOException {

        byte[] data = new byte[LOOKAHEAD];
        data[0] = (byte) byte1;
        data[1] = (byte) byte2;
        data[2] = (byte) byte3;
        data[3] = (byte) byte4;
        int length = 4;
        while (length < data.length) {
            int count = in.read(data, length, data.length - length);
            if (count <= 0) {
                break; // end of stream (or a stream that makes no progress)
            }
            length += count;
        }
        return new String(data, 0, length, charsetName);

    }


    /**
     * Extracts the value of the <code>encoding</code> pseudo-attribute
     * from the decoded start of a document. Only the XML or text
     * declaration itself (everything before the first
     * <code>?&gt;</code>) is searched, so an attribute named
     * <code>encoding</code> later in the document is never mistaken
     * for it. If the declaration is not terminated within the
     * supplied text, all of the text is searched.
     *
     * @param declaration the decoded start of the document
     * @return the declared encoding name, or "UTF-8" if there is no
     *     well-formed encoding declaration
     */
    private static String findEncodingDeclaration(String declaration) {

        int end = declaration.indexOf("?>");
        String text = declaration;
        if (end >= 0) {
            text = declaration.substring(0, end);
        }

        int start = text.indexOf("encoding");
        if (start < 0) { // no encoding declaration at all
            return DEFAULT_ENCODING;
        }

        // get rid of white space before equals sign
        int position = skipWhiteSpace(text, start + 8);
        if (position >= text.length()
          || text.charAt(position) != '=') { // malformed
            return DEFAULT_ENCODING;
        }

        // get rid of white space after equals sign
        position = skipWhiteSpace(text, position + 1);
        if (position >= text.length()) { // malformed
            return DEFAULT_ENCODING;
        }
        char delimiter = text.charAt(position);
        if (delimiter != '\'' && delimiter != '"') { // malformed
            return DEFAULT_ENCODING;
        }

        // now positioned to read encoding name
        int close = text.indexOf(delimiter, position + 1);
        if (close < 0) { // unterminated value
            return DEFAULT_ENCODING;
        }
        return text.substring(position + 1, close);

    }


    /**
     * Returns the index of the first character at or after
     * <code>position</code> that is not XML white space, or the
     * length of the text if there is none.
     */
    private static int skipWhiteSpace(String text, int position) {

        while (position < text.length()) {
            char c = text.charAt(position);
            if (c != ' ' && c != '\t' && c != '\r' && c != '\n') {
                break;
            }
            position++;
        }
        return position;

    }

}
Popular Tags