KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > netbeans > modules > xml > core > lib > EncodingHelper


/*
 * The contents of this file are subject to the terms of the Common Development
 * and Distribution License (the License). You may not use this file except in
 * compliance with the License.
 *
 * You can obtain a copy of the License at http://www.netbeans.org/cddl.html
 * or http://www.netbeans.org/cddl.txt.
 *
 * When distributing Covered Code, include this CDDL Header Notice in each file
 * and include the License file at http://www.netbeans.org/cddl.txt.
 * If applicable, add the following below the CDDL Header, with the fields
 * enclosed by brackets [] replaced by your own identifying information:
 * "Portions Copyrighted [year] [name of copyright owner]"
 *
 * The Original Software is NetBeans. The Initial Developer of the Original
 * Software is Sun Microsystems, Inc. Portions Copyright 1997-2006 Sun
 * Microsystems, Inc. All Rights Reserved.
 */

19 package org.netbeans.modules.xml.core.lib;
20
import java.io.*;
import java.util.Locale;
import javax.swing.text.*;
23
24 /**
25  * XML uses inband encoding detection - this class obtains it.
26  *
27  * @author Petr Kuzel
28  * @version 1.0
29  */

30 public class EncodingHelper extends Object JavaDoc {
31
32     // heuristic constant guessing max prolog length
33
private static final int EXPECTED_PROLOG_LENGTH = 1000;
34
35     /** Detect input stream encoding.
36     * The stream stays intact.
37     * @return java encoding names ("UTF8", "ASCII", etc.) or null
38     * if the stream is not markable or enoding cannot be detected.
39     */

40     public static String JavaDoc detectEncoding(InputStream in) throws IOException {
41
42         if (! in.markSupported()) {
43             if ( Util.THIS.isLoggable() ) /* then */ Util.THIS.debug("EncodingHelper got unmarkable stream: " + in.getClass()); // NOI18N
44
return null;
45         }
46
47         try {
48             in.mark(EXPECTED_PROLOG_LENGTH);
49
50             byte[] bytes = new byte[EXPECTED_PROLOG_LENGTH];
51             for (int i = 0; i<bytes.length; i++) {
52                 try {
53                     int datum = in.read();
54                     if (datum == -1) break;
55                     bytes[i] = (byte) datum;
56                 } catch (EOFException ex) {
57                 }
58             }
59
60             String JavaDoc enc = autoDetectEncoding(bytes);
61             if (enc == null) return null;
62             
63             enc = detectDeclaredEncoding(bytes, enc);
64             if (enc == null) return null;
65             
66             return Convertors.iana2java (enc);
67         } finally {
68             in.reset();
69         }
70     }
71
72         
73     /**
74      * @return Java encoding family identifier or <tt>null</tt> for unrecognized
75      */

76     static String JavaDoc autoDetectEncoding(byte[] buf) throws IOException {
77         
78
79         if (buf.length >= 4) {
80             switch (buf[0]) {
81                 case 0:
82                     // byte order mark of (1234-big endian) or (2143) USC-4
83
// or '<' encoded as UCS-4 (1234, 2143, 3412) or UTF-16BE
84
if (buf[1] == (byte)0x3c && buf[2] == (byte)0x00 && buf[3] == (byte)0x3f) {
85                         return "UnicodeBigUnmarked";
86                     }
87                     // else it's probably UCS-4
88
break;
89
90                 case 0x3c:
91                     switch (buf[1]) {
92                         // First character is '<'; could be XML without
93
// an XML directive such as "<hello>", "<!-- ...", // NOI18N
94
// and so on.
95

96                         // 3c 00 3f 00 UTF-16 little endian
97
case 0x00:
98                             if (buf [2] == (byte)0x3f && buf [3] == (byte)0x00) {
99                                 return "UnicodeLittleUnmarked";
100                             }
101                             break;
102
103                         // 3c 3f 78 6d == ASCII and supersets '<?xm'
104
case '?':
105                             if (buf [2] == 'x' && buf [3] == 'm') {
106                                 return "UTF8"; // NOI18N
107
}
108                             break;
109                     }
110                     break;
111
112                 // 4c 6f a7 94 ... some EBCDIC code page
113
case 0x4c:
114                     if (buf[1] == (byte)0x6f && buf[2] == (byte)0xa7 && buf[3] == (byte)0x94) {
115                         return "Cp037"; // NOI18N
116
}
117                     break;
118
119                 // UTF-16 big-endian marked
120
case (byte)0xfe:
121                     if (buf[1] == (byte)0xff && (buf[2] != 0 || buf[3] != 0)) {
122                         return "UnicodeBig"; // NOI18N
123
}
124                     break;
125
126                 // UTF-16 little-endian marked
127
case (byte)0xff:
128                     if (buf[1] == (byte)0xfe && (buf[2] != 0 || buf[3] != 0)) {
129                         return "UnicodeLittle"; // NOI18N
130
}
131                     break;
132                     
133                 // UTF-8 byte order mark
134
case (byte)0xef:
135                     if (buf[1] == (byte)0xbb && buf[2] == (byte)0xbf) {
136                         return "UTF8"; //NOI18N
137
}
138                     break;
139                     
140             }
141         }
142
143         return null;
144     }
145
146     /**
147      * Look for encoding='' anyway stop at <tt>?></tt>
148      * @return found encoding or null if none declared
149      */

150     static String JavaDoc detectDeclaredEncoding(byte[] data, String JavaDoc baseEncoding) throws IOException {
151
152         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
153         Reader r;
154         char delimiter = '"';
155
156         r = new InputStreamReader(new ByteArrayInputStream(data), baseEncoding);
157         try {
158             for (int c = r.read(); c != -1; c = r.read()) {
159                 buf.append((char)c);
160             }
161         } catch (IOException ex) {
162             // EOF of data out of boundary
163
// dont care try to guess from given data
164
}
165         
166         String JavaDoc s = buf.toString();
167         
168         int iend = s.indexOf("?>");
169         iend = iend == -1 ? s.length() : iend;
170         
171         int iestart = s.indexOf("encoding");
172         if (iestart == -1 || iestart > iend) return null;
173         
174         char[] chars = s.toCharArray();
175         
176         int i = iestart;
177         
178         for (; i<iend; i++) {
179             if (chars[i] == '=') break;
180         }
181         
182         for (; i<iend; i++) {
183             if (chars[i] == '\'' || chars[i] == '"') {
184                 delimiter = chars[i];
185                 break;
186             }
187                 
188         }
189
190         i++;
191         
192         int ivalstart = i;
193         for (; i<iend; i++) {
194             if (chars[i] == delimiter) {
195                 return new String JavaDoc(chars, ivalstart, i - ivalstart);
196             }
197         }
198         
199         return null;
200     }
201     
202     /**
203      * Parse MIME content type for attributes.
204      */

205     static String JavaDoc parseMIMECharSet(String JavaDoc mime) {
206         
207         final String JavaDoc CHARSET = "charset";
208         
209         if (mime != null) {
210             int i;
211
212             mime = mime.toLowerCase ();
213             i = mime.indexOf (';');
214             if (i != -1) {
215                 String JavaDoc attributes;
216
217                 attributes = mime.substring (i + 1);
218                 mime = mime.substring (0, i);
219
220                 // use "charset=..." if it's available // NOI18N
221
i = attributes.indexOf (CHARSET); // NOI18N
222
if (i != -1) {
223                     attributes = attributes.substring (i + CHARSET.length());
224                     // strip out subsequent attributes
225
if ((i = attributes.indexOf (';')) != -1)
226                         attributes = attributes.substring (0, i);
227                     // find start of value
228
if ((i = attributes.indexOf ('=')) != -1) {
229                         attributes = attributes.substring (i + 1);
230                         // strip out rfc822 comments
231
if ((i = attributes.indexOf ('(')) != -1)
232                             attributes = attributes.substring (0, i);
233                         // double quotes are optional
234
if ((i = attributes.indexOf ('"')) != -1) {
235                             attributes = attributes.substring (i + 1);
236                             attributes = attributes.substring (0,
237                                                                attributes.indexOf ('"'));
238                         }
239                         return attributes.trim();
240                         // XXX "\;", "\)" etc were mishandled above // NOI18N
241
}
242                 }
243             }
244         }
245         
246         return null;
247     }
248
249     
250     
251     /** Document itself is encoded as Unicode, but in
252     * the document prolog is an encoding attribute.
253     * @return java encoding names ("UTF8", "ASCII", etc.) or null if no guess
254     */

255     public static String JavaDoc detectEncoding(Document doc) throws IOException {
256
257         if (doc == null) return null;
258
259         try {
260
261             String JavaDoc text = doc.getText(0,
262                                       doc.getLength() > EXPECTED_PROLOG_LENGTH ?
263                                       EXPECTED_PROLOG_LENGTH : doc.getLength()
264                                      );
265             InputStream in = new ByteArrayInputStream(text.getBytes());
266             return detectEncoding(in);
267
268         } catch (BadLocationException ex) {
269             throw new RuntimeException JavaDoc(ex.toString());
270         }
271
272     }
273
274 }
275
Popular Tags