KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > netbeans > modules > xml > schema > core > EncodingHelper


/*
 * The contents of this file are subject to the terms of the Common Development
 * and Distribution License (the License). You may not use this file except in
 * compliance with the License.
 *
 * You can obtain a copy of the License at http://www.netbeans.org/cddl.html
 * or http://www.netbeans.org/cddl.txt.
 *
 * When distributing Covered Code, include this CDDL Header Notice in each file
 * and include the License file at http://www.netbeans.org/cddl.txt.
 * If applicable, add the following below the CDDL Header, with the fields
 * enclosed by brackets [] replaced by your own identifying information:
 * "Portions Copyrighted [year] [name of copyright owner]"
 *
 * The Original Software is NetBeans. The Initial Developer of the Original
 * Software is Sun Microsystems, Inc. Portions Copyright 1997-2007 Sun
 * Microsystems, Inc. All Rights Reserved.
 */

package org.netbeans.modules.xml.schema.core;

import java.io.ByteArrayInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Locale;
import javax.swing.text.BadLocationException;
import javax.swing.text.Document;

30 /**
31  * XML uses inband encoding detection - this class obtains it.
32  *
33  * @author Petr Kuzel
34  * @version 1.0
35  */

36 public class EncodingHelper extends Object JavaDoc {
37
38     // heuristic constant guessing max prolog length
39
private static final int EXPECTED_PROLOG_LENGTH = 1000;
40
41     /** Detect input stream encoding.
42     * The stream stays intact.
43     * @return java encoding names ("UTF8", "ASCII", etc.) or null
44     * if the stream is not markable or enoding cannot be detected.
45     */

46     public static String JavaDoc detectEncoding(InputStream JavaDoc in) throws IOException JavaDoc {
47
48         if (! in.markSupported()) {
49             return null;
50         }
51
52         try {
53             in.mark(EXPECTED_PROLOG_LENGTH);
54
55             byte[] bytes = new byte[EXPECTED_PROLOG_LENGTH];
56             for (int i = 0; i<bytes.length; i++) {
57                 try {
58                     int datum = in.read();
59                     if (datum == -1) break;
60                     bytes[i] = (byte) datum;
61                 } catch (EOFException JavaDoc ex) {
62                 }
63             }
64
65             String JavaDoc enc = autoDetectEncoding(bytes);
66             if (enc == null) return null;
67             
68             enc = detectDeclaredEncoding(bytes, enc);
69             if (enc == null) return null;
70             
71             return Convertors.iana2java (enc);
72         } finally {
73             in.reset();
74         }
75     }
76
77         
78     /**
79      * @return Java encoding family identifier or <tt>null</tt> for unrecognized
80      */

81     static String JavaDoc autoDetectEncoding(byte[] buf) throws IOException JavaDoc {
82         
83
84         if (buf.length >= 4) {
85             switch (buf[0]) {
86                 case 0:
87                     // byte order mark of (1234-big endian) or (2143) USC-4
88
// or '<' encoded as UCS-4 (1234, 2143, 3412) or UTF-16BE
89
if (buf[1] == (byte)0x3c && buf[2] == (byte)0x00 && buf[3] == (byte)0x3f) {
90                         return "UnicodeBigUnmarked";
91                     }
92                     // else it's probably UCS-4
93
break;
94
95                 case 0x3c:
96                     switch (buf[1]) {
97                         // First character is '<'; could be XML without
98
// an XML directive such as "<hello>", "<!-- ...", // NOI18N
99
// and so on.
100

101                         // 3c 00 3f 00 UTF-16 little endian
102
case 0x00:
103                             if (buf [2] == (byte)0x3f && buf [3] == (byte)0x00) {
104                                 return "UnicodeLittleUnmarked";
105                             }
106                             break;
107
108                         // 3c 3f 78 6d == ASCII and supersets '<?xm'
109
case '?':
110                             if (buf [2] == 'x' && buf [3] == 'm') {
111                                 return "UTF8"; // NOI18N
112
}
113                             break;
114                     }
115                     break;
116
117                 // 4c 6f a7 94 ... some EBCDIC code page
118
case 0x4c:
119                     if (buf[1] == (byte)0x6f && buf[2] == (byte)0xa7 && buf[3] == (byte)0x94) {
120                         return "Cp037"; // NOI18N
121
}
122                     break;
123
124                 // UTF-16 big-endian marked
125
case (byte)0xfe:
126                     if (buf[1] == (byte)0xff && (buf[2] != 0 || buf[3] != 0)) {
127                         return "UnicodeBig"; // NOI18N
128
}
129                     break;
130
131                 // UTF-16 little-endian marked
132
case (byte)0xff:
133                     if (buf[1] == (byte)0xfe && (buf[2] != 0 || buf[3] != 0)) {
134                         return "UnicodeLittle"; // NOI18N
135
}
136                     break;
137                     
138                 // UTF-8 byte order mark
139
case (byte)0xef:
140                     if (buf[1] == (byte)0xbb && buf[2] == (byte)0xbf) {
141                         return "UTF8"; //NOI18N
142
}
143                     break;
144                     
145             }
146         }
147
148         return null;
149     }
150
151     /**
152      * Look for encoding='' anyway stop at <tt>?></tt>
153      * @return found encoding or null if none declared
154      */

155     static String JavaDoc detectDeclaredEncoding(byte[] data, String JavaDoc baseEncoding) throws IOException JavaDoc {
156
157         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
158         Reader JavaDoc r;
159         char delimiter = '"';
160
161         r = new InputStreamReader JavaDoc(new ByteArrayInputStream JavaDoc(data), baseEncoding);
162         try {
163             for (int c = r.read(); c != -1; c = r.read()) {
164                 buf.append((char)c);
165             }
166         } catch (IOException JavaDoc ex) {
167             // EOF of data out of boundary
168
// dont care try to guess from given data
169
}
170         
171         String JavaDoc s = buf.toString();
172         
173         int iend = s.indexOf("?>");
174         iend = iend == -1 ? s.length() : iend;
175         
176         int iestart = s.indexOf("encoding");
177         if (iestart == -1 || iestart > iend) return null;
178         
179         char[] chars = s.toCharArray();
180         
181         int i = iestart;
182         
183         for (; i<iend; i++) {
184             if (chars[i] == '=') break;
185         }
186         
187         for (; i<iend; i++) {
188             if (chars[i] == '\'' || chars[i] == '"') {
189                 delimiter = chars[i];
190                 break;
191             }
192                 
193         }
194
195         i++;
196         
197         int ivalstart = i;
198         for (; i<iend; i++) {
199             if (chars[i] == delimiter) {
200                 return new String JavaDoc(chars, ivalstart, i - ivalstart);
201             }
202         }
203         
204         return null;
205     }
206     
207     /**
208      * Parse MIME content type for attributes.
209      */

210     static String JavaDoc parseMIMECharSet(String JavaDoc mime) {
211         
212         final String JavaDoc CHARSET = "charset";
213         
214         if (mime != null) {
215             int i;
216
217             mime = mime.toLowerCase ();
218             i = mime.indexOf (';');
219             if (i != -1) {
220                 String JavaDoc attributes;
221
222                 attributes = mime.substring (i + 1);
223 // mime = mime.substring (0, i);
224

225                 // use "charset=..." if it's available // NOI18N
226
i = attributes.indexOf (CHARSET); // NOI18N
227
if (i != -1) {
228                     attributes = attributes.substring (i + CHARSET.length());
229                     // strip out subsequent attributes
230
if ((i = attributes.indexOf (';')) != -1)
231                         attributes = attributes.substring (0, i);
232                     // find start of value
233
if ((i = attributes.indexOf ('=')) != -1) {
234                         attributes = attributes.substring (i + 1);
235                         // strip out rfc822 comments
236
if ((i = attributes.indexOf ('(')) != -1)
237                             attributes = attributes.substring (0, i);
238                         // double quotes are optional
239
if ((i = attributes.indexOf ('"')) != -1) {
240                             attributes = attributes.substring (i + 1);
241                             attributes = attributes.substring (0,
242                                                                attributes.indexOf ('"'));
243                         }
244                         return attributes.trim();
245                         // XXX "\;", "\)" etc were mishandled above // NOI18N
246
}
247                 }
248             }
249         }
250         
251         return null;
252     }
253
254     
255     
256     /** Document itself is encoded as Unicode, but in
257     * the document prolog is an encoding attribute.
258     * @return java encoding names ("UTF8", "ASCII", etc.) or null if no guess
259     */

260     public static String JavaDoc detectEncoding(Document JavaDoc doc) throws IOException JavaDoc {
261
262         if (doc == null) return null;
263
264         try {
265
266             String JavaDoc text = doc.getText(0,
267                                       doc.getLength() > EXPECTED_PROLOG_LENGTH ?
268                                       EXPECTED_PROLOG_LENGTH : doc.getLength()
269                                      );
270             InputStream JavaDoc in = new ByteArrayInputStream JavaDoc(text.getBytes());
271             return detectEncoding(in);
272
273         } catch (BadLocationException JavaDoc ex) {
274             throw new RuntimeException JavaDoc(ex.toString());
275         }
276
277     }
278 }
279
Popular Tags