KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > netbeans > modules > tasklist > providers > XMLEncodingHelper


/*
 * The contents of this file are subject to the terms of the Common Development
 * and Distribution License (the License). You may not use this file except in
 * compliance with the License.
 *
 * You can obtain a copy of the License at http://www.netbeans.org/cddl.html
 * or http://www.netbeans.org/cddl.txt.
 *
 * When distributing Covered Code, include this CDDL Header Notice in each file
 * and include the License file at http://www.netbeans.org/cddl.txt.
 * If applicable, add the following below the CDDL Header, with the fields
 * enclosed by brackets [] replaced by your own identifying information:
 * "Portions Copyrighted [year] [name of copyright owner]"
 *
 * The Original Software is NetBeans. The Initial Developer of the Original
 * Software is Sun Microsystems, Inc. Portions Copyright 1997-2006 Sun
 * Microsystems, Inc. All Rights Reserved.
 */

19 package org.netbeans.modules.tasklist.providers;
20
21 import org.openide.ErrorManager;
22
23 import java.io.*;
24
25 /**
26  * XML uses inband encoding detection - this class obtains it.
27  *
28  * @author Petr Kuzel
29  * @version 1.0
30  */

31 final class XMLEncodingHelper extends Object JavaDoc {
32
33     //
34
// taken from XML module xml.core.lib.EncodingHelper
35
//
36

37     // heuristic constant guessing max prolog length
38
private static final int EXPECTED_PROLOG_LENGTH = 1000;
39
40     /** Detect input stream encoding.
41     * The stream stays intact.
42     * @return iana encoding names or Java hisrotical ("UTF8", "ASCII", etc.) or null
43     * if the stream is not markable or enoding cannot be detected.
44     */

45     public static String JavaDoc detectEncoding(InputStream in) throws IOException {
46
47         if (! in.markSupported()) {
48             ErrorManager.getDefault().log("XMLEncodingHelper got unmarkable stream: " + in.getClass()); // NOI18N
49
return null;
50         }
51
52         try {
53             in.mark(EXPECTED_PROLOG_LENGTH);
54
55             byte[] bytes = new byte[EXPECTED_PROLOG_LENGTH];
56             for (int i = 0; i<bytes.length; i++) {
57                 try {
58                     int datum = in.read();
59                     if (datum == -1) break;
60                     bytes[i] = (byte) datum;
61                 } catch (EOFException ex) {
62                 }
63             }
64
65             String JavaDoc enc = autoDetectEncoding(bytes);
66             if (enc == null) return null;
67             
68             enc = detectDeclaredEncoding(bytes, enc);
69             if (enc == null) return null;
70             
71             return enc;
72         } finally {
73             in.reset();
74         }
75     }
76
77         
78     /**
79      * @return Java encoding family identifier or <tt>null</tt> for unrecognized
80      */

81     static String JavaDoc autoDetectEncoding(byte[] buf) throws IOException {
82         
83
84         if (buf.length >= 4) {
85             switch (buf[0]) {
86                 case 0:
87                     // byte order mark of (1234-big endian) or (2143) USC-4
88
// or '<' encoded as UCS-4 (1234, 2143, 3412) or UTF-16BE
89
if (buf[1] == (byte)0x3c && buf[2] == (byte)0x00 && buf[3] == (byte)0x3f) {
90                         return "UnicodeBigUnmarked"; // NOI18N
91
}
92                     // else it's probably UCS-4
93
break;
94
95                 case 0x3c:
96                     switch (buf[1]) {
97                         // First character is '<'; could be XML without
98
// an XML directive such as "<hello>", "<!-- ...", // NOI18N
99
// and so on.
100

101                         // 3c 00 3f 00 UTF-16 little endian
102
case 0x00:
103                             if (buf [2] == (byte)0x3f && buf [3] == (byte)0x00) {
104                                 return "UnicodeLittleUnmarked"; // NOI18N
105
}
106                             break;
107
108                         // 3c 3f 78 6d == ASCII and supersets '<?xm'
109
case '?':
110                             if (buf [2] == 'x' && buf [3] == 'm') { // NOI18N
111
return "UTF8"; // NOI18N
112
}
113                             break;
114                     }
115                     break;
116
117                 // 4c 6f a7 94 ... some EBCDIC code page
118
case 0x4c:
119                     if (buf[1] == (byte)0x6f && buf[2] == (byte)0xa7 && buf[3] == (byte)0x94) {
120                         return "Cp037"; // NOI18N
121
}
122                     break;
123
124                 // UTF-16 big-endian marked
125
case (byte)0xfe:
126                     if (buf[1] == (byte)0xff && (buf[2] != 0 || buf[3] != 0)) {
127                         return "UnicodeBig"; // NOI18N
128
}
129                     break;
130
131                 // UTF-16 little-endian marked
132
case (byte)0xff:
133                     if (buf[1] == (byte)0xfe && (buf[2] != 0 || buf[3] != 0)) {
134                         return "UnicodeLittle"; // NOI18N
135
}
136                     break;
137                     
138                 // UTF-8 byte order mark
139
case (byte)0xef:
140                     if (buf[1] == (byte)0xbb && buf[2] == (byte)0xbf) {
141                         return "UTF8"; //NOI18N
142
}
143                     break;
144                     
145             }
146         }
147
148         return null;
149     }
150
151     /**
152      * Look for encoding='' anyway stop at <tt>?></tt>
153      * @return found encoding or null if none declared
154      */

155     static String JavaDoc detectDeclaredEncoding(byte[] data, String JavaDoc baseEncoding) throws IOException {
156
157         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
158         Reader r;
159         char delimiter = '"';
160
161         r = new InputStreamReader(new ByteArrayInputStream(data), baseEncoding);
162         try {
163             for (int c = r.read(); c != -1; c = r.read()) {
164                 buf.append((char)c);
165             }
166         } catch (IOException ex) {
167             // EOF of data out of boundary
168
// dont care try to guess from given data
169
}
170         
171         String JavaDoc s = buf.toString();
172         
173         int iend = s.indexOf("?>");
174         iend = iend == -1 ? s.length() : iend;
175         
176         int iestart = s.indexOf("encoding"); // NOI18N
177
if (iestart == -1 || iestart > iend) return null;
178         
179         char[] chars = s.toCharArray();
180         
181         int i = iestart;
182         
183         for (; i<iend; i++) {
184             if (chars[i] == '=') break;
185         }
186         
187         for (; i<iend; i++) {
188             if (chars[i] == '\'' || chars[i] == '"') {
189                 delimiter = chars[i];
190                 break;
191             }
192                 
193         }
194
195         i++;
196         
197         int ivalstart = i;
198         for (; i<iend; i++) {
199             if (chars[i] == delimiter) {
200                 return new String JavaDoc(chars, ivalstart, i - ivalstart);
201             }
202         }
203         
204         return null;
205     }
206
207 }
208
Popular Tags