KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > netbeans > modules > diff > XMLEncodingHelper


/*
 * The contents of this file are subject to the terms of the Common Development
 * and Distribution License (the License). You may not use this file except in
 * compliance with the License.
 *
 * You can obtain a copy of the License at http://www.netbeans.org/cddl.html
 * or http://www.netbeans.org/cddl.txt.
 *
 * When distributing Covered Code, include this CDDL Header Notice in each file
 * and include the License file at http://www.netbeans.org/cddl.txt.
 * If applicable, add the following below the CDDL Header, with the fields
 * enclosed by brackets [] replaced by your own identifying information:
 * "Portions Copyrighted [year] [name of copyright owner]"
 *
 * The Original Software is NetBeans. The Initial Developer of the Original
 * Software is Sun Microsystems, Inc. Portions Copyright 1997-2006 Sun
 * Microsystems, Inc. All Rights Reserved.
 */

19 package org.netbeans.modules.diff;
20
21 import org.openide.ErrorManager;
22
23 import java.io.*;
24
25 /**
26  * XML uses inband encoding detection - this class obtains it.
27  *
28  * <p>Copy&amp;pasted from <tt>taslist/api/.../XMLEncodingHelper</tt>
29  *
30  * @author Petr Kuzel
31  * @version 1.0
32  */

33 final class XMLEncodingHelper extends Object JavaDoc {
34
35     //
36
// taken from XML module xml.core.lib.EncodingHelper
37
//
38

39     // heuristic constant guessing max prolog length
40
private static final int EXPECTED_PROLOG_LENGTH = 1000;
41
42     /** Detect input stream encoding.
43     * The stream stays intact.
44     * @return iana encoding names or Java hisrotical ("UTF8", "ASCII", etc.) or null
45     * if the stream is not markable or enoding cannot be detected.
46     */

47     public static String JavaDoc detectEncoding(InputStream in) throws IOException {
48
49         if (! in.markSupported()) {
50             ErrorManager.getDefault().log("XMLEncodingHelper got unmarkable stream: " + in.getClass()); // NOI18N
51
return null;
52         }
53
54         try {
55             in.mark(EXPECTED_PROLOG_LENGTH);
56
57             byte[] bytes = new byte[EXPECTED_PROLOG_LENGTH];
58             for (int i = 0; i<bytes.length; i++) {
59                 try {
60                     int datum = in.read();
61                     if (datum == -1) break;
62                     bytes[i] = (byte) datum;
63                 } catch (EOFException ex) {
64                 }
65             }
66
67             String JavaDoc enc = autoDetectEncoding(bytes);
68             if (enc == null) return null;
69             
70             enc = detectDeclaredEncoding(bytes, enc);
71             if (enc == null) return null;
72             
73             return enc;
74         } finally {
75             in.reset();
76         }
77     }
78
79         
80     /**
81      * @return Java encoding family identifier or <tt>null</tt> for unrecognized
82      */

83     static String JavaDoc autoDetectEncoding(byte[] buf) throws IOException {
84         
85
86         if (buf.length >= 4) {
87             switch (buf[0]) {
88                 case 0:
89                     // byte order mark of (1234-big endian) or (2143) USC-4
90
// or '<' encoded as UCS-4 (1234, 2143, 3412) or UTF-16BE
91
if (buf[1] == (byte)0x3c && buf[2] == (byte)0x00 && buf[3] == (byte)0x3f) {
92                         return "UnicodeBigUnmarked"; // NOI18N
93
}
94                     // else it's probably UCS-4
95
break;
96
97                 case 0x3c:
98                     switch (buf[1]) {
99                         // First character is '<'; could be XML without
100
// an XML directive such as "<hello>", "<!-- ...", // NOI18N
101
// and so on.
102

103                         // 3c 00 3f 00 UTF-16 little endian
104
case 0x00:
105                             if (buf [2] == (byte)0x3f && buf [3] == (byte)0x00) {
106                                 return "UnicodeLittleUnmarked"; // NOI18N
107
}
108                             break;
109
110                         // 3c 3f 78 6d == ASCII and supersets '<?xm'
111
case '?':
112                             if (buf [2] == 'x' && buf [3] == 'm') { // NOI18N
113
return "UTF8"; // NOI18N
114
}
115                             break;
116                     }
117                     break;
118
119                 // 4c 6f a7 94 ... some EBCDIC code page
120
case 0x4c:
121                     if (buf[1] == (byte)0x6f && buf[2] == (byte)0xa7 && buf[3] == (byte)0x94) {
122                         return "Cp037"; // NOI18N
123
}
124                     break;
125
126                 // UTF-16 big-endian marked
127
case (byte)0xfe:
128                     if (buf[1] == (byte)0xff && (buf[2] != 0 || buf[3] != 0)) {
129                         return "UnicodeBig"; // NOI18N
130
}
131                     break;
132
133                 // UTF-16 little-endian marked
134
case (byte)0xff:
135                     if (buf[1] == (byte)0xfe && (buf[2] != 0 || buf[3] != 0)) {
136                         return "UnicodeLittle"; // NOI18N
137
}
138                     break;
139                     
140                 // UTF-8 byte order mark
141
case (byte)0xef:
142                     if (buf[1] == (byte)0xbb && buf[2] == (byte)0xbf) {
143                         return "UTF8"; //NOI18N
144
}
145                     break;
146                     
147             }
148         }
149
150         return null;
151     }
152
153     /**
154      * Look for encoding='' anyway stop at <tt>?></tt>
155      * @return found encoding or null if none declared
156      */

157     static String JavaDoc detectDeclaredEncoding(byte[] data, String JavaDoc baseEncoding) throws IOException {
158
159         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
160         Reader r;
161         char delimiter = '"';
162
163         r = new InputStreamReader(new ByteArrayInputStream(data), baseEncoding);
164         try {
165             for (int c = r.read(); c != -1; c = r.read()) {
166                 buf.append((char)c);
167             }
168         } catch (IOException ex) {
169             // EOF of data out of boundary
170
// dont care try to guess from given data
171
}
172         
173         String JavaDoc s = buf.toString();
174         
175         int iend = s.indexOf("?>");
176         iend = iend == -1 ? s.length() : iend;
177         
178         int iestart = s.indexOf("encoding"); // NOI18N
179
if (iestart == -1 || iestart > iend) return null;
180         
181         char[] chars = s.toCharArray();
182         
183         int i = iestart;
184         
185         for (; i<iend; i++) {
186             if (chars[i] == '=') break;
187         }
188         
189         for (; i<iend; i++) {
190             if (chars[i] == '\'' || chars[i] == '"') {
191                 delimiter = chars[i];
192                 break;
193             }
194                 
195         }
196
197         i++;
198         
199         int ivalstart = i;
200         for (; i<iend; i++) {
201             if (chars[i] == delimiter) {
202                 return new String JavaDoc(chars, ivalstart, i - ivalstart);
203             }
204         }
205         
206         return null;
207     }
208
209 }
210
Popular Tags