KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > outerj > daisy > xmlutil > XmlEncodingDetector


/*
 * Copyright 2004 Outerthought bvba and Schaubroeck nv
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

16 package org.outerj.daisy.xmlutil;
17
import java.io.UnsupportedEncodingException;
import java.text.MessageFormat;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
22
/**
 * Utility code to detect the encoding of XML provided as a byte array.
 * This code is based on the class com.sun.syndication.io.XmlReader
 * from the Rome project (https://rome.dev.java.net/), which is licensed
 * under the Apache V2 license (and doesn't include a NOTICE file).
 *
 * <p>Three pieces of evidence are combined: a leading byte order mark (BOM),
 * the byte pattern of the characters {@code <?xm} immediately after the BOM,
 * and the {@code encoding} pseudo-attribute of the XML declaration. The class
 * holds no mutable state.
 */
public class XmlEncodingDetector {
    private static final String UTF_8 = "UTF-8";
    private static final String UTF_16BE = "UTF-16BE";
    private static final String UTF_16LE = "UTF-16LE";
    private static final String UTF_16 = "UTF-16";

    /** Maximum number of bytes decoded when searching for the XML declaration. */
    private static final int MAX_PROLOG_BYTES = 1024;

    // Byte order marks, written as unsigned byte values.
    private static final int[] BOM_UTF_16BE = {0xFE, 0xFF};
    private static final int[] BOM_UTF_16LE = {0xFF, 0xFE};
    private static final int[] BOM_UTF_8 = {0xEF, 0xBB, 0xBF};

    // The characters "<?" (UTF-16) or "<?xm" (single-byte ASCII-compatible) as raw bytes.
    private static final int[] PROLOG_UTF_16BE = {0x00, 0x3C, 0x00, 0x3F};
    private static final int[] PROLOG_UTF_16LE = {0x3C, 0x00, 0x3F, 0x00};
    private static final int[] PROLOG_UTF_8 = {0x3C, 0x3F, 0x78, 0x6D};

    // Matches the encoding pseudo-attribute inside the XML declaration only ([^>] keeps the
    // match from running past the end of the declaration). The value may be double-quoted
    // (group 1) or single-quoted (group 2), and the declaration may span several lines.
    private static final Pattern ENCODING_PATTERN = Pattern.compile(
            "^<\\?xml\\s[^>]*?\\bencoding\\s*=\\s*(?:\"([^\"]+)\"|'([^']+)')");

    // Kept as pattern strings rather than shared MessageFormat instances, because
    // MessageFormat objects are not safe for concurrent use.
    private static final String RAW_EX_1 =
            "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

    private static final String RAW_EX_2 =
            "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

    /**
     * Detects the character encoding of an XML document.
     *
     * @param data the raw bytes of the document; may be empty or shorter than a BOM
     * @return the detected encoding name; "UTF-8" when there is no BOM and no usable
     *         XML declaration
     * @throws RuntimeException if the BOM contradicts the byte pattern of the prolog or
     *         the encoding named in the XML declaration
     * @throws NullPointerException if {@code data} is null
     */
    public static String detectEncoding(byte[] data) {
        String bomEnc = getBOMEncoding(data);
        // The declaration, if any, starts right after the BOM.
        int prologStart = bomLength(bomEnc);
        String xmlGuessEnc = getXMLGuessEncoding(data, prologStart);
        String xmlEnc = getXMLPrologEncoding(data, prologStart, xmlGuessEnc);
        return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
    }

    // Returns true when the bytes starting at offset equal the given unsigned values.
    // Bytes are masked with 0xFF because Java bytes are signed; input that is too short
    // simply does not match.
    private static boolean hasPrefix(byte[] bytes, int offset, int[] prefix) {
        if (bytes.length - offset < prefix.length) {
            return false;
        }
        for (int i = 0; i < prefix.length; i++) {
            if ((bytes[offset + i] & 0xFF) != prefix[i]) {
                return false;
            }
        }
        return true;
    }

    // Returns the encoding indicated by a leading BOM, or null if there is none.
    private static String getBOMEncoding(byte[] bytes) {
        if (hasPrefix(bytes, 0, BOM_UTF_16BE)) {
            return UTF_16BE;
        }
        if (hasPrefix(bytes, 0, BOM_UTF_16LE)) {
            return UTF_16LE;
        }
        if (hasPrefix(bytes, 0, BOM_UTF_8)) {
            return UTF_8;
        }
        return null;
    }

    // Returns the number of bytes occupied by the BOM of the given encoding (0 for null).
    private static int bomLength(String bomEnc) {
        if (bomEnc == null) {
            return 0;
        }
        return UTF_8.equals(bomEnc) ? BOM_UTF_8.length : BOM_UTF_16BE.length;
    }

    // Returns the best guess for the encoding from the byte pattern of "<?" / "<?xm"
    // at the given offset, or null if the bytes match none of the known patterns.
    private static String getXMLGuessEncoding(byte[] bytes, int offset) {
        if (hasPrefix(bytes, offset, PROLOG_UTF_16BE)) {
            return UTF_16BE;
        }
        if (hasPrefix(bytes, offset, PROLOG_UTF_16LE)) {
            return UTF_16LE;
        }
        if (hasPrefix(bytes, offset, PROLOG_UTF_8)) {
            return UTF_8;
        }
        return null;
    }

    // Returns the upper-cased encoding declared in <?xml ... encoding="..."?>, or null if
    // there is no guess to decode with or no encoding is declared. At most
    // MAX_PROLOG_BYTES bytes after the offset are examined.
    private static String getXMLPrologEncoding(byte[] data, int offset, String guessedEnc) {
        if (guessedEnc == null) {
            return null;
        }
        int length = Math.min(data.length - offset, MAX_PROLOG_BYTES);
        if (!UTF_8.equals(guessedEnc)) {
            // Decode whole UTF-16 code units only.
            length -= length % 2;
        }
        String prolog;
        try {
            prolog = new String(data, offset, length, guessedEnc);
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
        Matcher m = ENCODING_PATTERN.matcher(prolog);
        if (!m.find()) {
            return null;
        }
        String declared = (m.group(1) != null) ? m.group(1) : m.group(2);
        // Fixed locale: the default locale may map ASCII letters unexpectedly (e.g. Turkish 'i').
        return declared.toUpperCase(Locale.ENGLISH);
    }

    // Reconciles the three pieces of evidence into a single encoding name.
    // Throws RuntimeException when a BOM is present and contradicts the other evidence.
    private static String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc) {
        Object[] details = new Object[]{bomEnc, xmlGuessEnc, xmlEnc};

        if (bomEnc == null) {
            if (xmlGuessEnc == null || xmlEnc == null) {
                return UTF_8;
            }
            // A declared "UTF-16" does not state the byte order; the byte pattern does.
            if (xmlEnc.equals(UTF_16)
                    && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
                return xmlGuessEnc;
            }
            return xmlEnc;
        }

        if (bomEnc.equals(UTF_8)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
                throw new RuntimeException(MessageFormat.format(RAW_EX_1, details));
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
                throw new RuntimeException(MessageFormat.format(RAW_EX_1, details));
            }
            return UTF_8;
        }

        if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
                throw new RuntimeException(MessageFormat.format(RAW_EX_1, details));
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
                throw new RuntimeException(MessageFormat.format(RAW_EX_1, details));
            }
            return bomEnc;
        }

        throw new RuntimeException(MessageFormat.format(RAW_EX_2, details));
    }
}
149
Popular Tags