import java.io.UnsupportedEncodingException;
import java.text.MessageFormat;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Detects the character encoding of an XML document held in a byte array.
 *
 * <p>Three pieces of evidence are combined:
 * <ol>
 *   <li>a byte order mark (BOM) at the very start of the data,</li>
 *   <li>the byte pattern of the leading {@code <?xm} / {@code <?} characters,</li>
 *   <li>the {@code encoding} pseudo-attribute of the XML declaration.</li>
 * </ol>
 * When none of them yields an answer the result is {@code "UTF-8"}.
 */
public class XmlEncodingDetector {
    private static final String UTF_8 = "UTF-8";
    private static final String UTF_16BE = "UTF-16BE";
    private static final String UTF_16LE = "UTF-16LE";
    private static final String UTF_16 = "UTF-16";

    /** Upper bound on the number of leading bytes inspected for the XML declaration. */
    private static final int MAX_PROLOG_BYTES = 1024;

    // Byte signatures are kept as unsigned int values; Java bytes are signed, so
    // they must be masked with 0xFF before being compared against these.
    private static final int[] BOM_UTF_16BE = {0xFE, 0xFF};
    private static final int[] BOM_UTF_16LE = {0xFF, 0xFE};
    private static final int[] BOM_UTF_8 = {0xEF, 0xBB, 0xBF};

    private static final int[] PROLOG_UTF_16BE = {0x00, 0x3C, 0x00, 0x3F}; // "<?" big endian
    private static final int[] PROLOG_UTF_16LE = {0x3C, 0x00, 0x3F, 0x00}; // "<?" little endian
    private static final int[] PROLOG_UTF_8 = {0x3C, 0x3F, 0x78, 0x6D};    // "<?xm"

    /**
     * Matches the encoding pseudo-attribute of an XML declaration located at the
     * start of the input. {@code [^>]*?} keeps the search inside the declaration,
     * group 1 is the opening quote (single or double) and group 2 the encoding name.
     */
    private static final Pattern ENCODING_PATTERN = Pattern.compile(
            "^<\\?xml\\s[^>]*?\\bencoding\\s*=\\s*([\"'])([A-Za-z][A-Za-z0-9._-]*)\\1");

    private static final String RAW_EX_1 =
            "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

    private static final String RAW_EX_2 =
            "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

    /**
     * Determines the encoding of the given XML data.
     *
     * @param data the raw bytes of the XML document; may be empty
     * @return the detected encoding name; a name taken from the XML declaration is
     *         returned in upper case; {@code "UTF-8"} when nothing can be detected
     * @throws NullPointerException if {@code data} is {@code null}
     * @throws RuntimeException if the BOM contradicts the other evidence
     */
    public static String detectEncoding(byte[] data) {
        if (data == null) {
            throw new NullPointerException("data must not be null");
        }
        String bomEnc = getBOMEncoding(data);
        String xmlGuessEnc = getXMLGuessEncoding(data);
        String xmlEnc = getXMLPrologEncoding(data, xmlGuessEnc);
        return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
    }

    /** Returns the encoding announced by a leading BOM, or {@code null} if there is none. */
    private static String getBOMEncoding(byte[] bytes) {
        if (startsWith(bytes, BOM_UTF_16BE)) {
            return UTF_16BE;
        }
        if (startsWith(bytes, BOM_UTF_16LE)) {
            return UTF_16LE;
        }
        if (startsWith(bytes, BOM_UTF_8)) {
            return UTF_8;
        }
        return null;
    }

    /**
     * Guesses the encoding family from the byte layout of the first characters.
     * The check is made at offset 0, so data that starts with a BOM yields {@code null}.
     */
    private static String getXMLGuessEncoding(byte[] bytes) {
        if (startsWith(bytes, PROLOG_UTF_16BE)) {
            return UTF_16BE;
        }
        if (startsWith(bytes, PROLOG_UTF_16LE)) {
            return UTF_16LE;
        }
        if (startsWith(bytes, PROLOG_UTF_8)) {
            return UTF_8;
        }
        return null;
    }

    /**
     * Reads the encoding declared in the XML declaration.
     *
     * @param data the raw document bytes
     * @param guessedEnc the encoding used to decode the head of the document, or
     *        {@code null} when no guess is available
     * @return the declared encoding in upper case, or {@code null} when
     *         {@code guessedEnc} is {@code null} or no declaration is found
     */
    private static String getXMLPrologEncoding(byte[] data, String guessedEnc) {
        if (guessedEnc == null) {
            return null;
        }
        int length = Math.min(data.length, MAX_PROLOG_BYTES);
        if (!UTF_8.equals(guessedEnc)) {
            length &= ~1; // decode whole 16-bit code units only
        }
        String head;
        try {
            head = new String(data, 0, length, guessedEnc);
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
        Matcher m = ENCODING_PATTERN.matcher(head);
        // Locale.ENGLISH keeps the case mapping independent of the default locale.
        return m.find() ? m.group(2).toUpperCase(Locale.ENGLISH) : null;
    }

    /**
     * Combines the three pieces of evidence into the final encoding.
     *
     * @throws RuntimeException if a BOM is present and contradicts the guessed or
     *         declared encoding, or if the BOM encoding is not a known one
     */
    private static String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc) {
        if (bomEnc == null) {
            if (xmlGuessEnc == null) {
                return UTF_8;
            }
            if (xmlEnc == null) {
                // No declared encoding: trust the byte pattern, so UTF-16 data
                // without a BOM is not misreported as UTF-8.
                return xmlGuessEnc;
            }
            if (xmlEnc.equals(UTF_16)
                    && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
                // "UTF-16" alone does not state the byte order; the byte pattern does.
                return xmlGuessEnc;
            }
            return xmlEnc;
        }

        if (bomEnc.equals(UTF_8)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
                throw new RuntimeException(describe(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc));
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
                throw new RuntimeException(describe(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc));
            }
            return UTF_8;
        }

        if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
                throw new RuntimeException(describe(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc));
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
                throw new RuntimeException(describe(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc));
            }
            return bomEnc;
        }

        throw new RuntimeException(describe(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc));
    }

    /** Tells whether {@code data} begins with the given unsigned byte values. */
    private static boolean startsWith(byte[] data, int[] prefix) {
        if (data.length < prefix.length) {
            return false;
        }
        for (int i = 0; i < prefix.length; i++) {
            if ((data[i] & 0xFF) != prefix[i]) {
                return false;
            }
        }
        return true;
    }

    /** Formats an error message; a fresh formatter is used per call, so this is safe to share. */
    private static String describe(String pattern, String bomEnc, String xmlGuessEnc, String xmlEnc) {
        return MessageFormat.format(pattern, new Object[] {bomEnc, xmlGuessEnc, xmlEnc});
    }
}