import java.io.IOException;
import java.io.InputStream;

/**
 * Guesses the character encoding of a byte stream by looking at its first
 * bytes: a byte order mark, the byte pattern that the characters
 * <code>&lt;?</code> / <code>&lt;?xm</code> produce in various encodings,
 * or the <code>encoding</code> pseudo-attribute of an XML or text
 * declaration. When nothing can be determined the answer is "UTF-8".
 */
class EncodingHeuristics {

    /** Read-ahead limit passed to <code>mark</code>; also the most bytes ever examined. */
    private static final int LOOKAHEAD_LIMIT = 1024;

    /** Encoding name reported when detection fails. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    // Static utility class; never instantiated.
    private EncodingHeuristics() {}


    /**
     * Determines the encoding name for the supplied stream.
     *
     * <p>Possible results are "UnicodeBig", "UnicodeLittle", "UTF-8",
     * "UTF32BE", "UTF32LE", "UnicodeBigUnmarked", "UnicodeLittleUnmarked",
     * or whatever name is written in the stream's encoding declaration.</p>
     *
     * <p>Stream position on return: if a byte order mark was recognized
     * the mark has been consumed and the stream is positioned just after
     * it; in every other case the stream has been reset to where it was
     * when this method was called. The method therefore relies on
     * <code>mark</code>/<code>reset</code>; callers are expected to pass
     * a stream that supports them.</p>
     *
     * @param in the stream to inspect
     * @return the detected encoding name, or "UTF-8" if none was detected
     * @throws IOException if the stream cannot be reset
     */
    public static String readEncodingFromStream(InputStream in)
      throws IOException {

        in.mark(LOOKAHEAD_LIMIT);

        try {
            int byte1 = in.read();
            int byte2 = in.read();
            if (byte1 == 0xFE && byte2 == 0xFF) {
                // UTF-16 big-endian BOM; leave it consumed.
                return "UnicodeBig";
            }
            else if (byte1 == 0xFF && byte2 == 0xFE) {
                // FF FE alone is the UTF-16 little-endian BOM, but
                // FF FE 00 00 is the UTF-32 little-endian BOM. Peek at
                // the next two bytes when we are able to un-read them.
                if (in.markSupported()) {
                    int next1 = in.read();
                    int next2 = in.read();
                    if (next1 == 0x00 && next2 == 0x00) {
                        return "UTF32LE";
                    }
                    // Not UTF-32: rewind, then skip only the 2-byte BOM.
                    in.reset();
                    in.read();
                    in.read();
                }
                return "UnicodeLittle";
            }

            int byte3 = in.read();
            if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF) {
                // UTF-8 BOM; leave it consumed.
                return "UTF-8";
            }

            int byte4 = in.read();
            if (byte1 == 0x00 && byte2 == 0x00
              && byte3 == 0xFE && byte4 == 0xFF) {
                // UTF-32 big-endian BOM; leave it consumed.
                return "UTF32BE";
            }
            else if (byte1 == 0x00 && byte2 == 0x00
              && byte3 == 0xFF && byte4 == 0xFE) {
                // NOTE(review): XML 1.0 Appendix F lists 00 00 FF FE as
                // the unusual 2143 octet order rather than little-endian.
                // The historical answer is kept so existing callers see
                // no change; confirm whether it should be rejected.
                return "UTF32LE";
            }

            // No BOM. Look for '<' or "<?" as encoded without one.
            if (byte1 == 0x00 && byte2 == 0x00
              && byte3 == 0x00 && byte4 == '<') {
                in.reset();
                return "UTF32BE";
            }
            else if (byte1 == '<' && byte2 == 0x00
              && byte3 == 0x00 && byte4 == 0x00) {
                in.reset();
                return "UTF32LE";
            }
            else if (byte1 == 0x00 && byte2 == '<'
              && byte3 == 0x00 && byte4 == '?') {
                in.reset();
                return "UnicodeBigUnmarked";
            }
            else if (byte1 == '<' && byte2 == 0x00
              && byte3 == '?' && byte4 == 0x00) {
                in.reset();
                return "UnicodeLittleUnmarked";
            }
            else if (byte1 == '<' && byte2 == '?'
              && byte3 == 'x' && byte4 == 'm') {
                // "<?xm" in an ASCII-compatible encoding: decode the
                // head of the stream as Latin-1 and read the declaration.
                String declaration = readDeclaration(
                  in, byte1, byte2, byte3, byte4, "8859_1");
                String encoding = findEncodingDeclaration(declaration);
                in.reset();
                return encoding;
            }
            else if (byte1 == 0x4C && byte2 == 0x6F
              && byte3 == 0xA7 && byte4 == 0x94) {
                // Same four characters as above, decoded as Cp037.
                String declaration = readDeclaration(
                  in, byte1, byte2, byte3, byte4, "Cp037");
                String encoding = findEncodingDeclaration(declaration);
                in.reset();
                return encoding;
            }

        }
        catch (IOException ex) {
            // Deliberate best effort: a read failure, an unsupported
            // charset, or a malformed declaration all fall back to the
            // default instead of failing the caller.
            in.reset();
            return DEFAULT_ENCODING;
        }

        in.reset();
        return DEFAULT_ENCODING;

    }


    /**
     * Reads the start of the stream into a string. The four bytes that
     * were already consumed are put back at the front, then the stream is
     * read until {@link #LOOKAHEAD_LIMIT} bytes in total have been
     * gathered or the stream ends. A single <code>read</code> call may
     * return fewer bytes than requested, hence the loop.
     *
     * @param in the stream, positioned after its fourth byte
     * @param byte1 first byte already read
     * @param byte2 second byte already read
     * @param byte3 third byte already read
     * @param byte4 fourth byte already read
     * @param charsetName charset used to decode the bytes
     * @return the decoded text, at most LOOKAHEAD_LIMIT bytes' worth
     * @throws IOException if reading fails or the charset is unsupported
     */
    private static String readDeclaration(InputStream in, int byte1,
      int byte2, int byte3, int byte4, String charsetName)
      throws IOException {

        byte[] data = new byte[LOOKAHEAD_LIMIT];
        data[0] = (byte) byte1;
        data[1] = (byte) byte2;
        data[2] = (byte) byte3;
        data[3] = (byte) byte4;
        int length = 4;
        while (length < data.length) {
            int count = in.read(data, length, data.length - length);
            if (count <= 0) {
                break; // end of stream (or a stream that makes no progress)
            }
            length += count;
        }
        return new String(data, 0, length, charsetName);

    }


    /**
     * Extracts the value of the <code>encoding</code> pseudo-attribute
     * from the text of a declaration. Only the text before the first
     * <code>?&gt;</code> is searched, so attributes named "encoding" in
     * the document body are not mistaken for the declaration. If no
     * <code>?&gt;</code> is present the whole text is searched.
     *
     * @param declaration decoded text from the start of the stream
     * @return the quoted encoding name; "UTF-8" if there is no encoding
     *     pseudo-attribute or its value is not a complete quoted string
     * @throws IOException if "encoding" is not followed by '='
     */
    private static String findEncodingDeclaration(String declaration)
      throws IOException {

        int end = declaration.indexOf("?>");
        String header = declaration;
        if (end >= 0) {
            header = declaration.substring(0, end);
        }

        int keyword = header.indexOf("encoding");
        if (keyword < 0) {
            return DEFAULT_ENCODING;
        }

        int position = skipWhitespace(header, keyword + 8);
        if (position >= header.length() || header.charAt(position) != '=') {
            // malformed
            throw new IOException("Couldn't determine encoding");
        }

        position = skipWhitespace(header, position + 1);
        if (position >= header.length()) {
            return DEFAULT_ENCODING;
        }
        char delimiter = header.charAt(position);
        if (delimiter != '\'' && delimiter != '"') {
            // malformed
            return DEFAULT_ENCODING;
        }

        int close = header.indexOf(delimiter, position + 1);
        if (close < 0) {
            // value was cut off before its closing quote
            return DEFAULT_ENCODING;
        }
        return header.substring(position + 1, close);

    }


    /**
     * Returns the index of the first character at or after
     * <code>position</code> that is not a space, tab, carriage return or
     * line feed, or <code>text.length()</code> if there is none.
     */
    private static int skipWhitespace(String text, int position) {

        while (position < text.length()) {
            char c = text.charAt(position);
            if (c != ' ' && c != '\t' && c != '\r' && c != '\n') {
                break;
            }
            position++;
        }
        return position;

    }

}