1 35 36 package groovy.util; 37 38 import java.io.*; 39 import java.nio.charset.Charset ; 40 import java.util.*; 41 42 70 public class CharsetToolkit { 71 private byte[] buffer; 72 private Charset defaultCharset; 73 private Charset charset; 74 private boolean enforce8Bit = true; 75 private File file; 76 77 82 public CharsetToolkit(File file) throws IOException { 83 this.file = file; 84 InputStream input = new FileInputStream(file); 85 byte[] bytes = new byte[4096]; 86 int bytesRead = input.read(bytes); 87 if (bytesRead == -1) { 88 this.buffer = new byte[0]; 89 } 90 else if (bytesRead < 4096) { 91 byte[] bytesToGuess = new byte[bytesRead]; 92 System.arraycopy(bytes, 0, bytesToGuess, 0, bytesRead); 93 this.buffer = bytesToGuess; 94 } 95 else { 96 this.buffer = bytes; 97 } 98 this.defaultCharset = getDefaultSystemCharset(); 99 this.charset = null; 100 } 101 102 109 public void setDefaultCharset(Charset defaultCharset) { 110 if (defaultCharset != null) 111 this.defaultCharset = defaultCharset; 112 else 113 this.defaultCharset = getDefaultSystemCharset(); 114 } 115 116 public Charset getCharset() { 117 if (this.charset == null) 118 this.charset = guessEncoding(); 119 return charset; 120 } 121 122 129 public void setEnforce8Bit(boolean enforce) { 130 this.enforce8Bit = enforce; 131 } 132 133 138 public boolean getEnforce8Bit() { 139 return this.enforce8Bit; 140 } 141 142 146 public Charset getDefaultCharset() { 147 return defaultCharset; 148 } 149 150 174 private Charset guessEncoding() { 175 if (hasUTF8Bom()) 178 return Charset.forName("UTF-8"); 179 if (hasUTF16LEBom()) 180 return Charset.forName("UTF-16LE"); 181 if (hasUTF16BEBom()) 182 return Charset.forName("UTF-16BE"); 183 184 boolean highOrderBit = false; 187 188 boolean validU8Char = true; 191 192 194 int length = buffer.length; 195 int i = 0; 196 while (i < length - 6) { 197 byte b0 = buffer[i]; 198 byte b1 = buffer[i + 1]; 199 byte b2 = buffer[i + 2]; 200 byte b3 = buffer[i + 3]; 201 byte b4 = buffer[i + 4]; 202 byte b5 = buffer[i + 5]; 203 if (b0 < 0) { 204 highOrderBit = true; 207 if (isTwoBytesSequence(b0)) { 209 if (!isContinuationChar(b1)) 212 validU8Char = false; 213 else 214 i++; 215 } 216 else if (isThreeBytesSequence(b0)) { 218 if (!(isContinuationChar(b1) && isContinuationChar(b2))) 221 validU8Char = false; 222 else 223 i += 2; 224 } 225 else if (isFourBytesSequence(b0)) { 227 if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3))) 230 validU8Char = false; 231 else 232 i += 3; 233 } 234 else if (isFiveBytesSequence(b0)) { 236 if (!(isContinuationChar(b1) 239 && isContinuationChar(b2) 240 && isContinuationChar(b3) 241 && isContinuationChar(b4))) 242 validU8Char = false; 243 else 244 i += 4; 245 } 246 else if (isSixBytesSequence(b0)) { 248 if (!(isContinuationChar(b1) 251 && isContinuationChar(b2) 252 && isContinuationChar(b3) 253 && isContinuationChar(b4) 254 && isContinuationChar(b5))) 255 validU8Char = false; 256 else 257 i += 5; 258 } 259 else 260 validU8Char = false; 261 } 262 if (!validU8Char) 263 break; 264 i++; 265 } 266 if (!highOrderBit) { 269 if (this.enforce8Bit) 271 return this.defaultCharset; 272 else 273 return Charset.forName("US-ASCII"); 274 } 275 if (validU8Char) 278 return Charset.forName("UTF-8"); 279 return this.defaultCharset; 281 } 282 283 289 private static boolean isContinuationChar(byte b) { 290 return -128 <= b && b <= -65; 291 } 292 293 299 private static boolean isTwoBytesSequence(byte b) { 300 return -64 <= b && b <= -33; 301 } 302 303 309 private static boolean isThreeBytesSequence(byte b) { 310 return -32 <= b && b <= -17; 311 } 312 313 319 private static boolean isFourBytesSequence(byte b) { 320 return -16 <= b && b <= -9; 321 } 322 323 329 private static boolean isFiveBytesSequence(byte b) { 330 return -8 <= b && b <= -5; 331 } 332 333 339 private static boolean isSixBytesSequence(byte b) { 340 return -4 <= b && b <= -3; 341 } 342 343 348 public static Charset getDefaultSystemCharset() { 349 return Charset.forName(System.getProperty("file.encoding")); 350 } 351 352 357 public boolean hasUTF8Bom() { 358 if (buffer.length >= 3) 359 return (buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65); 360 else 361 return false; 362 } 363 364 370 public boolean hasUTF16LEBom() { 371 if (buffer.length >= 2) 372 return (buffer[0] == -1 && buffer[1] == -2); 373 else 374 return false; 375 } 376 377 383 public boolean hasUTF16BEBom() { 384 if (buffer.length >= 2) 385 return (buffer[0] == -2 && buffer[1] == -1); 386 else 387 return false; 388 } 389 390 398 public BufferedReader getReader() throws FileNotFoundException { 399 LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), getCharset())); 400 if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) { 401 try { 402 reader.read(); 403 } 404 catch (IOException e) { 405 } 408 } 409 return reader; 410 } 411 412 418 public static Charset [] getAvailableCharsets() { 419 Collection collection = Charset.availableCharsets().values(); 420 return (Charset []) collection.toArray(new Charset [collection.size()]); 421 } 422 } 423 | Popular Tags |