CharsetToolkit


1   /*
2    * $Id: CharsetToolkit.java,v 1.2 2004/07/11 19:41:25 glaforge Exp $
3    *
4    * Copyright 2003 (C) Guillaume Laforge. All Rights Reserved.
5    *
6    * Redistribution and use of this software and associated documentation
7    * ("Software"), with or without modification, are permitted provided that the
8    * following conditions are met:
9    *  1. Redistributions of source code must retain copyright statements and
10   * notices. Redistributions must also contain a copy of this document.
11   *  2. Redistributions in binary form must reproduce the above copyright
12   * notice, this list of conditions and the following disclaimer in the
13   * documentation and/or other materials provided with the distribution.
14   *  3. The name "groovy" must not be used to endorse or promote products
15   * derived from this Software without prior written permission of The Codehaus.
16   * For written permission, please contact info@codehaus.org.
17   *  4. Products derived from this Software may not be called "groovy" nor may
18   * "groovy" appear in their names without prior written permission of The
19   * Codehaus. "groovy" is a registered trademark of The Codehaus.
20   *  5. Due credit should be given to The Codehaus - http://groovy.codehaus.org/
21   *
22   * THIS SOFTWARE IS PROVIDED BY THE CODEHAUS AND CONTRIBUTORS ``AS IS'' AND ANY
23   * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25   * DISCLAIMED. IN NO EVENT SHALL THE CODEHAUS OR ITS CONTRIBUTORS BE LIABLE FOR
26   * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27   * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
28   * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30   * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31   * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
32   * DAMAGE.
33   *
34   */
35  
36  package groovy.util;
37  
38  import java.io.*;
39  import java.nio.charset.Charset  ;
40  import java.util.*;
41  
/**
 * <p>Utility class to guess the encoding of a given text file.</p>
 *
 * <p>Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files
 * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer
 * is wide enough, the charset should also be discovered.</p>
 *
 * <p>Only the first 4KB of the file are inspected, which is usually sufficient
 * to be able to guess the encoding.</p>
 *
 * <p>Usage:</p>
 * <pre>
 * // guess the encoding
 * CharsetToolkit toolkit = new CharsetToolkit(file);
 * Charset guessedCharset = toolkit.getCharset();
 *
 * // create a reader with the correct charset
 * BufferedReader reader = toolkit.getReader();
 *
 * // read the file content
 * String line;
 * while ((line = reader.readLine()) != null)
 * {
 *     System.out.println(line);
 * }
 * </pre>
 *
 * @author Guillaume Laforge
 */
public class CharsetToolkit {
    /** Maximum number of bytes read from the beginning of the file to guess its encoding. */
    private static final int BUFFER_SIZE = 4096;

    private final byte[] buffer;
    private Charset defaultCharset;
    private Charset charset;
    private boolean enforce8Bit = true;
    private final File file;

    /**
     * Constructor of the <code>CharsetToolkit</code> utility class.
     * The beginning of the file (at most 4KB) is read immediately and the
     * underlying stream is closed before the constructor returns.
     *
     * @param file of which we want to know the encoding.
     * @throws IOException if the file cannot be opened or read.
     */
    public CharsetToolkit(File file) throws IOException {
        this.file = file;
        this.buffer = readHead(file, BUFFER_SIZE);
        this.defaultCharset = getDefaultSystemCharset();
        this.charset = null;
    }

    /**
     * Reads up to <code>limit</code> bytes from the beginning of the file.
     *
     * @param file  the file to read.
     * @param limit the maximum number of bytes to read.
     * @return an array holding exactly the bytes read (empty for an empty file).
     * @throws IOException if the file cannot be opened or read.
     */
    private static byte[] readHead(File file, int limit) throws IOException {
        InputStream input = new FileInputStream(file);
        try {
            byte[] bytes = new byte[limit];
            int total = 0;
            // a single read() is allowed to return fewer bytes than requested,
            // so keep reading until the buffer is full or the end of file is reached
            while (total < limit) {
                int count = input.read(bytes, total, limit - total);
                if (count == -1)
                    break;
                total += count;
            }
            if (total == limit)
                return bytes;
            byte[] head = new byte[total];
            System.arraycopy(bytes, 0, head, 0, total);
            return head;
        }
        finally {
            // always release the file handle, even if reading failed
            input.close();
        }
    }

    /**
     * Defines the default <code>Charset</code> used in case the buffer represents
     * an 8-bit <code>Charset</code>.
     *
     * @param defaultCharset the default <code>Charset</code> to be returned by <code>guessEncoding()</code>
     * if an 8-bit <code>Charset</code> is encountered. If <code>null</code>, the default
     * system charset is used.
     */
    public void setDefaultCharset(Charset defaultCharset) {
        if (defaultCharset != null)
            this.defaultCharset = defaultCharset;
        else
            this.defaultCharset = getDefaultSystemCharset();
    }

    /**
     * Gets the guessed <code>Charset</code> of the file. The guess is computed on the
     * first call and cached: the default charset and the enforce8Bit flag must therefore
     * be configured before this method is first called.
     *
     * @return the guessed <code>Charset</code>.
     */
    public Charset getCharset() {
        if (this.charset == null)
            this.charset = guessEncoding();
        return charset;
    }

    /**
     * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII.
     * It might be a file without any special character in the range 128-255, but that may be or become
     * a file encoded with the default <code>charset</code> rather than US-ASCII.
     *
     * @param enforce a boolean specifying the use or not of US-ASCII.
     */
    public void setEnforce8Bit(boolean enforce) {
        this.enforce8Bit = enforce;
    }

    /**
     * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding.
     *
     * @return a boolean representing the flag of use of US-ASCII.
     */
    public boolean getEnforce8Bit() {
        return this.enforce8Bit;
    }

    /**
     * Retrieves the default <code>Charset</code>, returned when an 8-bit encoding is detected.
     *
     * @return the default <code>Charset</code>.
     */
    public Charset getDefaultCharset() {
        return defaultCharset;
    }

    /**
     * <p>Guess the encoding of the provided buffer.
     * If Byte Order Markers are encountered at the beginning of the buffer, we immediately
     * return the charset implied by this BOM. Otherwise, the file would not be a human
     * readable text file.</p>
     *
     * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
     * If it is not UTF-8, we assume the encoding is the default system encoding
     * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p>
     *
     * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p>
     * <pre>
     * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F       0xxxxxxx
     * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
     * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * </pre>
     * <p>With UTF-8, 0xFE and 0xFF never appear.</p>
     *
     * <p>The whole buffer is inspected. A multi-byte sequence cut off by the end of the
     * buffer is only tolerated when the buffer is full, i.e. when the file may continue
     * beyond the inspected bytes.</p>
     *
     * @return the Charset recognized.
     */
    private Charset guessEncoding() {
        // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
        // otherwise, the file would not be human readable
        if (hasUTF8Bom())
            return Charset.forName("UTF-8");
        if (hasUTF16LEBom())
            return Charset.forName("UTF-16LE");
        if (hasUTF16BEBom())
            return Charset.forName("UTF-16BE");

        // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding
        // otherwise, the file is in US-ASCII
        boolean highOrderBit = false;

        // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
        // if it's not the case, we can assume the encoding is the default encoding of the system
        boolean validU8Char = true;

        int length = buffer.length;

        // when the buffer is full, the file may be longer than what was read, so the last
        // multi-byte sequence may legitimately be cut in the middle
        boolean mayBeTruncated = length >= BUFFER_SIZE;

        int i = 0;
        while (validU8Char && i < length) {
            byte lead = buffer[i];
            if (lead >= 0) {
                // plain 7-bit character
                i++;
                continue;
            }
            // a high order bit was encountered, thus the encoding is not US-ASCII
            // it may be either an 8-bit encoding or UTF-8
            highOrderBit = true;

            int expected = expectedContinuationBytes(lead);
            int available = Math.min(expected, length - i - 1);
            if (expected == 0 || (available < expected && !mayBeTruncated)) {
                // not a valid lead byte, or the sequence is cut off by the real end of the file
                validU8Char = false;
                break;
            }
            // every continuation byte must be of the form 10xxxxxx,
            // otherwise the character is not a valid UTF-8 construct
            for (int k = 1; k <= available; k++) {
                if (!isContinuationChar(buffer[i + k])) {
                    validU8Char = false;
                    break;
                }
            }
            i += available + 1;
        }
        // if no byte with an high order bit set, the encoding is US-ASCII
        // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
        if (!highOrderBit) {
            // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
            if (this.enforce8Bit)
                return this.defaultCharset;
            else
                return Charset.forName("US-ASCII");
        }
        // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
        // otherwise the file would not be human readable
        if (validU8Char)
            return Charset.forName("UTF-8");
        // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
        return this.defaultCharset;
    }

    /**
     * Gives the number of continuation bytes which must follow the given lead byte.
     *
     * @param lead a byte with its high order bit set.
     * @return the number of continuation bytes expected (1 to 5), or 0 if the byte
     * cannot start a multi-byte sequence (continuation byte, 0xFE or 0xFF).
     */
    private static int expectedContinuationBytes(byte lead) {
        if (isTwoBytesSequence(lead))
            return 1;
        if (isThreeBytesSequence(lead))
            return 2;
        if (isFourBytesSequence(lead))
            return 3;
        if (isFiveBytesSequence(lead))
            return 4;
        if (isSixBytesSequence(lead))
            return 5;
        return 0;
    }

    /**
     * If the byte has the form 10xxxxxx, then it's a continuation byte of a multiple byte character.
     *
     * @param b a byte.
     * @return true if it's a continuation char.
     */
    private static boolean isContinuationChar(byte b) {
        return -128 <= b && b <= -65;
    }

    /**
     * If the byte has the form 110xxxxx, then it's the first byte of a two-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a two-bytes sequence.
     */
    private static boolean isTwoBytesSequence(byte b) {
        return -64 <= b && b <= -33;
    }

    /**
     * If the byte has the form 1110xxxx, then it's the first byte of a three-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a three-bytes sequence.
     */
    private static boolean isThreeBytesSequence(byte b) {
        return -32 <= b && b <= -17;
    }

    /**
     * If the byte has the form 11110xxx, then it's the first byte of a four-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a four-bytes sequence.
     */
    private static boolean isFourBytesSequence(byte b) {
        return -16 <= b && b <= -9;
    }

    /**
     * If the byte has the form 111110xx, then it's the first byte of a five-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a five-bytes sequence.
     */
    private static boolean isFiveBytesSequence(byte b) {
        return -8 <= b && b <= -5;
    }

    /**
     * If the byte has the form 1111110x, then it's the first byte of a six-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a six-bytes sequence.
     */
    private static boolean isSixBytesSequence(byte b) {
        return -4 <= b && b <= -3;
    }

    /**
     * Retrieve the default charset of the system, as named by the
     * <code>file.encoding</code> system property.
     *
     * @return the default <code>Charset</code>.
     */
    public static Charset getDefaultSystemCharset() {
        return Charset.forName(System.getProperty("file.encoding"));
    }

    /**
     * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
     *
     * @return true if the buffer has a BOM for UTF8.
     */
    public boolean hasUTF8Bom() {
        // 0xEF 0xBB 0xBF
        return buffer.length >= 3
            && buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Little Endian.
     *
     * @return true if the buffer has a BOM for UTF-16 Little Endian.
     */
    public boolean hasUTF16LEBom() {
        // 0xFF 0xFE
        return buffer.length >= 2
            && buffer[0] == -1 && buffer[1] == -2;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Big Endian.
     *
     * @return true if the buffer has a BOM for UTF-16 Big Endian.
     */
    public boolean hasUTF16BEBom() {
        // 0xFE 0xFF
        return buffer.length >= 2
            && buffer[0] == -2 && buffer[1] == -1;
    }

    /**
     * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code>
     * specified in the constructor of <code>CharsetToolkit</code> using the charset discovered by the
     * method <code>guessEncoding()</code>. If the file starts with a Byte Order Marker,
     * the marker is skipped. The caller is responsible for closing the returned reader.
     *
     * @return a <code>BufferedReader</code>
     * @throws FileNotFoundException if the file is not found.
     */
    public BufferedReader getReader() throws FileNotFoundException {
        LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), getCharset()));
        if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) {
            try {
                // the BOM is decoded as a single character: consume it
                reader.read();
            }
            catch (IOException ignored) {
                // should never happen, as a file with no content
                // but with a BOM has at least one char; if it does, the
                // caller gets the same error on its first read of the reader
            }
        }
        return reader;
    }

    /**
     * Retrieves all the available <code>Charset</code>s on the platform,
     * among which the default <code>charset</code>.
     *
     * @return an array of <code>Charset</code>s.
     */
    public static Charset[] getAvailableCharsets() {
        return (Charset[]) Charset.availableCharsets().values().toArray(new Charset[0]);
    }
}
423
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags