CharsetDetector


1   /**
2   *******************************************************************************
3   * Copyright (C) 2005-2006, International Business Machines Corporation and    *
4   * others. All Rights Reserved.                                                *
5   *******************************************************************************
6   */
7   package com.ibm.icu.text;
8   
9   import java.io.InputStream  ;
10  import java.io.Reader  ;
11  import java.io.IOException  ;
12  import java.util.ArrayList  ;
13  import java.util.Collections  ;
14  import java.util.Arrays  ;
15  
16  
17  /**
18   * <code>CharsetDetector</code> provides a facility for detecting the
19   * charset or encoding of character data in an unknown format.
20   * The input data can either be from an input stream or an array of bytes.
21   * The result of the detection operation is a list of possibly matching
22   * charsets, or, for simple use, you can just ask for a Java Reader that
23   * will will work over the input data.
24   * <p/>
25   * Character set detection is at best an imprecise operation.  The detection
26   * process will attempt to identify the charset that best matches the characteristics
27   * of the byte data, but the process is partly statistical in nature, and
28   * the results can not be guaranteed to always be correct.
29   * <p/>
30   * For best accuracy in charset detection, the input data should be primarily
31   * in a single language, and a minimum of a few hundred bytes worth of plain text
32   * in the language are needed.  The detection process will attempt to
33   * ignore html or xml style markup that could otherwise obscure the content.
34   * <p/>
35   * @draft ICU 3.4
36   * @provisional This API might change or be removed in a future release.
37   */
38  public class CharsetDetector {
39  
40  //   Question: Should we have getters corresponding to the setters for inut text
41  //   and declared encoding?
42  
43  //   A thought: If we were to create our own type of Java Reader, we could defer
44  //   figuring out an actual charset for data that starts out with too much English
45  //   only ASCII until the user actually read through to something that didn't look
46  //   like 7 bit English.  If  nothing else ever appeared, we would never need to
47  //   actually choose the "real" charset.  All assuming that the application just
48  //   wants the data, and doesn't care about a char set name.
49  
50      /**
51       *   Constructor
52       * 
53       * @draft ICU 3.4
54       * @provisional This API might change or be removed in a future release.
55       */
56      public CharsetDetector() {
57      }
58  
59      /**
60       * Set the declared encoding for charset detection.
61       *  The declared encoding of an input text is an encoding obtained
62       *  from an http header or xml declaration or similar source that
63       *  can be provided as additional information to the charset detector.  
64       *  A match between a declared encoding and a possible detected encoding
65       *  will raise the quality of that detected encoding by a small delta,
66       *  and will also appear as a "reason" for the match.
67       * <p/>
68       * A declared encoding that is incompatible with the input data being
69       * analyzed will not be added to the list of possible encodings.
70       * 
71       *  @param encoding The declared encoding 
72       *
73       * @draft ICU 3.4
74       * @provisional This API might change or be removed in a future release.
75       */
76      public CharsetDetector setDeclaredEncoding(String   encoding) {
77          fDeclaredEncoding = encoding;
78          return this;
79      }
80      
81      /**
82       * Set the input text (byte) data whose charset is to be detected.
83       * 
84       * @param in the input text of unknown encoding
85       * 
86       * @return This CharsetDetector
87       *
88       * @draft ICU 3.4
89       * @provisional This API might change or be removed in a future release.
90       */
91      public CharsetDetector setText(byte [] in) {
92          fRawInput  = in;
93          fRawLength = in.length;
94          
95          MungeInput();
96          
97          return this;
98      }
99      
100     private static final int kBufSize = 8000;
101 
102     /**
103      * Set the input text (byte) data whose charset is to be detected.
104      *  <p/>
105      *   The input stream that supplies the character data must have markSupported()
106      *   == true; the charset detection process will read a small amount of data,
107      *   then return the stream to its original position via
108      *   the InputStream.reset() operation.  The exact amount that will
109      *   be read depends on the characteristics of the data itself.
110      *
111      * @param in the input text of unknown encoding
112      * 
113      * @return This CharsetDetector
114      *
115      * @draft ICU 3.4
116      * @provisional This API might change or be removed in a future release.
117      */
118     
119     public CharsetDetector setText(InputStream   in) throws IOException   {
120         fInputStream = in;
121         fInputStream.mark(kBufSize);
122         fRawInput = new byte[kBufSize];   // Always make a new buffer because the
123                                           //   previous one may have come from the caller,
124                                           //   in which case we can't touch it.
125         fRawLength = 0;
126         int remainingLength = kBufSize;
127         while (remainingLength > 0 ) {
128             // read() may give data in smallish chunks, esp. for remote sources.  Hence, this loop.
129             int  bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
130             if (bytesRead <= 0) {
131                  break;
132             }
133             fRawLength += bytesRead;
134             remainingLength -= bytesRead;
135         }
136         fInputStream.reset();
137         
138         MungeInput();                     // Strip html markup, collect byte stats.
139         return this;
140     }
141 
142   
143     /**
144      * Return the charset that best matches the supplied input data.
145      * 
146      * Note though, that because the detection 
147      * only looks at the start of the input data,
148      * there is a possibility that the returned charset will fail to handle
149      * the full set of input data.
150      * <p/>
151      * Raise an exception if 
152      *  <ul>
153      *    <li>no charset appears to match the data.</li>
154      *    <li>no input text has been provided</li>
155      *  </ul>
156      *
157      * @return a CharsetMatch object representing the best matching charset, or
158      *         <code>null</code> if there are no matches.
159      *
160      * @draft ICU 3.4
161      * @provisional This API might change or be removed in a future release.
162      */
163     public CharsetMatch detect() {
164 //   TODO:  A better implementation would be to copy the detect loop from
165 //          detectAll(), and cut it short as soon as a match with a high confidence
166 //          is found.  This is something to be done later, after things are otherwise
167 //          working.
168         CharsetMatch matches[] = detectAll();
169         
170         if (matches == null || matches.length == 0) {
171             return null;
172         }
173         
174         return matches[0];
175      }
176     
177     /**
178      *  Return an array of all charsets that appear to be plausible
179      *  matches with the input data.  The array is ordered with the
180      *  best quality match first.
181      * <p/>
182      * Raise an exception if 
183      *  <ul>
184      *    <li>no charsets appear to match the input data.</li>
185      *    <li>no input text has been provided</li>
186      *  </ul>
187      * 
188      * @return An array of CharsetMatch objects representing possibly matching charsets.
189      *
190      * @draft ICU 3.4
191      * @provisional This API might change or be removed in a future release.
192      */
193     public CharsetMatch[] detectAll() {
194         CharsetRecognizer csr;
195         int               i;
196         int               detectResults;
197         int               confidence;
198         ArrayList           matches = new ArrayList  ();
199         
200         //  Iterate over all possible charsets, remember all that
201         //    give a match quality > 0.
202         for (i=0; i<fCSRecognizers.size(); i++) {
203             csr = (CharsetRecognizer)fCSRecognizers.get(i);
204             detectResults = csr.match(this);
205             confidence = detectResults & 0x000000ff;
206             if (confidence > 0) {
207                 CharsetMatch  m = new CharsetMatch(this, csr, confidence);
208                 matches.add(m);
209             }
210         }
211         Collections.sort(matches);      // CharsetMatch compares on confidence
212         Collections.reverse(matches);   //  Put best match first.
213         CharsetMatch [] resultArray = new CharsetMatch[matches.size()];
214         resultArray = (CharsetMatch[]) matches.toArray(resultArray);
215         return resultArray;
216     }
217 
218     
219     /**
220      * Autodetect the charset of an inputStream, and return a Java Reader
221      * to access the converted input data.
222      * <p/>
223      * This is a convenience method that is equivalent to
224      *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
225      * <p/>
226      *   For the input stream that supplies the character data, markSupported()
227      *   must be true; the  charset detection will read a small amount of data,
228      *   then return the stream to its original position via
229      *   the InputStream.reset() operation.  The exact amount that will
230      *    be read depends on the characteristics of the data itself.
231      *<p/>
232      * Raise an exception if no charsets appear to match the input data.
233      * 
234      * @param in The source of the byte data in the unknown charset.
235      *
236      * @param declaredEncoding  A declared encoding for the data, if available,
237      *           or null or an empty string if none is available.
238      *
239      * @draft ICU 3.4
240      * @provisional This API might change or be removed in a future release.
241      */
242     public Reader   getReader(InputStream   in, String   declaredEncoding) {
243         fDeclaredEncoding = declaredEncoding;
244         
245         try {
246             setText(in);
247             
248             CharsetMatch match = detect();
249             
250             if (match == null) {
251                 return null;
252             }
253             
254             return match.getReader();
255         } catch (IOException   e) {
256             return null;
257         }
258     }
259 
260     /**
261      * Autodetect the charset of an inputStream, and return a String
262      * containing the converted input data.
263      * <p/>
264      * This is a convenience method that is equivalent to
265      *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
266      *<p/>
267      * Raise an exception if no charsets appear to match the input data.
268      * 
269      * @param in The source of the byte data in the unknown charset.
270      *
271      * @param declaredEncoding  A declared encoding for the data, if available,
272      *           or null or an empty string if none is available.
273      *
274      * @draft ICU 3.4
275      * @provisional This API might change or be removed in a future release.
276      */
277     public String   getString(byte[] in, String   declaredEncoding)
278     {
279         fDeclaredEncoding = declaredEncoding;
280        
281         try {
282             setText(in);
283             
284             CharsetMatch match = detect();
285             
286             if (match == null) {
287                 return null;
288             }
289             
290             return match.getString(-1);
291         } catch (IOException   e) {
292             return null;
293         }
294     }
295 
296  
297     /**
298      * Get the names of all char sets that can be recognized by the char set detector.
299      *
300      * @return an array of the names of all charsets that can be recognized
301      * by the charset detector.
302      *
303      * @draft ICU 3.4
304      * @provisional This API might change or be removed in a future release.
305      */
306     public static String  [] getAllDetectableCharsets() {
307         return fCharsetNames;
308     }
309     
310     /**
311      * Test whether or not input filtering is enabled.
312      * 
313      * @return <code>true</code> if input text will be filtered.
314      * 
315      * @see #enableInputFilter
316      *
317      * @draft ICU 3.4
318      * @provisional This API might change or be removed in a future release.
319      */
320     public boolean inputFilterEnabled()
321     {
322         return fStripTags;
323     }
324     
325     /**
326      * Enable filtering of input text. If filtering is enabled,
327      * text within angle brackets ("<" and ">") will be removed
328      * before detection.
329      * 
330      * @param filter <code>true</code> to enable input text filtering.
331      * 
332      * @return The previous setting.
333      *
334      * @draft ICU 3.4
335      * @provisional This API might change or be removed in a future release.
336      */
337     public boolean enableInputFilter(boolean filter)
338     {
339         boolean previous = fStripTags;
340         
341         fStripTags = filter;
342         
343         return previous;
344     }
345     
346     /**
347      *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
348      *               it by removing what appears to be html markup.
349      * 
350      * @internal
351      */
352     private void MungeInput() {
353         int srci = 0;
354         int dsti = 0;
355         byte b;
356         boolean  inMarkup = false;
357         int      openTags = 0;
358         int      badTags  = 0;
359         
360         //
361         //  html / xml markup stripping.
362         //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
363         //     discard everything within < brackets >
364         //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
365         //     guess as to whether the input was actually marked up at all.
366         if (fStripTags) {
367             for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
368                 b = fRawInput[srci];
369                 if (b == (byte)'<') {
370                     if (inMarkup) {
371                         badTags++;
372                     }
373                     inMarkup = true;
374                     openTags++;
375                 }
376                 
377                 if (! inMarkup) {
378                     fInputBytes[dsti++] = b;
379                 }
380                 
381                 if (b == (byte)'>') {
382                     inMarkup = false;
383                 }        
384             }
385             
386             fInputLen = dsti;
387         }
388         
389         //
390         //  If it looks like this input wasn't marked up, or if it looks like it's
391         //    essentially nothing but markup abandon the markup stripping.
392         //    Detection will have to work on the unstripped input.
393         //
394         if (openTags<5 || openTags/5 < badTags || 
395                 (fInputLen < 100 && fRawLength>600)) {
396             int limit = fRawLength;
397             
398             if (limit > kBufSize) {
399                 limit = kBufSize;
400             }
401             
402             for (srci=0; srci<limit; srci++) {
403                 fInputBytes[srci] = fRawInput[srci];
404             }
405             fInputLen = srci;
406         }
407         
408         //
409         // Tally up the byte occurence statistics.
410         //   These are available for use by the various detectors.
411         //
412         Arrays.fill(fByteStats, (short)0);
413         for (srci=0; srci<fInputLen; srci++) {
414             int val = fInputBytes[srci] & 0x00ff;
415             fByteStats[val]++;
416         }
417         
418         fC1Bytes = false;
419         for (int i = 0x80; i <= 0x9F; i += 1) {
420             if (fByteStats[i] != 0) {
421                 fC1Bytes = true;
422                 break;
423             }
424         }
425      }
426 
427     /**
428      *  The following items are accessed by individual CharsetRecongizers during
429      *     the recognition process
430      * 
431      * @internal
432      */
433     byte[]      fInputBytes =       // The text to be checked.  Markup will have been
434                    new byte[kBufSize];  //   removed if appropriate.
435     
436     int         fInputLen;          // Length of the byte data in fInputText.
437     
438     short       fByteStats[] =      // byte frequency statistics for the input text.
439                    new short[256];  //   Value is percent, not absolute.
440                                     //   Value is rounded up, so zero really means zero occurences.
441     
442     boolean     fC1Bytes =          // True if any bytes in the range 0x80 - 0x9F are in the input;
443                    false;
444     
445     String        fDeclaredEncoding;
446     
447     
448 
449     //
450     //  Stuff private to CharsetDetector
451     //
452     byte[]               fRawInput;     // Original, untouched input bytes.
453                                         //  If user gave us a byte array, this is it.
454                                         //  If user gave us a stream, it's read to a 
455                                         //  buffer here.
456     int                  fRawLength;    // Length of data in fRawInput array.
457     
458     InputStream            fInputStream;  // User's input stream, or null if the user
459                                         //   gave us a byte array.
460      
461     boolean              fStripTags =   // If true, setText() will strip tags from input text.
462                            false;
463     
464     
465     /**
466      *  List of recognizers for all charsets known to the implementation.
467      *
468      * @internal
469      */
470     private static ArrayList   fCSRecognizers = createRecognizers();
471     private static String   [] fCharsetNames;
472     
473    /**
474      * Create the singleton instances of the CharsetRecognizer classes
475      * 
476      * @internal
477      */
478     private static ArrayList   createRecognizers() {
479         ArrayList   recognizers = new ArrayList  ();
480         
481         recognizers.add(new CharsetRecog_UTF8());
482         
483         recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
484         recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
485         recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
486         recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
487         
488         recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
489         recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
490         recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
491         recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
492         recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
493         recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
494         recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
495         recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
496         
497         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
498         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
499         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
500         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
501         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
502         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
503         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
504         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
505         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
506         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
507         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
508         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
509         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
510         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
511         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
512         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
513         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
514         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
515         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
516         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
517         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
518         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
519         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
520         
521         // Create an array of all charset names, as a side effect.
522         // Needed for the getAllDetectableCharsets() API.
523         String  [] charsetNames = new String   [recognizers.size()];
524         int out = 0;
525         
526         for (int i = 0; i < recognizers.size(); i++) {
527             String   name = ((CharsetRecognizer)recognizers.get(i)).getName();
528             
529             if (out == 0 || ! name.equals(charsetNames[out - 1])) {
530                 charsetNames[out++] = name;
531             }
532         }
533         
534         fCharsetNames = new String  [out];
535         System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
536         
537         return recognizers;
538     }
539 }
540
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags