CharsetMatch


1   /**
2   *******************************************************************************
3   * Copyright (C) 2005-2006, International Business Machines Corporation and    *
4   * others. All Rights Reserved.                                                *
5   *******************************************************************************
6   */
7   package com.ibm.icu.text;
8   
9   import java.io.ByteArrayInputStream  ;
10  import java.io.IOException  ;
11  import java.io.InputStream  ;
12  import java.io.InputStreamReader  ;
13  import java.io.Reader  ;
14  
15  
16  /**
17   * This class represents a charset that has been identified by a CharsetDetector
18   * as a possible encoding for a set of input data.  From an instance of this
19   * class, you can ask for a confidence level in the charset identification,
20   * or for Java Reader or String to access the original byte data in Unicode form.
21   * <p/>
22   * Instances of this class are created only by CharsetDetectors.
23   * <p/>
24   * Note:  this class has a natural ordering that is inconsistent with equals.
25   *        The natural ordering is based on the match confidence value.
26   *
27   * @draft ICU 3.4
28   * @provisional This API might change or be removed in a future release.
29   */
30  public class CharsetMatch implements Comparable   {
31  
32      
33      /**
34       * Create a java.io.Reader for reading the Unicode character data corresponding
35       * to the original byte data supplied to the Charset detect operation.
36       * <p/>
37       * CAUTION:  if the source of the byte data was an InputStream, a Reader
38       * can be created for only one matching char set using this method.  If more 
39       * than one charset needs to be tried, the caller will need to reset
40       * the InputStream and create InputStreamReaders itself, based on the charset name.
41       *
42       * @return the Reader for the Unicode character data.
43       *
44       * @draft ICU 3.4
45       * @provisional This API might change or be removed in a future release.
46       */
47      public Reader   getReader() {
48          InputStream   inputStream = fInputStream;
49          
50          if (inputStream == null) {
51              inputStream = new ByteArrayInputStream  (fRawInput, 0, fRawLength);
52          }
53          
54          try {
55              inputStream.reset();
56              return new InputStreamReader  (inputStream, getName());
57          } catch (IOException   e) {
58              return null;
59          }
60      }
61  
62      /**
63       * Create a Java String from Unicode character data corresponding
64       * to the original byte data supplied to the Charset detect operation.
65       *
66       * @return a String created from the converted input data.
67       *
68       * @draft ICU 3.4
69       * @provisional This API might change or be removed in a future release.
70       */
71      public String   getString()  throws java.io.IOException   {
72          return getString(-1);
73  
74      }
75  
76      /**
77       * Create a Java String from Unicode character data corresponding
78       * to the original byte data supplied to the Charset detect operation.
79       * The length of the returned string is limited to the specified size;
80       * the string will be trunctated to this length if necessary.  A limit value of
81       * zero or less is ignored, and treated as no limit.
82       *
83       * @param maxLength The maximium length of the String to be created when the
84       *                  source of the data is an input stream, or -1 for
85       *                  unlimited length.
86       * @return a String created from the converted input data.
87       *
88       * @draft ICU 3.4
89       * @provisional This API might change or be removed in a future release.
90       */
91      public String   getString(int maxLength) throws java.io.IOException   {
92          String   result = null;
93          if (fInputStream != null) {
94              StringBuffer   sb = new StringBuffer  ();
95              char[] buffer = new char[1024];
96              Reader   reader = getReader();
97              int max = maxLength < 0? Integer.MAX_VALUE : maxLength;
98              int bytesRead = 0;
99              
100             while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
101                 sb.append(buffer, 0, bytesRead);
102                 max -= bytesRead;
103             }
104             
105             reader.close();
106             
107             return sb.toString();
108         } else {
109             result = new String  (fRawInput, getName());            
110         }
111         return result;
112 
113     }
114     
115     /**
116      * Get an indication of the confidence in the charset detected.
117      * Confidence values range from 0-100, with larger numbers indicating
118      * a better match of the input data to the characteristics of the
119      * charset.
120      *
121      * @return the confidence in the charset match
122      *
123      * @draft ICU 3.4
124      * @provisional This API might change or be removed in a future release.
125      */
126     public int getConfidence() {
127         return fConfidence;
128     }
129     
130 
131     /**
132      * Bit flag indicating the match is based on the the encoding scheme.
133      *
134      * @see #getMatchType
135      * @draft ICU 3.4
136      * @provisional This API might change or be removed in a future release.
137      */
138     static public final int ENCODING_SCHEME    = 1;
139     
140     /**
141      * Bit flag indicating the match is based on the presence of a BOM.
142      * 
143      * @see #getMatchType
144      * @draft ICU 3.4
145      * @provisional This API might change or be removed in a future release.
146      */
147     static public final int BOM                = 2;
148     
149     /**
150      * Bit flag indicating he match is based on the declared encoding.
151      * 
152      * @see #getMatchType
153      * @draft ICU 3.4
154      * @provisional This API might change or be removed in a future release.
155      */
156     static public final int DECLARED_ENCODING  = 4;
157     
158     /**
159      * Bit flag indicating the match is based on language statistics.
160      *
161      * @see #getMatchType
162      * @draft ICU 3.4
163      * @provisional This API might change or be removed in a future release.
164      */
165     static public final int LANG_STATISTICS    = 8;
166     
167     /**
168      * Return flags indicating what it was about the input data 
169      * that caused this charset to be considered as a possible match.
170      * The result is a bitfield containing zero or more of the flags
171      * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
172      * A result of zero means no information is available.
173      * <p>
174      * Note: currently, this method always returns zero.
175      * <p>
176      *
177      * @return the type of match found for this charset.
178      *
179      * @draft ICU 3.4
180      * @provisional This API might change or be removed in a future release.
181      */
182     public int getMatchType() {
183 //      TODO: create a list of enum-like constants for common combinations of types of matches.
184         return 0;
185     }
186 
187     /**
188      * Get the name of the detected charset.  
189      * The name will be one that can be used with other APIs on the
190      * platform that accept charset names.  It is the "Canonical name"
191      * as defined by the class java.nio.charset.Charset; for
192      * charsets that are registered with the IANA charset registry,
193      * this is the MIME-preferred registerd name.
194      *
195      * @see java.nio.charset.Charset
196      * @see java.io.InputStreamReader
197      *
198      * @return The name of the charset.
199      *
200      * @draft ICU 3.4
201      * @provisional This API might change or be removed in a future release.
202      */
203     public String   getName() {
204         return fRecognizer.getName();
205     }
206     
207     /**
208      * Get the ISO code for the language of the detected charset.  
209      *
210      * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
211      *
212      * @draft ICU 3.4
213      * @provisional This API might change or be removed in a future release.
214      */
215     public String   getLanguage() {
216         return fRecognizer.getLanguage();
217     }
218     
219     
220     /**
221      * Compare to other CharsetMatch objects.
222      * Comparison is based on the match confidence value, which 
223      *   allows CharsetDetector.detectAll() to order its results. 
224      *
225      * @param o the CharsetMatch object to compare against.
226      * @return  a negative integer, zero, or a positive integer as the 
227      *          confidence level of this CharsetMatch
228      *      is less than, equal to, or greater than that of
229      *          the argument.
230      * @throws ClassCastException if the argument is not a CharsetMatch.
231      * @draft ICU 3.4
232      * @provisional This API might change or be removed in a future release.
233      */
234     public int compareTo (Object   o) {
235         CharsetMatch other = (CharsetMatch)o;
236         int compareResult = 0;
237         if (this.fConfidence > other.fConfidence) {
238             compareResult = 1;
239         } else if (this.fConfidence < other.fConfidence) {
240             compareResult = -1;
241         }
242         return compareResult;
243     }
244     
245     /**
246      *  Constructor.  Implementation internal
247      *
248      * @internal
249      */
250     CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
251         fRecognizer = rec;
252         fConfidence = conf;
253         
254         // The references to the original aplication input data must be copied out
255         //   of the charset recognizer to here, in case the application resets the
256         //   recognizer before using this CharsetMatch.
257         if (det.fInputStream == null) {
258             // We only want the existing input byte data if it came straight from the user,
259             //   not if is just the head of a stream.
260             fRawInput    = det.fRawInput;
261             fRawLength   = det.fRawLength;
262         }
263         fInputStream = det.fInputStream;
264     }
265 
266     
267     //
268     //   Private Data
269     //
270     private int                 fConfidence;
271     private CharsetRecognizer   fRecognizer;
272     private byte[]              fRawInput = null;     // Original, untouched input bytes.
273                                                       //  If user gave us a byte array, this is it.
274     private int                 fRawLength;           // Length of data in fRawInput array.
275 
276     private InputStream           fInputStream = null;  // User's input stream, or null if the user
277                                                       //   gave us a byte array.
278 }
279
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags