KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > CharsetMatch


/**
 *******************************************************************************
 * Copyright (C) 2005-2006, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */

package com.ibm.icu.text;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;

16 /**
17  * This class represents a charset that has been identified by a CharsetDetector
18  * as a possible encoding for a set of input data. From an instance of this
19  * class, you can ask for a confidence level in the charset identification,
20  * or for Java Reader or String to access the original byte data in Unicode form.
21  * <p/>
22  * Instances of this class are created only by CharsetDetectors.
23  * <p/>
24  * Note: this class has a natural ordering that is inconsistent with equals.
25  * The natural ordering is based on the match confidence value.
26  *
27  * @draft ICU 3.4
28  * @provisional This API might change or be removed in a future release.
29  */

30 public class CharsetMatch implements Comparable JavaDoc {
31
32     
33     /**
34      * Create a java.io.Reader for reading the Unicode character data corresponding
35      * to the original byte data supplied to the Charset detect operation.
36      * <p/>
37      * CAUTION: if the source of the byte data was an InputStream, a Reader
38      * can be created for only one matching char set using this method. If more
39      * than one charset needs to be tried, the caller will need to reset
40      * the InputStream and create InputStreamReaders itself, based on the charset name.
41      *
42      * @return the Reader for the Unicode character data.
43      *
44      * @draft ICU 3.4
45      * @provisional This API might change or be removed in a future release.
46      */

47     public Reader JavaDoc getReader() {
48         InputStream JavaDoc inputStream = fInputStream;
49         
50         if (inputStream == null) {
51             inputStream = new ByteArrayInputStream JavaDoc(fRawInput, 0, fRawLength);
52         }
53         
54         try {
55             inputStream.reset();
56             return new InputStreamReader JavaDoc(inputStream, getName());
57         } catch (IOException JavaDoc e) {
58             return null;
59         }
60     }
61
62     /**
63      * Create a Java String from Unicode character data corresponding
64      * to the original byte data supplied to the Charset detect operation.
65      *
66      * @return a String created from the converted input data.
67      *
68      * @draft ICU 3.4
69      * @provisional This API might change or be removed in a future release.
70      */

71     public String JavaDoc getString() throws java.io.IOException JavaDoc {
72         return getString(-1);
73
74     }
75
76     /**
77      * Create a Java String from Unicode character data corresponding
78      * to the original byte data supplied to the Charset detect operation.
79      * The length of the returned string is limited to the specified size;
80      * the string will be trunctated to this length if necessary. A limit value of
81      * zero or less is ignored, and treated as no limit.
82      *
83      * @param maxLength The maximium length of the String to be created when the
84      * source of the data is an input stream, or -1 for
85      * unlimited length.
86      * @return a String created from the converted input data.
87      *
88      * @draft ICU 3.4
89      * @provisional This API might change or be removed in a future release.
90      */

91     public String JavaDoc getString(int maxLength) throws java.io.IOException JavaDoc {
92         String JavaDoc result = null;
93         if (fInputStream != null) {
94             StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
95             char[] buffer = new char[1024];
96             Reader JavaDoc reader = getReader();
97             int max = maxLength < 0? Integer.MAX_VALUE : maxLength;
98             int bytesRead = 0;
99             
100             while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
101                 sb.append(buffer, 0, bytesRead);
102                 max -= bytesRead;
103             }
104             
105             reader.close();
106             
107             return sb.toString();
108         } else {
109             result = new String JavaDoc(fRawInput, getName());
110         }
111         return result;
112
113     }
114     
115     /**
116      * Get an indication of the confidence in the charset detected.
117      * Confidence values range from 0-100, with larger numbers indicating
118      * a better match of the input data to the characteristics of the
119      * charset.
120      *
121      * @return the confidence in the charset match
122      *
123      * @draft ICU 3.4
124      * @provisional This API might change or be removed in a future release.
125      */

126     public int getConfidence() {
127         return fConfidence;
128     }
129     
130
131     /**
132      * Bit flag indicating the match is based on the the encoding scheme.
133      *
134      * @see #getMatchType
135      * @draft ICU 3.4
136      * @provisional This API might change or be removed in a future release.
137      */

138     static public final int ENCODING_SCHEME = 1;
139     
140     /**
141      * Bit flag indicating the match is based on the presence of a BOM.
142      *
143      * @see #getMatchType
144      * @draft ICU 3.4
145      * @provisional This API might change or be removed in a future release.
146      */

147     static public final int BOM = 2;
148     
149     /**
150      * Bit flag indicating he match is based on the declared encoding.
151      *
152      * @see #getMatchType
153      * @draft ICU 3.4
154      * @provisional This API might change or be removed in a future release.
155      */

156     static public final int DECLARED_ENCODING = 4;
157     
158     /**
159      * Bit flag indicating the match is based on language statistics.
160      *
161      * @see #getMatchType
162      * @draft ICU 3.4
163      * @provisional This API might change or be removed in a future release.
164      */

165     static public final int LANG_STATISTICS = 8;
166     
167     /**
168      * Return flags indicating what it was about the input data
169      * that caused this charset to be considered as a possible match.
170      * The result is a bitfield containing zero or more of the flags
171      * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
172      * A result of zero means no information is available.
173      * <p>
174      * Note: currently, this method always returns zero.
175      * <p>
176      *
177      * @return the type of match found for this charset.
178      *
179      * @draft ICU 3.4
180      * @provisional This API might change or be removed in a future release.
181      */

182     public int getMatchType() {
183 // TODO: create a list of enum-like constants for common combinations of types of matches.
184
return 0;
185     }
186
187     /**
188      * Get the name of the detected charset.
189      * The name will be one that can be used with other APIs on the
190      * platform that accept charset names. It is the "Canonical name"
191      * as defined by the class java.nio.charset.Charset; for
192      * charsets that are registered with the IANA charset registry,
193      * this is the MIME-preferred registerd name.
194      *
195      * @see java.nio.charset.Charset
196      * @see java.io.InputStreamReader
197      *
198      * @return The name of the charset.
199      *
200      * @draft ICU 3.4
201      * @provisional This API might change or be removed in a future release.
202      */

203     public String JavaDoc getName() {
204         return fRecognizer.getName();
205     }
206     
207     /**
208      * Get the ISO code for the language of the detected charset.
209      *
210      * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
211      *
212      * @draft ICU 3.4
213      * @provisional This API might change or be removed in a future release.
214      */

215     public String JavaDoc getLanguage() {
216         return fRecognizer.getLanguage();
217     }
218     
219     
220     /**
221      * Compare to other CharsetMatch objects.
222      * Comparison is based on the match confidence value, which
223      * allows CharsetDetector.detectAll() to order its results.
224      *
225      * @param o the CharsetMatch object to compare against.
226      * @return a negative integer, zero, or a positive integer as the
227      * confidence level of this CharsetMatch
228      * is less than, equal to, or greater than that of
229      * the argument.
230      * @throws ClassCastException if the argument is not a CharsetMatch.
231      * @draft ICU 3.4
232      * @provisional This API might change or be removed in a future release.
233      */

234     public int compareTo (Object JavaDoc o) {
235         CharsetMatch other = (CharsetMatch)o;
236         int compareResult = 0;
237         if (this.fConfidence > other.fConfidence) {
238             compareResult = 1;
239         } else if (this.fConfidence < other.fConfidence) {
240             compareResult = -1;
241         }
242         return compareResult;
243     }
244     
245     /**
246      * Constructor. Implementation internal
247      *
248      * @internal
249      */

250     CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
251         fRecognizer = rec;
252         fConfidence = conf;
253         
254         // The references to the original aplication input data must be copied out
255
// of the charset recognizer to here, in case the application resets the
256
// recognizer before using this CharsetMatch.
257
if (det.fInputStream == null) {
258             // We only want the existing input byte data if it came straight from the user,
259
// not if is just the head of a stream.
260
fRawInput = det.fRawInput;
261             fRawLength = det.fRawLength;
262         }
263         fInputStream = det.fInputStream;
264     }
265
266     
267     //
268
// Private Data
269
//
270
private int fConfidence;
271     private CharsetRecognizer fRecognizer;
272     private byte[] fRawInput = null; // Original, untouched input bytes.
273
// If user gave us a byte array, this is it.
274
private int fRawLength; // Length of data in fRawInput array.
275

276     private InputStream JavaDoc fInputStream = null; // User's input stream, or null if the user
277
// gave us a byte array.
278
}
279
Popular Tags