KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > CharsetRecog_Unicode


/*
 *******************************************************************************
 * Copyright (C) 1996-2006, International Business Machines Corporation and *
 * others. All Rights Reserved. *
 *******************************************************************************
 *
 */


package com.ibm.icu.text;

/**
 * This class matches UTF-16 and UTF-32, both big- and little-endian. The
 * BOM will be used if it is present.
 *
 * @internal
 */

17 abstract class CharsetRecog_Unicode extends CharsetRecognizer {
18
19     /* (non-Javadoc)
20      * @see com.ibm.icu.text.CharsetRecognizer#getName()
21      */

22     abstract String JavaDoc getName();
23
24     /* (non-Javadoc)
25      * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
26      */

27     abstract int match(CharsetDetector det);
28     
29     static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode
30     {
31         String JavaDoc getName()
32         {
33             return "UTF-16BE";
34         }
35         
36         int match(CharsetDetector det)
37         {
38             byte[] input = det.fRawInput;
39             
40             if ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF) {
41                 return 100;
42             }
43             
44             // TODO: Do some statastics to check for unsigned UTF-16BE
45
return 0;
46         }
47     }
48     
49     static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode
50     {
51         String JavaDoc getName()
52         {
53             return "UTF-16LE";
54         }
55         
56         int match(CharsetDetector det)
57         {
58             byte[] input = det.fRawInput;
59             
60             if ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) {
61                 return 100;
62             }
63             
64             // TODO: Do some statastics to check for unsigned UTF-16LE
65
return 0;
66         }
67     }
68     
69     static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode
70     {
71         abstract int getChar(byte[] input, int index);
72         
73         abstract String JavaDoc getName();
74         
75         int match(CharsetDetector det)
76         {
77             byte[] input = det.fRawInput;
78             int limit = (det.fRawLength / 4) * 4;
79             int numValid = 0;
80             int numInvalid = 0;
81             boolean hasBOM = false;
82             int confidence = 0;
83             
84             if (getChar(input, 0) == 0x0000FEFF) {
85                 hasBOM = true;
86             }
87             
88             for(int i = 0; i < limit; i += 4) {
89                 int ch = getChar(input, i);
90                 
91                 if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
92                     numInvalid += 1;
93                 } else {
94                     numValid += 1;
95                 }
96             }
97             
98             
99             // Cook up some sort of confidence score, based on presense of a BOM
100
// and the existence of valid and/or invalid multi-byte sequences.
101
if (hasBOM && numInvalid==0) {
102                 confidence = 100;
103             } else if (hasBOM && numValid > numInvalid*10) {
104                 confidence = 80;
105             } else if (numValid > 3 && numInvalid == 0) {
106                 confidence = 100;
107             } else if (numValid > 0 && numInvalid == 0) {
108                 confidence = 80;
109             } else if (numValid > numInvalid*10) {
110                 // Probably corruput UTF-32BE data. Valid sequences aren't likely by chance.
111
confidence = 25;
112             }
113             
114             return confidence;
115         }
116     }
117     
118     static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32
119     {
120         int getChar(byte[] input, int index)
121         {
122             return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
123                    (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF);
124         }
125         
126         String JavaDoc getName()
127         {
128             return "UTF-32BE";
129         }
130     }
131
132     
133     static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32
134     {
135         int getChar(byte[] input, int index)
136         {
137             return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
138                    (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF);
139         }
140         
141         String JavaDoc getName()
142         {
143             return "UTF-32LE";
144         }
145     }
146 }
147
Popular Tags