KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > CharsetRecog_UTF8


1 /**
2 *******************************************************************************
3 * Copyright (C) 2005, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
6 */

7 package com.ibm.icu.text;
8
9 /**
10  * Charset recognizer for UTF-8
11  *
12  * @internal
13  */

14 class CharsetRecog_UTF8 extends CharsetRecognizer {
15
16     String JavaDoc getName() {
17         return "UTF-8";
18     }
19
20     /* (non-Javadoc)
21      * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
22      */

23     int match(CharsetDetector det) {
24         boolean hasBOM = false;
25         int numValid = 0;
26         int numInvalid = 0;
27         byte input[] = det.fRawInput;
28         int i;
29         int trailBytes = 0;
30         int confidence;
31         
32         if (det.fRawLength >= 3 &&
33                 input[0]==0xef && input[1]==0xbb & input[2]==0xbf) {
34             hasBOM = true;
35         }
36         
37         // Scan for multi-byte sequences
38
for (i=0; i<det.fRawLength; i++) {
39             int b = input[i];
40             if ((b & 0x80) == 0) {
41                 continue; // ASCII
42
}
43             
44             // Hi bit on char found. Figure out how long the sequence should be
45
if ((b & 0x0e0) == 0x0c0) {
46                 trailBytes = 1;
47             } else if ((b & 0x0f0) == 0x0e0) {
48                 trailBytes = 2;
49             } else if ((b & 0x0f8) == 0xf0) {
50                 trailBytes = 3;
51             } else {
52                 numInvalid++;
53                 if (numInvalid > 5) {
54                     break;
55                 }
56                 trailBytes = 0;
57             }
58                 
59             // Verify that we've got the right number of trail bytes in the sequence
60
for (;;) {
61                 i++;
62                 if (i>=det.fRawLength) {
63                     break;
64                 }
65                 b = input[i];
66                 if ((b & 0xc0) != 0x080) {
67                     numInvalid++;
68                     break;
69                 }
70                 if (--trailBytes == 0) {
71                     numValid++;
72                     break;
73                 }
74             }
75                         
76         }
77         
78         // Cook up some sort of confidence score, based on presense of a BOM
79
// and the existence of valid and/or invalid multi-byte sequences.
80
confidence = 0;
81         if (hasBOM && numInvalid==0) {
82             confidence = 100;
83         } else if (hasBOM && numValid > numInvalid*10) {
84             confidence = 80;
85         } else if (numValid > 3 && numInvalid == 0) {
86             confidence = 100;
87         } else if (numValid > 0 && numInvalid == 0) {
88             confidence = 80;
89         } else if (numValid == 0 && numInvalid == 0) {
90             // Plain ASCII.
91
confidence = 10;
92         } else if (numValid > numInvalid*10) {
93             // Probably corruput utf-8 data. Valid sequences aren't likely by chance.
94
confidence = 25;
95         }
96         return confidence;
97     }
98
99 }
100
Popular Tags