KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > CharsetRecog_2022


1 /*
2 *******************************************************************************
3 * Copyright (C) 2005, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
6 */

7 package com.ibm.icu.text;
8
9 /**
10  * class CharsetRecog_2022 part of the ICU charset detection implementation.
11  * This is a superclass for the individual detectors for
12  * each of the detectable members of the ISO 2022 family
13  * of encodings.
14  *
15  * The separate classes are nested within this class.
16  *
17  * @internal
18  */

19 abstract class CharsetRecog_2022 extends CharsetRecognizer {
20
21     
22     /**
23      * Matching function shared among the 2022 detectors JP, CN and KR
24      * Counts up the number of legal an unrecognized escape sequences in
25      * the sample of text, and computes a score based on the total number &
26      * the proportion that fit the encoding.
27      *
28      *
29      * @param text the byte buffer containing text to analyse
30      * @param textLen the size of the text in the byte.
31      * @param escapeSequences the byte escape sequences to test for.
32      * @return match quality, in the range of 0-100.
33      */

34     int match(byte [] text, int textLen, byte [][] escapeSequences) {
35         int i, j;
36         int escN;
37         int hits = 0;
38         int misses = 0;
39         int shifts = 0;
40         int quality;
41         scanInput:
42             for (i=0; i<textLen; i++) {
43                 if (text[i] == 0x1b) {
44                     checkEscapes:
45                         for (escN=0; escN<escapeSequences.length; escN++) {
46                             byte [] seq = escapeSequences[escN];
47                             
48                             for (j=1; j<seq.length; j++) {
49                                 if (seq[j] != text[i+j]) {
50                                     continue checkEscapes;
51                                 }
52                             }
53                             
54                             hits++;
55                             i += seq.length-1;
56                             continue scanInput;
57                         }
58                 
59                         misses++;
60                 }
61                 
62                 if (text[i] == 0x0e || text[i] == 0x0f) {
63                     // Shift in/out
64
shifts++;
65                 }
66             }
67         
68         if (hits == 0) {
69             return 0;
70         }
71         
72         //
73
// Initial quality is based on relative proportion of recongized vs.
74
// unrecognized escape sequences.
75
// All good: quality = 100;
76
// half or less good: quality = 0;
77
// linear inbetween.
78
quality = (100*hits - 100*misses) / (hits + misses);
79         
80         // Back off quality if there were too few escape sequences seen.
81
// Include shifts in this computation, so that KR does not get penalized
82
// for having only a single Escape sequence, but many shifts.
83
if (hits+shifts < 5) {
84             quality -= (5-(hits+shifts))*10;
85         }
86         
87         if (quality < 0) {
88             quality = 0;
89         }
90         return quality;
91     }
92
93     
94  
95     
96     static class CharsetRecog_2022JP extends CharsetRecog_2022 {
97         private byte [] [] escapeSequences = {
98                 {0x1b, 0x24, 0x28, 0x43}, // KS X 1001:1992
99
{0x1b, 0x24, 0x28, 0x44}, // JIS X 212-1990
100
{0x1b, 0x24, 0x40}, // JIS C 6226-1978
101
{0x1b, 0x24, 0x41}, // GB 2312-80
102
{0x1b, 0x24, 0x42}, // JIS X 208-1983
103
{0x1b, 0x26, 0x40}, // JIS X 208 1990, 1997
104
{0x1b, 0x28, 0x42}, // ASCII
105
{0x1b, 0x28, 0x48}, // JIS-Roman
106
{0x1b, 0x28, 0x49}, // Half-width katakana
107
{0x1b, 0x28, 0x4a}, // JIS-Roman
108
{0x1b, 0x2e, 0x41}, // ISO 8859-1
109
{0x1b, 0x2e, 0x46} // ISO 8859-7
110
};
111         
112         String JavaDoc getName() {
113             return "ISO-2022-JP";
114         }
115         
116         int match(CharsetDetector det) {
117             return match(det.fInputBytes, det.fInputLen, escapeSequences);
118         }
119     }
120
121     static class CharsetRecog_2022KR extends CharsetRecog_2022 {
122         private byte [] [] escapeSequences = {
123                 {0x1b, 0x24, 0x29, 0x43}
124                  };
125         
126         String JavaDoc getName() {
127             return "ISO-2022-KR";
128         }
129         
130         int match(CharsetDetector det) {
131             return match(det.fInputBytes, det.fInputLen, escapeSequences);
132         }
133         
134     }
135
136     static class CharsetRecog_2022CN extends CharsetRecog_2022 {
137         private byte [] [] escapeSequences = {
138                 {0x1b, 0x24, 0x29, 0x41}, // GB 2312-80
139
{0x1b, 0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1
140
{0x1b, 0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2
141
{0x1b, 0x24, 0x29, 0x45}, // ISO-IR-165
142
{0x1b, 0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3
143
{0x1b, 0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4
144
{0x1b, 0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5
145
{0x1b, 0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6
146
{0x1b, 0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7
147
{0x1b, 0x4e}, // SS2
148
{0x1b, 0x4f}, // SS3
149
};
150         
151         String JavaDoc getName() {
152             return "ISO-2022-CN";
153         }
154         
155         
156         int match(CharsetDetector det) {
157             return match(det.fInputBytes, det.fInputLen, escapeSequences);
158         }
159     }
160     
161     }
162
163
Popular Tags