CharsetRecog_mbcs


1   /*
2    ****************************************************************************
3    * Copyright (C) 2005-2006, International Business Machines Corporation and *
4    * others. All Rights Reserved.                                             *
5    ****************************************************************************
6    *
7    */
8   package com.ibm.icu.text;
9   
10  import java.util.Arrays  ;
11  
/**
 * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
 *                   Match is determined mostly by the input data adhering to the
 *                   encoding scheme for the charset, and, optionally,
 *                   frequency-of-occurrence of characters.
 * <p>
 *                   Instances of this class are singletons, one per encoding
 *                   being recognized.  They are created in the main
 *                   CharsetDetector class and kept in the global list of available
 *                   encodings to be checked.  The specific encoding being recognized
 *                   is determined by subclass.
 *
 * @internal
 */
26  abstract class CharsetRecog_mbcs extends CharsetRecognizer {
27  
28     /**
29       * Get the IANA name of this charset.
30       * @return the charset name.
31       */
32      abstract String        getName() ;
33      
34      
35      /**
36       * Test the match of this charset with the input text data
37       *      which is obtained via the CharsetDetector object.
38       * 
39       * @param det  The CharsetDetector, which contains the input text
40       *             to be checked for being in this charset.
41       * @return     Two values packed into one int  (Damn java, anyhow)
42       *             <br/>
43       *             bits 0-7:  the match confidence, ranging from 0-100
44       *             <br/>
45       *             bits 8-15: The match reason, an enum-like value.
46       */
47      int match(CharsetDetector det, int [] commonChars) {
48          int   singleByteCharCount = 0;
49          int   doubleByteCharCount = 0;
50          int   commonCharCount     = 0;
51          int   badCharCount        = 0;
52          int   totalCharCount      = 0;
53          int   confidence          = 0;
54          iteratedChar   iter       = new iteratedChar();
55          
56          detectBlock: {
57              for (iter.reset(); nextChar(iter, det);) {
58                  totalCharCount++;
59                  if (iter.error) {
60                      badCharCount++; 
61                  } else {
62                      
63                      if (iter.charValue <= 0xff) {
64                          singleByteCharCount++;
65                      } else {
66                          doubleByteCharCount++;
67                          if (commonChars != null) {
68                              if (Arrays.binarySearch(commonChars, iter.charValue) >= 0) {
69                                  commonCharCount++;
70                              }
71                          }
72                      }
73                  }
74                  if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
75                      // Bail out early if the byte data is not matching the encoding scheme.
76                      break detectBlock;
77                  }
78              }
79              
80              if (doubleByteCharCount <= 10 && badCharCount== 0) {
81                  // Not many multi-byte chars.
82                  //   ASCII or ISO file?  It's probably not our encoding,
83                  //   but is not incompatible with our encoding, so don't give it a zero.
84                  confidence = 10;
85                  break detectBlock;
86              }
87              
88              //
89              //  No match if there are too many characters that don't fit the encoding scheme.
90              //    (should we have zero tolerance for these?)
91              //
92              if (doubleByteCharCount < 20*badCharCount) {
93                  confidence = 0;
94                  break detectBlock;
95              }
96              
97              if (commonChars == null) {
98                  // We have no statistics on frequently occuring characters.
99                  //  Assess confidence purely on having a reasonable number of
100                 //  multi-byte characters (the more the better
101                 confidence = 30 + doubleByteCharCount - 20*badCharCount;
102                 if (confidence > 100) {
103                     confidence = 100;
104                 }
105             }else {
106                 //
107                 // Frequency of occurence statistics exist.
108                 //
109                 double maxVal = Math.log((float)doubleByteCharCount / 4);
110                 double scaleFactor = 90.0 / maxVal;
111                 confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10);
112                 confidence = Math.min(confidence, 100);
113             }
114         }   // end of detectBlock:
115         
116         return confidence;
117     }
118     
119      // "Character"  iterated character class.
120      //    Recognizers for specific mbcs encodings make their "characters" available
121      //    by providing a nextChar() function that fills in an instance of iteratedChar
122      //    with the next char from the input.
123      //    The returned characters are not converted to Unicode, but remain as the raw
124      //    bytes (concatenated into an int) from the codepage data.
125      //
126      //  For Asian charsets, use the raw input rather than the input that has been
127      //   stripped of markup.  Detection only considers multi-byte chars, effectively
128      //   stripping markup anyway, and double byte chars do occur in markup too.
129      //
130      static class iteratedChar {
131          int             charValue = 0;             // 1-4 bytes from the raw input data
132          int             index     = 0;
133          int             nextIndex = 0;
134          boolean         error     = false;
135          boolean         done      = false;
136          
137          void reset() {
138              charValue = 0;
139              index     = -1;
140              nextIndex = 0;
141              error     = false;
142              done      = false;
143          }
144          
145          int nextByte(CharsetDetector det) {
146              if (nextIndex >= det.fRawLength) {
147                  done = true;
148                  return -1;
149              }
150              int byteValue = (int)det.fRawInput[nextIndex++] & 0x00ff;
151              return byteValue;
152          }       
153      }
154      
155      /**
156       * Get the next character (however many bytes it is) from the input data
157       *    Subclasses for specific charset encodings must implement this function
158       *    to get characters according to the rules of their encoding scheme.
159       * 
160       *  This function is not a method of class iteratedChar only because
161       *   that would require a lot of extra derived classes, which is awkward.
162       * @param it  The iteratedChar "struct" into which the returned char is placed.
163       * @param det The charset detector, which is needed to get at the input byte data
164       *            being iterated over.
165       * @return    True if a character was returned, false at end of input.
166       */
167      abstract boolean nextChar(iteratedChar it, CharsetDetector det);
168      
169 
170 
171      
172      
173      /**
174       *   Shift-JIS charset recognizer.   
175       *
176       */
177      static class CharsetRecog_sjis extends CharsetRecog_mbcs {
178          static int [] commonChars = 
179              // TODO:  This set of data comes from the character frequency-
180              //        of-occurence analysis tool.  The data needs to be moved
181              //        into a resource and loaded from there.
182             {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, 
183              0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, 
184              0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, 
185              0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, 
186              0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, 
187              0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
188          
189          boolean nextChar(iteratedChar it, CharsetDetector det) {
190              it.index = it.nextIndex;
191              it.error = false;
192              int firstByte;
193              firstByte = it.charValue = it.nextByte(det);
194              if (firstByte < 0) {
195                  return false;
196              }
197              
198              if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
199                  return true;
200              }
201              
202              int secondByte = it.nextByte(det);
203              if (secondByte < 0)  {
204                  return false;          
205              }
206              it.charValue = (firstByte << 8) | secondByte;
207              if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
208                  // Illegal second byte value.
209                  it.error = true;
210              }
211              return true;
212          }
213          
214          int match(CharsetDetector det) {
215              return match(det, commonChars);
216          }
217          
218          String   getName() {
219              return "Shift_JIS";
220          }
221          
222          public String   getLanguage()
223          {
224              return "ja";
225          }
226 
227          
228      }
229      
230      
231      /**
232       *   Big5 charset recognizer.   
233       *
234       */
235      static class CharsetRecog_big5 extends CharsetRecog_mbcs {
236          static int [] commonChars = 
237              // TODO:  This set of data comes from the character frequency-
238              //        of-occurence analysis tool.  The data needs to be moved
239              //        into a resource and loaded from there.
240             {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, 
241              0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, 
242              0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, 
243              0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, 
244              0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, 
245              0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 
246              0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 
247              0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, 
248              0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, 
249              0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
250           
251          boolean nextChar(iteratedChar it, CharsetDetector det) {
252              it.index = it.nextIndex;
253              it.error = false;
254              int firstByte;
255              firstByte = it.charValue = it.nextByte(det);
256              if (firstByte < 0) {
257                  return false;
258              }
259              
260              if (firstByte <= 0x7f || firstByte==0xff) {
261                  // single byte character.
262                  return true;
263              }
264              
265              int secondByte = it.nextByte(det);
266              if (secondByte < 0)  {
267                  return false;          
268              }
269              it.charValue = (it.charValue << 8) | secondByte;
270 
271              if (secondByte < 0x40 ||
272                  secondByte ==0x7f ||
273                  secondByte == 0xff) {
274                      it.error = true;
275              }
276              return true;
277          }
278          
279          int match(CharsetDetector det) {
280              return match(det, commonChars);
281          }
282          
283          String   getName() {
284              return "Big5";
285          }
286          
287          
288          public String   getLanguage()
289          {
290              return "zh";
291          }
292      }
293      
294      
295      /**
296       *   EUC charset recognizers.  One abstract class that provides the common function
297       *             for getting the next character according to the EUC encoding scheme,
298       *             and nested derived classes for EUC_KR, EUC_JP, EUC_CN.   
299       *
300       */
301      abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
302          
303          /*
304           *  (non-Javadoc)
305           *  Get the next character value for EUC based encodings.
306           *  Character "value" is simply the raw bytes that make up the character
307           *     packed into an int.
308           */
309          boolean nextChar(iteratedChar it, CharsetDetector det) {
310              it.index = it.nextIndex;
311              it.error = false;
312              int firstByte  = 0;
313              int secondByte = 0;
314              int thirdByte  = 0;
315              int fourthByte = 0;
316              
317              buildChar: {
318                  firstByte = it.charValue = it.nextByte(det);                 
319                  if (firstByte < 0) {
320                      // Ran off the end of the input data
321                      it.done = true;
322                      break buildChar;
323                  }
324                  if (firstByte <= 0x8d) {
325                      // single byte char
326                      break buildChar;
327                  }
328                  
329                  secondByte = it.nextByte(det);
330                  it.charValue = (it.charValue << 8) | secondByte;
331                  
332                  if (firstByte >= 0xA1 && firstByte <= 0xfe) {
333                      // Two byte Char
334                      if (secondByte < 0xa1) {
335                          it.error = true;
336                      }
337                      break buildChar;
338                  }
339                  if (firstByte == 0x8e) {
340                      // Code Set 2.
341                      //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
342                      //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
343                      // We don't know which we've got.
344                      // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
345                      //   bytes will look like a well formed 2 byte char.  
346                      if (secondByte < 0xa1) {
347                          it.error = true;
348                      }
349                      break buildChar;                     
350                  }
351                  
352                  if (firstByte == 0x8f) {
353                      // Code set 3.
354                      // Three byte total char size, two bytes of actual char value.
355                      thirdByte    = it.nextByte(det);
356                      it.charValue = (it.charValue << 8) | thirdByte;
357                      if (thirdByte < 0xa1) {
358                          it.error = true;
359                      }
360                  }
361               }
362              
363              return (it.done == false);
364          }
365          
366          /**
367           * The charset recognize for EUC-JP.  A singleton instance of this class
368           *    is created and kept by the public CharsetDetector class
369           */
370          static class CharsetRecog_euc_jp extends CharsetRecog_euc {
371              static int [] commonChars = 
372                  // TODO:  This set of data comes from the character frequency-
373                  //        of-occurence analysis tool.  The data needs to be moved
374                  //        into a resource and loaded from there.
375                 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, 
376                  0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, 
377                  0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 
378                  0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, 
379                  0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, 
380                  0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 
381                  0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 
382                  0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, 
383                  0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, 
384                  0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};             
385              String   getName() {
386                  return "EUC-JP";
387              }
388              
389              int match(CharsetDetector det) {
390                  return match(det, commonChars);
391              }
392              
393              public String   getLanguage()
394              {
395                  return "ja";
396              }
397          }
398          
399          /**
400           * The charset recognize for EUC-KR.  A singleton instance of this class
401           *    is created and kept by the public CharsetDetector class
402           */
403          static class CharsetRecog_euc_kr extends CharsetRecog_euc {
404              static int [] commonChars = 
405                  // TODO:  This set of data comes from the character frequency-
406                  //        of-occurence analysis tool.  The data needs to be moved
407                  //        into a resource and loaded from there.
408                 {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, 
409                  0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, 
410                  0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, 
411                  0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, 
412                  0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, 
413                  0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, 
414                  0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 
415                  0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, 
416                  0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, 
417                  0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
418              
419              String   getName() {
420                  return "EUC-KR";
421              }
422              
423              int match(CharsetDetector det) {
424                  return match(det, commonChars);
425              }
426              
427              public String   getLanguage()
428              {
429                  return "ko";
430              }
431          }
432      }
433      
434      /**
435       * 
436       *   GB-18030 recognizer. Uses simplified Chinese statistics.   
437       *
438       */
439      static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
440          
441          /*
442           *  (non-Javadoc)
443           *  Get the next character value for EUC based encodings.
444           *  Character "value" is simply the raw bytes that make up the character
445           *     packed into an int.
446           */
447          boolean nextChar(iteratedChar it, CharsetDetector det) {
448              it.index = it.nextIndex;
449              it.error = false;
450              int firstByte  = 0;
451              int secondByte = 0;
452              int thirdByte  = 0;
453              int fourthByte = 0;
454              
455              buildChar: {
456                  firstByte = it.charValue = it.nextByte(det); 
457                  
458                  if (firstByte < 0) {
459                      // Ran off the end of the input data
460                      it.done = true;
461                      break buildChar;
462                  }
463                  
464                  if (firstByte <= 0x80) {
465                      // single byte char
466                      break buildChar;
467                  }
468                  
469                  secondByte = it.nextByte(det);
470                  it.charValue = (it.charValue << 8) | secondByte;
471                  
472                  if (firstByte >= 0x81 && firstByte <= 0xFE) {
473                      // Two byte Char
474                      if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <=0xFE)) {
475                          break buildChar;
476                      }
477                      
478                      // Four byte char
479                      if (secondByte >= 0x30 && secondByte <= 0x39) {
480                          thirdByte = it.nextByte(det);
481                          
482                          if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
483                              fourthByte = it.nextByte(det);
484                              
485                              if (fourthByte >= 0x30 && fourthByte <= 0x39) {
486                                  it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
487                                  break buildChar;
488                              }
489                          }
490                      }
491                      
492                      it.error = true;
493                      break buildChar;
494                  }
495              }
496                  
497              return (it.done == false);
498          }
499          
500          static int [] commonChars = 
501              // TODO:  This set of data comes from the character frequency-
502              //        of-occurence analysis tool.  The data needs to be moved
503              //        into a resource and loaded from there.
504             {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, 
505              0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, 
506              0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, 
507              0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, 
508              0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, 
509              0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 
510              0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, 
511              0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, 
512              0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, 
513              0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
514 
515          
516          String   getName() {
517              return "GB18030";
518          }
519          
520          int match(CharsetDetector det) {
521              return match(det, commonChars);
522          }
523          
524          public String   getLanguage()
525          {
526              return "zh";
527          }
528      }
529      
530      
531 }
532
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags