CharsetRecog_Unicode


1   /*
2    *******************************************************************************
3    * Copyright (C) 1996-2006, International Business Machines Corporation and    *
4    * others. All Rights Reserved.                                                *
5    *******************************************************************************
6    *
7    */
8   
9   package com.ibm.icu.text;
10  
11  /**
12   * This class matches UTF-16 and UTF-32, both big- and little-endian. The
13   * BOM will be used if it is present.
14   * 
15   * @internal
16   */
17  abstract class CharsetRecog_Unicode extends CharsetRecognizer {
18  
19      /* (non-Javadoc)
20       * @see com.ibm.icu.text.CharsetRecognizer#getName()
21       */
22      abstract String   getName();
23  
24      /* (non-Javadoc)
25       * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
26       */
27      abstract int match(CharsetDetector det);
28      
29      static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode
30      {
31          String   getName()
32          {
33              return "UTF-16BE";
34          }
35          
36          int match(CharsetDetector det)
37          {
38              byte[] input = det.fRawInput;
39              
40              if ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF) {
41                  return 100;
42              }
43              
44              // TODO: Do some statastics to check for unsigned UTF-16BE
45              return 0;
46          }
47      }
48      
49      static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode
50      {
51          String   getName()
52          {
53              return "UTF-16LE";
54          }
55          
56          int match(CharsetDetector det)
57          {
58              byte[] input = det.fRawInput;
59              
60              if ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) {
61                  return 100;
62              }
63              
64              // TODO: Do some statastics to check for unsigned UTF-16LE
65              return 0;
66          }
67      }
68      
69      static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode
70      {
71          abstract int getChar(byte[] input, int index);
72          
73          abstract String   getName();
74          
75          int match(CharsetDetector det)
76          {
77              byte[] input   = det.fRawInput;
78              int limit      = (det.fRawLength / 4) * 4;
79              int numValid   = 0;
80              int numInvalid = 0;
81              boolean hasBOM = false;
82              int confidence = 0;
83              
84              if (getChar(input, 0) == 0x0000FEFF) {
85                  hasBOM = true;
86              }
87              
88              for(int i = 0; i < limit; i += 4) {
89                  int ch = getChar(input, i);
90                  
91                  if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
92                      numInvalid += 1;
93                  } else {
94                      numValid += 1;
95                  }
96              }
97              
98              
99              // Cook up some sort of confidence score, based on presense of a BOM
100             //    and the existence of valid and/or invalid multi-byte sequences.
101             if (hasBOM && numInvalid==0) {
102                 confidence = 100;
103             } else if (hasBOM && numValid > numInvalid*10) {
104                 confidence = 80;
105             } else if (numValid > 3 && numInvalid == 0) {
106                 confidence = 100;            
107             } else if (numValid > 0 && numInvalid == 0) {
108                 confidence = 80;
109             } else if (numValid > numInvalid*10) {
110                 // Probably corruput UTF-32BE data.  Valid sequences aren't likely by chance.
111                 confidence = 25;
112             }
113             
114             return confidence;
115         }
116     }
117     
118     static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32
119     {
120         int getChar(byte[] input, int index)
121         {
122             return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
123                    (input[index + 2] & 0xFF) <<  8 | (input[index + 3] & 0xFF);
124         }
125         
126         String   getName()
127         {
128             return "UTF-32BE";
129         }
130     }
131 
132     
133     static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32
134     {
135         int getChar(byte[] input, int index)
136         {
137             return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
138                    (input[index + 1] & 0xFF) <<  8 | (input[index + 0] & 0xFF);
139         }
140         
141         String   getName()
142         {
143             return "UTF-32LE";
144         }
145     }
146 }
147
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags