package com.ibm.icu.text;

import java.util.Arrays;

/**
 * Base class for recognizers of multi-byte character sets.
 *
 * <p>Each concrete subclass supplies a {@link #nextChar} implementation that
 * splits the detector's raw input into characters for one particular encoding.
 * The shared {@link #match(CharsetDetector, int[])} routine then turns simple
 * statistics over those characters into a confidence value.
 */
abstract class CharsetRecog_mbcs extends CharsetRecognizer {

    /**
     * Returns the name of the charset handled by this recognizer.
     *
     * @return the charset name
     */
    abstract String getName();

    /**
     * Scans the detector's input with {@link #nextChar} and computes a
     * confidence value for this charset.
     *
     * <p>The result is determined as follows:
     * <ul>
     *   <li>0 if the scan is abandoned because there are at least two bad
     *       characters and {@code badCharCount * 5 >= doubleByteCharCount};</li>
     *   <li>10 if there are at most 10 multi-byte characters and no bad ones;</li>
     *   <li>0 if {@code doubleByteCharCount < 20 * badCharCount};</li>
     *   <li>without a common-character table:
     *       {@code 30 + doubleByteCharCount - 20 * badCharCount}, capped at 100;</li>
     *   <li>with a common-character table: a logarithmic scaling of the number
     *       of common characters seen, capped at 100.</li>
     * </ul>
     *
     * @param det         the detector holding the raw input bytes
     * @param commonChars sorted table of character values considered common in
     *                    this charset (it is searched with
     *                    {@link Arrays#binarySearch(int[], int)}), or
     *                    {@code null} if no table is available
     * @return the confidence value, between 0 and 100
     */
    int match(CharsetDetector det, int[] commonChars) {
        int doubleByteCharCount = 0;
        int commonCharCount = 0;
        int badCharCount = 0;
        int confidence = 0;
        iteratedChar iter = new iteratedChar();

        detectBlock: {
            for (iter.reset(); nextChar(iter, det);) {
                if (iter.error) {
                    badCharCount++;
                } else if (iter.charValue > 0xff) {
                    // Only characters wider than one byte feed the statistics.
                    doubleByteCharCount++;
                    if (commonChars != null
                            && Arrays.binarySearch(commonChars, iter.charValue) >= 0) {
                        commonCharCount++;
                    }
                }
                if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
                    // Too many errors relative to good multi-byte characters:
                    // give up early and leave the confidence at 0.
                    break detectBlock;
                }
            }

            if (doubleByteCharCount <= 10 && badCharCount == 0) {
                // Very little multi-byte data and nothing illegal: low, non-zero confidence.
                confidence = 10;
                break detectBlock;
            }

            if (doubleByteCharCount < 20 * badCharCount) {
                // Too many bad characters for the amount of multi-byte data.
                confidence = 0;
                break detectBlock;
            }

            if (commonChars == null) {
                // No frequency table: base the confidence on the raw counts only.
                confidence = 30 + doubleByteCharCount - 20 * badCharCount;
                if (confidence > 100) {
                    confidence = 100;
                }
            } else {
                // Scale the common-character count logarithmically into 10..100.
                double maxVal = Math.log((float) doubleByteCharCount / 4);
                double scaleFactor = 90.0 / maxVal;
                confidence = (int) (Math.log(commonCharCount + 1) * scaleFactor + 10);
                confidence = Math.min(confidence, 100);
            }
        }
        return confidence;
    }

    /**
     * Mutable cursor over the detector's raw input, filled in by
     * {@link CharsetRecog_mbcs#nextChar} for each character it extracts.
     */
    static class iteratedChar {
        /** Value of the current character, built by concatenating its bytes. */
        int charValue = 0;
        /** Input index at which the current character starts. */
        int index = 0;
        /** Input index of the next byte to be read. */
        int nextIndex = 0;
        /** Set when the current character is not legal in the charset being tested. */
        boolean error = false;
        /** Set once the end of the input has been reached. */
        boolean done = false;

        /** Resets the cursor so that iteration starts again at the first input byte. */
        void reset() {
            charValue = 0;
            index = -1;
            nextIndex = 0;
            error = false;
            done = false;
        }

        /**
         * Reads the next input byte and advances the cursor.
         *
         * @param det the detector holding the raw input
         * @return the byte as an unsigned value (0..255), or -1 if the input is
         *         exhausted, in which case {@link #done} is also set
         */
        int nextByte(CharsetDetector det) {
            if (nextIndex >= det.fRawLength) {
                done = true;
                return -1;
            }
            int byteValue = (int) det.fRawInput[nextIndex++] & 0x00ff;
            return byteValue;
        }
    }

    /**
     * Extracts the next character from the detector's input according to the
     * rules of the charset being tested.
     *
     * @param it  cursor to update with the character's value, start index and
     *            error state
     * @param det the detector holding the raw input
     * @return {@code true} if a character was produced, {@code false} if the
     *         input was exhausted
     */
    abstract boolean nextChar(iteratedChar it, CharsetDetector det);

    /** Recognizer for Shift_JIS. */
    static class CharsetRecog_sjis extends CharsetRecog_mbcs {
        static int[] commonChars =
            {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
             0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
             0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
             0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
             0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
             0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};

        /**
         * Reads one character: lead bytes up to 0x7f and in 0xa1..0xdf stand
         * alone, every other lead byte is combined with one trail byte.
         */
        boolean nextChar(iteratedChar it, CharsetDetector det) {
            it.index = it.nextIndex;
            it.error = false;
            int firstByte;
            firstByte = it.charValue = it.nextByte(det);
            if (firstByte < 0) {
                return false;
            }

            if (firstByte <= 0x7f || (firstByte > 0xa0 && firstByte <= 0xdf)) {
                // Single-byte character.
                return true;
            }

            int secondByte = it.nextByte(det);
            if (secondByte < 0) {
                // Input ended in the middle of a two-byte character.
                return false;
            }
            it.charValue = (firstByte << 8) | secondByte;
            if (!((secondByte >= 0x40 && secondByte <= 0x7f)
                    || (secondByte >= 0x80 && secondByte <= 0xff))) {
                // Trail byte outside the accepted ranges.
                it.error = true;
            }
            return true;
        }

        /** Computes the confidence using this recognizer's common-character table. */
        int match(CharsetDetector det) {
            return match(det, commonChars);
        }

        String getName() {
            return "Shift_JIS";
        }

        /** Returns the language string reported for this charset. */
        public String getLanguage() {
            return "ja";
        }
    }

    /** Recognizer for Big5. */
    static class CharsetRecog_big5 extends CharsetRecog_mbcs {
        static int[] commonChars =
            {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
             0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
             0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
             0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
             0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
             0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
             0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
             0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
             0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
             0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};

        /**
         * Reads one character: lead bytes up to 0x7f and the byte 0xff stand
         * alone, every other lead byte is combined with one trail byte.
         */
        boolean nextChar(iteratedChar it, CharsetDetector det) {
            it.index = it.nextIndex;
            it.error = false;
            int firstByte;
            firstByte = it.charValue = it.nextByte(det);
            if (firstByte < 0) {
                return false;
            }

            if (firstByte <= 0x7f || firstByte == 0xff) {
                // Single-byte character.
                return true;
            }

            int secondByte = it.nextByte(det);
            if (secondByte < 0) {
                // Input ended in the middle of a two-byte character.
                return false;
            }
            it.charValue = (it.charValue << 8) | secondByte;

            if (secondByte < 0x40 || secondByte == 0x7f || secondByte == 0xff) {
                // Trail byte outside the accepted values.
                it.error = true;
            }
            return true;
        }

        /** Computes the confidence using this recognizer's common-character table. */
        int match(CharsetDetector det) {
            return match(det, commonChars);
        }

        String getName() {
            return "Big5";
        }

        /** Returns the language string reported for this charset. */
        public String getLanguage() {
            return "zh";
        }
    }

    /**
     * Shared character iteration for the EUC recognizers; the concrete
     * subclasses only contribute a name, a language and a common-character
     * table.
     */
    abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {

        /**
         * Reads one character of one, two or three bytes, chosen by the lead
         * byte: up to 0x8d is a single byte, 0x8e and 0xa1..0xfe take one more
         * byte, 0x8f takes two more bytes. A following byte below 0xa1 marks
         * the character as an error.
         */
        boolean nextChar(iteratedChar it, CharsetDetector det) {
            it.index = it.nextIndex;
            it.error = false;
            int firstByte = 0;
            int secondByte = 0;
            int thirdByte = 0;

            buildChar: {
                firstByte = it.charValue = it.nextByte(det);
                if (firstByte < 0) {
                    // Ran off the end of the input data.
                    it.done = true;
                    break buildChar;
                }
                if (firstByte <= 0x8d) {
                    // Single-byte character.
                    break buildChar;
                }

                secondByte = it.nextByte(det);
                it.charValue = (it.charValue << 8) | secondByte;

                if (firstByte >= 0xA1 && firstByte <= 0xfe) {
                    // Two-byte character.
                    if (secondByte < 0xa1) {
                        it.error = true;
                    }
                    break buildChar;
                }
                if (firstByte == 0x8e) {
                    // Lead byte 0x8e is handled as a two-byte character here.
                    if (secondByte < 0xa1) {
                        it.error = true;
                    }
                    break buildChar;
                }

                if (firstByte == 0x8f) {
                    // Lead byte 0x8f introduces a three-byte character.
                    thirdByte = it.nextByte(det);
                    it.charValue = (it.charValue << 8) | thirdByte;
                    if (thirdByte < 0xa1) {
                        it.error = true;
                    }
                }
            }

            // nextByte() sets 'done' whenever it runs out of input, so a
            // truncated trailing character also ends the iteration.
            return (it.done == false);
        }

        /** Recognizer for EUC-JP. */
        static class CharsetRecog_euc_jp extends CharsetRecog_euc {
            static int[] commonChars =
                {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
                 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
                 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
                 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
                 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
                 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
                 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
                 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
                 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
                 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};

            String getName() {
                return "EUC-JP";
            }

            /** Computes the confidence using this recognizer's common-character table. */
            int match(CharsetDetector det) {
                return match(det, commonChars);
            }

            /** Returns the language string reported for this charset. */
            public String getLanguage() {
                return "ja";
            }
        }

        /** Recognizer for EUC-KR. */
        static class CharsetRecog_euc_kr extends CharsetRecog_euc {
            static int[] commonChars =
                {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
                 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
                 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
                 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
                 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
                 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
                 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
                 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
                 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
                 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};

            String getName() {
                return "EUC-KR";
            }

            /** Computes the confidence using this recognizer's common-character table. */
            int match(CharsetDetector det) {
                return match(det, commonChars);
            }

            /** Returns the language string reported for this charset. */
            public String getLanguage() {
                return "ko";
            }
        }
    }

    /** Recognizer for GB18030. */
    static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {

        /**
         * Reads one character of one, two or four bytes.
         *
         * <p>A lead byte up to 0x80 stands alone. A lead byte in 0x81..0xFE
         * followed by a byte in 0x40..0x7E or 0x80..0xFE forms a two-byte
         * character. A lead byte in 0x81..0xFE followed by 0x30..0x39, then
         * 0x81..0xFE, then 0x30..0x39 forms a four-byte character. Any other
         * sequence after such a lead byte is flagged as an error.
         */
        boolean nextChar(iteratedChar it, CharsetDetector det) {
            it.index = it.nextIndex;
            it.error = false;
            int firstByte = 0;
            int secondByte = 0;
            int thirdByte = 0;
            int fourthByte = 0;

            buildChar: {
                firstByte = it.charValue = it.nextByte(det);

                if (firstByte < 0) {
                    // Ran off the end of the input data.
                    it.done = true;
                    break buildChar;
                }

                if (firstByte <= 0x80) {
                    // Single-byte character.
                    break buildChar;
                }

                secondByte = it.nextByte(det);
                it.charValue = (it.charValue << 8) | secondByte;

                if (firstByte >= 0x81 && firstByte <= 0xFE) {
                    // Two-byte character. The lower bound of the second range
                    // was previously the decimal literal 80 (== 0x50), which
                    // made the test accept 0x7F as a trail byte; the intended
                    // bound is 0x80.
                    if ((secondByte >= 0x40 && secondByte <= 0x7E)
                            || (secondByte >= 0x80 && secondByte <= 0xFE)) {
                        break buildChar;
                    }

                    // Four-byte character.
                    if (secondByte >= 0x30 && secondByte <= 0x39) {
                        thirdByte = it.nextByte(det);

                        if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
                            fourthByte = it.nextByte(det);

                            if (fourthByte >= 0x30 && fourthByte <= 0x39) {
                                it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
                                break buildChar;
                            }
                        }
                    }

                    it.error = true;
                    break buildChar;
                }
            }

            // nextByte() sets 'done' whenever it runs out of input, so a
            // truncated trailing character also ends the iteration.
            return (it.done == false);
        }

        static int[] commonChars =
            {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
             0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
             0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
             0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
             0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
             0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
             0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
             0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
             0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
             0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};

        String getName() {
            return "GB18030";
        }

        /** Computes the confidence using this recognizer's common-character table. */
        int match(CharsetDetector det) {
            return match(det, commonChars);
        }

        /** Returns the language string reported for this charset. */
        public String getLanguage() {
            return "zh";
        }
    }
}