1 7 package com.ibm.icu.text; 8 9 import java.io.InputStream ; 10 import java.io.Reader ; 11 import java.io.IOException ; 12 import java.util.ArrayList ; 13 import java.util.Collections ; 14 import java.util.Arrays ; 15 16 17 38 public class CharsetDetector { 39 40 43 50 56 public CharsetDetector() { 57 } 58 59 76 public CharsetDetector setDeclaredEncoding(String encoding) { 77 fDeclaredEncoding = encoding; 78 return this; 79 } 80 81 91 public CharsetDetector setText(byte [] in) { 92 fRawInput = in; 93 fRawLength = in.length; 94 95 MungeInput(); 96 97 return this; 98 } 99 100 private static final int kBufSize = 8000; 101 102 118 119 public CharsetDetector setText(InputStream in) throws IOException { 120 fInputStream = in; 121 fInputStream.mark(kBufSize); 122 fRawInput = new byte[kBufSize]; fRawLength = 0; 126 int remainingLength = kBufSize; 127 while (remainingLength > 0 ) { 128 int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength); 130 if (bytesRead <= 0) { 131 break; 132 } 133 fRawLength += bytesRead; 134 remainingLength -= bytesRead; 135 } 136 fInputStream.reset(); 137 138 MungeInput(); return this; 140 } 141 142 143 163 public CharsetMatch detect() { 164 CharsetMatch matches[] = detectAll(); 169 170 if (matches == null || matches.length == 0) { 171 return null; 172 } 173 174 return matches[0]; 175 } 176 177 193 public CharsetMatch[] detectAll() { 194 CharsetRecognizer csr; 195 int i; 196 int detectResults; 197 int confidence; 198 ArrayList matches = new ArrayList (); 199 200 for (i=0; i<fCSRecognizers.size(); i++) { 203 csr = (CharsetRecognizer)fCSRecognizers.get(i); 204 detectResults = csr.match(this); 205 confidence = detectResults & 0x000000ff; 206 if (confidence > 0) { 207 CharsetMatch m = new CharsetMatch(this, csr, confidence); 208 matches.add(m); 209 } 210 } 211 Collections.sort(matches); Collections.reverse(matches); CharsetMatch [] resultArray = new CharsetMatch[matches.size()]; 214 resultArray = (CharsetMatch[]) matches.toArray(resultArray); 215 return resultArray; 216 } 217 218 219 242 public Reader getReader(InputStream in, String declaredEncoding) { 243 fDeclaredEncoding = declaredEncoding; 244 245 try { 246 setText(in); 247 248 CharsetMatch match = detect(); 249 250 if (match == null) { 251 return null; 252 } 253 254 return match.getReader(); 255 } catch (IOException e) { 256 return null; 257 } 258 } 259 260 277 public String getString(byte[] in, String declaredEncoding) 278 { 279 fDeclaredEncoding = declaredEncoding; 280 281 try { 282 setText(in); 283 284 CharsetMatch match = detect(); 285 286 if (match == null) { 287 return null; 288 } 289 290 return match.getString(-1); 291 } catch (IOException e) { 292 return null; 293 } 294 } 295 296 297 306 public static String [] getAllDetectableCharsets() { 307 return fCharsetNames; 308 } 309 310 320 public boolean inputFilterEnabled() 321 { 322 return fStripTags; 323 } 324 325 337 public boolean enableInputFilter(boolean filter) 338 { 339 boolean previous = fStripTags; 340 341 fStripTags = filter; 342 343 return previous; 344 } 345 346 352 private void MungeInput() { 353 int srci = 0; 354 int dsti = 0; 355 byte b; 356 boolean inMarkup = false; 357 int openTags = 0; 358 int badTags = 0; 359 360 if (fStripTags) { 367 for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) { 368 b = fRawInput[srci]; 369 if (b == (byte)'<') { 370 if (inMarkup) { 371 badTags++; 372 } 373 inMarkup = true; 374 openTags++; 375 } 376 377 if (! inMarkup) { 378 fInputBytes[dsti++] = b; 379 } 380 381 if (b == (byte)'>') { 382 inMarkup = false; 383 } 384 } 385 386 fInputLen = dsti; 387 } 388 389 if (openTags<5 || openTags/5 < badTags || 395 (fInputLen < 100 && fRawLength>600)) { 396 int limit = fRawLength; 397 398 if (limit > kBufSize) { 399 limit = kBufSize; 400 } 401 402 for (srci=0; srci<limit; srci++) { 403 fInputBytes[srci] = fRawInput[srci]; 404 } 405 fInputLen = srci; 406 } 407 408 Arrays.fill(fByteStats, (short)0); 413 for (srci=0; srci<fInputLen; srci++) { 414 int val = fInputBytes[srci] & 0x00ff; 415 fByteStats[val]++; 416 } 417 418 fC1Bytes = false; 419 for (int i = 0x80; i <= 0x9F; i += 1) { 420 if (fByteStats[i] != 0) { 421 fC1Bytes = true; 422 break; 423 } 424 } 425 } 426 427 433 byte[] fInputBytes = new byte[kBufSize]; 436 int fInputLen; 438 short fByteStats[] = new short[256]; 442 boolean fC1Bytes = false; 444 445 String fDeclaredEncoding; 446 447 448 449 byte[] fRawInput; int fRawLength; 458 InputStream fInputStream; 461 boolean fStripTags = false; 463 464 465 470 private static ArrayList fCSRecognizers = createRecognizers(); 471 private static String [] fCharsetNames; 472 473 478 private static ArrayList createRecognizers() { 479 ArrayList recognizers = new ArrayList (); 480 481 recognizers.add(new CharsetRecog_UTF8()); 482 483 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE()); 484 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE()); 485 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE()); 486 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE()); 487 488 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis()); 489 recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP()); 490 recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN()); 491 recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR()); 492 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030()); 493 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp()); 494 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr()); 495 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5()); 496 497 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da()); 498 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de()); 499 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en()); 500 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es()); 501 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr()); 502 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it()); 503 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl()); 504 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no()); 505 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt()); 506 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv()); 507 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs()); 508 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu()); 509 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl()); 510 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro()); 511 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru()); 512 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar()); 513 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el()); 514 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he()); 515 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he()); 516 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251()); 517 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256()); 518 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R()); 519 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr()); 520 521 String [] charsetNames = new String [recognizers.size()]; 524 int out = 0; 525 526 for (int i = 0; i < recognizers.size(); i++) { 527 String name = ((CharsetRecognizer)recognizers.get(i)).getName(); 528 529 if (out == 0 || ! name.equals(charsetNames[out - 1])) { 530 charsetNames[out++] = name; 531 } 532 } 533 534 fCharsetNames = new String [out]; 535 System.arraycopy(charsetNames, 0, fCharsetNames, 0, out); 536 537 return recognizers; 538 } 539 } 540 | Popular Tags |