1 54 package org.w3c.tidy; 55 56 60 public final class EncodingUtils 61 { 62 63 66 public static final int UNICODE_BOM_BE = 0xFEFF; 67 68 71 public static final int UNICODE_BOM = UNICODE_BOM_BE; 72 73 76 public static final int UNICODE_BOM_LE = 0xFFFE; 77 78 81 public static final int UNICODE_BOM_UTF8 = 0xEFBBBF; 82 83 88 public static final int FSM_ASCII = 0; 89 90 93 public static final int FSM_ESC = 1; 94 95 98 public static final int FSM_ESCD = 2; 99 100 103 public static final int FSM_ESCDP = 3; 104 105 108 public static final int FSM_ESCP = 4; 109 110 113 public static final int FSM_NONASCII = 5; 114 115 118 public static final int MAX_UTF8_FROM_UCS4 = 0x10FFFF; 119 120 123 public static final int MAX_UTF16_FROM_UCS4 = 0x10FFFF; 124 125 128 public static final int LOW_UTF16_SURROGATE = 0xD800; 129 130 133 public static final int UTF16_SURROGATES_BEGIN = 0x10000; 134 135 138 public static final int UTF16_LOW_SURROGATE_BEGIN = 0xD800; 139 140 143 public static final int UTF16_LOW_SURROGATE_END = 0xDBFF; 144 145 148 public static final int UTF16_HIGH_SURROGATE_BEGIN = 0xDC00; 149 150 153 public static final int UTF16_HIGH_SURROGATE_END = 0xDFFF; 154 155 158 public static final int HIGH_UTF16_SURROGATE = 0xDFFF; 159 160 163 private static final int UTF8_BYTE_SWAP_NOT_A_CHAR = 0xFFFE; 164 165 168 private static final int UTF8_NOT_A_CHAR = 0xFFFF; 169 170 173 private static final int[] WIN2UNICODE = { 174 0x20AC, 175 0x0000, 176 0x201A, 177 0x0192, 178 0x201E, 179 0x2026, 180 0x2020, 181 0x2021, 182 0x02C6, 183 0x2030, 184 0x0160, 185 0x2039, 186 0x0152, 187 0x0000, 188 0x017D, 189 0x0000, 190 0x0000, 191 0x2018, 192 0x2019, 193 0x201C, 194 0x201D, 195 0x2022, 196 0x2013, 197 0x2014, 198 0x02DC, 199 0x2122, 200 0x0161, 201 0x203A, 202 0x0153, 203 0x0000, 204 0x017E, 205 0x0178}; 206 207 210 private static final int[] MAC2UNICODE = { 0x00C4, 213 0x00C5, 214 0x00C7, 215 0x00C9, 216 0x00D1, 217 0x00D6, 218 0x00DC, 219 0x00E1, 220 0x00E0, 221 0x00E2, 222 0x00E4, 223 0x00E3, 224 0x00E5, 225 0x00E7, 226 0x00E9, 227 0x00E8, 228 0x00EA, 229 0x00EB, 230 0x00ED, 231 0x00EC, 232 0x00EE, 233 0x00EF, 234 0x00F1, 235 0x00F3, 236 0x00F2, 237 0x00F4, 238 0x00F6, 239 0x00F5, 240 0x00FA, 241 0x00F9, 242 0x00FB, 243 0x00FC, 244 0x2020, 245 0x00B0, 246 0x00A2, 247 0x00A3, 248 0x00A7, 249 0x2022, 250 0x00B6, 251 0x00DF, 252 0x00AE, 253 0x00A9, 254 0x2122, 255 0x00B4, 256 0x00A8, 257 0x2260, 258 0x00C6, 259 0x00D8, 260 0x221E, 261 0x00B1, 262 0x2264, 263 0x2265, 264 0x00A5, 265 0x00B5, 266 0x2202, 267 0x2211, 268 0x220F, 270 0x03C0, 271 0x222B, 272 0x00AA, 273 0x00BA, 274 0x03A9, 275 0x00E6, 276 0x00F8, 277 0x00BF, 278 0x00A1, 279 0x00AC, 280 0x221A, 281 0x0192, 282 0x2248, 283 0x2206, 284 0x00AB, 285 0x00BB, 286 0x2026, 287 0x00A0, 288 0x00C0, 289 0x00C3, 290 0x00D5, 291 0x0152, 292 0x0153, 293 0x2013, 294 0x2014, 295 0x201C, 296 0x201D, 297 0x2018, 298 0x2019, 299 0x00F7, 300 0x25CA, 301 0x00FF, 303 0x0178, 304 0x2044, 305 0x20AC, 306 0x2039, 307 0x203A, 308 0xFB01, 309 0xFB02, 310 0x2021, 311 0x00B7, 312 0x201A, 313 0x201E, 314 0x2030, 315 0x00C2, 316 0x00CA, 317 0x00C1, 318 0x00CB, 319 0x00C8, 320 0x00CD, 321 0x00CE, 322 0x00CF, 323 0x00CC, 324 0x00D3, 325 0x00D4, 326 0xF8FF, 329 0x00D2, 330 0x00DA, 331 0x00DB, 332 0x00D9, 333 0x0131, 334 0x02C6, 335 0x02DC, 336 0x00AF, 337 0x02D8, 338 0x02D9, 339 0x02DA, 340 0x00B8, 341 0x02DD, 342 0x02DB, 343 0x02C7}; 344 345 349 private static final int[] SYMBOL2UNICODE = { 350 0x0000, 351 0x0001, 352 0x0002, 353 0x0003, 354 0x0004, 355 0x0005, 356 0x0006, 357 0x0007, 358 0x0008, 359 0x0009, 360 0x000A, 361 0x000B, 362 0x000C, 363 0x000D, 364 0x000E, 365 0x000F, 366 367 0x0010, 368 0x0011, 369 0x0012, 370 0x0013, 371 0x0014, 372 0x0015, 373 0x0016, 374 0x0017, 375 0x0018, 376 0x0019, 377 0x001A, 378 0x001B, 379 0x001C, 380 0x001D, 381 0x001E, 382 0x001F, 383 384 0x0020, 385 0x0021, 386 0x2200, 387 0x0023, 388 0x2203, 389 0x0025, 390 0x0026, 391 0x220D, 392 0x0028, 393 0x0029, 394 0x2217, 395 0x002B, 396 0x002C, 397 0x2212, 398 0x002E, 399 0x002F, 400 401 0x0030, 402 0x0031, 403 0x0032, 404 0x0033, 405 0x0034, 406 0x0035, 407 0x0036, 408 0x0037, 409 0x0038, 410 0x0039, 411 0x003A, 412 0x003B, 413 0x003C, 414 0x003D, 415 0x003E, 416 0x003F, 417 418 0x2245, 419 0x0391, 420 0x0392, 421 0x03A7, 422 0x0394, 423 0x0395, 424 0x03A6, 425 0x0393, 426 0x0397, 427 0x0399, 428 0x03D1, 429 0x039A, 430 0x039B, 431 0x039C, 432 0x039D, 433 0x039F, 434 435 0x03A0, 436 0x0398, 437 0x03A1, 438 0x03A3, 439 0x03A4, 440 0x03A5, 441 0x03C2, 442 0x03A9, 443 0x039E, 444 0x03A8, 445 0x0396, 446 0x005B, 447 0x2234, 448 0x005D, 449 0x22A5, 450 0x005F, 451 452 0x00AF, 453 0x03B1, 454 0x03B2, 455 0x03C7, 456 0x03B4, 457 0x03B5, 458 0x03C6, 459 0x03B3, 460 0x03B7, 461 0x03B9, 462 0x03D5, 463 0x03BA, 464 0x03BB, 465 0x03BC, 466 0x03BD, 467 0x03BF, 468 469 0x03C0, 470 0x03B8, 471 0x03C1, 472 0x03C3, 473 0x03C4, 474 0x03C5, 475 0x03D6, 476 0x03C9, 477 0x03BE, 478 0x03C8, 479 0x03B6, 480 0x007B, 481 0x007C, 482 0x007D, 483 0x223C, 484 0x003F, 485 486 0x0000, 487 0x0000, 488 0x0000, 489 0x0000, 490 0x0000, 491 0x0000, 492 0x0000, 493 0x0000, 494 0x0000, 495 0x0000, 496 0x0000, 497 0x0000, 498 0x0000, 499 0x0000, 500 0x0000, 501 0x0000, 502 503 0x0000, 504 0x0000, 505 0x0000, 506 0x0000, 507 0x0000, 508 0x0000, 509 0x0000, 510 0x0000, 511 0x0000, 512 0x0000, 513 0x0000, 514 0x0000, 515 0x0000, 516 0x0000, 517 0x0000, 518 0x0000, 519 520 0x00A0, 521 0x03D2, 522 0x2032, 523 0x2264, 524 0x2044, 525 0x221E, 526 0x0192, 527 0x2663, 528 0x2666, 529 0x2665, 530 0x2660, 531 0x2194, 532 0x2190, 533 0x2191, 534 0x2192, 535 0x2193, 536 537 0x00B0, 538 0x00B1, 539 0x2033, 540 0x2265, 541 0x00D7, 542 0x221D, 543 0x2202, 544 0x00B7, 545 0x00F7, 546 0x2260, 547 0x2261, 548 0x2248, 549 0x2026, 550 0x003F, 551 0x003F, 552 0x21B5, 553 554 0x2135, 555 0x2111, 556 0x211C, 557 0x2118, 558 0x2297, 559 0x2295, 560 0x2205, 561 0x2229, 562 0x222A, 563 0x2283, 564 0x2287, 565 0x2284, 566 0x2282, 567 0x2286, 568 0x2208, 569 0x2209, 570 571 0x2220, 572 0x2207, 573 0x00AE, 574 0x00A9, 575 0x2122, 576 0x220F, 577 0x221A, 578 0x22C5, 579 0x00AC, 580 0x2227, 581 0x2228, 582 0x21D4, 583 0x21D0, 584 0x21D1, 585 0x21D2, 586 0x21D3, 587 588 0x25CA, 589 0x2329, 590 0x00AE, 591 0x00A9, 592 0x2122, 593 0x2211, 594 0x003F, 595 0x003F, 596 0x003F, 597 0x003F, 598 0x003F, 599 0x003F, 600 0x003F, 601 0x003F, 602 0x003F, 603 0x003F, 604 605 0x20AC, 606 0x232A, 607 0x222B, 608 0x2320, 609 0x003F, 610 0x2321, 611 0x003F, 612 0x003F, 613 0x003F, 614 0x003F, 615 0x003F, 616 0x003F, 617 0x003F, 618 0x003F, 619 0x003F, 620 0x003F}; 621 622 625 private static final ValidUTF8Sequence[] VALID_UTF8 = { 626 new ValidUTF8Sequence(0x0000, 0x007F, 1, new char[]{0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), 627 new ValidUTF8Sequence(0x0080, 0x07FF, 2, new char[]{0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}), 628 new ValidUTF8Sequence(0x0800, 0x0FFF, 3, new char[]{0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}), 629 new ValidUTF8Sequence(0x1000, 0xFFFF, 3, new char[]{0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}), 630 new ValidUTF8Sequence(0x10000, 0x3FFFF, 4, new char[]{0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}), 631 new ValidUTF8Sequence(0x40000, 0xFFFFF, 4, new char[]{0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}), 632 new ValidUTF8Sequence(0x100000, 0x10FFFF, 4, new char[]{0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF})}; 633 634 637 private static final int NUM_UTF8_SEQUENCES = VALID_UTF8.length; 638 639 642 private static final int[] OFFSET_UTF8_SEQUENCES = {0, 1, 2, 4, NUM_UTF8_SEQUENCES}; 648 651 private EncodingUtils() 652 { 653 } 655 656 661 protected static int decodeWin1252(int c) 662 { 663 return WIN2UNICODE[c - 128]; 664 } 665 666 671 protected static int decodeMacRoman(int c) 672 { 673 if (127 < c) 674 { 675 c = MAC2UNICODE[c - 128]; 676 } 677 return c; 678 } 679 680 685 static int decodeSymbolFont(int c) 686 { 687 if (c > 255) 688 { 689 return c; 690 } 691 692 return SYMBOL2UNICODE[c]; 693 } 694 695 705 static boolean decodeUTF8BytesToChar(int[] c, int firstByte, byte[] successorBytes, GetBytes getter, int[] count, 706 int startInSuccessorBytesArray) 707 { 708 byte[] buf = new byte[10]; 709 710 int ch = 0; 711 int n = 0; 712 int i, bytes = 0; 713 boolean hasError = false; 714 715 if (successorBytes.length != 0) 716 { 717 buf = successorBytes; 718 } 719 720 if (firstByte == StreamIn.END_OF_STREAM) { 723 c[0] = firstByte; 725 count[0] = 1; 726 return false; 727 } 728 729 ch = TidyUtils.toUnsigned(firstByte); 731 if (ch <= 0x7F) { 733 n = ch; 734 bytes = 1; 735 } 736 else if ((ch & 0xE0) == 0xC0) 737 { 738 n = ch & 31; 739 bytes = 2; 740 } 741 else if ((ch & 0xF0) == 0xE0) 742 { 743 n = ch & 15; 744 bytes = 3; 745 } 746 else if ((ch & 0xF8) == 0xF0) 747 { 748 n = ch & 7; 749 bytes = 4; 750 } 751 else if ((ch & 0xFC) == 0xF8) 752 { 753 n = ch & 3; 754 bytes = 5; 755 hasError = true; 756 } 757 else if ((ch & 0xFE) == 0xFC) 758 { 759 n = ch & 1; 760 bytes = 6; 761 hasError = true; 762 } 763 else 764 { 765 n = ch; 767 bytes = 1; 768 hasError = true; 769 } 770 771 for (i = 1; i < bytes; ++i) 772 { 773 int[] tempCount = new int[1]; 775 if (getter != null && (bytes - i > 0)) 777 { 778 tempCount[0] = 1; int[] buftocopy = new int[]{buf[startInSuccessorBytesArray + i - 1]}; 780 781 getter.doGet(buftocopy, tempCount, false); 782 if (tempCount[0] <= 0) { 785 hasError = true; 786 bytes = i; 787 break; 788 } 789 } 790 791 if ((buf[startInSuccessorBytesArray + i - 1] & 0xC0) != 0x80) 792 { 793 hasError = true; 795 bytes = i; 796 if (getter != null) 797 { 798 int[] buftocopy = new int[]{buf[startInSuccessorBytesArray + i - 1]}; 799 tempCount[0] = 1; getter.doGet(buftocopy, tempCount, true); 801 } 802 break; 803 } 804 805 n = (n << 6) | (buf[startInSuccessorBytesArray + i - 1] & 0x3F); 806 } 807 808 if (!hasError && ((n == UTF8_BYTE_SWAP_NOT_A_CHAR) || (n == UTF8_NOT_A_CHAR))) 809 { 810 hasError = true; 811 } 812 813 if (!hasError && (n > MAX_UTF8_FROM_UCS4)) 814 { 815 hasError = true; 816 } 817 818 if (!hasError && (n >= UTF16_LOW_SURROGATE_BEGIN) && (n <= UTF16_HIGH_SURROGATE_END)) 819 { 820 hasError = true; 822 } 823 824 if (!hasError) 825 { 826 int lo = OFFSET_UTF8_SEQUENCES[bytes - 1]; 827 int hi = OFFSET_UTF8_SEQUENCES[bytes] - 1; 828 829 if ((n < VALID_UTF8[lo].lowChar) || (n > VALID_UTF8[hi].highChar)) 831 { 832 hasError = true; 833 } 834 else 835 { 836 hasError = true; 838 for (i = lo; i <= hi; i++) 839 { 840 int tempCount; 841 char theByte; 843 for (tempCount = 0; tempCount < bytes; tempCount++) 844 { 845 if (!TidyUtils.toBoolean(tempCount)) 846 { 847 theByte = (char) firstByte; 848 } 849 else 850 { 851 theByte = (char) buf[startInSuccessorBytesArray + tempCount - 1]; 852 } 853 if ((theByte >= VALID_UTF8[i].validBytes[(tempCount * 2)]) 854 && (theByte <= VALID_UTF8[i].validBytes[(tempCount * 2) + 1])) 855 { 856 hasError = false; 857 } 858 if (hasError) 859 { 860 break; 861 } 862 } 863 } 864 } 865 } 866 867 count[0] = bytes; 868 869 c[0] = n; 870 871 return hasError; 874 875 } 876 877 885 static boolean encodeCharToUTF8Bytes(int c, byte[] encodebuf, PutBytes putter, int[] count) 886 { 887 int bytes = 0; 888 889 byte[] buf = new byte[10]; 890 891 if (encodebuf != null) 892 { 893 buf = encodebuf; 894 } 895 896 boolean hasError = false; 897 898 if (c <= 0x7F) { 900 buf[0] = (byte) c; 901 bytes = 1; 902 } 903 else if (c <= 0x7FF) { 905 buf[0] = (byte) (0xC0 | (c >> 6)); 906 buf[1] = (byte) (0x80 | (c & 0x3F)); 907 bytes = 2; 908 } 909 else if (c <= 0xFFFF) { 911 buf[0] = (byte) (0xE0 | (c >> 12)); 912 buf[1] = (byte) (0x80 | ((c >> 6) & 0x3F)); 913 buf[2] = (byte) (0x80 | (c & 0x3F)); 914 bytes = 3; 915 if ((c == UTF8_BYTE_SWAP_NOT_A_CHAR) || (c == UTF8_NOT_A_CHAR)) 916 { 917 hasError = true; 918 } 919 else if ((c >= UTF16_LOW_SURROGATE_BEGIN) && (c <= UTF16_HIGH_SURROGATE_END)) 920 { 921 hasError = true; 923 } 924 } 925 else if (c <= 0x1FFFFF) { 927 buf[0] = (byte) (0xF0 | (c >> 18)); 928 buf[1] = (byte) (0x80 | ((c >> 12) & 0x3F)); 929 buf[2] = (byte) (0x80 | ((c >> 6) & 0x3F)); 930 buf[3] = (byte) (0x80 | (c & 0x3F)); 931 bytes = 4; 932 if (c > MAX_UTF8_FROM_UCS4) 933 { 934 hasError = true; 935 } 936 } 937 else if (c <= 0x3FFFFFF) { 939 buf[0] = (byte) (0xF8 | (c >> 24)); 940 buf[1] = (byte) (0x80 | (c >> 18)); 941 buf[2] = (byte) (0x80 | ((c >> 12) & 0x3F)); 942 buf[3] = (byte) (0x80 | ((c >> 6) & 0x3F)); 943 buf[4] = (byte) (0x80 | (c & 0x3F)); 944 bytes = 5; 945 hasError = true; 946 } 947 else if (c <= 0x7FFFFFFF) { 949 buf[0] = (byte) (0xFC | (c >> 30)); 950 buf[1] = (byte) (0x80 | ((c >> 24) & 0x3F)); 951 buf[2] = (byte) (0x80 | ((c >> 18) & 0x3F)); 952 buf[3] = (byte) (0x80 | ((c >> 12) & 0x3F)); 953 buf[4] = (byte) (0x80 | ((c >> 6) & 0x3F)); 954 buf[5] = (byte) (0x80 | (c & 0x3F)); 955 bytes = 6; 956 hasError = true; 957 } 958 else 959 { 960 hasError = true; 961 } 962 963 if (!hasError && putter != null) { 965 int[] tempCount = new int[]{bytes}; 966 putter.doPut(buf, tempCount); 967 968 if (tempCount[0] < bytes) 969 { 970 hasError = true; 971 } 972 } 973 974 count[0] = bytes; 975 return hasError; 976 } 977 978 982 static interface GetBytes 983 { 984 985 991 void doGet(int[] buf, int[] count, boolean unget); 992 } 993 994 997 static interface PutBytes 998 { 999 1000 1005 void doPut(byte[] buf, int[] count); 1006 } 1007} 1008 | Popular Tags |