1 8 9 package com.ibm.icu.text; 10 11 17 abstract class CharsetRecog_Unicode extends CharsetRecognizer { 18 19 22 abstract String getName(); 23 24 27 abstract int match(CharsetDetector det); 28 29 static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode 30 { 31 String getName() 32 { 33 return "UTF-16BE"; 34 } 35 36 int match(CharsetDetector det) 37 { 38 byte[] input = det.fRawInput; 39 40 if ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF) { 41 return 100; 42 } 43 44 return 0; 46 } 47 } 48 49 static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode 50 { 51 String getName() 52 { 53 return "UTF-16LE"; 54 } 55 56 int match(CharsetDetector det) 57 { 58 byte[] input = det.fRawInput; 59 60 if ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) { 61 return 100; 62 } 63 64 return 0; 66 } 67 } 68 69 static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode 70 { 71 abstract int getChar(byte[] input, int index); 72 73 abstract String getName(); 74 75 int match(CharsetDetector det) 76 { 77 byte[] input = det.fRawInput; 78 int limit = (det.fRawLength / 4) * 4; 79 int numValid = 0; 80 int numInvalid = 0; 81 boolean hasBOM = false; 82 int confidence = 0; 83 84 if (getChar(input, 0) == 0x0000FEFF) { 85 hasBOM = true; 86 } 87 88 for(int i = 0; i < limit; i += 4) { 89 int ch = getChar(input, i); 90 91 if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) { 92 numInvalid += 1; 93 } else { 94 numValid += 1; 95 } 96 } 97 98 99 if (hasBOM && numInvalid==0) { 102 confidence = 100; 103 } else if (hasBOM && numValid > numInvalid*10) { 104 confidence = 80; 105 } else if (numValid > 3 && numInvalid == 0) { 106 confidence = 100; 107 } else if (numValid > 0 && numInvalid == 0) { 108 confidence = 80; 109 } else if (numValid > numInvalid*10) { 110 confidence = 25; 112 } 113 114 return confidence; 115 } 116 } 117 118 static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32 119 { 120 int getChar(byte[] input, int index) 121 { 122 return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 | 123 (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF); 124 } 125 126 String getName() 127 { 128 return "UTF-32BE"; 129 } 130 } 131 132 133 static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32 134 { 135 int getChar(byte[] input, int index) 136 { 137 return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 | 138 (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF); 139 } 140 141 String getName() 142 { 143 return "UTF-32LE"; 144 } 145 } 146 } 147 | Popular Tags |