1 7 package com.ibm.icu.text; 8 9 14 class CharsetRecog_UTF8 extends CharsetRecognizer { 15 16 String getName() { 17 return "UTF-8"; 18 } 19 20 23 int match(CharsetDetector det) { 24 boolean hasBOM = false; 25 int numValid = 0; 26 int numInvalid = 0; 27 byte input[] = det.fRawInput; 28 int i; 29 int trailBytes = 0; 30 int confidence; 31 32 if (det.fRawLength >= 3 && 33 input[0]==0xef && input[1]==0xbb & input[2]==0xbf) { 34 hasBOM = true; 35 } 36 37 for (i=0; i<det.fRawLength; i++) { 39 int b = input[i]; 40 if ((b & 0x80) == 0) { 41 continue; } 43 44 if ((b & 0x0e0) == 0x0c0) { 46 trailBytes = 1; 47 } else if ((b & 0x0f0) == 0x0e0) { 48 trailBytes = 2; 49 } else if ((b & 0x0f8) == 0xf0) { 50 trailBytes = 3; 51 } else { 52 numInvalid++; 53 if (numInvalid > 5) { 54 break; 55 } 56 trailBytes = 0; 57 } 58 59 for (;;) { 61 i++; 62 if (i>=det.fRawLength) { 63 break; 64 } 65 b = input[i]; 66 if ((b & 0xc0) != 0x080) { 67 numInvalid++; 68 break; 69 } 70 if (--trailBytes == 0) { 71 numValid++; 72 break; 73 } 74 } 75 76 } 77 78 confidence = 0; 81 if (hasBOM && numInvalid==0) { 82 confidence = 100; 83 } else if (hasBOM && numValid > numInvalid*10) { 84 confidence = 80; 85 } else if (numValid > 3 && numInvalid == 0) { 86 confidence = 100; 87 } else if (numValid > 0 && numInvalid == 0) { 88 confidence = 80; 89 } else if (numValid == 0 && numInvalid == 0) { 90 confidence = 10; 92 } else if (numValid > numInvalid*10) { 93 confidence = 25; 95 } 96 return confidence; 97 } 98 99 } 100 | Popular Tags |