package org.apache.xerces.impl.io;

import java.io.InputStream;
import java.io.IOException;
import java.io.Reader;

import java.util.Locale;
import org.apache.xerces.util.MessageFormatter;
import org.apache.xerces.impl.msg.XMLMessageFormatter;

/**
 * A {@link Reader} that decodes UTF-8 encoded bytes from an
 * {@link InputStream} into Java (UTF-16) characters.
 * <p>
 * Four-byte UTF-8 sequences are returned as a surrogate pair. When only one
 * half of the pair can be delivered by a call, the low surrogate is kept in
 * {@code fSurrogate} and delivered first by the next read call.
 * <p>
 * Malformed input is reported by throwing a
 * {@link MalformedByteSequenceException}. During a block read the error is
 * deferred when some characters have already been decoded: those characters
 * are returned, the offending bytes are parked at the start of
 * {@code fBuffer} (their number recorded in {@code fOffset}), and the error
 * is raised when they are decoded again by the following call.
 */
public class UTF8Reader
    extends Reader {

    //
    // Constants
    //

    /** Default byte buffer size (2048). */
    public static final int DEFAULT_BUFFER_SIZE = 2048;

    /** Debug flag: when true, the read methods trace their results to stdout. */
    private static final boolean DEBUG_READ = false;

    //
    // Data
    //

    /** Input stream supplying the UTF-8 bytes. */
    protected InputStream fInputStream;

    /** Byte buffer filled by block reads; also holds parked bytes. */
    protected byte[] fBuffer;

    /**
     * Number of bytes parked at the start of {@code fBuffer} by a previous
     * block read; zero when nothing is parked.
     */
    protected int fOffset;

    /** Pending low surrogate still to be returned, or -1 if there is none. */
    private int fSurrogate = -1;

    /** Formatter used to build error messages. */
    private MessageFormatter fFormatter = null;

    /** Locale used to build error messages. */
    private Locale fLocale = null;

    //
    // Constructors
    //

    /**
     * Constructs a UTF-8 reader with the default buffer size, a new
     * {@code XMLMessageFormatter} and the default locale.
     *
     * @param inputStream The input stream to decode.
     */
    public UTF8Reader(InputStream inputStream) {
        this(inputStream, DEFAULT_BUFFER_SIZE, new XMLMessageFormatter(), Locale.getDefault());
    }

    /**
     * Constructs a UTF-8 reader with the default buffer size.
     *
     * @param inputStream      The input stream to decode.
     * @param messageFormatter The formatter used for error messages.
     * @param locale           The locale used for error messages.
     */
    public UTF8Reader(InputStream inputStream, MessageFormatter messageFormatter,
            Locale locale) {
        this(inputStream, DEFAULT_BUFFER_SIZE, messageFormatter, locale);
    }

    /**
     * Constructs a UTF-8 reader with the given buffer size.
     *
     * @param inputStream      The input stream to decode.
     * @param size             The size of the internal byte buffer. The
     *                         error-deferral logic parks up to four bytes in
     *                         it, so it should hold at least four bytes.
     * @param messageFormatter The formatter used for error messages.
     * @param locale           The locale used for error messages.
     */
    public UTF8Reader(InputStream inputStream, int size,
            MessageFormatter messageFormatter, Locale locale) {
        fInputStream = inputStream;
        fBuffer = new byte[size];
        fFormatter = messageFormatter;
        fLocale = locale;
    }

    //
    // Reader methods
    //

    /**
     * Reads a single character.
     * <p>
     * Bytes parked in the buffer by a previous block read are decoded before
     * any further byte is taken from the stream. A four-byte sequence yields
     * the high surrogate now and the low surrogate on the next call.
     *
     * @return The character read, as an integer in the range 0 to 0xFFFF, or
     *         -1 if the end of the stream has been reached.
     *
     * @throws IOException Thrown if an I/O error occurs or the byte sequence
     *                     is malformed.
     */
    public int read() throws IOException {

        // a low surrogate left over from the previous call goes out first
        if (fSurrogate != -1) {
            final int pending = fSurrogate;
            fSurrogate = -1;
            if (DEBUG_READ) {
                System.out.println("read(): 0x"+Integer.toHexString(pending));
            }
            return pending;
        }

        // index walks over the bytes parked by the last block read (if any);
        // once it reaches fOffset the bytes come from the stream instead
        int index = 0;
        int c = -1;

        // first byte
        int b0 = index == fOffset
               ? fInputStream.read() : fBuffer[index++] & 0x00FF;
        if (b0 == -1) {
            return -1;
        }

        // UTF-8:   [0xxx xxxx]
        // Unicode: [0000 0000] [0xxx xxxx]
        if (b0 < 0x80) {
            c = (char)b0;
        }

        // UTF-8:   [110y yyyy] [10xx xxxx]
        // Unicode: [0000 0yyy] [yyxx xxxx]
        // (the 0x1E test rejects the overlong lead bytes 0xC0 and 0xC1)
        else if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
            int b1 = index == fOffset
                   ? fInputStream.read() : fBuffer[index++] & 0x00FF;
            if (b1 == -1) {
                expectedByte(2, 2);
            }
            if ((b1 & 0xC0) != 0x80) {
                invalidByte(2, 2, b1);
            }
            c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
        }

        // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
        // Unicode: [zzzz yyyy] [yyxx xxxx]
        else if ((b0 & 0xF0) == 0xE0) {
            int b1 = index == fOffset
                   ? fInputStream.read() : fBuffer[index++] & 0x00FF;
            if (b1 == -1) {
                expectedByte(2, 3);
            }
            // rejects encoded surrogates (0xED 0xA0..) and overlong forms
            if ((b1 & 0xC0) != 0x80
                || (b0 == 0xED && b1 >= 0xA0)
                || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
                invalidByte(2, 3, b1);
            }
            int b2 = index == fOffset
                   ? fInputStream.read() : fBuffer[index++] & 0x00FF;
            if (b2 == -1) {
                expectedByte(3, 3);
            }
            if ((b2 & 0xC0) != 0x80) {
                invalidByte(3, 3, b2);
            }
            c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
                (b2 & 0x003F);
        }

        // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]
        // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
        //          [1101 11yy] [yyxx xxxx] (low surrogate)
        //          where uuuuu = wwww + 1
        else if ((b0 & 0xF8) == 0xF0) {
            int b1 = index == fOffset
                   ? fInputStream.read() : fBuffer[index++] & 0x00FF;
            if (b1 == -1) {
                expectedByte(2, 4);
            }
            if ((b1 & 0xC0) != 0x80
                || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
                invalidByte(2, 4, b1);
            }
            int b2 = index == fOffset
                   ? fInputStream.read() : fBuffer[index++] & 0x00FF;
            if (b2 == -1) {
                expectedByte(3, 4);
            }
            if ((b2 & 0xC0) != 0x80) {
                invalidByte(3, 4, b2);
            }
            int b3 = index == fOffset
                   ? fInputStream.read() : fBuffer[index++] & 0x00FF;
            if (b3 == -1) {
                expectedByte(4, 4);
            }
            if ((b3 & 0xC0) != 0x80) {
                invalidByte(4, 4, b3);
            }
            int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
            if (uuuuu > 0x10) {
                invalidSurrogate(uuuuu);
            }
            int wwww = uuuuu - 1;
            int hs = 0xD800 |
                     ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) |
                     ((b2 >> 4) & 0x0003);
            int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
            c = hs;
            fSurrogate = ls;
        }

        // not a valid lead byte
        else {
            invalidByte(1, 1, b0);
        }

        // The parked bytes that were decoded are now consumed; without this
        // the next call would decode the very same bytes again.
        if (index > 0) {
            fOffset -= index;
            if (fOffset > 0) {
                System.arraycopy(fBuffer, index, fBuffer, 0, fOffset);
            }
        }

        if (DEBUG_READ) {
            System.out.println("read(): 0x"+Integer.toHexString(c));
        }
        return c;

    } // read():int

    /**
     * Reads characters into a portion of an array.
     * <p>
     * At most one block of bytes (bounded by the buffer size) is read from
     * the stream per call, plus whatever bytes are needed to complete a
     * multi-byte sequence that starts at the end of that block.
     *
     * @param ch     Destination buffer.
     * @param offset Offset at which to start storing characters.
     * @param length Maximum number of characters to read.
     *
     * @return The number of characters stored (never more than
     *         {@code length}), or -1 if the end of the stream has been
     *         reached and nothing was stored.
     *
     * @throws IOException Thrown if an I/O error occurs or the byte sequence
     *                     is malformed and no character was decoded before
     *                     the error.
     */
    public int read(char ch[], int offset, int length) throws IOException {

        // nothing requested; in particular do not let a pending surrogate
        // turn the length negative
        if (length == 0) {
            return 0;
        }

        // first index the caller did NOT ask us to fill
        final int limit = offset + length;
        int out = offset;

        // a pending low surrogate is the first character of this block
        if (fSurrogate != -1) {
            ch[out++] = (char)fSurrogate;
            fSurrogate = -1;
            if (--length == 0) {
                return 1;
            }
        }

        // number of undecoded bytes available at the start of fBuffer
        final int total;
        if (fOffset == 0) {
            // never ask for more bytes than the buffer holds
            if (length > fBuffer.length) {
                length = fBuffer.length;
            }

            final int bytesRead = fInputStream.read(fBuffer, 0, length);
            if (bytesRead == -1) {
                // keep a surrogate that was already stored above
                return out > offset ? out - offset : -1;
            }
            total = bytesRead;
        }

        // A non-zero fOffset means the previous call parked bytes (an
        // unfinished or malformed sequence) instead of failing right away.
        // Decode those first, without reading a new block.
        else {
            total = fOffset;
            fOffset = 0;
        }

        // fast path: leading run of ASCII bytes
        int in = 0;
        while (in < total && fBuffer[in] >= 0) {
            ch[out++] = (char)fBuffer[in++];
        }

        for ( ; in < total; in++) {
            final byte byte1 = fBuffer[in];

            // UTF-8:   [0xxx xxxx]
            // Unicode: [0000 0000] [0xxx xxxx]
            if (byte1 >= 0) {
                ch[out++] = (char)byte1;
                continue;
            }

            final int b0 = byte1 & 0x0FF;

            // UTF-8:   [110y yyyy] [10xx xxxx]
            // Unicode: [0000 0yyy] [yyxx xxxx]
            if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
                int b1 = -1;
                if (++in < total) {
                    b1 = fBuffer[in] & 0x00FF;
                }
                else {
                    b1 = fInputStream.read();
                    if (b1 == -1) {
                        if (out > offset) {
                            stashBytes(1, b0, 0, 0, 0);
                            return out - offset;
                        }
                        expectedByte(2, 2);
                    }
                }
                if ((b1 & 0xC0) != 0x80) {
                    if (out > offset) {
                        stashBytes(2, b0, b1, 0, 0);
                        return out - offset;
                    }
                    invalidByte(2, 2, b1);
                }
                ch[out++] = (char)(((b0 << 6) & 0x07C0) | (b1 & 0x003F));
                continue;
            }

            // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
            // Unicode: [zzzz yyyy] [yyxx xxxx]
            if ((b0 & 0xF0) == 0xE0) {
                int b1 = -1;
                if (++in < total) {
                    b1 = fBuffer[in] & 0x00FF;
                }
                else {
                    b1 = fInputStream.read();
                    if (b1 == -1) {
                        if (out > offset) {
                            stashBytes(1, b0, 0, 0, 0);
                            return out - offset;
                        }
                        expectedByte(2, 3);
                    }
                }
                if ((b1 & 0xC0) != 0x80
                    || (b0 == 0xED && b1 >= 0xA0)
                    || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
                    if (out > offset) {
                        stashBytes(2, b0, b1, 0, 0);
                        return out - offset;
                    }
                    invalidByte(2, 3, b1);
                }
                int b2 = -1;
                if (++in < total) {
                    b2 = fBuffer[in] & 0x00FF;
                }
                else {
                    b2 = fInputStream.read();
                    if (b2 == -1) {
                        if (out > offset) {
                            stashBytes(2, b0, b1, 0, 0);
                            return out - offset;
                        }
                        expectedByte(3, 3);
                    }
                }
                if ((b2 & 0xC0) != 0x80) {
                    if (out > offset) {
                        stashBytes(3, b0, b1, b2, 0);
                        return out - offset;
                    }
                    invalidByte(3, 3, b2);
                }
                ch[out++] = (char)(((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
                                   (b2 & 0x003F));
                continue;
            }

            // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]
            // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
            //          [1101 11yy] [yyxx xxxx] (low surrogate)
            //          where uuuuu = wwww + 1
            if ((b0 & 0xF8) == 0xF0) {
                int b1 = -1;
                if (++in < total) {
                    b1 = fBuffer[in] & 0x00FF;
                }
                else {
                    b1 = fInputStream.read();
                    if (b1 == -1) {
                        if (out > offset) {
                            stashBytes(1, b0, 0, 0, 0);
                            return out - offset;
                        }
                        expectedByte(2, 4);
                    }
                }
                if ((b1 & 0xC0) != 0x80
                    || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
                    if (out > offset) {
                        stashBytes(2, b0, b1, 0, 0);
                        return out - offset;
                    }
                    invalidByte(2, 4, b1);
                }
                int b2 = -1;
                if (++in < total) {
                    b2 = fBuffer[in] & 0x00FF;
                }
                else {
                    b2 = fInputStream.read();
                    if (b2 == -1) {
                        if (out > offset) {
                            stashBytes(2, b0, b1, 0, 0);
                            return out - offset;
                        }
                        expectedByte(3, 4);
                    }
                }
                if ((b2 & 0xC0) != 0x80) {
                    if (out > offset) {
                        stashBytes(3, b0, b1, b2, 0);
                        return out - offset;
                    }
                    invalidByte(3, 4, b2);
                }
                int b3 = -1;
                if (++in < total) {
                    b3 = fBuffer[in] & 0x00FF;
                }
                else {
                    b3 = fInputStream.read();
                    if (b3 == -1) {
                        if (out > offset) {
                            stashBytes(3, b0, b1, b2, 0);
                            return out - offset;
                        }
                        expectedByte(4, 4);
                    }
                }
                if ((b3 & 0xC0) != 0x80) {
                    if (out > offset) {
                        stashBytes(4, b0, b1, b2, b3);
                        return out - offset;
                    }
                    invalidByte(4, 4, b3);
                }

                // decode the four bytes into a surrogate pair
                int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
                if (uuuuu > 0x10) {
                    invalidSurrogate(uuuuu);
                }
                int wwww = uuuuu - 1;
                int zzzz = b1 & 0x000F;
                int yyyyyy = b2 & 0x003F;
                int xxxxxx = b3 & 0x003F;
                int hs = 0xD800 | ((wwww << 6) & 0x03C0) | (zzzz << 2) | (yyyyyy >> 4);
                int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;

                ch[out++] = (char)hs;
                if (out < limit) {
                    ch[out++] = (char)ls;
                }
                else {
                    // no room left in the requested range: hand the low
                    // surrogate out at the start of the next read
                    fSurrogate = ls;
                }
                continue;
            }

            // not a valid lead byte
            if (out > offset) {
                stashBytes(1, b0, 0, 0, 0);
                return out - offset;
            }
            invalidByte(1, 1, b0);
        }

        final int count = out - offset;
        if (DEBUG_READ) {
            System.out.println("read(char[],"+offset+','+length+"): count="+count);
        }
        return count;

    } // read(char[],int,int)

    /**
     * Skips characters by reading and discarding them.
     *
     * @param n The number of characters to skip.
     *
     * @return The number of characters actually skipped; less than
     *         {@code n} if the end of the stream was reached first.
     *
     * @throws IllegalArgumentException Thrown if {@code n} is negative.
     * @throws IOException              Thrown if an I/O error occurs.
     */
    public long skip(long n) throws IOException {

        if (n < 0) {
            throw new IllegalArgumentException("skip value is negative");
        }

        long remaining = n;
        final char[] ch = new char[fBuffer.length];
        while (remaining > 0) {
            int length = ch.length < remaining ? ch.length : (int)remaining;
            int count = read(ch, 0, length);
            if (count <= 0) {
                break;
            }
            remaining -= count;
        }

        return n - remaining;

    } // skip(long):long

    /**
     * Tells whether this reader is ready to be read.
     *
     * @return Always false: this implementation never claims that the next
     *         read will not block.
     *
     * @throws IOException Declared by the overridden method; not thrown here.
     */
    public boolean ready() throws IOException {
        return false;
    } // ready()

    /**
     * Tells whether this reader supports the mark() operation.
     *
     * @return Always false.
     */
    public boolean markSupported() {
        return false;
    } // markSupported()

    /**
     * Marking is not supported by this reader.
     *
     * @param readAheadLimit Ignored.
     *
     * @throws IOException Always thrown, with the formatted
     *                     "OperationNotSupported" message.
     */
    public void mark(int readAheadLimit) throws IOException {
        throw new IOException (fFormatter.formatMessage(fLocale, "OperationNotSupported", new Object []{"mark()", "UTF-8"}));
    } // mark(int)

    /**
     * Clears this reader's decoding state: parked bytes and any pending low
     * surrogate are discarded. The underlying stream is not touched.
     *
     * @throws IOException Declared by the overridden method; not thrown here.
     */
    public void reset() throws IOException {
        fOffset = 0;
        fSurrogate = -1;
    } // reset()

    /**
     * Closes the underlying input stream.
     *
     * @throws IOException Thrown if closing the stream fails.
     */
    public void close() throws IOException {
        fInputStream.close();
    } // close()

    //
    // Private methods
    //

    /**
     * Parks the first {@code n} bytes of an unfinished or malformed sequence
     * at the start of the buffer so that the next read call decodes them
     * again (and reports the error then, if there is one).
     * Assumes the buffer holds at least {@code n} (at most four) bytes.
     */
    private void stashBytes(int n, int b0, int b1, int b2, int b3) {
        if (n > 0) {
            fBuffer[0] = (byte)b0;
        }
        if (n > 1) {
            fBuffer[1] = (byte)b1;
        }
        if (n > 2) {
            fBuffer[2] = (byte)b2;
        }
        if (n > 3) {
            fBuffer[3] = (byte)b3;
        }
        fOffset = n;
    } // stashBytes(int,int,int,int,int)

    /**
     * Throws an exception reporting that the stream ended where byte
     * {@code position} of a {@code count}-byte sequence was expected.
     */
    private void expectedByte(int position, int count)
        throws MalformedByteSequenceException {

        throw new MalformedByteSequenceException(fFormatter,
            fLocale,
            XMLMessageFormatter.XML_DOMAIN,
            "ExpectedByte",
            new Object [] {Integer.toString(position), Integer.toString(count)});

    } // expectedByte(int,int)

    /**
     * Throws an exception reporting that byte {@code position} of a
     * {@code count}-byte sequence is invalid. The byte value {@code c} is
     * accepted for the caller's convenience but is not part of the message
     * arguments.
     */
    private void invalidByte(int position, int count, int c)
        throws MalformedByteSequenceException {

        throw new MalformedByteSequenceException(fFormatter,
            fLocale,
            XMLMessageFormatter.XML_DOMAIN,
            "InvalidByte",
            new Object [] {Integer.toString(position), Integer.toString(count)});

    } // invalidByte(int,int,int)

    /**
     * Throws an exception reporting that the {@code uuuuu} bits of a
     * four-byte sequence are out of range (reported in hexadecimal).
     */
    private void invalidSurrogate(int uuuuu) throws MalformedByteSequenceException {

        throw new MalformedByteSequenceException(fFormatter,
            fLocale,
            XMLMessageFormatter.XML_DOMAIN,
            "InvalidHighSurrogate",
            new Object [] {Integer.toHexString(uuuuu)});

    } // invalidSurrogate(int)

} // class UTF8Reader