package com.sun.org.apache.xerces.internal.impl.io;

import java.io.InputStream;
import java.io.IOException;
import java.io.Reader;

import java.util.Locale;
import com.sun.org.apache.xerces.internal.util.MessageFormatter;
import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;

/**
 * A {@link Reader} that decodes UTF-8 bytes from an {@link InputStream}
 * into Java {@code char}s.
 * <p>
 * Byte sequences of one to four bytes are recognised. Four-byte sequences
 * are delivered as a high/low surrogate pair. Overlong encodings, encoded
 * surrogates (lead byte {@code 0xED} followed by {@code 0xA0} or above) and
 * values whose upper five bits exceed {@code 0x10} are rejected with a
 * {@link MalformedByteSequenceException}.
 * <p>
 * Instances keep mutable decoding state and perform no synchronisation.
 */
public class UTF8Reader
    extends Reader {

    //
    // Constants
    //

    /** Default byte buffer size (2048). */
    public static final int DEFAULT_BUFFER_SIZE = 2048;

    /** Set to {@code true} to trace every read on {@code System.out}. */
    private static final boolean DEBUG_READ = false;

    /** Longest byte sequence this reader decodes. */
    private static final int MAX_SEQUENCE_LENGTH = 4;

    //
    // Data
    //

    /** Stream the encoded bytes are read from. */
    protected InputStream fInputStream;

    /** Byte buffer used by the block read; its front also holds deferred bytes. */
    protected byte[] fBuffer;

    /**
     * Number of bytes at the front of {@link #fBuffer} that were read but not
     * yet decoded because the sequence they belong to was found to be in
     * error after other characters had already been produced.
     */
    protected int fOffset;

    /** Low surrogate waiting to be returned by the next read, or -1. */
    private int fSurrogate = -1;

    /** Formatter used to build the exception messages. */
    private final MessageFormatter fFormatter;

    /** Locale handed to the formatter. */
    private final Locale fLocale;

    //
    // Constructors
    //

    /**
     * Creates a reader with the default buffer size, a new
     * {@link XMLMessageFormatter} and the default locale.
     *
     * @param inputStream the stream of UTF-8 bytes
     */
    public UTF8Reader(InputStream inputStream) {
        this(inputStream, DEFAULT_BUFFER_SIZE, new XMLMessageFormatter(), Locale.getDefault());
    }

    /**
     * Creates a reader with the default buffer size.
     *
     * @param inputStream      the stream of UTF-8 bytes
     * @param messageFormatter formatter used for exception messages
     * @param locale           locale used for exception messages
     */
    public UTF8Reader(InputStream inputStream, MessageFormatter messageFormatter,
            Locale locale) {
        this(inputStream, DEFAULT_BUFFER_SIZE, messageFormatter, locale);
    }

    /**
     * Creates a reader with an explicit buffer size.
     *
     * @param inputStream      the stream of UTF-8 bytes
     * @param size             size of the internal byte buffer
     * @param messageFormatter formatter used for exception messages
     * @param locale           locale used for exception messages
     */
    public UTF8Reader(InputStream inputStream, int size,
            MessageFormatter messageFormatter, Locale locale) {
        fInputStream = inputStream;
        fBuffer = new byte[size];
        fFormatter = messageFormatter;
        fLocale = locale;
    }

    //
    // Reader methods
    //

    /**
     * Reads a single character.
     * <p>
     * A four-byte sequence yields its high surrogate from this call and its
     * low surrogate from the next read.
     *
     * @return the character as an int in the range 0 to 0xFFFF, or -1 if the
     *         end of the stream has been reached
     * @throws IOException if the underlying stream fails or the bytes are not
     *                     an acceptable UTF-8 sequence
     */
    public int read() throws IOException {

        int c = fSurrogate;
        if (c != -1) {
            // second half of a pair decoded by an earlier call
            fSurrogate = -1;
        }
        else {
            // Bytes deferred by the block read (fOffset > 0) are consumed
            // before the stream is touched.
            int index = 0;

            final int b0 = index == fOffset
                         ? fInputStream.read() : fBuffer[index++] & 0x00FF;
            if (b0 == -1) {
                return -1;
            }

            if (b0 < 0x80) {
                // [0xxx xxxx]
                c = (char)b0;
            }
            else {
                final int seqLen = sequenceLength(b0);
                if (seqLen == 0) {
                    invalidByte(1, 1, b0);
                }

                final int[] seq = new int[MAX_SEQUENCE_LENGTH];
                seq[0] = b0;
                for (int pos = 1; pos < seqLen; pos++) {
                    final int b = index == fOffset
                                ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                    if (b == -1) {
                        expectedByte(pos + 1, seqLen);
                    }
                    if (!isValidTrailingByte(b0, b, pos)) {
                        invalidByte(pos + 1, seqLen, b);
                    }
                    seq[pos] = b;
                }

                if (seqLen == 4) {
                    final int uuuuu = planeBits(seq[0], seq[1]);
                    if (uuuuu > 0x10) {
                        invalidSurrogate(uuuuu);
                    }
                    c = highSurrogate(uuuuu, seq[1], seq[2]);
                    fSurrogate = lowSurrogate(seq[2], seq[3]);
                }
                else if (seqLen == 3) {
                    c = decodeThree(seq[0], seq[1], seq[2]);
                }
                else {
                    c = decodeTwo(seq[0], seq[1]);
                }
            }
        }

        if (DEBUG_READ) {
            System.out.println("read(): 0x"+Integer.toHexString(c));
        }
        return c;

    }

    /**
     * Reads characters into a portion of an array.
     * <p>
     * At most {@code length} characters are stored, starting at
     * {@code ch[offset]}. If a low surrogate is pending from an earlier read
     * it is stored first. If a four-byte sequence is decoded when only one
     * slot is left, the high surrogate is stored and the low surrogate is
     * kept for the next read.
     * <p>
     * When a malformed or truncated sequence is met after at least one
     * character has been produced, the offending bytes are saved and the
     * characters produced so far are returned; the error is then raised by
     * the next read.
     *
     * @param ch     destination buffer
     * @param offset index at which to start storing characters
     * @param length maximum number of characters to store
     * @return the number of characters stored, or -1 if the end of the
     *         stream has been reached and nothing was stored
     * @throws IOException if the underlying stream fails or the bytes are not
     *                     an acceptable UTF-8 sequence
     */
    public int read(char ch[], int offset, int length) throws IOException {

        int out = offset;
        final int limit = offset + length;

        // deliver the low surrogate left over from a previous call
        if (fSurrogate != -1 && length > 0) {
            ch[out++] = (char)fSurrogate;
            fSurrogate = -1;
            length--;
        }

        // number of undecoded bytes now sitting at the front of fBuffer
        final int total;
        if (fOffset == 0) {
            if (length > fBuffer.length) {
                length = fBuffer.length;
            }
            final int bytesRead = fInputStream.read(fBuffer, 0, length);
            if (bytesRead == -1) {
                // a delivered surrogate still counts as one character read
                return out > offset ? out - offset : -1;
            }
            total = bytesRead;
        }
        else {
            // The previous call stopped at a bad sequence and saved its
            // bytes; decode those again without reading further so the
            // error is raised now.
            total = fOffset;
            fOffset = 0;
        }

        // fast path: leading run of single-byte characters
        int in = 0;
        while (in < total && fBuffer[in] >= 0) {
            ch[out++] = (char)fBuffer[in++];
        }

        int[] seq = null;
        for ( ; in < total; in++) {
            final int b0 = fBuffer[in] & 0x00FF;
            if (b0 < 0x80) {
                ch[out++] = (char)b0;
                continue;
            }

            if (seq == null) {
                seq = new int[MAX_SEQUENCE_LENGTH];
            }
            seq[0] = b0;

            final int seqLen = sequenceLength(b0);
            if (seqLen == 0) {
                if (out > offset) {
                    return defer(seq, 1, out - offset);
                }
                invalidByte(1, 1, b0);
            }

            // trailing bytes come from the buffer while it lasts, then
            // directly from the stream
            for (int pos = 1; pos < seqLen; pos++) {
                final int b = ++in < total
                            ? fBuffer[in] & 0x00FF : fInputStream.read();
                if (b == -1) {
                    if (out > offset) {
                        return defer(seq, pos, out - offset);
                    }
                    expectedByte(pos + 1, seqLen);
                }
                seq[pos] = b;
                if (!isValidTrailingByte(b0, b, pos)) {
                    if (out > offset) {
                        return defer(seq, pos + 1, out - offset);
                    }
                    invalidByte(pos + 1, seqLen, b);
                }
            }

            if (seqLen == 4) {
                final int uuuuu = planeBits(seq[0], seq[1]);
                if (uuuuu > 0x10) {
                    if (out > offset) {
                        return defer(seq, 4, out - offset);
                    }
                    invalidSurrogate(uuuuu);
                }
                ch[out++] = (char)highSurrogate(uuuuu, seq[1], seq[2]);
                final int ls = lowSurrogate(seq[2], seq[3]);
                if (out < limit) {
                    ch[out++] = (char)ls;
                }
                else {
                    // no room left in the caller's region
                    fSurrogate = ls;
                }
            }
            else if (seqLen == 3) {
                ch[out++] = (char)decodeThree(seq[0], seq[1], seq[2]);
            }
            else {
                ch[out++] = (char)decodeTwo(seq[0], seq[1]);
            }
        }

        final int count = out - offset;
        if (DEBUG_READ) {
            System.out.println("read(char[],"+offset+','+length+"): count="+count);
        }
        return count;

    }

    /**
     * Skips characters by reading and discarding them.
     *
     * @param n the number of characters to skip
     * @return the number of characters actually skipped
     * @throws IOException if reading fails
     */
    public long skip(long n) throws IOException {

        long remaining = n;
        final char[] scratch = new char[fBuffer.length];
        do {
            final int length = scratch.length < remaining
                             ? scratch.length : (int)remaining;
            final int count = read(scratch, 0, length);
            if (count <= 0) {
                break;
            }
            remaining -= count;
        } while (remaining > 0);

        return n - remaining;

    }

    /**
     * Tells whether this reader is ready to be read.
     *
     * @return always {@code false}
     * @throws IOException never thrown by this implementation
     */
    public boolean ready() throws IOException {
        return false;
    }

    /**
     * Tells whether this reader supports {@link #mark(int)}.
     *
     * @return always {@code false}
     */
    public boolean markSupported() {
        return false;
    }

    /**
     * Not supported.
     *
     * @param readAheadLimit ignored
     * @throws IOException always
     */
    public void mark(int readAheadLimit) throws IOException {
        throw new IOException (fFormatter.formatMessage(fLocale, "OperationNotSupported", new Object []{"mark()", "UTF-8"}));
    }

    /**
     * Discards the decoder's own pending state (deferred bytes and pending
     * low surrogate). The underlying stream is not touched.
     *
     * @throws IOException never thrown by this implementation
     */
    public void reset() throws IOException {
        fOffset = 0;
        fSurrogate = -1;
    }

    /**
     * Closes the underlying input stream.
     *
     * @throws IOException if closing the stream fails
     */
    public void close() throws IOException {
        fInputStream.close();
    }

    //
    // Private methods
    //

    /**
     * Returns the length of the sequence introduced by a lead byte:
     * 2 for {@code 110y yyyy} (excluding the overlong leads whose
     * {@code yyyy} bits are all zero), 3 for {@code 1110 zzzz},
     * 4 for {@code 1111 0uuu}, and 0 for anything else.
     */
    private static int sequenceLength(int b0) {
        if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
            return 2;
        }
        if ((b0 & 0xF0) == 0xE0) {
            return 3;
        }
        if ((b0 & 0xF8) == 0xF0) {
            return 4;
        }
        return 0;
    }

    /**
     * Checks a trailing byte. Every trailing byte must look like
     * {@code 10xx xxxx}; the first one after a three- or four-byte lead is
     * additionally checked against overlong forms and, for lead
     * {@code 0xED}, against encoded surrogates.
     *
     * @param b0  lead byte of the sequence
     * @param b   trailing byte to check
     * @param pos zero-based position of {@code b} within the sequence
     */
    private static boolean isValidTrailingByte(int b0, int b, int pos) {
        if ((b & 0xC0) != 0x80) {
            return false;
        }
        if (pos != 1) {
            return true;
        }
        if ((b0 & 0xF0) == 0xE0) {
            return !(b0 == 0xED && b >= 0xA0)
                && !((b0 & 0x0F) == 0 && (b & 0x20) == 0);
        }
        if ((b0 & 0xF8) == 0xF0) {
            return !((b & 0x30) == 0 && (b0 & 0x07) == 0);
        }
        return true;
    }

    /** Decodes {@code [110y yyyy] [10xx xxxx]}. */
    private static int decodeTwo(int b0, int b1) {
        return ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
    }

    /** Decodes {@code [1110 zzzz] [10yy yyyy] [10xx xxxx]}. */
    private static int decodeThree(int b0, int b1, int b2) {
        return ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) | (b2 & 0x003F);
    }

    /** Extracts the five {@code uuuuu} bits of {@code [1111 0uuu] [10uu zzzz]}. */
    private static int planeBits(int b0, int b1) {
        return ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
    }

    /** Builds the high surrogate {@code [1101 10ww] [wwzz zzyy]}, with wwww = uuuuu - 1. */
    private static int highSurrogate(int uuuuu, int b1, int b2) {
        final int wwww = uuuuu - 1;
        return 0xD800 | ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C)
                      | ((b2 >> 4) & 0x0003);
    }

    /** Builds the low surrogate {@code [1101 11yy] [yyxx xxxx]}. */
    private static int lowSurrogate(int b2, int b3) {
        return 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
    }

    /**
     * Saves the first {@code n} bytes of a failed sequence at the front of
     * the buffer so that the next read decodes them again, and returns
     * {@code produced} unchanged for the caller to return.
     */
    private int defer(int[] seq, int n, int produced) {
        for (int i = 0; i < n; i++) {
            fBuffer[i] = (byte)seq[i];
        }
        fOffset = n;
        return produced;
    }

    /**
     * Throws the exception for a sequence cut short by the end of the stream.
     *
     * @param position one-based position of the missing byte
     * @param count    length of the sequence being decoded
     */
    private void expectedByte(int position, int count)
        throws MalformedByteSequenceException {

        throw new MalformedByteSequenceException(fFormatter,
            fLocale,
            XMLMessageFormatter.XML_DOMAIN,
            "ExpectedByte",
            new Object [] {Integer.toString(position), Integer.toString(count)});

    }

    /**
     * Throws the exception for a byte that cannot appear where it was found.
     *
     * @param position one-based position of the byte within its sequence
     * @param count    length of the sequence being decoded
     * @param c        the offending byte; not included in the message
     */
    private void invalidByte(int position, int count, int c)
        throws MalformedByteSequenceException {

        throw new MalformedByteSequenceException(fFormatter,
            fLocale,
            XMLMessageFormatter.XML_DOMAIN,
            "InvalidByte",
            new Object [] {Integer.toString(position), Integer.toString(count)});

    }

    /**
     * Throws the exception for a four-byte sequence whose {@code uuuuu} bits
     * exceed {@code 0x10}.
     *
     * @param uuuuu the offending five-bit value
     */
    private void invalidSurrogate(int uuuuu) throws MalformedByteSequenceException {

        throw new MalformedByteSequenceException(fFormatter,
            fLocale,
            XMLMessageFormatter.XML_DOMAIN,
            "InvalidHighSurrogate",
            new Object [] {Integer.toHexString(uuuuu)});

    }

}