package com.catcode.odf;

import java.io.InputStream;
import java.io.IOException;

import java.util.ArrayList;
import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.FilterInputStream;

/**
 * A filtering input stream that reads OpenDocument XML from an underlying
 * stream and hands back only the bytes of the text content.
 *
 * <p>Markup is consumed and never returned. Character data is returned
 * only while the reader is inside at least one "capture" element and
 * outside every "omit" element; both sets of elements are looked up in
 * the OpenDocument <code>text</code> namespace, whose prefix is taken
 * from the root element. Entity references are decoded and returned as
 * UTF-8 bytes. A capture element may have a post-process character
 * that is emitted when the element closes (for example a newline after
 * a paragraph).</p>
 *
 * <p>Known limitation: a tag is considered to end at the first
 * <code>&gt;</code>, so comments, CDATA sections or attribute values
 * that contain a literal <code>&gt;</code> are not parsed correctly.</p>
 */
public class OpenDocumentTextInputStream extends FilterInputStream
{
    /** Text of the tag currently being collected (without the angle brackets). */
    private StringBuffer tagBuffer;

    /**
     * Prefix bound to the OpenDocument text namespace; the empty string
     * means it is the default namespace. Set from the root element.
     */
    private String textNamespace;

    /** Matches an optional leading slash, an optional prefix and the local name. */
    private static final Pattern elementNamePattern =
        Pattern.compile("^/?(?:([\\p{L}\\p{N}_.-]+):)?([\\p{L}\\p{N}_.-]+)");

    /**
     * Finds the namespace declaration for the OpenDocument text namespace.
     * Group 1 is the declared prefix (empty for a default namespace);
     * the attribute value may use double or single quotes.
     */
    private static final Pattern textNamespacePattern =
        Pattern.compile("xmlns:?([\\p{L}\\p{N}_.-]*)\\s*=\\s*"
            + "([\"'])urn:oasis:names:tc:opendocument:xmlns:text:1.0\\2");

    /**
     * Pending output that must be returned before more input is read.
     * Holds up to four UTF-8 bytes, or a single -1 marking end of stream.
     */
    private int[] utf8Output;
    private int utf8OutputPosition;
    private int utf8OutputLength;

    /**
     * Elements whose content is returned. This is per-instance state: a
     * list passed to one stream must not affect any other stream.
     */
    private ArrayList captureList;
    private int captureDepth;

    /** Elements whose content is suppressed even inside a capture element. */
    private ArrayList omitList;
    private int omitDepth;

    /** True until the first element (the root) has been analyzed. */
    private boolean rootElement;

    /** Names of the five predefined XML entities... */
    private static final String[] stdFiveEntities = {
        "apos", "quot", "lt", "gt", "amp"
    };

    /** ...and the characters they stand for, in the same order. */
    static final byte[] stdFiveValues = {
        '\'', '"', '<', '>', '&'
    };

    /**
     * Creates a stream that uses the default capture and omit lists.
     *
     * @param in the stream providing the OpenDocument XML
     */
    public OpenDocumentTextInputStream( InputStream in )
    {
        this( in, null, null );
    }

    /**
     * Creates a stream with explicit capture and omit lists.
     *
     * @param in the stream providing the OpenDocument XML
     * @param capture list of <code>ElementPostProcess</code> entries naming
     * the elements to capture; <code>null</code> selects the defaults
     * (<code>h</code> and <code>p</code> followed by a newline,
     * <code>tab</code> by a tab, <code>s</code> by a space)
     * @param omit list of <code>ElementPostProcess</code> entries naming
     * the elements to omit; <code>null</code> selects the default
     * (<code>tracked-changes</code>)
     */
    public OpenDocumentTextInputStream( InputStream in,
        ArrayList capture, ArrayList omit )
    {
        super( in );

        utf8Output = new int[4];
        utf8OutputPosition = 0;
        utf8OutputLength = 0;
        rootElement = true;

        if (capture == null)
        {
            captureList = new ArrayList(4);
            captureList.add( new ElementPostProcess( "h", '\n' ) );
            captureList.add( new ElementPostProcess( "p", '\n' ) );
            captureList.add( new ElementPostProcess( "tab", '\t' ) );
            captureList.add( new ElementPostProcess( "s", ' ' ) );
        }
        else
        {
            // The caller's list is used as given; it is not copied.
            captureList = capture;
        }

        if (omit == null)
        {
            omitList = new ArrayList(1);
            omitList.add( new ElementPostProcess( "tracked-changes", '\0' ) );
        }
        else
        {
            omitList = omit;
        }

        captureDepth = 0;
        omitDepth = 0;
    }

    /**
     * Tells whether character data at the current position must be dropped:
     * either an omit element is open or no capture element is open.
     */
    private boolean isSuppressed()
    {
        return omitDepth > 0 || captureDepth == 0;
    }

    /**
     * Reads the next byte of text content.
     *
     * @return the next byte of captured text, or -1 at end of stream
     * @throws IOException if the underlying stream fails
     * @throws IllegalArgumentException if the markup contains an unknown
     * tag or entity, or the root element does not declare the text
     * namespace
     */
    public int read() throws IOException
    {
        int theByte = 0;

        // Zero means "nothing to return yet"; keep consuming input.
        while (theByte == 0)
        {
            if (utf8OutputPosition < utf8OutputLength)
            {
                // Drain pending output before touching the input again.
                theByte = utf8Output[utf8OutputPosition++];
            }
            else
            {
                theByte = in.read();
                if (theByte == '<')
                {
                    collectTag();
                    theByte = 0;
                }
                else if (theByte == '&')
                {
                    collectEntity();
                    // An entity is character data, so it obeys the same
                    // capture/omit rules as ordinary text. Only the
                    // end-of-stream marker (-1) must always get through.
                    if (isSuppressed() && utf8OutputLength > 0
                        && utf8Output[0] != -1)
                    {
                        utf8OutputLength = 0;
                        utf8OutputPosition = 0;
                    }
                    theByte = 0;
                }
                else if (theByte != -1 && isSuppressed())
                {
                    theByte = 0;
                }
            }
        }
        return theByte;
    }

    /**
     * Reads text content into the whole of the given array.
     *
     * @param b destination buffer
     * @return the number of bytes stored, or -1 at end of stream
     * @throws IOException if the first byte cannot be read
     */
    public int read( byte b[] ) throws IOException
    {
        return read( b, 0, b.length );
    }

    /**
     * Reads up to <code>len</code> bytes of text content into part of an array.
     *
     * @param b destination buffer
     * @param off index of the first byte to store
     * @param len maximum number of bytes to read
     * @return the number of bytes stored, 0 if <code>len</code> is 0, or
     * -1 if the stream is already at its end
     * @throws NullPointerException if <code>b</code> is null
     * @throws IndexOutOfBoundsException if <code>off</code> and
     * <code>len</code> do not describe a range inside <code>b</code>
     * @throws IOException if the first byte cannot be read
     */
    public int read( byte b[], int off, int len ) throws IOException
    {
        if (b == null)
        {
            throw new NullPointerException();
        }
        if ((off < 0) || (off > b.length) || (len < 0)
            || ((off + len) > b.length) || ((off + len) < 0))
        {
            throw new IndexOutOfBoundsException();
        }
        if (len == 0)
        {
            return 0;
        }

        int c = read();
        if (c == -1)
        {
            return -1;
        }
        b[off] = (byte) c;

        int count = 1;
        try
        {
            while (count < len)
            {
                c = read();
                if (c == -1)
                {
                    break;
                }
                b[off + count] = (byte) c;
                count++;
            }
        }
        catch (IOException ignored)
        {
            // Deliberate: as with java.io.InputStream, a failure after the
            // first byte ends this call early; the bytes already stored
            // are still reported to the caller.
        }
        return count;
    }

    /**
     * Skips over text content by reading and discarding it.
     *
     * @param n number of bytes of text content to skip; at most
     * <code>Integer.MAX_VALUE</code> bytes are skipped per call
     * @return the number of bytes actually skipped
     * @throws IllegalArgumentException if <code>n</code> is negative
     * @throws IOException if the underlying stream fails
     */
    public long skip( long n ) throws IOException
    {
        if (n < 0)
        {
            throw new IllegalArgumentException("negative skip length");
        }

        byte[] scratch = new byte[512];
        int max = (int) Math.min( n, Integer.MAX_VALUE );
        int total = 0;
        while (total < max)
        {
            int len = Math.min( max - total, scratch.length );
            len = read( scratch, 0, len );
            if (len == -1)
            {
                break;
            }
            total += len;
        }
        return total;
    }

    /**
     * Reads an entity reference (the leading <code>&amp;</code> has already
     * been consumed) and places its UTF-8 encoding in the pending output.
     * Numeric references (decimal, or hexadecimal after <code>x</code>)
     * and the five predefined entities are recognized. If the stream
     * ends before the closing semicolon, -1 is queued instead.
     *
     * @throws IOException if the underlying stream fails
     * @throws IllegalArgumentException if the entity is empty, unknown,
     * not a valid number, or outside the Unicode range
     */
    protected void collectEntity() throws IOException
    {
        StringBuffer strBuf = new StringBuffer(10);

        int b = super.read();
        while (b != ';' && b != -1)
        {
            strBuf.append( (char) b );
            b = super.read();
        }

        if (b == -1)
        {
            utf8Output[0] = -1;
            utf8OutputLength = 1;
        }
        else if (strBuf.length() > 0 && strBuf.charAt(0) == '#')
        {
            String digits = strBuf.substring(1);
            int entityValue;
            if (digits.startsWith("x") || digits.startsWith("X"))
            {
                entityValue = Integer.parseInt( digits.substring(1), 16 );
            }
            else
            {
                entityValue = Integer.parseInt( digits, 10 );
            }
            createUTF8Output( entityValue );
        }
        else
        {
            // Also reached for "&;": the empty name matches no entity.
            String entityString = strBuf.toString();
            int i = 0;
            while (i < stdFiveEntities.length
                && !entityString.equals( stdFiveEntities[i] ))
            {
                i++;
            }
            if (i == stdFiveEntities.length)
            {
                throw new IllegalArgumentException( "Unknown entity &"
                    + entityString + ";" );
            }
            utf8Output[0] = stdFiveValues[i];
            utf8OutputLength = 1;
        }
        utf8OutputPosition = 0;
    }

    /**
     * Encodes a code point as UTF-8 into the pending output buffer and
     * sets the pending length. The read position is left for the caller
     * to reset.
     *
     * @param value the code point to encode
     * @throws IllegalArgumentException if <code>value</code> is negative
     * or greater than 0x10ffff
     */
    protected void createUTF8Output( int value )
    {
        if (value < 0 || value > 0x10ffff)
        {
            throw new IllegalArgumentException(
                value + " outside Unicode range."
            );
        }

        if (value <= 0x7f)
        {
            utf8Output[0] = value & 0x7f;
            utf8OutputLength = 1;
        }
        else if (value <= 0x7ff)
        {
            utf8Output[0] = 0xc0 | ((value >> 6) & 0x1f);
            utf8Output[1] = 0x80 | (value & 0x3f);
            utf8OutputLength = 2;
        }
        else if (value <= 0xffff)
        {
            utf8Output[0] = 0xe0 | ((value >> 12) & 0xf);
            utf8Output[1] = 0x80 | ((value >> 6) & 0x3f);
            utf8Output[2] = 0x80 | (value & 0x3f);
            utf8OutputLength = 3;
        }
        else
        {
            utf8Output[0] = 0xf0 | ((value >> 18) & 0x7);
            utf8Output[1] = 0x80 | ((value >> 12) & 0x3f);
            utf8Output[2] = 0x80 | ((value >> 6) & 0x3f);
            utf8Output[3] = 0x80 | (value & 0x3f);
            utf8OutputLength = 4;
        }
    }

    /**
     * Reads a tag up to its closing <code>&gt;</code> (the opening
     * <code>&lt;</code> has already been consumed) and analyzes it.
     * Multi-byte UTF-8 sequences are decoded and every white space or
     * line separator character is replaced by a plain space. If the
     * stream ends inside the tag, -1 is queued as pending output.
     *
     * @throws IOException if the underlying stream fails
     */
    protected void collectTag() throws IOException
    {
        tagBuffer = new StringBuffer(50);

        int b = super.read();
        while (b != '>' && b != -1)
        {
            if (b > 127)
            {
                b = collectUTF8( b );
                if (b == -1)
                {
                    // Stream ended in the middle of a character.
                    break;
                }
            }

            if (b == 0x09 || b == 0x0a || b == 0x0d || b == 0x0085
                || b == 0x2028 || b == 0x2029)
            {
                b = 0x20;
            }
            appendCodePoint( tagBuffer, b );
            b = super.read();
        }

        if (b != -1)
        {
            analyzeTag( tagBuffer.toString() );
        }
        else
        {
            utf8Output[0] = -1;
            utf8OutputLength = 1;
            utf8OutputPosition = 0;
        }
    }

    /**
     * Appends a code point to a buffer, writing a surrogate pair for code
     * points above U+FFFF instead of truncating them to 16 bits. Values
     * above U+10FFFF are replaced by U+FFFD.
     */
    private static void appendCodePoint( StringBuffer buf, int codePoint )
    {
        if (codePoint > 0x10ffff)
        {
            buf.append( '\ufffd' );
        }
        else if (codePoint > 0xffff)
        {
            int offset = codePoint - 0x10000;
            buf.append( (char) (0xd800 + (offset >> 10)) );
            buf.append( (char) (0xdc00 + (offset & 0x3ff)) );
        }
        else
        {
            buf.append( (char) codePoint );
        }
    }

    /**
     * Decodes one multi-byte UTF-8 sequence whose first byte has already
     * been read. Continuation bytes are not validated.
     *
     * @param startByte the first byte of the sequence (greater than 127)
     * @return the decoded code point; U+FFFD if <code>startByte</code>
     * cannot start a sequence; -1 if the stream ends mid-sequence
     * @throws IOException if the underlying stream fails
     */
    protected int collectUTF8( int startByte ) throws IOException
    {
        int highBits = (startByte >> 4) & 0x0f;
        int[] utf8Buf = new int[4];
        int nUTF8;

        if (highBits < 12)
        {
            // 10xxxxxx is a continuation byte, not a lead byte. Consume
            // nothing more so the following bytes (possibly the closing
            // '>') are still seen by the caller.
            return 0xfffd;
        }
        else if (highBits == 12 || highBits == 13)
        {
            nUTF8 = 1;
        }
        else if (highBits == 14)
        {
            nUTF8 = 2;
        }
        else
        {
            nUTF8 = 3;
        }

        utf8Buf[0] = startByte;
        for (int i = 1; i <= nUTF8; i++)
        {
            int oneByte = super.read();
            if (oneByte == -1)
            {
                return -1;
            }
            utf8Buf[i] = oneByte;
        }

        int result;
        if (nUTF8 == 1)
        {
            result = ((utf8Buf[0] & 0x1f) << 6)
                | (utf8Buf[1] & 0x3f);
        }
        else if (nUTF8 == 2)
        {
            result = ((utf8Buf[0] & 0x0f) << 12)
                | ((utf8Buf[1] & 0x3f) << 6)
                | (utf8Buf[2] & 0x3f);
        }
        else
        {
            result = ((utf8Buf[0] & 0x07) << 18)
                | ((utf8Buf[1] & 0x3f) << 12)
                | ((utf8Buf[2] & 0x3f) << 6)
                | (utf8Buf[3] & 0x3f);
        }
        return result;
    }

    /**
     * Updates the capture and omit depths for one tag and queues the
     * post-process character of a capture element that is closing.
     * Declarations and processing instructions (tags starting with
     * <code>!</code> or <code>?</code>) are ignored. The first element
     * seen is treated as the root and must declare the text namespace.
     *
     * @param tag the tag text without the surrounding angle brackets
     * @throws IllegalArgumentException if no element name can be found in
     * the tag, or the root element does not declare the text namespace
     */
    protected void analyzeTag( String tag )
    {
        if (tag.startsWith("!") || tag.startsWith("?"))
        {
            return;
        }

        Matcher m = elementNamePattern.matcher( tag );
        if (!m.find())
        {
            throw new IllegalArgumentException( "Unknown tag <" +
                tag + ">");
        }

        // An element without a prefix has a null group; treat it as the
        // empty prefix so that it can match a default text namespace and
        // never causes a NullPointerException.
        String prefix = m.group(1);
        if (prefix == null)
        {
            prefix = "";
        }
        String name = m.group(2);

        if (rootElement)
        {
            Matcher ns = textNamespacePattern.matcher( tag );
            if (!ns.find())
            {
                throw new IllegalArgumentException(
                    "Cannot find namespace for text"
                );
            }
            textNamespace = ns.group(1);
            rootElement = false;
        }

        // An empty element such as <text:tab/> is both opening and closing.
        boolean isOpeningTag = !tag.startsWith("/");
        boolean isClosingTag = tag.startsWith("/") || tag.endsWith("/");

        if (!prefix.equals( textNamespace ))
        {
            return;
        }

        if (findTag( omitList, name ) >= 0)
        {
            if (isOpeningTag)
            {
                omitDepth++;
            }
            if (isClosingTag)
            {
                omitDepth--;
            }
        }

        int position = findTag( captureList, name );
        if (position >= 0)
        {
            ElementPostProcess elementInfo =
                (ElementPostProcess) captureList.get( position );
            if (isOpeningTag)
            {
                captureDepth++;
            }
            if (isClosingTag)
            {
                int postProcess = elementInfo.getPostProcess();
                if (postProcess != '\0' && omitDepth == 0)
                {
                    // Encode rather than store the raw value, so that a
                    // non-ASCII post-process character is still returned
                    // as valid bytes.
                    createUTF8Output( postProcess );
                    utf8OutputPosition = 0;
                }
                captureDepth--;
            }
        }
    }

    /**
     * Searches a list of <code>ElementPostProcess</code> entries by name.
     *
     * @param list the list to search
     * @param name the element's local name
     * @return the index of the first entry with that name, or -1 if none
     */
    private int findTag( ArrayList list, String name )
    {
        for (int i = 0; i < list.size(); i++)
        {
            if (((ElementPostProcess) list.get(i)).getName().equals( name ))
            {
                return i;
            }
        }
        return -1;
    }
}