1 31 package org.pdfbox.pdfparser; 32 33 import java.io.BufferedInputStream ; 34 import java.io.InputStream ; 35 import java.io.IOException ; 36 import java.io.OutputStream ; 37 38 import java.util.ArrayList ; 39 import java.util.List ; 40 41 import org.pdfbox.io.ByteArrayPushBackInputStream; 42 import org.pdfbox.io.PushBackInputStream; 43 import org.pdfbox.io.RandomAccess; 44 45 import org.pdfbox.cos.COSArray; 46 import org.pdfbox.cos.COSBase; 47 import org.pdfbox.cos.COSBoolean; 48 import org.pdfbox.cos.COSDictionary; 49 import org.pdfbox.cos.COSDocument; 50 import org.pdfbox.cos.COSInteger; 51 import org.pdfbox.cos.COSName; 52 import org.pdfbox.cos.COSNull; 53 import org.pdfbox.cos.COSNumber; 54 import org.pdfbox.cos.COSObject; 55 import org.pdfbox.cos.COSStream; 56 import org.pdfbox.cos.COSString; 57 58 import org.pdfbox.persistence.util.COSObjectKey; 59 60 67 public abstract class BaseParser 68 { 69 72 public static final byte[] ENDSTREAM = 73 new byte[] {101,110,100,115,116,114,101,97,109}; 75 78 public static final String DEF = "def"; 79 80 83 protected PushBackInputStream pdfSource; 85 86 91 private List xrefs = new ArrayList (); 92 93 private COSDocument document; 94 95 102 public BaseParser( InputStream input) throws IOException 103 { 104 pdfSource = new PushBackInputStream( new BufferedInputStream ( input, 16384 ), 4096 ); 106 } 107 108 115 protected BaseParser(byte[] input) throws IOException 116 { 117 pdfSource = new ByteArrayPushBackInputStream(input); 118 } 119 120 125 public void setDocument( COSDocument doc ) 126 { 127 document = doc; 128 } 129 130 private static boolean isHexDigit(char ch) 131 { 132 return (ch >= '0' && ch <= '9') || 133 (ch >= 'a' && ch <= 'f') || 134 (ch >= 'A' && ch <= 'F'); 135 } 139 140 147 private COSBase parseCOSDictionaryValue() throws IOException 148 { 149 COSBase retval = null; 150 COSBase number = parseDirObject(); 151 skipSpaces(); 152 char next = (char)pdfSource.peek(); 153 if( next >= '0' && next <= '9' ) 154 { 155 COSBase generationNumber = parseDirObject(); 156 skipSpaces(); 157 char r = (char)pdfSource.read(); 158 if( r != 'R' ) 159 { 160 throw new IOException ( "expected='R' actual='" + r + "' " + pdfSource ); 161 } 162 COSObjectKey key = new COSObjectKey(((COSInteger) number).intValue(), 163 ((COSInteger) generationNumber).intValue()); 164 retval = document.getObjectFromPool(key); 165 } 166 else 167 { 168 retval = number; 169 } 170 return retval; 171 } 172 173 180 protected COSDictionary parseCOSDictionary() throws IOException 181 { 182 char c = (char)pdfSource.read(); 183 if( c != '<') 184 { 185 throw new IOException ( "expected='<' actual='" + c + "'" ); 186 } 187 c = (char)pdfSource.read(); 188 if( c != '<') 189 { 190 throw new IOException ( "expected='<' actual='" + c + "' " + pdfSource ); 191 } 192 skipSpaces(); 193 COSDictionary obj = new COSDictionary(); 194 boolean done = false; 195 while( !done ) 196 { 197 skipSpaces(); 198 c = (char)pdfSource.peek(); 199 if( c == '>') 200 { 201 done = true; 202 } 203 else 204 { 205 COSName key = parseCOSName(); 206 COSBase value = parseCOSDictionaryValue(); 207 skipSpaces(); 208 if( ((char)pdfSource.peek()) == 'd' ) 209 { 210 String potentialDEF = readString(); 213 if( !potentialDEF.equals( DEF ) ) 214 { 215 pdfSource.unread( potentialDEF.getBytes() ); 216 } 217 else 218 { 219 skipSpaces(); 220 } 221 } 222 223 if( value == null ) 224 { 225 throw new IOException ("Bad Dictionary Declaration " + pdfSource ); 226 } 227 obj.setItem( key, value ); 228 } 229 } 230 char ch = (char)pdfSource.read(); 231 if( ch != '>' ) 232 { 233 throw new IOException ( "expected='>' actual='" + ch + "'" ); 234 } 235 ch = (char)pdfSource.read(); 236 if( ch != '>' ) 237 { 238 throw new IOException ( "expected='>' actual='" + ch + "'" ); 239 } 240 return obj; 241 } 242 243 253 protected COSStream parseCOSStream( COSDictionary dic, RandomAccess file ) throws IOException 254 { 255 COSStream stream = new COSStream( dic, file ); 256 OutputStream out = null; 257 try 258 { 259 String streamString = readString(); 260 262 if (!streamString.equals("stream")) 263 { 264 throw new IOException ("expected='stream' actual='" + streamString + "'"); 265 } 266 267 270 int whitespace = pdfSource.read(); 271 272 while (whitespace == 0x20) 276 { 277 whitespace = pdfSource.read(); 278 } 279 280 if( whitespace == 0x0D ) 281 { 282 whitespace = pdfSource.read(); 283 if( whitespace != 0x0A ) 284 { 285 pdfSource.unread( whitespace ); 286 } 291 } 292 else if (whitespace == 0x0A) 293 { 294 } 296 else 297 { 298 pdfSource.unread( whitespace ); 302 305 } 306 307 308 COSBase streamLength = dic.getDictionaryObject(COSName.LENGTH); 309 319 320 323 out = stream.createFilteredStream( streamLength ); 325 String endStream = null; 326 348 readUntilEndStream( out ); 349 350 skipSpaces(); 351 endStream = readString(); 352 353 if (!endStream.equals("endstream")) 354 { 355 readUntilEndStream( out ); 356 endStream = readString(); 357 if( !endStream.equals( "endstream" ) ) 358 { 359 throw new IOException ("expected='endstream' actual='" + endStream + "' " + pdfSource); 360 } 361 } 362 } 363 finally 364 { 365 if( out != null ) 366 { 367 out.close(); 368 } 369 } 370 return stream; 371 } 372 373 private void readUntilEndStream( OutputStream out ) throws IOException 374 { 375 int currentIndex = 0; 376 int byteRead = 0; 377 int additionalBytes=0; 379 byte[] buffer = new byte[ENDSTREAM.length+additionalBytes]; 380 int writeIndex = 0; 381 while(!cmpCircularBuffer( buffer, currentIndex, ENDSTREAM ) && byteRead != -1 ) 382 { 383 writeIndex = currentIndex - buffer.length; 384 if( writeIndex >= 0 ) 385 { 386 out.write( buffer[writeIndex%buffer.length] ); 387 } 388 byteRead = pdfSource.read(); 389 buffer[currentIndex%buffer.length] = (byte)byteRead; 390 currentIndex++; 391 } 392 393 411 412 426 pdfSource.unread( ENDSTREAM ); 427 428 } 429 430 434 private boolean cmpCircularBuffer( byte[] buffer, int currentIndex, byte[] compareTo ) 435 { 436 int cmpLen = compareTo.length; 437 int buflen = buffer.length; 438 boolean match = true; 439 int off = currentIndex-cmpLen; 440 if( off < 0 ) 441 { 442 match = false; 443 } 444 for( int i=0; match && i<cmpLen; ++i ) 445 { 446 match = buffer[(off+i)%buflen] == compareTo[i]; 447 } 448 return match; 449 } 450 451 458 protected COSString parseCOSString() throws IOException 459 { 460 char nextChar = (char)pdfSource.read(); 461 COSString retval = new COSString(); 462 char openBrace; 463 char closeBrace; 464 if( nextChar == '(' ) 465 { 466 openBrace = '('; 467 closeBrace = ')'; 468 } 469 else if( nextChar == '<' ) 470 { 471 openBrace = '<'; 472 closeBrace = '>'; 473 } 474 else 475 { 476 throw new IOException ( "parseCOSString string should start with '(' or '<' and not '" + 477 nextChar + "' " + pdfSource ); 478 } 479 480 int braces = 1; 483 int c = pdfSource.read(); 484 while( braces > 0 && c != -1) 485 { 486 char ch = (char)c; 487 int nextc = -2; 493 if(ch == closeBrace) 494 { 495 braces--; 496 byte[] nextThreeBytes = new byte[3]; 497 int amountRead = pdfSource.read(nextThreeBytes); 498 499 if( amountRead == 3 ) 516 { 517 if( nextThreeBytes[0] == 0x0d && 518 nextThreeBytes[1] == 0x0a && 519 nextThreeBytes[2] == 0x2f ) 520 { 521 braces = 0; 522 } 523 } 524 pdfSource.unread( nextThreeBytes, 0, amountRead ); 525 if( braces != 0 ) 526 { 527 retval.append( ch ); 528 } 529 } 530 else if( ch == openBrace ) 531 { 532 braces++; 533 retval.append( ch ); 534 } 535 else if( ch == '\\' ) 536 { 537 char next = (char)pdfSource.read(); 539 switch(next) 540 { 541 case 'n': 542 retval.append( '\n' ); 543 break; 544 case 'r': 545 retval.append( '\r' ); 546 break; 547 case 't': 548 retval.append( '\t' ); 549 break; 550 case 'b': 551 retval.append( '\b' ); 552 break; 553 case 'f': 554 retval.append( '\f' ); 555 break; 556 case '(': 557 case ')': 558 case '\\': 559 retval.append( next ); 560 break; 561 case 10: 562 case 13: 563 c = pdfSource.read(); 565 while( isEOL(c) && c != -1) 566 { 567 c = pdfSource.read(); 568 } 569 nextc = c; 570 break; 571 case '0': 572 case '1': 573 case '2': 574 case '3': 575 case '4': 576 case '5': 577 case '6': 578 case '7': 579 { 580 StringBuffer octal = new StringBuffer (); 581 octal.append( next ); 582 c = pdfSource.read(); 583 char digit = (char)c; 584 if( digit >= '0' && digit <= '7' ) 585 { 586 octal.append( digit ); 587 c = pdfSource.read(); 588 digit = (char)c; 589 if( digit >= '0' && digit <= '7' ) 590 { 591 octal.append( digit ); 592 } 593 else 594 { 595 nextc = c; 596 } 597 } 598 else 599 { 600 nextc = c; 601 } 602 603 int character = 0; 604 try 605 { 606 character = Integer.parseInt( octal.toString(), 8 ); 607 } 608 catch( NumberFormatException e ) 609 { 610 throw new IOException ( "Error: Expected octal character, actual='" + octal + "'" ); 611 } 612 retval.append( character ); 613 break; 614 } 615 default: 616 { 617 retval.append( '\\' ); 618 retval.append( next ); 619 } 624 } 625 } 626 else 627 { 628 if( openBrace == '<' ) 629 { 630 if( isHexDigit(ch) ) 631 { 632 retval.append( ch ); 633 } 634 } 635 else 636 { 637 retval.append( ch ); 638 } 639 } 640 if (nextc != -2) 641 { 642 c = nextc; 643 } 644 else 645 { 646 c = pdfSource.read(); 647 } 648 } 649 if (c != -1) 650 { 651 pdfSource.unread(c); 652 } 653 if( openBrace == '<' ) 654 { 655 retval = COSString.createFromHexString( retval.getString() ); 656 } 657 return retval; 658 } 659 660 667 protected COSArray parseCOSArray() throws IOException 668 { 669 char ch = (char)pdfSource.read(); 670 if( ch != '[') 671 { 672 throw new IOException ( "expected='[' actual='" + ch + "'" ); 673 } 674 COSArray po = new COSArray(); 675 COSBase pbo = null; 676 skipSpaces(); 677 int i = 0; 678 while( ((i = pdfSource.peek()) > 0) && ((char)i != ']') ) 679 { 680 pbo = parseDirObject(); 681 if( pbo instanceof COSObject ) 682 { 683 COSInteger genNumber = (COSInteger)po.remove( po.size() -1 ); 684 COSInteger number = (COSInteger)po.remove( po.size() -1 ); 685 COSObjectKey key = new COSObjectKey(number.intValue(), genNumber.intValue()); 686 pbo = document.getObjectFromPool(key); 687 } 688 if( pbo != null ) 689 { 690 po.add( pbo ); 691 } 692 else 693 { 694 } 696 skipSpaces(); 697 } 698 pdfSource.read(); skipSpaces(); 700 return po; 701 } 702 703 709 protected boolean isEndOfName(char ch) 710 { 711 return (ch == ' ' || ch == 13 || ch == 10 || ch == 9 || ch == '>' || ch == '<' 712 || ch == '[' || ch =='/' || ch ==']' || ch ==')' || ch =='(' || 713 ch == -1 ); 715 } 716 717 724 protected COSName parseCOSName() throws IOException 725 { 726 COSName retval = null; 727 int c = pdfSource.read(); 728 if( (char)c != '/') 729 { 730 throw new IOException ("expected='/' actual='" + (char)c + "'-" + c + " " + pdfSource ); 731 } 732 StringBuffer buffer = new StringBuffer (); 734 c = pdfSource.read(); 735 while( c != -1 ) 736 { 737 char ch = (char)c; 738 if(ch == '#') 739 { 740 char ch1 = (char)pdfSource.read(); 741 char ch2 = (char)pdfSource.read(); 742 743 if (isHexDigit(ch1) && isHexDigit(ch2)) 751 { 752 String hex = "" + ch1 + ch2; 753 try 754 { 755 buffer.append( (char) Integer.parseInt(hex, 16)); 756 } 757 catch (NumberFormatException e) 758 { 759 throw new IOException ("Error: expected hex number, actual='" + hex + "'"); 760 } 761 c = pdfSource.read(); 762 } 763 else 764 { 765 pdfSource.unread(ch2); 766 c = ch1; 767 buffer.append( ch ); 768 } 769 } 770 else if (isEndOfName(ch)) 771 { 772 break; 773 } 774 else 775 { 776 buffer.append( ch ); 777 c = pdfSource.read(); 778 } 779 } 780 if (c != -1) 781 { 782 pdfSource.unread(c); 783 } 784 retval = COSName.getPDFName( buffer.toString() ); 785 return retval; 786 } 787 788 795 protected COSBoolean parseBoolean() throws IOException 796 { 797 COSBoolean retval = null; 798 char c = (char)pdfSource.peek(); 799 if( c == 't' ) 800 { 801 byte[] trueArray = new byte[ 4 ]; 802 int amountRead = pdfSource.read( trueArray, 0, 4 ); 803 String trueString = new String ( trueArray, 0, amountRead ); 804 if( !trueString.equals( "true" ) ) 805 { 806 throw new IOException ( "Error parsing boolean: expected='true' actual='" + trueString + "'" ); 807 } 808 else 809 { 810 retval = COSBoolean.TRUE; 811 } 812 } 813 else if( c == 'f' ) 814 { 815 byte[] falseArray = new byte[ 5 ]; 816 int amountRead = pdfSource.read( falseArray, 0, 5 ); 817 String falseString = new String ( falseArray, 0, amountRead ); 818 if( !falseString.equals( "false" ) ) 819 { 820 throw new IOException ( "Error parsing boolean: expected='true' actual='" + falseString + "'" ); 821 } 822 else 823 { 824 retval = COSBoolean.FALSE; 825 } 826 } 827 else 828 { 829 throw new IOException ( "Error parsing boolean expected='t or f' actual='" + c + "'" ); 830 } 831 return retval; 832 } 833 834 841 protected COSBase parseDirObject() throws IOException 842 { 843 COSBase retval = null; 844 845 skipSpaces(); 846 int nextByte = pdfSource.peek(); 847 char c = (char)nextByte; 848 switch(c) 849 { 850 case '<': 851 { 852 int leftBracket = pdfSource.read(); c = (char)pdfSource.peek(); pdfSource.unread( leftBracket ); 855 if(c == '<') 856 { 857 858 retval = parseCOSDictionary(); 859 skipSpaces(); 860 } 861 else 862 { 863 retval = parseCOSString(); 864 } 865 break; 866 } 867 case '[': { 869 retval = parseCOSArray(); 870 break; 871 } 872 case '(': 873 retval = parseCOSString(); 874 break; 875 case '/': retval = parseCOSName(); 877 break; 878 case 'n': { 880 String nullString = readString(); 881 if( !nullString.equals( "null") ) 882 { 883 throw new IOException ("Expected='null' actual='" + nullString + "'"); 884 } 885 retval = COSNull.NULL; 886 break; 887 } 888 case 't': 889 { 890 byte[] trueBytes = new byte[4]; 891 int amountRead = pdfSource.read( trueBytes, 0, 4 ); 892 String trueString = new String ( trueBytes, 0, amountRead ); 893 if( trueString.equals( "true" ) ) 894 { 895 retval = COSBoolean.TRUE; 896 } 897 else 898 { 899 throw new IOException ( "expected true actual='" + trueString + "' " + pdfSource ); 900 } 901 break; 902 } 903 case 'f': 904 { 905 byte[] falseBytes = new byte[5]; 906 int amountRead = pdfSource.read( falseBytes, 0, 5 ); 907 String falseString = new String ( falseBytes, 0, amountRead ); 908 if( falseString.equals( "false" ) ) 909 { 910 retval = COSBoolean.FALSE; 911 } 912 else 913 { 914 throw new IOException ( "expected false actual='" + falseString + "' " + pdfSource ); 915 } 916 break; 917 } 918 case 'R': 919 pdfSource.read(); 920 retval = new COSObject(null); 921 break; 922 case (char)-1: 923 return null; 924 default: 925 { 926 if( Character.isDigit(c) || c == '-' || c == '+' || c == '.') 927 { 928 StringBuffer buf = new StringBuffer (); 929 int ic = pdfSource.read(); 930 c = (char)ic; 931 while( Character.isDigit( c )|| 932 c == '-' || 933 c == '+' || 934 c == '.' || 935 c == 'E' || 936 c == 'e' ) 937 { 938 buf.append( c ); 939 ic = pdfSource.read(); 940 c = (char)ic; 941 } 942 if( ic != -1 ) 943 { 944 pdfSource.unread( ic ); 945 } 946 retval = COSNumber.get( buf.toString() ); 947 } 948 else 949 { 950 String badString = readString(); 954 if( badString == null || badString.length() == 0 ) 957 { 958 int peek = pdfSource.peek(); 959 throw new IOException ( "Unknown dir object c='" + c + 961 "' cInt=" + (int)c + " peek='" + (char)peek + "' peekInt=" + peek + " " + pdfSource ); 962 } 963 964 } 965 } 966 } 967 return retval; 968 } 969 970 977 protected String readString() throws IOException 978 { 979 skipSpaces(); 980 StringBuffer buffer = new StringBuffer (); 981 int c = pdfSource.read(); 982 while( !isEndOfName((char)c) && !isClosing(c) && c != -1 ) 983 { 984 buffer.append( (char)c ); 985 c = pdfSource.read(); 986 } 987 if (c != -1) 988 { 989 pdfSource.unread(c); 990 } 991 return buffer.toString(); 992 } 993 994 1003 protected String readExpectedString( String theString ) throws IOException 1004 { 1005 int c = pdfSource.read(); 1006 while( isWhitespace(c) && c != -1) 1007 { 1008 c = pdfSource.read(); 1009 } 1010 StringBuffer buffer = new StringBuffer ( theString.length() ); 1011 int charsRead = 0; 1012 while( !isEOL(c) && c != -1 && charsRead < theString.length() ) 1013 { 1014 char next = (char)c; 1015 buffer.append( next ); 1016 if( theString.charAt( charsRead ) == next ) 1017 { 1018 charsRead++; 1019 } 1020 else 1021 { 1022 throw new IOException ( "Error: Expected to read '" + theString + 1023 "' instead started reading '" +buffer.toString() + "'" ); 1024 } 1025 c = pdfSource.read(); 1026 } 1027 while( isEOL(c) && c != -1 ) 1028 { 1029 c = pdfSource.read(); 1030 } 1031 if (c != -1) 1032 { 1033 pdfSource.unread(c); 1034 } 1035 return buffer.toString(); 1036 } 1037 1038 1047 protected String readString( int length ) throws IOException 1048 { 1049 skipSpaces(); 1050 1051 int c = pdfSource.read(); 1052 1053 StringBuffer buffer = new StringBuffer (length); 1056 while( !isWhitespace(c) && !isClosing(c) && c != -1 && buffer.length() < length && 1057 c != '[' && 1058 c != '<' && 1059 c != '(' && 1060 c != '/' ) 1061 { 1062 buffer.append( (char)c ); 1063 c = pdfSource.read(); 1064 } 1065 if (c != -1) 1066 { 1067 pdfSource.unread(c); 1068 } 1069 return buffer.toString(); 1070 } 1071 1072 1079 protected boolean isClosing() throws IOException 1080 { 1081 return isClosing(pdfSource.peek()); 1082 } 1083 1084 1090 protected boolean isClosing(int c) 1091 { 1092 return c == ']'; 1093 } 1094 1095 1102 protected String readLine() throws IOException 1103 { 1104 int c = pdfSource.read(); 1105 while(isWhitespace(c) && c != -1) 1106 { 1107 c = pdfSource.read(); 1108 } 1109 StringBuffer buffer = new StringBuffer ( 11 ); 1110 1111 while( !isEOL(c) && c != -1 ) 1112 { 1113 buffer.append( (char)c ); 1114 c = pdfSource.read(); 1115 } 1116 while( isEOL(c) && c != -1 ) 1117 { 1118 c = pdfSource.read(); 1119 } 1120 if (c != -1) 1121 { 1122 pdfSource.unread(c); 1123 } 1124 return buffer.toString(); 1125 } 1126 1127 1134 protected boolean isEOL() throws IOException 1135 { 1136 return isEOL(pdfSource.peek()); 1137 } 1138 1139 1145 protected boolean isEOL(int c) 1146 { 1147 return c == 10 || c == 13; 1148 } 1149 1150 1157 protected boolean isWhitespace() throws IOException 1158 { 1159 return isWhitespace( pdfSource.peek() ); 1160 } 1161 1162 1169 protected boolean isWhitespace( int c ) 1170 { 1171 return c == 0 || c == 9 || c == 12 || c == 10 1172 || c == 13 || c == 32; 1173 } 1174 1175 1180 protected void skipSpaces() throws IOException 1181 { 1182 int c = pdfSource.read(); 1184 while(c == 0 || c == 9 || c == 12 || c == 10 1186 || c == 13 || c == 32 || c == 37) { 1188 if ( c == 37 ) 1189 { 1190 c = pdfSource.read(); 1192 while(!isEOL(c) && c != -1) 1193 { 1194 c = pdfSource.read(); 1195 } 1196 } 1197 else 1198 { 1199 c = pdfSource.read(); 1200 } 1201 } 1202 if (c != -1) 1203 { 1204 pdfSource.unread(c); 1205 } 1206 } 1208 1209 1216 protected int readInt() throws IOException 1217 { 1218 skipSpaces(); 1219 int retval = 0; 1220 1221 int lastByte = 0; 1222 StringBuffer intBuffer = new StringBuffer (); 1223 while( (lastByte = pdfSource.read() ) != 32 && 1224 lastByte != 10 && 1225 lastByte != 13 && 1226 lastByte != 0 && lastByte != -1 ) 1228 { 1229 intBuffer.append( (char)lastByte ); 1230 } 1231 try 1232 { 1233 retval = Integer.parseInt( intBuffer.toString() ); 1234 } 1235 catch( NumberFormatException e ) 1236 { 1237 throw new IOException ( "Error: Expected an integer type, actual='" + intBuffer + "'" ); 1238 } 1239 return retval; 1240 } 1241 1242 1247 public void addXref( PDFXref xref ) 1248 { 1249 xrefs.add(xref); 1250 } 1251 1252 1257 public List getXrefs() 1258 { 1259 return xrefs; 1260 } 1261} | Popular Tags |