package org.pdfbox.pdfparser;

import java.io.File;
import java.io.InputStream;
import java.io.IOException;

import java.util.Iterator;

import org.pdfbox.cos.COSBase;
import org.pdfbox.cos.COSDictionary;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.cos.COSObject;
import org.pdfbox.cos.COSStream;
import org.pdfbox.exceptions.WrappedIOException;
import org.pdfbox.io.RandomAccess;

import org.pdfbox.pdmodel.PDDocument;

import org.pdfbox.pdmodel.fdf.FDFDocument;

import org.pdfbox.persistence.util.COSObjectKey;

/**
 * Parses a PDF byte stream into a {@link COSDocument}.
 *
 * Typical use: construct the parser, optionally call
 * {@link #setTempDirectory(File)}, call {@link #parse()}, then fetch the
 * result with {@link #getDocument()}, {@link #getPDDocument()} or
 * {@link #getFDFDocument()}.
 */
public class PDFParser extends BaseParser
{
    /** Byte value of an ASCII space, pushed back as a token separator. */
    private static final int SPACE_BYTE = 32;

    /** Marker that introduces the version number in the header line. */
    private static final String PDF_HEADER = "%PDF-";

    /**
     * Encoding used when a token is pushed back into the byte stream. The
     * tokens pushed back by this class are plain ASCII, so a fixed single-byte
     * charset keeps the bytes independent of the platform default charset.
     */
    private static final String PUSHBACK_ENCODING = "ISO-8859-1";

    /** Number of bytes inspected after a "%%EOF" marker when looking for another one. */
    private static final int EOF_LOOKAHEAD = 1000;

    /** The document being built; null until {@link #parse()} has been called. */
    private COSDocument document;

    /** Directory handed to the COSDocument constructor; null selects the default constructor. */
    private File tempDirectory = null;

    /** Optional storage handed to the COSDocument constructor; takes precedence over tempDirectory. */
    private RandomAccess raf = null;

    /**
     * Creates a parser that reads from the given stream.
     *
     * @param input The stream containing the PDF data.
     *
     * @throws IOException If the underlying parser cannot be set up.
     */
    public PDFParser( InputStream input ) throws IOException
    {
        this( input, null );
    }

    /**
     * Creates a parser that reads from the given stream.
     *
     * @param input The stream containing the PDF data.
     * @param rafi Passed to the COSDocument constructor by {@link #parse()}
     *             when not null; may be null.
     *
     * @throws IOException If the underlying parser cannot be set up.
     */
    public PDFParser( InputStream input, RandomAccess rafi )
        throws IOException
    {
        super( input );
        this.raf = rafi;
    }

    /**
     * Sets the directory handed to the COSDocument constructor. It is only
     * consulted by {@link #parse()}, and only when no RandomAccess was given
     * to the constructor.
     *
     * @param tmpDir The directory to use, or null for the COSDocument default.
     */
    public void setTempDirectory( File tmpDir )
    {
        tempDirectory = tmpDir;
    }

    /**
     * Parses the stream and populates the COSDocument.
     *
     * The header line is read first and the version is taken from the (at
     * most) three characters that follow "%PDF-". Objects and xref sections
     * are then read until {@link #parseObject()} returns null. If no trailer
     * was found, one is assembled from all objects of type "XRef". Object
     * streams are dereferenced unless the document is encrypted.
     *
     * If anything fails the document is closed before the error is rethrown;
     * the input source is closed in every case.
     *
     * @throws IOException If the data cannot be parsed. Non-IOException
     *         failures are rethrown wrapped in a WrappedIOException.
     */
    public void parse() throws IOException
    {
        try
        {
            if( raf != null )
            {
                document = new COSDocument( raf );
            }
            else if( tempDirectory != null )
            {
                document = new COSDocument( tempDirectory );
            }
            else
            {
                document = new COSDocument();
            }
            setDocument( document );

            String header = readLine();
            document.setHeaderString( header );

            if( header.length() < PDF_HEADER.length()+1 )
            {
                throw new IOException( "Error: Header is corrupt '" + header + "'" );
            }

            // Tolerate bytes in front of the marker on the header line.
            int headerStart = header.indexOf( PDF_HEADER );
            if( headerStart > 0 )
            {
                header = header.substring( headerStart, header.length() );
            }

            try
            {
                int versionStart = PDF_HEADER.length();
                int versionEnd = Math.min( header.length(), PDF_HEADER.length()+3 );
                float pdfVersion = Float.parseFloat( header.substring( versionStart, versionEnd ) );
                document.setVersion( pdfVersion );
            }
            catch( NumberFormatException e )
            {
                IOException versionError = new IOException( "Error getting pdf version:" + e );
                // Keep the original failure reachable for diagnostics.
                versionError.initCause( e );
                throw versionError;
            }

            skipHeaderFillBytes();

            Object nextObject;
            boolean wasLastParsedObjectAnXref = false;
            try
            {
                while( (nextObject = parseObject()) != null )
                {
                    if( nextObject instanceof PDFXref )
                    {
                        PDFXref xref = (PDFXref)nextObject;
                        addXref( xref );
                        wasLastParsedObjectAnXref = true;
                    }
                    else
                    {
                        wasLastParsedObjectAnXref = false;
                    }
                    skipSpaces();
                }
                if( document.getTrailer() == null )
                {
                    // No classic trailer was seen: merge the dictionaries of
                    // every object of type "XRef" into a substitute trailer.
                    COSDictionary trailer = new COSDictionary();
                    Iterator xrefIter = document.getObjectsByType( "XRef" ).iterator();
                    while( xrefIter.hasNext() )
                    {
                        COSStream next = (COSStream)((COSObject)xrefIter.next()).getObject();
                        trailer.addAll( next );
                    }
                    document.setTrailer( trailer );
                }
                if( !document.isEncrypted() )
                {
                    document.dereferenceObjectStreams();
                }
            }
            catch( IOException e )
            {
                // A failure directly after an xref section is deliberately
                // ignored (presumably unparseable data trailing the last
                // section). NOTE(review): on this path the trailer fallback
                // and the object stream dereferencing above are skipped too.
                if( !wasLastParsedObjectAnXref )
                {
                    throw e;
                }
            }
        }
        catch( Throwable t )
        {
            // Release the partially built document before reporting the error.
            if( document != null )
            {
                document.close();
            }
            if( t instanceof IOException )
            {
                throw (IOException)t;
            }
            else
            {
                throw new WrappedIOException( t );
            }
        }
        finally
        {
            pdfSource.close();
        }
    }

    /**
     * Skips one line after the header when the next non-space character is
     * not a digit (presumably the comment line some producers write after the
     * header; an object definition would start with a digit).
     *
     * @throws IOException If the stream cannot be read.
     */
    protected void skipHeaderFillBytes() throws IOException
    {
        skipSpaces();
        int c = pdfSource.peek();

        if( !Character.isDigit( (char)c ) )
        {
            readLine();
        }
    }

    /**
     * Returns the document built by {@link #parse()}.
     *
     * @return The parsed document.
     *
     * @throws IOException If {@link #parse()} has not been called yet.
     */
    public COSDocument getDocument() throws IOException
    {
        if( document == null )
        {
            throw new IOException( "You must call parse() before calling getDocument()" );
        }
        return document;
    }

    /**
     * Returns the parsed document wrapped in a new PDDocument.
     *
     * @return A PDDocument wrapping the parsed COSDocument.
     *
     * @throws IOException If {@link #parse()} has not been called yet.
     */
    public PDDocument getPDDocument() throws IOException
    {
        return new PDDocument( getDocument() );
    }

    /**
     * Returns the parsed document wrapped in a new FDFDocument.
     *
     * @return An FDFDocument wrapping the parsed COSDocument.
     *
     * @throws IOException If {@link #parse()} has not been called yet.
     */
    public FDFDocument getFDFDocument() throws IOException
    {
        return new FDFDocument( getDocument() );
    }

    /**
     * Reads the next top level item from the stream: either an xref/trailer/
     * startxref/%%EOF group or an indirect object definition.
     *
     * @return A PDFXref when an xref section or trailer was read, the pooled
     *         COSObject when an object definition was read, or null when the
     *         stream is exhausted or only a "startxref ... %%EOF" group was
     *         read.
     *
     * @throws IOException If the data does not have the expected structure.
     */
    private Object parseObject() throws IOException
    {
        Object object = null;
        skipSpaces();
        char peekedChar = (char)pdfSource.peek();
        // Discard stray tokens starting with 'e' (e.g. a leftover "endobj").
        while( peekedChar == 'e' )
        {
            readString();
            skipSpaces();
            peekedChar = (char)pdfSource.peek();
        }

        if( pdfSource.isEOF() )
        {
            // Nothing left to read; null tells the caller to stop.
        }
        else if( peekedChar == 'x' ||
                 peekedChar == 't' ||
                 peekedChar == 's' )
        {
            // 'x' -> xref, 't' -> trailer, 's' -> startxref
            if( peekedChar == 'x' || peekedChar == 't' )
            {
                object = parseXrefSection();
            }

            if( peekedChar == 'x' || peekedChar == 's' )
            {
                skipSpaces();
                while( pdfSource.peek() == 'x' )
                {
                    parseXrefSection();
                }
                String startxref = readString();
                if( !startxref.equals( "startxref" ) )
                {
                    throw new IOException( "expected='startxref' actual='" + startxref + "' " + pdfSource );
                }
                skipSpaces();
                // The offset that follows "startxref" is read and discarded.
                readInt();
            }

            String eof = readExpectedString( "%%EOF" );
            if( eof.indexOf( "%%EOF" )== -1 && !pdfSource.isEOF() )
            {
                throw new IOException( "expected='%%EOF' actual='" + eof + "' next=" + readString() +
                                       " next=" +readString() );
            }
            else if( !pdfSource.isEOF() )
            {
                discardTrailingDataWithoutEofMarker();
            }
        }
        else
        {
            int number = -1;
            int genNum = -1;
            String objectKey = null;
            boolean missingObjectNumber = false;
            try
            {
                char peeked = (char)pdfSource.peek();
                if( peeked == '<' )
                {
                    // A dictionary without the leading "n g obj" header.
                    missingObjectNumber = true;
                }
                else
                {
                    number = readInt();
                }
            }
            catch( IOException e )
            {
                // Deliberate single retry: the second attempt either succeeds
                // or reports the failure to the caller.
                number = readInt();
            }
            if( !missingObjectNumber )
            {
                skipSpaces();
                genNum = readInt();

                objectKey = readString( 3 );
                if( !objectKey.equals( "obj" ) )
                {
                    throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource );
                }
            }
            else
            {
                number = -1;
                genNum = -1;
            }

            skipSpaces();
            COSBase pb = parseDirObject();
            String endObjectKey = readString();
            if( endObjectKey.equals( "stream" ) )
            {
                // Push " stream" back so parseCOSStream sees the keyword itself.
                pdfSource.unread( endObjectKey.getBytes( PUSHBACK_ENCODING ) );
                pdfSource.unread( ' ' );
                if( pb instanceof COSDictionary )
                {
                    pb = parseCOSStream( (COSDictionary)pb, getDocument().getScratchFile() );
                }
                else
                {
                    throw new IOException("stream not preceded by dictionary");
                }
                endObjectKey = readString();
            }
            COSObjectKey key = new COSObjectKey( number, genNum );
            COSObject pdfObject = document.getObjectFromPool( key );
            object = pdfObject;
            pdfObject.setObject( pb );

            if( !endObjectKey.equals( "endobj" ) )
            {
                if( !pdfSource.isEOF() )
                {
                    try
                    {
                        // A number here means "endobj" is missing and the next
                        // object has already started: push the token back so
                        // the next call reads it as an object number.
                        Float.parseFloat( endObjectKey );
                        pdfSource.unread( SPACE_BYTE );
                        pdfSource.unread( endObjectKey.getBytes( PUSHBACK_ENCODING ) );
                    }
                    catch( NumberFormatException e )
                    {
                        // Not a number: allow one unexpected token, then an
                        // optional closing character, before "endobj".
                        String secondEndObjectKey = readString();
                        if( !secondEndObjectKey.equals( "endobj" ) )
                        {
                            if( isClosing() )
                            {
                                pdfSource.read();
                            }
                            skipSpaces();
                            String thirdPossibleEndObj = readString();
                            if( !thirdPossibleEndObj.equals( "endobj" ) )
                            {
                                throw new IOException("expected='endobj' firstReadAttempt='" + endObjectKey + "' " +
                                    "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
                            }
                        }
                    }
                }
            }
            skipSpaces();
        }
        return object;
    }

    /**
     * Called after a "%%EOF" marker when more data follows. If fewer than
     * {@link #EOF_LOOKAHEAD} bytes are available and they contain no further
     * "EOF" marker, the rest of the stream is consumed and discarded so the
     * caller sees a clean end of input. Otherwise the stream is left
     * untouched so the following section can be parsed.
     *
     * @throws IOException If the stream cannot be read.
     */
    private void discardTrailingDataWithoutEofMarker() throws IOException
    {
        pdfSource.fillBuffer();
        if( pdfSource.available() < EOF_LOOKAHEAD )
        {
            byte[] data = new byte[ EOF_LOOKAHEAD ];

            // Peek at the remaining bytes: read them, then push them back.
            int amountRead = pdfSource.read( data );
            if( amountRead != -1 )
            {
                pdfSource.unread( data, 0, amountRead );
            }
            if( !containsEofMarker( data, amountRead ) )
            {
                // Only unparseable trailing bytes remain: drain them.
                while( pdfSource.read( data, 0, data.length ) != -1 )
                {
                    // nothing to do, the bytes are intentionally dropped
                }
            }
        }
    }

    /**
     * Tells whether the first {@code length} bytes of {@code data} contain
     * the ASCII sequence "EOF". The scan includes a marker that occupies the
     * last three bytes.
     *
     * @param data The bytes to scan.
     * @param length The number of valid bytes; values below 3 yield false.
     *
     * @return true if the sequence was found.
     */
    private static boolean containsEofMarker( byte[] data, int length )
    {
        for( int i = 0; i + 2 < length; i++ )
        {
            if( data[i] == 'E' && data[i+1] == 'O' && data[i+2] == 'F' )
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Reads one xref table followed by its trailer dictionary.
     *
     * @return A PDFXref built from the two integers that follow the "xref"
     *         keyword (both 0 when the section did not start with "xref").
     *
     * @throws IOException If the section cannot be read.
     */
    protected PDFXref parseXrefSection() throws IOException
    {
        int[] params = new int[2];
        parseXrefTable( params );
        parseTrailer();

        return new PDFXref( params[0], params[1] );
    }

    /**
     * Skips over an xref table up to and including the "trailer" keyword.
     * The table entries themselves are read and discarded.
     *
     * @param params Receives, at index 0 and 1, the two integers that follow
     *               the "xref" keyword; left unchanged when the current line
     *               is not "xref".
     *
     * @throws IOException If the stream cannot be read.
     */
    protected void parseXrefTable( int[] params ) throws IOException
    {
        String nextLine = readLine();
        if( nextLine.equals( "xref" ) )
        {
            params[0] = readInt();
            params[1] = readInt();
            nextLine = readString();
        }
        skipSpaces();
        while( !nextLine.equals( "trailer" ) && !pdfSource.isEOF() && !isEndOfName((char)pdfSource.peek()))
        {
            nextLine = readString();
            skipSpaces();
        }
        skipSpaces();
    }

    /**
     * Reads a trailer dictionary. The first trailer read becomes the document
     * trailer; the entries of any later trailer are added to it.
     *
     * @throws IOException If the dictionary cannot be parsed.
     */
    private void parseTrailer() throws IOException
    {
        COSDictionary parsedTrailer = parseCOSDictionary();
        COSDictionary docTrailer = document.getTrailer();
        if( docTrailer == null )
        {
            document.setTrailer( parsedTrailer );
        }
        else
        {
            docTrailer.addAll( parsedTrailer );
        }
    }
}