PDFParser


1   /**
2    * Copyright (c) 2003-2006, www.pdfbox.org
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions are met:
7    *
8    * 1. Redistributions of source code must retain the above copyright notice,
9    *    this list of conditions and the following disclaimer.
10   * 2. Redistributions in binary form must reproduce the above copyright notice,
11   *    this list of conditions and the following disclaimer in the documentation
12   *    and/or other materials provided with the distribution.
13   * 3. Neither the name of pdfbox; nor the names of its
14   *    contributors may be used to endorse or promote products derived from this
15   *    software without specific prior written permission.
16   *
17   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20   * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
21   * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
24   * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27   *
28   * http://www.pdfbox.org
29   *
30   */
31  package org.pdfbox.pdfparser;
32  
33  import java.io.File  ;
34  import java.io.InputStream  ;
35  import java.io.IOException  ;
36  
37  import java.util.Iterator  ;
38  
39  import org.pdfbox.cos.COSBase;
40  import org.pdfbox.cos.COSDictionary;
41  import org.pdfbox.cos.COSDocument;
42  import org.pdfbox.cos.COSObject;
43  import org.pdfbox.cos.COSStream;
44  import org.pdfbox.exceptions.WrappedIOException;
45  import org.pdfbox.io.RandomAccess;
46  
47  import org.pdfbox.pdmodel.PDDocument;
48  
49  import org.pdfbox.pdmodel.fdf.FDFDocument;
50  
51  import org.pdfbox.persistence.util.COSObjectKey;
52  
53  /**
54   * This class will handle the parsing of the PDF document.
55   *
56   * @author <a HREF="mailto:ben@benlitchfield.com">Ben Litchfield</a>
57   * @version $Revision: 1.53 $
58   */
59  public class PDFParser extends BaseParser
60  {
61      private static final int SPACE_BYTE = 32;
62  
63      private static final String   PDF_HEADER = "%PDF-";
64      private COSDocument document;
65  
66      /**
67       * Temp file directory.
68       */
69      private File   tempDirectory = null;
70  
71      private RandomAccess raf = null;
72  
73      /**
74       * Constructor.
75       *
76       * @param input The input stream that contains the PDF document.
77       *
78       * @throws IOException If there is an error initializing the stream.
79       */
80      public PDFParser( InputStream   input ) throws IOException  
81      {
82          this(input, null);
83      }
84  
85      /**
86       * Constructor to allow control over RandomAccessFile.
87       * @param input The input stream that contains the PDF document.
88       * @param rafi The RandomAccessFile to be used in internal COSDocument
89       *
90       * @throws IOException If there is an error initializing the stream.
91       */
92      public PDFParser(InputStream   input, RandomAccess rafi)
93          throws IOException  
94      {
95          super(input);
96          this.raf = rafi;
97      }
98  
99      /**
100      * This is the directory where pdfbox will create a temporary file
101      * for storing pdf document stream in.  By default this directory will
102      * be the value of the system property java.io.tmpdir.
103      *
104      * @param tmpDir The directory to create scratch files needed to store
105      *        pdf document streams.
106      */
107     public void setTempDirectory( File   tmpDir )
108     {
109         tempDirectory = tmpDir;
110     }
111 
112     /**
113      * This will prase the stream and create the PDF document.  This will close
114      * the stream when it is done parsing.
115      *
116      * @throws IOException If there is an error reading from the stream.
117      */
118     public void parse() throws IOException  
119     {
120         try
121         {
122             if ( raf == null )
123             {
124                 if( tempDirectory != null )
125                 {
126                     document = new COSDocument( tempDirectory );
127                 }
128                 else
129                 {
130                     document = new COSDocument();
131                 }
132             }
133             else
134             {
135                 document = new COSDocument( raf );
136             }
137             setDocument( document );
138             String   header = readLine();
139             document.setHeaderString( header );
140 
141             if( header.length() < PDF_HEADER.length()+1 )
142             {
143                 throw new IOException  ( "Error: Header is corrupt '" + header + "'" );
144             }
145 
146             //sometimes there are some garbage bytes in the header before the header
147             //actually starts, so lets try to find the header first.
148             int headerStart = header.indexOf( PDF_HEADER );
149 
150             //greater than zero because if it is zero then
151             //there is no point of trimming
152             if( headerStart > 0 )
153             {
154                 //trim off any leading characters
155                 header = header.substring( headerStart, header.length() );
156             }
157 
158             try
159             {
160                 float pdfVersion = Float.parseFloat( 
161                     header.substring( PDF_HEADER.length(), Math.min( header.length(), PDF_HEADER.length()+3) ) );
162                 document.setVersion( pdfVersion );
163             }
164             catch( NumberFormatException   e )
165             {
166                 throw new IOException  ( "Error getting pdf version:" + e );
167             }
168 
169             skipHeaderFillBytes();
170 
171 
172             Object   nextObject;
173             boolean wasLastParsedObjectAnXref = false;
174             try
175             {
176                 while( (nextObject = parseObject()) != null )
177                 {
178                     if( nextObject instanceof PDFXref )
179                     {
180                         PDFXref xref = (PDFXref)nextObject;
181                         addXref(xref);
182                         wasLastParsedObjectAnXref = true;
183                     }
184                     else
185                     {
186                         wasLastParsedObjectAnXref = false;
187                     }
188                     skipSpaces();
189                 }
190                 if( document.getTrailer() == null )
191                 {
192                     COSDictionary trailer = new COSDictionary();
193                     Iterator   xrefIter = document.getObjectsByType( "XRef" ).iterator();
194                     while( xrefIter.hasNext() )
195                     {
196                         COSStream next = (COSStream)((COSObject)xrefIter.next()).getObject();
197                         trailer.addAll( next );
198                     }
199                     document.setTrailer( trailer );
200                 }
201                 if( !document.isEncrypted() )
202                 {
203                     document.dereferenceObjectStreams();
204                 }
205             }
206             catch( IOException   e )
207             {
208                 if( wasLastParsedObjectAnXref )
209                 {
210                     //Then we assume that there is just random garbage after
211                     //the xref, not sure why the PDF spec allows this but it does.
212                 }
213                 else
214                 {
215                     //some other error so just pass it along
216                     throw e;
217                 }
218             }
219         }
220         catch( Throwable   t )
221         {
222             //so if the PDF is corrupt then close the document and clear
223             //all resources to it
224             if( document != null )
225             {
226                 document.close();
227             }
228             if( t instanceof IOException   )
229             {
230                 throw (IOException  )t;
231             }
232             else
233             {
234                 throw new WrappedIOException( t );
235             }
236         }
237         finally
238         {
239             pdfSource.close();
240         }
241     }
242 
243     /**
244      * This will skip a header's binary fill bytes.  This is in accordance to
245      * PDF Specification 1.5 pg 68 section 3.4.1 "Syntax.File Structure.File Header"
246      *
247      * @throws IOException If there is an error reading from the stream.
248     */
249     protected void skipHeaderFillBytes() throws IOException  
250     {
251         skipSpaces();
252         int c = pdfSource.peek();
253         
254         if( !Character.isDigit( (char)c ) )
255         {
256             // Fill bytes conform with PDF reference (but without comment sign)
257             // => skip until EOL
258             readLine();
259         }
260         // else: no fill bytes
261     }
262 
263     /**
264      * This will get the document that was parsed.  parse() must be called before this is called.
265      * When you are done with this document you must call close() on it to release
266      * resources.
267      *
268      * @return The document that was parsed.
269      *
270      * @throws IOException If there is an error getting the document.
271      */
272     public COSDocument getDocument() throws IOException  
273     {
274         if( document == null )
275         {
276             throw new IOException  ( "You must call parse() before calling getDocument()" );
277         }
278         return document;
279     }
280 
281     /**
282      * This will get the PD document that was parsed.  When you are done with
283      * this document you must call close() on it to release resources.
284      *
285      * @return The document at the PD layer.
286      *
287      * @throws IOException If there is an error getting the document.
288      */
289     public PDDocument getPDDocument() throws IOException  
290     {
291         return new PDDocument( getDocument() );
292     }
293 
294     /**
295      * This will get the FDF document that was parsed.  When you are done with
296      * this document you must call close() on it to release resources.
297      *
298      * @return The document at the PD layer.
299      *
300      * @throws IOException If there is an error getting the document.
301      */
302     public FDFDocument getFDFDocument() throws IOException  
303     {
304         return new FDFDocument( getDocument() );
305     }
306 
307     /**
308      * This will parse a document object from the stream.
309      *
310      * @return The parsed object.
311      *
312      * @throws IOException If an IO error occurs.
313      */
314     private Object   parseObject() throws IOException  
315     {
316         Object   object = null;
317         skipSpaces();
318         char peekedChar = (char)pdfSource.peek();
319         while( peekedChar == 'e' )
320         {
321             //there are times when there are multiple endobj, so lets
322             //just read them and move on.
323             readString();
324             skipSpaces();
325             peekedChar = (char)pdfSource.peek();
326         }
327         if( pdfSource.isEOF() )
328         {
329             //"Skipping because of EOF" );
330             //end of file we will return a null object and call it a day.
331         }
332         else if( peekedChar == 'x' ||
333                  peekedChar == 't' ||
334                  peekedChar == 's')
335         {
336             //System.out.println( "parseObject() parsing xref" );
337 
338             //FDF documents do not always have the xref
339             if( peekedChar == 'x' || peekedChar == 't' )
340             {
341                 object = parseXrefSection();
342             }
343             
344             //if peeked char is xref or startxref
345             if( peekedChar == 'x' || peekedChar == 's')
346             {
347                 skipSpaces();
348                 while( pdfSource.peek() == 'x' )
349                 {
350                     parseXrefSection();
351                 }
352                 String   startxref = readString();
353                 if( !startxref.equals( "startxref" ) )
354                 {
355                     throw new IOException  ( "expected='startxref' actual='" + startxref + "' " + pdfSource );
356                 }
357                 skipSpaces();
358                 //read some integer that is in the stream but PDFBox doesn't use
359                 readInt();
360             }
361 
362             //This MUST be readLine because readString strips out comments
363             //and it will think that %% is a comment in from of the EOF
364             String   eof = readExpectedString( "%%EOF" );
365             if( eof.indexOf( "%%EOF" )== -1 && !pdfSource.isEOF() )
366             {
367                 throw new IOException  ( "expected='%%EOF' actual='" + eof + "' next=" + readString() +
368                                        " next=" +readString() );
369             }
370             else if( !pdfSource.isEOF() )
371             {
372                 //we might really be at the end of the file, there might just be some crap at the
373                 //end of the file.
374                 pdfSource.fillBuffer();
375                 if( pdfSource.available() < 1000 )
376                 {
377                     //We need to determine if we are at the end of the file.
378                     byte[] data = new byte[ 1000 ];
379 
380                     int amountRead = pdfSource.read( data );
381                     if( amountRead != -1 )
382                     {
383                         pdfSource.unread( data, 0, amountRead );
384                     }
385                     boolean atEndOfFile = true;//we assume yes unless we find another.
386                     for( int i=0; i<amountRead-3 && atEndOfFile; i++ )
387                     {
388                         atEndOfFile = !(data[i] == 'E' &&
389                                         data[i+1] == 'O' &&
390                                         data[i+2] == 'F' );
391                     }
392                     if( atEndOfFile )
393                     {
394                         while( pdfSource.read( data, 0, data.length ) != -1 )
395                         {
396                             //read until done.
397                         }
398                     }
399                 }
400             }
401         }
402         else
403         {
404             int number = -1;
405             int genNum = -1;
406             String   objectKey = null;
407             boolean missingObjectNumber = false;
408             try
409             {
410                 char peeked = (char)pdfSource.peek();
411                 if( peeked == '<' )
412                 {
413                     missingObjectNumber = true;
414                 }
415                 else
416                 {
417                     number = readInt();
418                 }
419             }
420             catch( IOException   e )
421             {
422                 //ok for some reason "GNU Ghostscript 5.10" puts two endobj
423                 //statements after an object, of course this is nonsense
424                 //but because we want to support as many PDFs as possible
425                 //we will simply try again
426                 number = readInt();
427             }
428             if( !missingObjectNumber )
429             {
430                 skipSpaces();
431                 genNum = readInt();
432 
433                 objectKey = readString( 3 );
434                 //System.out.println( "parseObject() num=" + number + 
435                 //" genNumber=" + genNum + " key='" + objectKey + "'" );
436                 if( !objectKey.equals( "obj" ) )
437                 {
438                     throw new IOException  ("expected='obj' actual='" + objectKey + "' " + pdfSource );
439                 }
440             }
441             else
442             {
443                 number = -1;
444                 genNum = -1;
445             }
446 
447             skipSpaces();
448             COSBase pb = parseDirObject();
449             String   endObjectKey = readString();
450             if( endObjectKey.equals( "stream" ) )
451             {
452                 pdfSource.unread( endObjectKey.getBytes() );
453                 pdfSource.unread( ' ' );
454                 if( pb instanceof COSDictionary )
455                 {
456                     pb = parseCOSStream( (COSDictionary)pb, getDocument().getScratchFile() );
457                 }
458                 else
459                 {
460                     // this is not legal
461                     // the combination of a dict and the stream/endstream forms a complete stream object
462                     throw new IOException  ("stream not preceded by dictionary");
463                 }
464                 endObjectKey = readString();
465             }
466             COSObjectKey key = new COSObjectKey( number, genNum );
467             COSObject pdfObject = document.getObjectFromPool( key );
468             object = pdfObject;
469             pdfObject.setObject(pb);
470 
471             if( !endObjectKey.equals( "endobj" ) )
472             {
473                 if( !pdfSource.isEOF() )
474                 {
475                     try
476                     {
477                         //It is possible that the endobj  is missing, there
478                         //are several PDFs out there that do that so skip it and move on.
479                         Float.parseFloat( endObjectKey );
480                         pdfSource.unread( SPACE_BYTE );
481                         pdfSource.unread( endObjectKey.getBytes() );
482                     }
483                     catch( NumberFormatException   e )
484                     {
485                         //we will try again incase there was some garbage which
486                         //some writers will leave behind.
487                         String   secondEndObjectKey = readString();
488                         if( !secondEndObjectKey.equals( "endobj" ) )
489                         {
490                             if( isClosing() )
491                             {
492                                 //found a case with 17506.pdf object 41 that was like this
493                                 //41 0 obj [/Pattern /DeviceGray] ] endobj
494                                 //notice the second array close, here we are reading it 
495                                 //and ignoring and attempting to continue
496                                 pdfSource.read();
497                             }
498                             skipSpaces();
499                             String   thirdPossibleEndObj = readString();
500                             if( !thirdPossibleEndObj.equals( "endobj" ) )
501                             {
502                                 throw new IOException  ("expected='endobj' firstReadAttempt='" + endObjectKey + "' " +
503                                     "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
504                             }
505                         }
506                     }
507                 }
508             }
509             skipSpaces();
510 
511         }
512         //System.out.println( "parsed=" + object );
513         return object;
514     }
515 
516 
517     /**
518      * This will parse the xref table and trailers from the stream.
519      *
520      * @return a new PDFXref
521      *
522      * @throws IOException If an IO error occurs.
523      */
524     protected PDFXref parseXrefSection() throws IOException  
525     {
526         int[] params = new int[2];
527         parseXrefTable(params);
528         parseTrailer();
529 
530         return new PDFXref(params[0], params[1]);
531     }
532 
533     /**
534      * This will parse the xref table from the stream.
535      *
536      * It stores the starting object number and the count
537      * 
538      * @param params The start and count parameters
539      *
540      * @throws IOException If an IO error occurs.
541      */
542     protected void parseXrefTable(int[] params) throws IOException  
543     {
544         String   nextLine = null;
545 
546         nextLine = readLine();
547         if( nextLine.equals( "xref" ) )
548         {
549             params[0] = readInt();
550             params[1] = readInt();
551             nextLine = readString();
552         }
553         skipSpaces();
554         while( !nextLine.equals( "trailer" ) && !pdfSource.isEOF() && !isEndOfName((char)pdfSource.peek()))
555         {
556             //skip past all the xref entries.
557             nextLine = readString();
558             skipSpaces();
559         }
560         skipSpaces();
561     }
562 
563     private void parseTrailer() throws IOException  
564     {
565         COSDictionary parsedTrailer = parseCOSDictionary();
566         COSDictionary docTrailer = document.getTrailer();
567         if( docTrailer == null )
568         {
569             document.setTrailer( parsedTrailer );
570         }
571         else
572         {
573             docTrailer.addAll( parsedTrailer );
574         }
575     }
576 }
577
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags