// KickJava   Java API By Example, From Geeks To Geeks.
//
// Java > Open Source Codes > org > pdfbox > pdfparser > PDFParser


/**
 * Copyright (c) 2003-2006, www.pdfbox.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of pdfbox; nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * http://www.pdfbox.org
 *
 */

package org.pdfbox.pdfparser;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;

import java.util.Iterator;

import org.pdfbox.cos.COSBase;
import org.pdfbox.cos.COSDictionary;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.cos.COSObject;
import org.pdfbox.cos.COSStream;
import org.pdfbox.exceptions.WrappedIOException;
import org.pdfbox.io.RandomAccess;

import org.pdfbox.pdmodel.PDDocument;

import org.pdfbox.pdmodel.fdf.FDFDocument;

import org.pdfbox.persistence.util.COSObjectKey;

53 /**
54  * This class will handle the parsing of the PDF document.
55  *
56  * @author <a HREF="mailto:ben@benlitchfield.com">Ben Litchfield</a>
57  * @version $Revision: 1.53 $
58  */

59 public class PDFParser extends BaseParser
60 {
61     private static final int SPACE_BYTE = 32;
62
63     private static final String JavaDoc PDF_HEADER = "%PDF-";
64     private COSDocument document;
65
66     /**
67      * Temp file directory.
68      */

69     private File JavaDoc tempDirectory = null;
70
71     private RandomAccess raf = null;
72
73     /**
74      * Constructor.
75      *
76      * @param input The input stream that contains the PDF document.
77      *
78      * @throws IOException If there is an error initializing the stream.
79      */

80     public PDFParser( InputStream JavaDoc input ) throws IOException JavaDoc
81     {
82         this(input, null);
83     }
84
85     /**
86      * Constructor to allow control over RandomAccessFile.
87      * @param input The input stream that contains the PDF document.
88      * @param rafi The RandomAccessFile to be used in internal COSDocument
89      *
90      * @throws IOException If there is an error initializing the stream.
91      */

92     public PDFParser(InputStream JavaDoc input, RandomAccess rafi)
93         throws IOException JavaDoc
94     {
95         super(input);
96         this.raf = rafi;
97     }
98
99     /**
100      * This is the directory where pdfbox will create a temporary file
101      * for storing pdf document stream in. By default this directory will
102      * be the value of the system property java.io.tmpdir.
103      *
104      * @param tmpDir The directory to create scratch files needed to store
105      * pdf document streams.
106      */

107     public void setTempDirectory( File JavaDoc tmpDir )
108     {
109         tempDirectory = tmpDir;
110     }
111
112     /**
113      * This will prase the stream and create the PDF document. This will close
114      * the stream when it is done parsing.
115      *
116      * @throws IOException If there is an error reading from the stream.
117      */

118     public void parse() throws IOException JavaDoc
119     {
120         try
121         {
122             if ( raf == null )
123             {
124                 if( tempDirectory != null )
125                 {
126                     document = new COSDocument( tempDirectory );
127                 }
128                 else
129                 {
130                     document = new COSDocument();
131                 }
132             }
133             else
134             {
135                 document = new COSDocument( raf );
136             }
137             setDocument( document );
138             String JavaDoc header = readLine();
139             document.setHeaderString( header );
140
141             if( header.length() < PDF_HEADER.length()+1 )
142             {
143                 throw new IOException JavaDoc( "Error: Header is corrupt '" + header + "'" );
144             }
145
146             //sometimes there are some garbage bytes in the header before the header
147
//actually starts, so lets try to find the header first.
148
int headerStart = header.indexOf( PDF_HEADER );
149
150             //greater than zero because if it is zero then
151
//there is no point of trimming
152
if( headerStart > 0 )
153             {
154                 //trim off any leading characters
155
header = header.substring( headerStart, header.length() );
156             }
157
158             try
159             {
160                 float pdfVersion = Float.parseFloat(
161                     header.substring( PDF_HEADER.length(), Math.min( header.length(), PDF_HEADER.length()+3) ) );
162                 document.setVersion( pdfVersion );
163             }
164             catch( NumberFormatException JavaDoc e )
165             {
166                 throw new IOException JavaDoc( "Error getting pdf version:" + e );
167             }
168
169             skipHeaderFillBytes();
170
171
172             Object JavaDoc nextObject;
173             boolean wasLastParsedObjectAnXref = false;
174             try
175             {
176                 while( (nextObject = parseObject()) != null )
177                 {
178                     if( nextObject instanceof PDFXref )
179                     {
180                         PDFXref xref = (PDFXref)nextObject;
181                         addXref(xref);
182                         wasLastParsedObjectAnXref = true;
183                     }
184                     else
185                     {
186                         wasLastParsedObjectAnXref = false;
187                     }
188                     skipSpaces();
189                 }
190                 if( document.getTrailer() == null )
191                 {
192                     COSDictionary trailer = new COSDictionary();
193                     Iterator JavaDoc xrefIter = document.getObjectsByType( "XRef" ).iterator();
194                     while( xrefIter.hasNext() )
195                     {
196                         COSStream next = (COSStream)((COSObject)xrefIter.next()).getObject();
197                         trailer.addAll( next );
198                     }
199                     document.setTrailer( trailer );
200                 }
201                 if( !document.isEncrypted() )
202                 {
203                     document.dereferenceObjectStreams();
204                 }
205             }
206             catch( IOException JavaDoc e )
207             {
208                 if( wasLastParsedObjectAnXref )
209                 {
210                     //Then we assume that there is just random garbage after
211
//the xref, not sure why the PDF spec allows this but it does.
212
}
213                 else
214                 {
215                     //some other error so just pass it along
216
throw e;
217                 }
218             }
219         }
220         catch( Throwable JavaDoc t )
221         {
222             //so if the PDF is corrupt then close the document and clear
223
//all resources to it
224
if( document != null )
225             {
226                 document.close();
227             }
228             if( t instanceof IOException JavaDoc )
229             {
230                 throw (IOException JavaDoc)t;
231             }
232             else
233             {
234                 throw new WrappedIOException( t );
235             }
236         }
237         finally
238         {
239             pdfSource.close();
240         }
241     }
242
243     /**
244      * This will skip a header's binary fill bytes. This is in accordance to
245      * PDF Specification 1.5 pg 68 section 3.4.1 "Syntax.File Structure.File Header"
246      *
247      * @throws IOException If there is an error reading from the stream.
248     */

249     protected void skipHeaderFillBytes() throws IOException JavaDoc
250     {
251         skipSpaces();
252         int c = pdfSource.peek();
253         
254         if( !Character.isDigit( (char)c ) )
255         {
256             // Fill bytes conform with PDF reference (but without comment sign)
257
// => skip until EOL
258
readLine();
259         }
260         // else: no fill bytes
261
}
262
263     /**
264      * This will get the document that was parsed. parse() must be called before this is called.
265      * When you are done with this document you must call close() on it to release
266      * resources.
267      *
268      * @return The document that was parsed.
269      *
270      * @throws IOException If there is an error getting the document.
271      */

272     public COSDocument getDocument() throws IOException JavaDoc
273     {
274         if( document == null )
275         {
276             throw new IOException JavaDoc( "You must call parse() before calling getDocument()" );
277         }
278         return document;
279     }
280
281     /**
282      * This will get the PD document that was parsed. When you are done with
283      * this document you must call close() on it to release resources.
284      *
285      * @return The document at the PD layer.
286      *
287      * @throws IOException If there is an error getting the document.
288      */

289     public PDDocument getPDDocument() throws IOException JavaDoc
290     {
291         return new PDDocument( getDocument() );
292     }
293
294     /**
295      * This will get the FDF document that was parsed. When you are done with
296      * this document you must call close() on it to release resources.
297      *
298      * @return The document at the PD layer.
299      *
300      * @throws IOException If there is an error getting the document.
301      */

302     public FDFDocument getFDFDocument() throws IOException JavaDoc
303     {
304         return new FDFDocument( getDocument() );
305     }
306
307     /**
308      * This will parse a document object from the stream.
309      *
310      * @return The parsed object.
311      *
312      * @throws IOException If an IO error occurs.
313      */

314     private Object JavaDoc parseObject() throws IOException JavaDoc
315     {
316         Object JavaDoc object = null;
317         skipSpaces();
318         char peekedChar = (char)pdfSource.peek();
319         while( peekedChar == 'e' )
320         {
321             //there are times when there are multiple endobj, so lets
322
//just read them and move on.
323
readString();
324             skipSpaces();
325             peekedChar = (char)pdfSource.peek();
326         }
327         if( pdfSource.isEOF() )
328         {
329             //"Skipping because of EOF" );
330
//end of file we will return a null object and call it a day.
331
}
332         else if( peekedChar == 'x' ||
333                  peekedChar == 't' ||
334                  peekedChar == 's')
335         {
336             //System.out.println( "parseObject() parsing xref" );
337

338             //FDF documents do not always have the xref
339
if( peekedChar == 'x' || peekedChar == 't' )
340             {
341                 object = parseXrefSection();
342             }
343             
344             //if peeked char is xref or startxref
345
if( peekedChar == 'x' || peekedChar == 's')
346             {
347                 skipSpaces();
348                 while( pdfSource.peek() == 'x' )
349                 {
350                     parseXrefSection();
351                 }
352                 String JavaDoc startxref = readString();
353                 if( !startxref.equals( "startxref" ) )
354                 {
355                     throw new IOException JavaDoc( "expected='startxref' actual='" + startxref + "' " + pdfSource );
356                 }
357                 skipSpaces();
358                 //read some integer that is in the stream but PDFBox doesn't use
359
readInt();
360             }
361
362             //This MUST be readLine because readString strips out comments
363
//and it will think that %% is a comment in from of the EOF
364
String JavaDoc eof = readExpectedString( "%%EOF" );
365             if( eof.indexOf( "%%EOF" )== -1 && !pdfSource.isEOF() )
366             {
367                 throw new IOException JavaDoc( "expected='%%EOF' actual='" + eof + "' next=" + readString() +
368                                        " next=" +readString() );
369             }
370             else if( !pdfSource.isEOF() )
371             {
372                 //we might really be at the end of the file, there might just be some crap at the
373
//end of the file.
374
pdfSource.fillBuffer();
375                 if( pdfSource.available() < 1000 )
376                 {
377                     //We need to determine if we are at the end of the file.
378
byte[] data = new byte[ 1000 ];
379
380                     int amountRead = pdfSource.read( data );
381                     if( amountRead != -1 )
382                     {
383                         pdfSource.unread( data, 0, amountRead );
384                     }
385                     boolean atEndOfFile = true;//we assume yes unless we find another.
386
for( int i=0; i<amountRead-3 && atEndOfFile; i++ )
387                     {
388                         atEndOfFile = !(data[i] == 'E' &&
389                                         data[i+1] == 'O' &&
390                                         data[i+2] == 'F' );
391                     }
392                     if( atEndOfFile )
393                     {
394                         while( pdfSource.read( data, 0, data.length ) != -1 )
395                         {
396                             //read until done.
397
}
398                     }
399                 }
400             }
401         }
402         else
403         {
404             int number = -1;
405             int genNum = -1;
406             String JavaDoc objectKey = null;
407             boolean missingObjectNumber = false;
408             try
409             {
410                 char peeked = (char)pdfSource.peek();
411                 if( peeked == '<' )
412                 {
413                     missingObjectNumber = true;
414                 }
415                 else
416                 {
417                     number = readInt();
418                 }
419             }
420             catch( IOException JavaDoc e )
421             {
422                 //ok for some reason "GNU Ghostscript 5.10" puts two endobj
423
//statements after an object, of course this is nonsense
424
//but because we want to support as many PDFs as possible
425
//we will simply try again
426
number = readInt();
427             }
428             if( !missingObjectNumber )
429             {
430                 skipSpaces();
431                 genNum = readInt();
432
433                 objectKey = readString( 3 );
434                 //System.out.println( "parseObject() num=" + number +
435
//" genNumber=" + genNum + " key='" + objectKey + "'" );
436
if( !objectKey.equals( "obj" ) )
437                 {
438                     throw new IOException JavaDoc("expected='obj' actual='" + objectKey + "' " + pdfSource );
439                 }
440             }
441             else
442             {
443                 number = -1;
444                 genNum = -1;
445             }
446
447             skipSpaces();
448             COSBase pb = parseDirObject();
449             String JavaDoc endObjectKey = readString();
450             if( endObjectKey.equals( "stream" ) )
451             {
452                 pdfSource.unread( endObjectKey.getBytes() );
453                 pdfSource.unread( ' ' );
454                 if( pb instanceof COSDictionary )
455                 {
456                     pb = parseCOSStream( (COSDictionary)pb, getDocument().getScratchFile() );
457                 }
458                 else
459                 {
460                     // this is not legal
461
// the combination of a dict and the stream/endstream forms a complete stream object
462
throw new IOException JavaDoc("stream not preceded by dictionary");
463                 }
464                 endObjectKey = readString();
465             }
466             COSObjectKey key = new COSObjectKey( number, genNum );
467             COSObject pdfObject = document.getObjectFromPool( key );
468             object = pdfObject;
469             pdfObject.setObject(pb);
470
471             if( !endObjectKey.equals( "endobj" ) )
472             {
473                 if( !pdfSource.isEOF() )
474                 {
475                     try
476                     {
477                         //It is possible that the endobj is missing, there
478
//are several PDFs out there that do that so skip it and move on.
479
Float.parseFloat( endObjectKey );
480                         pdfSource.unread( SPACE_BYTE );
481                         pdfSource.unread( endObjectKey.getBytes() );
482                     }
483                     catch( NumberFormatException JavaDoc e )
484                     {
485                         //we will try again incase there was some garbage which
486
//some writers will leave behind.
487
String JavaDoc secondEndObjectKey = readString();
488                         if( !secondEndObjectKey.equals( "endobj" ) )
489                         {
490                             if( isClosing() )
491                             {
492                                 //found a case with 17506.pdf object 41 that was like this
493
//41 0 obj [/Pattern /DeviceGray] ] endobj
494
//notice the second array close, here we are reading it
495
//and ignoring and attempting to continue
496
pdfSource.read();
497                             }
498                             skipSpaces();
499                             String JavaDoc thirdPossibleEndObj = readString();
500                             if( !thirdPossibleEndObj.equals( "endobj" ) )
501                             {
502                                 throw new IOException JavaDoc("expected='endobj' firstReadAttempt='" + endObjectKey + "' " +
503                                     "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
504                             }
505                         }
506                     }
507                 }
508             }
509             skipSpaces();
510
511         }
512         //System.out.println( "parsed=" + object );
513
return object;
514     }
515
516
517     /**
518      * This will parse the xref table and trailers from the stream.
519      *
520      * @return a new PDFXref
521      *
522      * @throws IOException If an IO error occurs.
523      */

524     protected PDFXref parseXrefSection() throws IOException JavaDoc
525     {
526         int[] params = new int[2];
527         parseXrefTable(params);
528         parseTrailer();
529
530         return new PDFXref(params[0], params[1]);
531     }
532
533     /**
534      * This will parse the xref table from the stream.
535      *
536      * It stores the starting object number and the count
537      *
538      * @param params The start and count parameters
539      *
540      * @throws IOException If an IO error occurs.
541      */

542     protected void parseXrefTable(int[] params) throws IOException JavaDoc
543     {
544         String JavaDoc nextLine = null;
545
546         nextLine = readLine();
547         if( nextLine.equals( "xref" ) )
548         {
549             params[0] = readInt();
550             params[1] = readInt();
551             nextLine = readString();
552         }
553         skipSpaces();
554         while( !nextLine.equals( "trailer" ) && !pdfSource.isEOF() && !isEndOfName((char)pdfSource.peek()))
555         {
556             //skip past all the xref entries.
557
nextLine = readString();
558             skipSpaces();
559         }
560         skipSpaces();
561     }
562
563     private void parseTrailer() throws IOException JavaDoc
564     {
565         COSDictionary parsedTrailer = parseCOSDictionary();
566         COSDictionary docTrailer = document.getTrailer();
567         if( docTrailer == null )
568         {
569             document.setTrailer( parsedTrailer );
570         }
571         else
572         {
573             docTrailer.addAll( parsedTrailer );
574         }
575     }
576 }
// Popular Tags