PDFStreamParser


1   /**
2    * Copyright (c) 2003-2006, www.pdfbox.org
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions are met:
7    *
8    * 1. Redistributions of source code must retain the above copyright notice,
9    *    this list of conditions and the following disclaimer.
10   * 2. Redistributions in binary form must reproduce the above copyright notice,
11   *    this list of conditions and the following disclaimer in the documentation
12   *    and/or other materials provided with the distribution.
13   * 3. Neither the name of pdfbox; nor the names of its
14   *    contributors may be used to endorse or promote products derived from this
15   *    software without specific prior written permission.
16   *
17   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20   * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
21   * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
24   * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27   *
28   * http://www.pdfbox.org
29   *
30   */
31  package org.pdfbox.pdfparser;
32  
33  import java.io.ByteArrayOutputStream  ;
34  import java.io.InputStream  ;
35  import java.io.IOException  ;
36  
37  import java.util.ArrayList  ;
38  import java.util.List  ;
39  
40  import org.pdfbox.cos.COSBase;
41  import org.pdfbox.cos.COSBoolean;
42  import org.pdfbox.cos.COSDictionary;
43  import org.pdfbox.cos.COSName;
44  import org.pdfbox.cos.COSNull;
45  import org.pdfbox.cos.COSNumber;
46  import org.pdfbox.cos.COSObject;
47  import org.pdfbox.cos.COSStream;
48  import org.pdfbox.io.RandomAccess;
49  
50  import org.pdfbox.pdmodel.common.PDStream;
51  import org.pdfbox.util.PDFOperator;
52  import org.pdfbox.util.ImageParameters;
53  
54  /**
55   * This will parse a PDF byte stream and extract operands and such.
56   *
57   * @author <a HREF="mailto:ben@benlitchfield.com">Ben Litchfield</a>
58   * @version $Revision: 1.32 $
59   */
60  public class PDFStreamParser extends BaseParser
61  {
62      private List   streamObjects = new ArrayList  ( 100 );
63      private RandomAccess file;
64      private PDFOperator lastBIToken = null;
65  
66      /**
67       * Constructor that takes a stream to parse.
68       *
69       * @param stream The stream to read data from.
70       * @param raf The random access file.
71       *
72       * @throws IOException If there is an error reading from the stream.
73       */
74      public PDFStreamParser( InputStream   stream, RandomAccess raf ) throws IOException  
75      {
76          super( stream );
77          file = raf;
78      }
79      
80      /**
81       * Constructor.
82       *
83       * @param stream The stream to parse.
84       *
85       * @throws IOException If there is an error initializing the stream.
86       */
87      public PDFStreamParser( PDStream stream ) throws IOException  
88      {
89         this( stream.createInputStream(), stream.getStream().getScratchFile() );
90      }
91  
92      /**
93       * Constructor.
94       *
95       * @param stream The stream to parse.
96       *
97       * @throws IOException If there is an error initializing the stream.
98       */
99      public PDFStreamParser( COSStream stream ) throws IOException  
100     {
101        this( stream.getUnfilteredStream(), stream.getScratchFile() );
102     }
103 
104     /**
105      * This will parse the tokens in the stream.  This will close the
106      * stream when it is finished parsing.
107      *
108      * @throws IOException If there is an error while parsing the stream.
109      */
110     public void parse() throws IOException  
111     {
112         try
113         {
114             Object   token = null;
115             while( (token = parseNextToken()) != null )
116             {
117                 streamObjects.add( token );
118             }
119         }
120         finally
121         {
122             pdfSource.close();
123         }
124     }
125 
126     /**
127      * This will get the tokens that were parsed from the stream.
128      *
129      * @return All of the tokens in the stream.
130      */
131     public List   getTokens()
132     {
133         return streamObjects;
134     }
135 
136     /**
137      * This will parse the next token in the stream.
138      *
139      * @return The next token in the stream or null if there are no more tokens in the stream.
140      *
141      * @throws IOException If an io error occurs while parsing the stream.
142      */
143     private Object   parseNextToken() throws IOException  
144     {
145         Object   retval = null;
146 
147         skipSpaces();
148         int nextByte = pdfSource.peek();
149         if( ((byte)nextByte) == -1 )
150         {
151             return null;
152         }
153         char c = (char)nextByte;
154         switch(c)
155         {
156             case '<':
157             {
158                 int leftBracket = pdfSource.read();//pull off first left bracket
159                 c = (char)pdfSource.peek(); //check for second left bracket
160                 pdfSource.unread( leftBracket ); //put back first bracket
161                 if(c == '<')
162                 {
163 
164                     COSDictionary pod = parseCOSDictionary();
165                     skipSpaces();
166                     if((char)pdfSource.peek() == 's')
167                     {
168                         retval = parseCOSStream( pod, file );
169                     }
170                     else
171                     {
172                         retval = pod;
173                     }
174                 }
175                 else
176                 {
177                     retval = parseCOSString();
178                 }
179                 break;
180             }
181             case '[': // array
182             {
183                 retval = parseCOSArray();
184                 break;
185             }
186             case '(': // string
187                 retval = parseCOSString();
188                 break;
189             case '/':   // name
190                 retval = parseCOSName();
191                 break;
192             case 'n':   // null
193             {
194                 String   nullString = readString();
195                 if( nullString.equals( "null") )
196                 {
197                     retval = COSNull.NULL;
198                 }
199                 else
200                 {
201                     retval = PDFOperator.getOperator( nullString );
202                 }
203                 break;
204             }
205             case 't':
206             case 'f':
207             {
208                 String   next = readString();
209                 if( next.equals( "true" ) )
210                 {
211                     retval = COSBoolean.TRUE;
212                     break;
213                 }
214                 else if( next.equals( "false" ) )
215                 {
216                     retval = COSBoolean.FALSE;
217                 }
218                 else
219                 {
220                     retval = PDFOperator.getOperator( next );
221                 }
222                 break;
223             }
224             case 'R':
225             {
226                 String   line = readString();
227                 if( line.equals( "R" ) )
228                 {
229                     retval = new COSObject( null );
230                 }
231                 else
232                 {
233                     retval = PDFOperator.getOperator( line );
234                 }
235                 break;
236             }
237             case '0':
238             case '1':
239             case '2':
240             case '3':
241             case '4':
242             case '5':
243             case '6':
244             case '7':
245             case '8':
246             case '9':
247             case '-':
248             case '+':
249             case '.':
250             {
251                 if( Character.isDigit(c) || c == '-' || c == '+' || c == '.')
252                 {
253                     StringBuffer   buf = new StringBuffer  ();
254                     while( Character.isDigit(( c = (char)pdfSource.peek()) )|| c== '-' || c== '+' || c =='.' )
255                     {
256                         buf.append( c );
257                         pdfSource.read();
258                     }
259                     retval = COSNumber.get( buf.toString() );
260                 }
261                 else
262                 {
263                     throw new IOException  ( "Unknown dir object c='" + c +
264                         "' peek='" + (char)pdfSource.peek() + "' " + pdfSource );
265                 }
266                 break;
267             }
268             case 'B':
269             {
270                 String   next = readString();
271                 retval = PDFOperator.getOperator( next );
272 
273                 if( next.equals( "BI" ) )
274                 {
275                     lastBIToken = (PDFOperator)retval;
276                     COSDictionary imageParams = new COSDictionary();
277                     lastBIToken.setImageParameters( new ImageParameters( imageParams ) );
278                     Object   nextToken = null;
279                     while( (nextToken = parseNextToken()) instanceof COSName )
280                     {
281                         Object   value = parseNextToken();
282                         imageParams.setItem( (COSName)nextToken, (COSBase)value );
283                     }
284                     //final token will be the image data, maybe??
285                     PDFOperator imageData = (PDFOperator)nextToken;
286                     lastBIToken.setImageData( imageData.getImageData() );
287                 }
288                 break;
289             }
290             case 'I':
291             {
292                 //ImageParameters imageParams = lastBIToken.getImageParameters();
293                 
294                 //int expectedBytes = (int)Math.ceil(imageParams.getHeight() * imageParams.getWidth() *
295                 //                    (imageParams.getBitsPerComponent()/8) );
296                 //Special case for ID operator
297                 String   id = "" + (char)pdfSource.read() + (char)pdfSource.read();
298                 if( !id.equals( "ID" ) )
299                 {
300                     throw new IOException  ( "Error: Expected operator 'ID' actual='" + id + "'" );
301                 }
302                 ByteArrayOutputStream   imageData = new ByteArrayOutputStream  ();
303                 //boolean foundEnd = false;
304                 if( this.isWhitespace() )
305                 {
306                     //pull off the whitespace character
307                     pdfSource.read();
308                 }
309                 int twoBytesAgo = 0;
310                 int lastByte = pdfSource.read();
311                 int currentByte = pdfSource.read();
312                 int count = 0;
313                 //PDF spec is kinda unclear about this.  Should a whitespace
314                 //always appear before EI? Not sure, I found a PDF
315                 //(UnderstandingWebSphereClassLoaders.pdf) which has EI as part
316                 //of the image data and will stop parsing prematurely if there is
317                 //not a check for <whitespace>EI<whitespace>.
318                 while( !(isWhitespace( twoBytesAgo ) &&
319                          lastByte == 'E' &&
320                          currentByte == 'I' &&
321                          isWhitespace() //&&
322                          //amyuni2_05d__pdf1_3_acro4x.pdf has image data that
323                          //is compressed, so expectedBytes is useless here.
324                          //count >= expectedBytes
325                          ) &&
326                        !pdfSource.isEOF() )
327                 {
328                     imageData.write( lastByte );
329                     twoBytesAgo = lastByte;
330                     lastByte = currentByte;
331                     currentByte = pdfSource.read();
332                     count++;
333                 }
334                 pdfSource.unread( 'I' ); //unread the EI operator
335                 pdfSource.unread( 'E' );
336                 retval = PDFOperator.getOperator( "ID" );
337                 ((PDFOperator)retval).setImageData( imageData.toByteArray() );
338                 break;
339             }
340             case ']':
341             {
342                 // some ']' around without its previous '['
343                 // this means a PDF is somewhat corrupt but we will continue to parse.
344                 pdfSource.read();
345                 retval = COSNull.NULL;  // must be a better solution than null...
346                 break;
347             }
348             default:
349             {
350                 //we must be an operator
351                 String   operator = readOperator();
352                 if( operator.trim().length() == 0 )
353                 {
354                     //we have a corrupt stream, stop reading here
355                     retval = null;
356                 }
357                 else
358                 {
359                     retval = PDFOperator.getOperator( operator );
360                 }
361             }
362 
363         }
364 
365         return retval;
366     }
367 
368     /**
369      * This will read an operator from the stream.
370      *
371      * @return The operator that was read from the stream.
372      *
373      * @throws IOException If there is an error reading from the stream.
374      */
375     protected String   readOperator() throws IOException  
376     {
377         skipSpaces();
378 
379         //average string size is around 2 and the normal string buffer size is
380         //about 16 so lets save some space.
381         StringBuffer   buffer = new StringBuffer  (4);
382         while(
383             !isWhitespace() &&
384             !isClosing() &&
385             !pdfSource.isEOF() &&
386             pdfSource.peek() != (int)'[' &&
387             pdfSource.peek() != (int)'<' &&
388             pdfSource.peek() != (int)'(' &&
389             pdfSource.peek() != (int)'/' &&
390             (pdfSource.peek() < (int)'0' ||
391              pdfSource.peek() > (int)'9' ) )
392         {
393             buffer.append( (char)pdfSource.read() );
394         }
395         return buffer.toString();
396     }
397 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags