KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > pdfbox > pdfparser > PDFStreamParser


/**
 * Copyright (c) 2003-2006, www.pdfbox.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 * 3. Neither the name of pdfbox; nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * http://www.pdfbox.org
 *
 */

31 package org.pdfbox.pdfparser;
32
33 import java.io.ByteArrayOutputStream JavaDoc;
34 import java.io.InputStream JavaDoc;
35 import java.io.IOException JavaDoc;
36
37 import java.util.ArrayList JavaDoc;
38 import java.util.List JavaDoc;
39
40 import org.pdfbox.cos.COSBase;
41 import org.pdfbox.cos.COSBoolean;
42 import org.pdfbox.cos.COSDictionary;
43 import org.pdfbox.cos.COSName;
44 import org.pdfbox.cos.COSNull;
45 import org.pdfbox.cos.COSNumber;
46 import org.pdfbox.cos.COSObject;
47 import org.pdfbox.cos.COSStream;
48 import org.pdfbox.io.RandomAccess;
49
50 import org.pdfbox.pdmodel.common.PDStream;
51 import org.pdfbox.util.PDFOperator;
52 import org.pdfbox.util.ImageParameters;
53
54 /**
55  * This will parse a PDF byte stream and extract operands and such.
56  *
57  * @author <a HREF="mailto:ben@benlitchfield.com">Ben Litchfield</a>
58  * @version $Revision: 1.32 $
59  */

60 public class PDFStreamParser extends BaseParser
61 {
62     private List JavaDoc streamObjects = new ArrayList JavaDoc( 100 );
63     private RandomAccess file;
64     private PDFOperator lastBIToken = null;
65
66     /**
67      * Constructor that takes a stream to parse.
68      *
69      * @param stream The stream to read data from.
70      * @param raf The random access file.
71      *
72      * @throws IOException If there is an error reading from the stream.
73      */

74     public PDFStreamParser( InputStream JavaDoc stream, RandomAccess raf ) throws IOException JavaDoc
75     {
76         super( stream );
77         file = raf;
78     }
79     
80     /**
81      * Constructor.
82      *
83      * @param stream The stream to parse.
84      *
85      * @throws IOException If there is an error initializing the stream.
86      */

87     public PDFStreamParser( PDStream stream ) throws IOException JavaDoc
88     {
89        this( stream.createInputStream(), stream.getStream().getScratchFile() );
90     }
91
92     /**
93      * Constructor.
94      *
95      * @param stream The stream to parse.
96      *
97      * @throws IOException If there is an error initializing the stream.
98      */

99     public PDFStreamParser( COSStream stream ) throws IOException JavaDoc
100     {
101        this( stream.getUnfilteredStream(), stream.getScratchFile() );
102     }
103
104     /**
105      * This will parse the tokens in the stream. This will close the
106      * stream when it is finished parsing.
107      *
108      * @throws IOException If there is an error while parsing the stream.
109      */

110     public void parse() throws IOException JavaDoc
111     {
112         try
113         {
114             Object JavaDoc token = null;
115             while( (token = parseNextToken()) != null )
116             {
117                 streamObjects.add( token );
118             }
119         }
120         finally
121         {
122             pdfSource.close();
123         }
124     }
125
126     /**
127      * This will get the tokens that were parsed from the stream.
128      *
129      * @return All of the tokens in the stream.
130      */

131     public List JavaDoc getTokens()
132     {
133         return streamObjects;
134     }
135
136     /**
137      * This will parse the next token in the stream.
138      *
139      * @return The next token in the stream or null if there are no more tokens in the stream.
140      *
141      * @throws IOException If an io error occurs while parsing the stream.
142      */

143     private Object JavaDoc parseNextToken() throws IOException JavaDoc
144     {
145         Object JavaDoc retval = null;
146
147         skipSpaces();
148         int nextByte = pdfSource.peek();
149         if( ((byte)nextByte) == -1 )
150         {
151             return null;
152         }
153         char c = (char)nextByte;
154         switch(c)
155         {
156             case '<':
157             {
158                 int leftBracket = pdfSource.read();//pull off first left bracket
159
c = (char)pdfSource.peek(); //check for second left bracket
160
pdfSource.unread( leftBracket ); //put back first bracket
161
if(c == '<')
162                 {
163
164                     COSDictionary pod = parseCOSDictionary();
165                     skipSpaces();
166                     if((char)pdfSource.peek() == 's')
167                     {
168                         retval = parseCOSStream( pod, file );
169                     }
170                     else
171                     {
172                         retval = pod;
173                     }
174                 }
175                 else
176                 {
177                     retval = parseCOSString();
178                 }
179                 break;
180             }
181             case '[': // array
182
{
183                 retval = parseCOSArray();
184                 break;
185             }
186             case '(': // string
187
retval = parseCOSString();
188                 break;
189             case '/': // name
190
retval = parseCOSName();
191                 break;
192             case 'n': // null
193
{
194                 String JavaDoc nullString = readString();
195                 if( nullString.equals( "null") )
196                 {
197                     retval = COSNull.NULL;
198                 }
199                 else
200                 {
201                     retval = PDFOperator.getOperator( nullString );
202                 }
203                 break;
204             }
205             case 't':
206             case 'f':
207             {
208                 String JavaDoc next = readString();
209                 if( next.equals( "true" ) )
210                 {
211                     retval = COSBoolean.TRUE;
212                     break;
213                 }
214                 else if( next.equals( "false" ) )
215                 {
216                     retval = COSBoolean.FALSE;
217                 }
218                 else
219                 {
220                     retval = PDFOperator.getOperator( next );
221                 }
222                 break;
223             }
224             case 'R':
225             {
226                 String JavaDoc line = readString();
227                 if( line.equals( "R" ) )
228                 {
229                     retval = new COSObject( null );
230                 }
231                 else
232                 {
233                     retval = PDFOperator.getOperator( line );
234                 }
235                 break;
236             }
237             case '0':
238             case '1':
239             case '2':
240             case '3':
241             case '4':
242             case '5':
243             case '6':
244             case '7':
245             case '8':
246             case '9':
247             case '-':
248             case '+':
249             case '.':
250             {
251                 if( Character.isDigit(c) || c == '-' || c == '+' || c == '.')
252                 {
253                     StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
254                     while( Character.isDigit(( c = (char)pdfSource.peek()) )|| c== '-' || c== '+' || c =='.' )
255                     {
256                         buf.append( c );
257                         pdfSource.read();
258                     }
259                     retval = COSNumber.get( buf.toString() );
260                 }
261                 else
262                 {
263                     throw new IOException JavaDoc( "Unknown dir object c='" + c +
264                         "' peek='" + (char)pdfSource.peek() + "' " + pdfSource );
265                 }
266                 break;
267             }
268             case 'B':
269             {
270                 String JavaDoc next = readString();
271                 retval = PDFOperator.getOperator( next );
272
273                 if( next.equals( "BI" ) )
274                 {
275                     lastBIToken = (PDFOperator)retval;
276                     COSDictionary imageParams = new COSDictionary();
277                     lastBIToken.setImageParameters( new ImageParameters( imageParams ) );
278                     Object JavaDoc nextToken = null;
279                     while( (nextToken = parseNextToken()) instanceof COSName )
280                     {
281                         Object JavaDoc value = parseNextToken();
282                         imageParams.setItem( (COSName)nextToken, (COSBase)value );
283                     }
284                     //final token will be the image data, maybe??
285
PDFOperator imageData = (PDFOperator)nextToken;
286                     lastBIToken.setImageData( imageData.getImageData() );
287                 }
288                 break;
289             }
290             case 'I':
291             {
292                 //ImageParameters imageParams = lastBIToken.getImageParameters();
293

294                 //int expectedBytes = (int)Math.ceil(imageParams.getHeight() * imageParams.getWidth() *
295
// (imageParams.getBitsPerComponent()/8) );
296
//Special case for ID operator
297
String JavaDoc id = "" + (char)pdfSource.read() + (char)pdfSource.read();
298                 if( !id.equals( "ID" ) )
299                 {
300                     throw new IOException JavaDoc( "Error: Expected operator 'ID' actual='" + id + "'" );
301                 }
302                 ByteArrayOutputStream JavaDoc imageData = new ByteArrayOutputStream JavaDoc();
303                 //boolean foundEnd = false;
304
if( this.isWhitespace() )
305                 {
306                     //pull off the whitespace character
307
pdfSource.read();
308                 }
309                 int twoBytesAgo = 0;
310                 int lastByte = pdfSource.read();
311                 int currentByte = pdfSource.read();
312                 int count = 0;
313                 //PDF spec is kinda unclear about this. Should a whitespace
314
//always appear before EI? Not sure, I found a PDF
315
//(UnderstandingWebSphereClassLoaders.pdf) which has EI as part
316
//of the image data and will stop parsing prematurely if there is
317
//not a check for <whitespace>EI<whitespace>.
318
while( !(isWhitespace( twoBytesAgo ) &&
319                          lastByte == 'E' &&
320                          currentByte == 'I' &&
321                          isWhitespace() //&&
322
//amyuni2_05d__pdf1_3_acro4x.pdf has image data that
323
//is compressed, so expectedBytes is useless here.
324
//count >= expectedBytes
325
) &&
326                        !pdfSource.isEOF() )
327                 {
328                     imageData.write( lastByte );
329                     twoBytesAgo = lastByte;
330                     lastByte = currentByte;
331                     currentByte = pdfSource.read();
332                     count++;
333                 }
334                 pdfSource.unread( 'I' ); //unread the EI operator
335
pdfSource.unread( 'E' );
336                 retval = PDFOperator.getOperator( "ID" );
337                 ((PDFOperator)retval).setImageData( imageData.toByteArray() );
338                 break;
339             }
340             case ']':
341             {
342                 // some ']' around without its previous '['
343
// this means a PDF is somewhat corrupt but we will continue to parse.
344
pdfSource.read();
345                 retval = COSNull.NULL; // must be a better solution than null...
346
break;
347             }
348             default:
349             {
350                 //we must be an operator
351
String JavaDoc operator = readOperator();
352                 if( operator.trim().length() == 0 )
353                 {
354                     //we have a corrupt stream, stop reading here
355
retval = null;
356                 }
357                 else
358                 {
359                     retval = PDFOperator.getOperator( operator );
360                 }
361             }
362
363         }
364
365         return retval;
366     }
367
368     /**
369      * This will read an operator from the stream.
370      *
371      * @return The operator that was read from the stream.
372      *
373      * @throws IOException If there is an error reading from the stream.
374      */

375     protected String JavaDoc readOperator() throws IOException JavaDoc
376     {
377         skipSpaces();
378
379         //average string size is around 2 and the normal string buffer size is
380
//about 16 so lets save some space.
381
StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(4);
382         while(
383             !isWhitespace() &&
384             !isClosing() &&
385             !pdfSource.isEOF() &&
386             pdfSource.peek() != (int)'[' &&
387             pdfSource.peek() != (int)'<' &&
388             pdfSource.peek() != (int)'(' &&
389             pdfSource.peek() != (int)'/' &&
390             (pdfSource.peek() < (int)'0' ||
391              pdfSource.peek() > (int)'9' ) )
392         {
393             buffer.append( (char)pdfSource.read() );
394         }
395         return buffer.toString();
396     }
397 }
Popular Tags