BaseParser


1   /**
2    * Copyright (c) 2003-2006, www.pdfbox.org
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions are met:
7    *
8    * 1. Redistributions of source code must retain the above copyright notice,
9    *    this list of conditions and the following disclaimer.
10   * 2. Redistributions in binary form must reproduce the above copyright notice,
11   *    this list of conditions and the following disclaimer in the documentation
12   *    and/or other materials provided with the distribution.
13   * 3. Neither the name of pdfbox; nor the names of its
14   *    contributors may be used to endorse or promote products derived from this
15   *    software without specific prior written permission.
16   *
17   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20   * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
21   * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
24   * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27   *
28   * http://www.pdfbox.org
29   *
30   */
31  package org.pdfbox.pdfparser;
32  
33  import java.io.BufferedInputStream  ;
34  import java.io.InputStream  ;
35  import java.io.IOException  ;
36  import java.io.OutputStream  ;
37  
38  import java.util.ArrayList  ;
39  import java.util.List  ;
40  
41  import org.pdfbox.io.ByteArrayPushBackInputStream;
42  import org.pdfbox.io.PushBackInputStream;
43  import org.pdfbox.io.RandomAccess;
44  
45  import org.pdfbox.cos.COSArray;
46  import org.pdfbox.cos.COSBase;
47  import org.pdfbox.cos.COSBoolean;
48  import org.pdfbox.cos.COSDictionary;
49  import org.pdfbox.cos.COSDocument;
50  import org.pdfbox.cos.COSInteger;
51  import org.pdfbox.cos.COSName;
52  import org.pdfbox.cos.COSNull;
53  import org.pdfbox.cos.COSNumber;
54  import org.pdfbox.cos.COSObject;
55  import org.pdfbox.cos.COSStream;
56  import org.pdfbox.cos.COSString;
57  
58  import org.pdfbox.persistence.util.COSObjectKey;
59  
60  /**
61   * This class is used to contain parsing logic that will be used by both the
62   * PDFParser and the COSStreamParser.
63   *
64   * @author <a HREF="mailto:ben@benlitchfield.com">Ben Litchfield</a>
65   * @version $Revision: 1.59 $
66   */
67  public abstract class BaseParser
68  {
69      /**
70       * This is a byte array that will be used for comparisons.
71       */
72      public static final byte[] ENDSTREAM = 
73          new byte[] {101,110,100,115,116,114,101,97,109};//"endstream".getBytes( "ISO-8859-1" );
74  
75      /**
76       * This is a byte array that will be used for comparisons.
77       */
78      public static final String   DEF = "def";
79  
80      /**
81       * This is the stream that will be read from.
82       */
83      //protected PushBackByteArrayStream pdfSource;
84      protected PushBackInputStream pdfSource;
85  
86      /**
87       * moved xref here, is a persistence construct
88       * maybe not needed anyway when not read from behind with delayed
89       * access to objects.
90       */
91      private List   xrefs = new ArrayList  ();
92  
93      private COSDocument document;
94  
/**
 * Creates a parser that reads its data from a stream.
 *
 * @param input The input stream to read the data from.
 *
 * @throws IOException If there is an error reading the input stream.
 */
public BaseParser( InputStream input) throws IOException
{
    // wrap the raw input in a 16384 byte buffer first; the second argument
    // (4096) is presumably the push-back capacity -- confirm against PushBackInputStream
    BufferedInputStream buffered = new BufferedInputStream( input, 16384 );
    this.pdfSource = new PushBackInputStream( buffered, 4096 );
}
107     
/**
 * Creates a parser that reads its data from an in-memory byte array.
 *
 * @param input The array to read the data from.
 *
 * @throws IOException If there is an error reading the byte data.
 */
protected BaseParser(byte[] input) throws IOException
{
    this.pdfSource = new ByteArrayPushBackInputStream(input);
}
119     
120     /**
121      * Set the document for this stream.
122      * 
123      * @param doc The current document.
124      */
125     public void setDocument( COSDocument doc )
126     {
127         document = doc;
128     }
129 
130     private static boolean isHexDigit(char ch)
131     {
132         return (ch >= '0' && ch <= '9') || 
133         (ch >= 'a' && ch <= 'f') || 
134         (ch >= 'A' && ch <= 'F');
135         // the line below can lead to problems with certain versions of the IBM JIT compiler
136         // (and is slower anyway)
137         //return (HEXDIGITS.indexOf(ch) != -1);
138     }
139 
140     /**
141      * This will parse a PDF dictionary value.
142      *
143      * @return The parsed Dictionary object.
144      *
145      * @throws IOException If there is an error parsing the dictionary object.
146      */
147     private COSBase parseCOSDictionaryValue() throws IOException  
148     {
149         COSBase retval = null;
150         COSBase number = parseDirObject();
151         skipSpaces();
152         char next = (char)pdfSource.peek();
153         if( next >= '0' && next <= '9' )
154         {
155             COSBase generationNumber = parseDirObject();
156             skipSpaces();
157             char r = (char)pdfSource.read();
158             if( r != 'R' )
159             {
160                 throw new IOException  ( "expected='R' actual='" + r + "' " + pdfSource );
161             }
162             COSObjectKey key = new COSObjectKey(((COSInteger) number).intValue(),
163                                                 ((COSInteger) generationNumber).intValue());
164             retval = document.getObjectFromPool(key);
165         }
166         else
167         {
168             retval = number;
169         }
170         return retval;
171     }
172 
173     /**
174      * This will parse a PDF dictionary.
175      *
176      * @return The parsed dictionary.
177      *
178      * @throws IOException IF there is an error reading the stream.
179      */
180     protected COSDictionary parseCOSDictionary() throws IOException  
181     {
182         char c = (char)pdfSource.read();
183         if( c != '<')
184         {
185             throw new IOException  ( "expected='<' actual='" + c + "'" );
186         }
187         c = (char)pdfSource.read();
188         if( c != '<')
189         {
190             throw new IOException  ( "expected='<' actual='" + c + "' " + pdfSource );
191         }
192         skipSpaces();
193         COSDictionary obj = new COSDictionary();
194         boolean done = false;
195         while( !done )
196         {
197             skipSpaces();
198             c = (char)pdfSource.peek();
199             if( c == '>')
200             {
201                 done = true;
202             }
203             else
204             {
205                 COSName key = parseCOSName();
206                 COSBase value = parseCOSDictionaryValue();
207                 skipSpaces();
208                 if( ((char)pdfSource.peek()) == 'd' )
209                 {
210                     //if the next string is 'def' then we are parsing a cmap stream
211                     //and want to ignore it, otherwise throw an exception.
212                     String   potentialDEF = readString();
213                     if( !potentialDEF.equals( DEF ) )
214                     {
215                         pdfSource.unread( potentialDEF.getBytes() );
216                     }
217                     else
218                     {
219                         skipSpaces();
220                     }
221                 }
222 
223                 if( value == null )
224                 {
225                     throw new IOException  ("Bad Dictionary Declaration " + pdfSource );
226                 }
227                 obj.setItem( key, value );
228             }
229         }
230         char ch = (char)pdfSource.read();
231         if( ch != '>' )
232         {
233             throw new IOException  ( "expected='>' actual='" + ch + "'" );
234         }
235         ch = (char)pdfSource.read();
236         if( ch != '>' )
237         {
238             throw new IOException  ( "expected='>' actual='" + ch + "'" );
239         }
240         return obj;
241     }
242 
243     /**
244      * This will read a COSStream from the input stream.
245      *
246      * @param file The file to write the stream to when reading.
247      * @param dic The dictionary that goes with this stream.
248      *
249      * @return The parsed pdf stream.
250      *
251      * @throws IOException If there is an error reading the stream.
252      */
253     protected COSStream parseCOSStream( COSDictionary dic, RandomAccess file ) throws IOException  
254     {
255         COSStream stream = new COSStream( dic, file );
256         OutputStream   out = null;
257         try
258         {
259             String   streamString = readString();
260             //long streamLength;
261 
262             if (!streamString.equals("stream"))
263             {
264                 throw new IOException  ("expected='stream' actual='" + streamString + "'");
265             }
266 
267             //PDF Ref 3.2.7 A stream must be followed by either
268             //a CRLF or LF but nothing else.
269 
270             int whitespace = pdfSource.read();
271             
272             //see brother_scan_cover.pdf, it adds whitespaces
273             //after the stream but before the start of the 
274             //data, so just read those first
275             while (whitespace == 0x20)
276             {
277                 whitespace = pdfSource.read();
278             }
279 
280             if( whitespace == 0x0D )
281             {
282                 whitespace = pdfSource.read();
283                 if( whitespace != 0x0A )
284                 {
285                     pdfSource.unread( whitespace );
286                     //The spec says this is invalid but it happens in the real
287                     //world so we must support it.
288                     //throw new IOException("expected='0x0A' actual='0x" +
289                     //    Integer.toHexString(whitespace) + "' " + pdfSource);
290                 }
291             }
292             else if (whitespace == 0x0A)
293             {
294                 //that is fine
295             }
296             else
297             {
298                 //we are in an error.
299                 //but again we will do a lenient parsing and just assume that everything
300                 //is fine
301                 pdfSource.unread( whitespace );
302                 //throw new IOException("expected='0x0D or 0x0A' actual='0x" +
303                 //Integer.toHexString(whitespace) + "' " + pdfSource);
304 
305             }
306 
307 
308             COSBase streamLength = dic.getDictionaryObject(COSName.LENGTH);
309             /*long length = -1;
310             if( streamLength instanceof COSNumber )
311             {
312                 length = ((COSNumber)streamLength).intValue();
313             }
314             else if( streamLength instanceof COSObject &&
315                      ((COSObject)streamLength).getObject() instanceof COSNumber )
316             {
317                 length = ((COSNumber)((COSObject)streamLength).getObject()).intValue();
318             }*/
319 
320             //length = -1;
321             //streamLength = null;
322 
323             //Need to keep track of the
324             out = stream.createFilteredStream( streamLength );
325             String   endStream = null;
326             //the length is wrong in some pdf documents which means
327             //that PDFBox must basically ignore it in order to be able to read
328             //the most number of PDF documents.  This of course is a penalty hit,
329             //maybe I could implement a faster parser.
330             /**if( length != -1 )
331             {
332                 byte[] buffer = new byte[1024];
333                 int amountRead = 0;
334                 int totalAmountRead = 0;
335                 while( amountRead != -1 && totalAmountRead < length )
336                 {
337                     int maxAmountToRead = Math.min(buffer.length, (int)(length-totalAmountRead));
338                     amountRead = pdfSource.read(buffer,0,maxAmountToRead);
339                     totalAmountRead += amountRead;
340                     if( amountRead != -1 )
341                     {
342                         out.write( buffer, 0, amountRead );
343                     }
344                 }
345             }
346             else
347             {**/
348                 readUntilEndStream( out );
349             /**}*/
350             skipSpaces();
351             endStream = readString();
352 
353             if (!endStream.equals("endstream"))
354             {
355                 readUntilEndStream( out );
356                 endStream = readString();
357                 if( !endStream.equals( "endstream" ) )
358                 {
359                     throw new IOException  ("expected='endstream' actual='" + endStream + "' " + pdfSource);
360                 }
361             }
362         }
363         finally
364         {
365             if( out != null )
366             {
367                 out.close();
368             }
369         }
370         return stream;
371     }
372 
373     private void readUntilEndStream( OutputStream   out ) throws IOException  
374     {
375         int currentIndex = 0;
376         int byteRead = 0;
377         //this is the additional bytes buffered but not written
378         int additionalBytes=0;
379         byte[] buffer = new byte[ENDSTREAM.length+additionalBytes];
380         int writeIndex = 0;
381         while(!cmpCircularBuffer( buffer, currentIndex, ENDSTREAM ) && byteRead != -1 )
382         {
383             writeIndex = currentIndex - buffer.length;
384             if( writeIndex >= 0 )
385             {
386                 out.write( buffer[writeIndex%buffer.length] );
387             }
388             byteRead = pdfSource.read();
389             buffer[currentIndex%buffer.length] = (byte)byteRead;
390             currentIndex++;
391         }
392 
393         //we want to ignore the end of the line data when reading a stream
394         //so will make an attempt to ignore it.
395         /*writeIndex = currentIndex - buffer.length;
396         if( buffer[writeIndex%buffer.length] == 13 &&
397             buffer[(writeIndex+1)%buffer.length] == 10 )
398         {
399             //then ignore the newline before the endstream
400         }
401         else if( buffer[(writeIndex+1)%buffer.length] == 10 )
402         {
403             //Then first byte is data, second byte is newline
404             out.write( buffer[writeIndex%buffer.length] );
405         }
406         else
407         {
408             out.write( buffer[writeIndex%buffer.length] );
409             out.write( buffer[(writeIndex+1)%buffer.length] );
410         }*/
411 
412         /**
413          * Old way of handling newlines before endstream
414         for( int i=0; i<additionalBytes; i++ )
415         {
416             writeIndex = currentIndex - buffer.length;
417             if( writeIndex >=0 &&
418                 //buffer[writeIndex%buffer.length] != 10 &&
419                 buffer[writeIndex%buffer.length] != 13 )
420             {
421                 out.write( buffer[writeIndex%buffer.length] );
422             }
423             currentIndex++;
424         }
425         */
426         pdfSource.unread( ENDSTREAM );
427 
428     }
429 
430     /**
431      * This basically checks to see if the next compareTo.length bytes of the
432      * buffer match the compareTo byte array.
433      */
434     private boolean cmpCircularBuffer( byte[] buffer, int currentIndex, byte[] compareTo )
435     {
436         int cmpLen = compareTo.length;
437         int buflen = buffer.length;
438         boolean match = true;
439         int off = currentIndex-cmpLen;
440         if( off < 0 )
441         {
442             match = false;
443         }
444         for( int i=0; match && i<cmpLen; ++i )
445         {
446             match = buffer[(off+i)%buflen] == compareTo[i];
447         }
448         return match;
449     }
450 
451     /**
452      * This will parse a PDF string.
453      *
454      * @return The parsed PDF string.
455      *
456      * @throws IOException If there is an error reading from the stream.
457      */
458     protected COSString parseCOSString() throws IOException  
459     {
460         char nextChar = (char)pdfSource.read();
461         COSString retval = new COSString();
462         char openBrace;
463         char closeBrace;
464         if( nextChar == '(' )
465         {
466             openBrace = '(';
467             closeBrace = ')';
468         }
469         else if( nextChar == '<' )
470         {
471             openBrace = '<';
472             closeBrace = '>';
473         }
474         else
475         {
476             throw new IOException  ( "parseCOSString string should start with '(' or '<' and not '" +
477                                    nextChar + "' " + pdfSource );
478         }
479 
480         //This is the number of braces read
481         //
482         int braces = 1;
483         int c = pdfSource.read();
484         while( braces > 0 && c != -1)
485         {
486             char ch = (char)c;
487             int nextc = -2; // not yet read
488             //if( log.isDebugEnabled() )
489             //{
490             //    log.debug( "Parsing COSString character '" + c + "' code=" + (int)c );
491             //}
492 
493             if(ch == closeBrace)
494             {
495                 braces--;
496                 byte[] nextThreeBytes = new byte[3];
497                 int amountRead = pdfSource.read(nextThreeBytes);
498                 
499                 //lets handle the special case seen in Bull  River Rules and Regulations.pdf
500                 //The dictionary looks like this
501                 //    2 0 obj
502                 //    <<
503                 //        /Type /Info
504                 //        /Creator (PaperPort http://www.scansoft.com)
505                 //        /Producer (sspdflib 1.0 http://www.scansoft.com)
506                 //        /Title ( (5)
507                 //        /Author ()
508                 //        /Subject ()
509                 //
510                 // Notice the /Title, the braces are not even but they should
511                 // be.  So lets assume that if we encounter an this scenario
512                 //   <end_brace><new_line><opening_slash> then that
513                 // means that there is an error in the pdf and assume that
514                 // was the end of the document.
515                 if( amountRead == 3 )
516                 {
517                     if( nextThreeBytes[0] == 0x0d &&
518                         nextThreeBytes[1] == 0x0a &&
519                         nextThreeBytes[2] == 0x2f )
520                     {
521                         braces = 0;
522                     }
523                 }
524                 pdfSource.unread( nextThreeBytes, 0, amountRead );
525                 if( braces != 0 )
526                 {
527                     retval.append( ch );
528                 }
529             }
530             else if( ch == openBrace )
531             {
532                 braces++;
533                 retval.append( ch );
534             }
535             else if( ch == '\\' )
536             {
537                  //patched by ram
538                 char next = (char)pdfSource.read();
539                 switch(next)
540                 {
541                     case 'n':
542                         retval.append( '\n' );
543                         break;
544                     case 'r':
545                         retval.append( '\r' );
546                         break;
547                     case 't':
548                         retval.append( '\t' );
549                         break;
550                     case 'b':
551                         retval.append( '\b' );
552                         break;
553                     case 'f':
554                         retval.append( '\f' );
555                         break;
556                     case '(':
557                     case ')':
558                     case '\\':
559                         retval.append( next );
560                         break;
561                     case 10:
562                     case 13:
563                         //this is a break in the line so ignore it and the newline and continue
564                         c = pdfSource.read();
565                         while( isEOL(c) && c != -1)
566                         {
567                             c = pdfSource.read();
568                         }
569                         nextc = c;
570                         break;
571                     case '0':
572                     case '1':
573                     case '2':
574                     case '3':
575                     case '4':
576                     case '5':
577                     case '6':
578                     case '7':
579                     {
580                         StringBuffer   octal = new StringBuffer  ();
581                         octal.append( next );
582                         c = pdfSource.read();
583                         char digit = (char)c;
584                         if( digit >= '0' && digit <= '7' )
585                         {
586                             octal.append( digit );
587                             c = pdfSource.read();
588                             digit = (char)c;
589                             if( digit >= '0' && digit <= '7' )
590                             {
591                                 octal.append( digit );
592                             }
593                             else 
594                             {
595                                 nextc = c;
596                             }
597                         }
598                         else
599                         {
600                             nextc = c;
601                         }   
602 
603                         int character = 0;
604                         try
605                         {
606                             character = Integer.parseInt( octal.toString(), 8 );
607                         }
608                         catch( NumberFormatException   e )
609                         {
610                             throw new IOException  ( "Error: Expected octal character, actual='" + octal + "'" );
611                         }
612                         retval.append( character );
613                         break;
614                     }
615                     default:
616                     {
617                         retval.append( '\\' );
618                         retval.append( next );
619                         //another ficken problem with PDF's, sometimes the \ doesn't really
620                         //mean escape like the PDF spec says it does, sometimes is should be literal
621                         //which is what we will assume here.
622                         //throw new IOException( "Unexpected break sequence '" + next + "' " + pdfSource );
623                     }
624                 }
625             }
626             else
627             {
628                 if( openBrace == '<' )
629                 {
630                     if( isHexDigit(ch) )
631                     {
632                         retval.append( ch );
633                     }
634                 }
635                 else
636                 {
637                     retval.append( ch );
638                 }
639             }
640             if (nextc != -2)
641             {
642                 c = nextc;
643             }
644             else 
645             {
646                 c = pdfSource.read();
647             }
648         }
649         if (c != -1)
650         {
651             pdfSource.unread(c);
652         }
653         if( openBrace == '<' )
654         {
655             retval = COSString.createFromHexString( retval.getString() );
656         }
657         return retval;
658     }
659 
660     /**
661      * This will parse a PDF array object.
662      *
663      * @return The parsed PDF array.
664      *
665      * @throws IOException If there is an error parsing the stream.
666      */
667     protected COSArray parseCOSArray() throws IOException  
668     {
669         char ch = (char)pdfSource.read();
670         if( ch != '[')
671         {
672             throw new IOException  ( "expected='[' actual='" + ch + "'" );
673         }
674         COSArray po = new COSArray();
675         COSBase pbo = null;
676         skipSpaces();
677         int i = 0;
678         while( ((i = pdfSource.peek()) > 0) && ((char)i != ']') )
679         {
680             pbo = parseDirObject();
681             if( pbo instanceof COSObject )
682             {
683                 COSInteger genNumber = (COSInteger)po.remove( po.size() -1 );
684                 COSInteger number = (COSInteger)po.remove( po.size() -1 );
685                 COSObjectKey key = new COSObjectKey(number.intValue(), genNumber.intValue());
686                 pbo = document.getObjectFromPool(key);
687             }
688             if( pbo != null )
689             {
690                 po.add( pbo );
691             }
692             else
693             {
694                 //it could be a bad object in the array which is just skipped
695             }
696             skipSpaces();
697         }
698         pdfSource.read(); //read ']'
699         skipSpaces();
700         return po;
701     }
702 
703     /**
704      * Determine if a character terminates a PDF name.
705      *
706      * @param ch The character
707      * @return <code>true</code> if the character terminates a PDF name, otherwise <code>false</code>.
708      */
709     protected boolean isEndOfName(char ch)
710     {
711         return (ch == ' ' || ch == 13 || ch == 10 || ch == 9 || ch == '>' || ch == '<'
712             || ch == '[' || ch =='/' || ch ==']' || ch ==')' || ch =='(' ||
713             ch == -1 //EOF
714             );
715     }
716 
717     /**
718      * This will parse a PDF name from the stream.
719      *
720      * @return The parsed PDF name.
721      *
722      * @throws IOException If there is an error reading from the stream.
723      */
724     protected COSName parseCOSName() throws IOException  
725     {
726         COSName retval = null;
727         int c = pdfSource.read();
728         if( (char)c != '/')
729         {
730             throw new IOException  ("expected='/' actual='" + (char)c + "'-" + c + " " + pdfSource );
731         }
732         // costruisce il nome
733         StringBuffer   buffer = new StringBuffer  ();
734         c = pdfSource.read();
735         while( c != -1 )
736         {
737             char ch = (char)c;
738             if(ch == '#')
739             {
740                 char ch1 = (char)pdfSource.read();
741                 char ch2 = (char)pdfSource.read();
742 
743                 // Prior to PDF v1.2, the # was not a special character.  Also,
744                 // it has been observed that various PDF tools do not follow the
745                 // spec with respect to the # escape, even though they report
746                 // PDF versions of 1.2 or later.  The solution here is that we
747                 // interpret the # as an escape only when it is followed by two
748                 // valid hex digits.
749                 //
750                 if (isHexDigit(ch1) && isHexDigit(ch2))
751                 {
752                     String   hex = "" + ch1 + ch2;
753                     try
754                     {
755                         buffer.append( (char) Integer.parseInt(hex, 16));
756                     }
757                     catch (NumberFormatException   e)
758                     {
759                         throw new IOException  ("Error: expected hex number, actual='" + hex + "'");
760                     }
761                     c = pdfSource.read();
762                 }
763                 else
764                 {
765                     pdfSource.unread(ch2);
766                     c = ch1;
767                     buffer.append( ch );
768                 }
769             }
770             else if (isEndOfName(ch))
771             {
772                 break;
773             }
774             else
775             {
776                 buffer.append( ch );
777                 c = pdfSource.read();
778             }
779         }
780         if (c != -1)
781         {
782             pdfSource.unread(c);
783         }
784         retval = COSName.getPDFName( buffer.toString() );
785         return retval;
786     }
787 
788     /**
789      * This will parse a boolean object from the stream.
790      *
791      * @return The parsed boolean object.
792      *
793      * @throws IOException If an IO error occurs during parsing.
794      */
795     protected COSBoolean parseBoolean() throws IOException  
796     {
797         COSBoolean retval = null;
798         char c = (char)pdfSource.peek();
799         if( c == 't' )
800         {
801             byte[] trueArray = new byte[ 4 ];
802             int amountRead = pdfSource.read( trueArray, 0, 4 );
803             String   trueString = new String  ( trueArray, 0, amountRead );
804             if( !trueString.equals( "true" ) )
805             {
806                 throw new IOException  ( "Error parsing boolean: expected='true' actual='" + trueString + "'" );
807             }
808             else
809             {
810                 retval = COSBoolean.TRUE;
811             }
812         }
813         else if( c == 'f' )
814         {
815             byte[] falseArray = new byte[ 5 ];
816             int amountRead = pdfSource.read( falseArray, 0, 5 );
817             String   falseString = new String  ( falseArray, 0, amountRead );
818             if( !falseString.equals( "false" ) )
819             {
820                 throw new IOException  ( "Error parsing boolean: expected='true' actual='" + falseString + "'" );
821             }
822             else
823             {
824                 retval = COSBoolean.FALSE;
825             }
826         }
827         else
828         {
829             throw new IOException  ( "Error parsing boolean expected='t or f' actual='" + c + "'" );
830         }
831         return retval;
832     }
833 
834     /**
835      * This will parse a directory object from the stream.
836      *
837      * @return The parsed object.
838      *
839      * @throws IOException If there is an error during parsing.
840      */
841     protected COSBase parseDirObject() throws IOException  
842     {
843         COSBase retval = null;
844 
845         skipSpaces();
846         int nextByte = pdfSource.peek();
847         char c = (char)nextByte;
848         switch(c)
849         {
850             case '<':
851             {
852                 int leftBracket = pdfSource.read();//pull off first left bracket
853                 c = (char)pdfSource.peek(); //check for second left bracket
854                 pdfSource.unread( leftBracket );
855                 if(c == '<')
856                 {
857 
858                     retval = parseCOSDictionary();
859                     skipSpaces();
860                 }
861                 else
862                 {
863                     retval = parseCOSString();
864                 }
865                 break;
866             }
867             case '[': // array
868             {
869                 retval = parseCOSArray();
870                 break;
871             }
872             case '(':
873                 retval = parseCOSString();
874                 break;
875             case '/':   // name
876                 retval = parseCOSName();
877                 break;
878             case 'n':   // null
879             {
880                 String   nullString = readString();
881                 if( !nullString.equals( "null") )
882                 {
883                     throw new IOException  ("Expected='null' actual='" + nullString + "'");
884                 }
885                 retval = COSNull.NULL;
886                 break;
887             }
888             case 't':
889             {
890                 byte[] trueBytes = new byte[4];
891                 int amountRead = pdfSource.read( trueBytes, 0, 4 );
892                 String   trueString = new String  ( trueBytes, 0, amountRead );
893                 if( trueString.equals( "true" ) )
894                 {
895                     retval = COSBoolean.TRUE;
896                 }
897                 else
898                 {
899                     throw new IOException  ( "expected true actual='" + trueString + "' " + pdfSource );
900                 }
901                 break;
902             }
903             case 'f':
904             {
905                 byte[] falseBytes = new byte[5];
906                 int amountRead = pdfSource.read( falseBytes, 0, 5 );
907                 String   falseString = new String  ( falseBytes, 0, amountRead );
908                 if( falseString.equals( "false" ) )
909                 {
910                     retval = COSBoolean.FALSE;
911                 }
912                 else
913                 {
914                     throw new IOException  ( "expected false actual='" + falseString + "' " + pdfSource );
915                 }
916                 break;
917             }
918             case 'R':
919                 pdfSource.read();
920                 retval = new COSObject(null);
921                 break;
922             case (char)-1:
923                 return null;
924             default:
925             {
926                 if( Character.isDigit(c) || c == '-' || c == '+' || c == '.')
927                 {
928                     StringBuffer   buf = new StringBuffer  ();
929                     int ic = pdfSource.read();
930                     c = (char)ic;
931                     while( Character.isDigit( c )||
932                            c == '-' ||
933                            c == '+' ||
934                            c == '.' ||
935                            c == 'E' ||
936                            c == 'e' )
937                     {
938                         buf.append( c );
939                         ic = pdfSource.read();
940                         c = (char)ic;
941                     }
942                     if( ic != -1 )
943                     {
944                         pdfSource.unread( ic );
945                     }
946                     retval = COSNumber.get( buf.toString() );
947                 }
948                 else
949                 {
950                     //This is not suppose to happen, but we will allow for it
951                     //so we are more compatible with POS writers that don't
952                     //follow the spec
953                     String   badString = readString();
954                     //throw new IOException( "Unknown dir object c='" + c +
955                     //"' peek='" + (char)pdfSource.peek() + "' " + pdfSource );
956                     if( badString == null || badString.length() == 0 )
957                     {
958                         int peek = pdfSource.peek();
959                         // we can end up in an infinite loop otherwise
960                         throw new IOException  ( "Unknown dir object c='" + c +
961                            "' cInt=" + (int)c + " peek='" + (char)peek + "' peekInt=" + peek + " " + pdfSource );
962                     }
963 
964                 }
965             }
966         }
967         return retval;
968     }
969 
970     /**
971      * This will read the next string from the stream.
972      *
973      * @return The string that was read from the stream.
974      *
975      * @throws IOException If there is an error reading from the stream.
976      */
977     protected String   readString() throws IOException  
978     {
979         skipSpaces();
980         StringBuffer   buffer = new StringBuffer  ();
981         int c = pdfSource.read();
982         while( !isEndOfName((char)c) && !isClosing(c) && c != -1 )
983         {
984             buffer.append( (char)c );
985             c = pdfSource.read();
986         }
987         if (c != -1)
988         {
989             pdfSource.unread(c);
990         }
991         return buffer.toString();
992     }
993 
994     /**
995      * This will read bytes until the end of line marker occurs.
996      *
997      * @param theString The next expected string in the stream.
998      *
999      * @return The characters between the current position and the end of the line.
1000     *
1001     * @throws IOException If there is an error reading from the stream or theString does not match what was read.
1002     */
1003    protected String   readExpectedString( String   theString ) throws IOException  
1004    {
1005        int c = pdfSource.read();
1006        while( isWhitespace(c) && c != -1)
1007        {
1008            c = pdfSource.read();
1009        }
1010        StringBuffer   buffer = new StringBuffer  ( theString.length() );
1011        int charsRead = 0;
1012        while( !isEOL(c) && c != -1 && charsRead < theString.length() )
1013        {
1014            char next = (char)c;
1015            buffer.append( next );
1016            if( theString.charAt( charsRead ) == next )
1017            {
1018                charsRead++;
1019            }
1020            else
1021            {
1022                throw new IOException  ( "Error: Expected to read '" + theString +
1023                    "' instead started reading '" +buffer.toString() + "'" );
1024            }
1025            c = pdfSource.read();
1026        }
1027        while( isEOL(c) && c != -1 )
1028        {
1029            c = pdfSource.read();
1030        }
1031        if (c != -1)
1032        {
1033            pdfSource.unread(c);
1034        }
1035        return buffer.toString();
1036    }
1037
1038    /**
1039     * This will read the next string from the stream up to a certain length.
1040     *
1041     * @param length The length to stop reading at.
1042     *
1043     * @return The string that was read from the stream of length 0 to length.
1044     *
1045     * @throws IOException If there is an error reading from the stream.
1046     */
1047    protected String   readString( int length ) throws IOException  
1048    {
1049        skipSpaces();
1050
1051        int c = pdfSource.read();
1052        
1053        //average string size is around 2 and the normal string buffer size is
1054        //about 16 so lets save some space.
1055        StringBuffer   buffer = new StringBuffer  (length);
1056        while( !isWhitespace(c) && !isClosing(c) && c != -1 && buffer.length() < length &&
1057            c != '[' &&
1058            c != '<' &&
1059            c != '(' &&
1060            c != '/' )
1061        {
1062            buffer.append( (char)c );
1063            c = pdfSource.read();
1064        }
1065        if (c != -1)
1066        {
1067            pdfSource.unread(c);
1068        }
1069        return buffer.toString();
1070    }
1071
1072    /**
1073     * This will tell if the next character is a closing brace( close of PDF array ).
1074     *
1075     * @return true if the next byte is ']', false otherwise.
1076     *
1077     * @throws IOException If an IO error occurs.
1078     */
1079    protected boolean isClosing() throws IOException  
1080    {
1081        return isClosing(pdfSource.peek());
1082    }
1083    
1084    /**
1085     * This will tell if the next character is a closing brace( close of PDF array ).
1086     *
1087     * @param c The character to check against end of line
1088     * @return true if the next byte is ']', false otherwise.
1089     */
1090    protected boolean isClosing(int c) 
1091    {
1092        return c == ']';
1093    }
1094
1095    /**
1096     * This will read bytes until the end of line marker occurs.
1097     *
1098     * @return The characters between the current position and the end of the line.
1099     *
1100     * @throws IOException If there is an error reading from the stream.
1101     */
1102    protected String   readLine() throws IOException  
1103    {
1104        int c = pdfSource.read();
1105        while(isWhitespace(c) && c != -1)
1106        {
1107            c = pdfSource.read();
1108        }
1109        StringBuffer   buffer = new StringBuffer  ( 11 );
1110        
1111        while( !isEOL(c) && c != -1 )
1112        {
1113            buffer.append( (char)c );
1114            c = pdfSource.read();
1115        }
1116        while( isEOL(c) && c != -1 )
1117        {
1118            c = pdfSource.read();
1119        }
1120        if (c != -1)
1121        {
1122            pdfSource.unread(c);
1123        }
1124        return buffer.toString();
1125    }
1126
1127    /**
1128     * This will tell if the next byte to be read is an end of line byte.
1129     *
1130     * @return true if the next byte is 0x0A or 0x0D.
1131     *
1132     * @throws IOException If there is an error reading from the stream.
1133     */
1134    protected boolean isEOL() throws IOException  
1135    {
1136        return isEOL(pdfSource.peek());
1137    }
1138    
1139    /**
1140     * This will tell if the next byte to be read is an end of line byte.
1141     *
1142     * @param c The character to check against end of line
1143     * @return true if the next byte is 0x0A or 0x0D.
1144     */
1145    protected boolean isEOL(int c)
1146    {
1147        return c == 10 || c == 13;
1148    }
1149
1150    /**
1151     * This will tell if the next byte is whitespace or not.
1152     *
1153     * @return true if the next byte in the stream is a whitespace character.
1154     *
1155     * @throws IOException If there is an error reading from the stream.
1156     */
1157    protected boolean isWhitespace() throws IOException  
1158    {
1159        return isWhitespace( pdfSource.peek() );
1160    }
1161
1162    /**
1163     * This will tell if the next byte is whitespace or not.
1164     *
1165     * @param c The character to check against whitespace
1166     *
1167     * @return true if the next byte in the stream is a whitespace character.
1168     */
1169    protected boolean isWhitespace( int c )
1170    {
1171        return c == 0 || c == 9 || c == 12  || c == 10
1172        || c == 13 || c == 32;
1173    }
1174
1175    /**
1176     * This will skip all spaces and comments that are present.
1177     *
1178     * @throws IOException If there is an error reading from the stream.
1179     */
1180    protected void skipSpaces() throws IOException  
1181    {
1182        //log( "skipSpaces() " + pdfSource );
1183        int c = pdfSource.read();
1184        // identical to, but faster as: isWhiteSpace(c) || c == 37
1185        while(c == 0 || c == 9 || c == 12  || c == 10
1186                || c == 13 || c == 32 || c == 37)//37 is the % character, a comment
1187        {
1188            if ( c == 37 )
1189            {
1190                // skip past the comment section
1191                c = pdfSource.read();
1192                while(!isEOL(c) && c != -1)
1193                {
1194                    c = pdfSource.read();
1195                }
1196            }
1197            else 
1198            {
1199                c = pdfSource.read();
1200            }
1201        }
1202        if (c != -1)
1203        {
1204            pdfSource.unread(c);
1205        }
1206        //log( "skipSpaces() done peek='" + (char)pdfSource.peek() + "'" );
1207    }
1208
1209    /**
1210     * This will read an integer from the stream.
1211     *
1212     * @return The integer that was read from the stream.
1213     *
1214     * @throws IOException If there is an error reading from the stream.
1215     */
1216    protected int readInt() throws IOException  
1217    {
1218        skipSpaces();
1219        int retval = 0;
1220
1221        int lastByte = 0;
1222        StringBuffer   intBuffer = new StringBuffer  ();
1223        while( (lastByte = pdfSource.read() ) != 32 &&
1224        lastByte != 10 &&
1225        lastByte != 13 &&
1226        lastByte != 0 && //See sourceforge bug 853328
1227        lastByte != -1 )
1228        {
1229            intBuffer.append( (char)lastByte );
1230        }
1231        try
1232        {
1233            retval = Integer.parseInt( intBuffer.toString() );
1234        }
1235        catch( NumberFormatException   e )
1236        {
1237            throw new IOException  ( "Error: Expected an integer type, actual='" + intBuffer + "'" );
1238        }
1239        return retval;
1240    }
1241
1242    /**
1243     * This will add an xref.
1244     *
1245     * @param xref The xref to add.
1246     */
1247    public void addXref( PDFXref xref )
1248    {
1249        xrefs.add(xref);
1250    }
1251
1252    /**
1253     * This will get all of the xrefs.
1254     *
1255     * @return A list of all xrefs.
1256     */
1257    public List   getXrefs()
1258    {
1259        return xrefs;
1260    }
1261}
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags