Parser


1   /*
2     Copyright © 2006 Stefano Chizzolini. http://clown.stefanochizzolini.it
3   
4     Contributors:
5       * Stefano Chizzolini (original code developer, info@stefanochizzolini.it):
6         contributed code is Copyright © 2006 by Stefano Chizzolini.
7   
8     This file should be part of the source code distribution of "PDF Clown library"
9     (the Program): see the accompanying README files for more info.
10  
11    This Program is free software; you can redistribute it and/or modify it under
12    the terms of the GNU General Public License as published by the Free Software
13    Foundation; either version 2 of the License, or (at your option) any later version.
14  
15    This Program is distributed in the hope that it will be useful, but WITHOUT ANY
16    WARRANTY, either expressed or implied; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the License for more details.
18  
19    You should have received a copy of the GNU General Public License along with this
20    Program (see README files); if not, go to the GNU website (http://www.gnu.org/).
21  
22    Redistribution and use, with or without modification, are permitted provided that such
23    redistributions retain the above copyright notice, license and disclaimer, along with
24    this list of conditions.
25  */
26  
27  package it.stefanochizzolini.clown.documents.contents.tokens;
28  
29  import it.stefanochizzolini.clown.bytes.IInputStream;
30  import it.stefanochizzolini.clown.documents.contents.objects.Operation;
31  import it.stefanochizzolini.clown.objects.PdfArray;
32  import it.stefanochizzolini.clown.objects.PdfBoolean;
33  import it.stefanochizzolini.clown.objects.PdfDate;
34  import it.stefanochizzolini.clown.objects.PdfDictionary;
35  import it.stefanochizzolini.clown.objects.PdfDirectObject;
36  import it.stefanochizzolini.clown.objects.PdfHex;
37  import it.stefanochizzolini.clown.objects.PdfInteger;
38  import it.stefanochizzolini.clown.objects.PdfLiteral;
39  import it.stefanochizzolini.clown.objects.PdfName;
40  import it.stefanochizzolini.clown.objects.PdfNull;
41  import it.stefanochizzolini.clown.objects.PdfReal;
42  import it.stefanochizzolini.clown.objects.PdfReference;
43  import it.stefanochizzolini.clown.objects.PdfStream;
44  import it.stefanochizzolini.clown.tokens.FileFormatException;
45  import it.stefanochizzolini.clown.tokens.TokenTypeEnum;
46  import java.io.EOFException  ;
47  import java.util.ArrayList  ;
48  import java.util.Date  ;
49  import java.util.List  ;
50  
51  /**
52    Content stream parser [PDF:1.6:3.7.1].
53  */
54  public class Parser
55  {
56  /*
57  TODO:IMPL this parser evaluates a subset of the lexical domain of the token parser (clown.serialization.Parser): it would be better to derive both parsers from a common parsing engine in order to avoid unwieldy duplication.
58  */
59    // <class>
60    // <static>
61    // <interface>
62    // <protected>
63    protected static int getHex(
64      int c
65      )
66    {
67      if(c >= '0' && c <= '9')
68        return (c - '0');
69      if(c >= 'A' && c <= 'F')
70        return (c - 'A' + 10);
71      if(c >= 'a' && c <= 'f')
72        return (c - 'a' + 10);
73      return -1;
74    }
75  
76    /**
77      Evaluates whether a character is a delimiter [PDF:1.6:3.1.1].
78    */
79    protected static boolean isDelimiter(
80      int c
81      )
82    {
83      return (c == '(' || c == ')' || c == '<' || c == '>' || c == '[' || c == ']' || c == '/' || c == '%');
84    }
85  
86    /**
87      Evaluates whether a character is an EOL marker [PDF:1.6:3.1.1].
88    */
89    protected static boolean isEOL(
90      int c
91      )
92    {
93      return (c == 12 || c == 15);
94    }
95  
96    /**
97      Evaluates whether a character is a white-space [PDF:1.6:3.1.1].
98    */
99    protected static boolean isWhitespace(
100     int c
101     )
102   {
103     return (c == 0 || c == 9 || c == 10 || c == 12 || c == 13 || c == 32);
104   }
105   // </protected>
106   // </interface>
107   // </static>
108 
109   // <dynamic>
110   // <fields>
  // Byte stream the tokens are read from.
  private IInputStream stream;
  // Value of the current token, as set by moveNext(); null when the token carries no value.
  private Object   token;
  // Type of the current token, as set by moveNext().
  private TokenTypeEnum tokenType;
114   // </fields>
115 
116   // <constructors>
117   /**
118     <h3>Remarks</h3>
119     <p>For internal use only.</p>
120   */
121   public Parser(
122     IInputStream stream
123     )
124   {
125     this.stream = stream;
126   }
127   // </constructors>
128 
129   // <interface>
130   // <public>
131   public long getLength(
132     )
133   {return stream.getLength();}
134 
135   public long getPosition(
136     )
137   {return stream.getPosition();}
138 
139   public IInputStream getStream(
140     )
141   {return stream;}
142 
143   /**
144     Gets the currently-parsed token.
145     @return The current token.
146   */
147   public Object   getToken(
148     )
149   {return token;}
150 
151   /**
152     Gets the currently-parsed token type.
153     @return The current token type.
154   */
155   public TokenTypeEnum getTokenType(
156     )
157   {return tokenType;}
158 
159   /**
160     @param offset Number of tokens to be skipped before reaching the intended one.
161   */
162   public boolean moveNext(
163     int offset
164     ) throws FileFormatException
165   {
166     for(
167       int index = 0;
168       index < offset;
169       index++
170       )
171     {
172       if(!moveNext())
173         return false;
174     }
175 
176     return true;
177   }
178 
179   /**
180     Parse the next token [PDF:1.6:3.1].
181     <h3>Contract</h3>
182     <ul>
183      <li>Preconditions:
184       <ol>
185        <li>To properly parse the current token, the pointer MUST be just before its starting (leading whitespaces are ignored).</li>
186       </ol>
187      </li>
188      <li>Postconditions:
189       <ol>
190        <li id="moveNext_contract_post[0]">When this method terminates, the pointer IS at the last byte of the current token.</li>
191       </ol>
192      </li>
193      <li>Invariants:
194       <ol>
195        <li>The byte-level position of the pointer IS anytime (during token parsing) at the end of the current token (whereas the 'current token' represents the token-level position of the pointer).</li>
196       </ol>
197      </li>
198      <li>Side-effects:
199       <ol>
200        <li>See <a HREF="#moveNext_contract_post[0]">Postconditions</a>.</li>
201       </ol>
202      </li>
203     </ul>
204     @return Whether a new token was found.
205   */
206   public boolean moveNext(
207     ) throws FileFormatException
208   {
209     /*
210       NOTE: It'd be interesting to evaluate an alternative regular-expression-based
211       implementation...
212     */
213     StringBuilder   buffer = null;
214     token = null;
215     int c = 0;
216 
217     // Skip leading white-space characters [PDF:1.6:3.1.1].
218     try
219     {
220       do
221       {
222         c = stream.readUnsignedByte();
223       } while(isWhitespace(c)); // Keep goin' till there's a white-space character...
224     }
225     catch(EOFException   e)
226     {return false;}
227 
228     // Which character is it?
229     switch(c)
230     {
231       case '/': // Name.
232         tokenType = TokenTypeEnum.Name;
233 
234         buffer = new StringBuilder  ();
235         try
236         {
237           while(true)
238           {
239             c = stream.readUnsignedByte();
240             if(isDelimiter(c) || isWhitespace(c))
241               break;
242             // Is it an hexadecimal code [PDF:1.6:3.2.4]?
243             if(c == '#')
244             {
245               try
246               {c = (getHex(stream.readUnsignedByte()) << 4) + getHex(stream.readUnsignedByte());}
247               catch(EOFException   e)
248               {throw new FileFormatException("Unexpected EOF (malformed hexadecimal code in name object).",e,stream.getPosition());}
249             }
250 
251             buffer.append((char)c);
252           }
253         }
254         catch(EOFException   e)
255         {throw new FileFormatException("Unexpected EOF (malformed name object).",e,stream.getPosition());}
256 
257         stream.skip(-1); // Recover the first byte after the current token.
258         break;
259       case '0':
260       case '1':
261       case '2':
262       case '3':
263       case '4':
264       case '5':
265       case '6':
266       case '7':
267       case '8':
268       case '9':
269       case '.':
270       case '-':
271       case '+': // Number [PDF:1.6:3.2.2] | Indirect reference.
272         switch(c)
273         {
274           case '.': // Decimal point.
275             tokenType = TokenTypeEnum.Real;
276             break;
277           default: // Digit or signum.
278             tokenType = TokenTypeEnum.Integer; // By default (it may be real).
279             break;
280         }
281 
282         // Building the number...
283         buffer = new StringBuilder  ();
284         try
285         {
286           do
287           {
288             buffer.append((char)c);
289             c = stream.readUnsignedByte();
290             if(c == '.')
291               tokenType = TokenTypeEnum.Real;
292             else if(c < '0' || c > '9')
293               break;
294           } while(true);
295         }
296         catch(EOFException   e)
297         {throw new FileFormatException("Unexpected EOF (malformed number object).",e,stream.getPosition());}
298 
299         stream.skip(-1); // Recover the first byte after the current token.
300         break;
301       case '[': // Array (begin).
302         tokenType = TokenTypeEnum.ArrayBegin;
303         break;
304       case ']': // Array (end).
305         tokenType = TokenTypeEnum.ArrayEnd;
306         break;
307       case '<': // Dictionary (begin) | Hexadecimal string.
308         try
309         {c = stream.readUnsignedByte();}
310         catch(EOFException   e)
311         {throw new FileFormatException("Unexpected EOF (isolated opening angle-bracket character).",e,stream.getPosition());}
312         // Is it a dictionary (2nd angle bracket [PDF:1.6:3.2.6])?
313         if(c == '<')
314         {
315           tokenType = TokenTypeEnum.DictionaryBegin;
316           break;
317         }
318 
319         // Hexadecimal string (single angle bracket [PDF:1.6:3.2.3]).
320         tokenType = TokenTypeEnum.Hex;
321 
322         buffer = new StringBuilder  ();
323         try
324         {
325           while(true)
326           {
327             c = stream.readUnsignedByte();
328             // String end?
329             if(c == '>')
330               break;
331 
332             buffer.append((char)c);
333           }
334         }
335         catch(EOFException   e)
336         {throw new FileFormatException("Unexpected EOF (malformed hex string).",e,stream.getPosition());}
337 
338         break;
339       case '>': // Dictionary (end).
340         try
341         {c = stream.readUnsignedByte();}
342         catch(EOFException   e)
343         {throw new FileFormatException("Unexpected EOF (malformed dictionary).",e,stream.getPosition());}
344         if(c != '>')
345           throw new FileFormatException("Malformed dictionary.",stream.getPosition());
346 
347         tokenType = TokenTypeEnum.DictionaryEnd;
348 
349         break;
350       case '%': // Comment.
351         tokenType = TokenTypeEnum.Comment;
352         // Skipping comment content...
353         try
354         {
355           do
356           {c = stream.readUnsignedByte();} while(!isEOL(c));
357         }
358         catch(EOFException   e)
359         {/* Let it go. */}
360 
361         break;
362       case '(': // Literal string.
363         tokenType = TokenTypeEnum.Literal;
364 
365         buffer = new StringBuilder  ();
366         int level = 0;
367         try
368         {
369           while(true)
370           {
371             c = stream.readUnsignedByte();
372             if(c == '(')
373               level++;
374             else if(c == ')')
375               level--;
376             else if(c == '\\')
377             {
378               boolean lineBreak = false;
379               c = stream.readUnsignedByte();
380               switch(c)
381               {
382                 case 'n':
383                   c = '\n';
384                   break;
385                 case 'r':
386                   c = '\r';
387                   break;
388                 case 't':
389                   c = '\t';
390                   break;
391                 case 'b':
392                   c = '\b';
393                   break;
394                 case 'f':
395                   c = '\f';
396                   break;
397                 case '(':
398                 case ')':
399                 case '\\':
400                   break;
401                 case '\r':
402                   lineBreak = true;
403                   c = stream.readUnsignedByte();
404                   if(c != '\n')
405                     stream.skip(-1);
406                   break;
407                 case '\n':
408                   lineBreak = true;
409                   break;
410                 default:
411                 {
412                   // Is it outside the octal encoding?
413                   if(c < '0' || c > '7')
414                     break;
415 
416                   // Octal [PDF:1.6:3.2.3].
417                   int octal = c - '0';
418                   c = stream.readUnsignedByte();
419                   // Octal end?
420                   if(c < '0' || c > '7')
421                   {c = octal; stream.skip(-1); break;}
422                   octal = (octal << 3) + c - '0';
423                   c = stream.readUnsignedByte();
424                   // Octal end?
425                   if(c < '0' || c > '7')
426                   {c = octal; stream.skip(-1); break;}
427                   octal = (octal << 3) + c - '0';
428                   c = octal & 0xff;
429                   break;
430                 }
431               }
432               if(lineBreak)
433                 continue;
434             }
435             else if(c == '\r')
436             {
437               c = stream.readUnsignedByte();
438               if(c != '\n')
439               {c = '\n'; stream.skip(-1);}
440             }
441             if(level == -1)
442               break;
443 
444             buffer.append((char)c);
445           }
446         }
447         catch(EOFException   e)
448         {throw new FileFormatException("Unexpected EOF (malformed literal string).",e,stream.getPosition());}
449 
450         break;
451       default: // Keyword.
452         tokenType = TokenTypeEnum.Keyword;
453 
454         buffer = new StringBuilder  ();
455         try
456         {
457           do
458           {
459             buffer.append((char)c);
460             c = stream.readUnsignedByte();
461           } while(!isDelimiter(c) && !isWhitespace(c));
462         }
463         catch(EOFException   e)
464         {/* Let it go. */}
465         stream.skip(-1); // Recover the first byte after the current token.
466 
467         break;
468     }
469 
470     if(buffer != null)
471     {
472       /*
473         Here we prepare the current token state.
474       */
475       // Which token type?
476       switch(tokenType)
477       {
478         case Keyword:
479           token = buffer.toString();
480           // Late recognition.
481           if(((String  )token).equals("false")
482             || ((String  )token).equals("true")) // Boolean.
483           {
484             tokenType = TokenTypeEnum.Boolean;
485             token = Boolean.parseBoolean((String  )token);
486           }
487           else if(((String  )token).equals("null")) // Null.
488           {
489             tokenType = TokenTypeEnum.Null;
490             token = null;
491           }
492           break;
493         case Comment:
494         case Hex:
495         case Name:
496           token = buffer.toString();
497           break;
498         case Literal:
499           token = buffer.toString();
500           // Late recognition.
501           if(((String  )token).startsWith("D:")) // Date.
502           {
503             tokenType = TokenTypeEnum.Date;
504             token = PdfDate.toDate((String  )token);
505           }
506           break;
507         case Integer:
508           token = Integer.parseInt(buffer.toString());
509           break;
510         case Real:
511           token = Float.parseFloat(buffer.toString());
512           break;
513       }
514     }
515 
516     return true;
517   }
518 
519   public Operation parseOperation(
520     ) throws FileFormatException
521   {
522     List  <PdfDirectObject> operands = new ArrayList  <PdfDirectObject>();
523     // Populate the operation.
524     while(true)
525     {
526       // Did we reach the operator keyword?
527       if(tokenType == TokenTypeEnum.Keyword)
528         break;
529 
530       operands.add(parsePdfObject()); moveNext();
531     }
532 
533     return new Operation(
534       (String  )token,
535       operands
536       );
537   }
538 
539   /**
540     Parse the current PDF object [PDF:1.6:3.2].
541     <h3>Contract</h3>
542     <ul>
543      <li>Preconditions:
544       <ol>
545        <li>When this method is invoked, the pointer MUST be at the first
546        token of the requested object.</li>
547       </ol>
548      </li>
549      <li>Postconditions:
550       <ol>
551        <li id="parsePdfObject_contract_post[0]">When this method terminates,
552        the pointer IS at the last token of the requested object.</li>
553       </ol>
554      </li>
555      <li>Invariants:
556       <ol>
557        <li>(none).</li>
558       </ol>
559      </li>
560      <li>Side-effects:
561       <ol>
562        <li>See <a HREF="#parsePdfObject_contract_post[0]">Postconditions</a>.</li>
563       </ol>
564      </li>
565     </ul>
566   */
567   protected PdfDirectObject parsePdfObject(
568     ) throws FileFormatException
569   {
570     switch(tokenType)
571     {
572       case Integer:
573         return new PdfInteger((Integer  )token);
574       case Name:
575         return new PdfName((String  )token,true);
576       case Literal:
577         return new PdfLiteral((String  )token);
578       case DictionaryBegin:
579         PdfDictionary dictionary = new PdfDictionary();
580         // Populate the dictionary.
581         while(true)
582         {
583           // Key.
584           moveNext();
585           if(tokenType == TokenTypeEnum.DictionaryEnd)
586             break;
587           PdfName key = (PdfName)parsePdfObject();
588 
589           // Value.
590           moveNext();
591           PdfDirectObject value = (PdfDirectObject)parsePdfObject();
592 
593           // Add the current entry to the dictionary!
594           dictionary.put(key,value);
595         }
596         return dictionary;
597       case ArrayBegin:
598         PdfArray array = new PdfArray();
599         // Populate the array.
600         while(true)
601         {
602           // Value.
603           moveNext();
604           if(tokenType == TokenTypeEnum.ArrayEnd)
605             break;
606 
607           // Add the current item to the array!
608           array.add((PdfDirectObject)parsePdfObject());
609         }
610         return array;
611       case Real:
612         return new PdfReal((Float  )token);
613       case Boolean:
614         return new PdfBoolean((Boolean  )token);
615       case Date:
616         return new PdfDate((Date)token);
617       case Hex:
618         return new PdfHex((String  )token);
619       case Null:
620         return PdfNull.Null;
621       default:
622         return null;
623     }
624   }
625 
626   public void seek(
627     long offset
628     )
629   {stream.seek(offset);}
630 
631   public void skip(
632     long offset
633     )
634   {stream.skip(offset);}
635 
636   /**
637     Moves to the last whitespace after the current position in order to let read
638     the first non-whitespace.
639   */
640   public boolean skipWhitespace(
641     )
642   {
643     int b;
644     try
645     {
646       do
647       {b = stream.readUnsignedByte();} while(isWhitespace(b)); // Keep goin' till there's a white-space character...
648     }
649     catch(EOFException   e)
650     {return false;}
651     stream.skip(-1); // Recover the last whitespace position.
652 
653     return true;
654   }
655   // </public>
656   // </interface>
657   // </dynamic>
658   // </class>
659 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags