KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > it > stefanochizzolini > clown > documents > contents > tokens > Parser


/*
  Copyright © 2006 Stefano Chizzolini. http://clown.stefanochizzolini.it

  Contributors:
    * Stefano Chizzolini (original code developer, info@stefanochizzolini.it):
      contributed code is Copyright © 2006 by Stefano Chizzolini.

  This file should be part of the source code distribution of "PDF Clown library"
  (the Program): see the accompanying README files for more info.

  This Program is free software; you can redistribute it and/or modify it under
  the terms of the GNU General Public License as published by the Free Software
  Foundation; either version 2 of the License, or (at your option) any later version.

  This Program is distributed in the hope that it will be useful, but WITHOUT ANY
  WARRANTY, either expressed or implied; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the License for more details.

  You should have received a copy of the GNU General Public License along with this
  Program (see README files); if not, go to the GNU website (http://www.gnu.org/).

  Redistribution and use, with or without modification, are permitted provided that such
  redistributions retain the above copyright notice, license and disclaimer, along with
  this list of conditions.
*/

26
27 package it.stefanochizzolini.clown.documents.contents.tokens;
28
29 import it.stefanochizzolini.clown.bytes.IInputStream;
30 import it.stefanochizzolini.clown.documents.contents.objects.Operation;
31 import it.stefanochizzolini.clown.objects.PdfArray;
32 import it.stefanochizzolini.clown.objects.PdfBoolean;
33 import it.stefanochizzolini.clown.objects.PdfDate;
34 import it.stefanochizzolini.clown.objects.PdfDictionary;
35 import it.stefanochizzolini.clown.objects.PdfDirectObject;
36 import it.stefanochizzolini.clown.objects.PdfHex;
37 import it.stefanochizzolini.clown.objects.PdfInteger;
38 import it.stefanochizzolini.clown.objects.PdfLiteral;
39 import it.stefanochizzolini.clown.objects.PdfName;
40 import it.stefanochizzolini.clown.objects.PdfNull;
41 import it.stefanochizzolini.clown.objects.PdfReal;
42 import it.stefanochizzolini.clown.objects.PdfReference;
43 import it.stefanochizzolini.clown.objects.PdfStream;
44 import it.stefanochizzolini.clown.tokens.FileFormatException;
45 import it.stefanochizzolini.clown.tokens.TokenTypeEnum;
46 import java.io.EOFException JavaDoc;
47 import java.util.ArrayList JavaDoc;
48 import java.util.Date JavaDoc;
49 import java.util.List JavaDoc;
50
51 /**
52   Content stream parser [PDF:1.6:3.7.1].
53 */

54 public class Parser
55 {
56 /*
57 TODO:IMPL this parser evaluates a subset of the lexical domain of the token parser (clown.serialization.Parser): it should be better to derive both parsers from a common parsing engine in order to avoid unwieldy duplications.
58 */

59   // <class>
60
// <static>
61
// <interface>
62
// <protected>
63
protected static int getHex(
64     int c
65     )
66   {
67     if(c >= '0' && c <= '9')
68       return (c - '0');
69     if(c >= 'A' && c <= 'F')
70       return (c - 'A' + 10);
71     if(c >= 'a' && c <= 'f')
72       return (c - 'a' + 10);
73     return -1;
74   }
75
76   /**
77     Evaluates whether a character is a delimiter [PDF:1.6:3.1.1].
78   */

79   protected static boolean isDelimiter(
80     int c
81     )
82   {
83     return (c == '(' || c == ')' || c == '<' || c == '>' || c == '[' || c == ']' || c == '/' || c == '%');
84   }
85
86   /**
87     Evaluates whether a character is an EOL marker [PDF:1.6:3.1.1].
88   */

89   protected static boolean isEOL(
90     int c
91     )
92   {
93     return (c == 12 || c == 15);
94   }
95
96   /**
97     Evaluates whether a character is a white-space [PDF:1.6:3.1.1].
98   */

99   protected static boolean isWhitespace(
100     int c
101     )
102   {
103     return (c == 0 || c == 9 || c == 10 || c == 12 || c == 13 || c == 32);
104   }
105   // </protected>
106
// </interface>
107
// </static>
108

109   // <dynamic>
110
// <fields>
111
private IInputStream stream;
112   private Object JavaDoc token;
113   private TokenTypeEnum tokenType;
114   // </fields>
115

116   // <constructors>
117
/**
118     <h3>Remarks</h3>
119     <p>For internal use only.</p>
120   */

121   public Parser(
122     IInputStream stream
123     )
124   {
125     this.stream = stream;
126   }
127   // </constructors>
128

129   // <interface>
130
// <public>
131
public long getLength(
132     )
133   {return stream.getLength();}
134
135   public long getPosition(
136     )
137   {return stream.getPosition();}
138
139   public IInputStream getStream(
140     )
141   {return stream;}
142
143   /**
144     Gets the currently-parsed token.
145     @return The current token.
146   */

147   public Object JavaDoc getToken(
148     )
149   {return token;}
150
151   /**
152     Gets the currently-parsed token type.
153     @return The current token type.
154   */

155   public TokenTypeEnum getTokenType(
156     )
157   {return tokenType;}
158
159   /**
160     @param offset Number of tokens to be skipped before reaching the intended one.
161   */

162   public boolean moveNext(
163     int offset
164     ) throws FileFormatException
165   {
166     for(
167       int index = 0;
168       index < offset;
169       index++
170       )
171     {
172       if(!moveNext())
173         return false;
174     }
175
176     return true;
177   }
178
179   /**
180     Parse the next token [PDF:1.6:3.1].
181     <h3>Contract</h3>
182     <ul>
183      <li>Preconditions:
184       <ol>
185        <li>To properly parse the current token, the pointer MUST be just before its starting (leading whitespaces are ignored).</li>
186       </ol>
187      </li>
188      <li>Postconditions:
189       <ol>
190        <li id="moveNext_contract_post[0]">When this method terminates, the pointer IS at the last byte of the current token.</li>
191       </ol>
192      </li>
193      <li>Invariants:
194       <ol>
195        <li>The byte-level position of the pointer IS anytime (during token parsing) at the end of the current token (whereas the 'current token' represents the token-level position of the pointer).</li>
196       </ol>
197      </li>
198      <li>Side-effects:
199       <ol>
200        <li>See <a HREF="#moveNext_contract_post[0]">Postconditions</a>.</li>
201       </ol>
202      </li>
203     </ul>
204     @return Whether a new token was found.
205   */

206   public boolean moveNext(
207     ) throws FileFormatException
208   {
209     /*
210       NOTE: It'd be interesting to evaluate an alternative regular-expression-based
211       implementation...
212     */

213     StringBuilder JavaDoc buffer = null;
214     token = null;
215     int c = 0;
216
217     // Skip leading white-space characters [PDF:1.6:3.1.1].
218
try
219     {
220       do
221       {
222         c = stream.readUnsignedByte();
223       } while(isWhitespace(c)); // Keep goin' till there's a white-space character...
224
}
225     catch(EOFException JavaDoc e)
226     {return false;}
227
228     // Which character is it?
229
switch(c)
230     {
231       case '/': // Name.
232
tokenType = TokenTypeEnum.Name;
233
234         buffer = new StringBuilder JavaDoc();
235         try
236         {
237           while(true)
238           {
239             c = stream.readUnsignedByte();
240             if(isDelimiter(c) || isWhitespace(c))
241               break;
242             // Is it an hexadecimal code [PDF:1.6:3.2.4]?
243
if(c == '#')
244             {
245               try
246               {c = (getHex(stream.readUnsignedByte()) << 4) + getHex(stream.readUnsignedByte());}
247               catch(EOFException JavaDoc e)
248               {throw new FileFormatException("Unexpected EOF (malformed hexadecimal code in name object).",e,stream.getPosition());}
249             }
250
251             buffer.append((char)c);
252           }
253         }
254         catch(EOFException JavaDoc e)
255         {throw new FileFormatException("Unexpected EOF (malformed name object).",e,stream.getPosition());}
256
257         stream.skip(-1); // Recover the first byte after the current token.
258
break;
259       case '0':
260       case '1':
261       case '2':
262       case '3':
263       case '4':
264       case '5':
265       case '6':
266       case '7':
267       case '8':
268       case '9':
269       case '.':
270       case '-':
271       case '+': // Number [PDF:1.6:3.2.2] | Indirect reference.
272
switch(c)
273         {
274           case '.': // Decimal point.
275
tokenType = TokenTypeEnum.Real;
276             break;
277           default: // Digit or signum.
278
tokenType = TokenTypeEnum.Integer; // By default (it may be real).
279
break;
280         }
281
282         // Building the number...
283
buffer = new StringBuilder JavaDoc();
284         try
285         {
286           do
287           {
288             buffer.append((char)c);
289             c = stream.readUnsignedByte();
290             if(c == '.')
291               tokenType = TokenTypeEnum.Real;
292             else if(c < '0' || c > '9')
293               break;
294           } while(true);
295         }
296         catch(EOFException JavaDoc e)
297         {throw new FileFormatException("Unexpected EOF (malformed number object).",e,stream.getPosition());}
298
299         stream.skip(-1); // Recover the first byte after the current token.
300
break;
301       case '[': // Array (begin).
302
tokenType = TokenTypeEnum.ArrayBegin;
303         break;
304       case ']': // Array (end).
305
tokenType = TokenTypeEnum.ArrayEnd;
306         break;
307       case '<': // Dictionary (begin) | Hexadecimal string.
308
try
309         {c = stream.readUnsignedByte();}
310         catch(EOFException JavaDoc e)
311         {throw new FileFormatException("Unexpected EOF (isolated opening angle-bracket character).",e,stream.getPosition());}
312         // Is it a dictionary (2nd angle bracket [PDF:1.6:3.2.6])?
313
if(c == '<')
314         {
315           tokenType = TokenTypeEnum.DictionaryBegin;
316           break;
317         }
318
319         // Hexadecimal string (single angle bracket [PDF:1.6:3.2.3]).
320
tokenType = TokenTypeEnum.Hex;
321
322         buffer = new StringBuilder JavaDoc();
323         try
324         {
325           while(true)
326           {
327             c = stream.readUnsignedByte();
328             // String end?
329
if(c == '>')
330               break;
331
332             buffer.append((char)c);
333           }
334         }
335         catch(EOFException JavaDoc e)
336         {throw new FileFormatException("Unexpected EOF (malformed hex string).",e,stream.getPosition());}
337
338         break;
339       case '>': // Dictionary (end).
340
try
341         {c = stream.readUnsignedByte();}
342         catch(EOFException JavaDoc e)
343         {throw new FileFormatException("Unexpected EOF (malformed dictionary).",e,stream.getPosition());}
344         if(c != '>')
345           throw new FileFormatException("Malformed dictionary.",stream.getPosition());
346
347         tokenType = TokenTypeEnum.DictionaryEnd;
348
349         break;
350       case '%': // Comment.
351
tokenType = TokenTypeEnum.Comment;
352         // Skipping comment content...
353
try
354         {
355           do
356           {c = stream.readUnsignedByte();} while(!isEOL(c));
357         }
358         catch(EOFException JavaDoc e)
359         {/* Let it go. */}
360
361         break;
362       case '(': // Literal string.
363
tokenType = TokenTypeEnum.Literal;
364
365         buffer = new StringBuilder JavaDoc();
366         int level = 0;
367         try
368         {
369           while(true)
370           {
371             c = stream.readUnsignedByte();
372             if(c == '(')
373               level++;
374             else if(c == ')')
375               level--;
376             else if(c == '\\')
377             {
378               boolean lineBreak = false;
379               c = stream.readUnsignedByte();
380               switch(c)
381               {
382                 case 'n':
383                   c = '\n';
384                   break;
385                 case 'r':
386                   c = '\r';
387                   break;
388                 case 't':
389                   c = '\t';
390                   break;
391                 case 'b':
392                   c = '\b';
393                   break;
394                 case 'f':
395                   c = '\f';
396                   break;
397                 case '(':
398                 case ')':
399                 case '\\':
400                   break;
401                 case '\r':
402                   lineBreak = true;
403                   c = stream.readUnsignedByte();
404                   if(c != '\n')
405                     stream.skip(-1);
406                   break;
407                 case '\n':
408                   lineBreak = true;
409                   break;
410                 default:
411                 {
412                   // Is it outside the octal encoding?
413
if(c < '0' || c > '7')
414                     break;
415
416                   // Octal [PDF:1.6:3.2.3].
417
int octal = c - '0';
418                   c = stream.readUnsignedByte();
419                   // Octal end?
420
if(c < '0' || c > '7')
421                   {c = octal; stream.skip(-1); break;}
422                   octal = (octal << 3) + c - '0';
423                   c = stream.readUnsignedByte();
424                   // Octal end?
425
if(c < '0' || c > '7')
426                   {c = octal; stream.skip(-1); break;}
427                   octal = (octal << 3) + c - '0';
428                   c = octal & 0xff;
429                   break;
430                 }
431               }
432               if(lineBreak)
433                 continue;
434             }
435             else if(c == '\r')
436             {
437               c = stream.readUnsignedByte();
438               if(c != '\n')
439               {c = '\n'; stream.skip(-1);}
440             }
441             if(level == -1)
442               break;
443
444             buffer.append((char)c);
445           }
446         }
447         catch(EOFException JavaDoc e)
448         {throw new FileFormatException("Unexpected EOF (malformed literal string).",e,stream.getPosition());}
449
450         break;
451       default: // Keyword.
452
tokenType = TokenTypeEnum.Keyword;
453
454         buffer = new StringBuilder JavaDoc();
455         try
456         {
457           do
458           {
459             buffer.append((char)c);
460             c = stream.readUnsignedByte();
461           } while(!isDelimiter(c) && !isWhitespace(c));
462         }
463         catch(EOFException JavaDoc e)
464         {/* Let it go. */}
465         stream.skip(-1); // Recover the first byte after the current token.
466

467         break;
468     }
469
470     if(buffer != null)
471     {
472       /*
473         Here we prepare the current token state.
474       */

475       // Which token type?
476
switch(tokenType)
477       {
478         case Keyword:
479           token = buffer.toString();
480           // Late recognition.
481
if(((String JavaDoc)token).equals("false")
482             || ((String JavaDoc)token).equals("true")) // Boolean.
483
{
484             tokenType = TokenTypeEnum.Boolean;
485             token = Boolean.parseBoolean((String JavaDoc)token);
486           }
487           else if(((String JavaDoc)token).equals("null")) // Null.
488
{
489             tokenType = TokenTypeEnum.Null;
490             token = null;
491           }
492           break;
493         case Comment:
494         case Hex:
495         case Name:
496           token = buffer.toString();
497           break;
498         case Literal:
499           token = buffer.toString();
500           // Late recognition.
501
if(((String JavaDoc)token).startsWith("D:")) // Date.
502
{
503             tokenType = TokenTypeEnum.Date;
504             token = PdfDate.toDate((String JavaDoc)token);
505           }
506           break;
507         case Integer:
508           token = Integer.parseInt(buffer.toString());
509           break;
510         case Real:
511           token = Float.parseFloat(buffer.toString());
512           break;
513       }
514     }
515
516     return true;
517   }
518
519   public Operation parseOperation(
520     ) throws FileFormatException
521   {
522     List JavaDoc<PdfDirectObject> operands = new ArrayList JavaDoc<PdfDirectObject>();
523     // Populate the operation.
524
while(true)
525     {
526       // Did we reach the operator keyword?
527
if(tokenType == TokenTypeEnum.Keyword)
528         break;
529
530       operands.add(parsePdfObject()); moveNext();
531     }
532
533     return new Operation(
534       (String JavaDoc)token,
535       operands
536       );
537   }
538
539   /**
540     Parse the current PDF object [PDF:1.6:3.2].
541     <h3>Contract</h3>
542     <ul>
543      <li>Preconditions:
544       <ol>
545        <li>When this method is invoked, the pointer MUST be at the first
546        token of the requested object.</li>
547       </ol>
548      </li>
549      <li>Postconditions:
550       <ol>
551        <li id="parsePdfObject_contract_post[0]">When this method terminates,
552        the pointer IS at the last token of the requested object.</li>
553       </ol>
554      </li>
555      <li>Invariants:
556       <ol>
557        <li>(none).</li>
558       </ol>
559      </li>
560      <li>Side-effects:
561       <ol>
562        <li>See <a HREF="#parsePdfObject_contract_post[0]">Postconditions</a>.</li>
563       </ol>
564      </li>
565     </ul>
566   */

567   protected PdfDirectObject parsePdfObject(
568     ) throws FileFormatException
569   {
570     switch(tokenType)
571     {
572       case Integer:
573         return new PdfInteger((Integer JavaDoc)token);
574       case Name:
575         return new PdfName((String JavaDoc)token,true);
576       case Literal:
577         return new PdfLiteral((String JavaDoc)token);
578       case DictionaryBegin:
579         PdfDictionary dictionary = new PdfDictionary();
580         // Populate the dictionary.
581
while(true)
582         {
583           // Key.
584
moveNext();
585           if(tokenType == TokenTypeEnum.DictionaryEnd)
586             break;
587           PdfName key = (PdfName)parsePdfObject();
588
589           // Value.
590
moveNext();
591           PdfDirectObject value = (PdfDirectObject)parsePdfObject();
592
593           // Add the current entry to the dictionary!
594
dictionary.put(key,value);
595         }
596         return dictionary;
597       case ArrayBegin:
598         PdfArray array = new PdfArray();
599         // Populate the array.
600
while(true)
601         {
602           // Value.
603
moveNext();
604           if(tokenType == TokenTypeEnum.ArrayEnd)
605             break;
606
607           // Add the current item to the array!
608
array.add((PdfDirectObject)parsePdfObject());
609         }
610         return array;
611       case Real:
612         return new PdfReal((Float JavaDoc)token);
613       case Boolean:
614         return new PdfBoolean((Boolean JavaDoc)token);
615       case Date:
616         return new PdfDate((Date)token);
617       case Hex:
618         return new PdfHex((String JavaDoc)token);
619       case Null:
620         return PdfNull.Null;
621       default:
622         return null;
623     }
624   }
625
626   public void seek(
627     long offset
628     )
629   {stream.seek(offset);}
630
631   public void skip(
632     long offset
633     )
634   {stream.skip(offset);}
635
636   /**
637     Moves to the last whitespace after the current position in order to let read
638     the first non-whitespace.
639   */

640   public boolean skipWhitespace(
641     )
642   {
643     int b;
644     try
645     {
646       do
647       {b = stream.readUnsignedByte();} while(isWhitespace(b)); // Keep goin' till there's a white-space character...
648
}
649     catch(EOFException JavaDoc e)
650     {return false;}
651     stream.skip(-1); // Recover the last whitespace position.
652

653     return true;
654   }
655   // </public>
656
// </interface>
657
// </dynamic>
658
// </class>
659
}
Popular Tags