KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > etymon > pj > PdfParser


1 package com.etymon.pj;
2
3 import java.io.*;
4 import java.util.*;
5 import com.etymon.pj.exception.*;
6 import com.etymon.pj.object.*;
7
8 public class PdfParser {
9
10     public static void getObjects(Pdf pdf, RandomAccessFile raf)
11         throws IOException, PjException {
12         long[][] xref = getXref(pdf, raf);
13         byte[] data;
14         PjObject object;
15         Hashtable ht = new Hashtable();
16         for (int x = 0; x < xref.length; x++) {
17             if (xref[x][2] == 1) {
18                 raf.seek(xref[x][0]);
19                 data = readUntil(raf, "endobj");
20                 object = PdfParser.parse(pdf, raf, xref, data, 0);
21                 pdf._objects.setObjectAt(object, x);
22             }
23         }
24     }
25
26     private static PjObject getObject(Pdf pdf, RandomAccessFile raf, long[][] xref, int num)
27         throws IOException, PjException {
28         // check if the object has been loaded
29
PjObject obj = pdf._objects.objectAt(num);
30         if (obj != null) {
31             return obj;
32         }
33         // otherwise we have to load it
34
raf.seek(xref[num][0]);
35         byte[] data = readUntil(raf, "endobj");
36         obj = PdfParser.parse(pdf, raf, xref, data, 0);
37         pdf._objects.setObjectAt(obj, num);
38         return obj;
39     }
40     
41     private static long[][] getXref(Pdf pdf, RandomAccessFile raf) throws
42         IOException, PjException {
43         // we assume that the cross-reference table as a whole
44
// (including all "sections") is contiguous in terms
45
// of object numbers; in other words, we assume that
46
// '/Size n' in the trailer dictionary indicates not
47
// only that n is the number of cross reference
48
// entries in the table, but also that (n-1) is the
49
// largest object number in use; this allow us to use
50
// a long[][] for storing the table, because we can
51
// allocate it as long[n][3]. I think this is
52
// implicit in the PDF spec but I couldn't find a
53
// clear statement about it. If it turns out that
54
// this is incorrect, we'll have to change all the
55
// code to use a Vector instead of an array.
56
long lastXref = getStartXref(raf);
57         return getNextXref(pdf, raf, lastXref, null);
58     }
59
60     private static long[][] getNextXref(Pdf pdf, RandomAccessFile raf, long start,
61                    long[][] xref)
62         throws IOException, PjException {
63         raf.seek(start);
64         byte[] xrefData = readUntil(raf, "trailer");
65         byte[] trailerData = readUntil(raf, "startxref");
66         PjDictionary trailer =
67             (PjDictionary)(PdfParser.parse(pdf, raf, xref, trailerData, 0));
68         Hashtable h = trailer.getHashtable();
69         long[][] xr;
70         if (xref == null) {
71             xr = new long[((PjNumber)(h.get(
72                 new PjName("Size")))).getInt()][3];
73             pdf._trailer = h;
74         } else {
75             xr = xref;
76         }
77         // recursively collect previous xref data
78
PjNumber prev = (PjNumber)(h.get(new PjName("Prev")));
79         if (prev != null) {
80             xr = getNextXref(pdf, raf, prev.getLong(), xr);
81         }
82         // now overlay this xref data
83
PdfParser.parseXref(xrefData, xr, 0);
84         return xr;
85     }
86
87     private static long getStartXref(RandomAccessFile raf) throws
88         IOException, PjException {
89         // locate startxref near the end of the file
90
int scan = 0;
91         for (int retry = PjConst.SCAN_STARTXREF_RETRY; retry > 0; retry--) {
92             scan = scan + PjConst.SCAN_STARTXREF;
93             long fileSize = raf.length();
94             raf.seek(fileSize - scan);
95             byte[] buffer = readUntil(raf, "startxref");
96             // next line should be the startxref value
97
buffer = readUntil(raf, "%%EOF");
98             if (buffer.length != 0) {
99                 // now parse the long value from the buffer
100
StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
101                 boolean abort = false;
102                 int x = 0;
103                 while ( (abort == false) && (Character.isDigit((char)(buffer[x]))) ) {
104                     sb.append((char)(buffer[x]));
105                     x++;
106                     if (x >= buffer.length) {
107                         abort = true;
108                     }
109                 }
110                 if (abort == false) {
111                     return new Long JavaDoc(new String JavaDoc(sb)).longValue();
112                 }
113             }
114         }
115         throw new StartxrefFormatException("Unexpected end of file (startxref).");
116     }
117
118     public static byte[] readUntil(RandomAccessFile raf, String JavaDoc
119                       endstr) throws IOException {
120         StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
121         char c = '\0';
122         String JavaDoc s;
123         char[] compare = new char[endstr.length()];
124         char lastEol = '\0';
125         boolean eof = false;
126         boolean done = false;
127         do {
128             try {
129                 c = (char)(raf.readUnsignedByte());
130                 switch (lastEol) {
131                 case '\0':
132                     if ( (c == '\r') || (c ==
133                                  '\n') ) {
134                         if (sb.length() >=
135                             endstr.length()) {
136                             sb.getChars(sb.length() -
137                                     endstr.length(),
138                                     sb.length(),
139                                     compare, 0);
140                             s = new String JavaDoc(compare);
141                             if (s.equals(endstr)) {
142                                 lastEol = c;
143                             }
144                         }
145                     }
146                     sb.append(c);
147                     break;
148                 case '\n':
149                     raf.seek(raf.getFilePointer() - 1);
150                     done = true;
151                     break;
152                 case '\r':
153                     if (c == '\n') {
154                         sb.append(c);
155                     } else {
156                         raf.seek(raf.getFilePointer()
157                              - 1);
158                     }
159                     done = true;
160                     break;
161                 }
162             }
163             catch (EOFException e) {
164                 eof = true;
165             }
166         } while ( (eof == false) && (done == false) );
167         int y = sb.length();
168         byte[] buffer = new byte[y];
169         for (int x = 0; x < y; x++) {
170             buffer[x] = (byte)(sb.charAt(x));
171         }
172         return buffer;
173     }
174     
175     // deprecated
176
// RandomAccessFile.readLine() does not seem to work!
177
// this is a replacement, but it also discards the trailing
178
// '\r' and/or '\n'
179
protected static String JavaDoc readLine(RandomAccessFile raf) throws
180         IOException {
181         char c = '\0';
182         StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
183         boolean endOfLine = false;
184         boolean endOfFile = false;
185         boolean startOfNext = false;
186         boolean firstChar = true;
187         do {
188             try {
189                 c = (char)(raf.readUnsignedByte());
190                 if ( (c != '\r') && (c != '\n') ) {
191                     if (endOfLine) {
192                         startOfNext = true;
193                     } else {
194                         sb.append(c);
195                     }
196                 } else {
197                     endOfLine = true;
198                 }
199                 firstChar = false;
200             }
201             catch (EOFException e) {
202                 endOfFile = true;
203             }
204         } while ( (endOfFile == false) && (startOfNext ==
205                              false) );
206         if (startOfNext) {
207             raf.seek(raf.getFilePointer() - 1);
208         }
209         if ( (endOfFile) && (firstChar) ) {
210             return null;
211         } else {
212             return sb.toString();
213         }
214     }
215
216     
217     public static void parseXref(byte[] data, long[][] xref, int start) throws XrefFormatException {
218         PdfParserState state = new PdfParserState();
219         state._data = data;
220         state._pos = start;
221         getLine(state); // initial "xref"
222
if (state._token.equals("xref") == false) {
223             throw new XrefFormatException("Start of xref not found (xref).");
224         }
225         StringTokenizer st;
226         int index, count, x;
227         while (state._pos < state._data.length) {
228             getLine(state);
229             st = new StringTokenizer(state._token);
230             if (state._token.equals("trailer")) {
231                 return;
232             }
233             index = Integer.parseInt(st.nextToken());
234             count = Integer.parseInt(st.nextToken());
235             for (x = 0; x < count; x++) {
236                 getLine(state);
237                 st = new StringTokenizer(state._token);
238                 xref[index][0] = new Integer JavaDoc(
239                     st.nextToken()).longValue();
240                 xref[index][1] = new Integer JavaDoc(
241                     st.nextToken()).longValue();
242                 if (st.nextToken().equals("n")) {
243                     xref[index][2] = 1;
244                 } else {
245                     xref[index][2] = 0;
246                 }
247                 index++;
248             }
249         }
250     }
251     
252     public static PjObject parse(Pdf pdf, RandomAccessFile raf, long[][] xref, byte[] data, int start)
253         throws IOException, PjException {
254         PdfParserState state = new PdfParserState();
255         state._data = data;
256         state._pos = start;
257         state._stream = -1;
258         Stack stack = new Stack();
259         boolean endFlag = false;
260         while ( ( ! endFlag ) && (getToken(state)) ) {
261             if (state._stream != -1) {
262                 stack.push(state._streamToken);
263                 state._stream = -1;
264             }
265             else if (state._token.equals("startxref")) {
266                 endFlag = true;
267             }
268             else if (state._token.equals("endobj")) {
269                 endFlag = true;
270             }
271             else if (state._token.equals("%%EOF")) {
272                 endFlag = true;
273             }
274             else if (state._token.equals("endstream")) {
275                 byte[] stream = (byte[])(stack.pop());
276                 PjStreamDictionary pjsd = new PjStreamDictionary(
277                     ((PjDictionary)(stack.pop())).getHashtable());
278                 PjStream pjs = new PjStream(pjsd, stream);
279                 stack.push(pjs);
280             }
281             else if (state._token.equals("stream")) {
282                 // get length of stream
283
PjObject obj = ((PjObject)(
284                     (((PjDictionary)(stack.peek())).
285                     getHashtable().
286                             get(new PjName("Length")))));
287                 if (obj instanceof PjReference) {
288                     obj = getObject(pdf, raf, xref,
289                             ((PjReference)(obj)).getObjNumber().getInt());
290                 }
291                 state._stream =
292                     ((PjNumber)(obj)).getInt();
293
294                 // the following if() clause added to
295
// handle the case of "Length" being
296
// incorrect (larger than the actual
297
// stream length)
298
if ( state._stream >
299                      (state._data.length - state._pos)
300                     ) {
301                     state._stream =
302                         state._data.length -
303                         state._pos - 17;
304                 }
305
306                 if (state._pos < state._data.length) {
307                     if ((char)(state._data[state._pos]) == '\r') {
308                         state._pos++;
309                     }
310                     if ( (state._pos < state._data.length) &&
311                          ((char)(state._data[state._pos]) ==
312                           '\n') ) {
313                         state._pos++;
314                     }
315                 }
316             }
317             else if (state._token.equals("null")) {
318                 stack.push(new PjNull());
319             }
320             else if (state._token.equals("true")) {
321                 stack.push(new PjBoolean(true));
322             }
323             else if (state._token.equals("false")) {
324                 stack.push(new PjBoolean(false));
325             }
326             else if (state._token.equals("R")) {
327                 // we ignore the generation number
328
// because all objects get reset to
329
// generation 0 when we collapse the
330
// incremental updates
331
stack.pop(); // the generation number
332
PjNumber obj = (PjNumber)(stack.pop());
333                 stack.push(new PjReference(obj, PjNumber.ZERO));
334             }
335             else if ( (state._token.charAt(0) == '<') &&
336                   (state._token.startsWith("<<") == false) ) {
337                 stack.push(new PjString(PjString.decodePdf(state._token)));
338             }
339             else if (
340                 (Character.isDigit(state._token.charAt(0)))
341                 || (state._token.charAt(0) == '-')
342                 || (state._token.charAt(0) == '.') ) {
343                 stack.push(new PjNumber(new Float JavaDoc(state._token).floatValue()));
344             }
345             else if (state._token.charAt(0) == '(') {
346                 stack.push(new PjString(PjString.decodePdf(state._token)));
347             }
348             else if (state._token.charAt(0) == '/') {
349                 stack.push(new PjName(state._token.substring(1)));
350             }
351             else if (state._token.equals(">>")) {
352                 boolean done = false;
353                 Object JavaDoc obj;
354                 Hashtable h = new Hashtable();
355                 while ( ! done ) {
356                     obj = stack.pop();
357                     if ( (obj instanceof String JavaDoc) &&
358                          (((String JavaDoc)obj).equals("<<")) ) {
359                         done = true;
360                     } else {
361                         h.put((PjName)(stack.pop()),
362                               (PjObject)obj);
363                     }
364                 }
365                 // figure out what kind of dictionary we have
366
PjDictionary dictionary = new PjDictionary(h);
367                 if (PjPage.isLike(dictionary)) {
368                     stack.push(new PjPage(h));
369                 }
370                 else if (PjPages.isLike(dictionary)) {
371                     stack.push(new PjPages(h));
372                 }
373                 else if (PjFontType1.isLike(dictionary)) {
374                     stack.push(new PjFontType1(h));
375                 }
376                 else if (PjFontDescriptor.isLike(dictionary)) {
377                     stack.push(new PjFontDescriptor(h));
378                 }
379                 else if (PjResources.isLike(dictionary)) {
380                     stack.push(new PjResources(h));
381                 }
382                 else if (PjCatalog.isLike(dictionary)) {
383                     stack.push(new PjCatalog(h));
384                 }
385                 else if (PjInfo.isLike(dictionary)) {
386                     stack.push(new PjInfo(h));
387                 }
388                 else if (PjEncoding.isLike(dictionary)) {
389                     stack.push(new PjEncoding(h));
390                 }
391                 else {
392                     stack.push(dictionary);
393                 }
394             }
395             else if (state._token.equals("]")) {
396                 boolean done = false;
397                 Object JavaDoc obj;
398                 Vector v = new Vector();
399                 while ( ! done ) {
400                     obj = stack.pop();
401                     if ( (obj instanceof String JavaDoc) &&
402                          (((String JavaDoc)obj).equals("[")) ) {
403                         done = true;
404                     } else {
405                         v.insertElementAt((PjObject)obj, 0);
406                     }
407                 }
408                 // figure out what kind of array we have
409
PjArray array = new PjArray(v);
410                 if (PjRectangle.isLike(array)) {
411                     stack.push(new PjRectangle(v));
412                 }
413                 else if (PjProcSet.isLike(array)) {
414                     stack.push(new PjProcSet(v));
415                 }
416                 else {
417                     stack.push(array);
418                 }
419             }
420             else if (state._token.startsWith("%")) {
421                 // do nothing
422
}
423             else {
424                 stack.push(state._token);
425             }
426         }
427         return (PjObject)(stack.pop());
428     }
429
430     private static boolean getLine(PdfParserState state) {
431         StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
432         char c;
433         while (state._pos < state._data.length) {
434             c = (char)(state._data[state._pos]);
435             state._pos++;
436             switch (c) {
437             case '\r':
438                 if ( (state._pos < state._data.length) &&
439                      ((char)(state._data[state._pos]) == '\n') ) {
440                     state._pos++;
441                 }
442             case '\n':
443                 state._token = sb.toString();
444                 return true;
445             default:
446                 sb.append(c);
447             }
448         }
449         return false;
450     }
451     
452     private static boolean getToken(PdfParserState state) {
453         if (state._stream != -1) {
454             state._streamToken = new byte[state._stream];
455             System.arraycopy(state._data, state._pos, state._streamToken, 0,
456                      state._stream);
457             state._pos = state._pos + state._stream;
458             return true;
459         }
460         skipWhitespace(state);
461         StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
462         boolean firstChar = true;
463         boolean string = false;
464         int stringParen = 0;
465         boolean hstring = false;
466         char c = '\0';
467         char last;
468         int x;
469         while (state._pos < state._data.length) {
470             last = c;
471             c = (char)(state._data[state._pos]);
472             state._pos++;
473             if (firstChar) {
474                 switch (c) {
475                 case '(':
476                     string = true;
477                     stringParen = 0;
478                     break;
479                 case ']':
480                     state._token = "]";
481                     return true;
482                 case '>':
483                     if ( (state._pos < state._data.length) &&
484                          ((char)(state._data[state._pos]) ==
485                           '>') ) {
486                         state._pos++;
487                         state._token = ">>";
488                         return true;
489                     }
490                     break;
491                 case '%':
492                     sb.append('%');
493                     while ( (state._pos < state._data.length) &&
494                         ((c = (char)(state._data[state._pos])) != '\n') &&
495                         (c != '\r') ) {
496                         sb.append(c);
497                         state._pos++;
498                     }
499                     state._token = sb.toString();
500                     return true;
501                 default:
502                 }
503             }
504             if ( (string) || (hstring) ) {
505                 if (string) {
506                     if ( (c == '(') && (last != '\\') ) {
507                         stringParen++;
508                     }
509                     if ( (c == ')') && (last != '\\') ) {
510                         if (stringParen == 1) {
511                             sb.append(c);
512                             state._token = sb.toString();
513                             return true;
514                         } else {
515                             stringParen--;
516                         }
517                     }
518                 } else {
519                     // hex string
520
if (c == '>') {
521                         sb.append(c);
522                         state._token = sb.toString();
523                         return true;
524                     }
525                 }
526                 sb.append(c);
527             } else {
528                 if (isWhitespace(c)) {
529                     state._token = sb.toString();
530                     return true;
531                 } else {
532                     switch (c) {
533                     case '[':
534                         if ( ! firstChar ) {
535                             state._pos--;
536                             state._token = sb.toString();
537                             return true;
538                         } else {
539                             state._token = "[";
540                             return true;
541                         }
542                     case '<':
543                         if ( ! firstChar ) {
544                             state._pos--;
545                             state._token = sb.toString();
546                             return true;
547                         } else {
548                             if ( (state._pos < state._data.length) &&
549                                  ((char)(state._data[state._pos]) ==
550                                   '<') ) {
551                                 // dictionary
552
state._pos++;
553                                 state._token = "<<";
554                                 return true;
555                             } else {
556                                 // hex string
557
hstring = true;
558                                 sb.append(c);
559                             }
560                         }
561                         break;
562                     case ']':
563                     case '/':
564                     case '(':
565                         if ( ! firstChar ) {
566                             state._pos--;
567                             state._token =
568                                 sb.toString();
569                             return true;
570                         } else {
571                             sb.append(c);
572                             break;
573                         }
574                     case '>':
575                         if ( (state._pos <
576                             state._data.length) &&
577                              ((char)(state._data[state._pos]) == '>') ) {
578                             state._pos--;
579                             state._token =
580                                 sb.toString();
581                             return true;
582                         } else {
583                             sb.append(c);
584                         }
585                         break;
586                     default:
587                         sb.append(c);
588                     }
589                 }
590             }
591             if (firstChar) {
592                 firstChar = false;
593             }
594         }
595         return false;
596     }
597
598     private static void skipWhitespace(PdfParserState state) {
599         while ( (state._pos < state._data.length) && (isWhitespace((char)(state._data[state._pos]))) ) {
600             state._pos++;
601         }
602     }
603
604     private static boolean isWhitespace(char c) {
605         switch (c) {
606         case ' ':
607         case '\t':
608         case '\r':
609         case '\n':
610             return true;
611         default:
612             return false;
613         }
614     }
615
616 }
617
Popular Tags