KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > extractor > PDFParser


1 /* Copyright (C) 2003 Internet Archive.
2  *
3  * This file is part of the Heritrix web crawler (crawler.archive.org).
4  *
5  * Heritrix is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser Public License as published by
7  * the Free Software Foundation; either version 2.1 of the License, or
8  * any later version.
9  *
10  * Heritrix is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser Public License
16  * along with Heritrix; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  *
19  * Created on Jul 14, 2003
20  *
21  */

22 package org.archive.crawler.extractor;
23
24 import com.lowagie.text.pdf.PdfReader;
25 import com.lowagie.text.pdf.PdfName;
26 import com.lowagie.text.pdf.PdfObject;
27 import com.lowagie.text.pdf.PdfDictionary;
28 import com.lowagie.text.pdf.PRIndirectReference;
29 import com.lowagie.text.pdf.PdfArray;
30
31 import java.io.*;
32 import java.util.*;
33
34
35 /** Supports PDF parsing operations. For now this primarily means
36  * extracting URIs, but the logic in extractURIs() could easily be adapted/extended
37  * for a variety of PDF processing tasks.
38  *
39  * @author Parker Thompson
40  *
41  */

42 //TODO make this more efficient; it currently has to read the whole file into memory
43
// before processing can begin, and appears to take much longer than it "should"
44
// to parse small, but admittedly complex, documents.
45
public class PDFParser {
46
47     ArrayList<String JavaDoc> foundURIs;
48     ArrayList<ArrayList<Integer JavaDoc>> encounteredReferences;
49     PdfReader documentReader;
50     byte[] document;
51     PdfDictionary catalog;
52
53     public PDFParser(String JavaDoc doc) throws IOException {
54         resetState();
55         getInFromFile(doc);
56         initialize();
57     }
58      public PDFParser(byte[] doc) throws IOException{
59         resetState();
60         document = doc;
61         initialize();
62     }
63
64     /** Reinitialize the object as though a new one were created.
65      */

66     protected void resetState(){
67         foundURIs = new ArrayList<String JavaDoc>();
68         encounteredReferences = new ArrayList<ArrayList<Integer JavaDoc>>();
69         documentReader = null;
70         document = null;
71         catalog = null;
72
73         for(int i=0; i < encounteredReferences.size(); i++){
74             encounteredReferences.add(new ArrayList<Integer JavaDoc>());
75         }
76     }
77
78     /**
79      * Reset the object and initialize it with a new byte array (the document).
80      * @param doc
81      * @throws IOException
82      */

83     public void resetState(byte[] doc) throws IOException{
84         resetState();
85         document = doc;
86         initialize();
87     }
88
89     /** Reinitialize the object as though a new one were created, complete
90      * with a valid pointer to a document that can be read
91      * @param doc
92      * @throws IOException
93      */

94     public void resetState(String JavaDoc doc) throws IOException{
95         resetState();
96         getInFromFile(doc);
97         initialize();
98     }
99
100     /**
101      * Read a file named 'doc' and store its' bytes for later processing.
102      * @param doc
103      * @throws IOException
104      */

105     protected void getInFromFile(String JavaDoc doc) throws IOException{
106         File documentOnDisk = new File(doc);
107
108         long length = documentOnDisk.length();
109         document = new byte[(int)length];
110
111         FileInputStream inStream = new FileInputStream(documentOnDisk);
112
113         inStream.read(document);
114     }
115
116     /**
117      * Indicates, based on a PDFObject's generation/id pair whether
118      * the parser has already encountered this object (or a reference to it)
119      * so we don't infinitely loop on circuits within the PDF.
120      * @param generation
121      * @param id
122      * @return True if already seen.
123      */

124     protected boolean haveSeen(int generation, int id){
125
126         // if we can't store this generation grow our list until we can
127
if(generation >= encounteredReferences.size()){
128             for(int i=encounteredReferences.size(); i <= generation; i++){
129                 encounteredReferences.add(new ArrayList<Integer JavaDoc>());
130             }
131
132             // clearly we haven't seen it
133
return false;
134         }
135
136         ArrayList<Integer JavaDoc> generationList
137          = encounteredReferences.get(generation);
138         
139         for (int i: generationList) {
140             if(i == id){
141                 return true;
142             }
143         }
144         return false;
145     }
146
147     /**
148      * Note that an object (id/generation pair) has been seen by this parser
149      * so that it can be handled differently when it is encountered again.
150      * @param generation
151      * @param id
152      */

153     protected void markAsSeen(int generation, int id){
154         ArrayList<Integer JavaDoc> objectIds = encounteredReferences.get(generation);
155         objectIds.add(id);
156     }
157
158     /**
159      * Get a list of URIs retrieved from the Pdf during the
160      * extractURIs operation.
161      * @return A list of URIs retrieved from the Pdf during the
162      * extractURIs operation.
163      */

164     public ArrayList getURIs(){
165         return foundURIs;
166     }
167
168     /**
169      * Initialize opens the document for reading. This is done implicitly
170      * by the constuctor. This should only need to be called directly following
171      * a reset.
172      * @throws IOException
173      */

174     protected void initialize() throws IOException{
175         if(document != null){
176             documentReader = new PdfReader(document);
177         }
178
179         catalog = documentReader.getCatalog();
180     }
181
182     /**
183      * Extract URIs from all objects found in a Pdf document's catalog.
184      * Returns an array list representing all URIs found in the document catalog tree.
185      * @return URIs from all objects found in a Pdf document's catalog.
186      */

187     public ArrayList extractURIs(){
188         extractURIs(catalog);
189         return getURIs();
190     }
191
192     /**
193      * Parse a PdfDictionary, looking for URIs recursively and adding
194      * them to foundURIs
195      * @param entity
196      */

197     protected void extractURIs(PdfObject entity){
198
199             // deal with dictionaries
200
if(entity.isDictionary()){
201
202                 PdfDictionary dictionary= (PdfDictionary)entity;
203
204                 @SuppressWarnings JavaDoc("unchecked")
205                 Set<PdfName> allkeys = dictionary.getKeys();
206                 for (PdfName key: allkeys) {
207                     PdfObject value = dictionary.get(key);
208
209                     // see if it's the key is a UR[I,L]
210
if( key.toString().equals("/URI") ||
211                     key.toString().equals("/URL") ) {
212                         foundURIs.add(value.toString());
213
214                     }else{
215                         this.extractURIs(value);
216                     }
217
218                 }
219
220             // deal with arrays
221
}else if(entity.isArray()){
222
223                 PdfArray array = (PdfArray)entity;
224                 ArrayList arrayObjects = array.getArrayList();
225                 Iterator objectList = arrayObjects.iterator();
226
227                 while(objectList.hasNext()){
228                     this.extractURIs( (PdfObject)objectList.next());
229                 }
230
231             // deal with indirect references
232
}else if(entity.getClass() == PRIndirectReference.class){
233
234                     PRIndirectReference indirect = (PRIndirectReference)entity;
235
236                     // if we've already seen a reference to this object
237
if( haveSeen( indirect.getGeneration(), indirect.getNumber()) ){
238                         return;
239
240                     // note that we've seen it if it's new
241
}else{
242                         markAsSeen(indirect.getGeneration(), indirect.getNumber() );
243                     }
244
245                     // dereference the "pointer" and process the object
246
indirect.getReader(); // FIXME: examine side-effects
247
PdfObject direct = PdfReader.getPdfObject(indirect);
248
249                     this.extractURIs(direct);
250             }
251     }
252
253     public static void main(String JavaDoc[] argv){
254
255         try{
256             PDFParser parser = new PDFParser("/home/parkert/files/pdfspec.pdf");
257
258             ArrayList uris = parser.extractURIs();
259
260             Iterator i = uris.iterator();
261
262             while(i.hasNext()){
263                 String JavaDoc uri = (String JavaDoc)i.next();
264                 System.out.println("got uri: " + uri);
265             }
266
267         }catch(IOException e){
268             e.printStackTrace();
269         }
270     }
271 }
272
Popular Tags