package org.archive.crawler.extractor;

import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.PdfName;
import com.lowagie.text.pdf.PdfObject;
import com.lowagie.text.pdf.PdfDictionary;
import com.lowagie.text.pdf.PRIndirectReference;
import com.lowagie.text.pdf.PdfArray;

import java.io.*;
import java.util.*;

/**
 * Collects the values stored under <code>/URI</code> and <code>/URL</code>
 * dictionary keys of a PDF document by walking its object graph, starting
 * at the document catalog.
 *
 * <p>The document is held fully in memory as a byte array. Indirect
 * references are remembered by (generation, object number) so that each
 * referenced object is visited at most once, which keeps the walk from
 * looping on cyclic reference chains.
 *
 * <p>An instance carries mutable parsing state and can be re-pointed at
 * another document with one of the <code>resetState</code> overloads.
 */
public class PDFParser {

    /** Path used by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_DOCUMENT =
        "/home/parkert/files/pdfspec.pdf";

    /** URI strings gathered so far, in discovery order. */
    ArrayList<String> foundURIs;

    /**
     * Object numbers of the indirect references already followed; the outer
     * index is the reference's generation number.
     */
    ArrayList<ArrayList<Integer>> encounteredReferences;

    /** Reader over {@link #document}; set by {@link #initialize()}. */
    PdfReader documentReader;

    /** Raw bytes of the PDF being parsed. */
    byte[] document;

    /** Catalog dictionary of the document; the root of the traversal. */
    PdfDictionary catalog;

    /**
     * Creates a parser for a PDF stored on disk.
     *
     * @param doc path of the PDF file
     * @throws IOException if the file cannot be read or parsed
     */
    public PDFParser(String doc) throws IOException {
        resetState();
        getInFromFile(doc);
        initialize();
    }

    /**
     * Creates a parser for a PDF already held in memory.
     *
     * @param doc bytes of the PDF document
     * @throws IOException if the bytes cannot be parsed, or if
     *         <code>doc</code> is <code>null</code>
     */
    public PDFParser(byte[] doc) throws IOException {
        resetState();
        document = doc;
        initialize();
    }

    /**
     * Discards every piece of per-document state: found URIs, the record
     * of visited references, the reader, the bytes and the catalog.
     */
    protected void resetState() {
        foundURIs = new ArrayList<String>();
        // Per-generation lists are created lazily on first use, so the
        // freshly created outer list needs no pre-population.
        encounteredReferences = new ArrayList<ArrayList<Integer>>();
        documentReader = null;
        document = null;
        catalog = null;
    }

    /**
     * Resets this parser and points it at a new in-memory document.
     *
     * @param doc bytes of the PDF document
     * @throws IOException if the bytes cannot be parsed, or if
     *         <code>doc</code> is <code>null</code>
     */
    public void resetState(byte[] doc) throws IOException {
        resetState();
        document = doc;
        initialize();
    }

    /**
     * Resets this parser and points it at a new document on disk.
     *
     * @param doc path of the PDF file
     * @throws IOException if the file cannot be read or parsed
     */
    public void resetState(String doc) throws IOException {
        resetState();
        getInFromFile(doc);
        initialize();
    }

    /**
     * Loads the whole file into {@link #document}.
     *
     * @param doc path of the PDF file
     * @throws IOException if the file cannot be opened, is larger than a
     *         Java array can hold, or ends before its reported length
     *         has been read
     */
    protected void getInFromFile(String doc) throws IOException {
        File documentOnDisk = new File(doc);

        long length = documentOnDisk.length();
        if (length > Integer.MAX_VALUE) {
            // A plain (int) cast would wrap and silently truncate the file.
            throw new IOException(
                "PDF file too large to load into memory: " + doc);
        }
        byte[] contents = new byte[(int) length];

        // A single InputStream.read(byte[]) may return before the buffer is
        // full; readFully keeps reading until every byte has arrived.
        DataInputStream inStream =
            new DataInputStream(new FileInputStream(documentOnDisk));
        try {
            inStream.readFully(contents);
        } finally {
            inStream.close();
        }

        document = contents;
    }

    /**
     * Returns the list of object numbers recorded for a generation,
     * growing {@link #encounteredReferences} as needed so the slot exists.
     */
    private ArrayList<Integer> referencesFor(int generation) {
        while (encounteredReferences.size() <= generation) {
            encounteredReferences.add(new ArrayList<Integer>());
        }
        return encounteredReferences.get(generation);
    }

    /**
     * Tells whether an indirect reference has already been followed.
     *
     * @param generation generation number of the reference
     * @param id object number of the reference
     * @return <code>true</code> if {@link #markAsSeen(int, int)} was called
     *         earlier with the same pair, <code>false</code> otherwise
     */
    protected boolean haveSeen(int generation, int id) {
        return referencesFor(generation).contains(id);
    }

    /**
     * Records an indirect reference as followed. Safe to call whether or
     * not {@link #haveSeen(int, int)} was called first for the generation.
     *
     * @param generation generation number of the reference
     * @param id object number of the reference
     */
    protected void markAsSeen(int generation, int id) {
        referencesFor(generation).add(id);
    }

    /**
     * Returns the URIs collected so far.
     *
     * @return the live internal list of URI strings (not a copy)
     */
    public ArrayList<String> getURIs() {
        return foundURIs;
    }

    /**
     * Builds the reader over {@link #document} (when bytes are present)
     * and fetches the document catalog.
     *
     * @throws IOException if the bytes cannot be parsed, or if there is
     *         neither a document nor an existing reader to work with
     */
    protected void initialize() throws IOException {
        if (document != null) {
            documentReader = new PdfReader(document);
        }
        if (documentReader == null) {
            // Fail with a clear message instead of a NullPointerException.
            throw new IOException("No PDF document available to parse");
        }

        catalog = documentReader.getCatalog();
    }

    /**
     * Walks the document from its catalog and returns every URI found.
     *
     * @return the list of URI strings, as returned by {@link #getURIs()}
     */
    public ArrayList<String> extractURIs() {
        extractURIs(catalog);
        return getURIs();
    }

    /**
     * Recursively searches one PDF object for URIs. Dictionaries contribute
     * the string form of any value keyed <code>/URI</code> or
     * <code>/URL</code> and are otherwise descended into; arrays are
     * descended into element by element; indirect references are
     * dereferenced once each. Every other object type is ignored.
     *
     * @param entity object to search; <code>null</code> is ignored
     */
    protected void extractURIs(PdfObject entity) {
        // A lookup or a dereference may yield no object at all.
        if (entity == null) {
            return;
        }

        if (entity.isDictionary()) {
            PdfDictionary dictionary = (PdfDictionary) entity;

            @SuppressWarnings("unchecked")
            Set<PdfName> allkeys = dictionary.getKeys();
            for (PdfName key : allkeys) {
                PdfObject value = dictionary.get(key);
                String keyName = key.toString();

                if (keyName.equals("/URI") || keyName.equals("/URL")) {
                    if (value != null) {
                        foundURIs.add(value.toString());
                    }
                } else {
                    extractURIs(value);
                }
            }

        } else if (entity.isArray()) {
            PdfArray array = (PdfArray) entity;
            for (Object element : array.getArrayList()) {
                extractURIs((PdfObject) element);
            }

        } else if (entity instanceof PRIndirectReference) {
            PRIndirectReference indirect = (PRIndirectReference) entity;
            int generation = indirect.getGeneration();
            int number = indirect.getNumber();

            // Follow each reference only once; this is what stops the
            // recursion on reference cycles.
            if (haveSeen(generation, number)) {
                return;
            }
            markAsSeen(generation, number);

            extractURIs(PdfReader.getPdfObject(indirect));
        }
    }

    /**
     * Command-line driver: prints every URI found in a PDF file.
     *
     * @param argv optional; <code>argv[0]</code> is the path of the PDF to
     *        scan. Without it a built-in default path is used.
     */
    public static void main(String[] argv) {
        String path = (argv != null && argv.length > 0)
            ? argv[0]
            : DEFAULT_DOCUMENT;

        try {
            PDFParser parser = new PDFParser(path);

            for (String uri : parser.extractURIs()) {
                System.out.println("got uri: " + uri);
            }

        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}