package org.archive.crawler.extractor;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.ToeThread;

/**
 * Processor that extracts links from fetched PDF documents.
 *
 * <p>The recorded content body of a URI is copied to a temporary file on the
 * controller's scratch disk, handed to {@link PDFParser}, and every URI the
 * parser returns is added to the {@link CrawlURI} as an outlink.
 */
public class ExtractorPDF extends Extractor implements CoreAttributeConstants {

    private static final long serialVersionUID = -6040669467531928494L;

    private static final Logger LOGGER =
        Logger.getLogger(ExtractorPDF.class.getName());

    /** Default upper bound, in bytes, on the size of a document that is parsed (5 MiB). */
    private static final int DEFAULT_MAX_SIZE_TO_PARSE = 5 * 1024 * 1024;

    /** Documents whose recorded size exceeds this many bytes are not parsed. */
    private long maxSizeToParse = DEFAULT_MAX_SIZE_TO_PARSE;

    /** Number of URIs that passed the content-type check, including ones later skipped for size. */
    protected long numberOfCURIsHandled = 0;

    /** Running total of URIs returned by the parser across all handled documents. */
    protected long numberOfLinksExtracted = 0;

    /**
     * Creates the extractor.
     *
     * @param name name of this processor instance, passed to the superclass
     */
    public ExtractorPDF(String name) {
        super(name, "PDF extractor. Link extraction on PDF documents.");
    }

    /**
     * Extracts links from {@code curi} if it holds an "application/pdf" response.
     *
     * <p>Returns without doing anything when the URI has no HTTP content to
     * process or its content type is not the expected one. Documents larger
     * than {@code maxSizeToParse} are counted as handled but not parsed. If
     * copying or parsing fails, the error is recorded on the URI as a
     * localized error and the method returns without marking link extraction
     * as finished.
     *
     * @param curi the URI whose recorded content should be examined
     */
    protected void extract(CrawlURI curi) {
        if (!isHttpTransactionContentToProcess(curi)
                || !isExpectedMimeType(curi.getContentType(), "application/pdf")) {
            return;
        }

        numberOfCURIsHandled++;

        if (curi.getHttpRecorder().getRecordedInput().getSize() > maxSizeToParse) {
            return;
        }

        // The temp file name embeds the current thread's serial number.
        // NOTE(review): the cast assumes this method always runs on a
        // ToeThread; any other caller gets a ClassCastException -- confirm.
        int sn = ((ToeThread) Thread.currentThread()).getSerialNumber();
        File tempFile =
            new File(getController().getScratchDisk(), "tt" + sn + "tmp.pdf");

        ArrayList uris;
        try {
            curi.getHttpRecorder().getRecordedInput().copyContentBodyTo(tempFile);
            PDFParser parser = new PDFParser(tempFile.getAbsolutePath());
            uris = parser.extractURIs();
        } catch (IOException e) {
            curi.addLocalizedError(getName(), e, "ExtractorPDF IOException");
            return;
        } catch (RuntimeException e) {
            // Parser failures are recorded against the URI rather than propagated.
            curi.addLocalizedError(getName(), e, "ExtractorPDF RuntimeException");
            return;
        } finally {
            // The file may not exist if the copy failed before creating it.
            if (tempFile.exists() && !tempFile.delete()) {
                LOGGER.warning("Could not delete temporary PDF file "
                    + tempFile.getAbsolutePath());
            }
        }

        // The parser result is treated as possibly null throughout, so the
        // size is computed once here and reused for counting and logging.
        int linkCount = (uris == null) ? 0 : uris.size();

        if (linkCount > 0) {
            for (Iterator iter = uris.iterator(); iter.hasNext();) {
                String uri = (String) iter.next();
                try {
                    curi.createAndAddLink(uri, Link.NAVLINK_MISC, Link.NAVLINK_HOP);
                } catch (URIException e1) {
                    // Report through the controller when there is one;
                    // otherwise fall back to this class's logger.
                    if (getController() != null) {
                        getController().logUriError(e1, curi.getUURI(), uri);
                    } else {
                        LOGGER.info(curi + ", " + uri + ": " + e1.getMessage());
                    }
                }
            }
            // Counts every URI the parser returned, including any rejected above.
            numberOfLinksExtracted += linkCount;
        }

        LOGGER.fine(curi + " has " + linkCount + " links.");
        curi.linkExtractorFinished();
    }

    /**
     * Builds a plain-text summary of this processor's activity.
     *
     * @return a multi-line report with the number of CrawlURIs handled and
     *         the number of links extracted
     */
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorPDF\n");
        ret.append(" Function: Link extraction on PDF documents\n");
        ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");

        return ret.toString();
    }
}