// org.archive.crawler.extractor.ExtractorPDF (Heritrix web crawler)
/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Created on Jul 11, 2003
 *
 */

package org.archive.crawler.extractor;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.ToeThread;
35
36 /** Allows the caller to process a CrawlURI representing a PDF
37  * for the purpose of extracting URIs
38  *
39  * @author Parker Thompson
40  *
41  */

42 public class ExtractorPDF extends Extractor implements CoreAttributeConstants {
43
44     private static final long serialVersionUID = -6040669467531928494L;
45
46     private static final Logger JavaDoc LOGGER =
47         Logger.getLogger(ExtractorPDF.class.getName());
48     private static int DEFAULT_MAX_SIZE_TO_PARSE = 5*1024*1024; // 5MB
49

50     // TODO: make configurable
51
private long maxSizeToParse = DEFAULT_MAX_SIZE_TO_PARSE;
52
53     private static Logger JavaDoc logger =
54         Logger.getLogger("org.archive.crawler.extractor.ExtractorPDF");
55
56     protected long numberOfCURIsHandled = 0;
57     protected long numberOfLinksExtracted = 0;
58
59     /**
60      * @param name
61      */

62     public ExtractorPDF(String JavaDoc name) {
63         super(name, "PDF extractor. Link extraction on PDF documents.");
64     }
65
66     protected void extract(CrawlURI curi){
67         if (!isHttpTransactionContentToProcess(curi) ||
68                 !isExpectedMimeType(curi.getContentType(),
69                     "application/pdf")) {
70             return;
71         }
72
73         numberOfCURIsHandled++;
74
75         File JavaDoc tempFile;
76
77         if(curi.getHttpRecorder().getRecordedInput().getSize()>maxSizeToParse)
78         {
79             return;
80         }
81
82         int sn = ((ToeThread)Thread.currentThread()).getSerialNumber();
83         tempFile = new File JavaDoc(getController().getScratchDisk(),"tt"+sn+"tmp.pdf");
84
85         PDFParser parser;
86         ArrayList JavaDoc uris;
87         try {
88             curi.getHttpRecorder().getRecordedInput().
89                 copyContentBodyTo(tempFile);
90             parser = new PDFParser(tempFile.getAbsolutePath());
91             uris = parser.extractURIs();
92         } catch (IOException JavaDoc e) {
93             curi.addLocalizedError(getName(), e, "ExtractorPDF IOException");
94             return;
95         } catch (RuntimeException JavaDoc e) {
96             // Truncated/corrupt PDFs may generate ClassCast exceptions, or
97
// other problems
98
curi.addLocalizedError(getName(), e,
99                 "ExtractorPDF RuntimeException");
100             return;
101         } finally {
102             tempFile.delete();
103         }
104
105         if(uris!=null && uris.size()>0) {
106             Iterator JavaDoc iter = uris.iterator();
107             while(iter.hasNext()) {
108                 String JavaDoc uri = (String JavaDoc)iter.next();
109                 try {
110                     curi.createAndAddLink(uri,Link.NAVLINK_MISC,Link.NAVLINK_HOP);
111                 } catch (URIException e1) {
112                     // There may not be a controller (e.g. If we're being run
113
// by the extractor tool).
114
if (getController() != null) {
115                         getController().logUriError(e1, curi.getUURI(), uri);
116                     } else {
117                         LOGGER.info(curi + ", " + uri + ": " +
118                             e1.getMessage());
119                     }
120                 }
121             }
122             numberOfLinksExtracted += uris.size();
123         }
124
125         logger.fine(curi+" has "+uris.size()+" links.");
126         // Set flag to indicate that link extraction is completed.
127
curi.linkExtractorFinished();
128     }
129
130     /**
131      * Provide a human-readable textual summary of this Processor's state.
132      *
133      * @see org.archive.crawler.framework.Processor#report()
134      */

135     public String JavaDoc report() {
136         StringBuffer JavaDoc ret = new StringBuffer JavaDoc();
137         ret.append("Processor: org.archive.crawler.extractor.ExtractorPDF\n");
138         ret.append(" Function: Link extraction on PDF documents\n");
139         ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
140         ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
141
142         return ret.toString();
143     }
144 }
145