// org.archive.crawler.extractor.ExtractorPDF (Heritrix web crawler)
/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Created on Jul 11, 2003
 *
 */

package org.archive.crawler.extractor;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.ToeThread;
35
36 /** Allows the caller to process a CrawlURI representing a PDF
37  * for the purpose of extracting URIs
38  *
39  * @author Parker Thompson
40  *
41  */

42 public class ExtractorPDF extends Extractor implements CoreAttributeConstants {
43
44     private static final long serialVersionUID = -6040669467531928494L;
45
46     private static final Logger JavaDoc LOGGER =
47         Logger.getLogger(ExtractorPDF.class.getName());
48     private static int DEFAULT_MAX_SIZE_TO_PARSE = 5*1024*1024; // 5MB
49

50     // TODO: make configurable
51
private long maxSizeToParse = DEFAULT_MAX_SIZE_TO_PARSE;
52
53     private static Logger JavaDoc logger =
54         Logger.getLogger("org.archive.crawler.extractor.ExtractorPDF");
55
56     protected long numberOfCURIsHandled = 0;
57     protected long numberOfLinksExtracted = 0;
58
59     /**
60      * @param name
61      */

62     public ExtractorPDF(String JavaDoc name) {
63         super(name, "PDF extractor. Link extraction on PDF documents.");
64     }
65
66     protected void extract(CrawlURI curi){
67         if (!isHttpTransactionContentToProcess(curi) ||
68                 !isExpectedMimeType(curi.getContentType(),
69                     "application/pdf")) {
70             return;
71         }
72
73         numberOfCURIsHandled++;
74
75         File JavaDoc tempFile;
76
77         if(curi.getHttpRecorder().getRecordedInput().getSize()>maxSizeToParse)
78         {
79             return;
80         }
81
82         int sn = ((ToeThread)Thread.currentThread()).getSerialNumber();
83         tempFile = new File JavaDoc(getController().getScratchDisk(),"tt"+sn+"tmp.pdf");
84
85         PDFParser parser;
86         ArrayList JavaDoc uris;
87         try {
88             curi.getHttpRecorder().getRecordedInput().
89                 copyContentBodyTo(tempFile);
90             parser = new PDFParser(tempFile.getAbsolutePath());
91             uris = parser.extractURIs();
92         } catch (IOException JavaDoc e) {
93             curi.addLocalizedError(getName(), e, "ExtractorPDF IOException");
94             return;
95         } catch (RuntimeException JavaDoc e) {
96             // Truncated/corrupt PDFs may generate ClassCast exceptions, or
97
// other problems
98
curi.addLocalizedError(getName(), e,
99                 "ExtractorPDF RuntimeException");
100             return;
101         } finally {
102             tempFile.delete();
103         }
104
105         if(uris!=null && uris.size()>0) {
106             Iterator JavaDoc iter = uris.iterator();
107             while(iter.hasNext()) {
108                 String JavaDoc uri = (String JavaDoc)iter.next();
109                 try {
110                     curi.createAndAddLink(uri,Link.NAVLINK_MISC,Link.NAVLINK_HOP);
111                 } catch (URIException e1) {
112                     // There may not be a controller (e.g. If we're being run
113
// by the extractor tool).
114
if (getController() != null) {
115                         getController().logUriError(e1, curi.getUURI(), uri);
116                     } else {
117                         LOGGER.info(curi + ", " + uri + ": " +
118                             e1.getMessage());
119                     }
120                 }
121             }
122             numberOfLinksExtracted += uris.size();
123         }
124
125         logger.fine(curi+" has "+uris.size()+" links.");
126         // Set flag to indicate that link extraction is completed.
127
curi.linkExtractorFinished();
128     }
129
130     /**
131      * Provide a human-readable textual summary of this Processor's state.
132      *
133      * @see org.archive.crawler.framework.Processor#report()
134      */

135     public String JavaDoc report() {
136         StringBuffer JavaDoc ret = new StringBuffer JavaDoc();
137         ret.append("Processor: org.archive.crawler.extractor.ExtractorPDF\n");
138         ret.append(" Function: Link extraction on PDF documents\n");
139         ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
140         ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
141
142         return ret.toString();
143     }
144 }
145