PdfParser


1   /* Copyright (c) 2004 The Nutch Organization.  All rights reserved.   */
2   /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3   
4   package net.nutch.parse.pdf;
5   
6   import org.pdfbox.encryption.DocumentEncryption;
7   import org.pdfbox.pdfparser.PDFParser;
8   import org.pdfbox.pdmodel.PDDocument;
9   import org.pdfbox.pdmodel.PDDocumentInformation;
10  import org.pdfbox.util.PDFTextStripper;
11  
12  import org.pdfbox.exceptions.CryptographyException;
13  import org.pdfbox.exceptions.InvalidPasswordException;
14  
15  import net.nutch.protocol.Content;
16  import net.nutch.util.LogFormatter;
17  import net.nutch.parse.Parser;
18  import net.nutch.parse.Parse;
19  import net.nutch.parse.ParseData;
20  import net.nutch.parse.ParseImpl;
21  import net.nutch.parse.Outlink;
22  import net.nutch.parse.ParseException;
23  
24  import java.text.SimpleDateFormat  ;
25  import java.util.Calendar  ;
26  
27  import java.util.Properties  ;
28  import java.util.logging.Logger  ;
29  
30  import java.io.ByteArrayInputStream  ;
31  import java.io.IOException  ;
32  
33  /*********************************************
34   * parser for mime type application/pdf.
35   * It is based on org.pdfbox.*. We have to see how well it does the job.
36   * 
37   * @author John Xing
38   *
39   * Note on 20040614 by Xing:
40   * Some codes are stacked here for convenience (see inline comments).
41   * They may be moved to more appropriate places when new codebase
42   * stabilizes, especially after code for indexing is written.
43   *
44   *********************************************/
45  
46  public class PdfParser implements Parser {
47    public static final Logger   LOG =
48      LogFormatter.getLogger("net.nutch.parse.pdf");
49  
50    public PdfParser () {
51      // redirect org.apache.log4j.Logger to java's native logger, in order
52      // to, at least, suppress annoying log4j warnings.
53      // Note on 20040614 by Xing:
54      // log4j is used by pdfbox. This snippet'd better be moved
55      // to a common place shared by all parsers that use log4j.
56      org.apache.log4j.Logger rootLogger =
57        org.apache.log4j.Logger.getRootLogger();
58  
59      rootLogger.setLevel(org.apache.log4j.Level.INFO);
60  
61      org.apache.log4j.Appender appender = new org.apache.log4j.WriterAppender(
62        new org.apache.log4j.SimpleLayout(),
63        net.nutch.util.LogFormatter.getLogStream(
64          this.LOG, java.util.logging.Level.INFO));
65  
66      rootLogger.addAppender(appender);
67    }
68  
69    public Parse getParse(Content content) throws ParseException {
70  
71      // check that contentType is one we can handle
72      String   contentType = content.getContentType();
73      if (contentType != null && !contentType.startsWith("application/pdf"))
74        throw new ParseException(
75          "Content-Type not application/pdf: "+contentType);
76  
77      // in memory representation of pdf file
78      PDDocument pdf = null;
79  
80      String   text = null;
81      String   title = null;
82  
83      try {
84  
85        byte[] raw = content.getContent();
86  
87        String   contentLength = content.get("Content-Length");
88        if (contentLength != null
89              && raw.length != Integer.parseInt(contentLength)) {
90            throw new ParseException("Content truncated at "+raw.length
91              +" bytes. Parser can't handle incomplete pdf file.");
92        }
93  
94        PDFParser parser = new PDFParser(
95          new ByteArrayInputStream  (raw));
96        parser.parse();
97  
98        pdf = parser.getPDDocument();
99  
100       if (pdf.isEncrypted()) {
101         DocumentEncryption decryptor = new DocumentEncryption(pdf);
102         //Just try using the default password and move on
103         decryptor.decryptDocument("");
104       }
105 
106       // collect text
107       PDFTextStripper stripper = new PDFTextStripper();
108       text = stripper.getText(pdf);
109 
110       // collect title
111       PDDocumentInformation info = pdf.getDocumentInformation();
112       title = info.getTitle();
113       // more useful info, currently not used. please keep them for future use.
114       // pdf.getPageCount();
115       // info.getAuthor()
116       // info.getSubject()
117       // info.getKeywords()
118       // info.getCreator()
119       // info.getProducer()
120       // info.getTrapped()
121       // formatDate(info.getCreationDate())
122       // formatDate(info.getModificationDate())
123 
124     } catch (ParseException e) {
125       throw e;
126     } catch (CryptographyException e) {
127       throw new ParseException("Error decrypting document. "+e);
128     } catch (InvalidPasswordException e) {
129       throw new ParseException("Can't decrypt document. "+e);
130     } catch (Exception   e) { // run time exception
131       throw new ParseException("Can't be handled as pdf document. "+e);
132     } finally {
133       try {
134         if (pdf != null)
135           pdf.close();
136         } catch (IOException   e) {
137           // nothing to do
138         }
139     }
140 
141     if (text == null)
142       text = "";
143 
144     if (title == null)
145       title = "";
146 
147     // collect outlink
148     Outlink[] outlinks = new Outlink[0];
149 
150     // collect meta data
151     Properties   metadata = new Properties  ();
152     metadata.putAll(content.getMetadata()); // copy through
153 
154     ParseData parseData = new ParseData(title, outlinks, metadata);
155     return new ParseImpl(text, parseData);
156     // any filter?
157     //return HtmlParseFilters.filter(content, parse, root);
158   }
159 
160   // format date
161   // currently not used. please keep it for future use.
162   private String   formatDate(Calendar   date) {
163     String   retval = null;
164     if(date != null) {
165       SimpleDateFormat   formatter = new SimpleDateFormat  ();
166       retval = formatter.format(date.getTime());
167     }
168     return retval;
169   }
170 
171 }
172
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags