KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > parse > pdf > PdfParser


1 /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.parse.pdf;
5
6 import org.pdfbox.encryption.DocumentEncryption;
7 import org.pdfbox.pdfparser.PDFParser;
8 import org.pdfbox.pdmodel.PDDocument;
9 import org.pdfbox.pdmodel.PDDocumentInformation;
10 import org.pdfbox.util.PDFTextStripper;
11
12 import org.pdfbox.exceptions.CryptographyException;
13 import org.pdfbox.exceptions.InvalidPasswordException;
14
15 import net.nutch.protocol.Content;
16 import net.nutch.util.LogFormatter;
17 import net.nutch.parse.Parser;
18 import net.nutch.parse.Parse;
19 import net.nutch.parse.ParseData;
20 import net.nutch.parse.ParseImpl;
21 import net.nutch.parse.Outlink;
22 import net.nutch.parse.ParseException;
23
24 import java.text.SimpleDateFormat JavaDoc;
25 import java.util.Calendar JavaDoc;
26
27 import java.util.Properties JavaDoc;
28 import java.util.logging.Logger JavaDoc;
29
30 import java.io.ByteArrayInputStream JavaDoc;
31 import java.io.IOException JavaDoc;
32
33 /*********************************************
34  * parser for mime type application/pdf.
35  * It is based on org.pdfbox.*. We have to see how well it does the job.
36  *
37  * @author John Xing
38  *
39  * Note on 20040614 by Xing:
40  * Some codes are stacked here for convenience (see inline comments).
41  * They may be moved to more appropriate places when new codebase
42  * stabilizes, especially after code for indexing is written.
43  *
44  *********************************************/

45
46 public class PdfParser implements Parser {
47   public static final Logger JavaDoc LOG =
48     LogFormatter.getLogger("net.nutch.parse.pdf");
49
50   public PdfParser () {
51     // redirect org.apache.log4j.Logger to java's native logger, in order
52
// to, at least, suppress annoying log4j warnings.
53
// Note on 20040614 by Xing:
54
// log4j is used by pdfbox. This snippet'd better be moved
55
// to a common place shared by all parsers that use log4j.
56
org.apache.log4j.Logger rootLogger =
57       org.apache.log4j.Logger.getRootLogger();
58
59     rootLogger.setLevel(org.apache.log4j.Level.INFO);
60
61     org.apache.log4j.Appender appender = new org.apache.log4j.WriterAppender(
62       new org.apache.log4j.SimpleLayout(),
63       net.nutch.util.LogFormatter.getLogStream(
64         this.LOG, java.util.logging.Level.INFO));
65
66     rootLogger.addAppender(appender);
67   }
68
69   public Parse getParse(Content content) throws ParseException {
70
71     // check that contentType is one we can handle
72
String JavaDoc contentType = content.getContentType();
73     if (contentType != null && !contentType.startsWith("application/pdf"))
74       throw new ParseException(
75         "Content-Type not application/pdf: "+contentType);
76
77     // in memory representation of pdf file
78
PDDocument pdf = null;
79
80     String JavaDoc text = null;
81     String JavaDoc title = null;
82
83     try {
84
85       byte[] raw = content.getContent();
86
87       String JavaDoc contentLength = content.get("Content-Length");
88       if (contentLength != null
89             && raw.length != Integer.parseInt(contentLength)) {
90           throw new ParseException("Content truncated at "+raw.length
91             +" bytes. Parser can't handle incomplete pdf file.");
92       }
93
94       PDFParser parser = new PDFParser(
95         new ByteArrayInputStream JavaDoc(raw));
96       parser.parse();
97
98       pdf = parser.getPDDocument();
99
100       if (pdf.isEncrypted()) {
101         DocumentEncryption decryptor = new DocumentEncryption(pdf);
102         //Just try using the default password and move on
103
decryptor.decryptDocument("");
104       }
105
106       // collect text
107
PDFTextStripper stripper = new PDFTextStripper();
108       text = stripper.getText(pdf);
109
110       // collect title
111
PDDocumentInformation info = pdf.getDocumentInformation();
112       title = info.getTitle();
113       // more useful info, currently not used. please keep them for future use.
114
// pdf.getPageCount();
115
// info.getAuthor()
116
// info.getSubject()
117
// info.getKeywords()
118
// info.getCreator()
119
// info.getProducer()
120
// info.getTrapped()
121
// formatDate(info.getCreationDate())
122
// formatDate(info.getModificationDate())
123

124     } catch (ParseException e) {
125       throw e;
126     } catch (CryptographyException e) {
127       throw new ParseException("Error decrypting document. "+e);
128     } catch (InvalidPasswordException e) {
129       throw new ParseException("Can't decrypt document. "+e);
130     } catch (Exception JavaDoc e) { // run time exception
131
throw new ParseException("Can't be handled as pdf document. "+e);
132     } finally {
133       try {
134         if (pdf != null)
135           pdf.close();
136         } catch (IOException JavaDoc e) {
137           // nothing to do
138
}
139     }
140
141     if (text == null)
142       text = "";
143
144     if (title == null)
145       title = "";
146
147     // collect outlink
148
Outlink[] outlinks = new Outlink[0];
149
150     // collect meta data
151
Properties JavaDoc metadata = new Properties JavaDoc();
152     metadata.putAll(content.getMetadata()); // copy through
153

154     ParseData parseData = new ParseData(title, outlinks, metadata);
155     return new ParseImpl(text, parseData);
156     // any filter?
157
//return HtmlParseFilters.filter(content, parse, root);
158
}
159
160   // format date
161
// currently not used. please keep it for future use.
162
private String JavaDoc formatDate(Calendar JavaDoc date) {
163     String JavaDoc retval = null;
164     if(date != null) {
165       SimpleDateFormat JavaDoc formatter = new SimpleDateFormat JavaDoc();
166       retval = formatter.format(date.getTime());
167     }
168     return retval;
169   }
170
171 }
172
Popular Tags