1 2 3 4 package net.nutch.parse.pdf; 5 6 import org.pdfbox.encryption.DocumentEncryption; 7 import org.pdfbox.pdfparser.PDFParser; 8 import org.pdfbox.pdmodel.PDDocument; 9 import org.pdfbox.pdmodel.PDDocumentInformation; 10 import org.pdfbox.util.PDFTextStripper; 11 12 import org.pdfbox.exceptions.CryptographyException; 13 import org.pdfbox.exceptions.InvalidPasswordException; 14 15 import net.nutch.protocol.Content; 16 import net.nutch.util.LogFormatter; 17 import net.nutch.parse.Parser; 18 import net.nutch.parse.Parse; 19 import net.nutch.parse.ParseData; 20 import net.nutch.parse.ParseImpl; 21 import net.nutch.parse.Outlink; 22 import net.nutch.parse.ParseException; 23 24 import java.text.SimpleDateFormat ; 25 import java.util.Calendar ; 26 27 import java.util.Properties ; 28 import java.util.logging.Logger ; 29 30 import java.io.ByteArrayInputStream ; 31 import java.io.IOException ; 32 33 45 46 public class PdfParser implements Parser { 47 public static final Logger LOG = 48 LogFormatter.getLogger("net.nutch.parse.pdf"); 49 50 public PdfParser () { 51 org.apache.log4j.Logger rootLogger = 57 org.apache.log4j.Logger.getRootLogger(); 58 59 rootLogger.setLevel(org.apache.log4j.Level.INFO); 60 61 org.apache.log4j.Appender appender = new org.apache.log4j.WriterAppender( 62 new org.apache.log4j.SimpleLayout(), 63 net.nutch.util.LogFormatter.getLogStream( 64 this.LOG, java.util.logging.Level.INFO)); 65 66 rootLogger.addAppender(appender); 67 } 68 69 public Parse getParse(Content content) throws ParseException { 70 71 String contentType = content.getContentType(); 73 if (contentType != null && !contentType.startsWith("application/pdf")) 74 throw new ParseException( 75 "Content-Type not application/pdf: "+contentType); 76 77 PDDocument pdf = null; 79 80 String text = null; 81 String title = null; 82 83 try { 84 85 byte[] raw = content.getContent(); 86 87 String contentLength = content.get("Content-Length"); 88 if (contentLength != null 89 && raw.length != Integer.parseInt(contentLength)) { 90 throw new ParseException("Content truncated at "+raw.length 91 +" bytes. Parser can't handle incomplete pdf file."); 92 } 93 94 PDFParser parser = new PDFParser( 95 new ByteArrayInputStream (raw)); 96 parser.parse(); 97 98 pdf = parser.getPDDocument(); 99 100 if (pdf.isEncrypted()) { 101 DocumentEncryption decryptor = new DocumentEncryption(pdf); 102 decryptor.decryptDocument(""); 104 } 105 106 PDFTextStripper stripper = new PDFTextStripper(); 108 text = stripper.getText(pdf); 109 110 PDDocumentInformation info = pdf.getDocumentInformation(); 112 title = info.getTitle(); 113 124 } catch (ParseException e) { 125 throw e; 126 } catch (CryptographyException e) { 127 throw new ParseException("Error decrypting document. "+e); 128 } catch (InvalidPasswordException e) { 129 throw new ParseException("Can't decrypt document. "+e); 130 } catch (Exception e) { throw new ParseException("Can't be handled as pdf document. "+e); 132 } finally { 133 try { 134 if (pdf != null) 135 pdf.close(); 136 } catch (IOException e) { 137 } 139 } 140 141 if (text == null) 142 text = ""; 143 144 if (title == null) 145 title = ""; 146 147 Outlink[] outlinks = new Outlink[0]; 149 150 Properties metadata = new Properties (); 152 metadata.putAll(content.getMetadata()); 154 ParseData parseData = new ParseData(title, outlinks, metadata); 155 return new ParseImpl(text, parseData); 156 } 159 160 private String formatDate(Calendar date) { 163 String retval = null; 164 if(date != null) { 165 SimpleDateFormat formatter = new SimpleDateFormat (); 166 retval = formatter.format(date.getTime()); 167 } 168 return retval; 169 } 170 171 } 172 | Popular Tags |