1 package org.contineo.core.text.parser; 2 3 import java.io.ByteArrayOutputStream ; 4 import java.io.File ; 5 import java.io.FileInputStream ; 6 import java.io.InputStream ; 7 import java.io.OutputStreamWriter ; 8 import java.text.DateFormat ; 9 import java.util.Calendar ; 10 import java.util.Date ; 11 import org.apache.log4j.Level; 12 import org.apache.log4j.Logger; 13 import org.contineo.core.LoggingManager; 14 import org.pdfbox.encryption.DocumentEncryption; 15 import org.pdfbox.pdmodel.PDDocument; 16 import org.pdfbox.pdmodel.PDDocumentInformation; 17 import org.pdfbox.util.PDFTextStripper; 18 19 24 public class PDFParser implements Parser { 25 26 29 private StringBuffer content = new StringBuffer (""); 30 private String author; 31 private String title; 32 private String sourceDate; 33 private String keywords; 34 private Logger logger; 35 36 37 38 public PDFParser(File file) { 39 author = ""; 40 title = ""; 41 sourceDate = ""; 42 keywords = ""; 43 logger = LoggingManager.getLogger(this.getClass()); 44 init(file); 45 } 46 47 protected void init(File file) { 48 PDDocument pdfDocument = null; 49 try { 50 InputStream is = new FileInputStream (file); 51 org.pdfbox.pdfparser.PDFParser parser = new org.pdfbox.pdfparser.PDFParser( is ); 52 if (parser != null) 53 parser.parse(); 54 else throw new Exception ("Can not parse pdf file " + file.getName()); 55 56 pdfDocument = parser.getPDDocument(); 57 if (pdfDocument == null) 58 throw new Exception ("Can not get pdf document " + file.getName() + " for parsing"); 59 60 try { 61 PDDocumentInformation information = pdfDocument.getDocumentInformation(); 62 if (information == null) 63 throw new Exception ("Can not get information from pdf document " + file.getName()); 64 author = information.getAuthor(); 65 if (author == null) 66 author = ""; 67 title = information.getTitle(); 68 if (title == null) 69 title = ""; 70 Calendar calendar = information.getCreationDate(); 71 Date date = null; 72 if (calendar != null) 73 date = calendar.getTime(); 74 if (date != null) 75 sourceDate = DateFormat.getDateInstance().format(date); 76 77 keywords = information.getKeywords(); 78 if (keywords == null) 79 keywords = ""; 80 } catch (Exception e) { 81 if (logger.isEnabledFor(Level.ERROR)) 82 logger.error(e.getMessage()); 83 } 84 85 if( pdfDocument.isEncrypted() ) { 86 DocumentEncryption decryptor = new DocumentEncryption( pdfDocument ); 87 decryptor.decryptDocument( "" ); 89 } 90 91 ByteArrayOutputStream out = new ByteArrayOutputStream (); 93 OutputStreamWriter writer = new OutputStreamWriter ( out ); 94 PDFTextStripper stripper = new PDFTextStripper(); 95 stripper.writeText( pdfDocument, writer ); 96 writer.close(); 97 content = new StringBuffer (out.toString()); 98 is.close(); 99 out.close(); 100 } catch (Exception ex) { 101 if (logger.isEnabledFor(Level.ERROR)) 102 logger.error(ex.getMessage()); 103 } finally { 104 try { 105 if (pdfDocument != null) pdfDocument.close(); 106 } catch (Exception e) { 107 if (logger.isEnabledFor(Level.ERROR)) 108 logger.fatal(e.getMessage()); 109 } 110 } 111 } 112 113 117 public StringBuffer getContent() { 118 return content; 119 } 120 121 122 public String getVersion() { 123 return ""; 124 } 125 126 129 133 public String getAuthor() { 134 return author; 135 } 136 137 140 144 public String getSourceDate() { 145 return sourceDate; 146 } 147 148 151 155 public String getKeywords() { 156 return keywords; 157 } 158 159 162 166 public String getTitle() { 167 return title; 168 } 169 170 } 171 | Popular Tags |