1 package org.jahia.utils.fileparsers; 2 3 import java.io.*; 4 5 import org.jahia.utils.*; 6 import org.pdfbox.util.PDFTextStripper; 7 import org.pdfbox.pdmodel.PDDocument; 8 9 16 public class PDFExtractor implements FileExtractor { 17 18 private static org.apache.log4j.Logger logger = 19 org.apache.log4j.Logger.getLogger (PDFExtractor.class); 20 21 private String path = null; 22 private long lastModifed; 23 24 public PDFExtractor(){ 25 } 26 27 35 public String getContentAsString(String path, long lastModified, 36 InputStream fileStream) 37 throws Exception { 38 return getContentAsString(path, lastModified, fileStream, null); 39 } 40 41 50 public String getContentAsString(String path, long lastModified, 51 InputStream fileStream, 52 String charSet) throws Exception { 53 this.path = path; 54 this.lastModifed = lastModified; 55 String strVal = null; 56 57 if (fileStream != null) { 58 Reader pdfReader = null; 59 try { 60 long startTime = System.currentTimeMillis(); 61 pdfReader = this.getPDFReader(fileStream, charSet); 62 long elapsedTime = System.currentTimeMillis() - startTime; 63 logger.info("Finished pdf extraction with PDFBox in " + 64 elapsedTime + "ms."); 65 66 startTime = System.currentTimeMillis(); 67 strVal = FileUtils.readerToString( 68 pdfReader); 69 elapsedTime = System.currentTimeMillis() - startTime; 70 logger.info("Finished reading pdf Reader to String in " + 71 elapsedTime + "ms."); 72 } 73 catch (Throwable t) { 74 logger.debug("Error extracting dpdf file " + this.path ,t); 75 } 76 finally { 77 try { 78 if (pdfReader != null) { 79 pdfReader.close(); 80 } 81 } 82 catch (Throwable t) { 83 } 84 } 85 } 86 return strVal; 87 } 88 89 public Reader getPDFReader(InputStream fileStream) throws IOException { 90 return getPDFReader(fileStream, null); 91 } 92 93 public Reader getPDFReader(InputStream fileStream, 94 String charSet) throws IOException 95 { 96 Reader reader = null; 97 PDDocument pdfDocument = null; 98 try { 99 pdfDocument = PDDocument.load(fileStream); 100 if(pdfDocument.isEncrypted()) { 101 pdfDocument.decrypt(""); 103 } 104 ByteArrayOutputStream out = new ByteArrayOutputStream(); 106 OutputStreamWriter writer = new OutputStreamWriter(out); 107 PDFTextStripper stripper = new PDFTextStripper(); 108 stripper.writeText(pdfDocument, writer); 109 writer.close(); 110 byte[] contents = out.toByteArray(); 111 if ( charSet != null ){ 112 reader = new InputStreamReader(new ByteArrayInputStream(contents), 113 charSet); 114 } else { 115 reader = new InputStreamReader(new ByteArrayInputStream(contents)); 116 } 117 } 118 catch( Throwable t ) 119 { 120 logger.debug(t); 121 throw new IOException(" Exception occured parsing pdf :" + t); 122 } 123 finally 124 { 125 if( pdfDocument != null ) 126 { try { 127 pdfDocument.close(); 128 } catch ( Throwable t ){ 129 } 130 } 131 } 132 return reader; 133 } 134 } 135 | Popular Tags |