1 40 package org.dspace.app.mediafilter; 41 42 import java.io.ByteArrayInputStream ; 43 import java.io.InputStream ; 44 45 import org.apache.log4j.Logger; 46 import org.pdfbox.pdfparser.PDFParser; 47 import org.pdfbox.pdmodel.PDDocument; 48 import org.pdfbox.util.PDFTextStripper; 49 50 56 public class PDFFilter extends MediaFilter 57 { 58 59 private static Logger log = Logger.getLogger(PDFFilter.class); 60 61 public String getFilteredName(String oldFilename) 62 { 63 return oldFilename + ".txt"; 64 } 65 66 70 public String getBundleName() 71 { 72 return "TEXT"; 73 } 74 75 78 public String getFormatString() 79 { 80 return "Text"; 81 } 82 83 86 public String getDescription() 87 { 88 return "Extracted text"; 89 } 90 91 97 public InputStream getDestinationStream(InputStream source) 98 throws Exception 99 { 100 PDFTextStripper pts = new PDFTextStripper(); 103 PDFParser parser = null; 104 String extractedText = null; 105 106 try 107 { 108 parser = new PDFParser(source); 109 parser.parse(); 110 extractedText = pts.getText(new PDDocument(parser.getDocument())); 111 } 112 finally 113 { 114 try 115 { 116 parser.getDocument().close(); 117 } 118 catch(Exception e) 119 { 120 log.error("Error closing temporary PDF file: " + e.getMessage(), e); 121 } 122 } 123 124 if (MediaFilterManager.isVerbose) 127 { 128 System.out.println(extractedText); 129 } 130 131 132 byte[] textBytes = extractedText.getBytes(); 134 ByteArrayInputStream bais = new ByteArrayInputStream (textBytes); 135 136 return bais; 138 139 } 140 } 141 | Popular Tags |