package org.apache.slide.extractor;

import org.pdfbox.util.PDFTextStripper;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;

import java.io.*;

/**
 * Content extractor that pulls the plain text out of a PDF stream using
 * PDFBox ({@link PDFParser} and {@link PDFTextStripper}).
 */
public class PDFExtractor extends AbstractContentExtractor
{

    /**
     * Creates a new extractor; all arguments are passed unchanged to the
     * {@link AbstractContentExtractor} constructor.
     *
     * @param uri         forwarded to the superclass
     * @param contentType forwarded to the superclass
     * @param namespace   forwarded to the superclass
     */
    public PDFExtractor(String uri, String contentType, String namespace)
    {
        super(uri, contentType, namespace);
    }

    /**
     * Parses the given stream as a PDF document and returns its text.
     * Lines in the returned text are separated by {@code "\n"}.
     *
     * The stream itself is not closed by this method.
     *
     * @param content stream positioned at the start of the PDF data
     * @return a reader over the extracted text, fully buffered in memory
     * @throws ExtractorException if parsing or text extraction fails for any
     *         reason; the message is that of the underlying exception, which
     *         is also attached as the cause
     */
    public Reader extract(InputStream content) throws ExtractorException
    {
        try
        {
            PDFParser parser = new PDFParser(content);
            parser.parse();

            PDDocument document = parser.getPDDocument();
            CharArrayWriter writer = new CharArrayWriter();
            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
                stripper.writeText(document, writer);
            }
            finally
            {
                // Always release the parsed document, even when stripping fails.
                document.close();
            }

            return new CharArrayReader(writer.toCharArray());
        }
        catch (Exception e)
        {
            // Keep the original message, but attach the cause so that the
            // underlying stack trace is not lost.
            ExtractorException failure = new ExtractorException(e.getMessage());
            failure.initCause(e);
            throw failure;
        }
    }

    /**
     * Command-line helper: extracts the text of the PDF file named by the
     * first argument and prints it to standard output.
     *
     * @param args {@code args[0]} is the path of the PDF file to read
     * @throws Exception if the file cannot be opened or extraction fails
     */
    public static void main(String[] args) throws Exception
    {
        if (args.length < 1)
        {
            System.err.println("Usage: PDFExtractor <pdf-file>");
            System.exit(1);
        }

        FileInputStream in = new FileInputStream(args[0]);
        try
        {
            PDFExtractor ex = new PDFExtractor(null, null, null);
            Reader reader = ex.extract(in);

            // Test for end-of-stream BEFORE printing, so the -1 sentinel is
            // never written out as a character.
            int c;
            while ((c = reader.read()) != -1)
            {
                System.out.print((char) c);
            }
            System.out.flush();
        }
        finally
        {
            in.close();
        }
    }
}