KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > jahia > utils > fileparsers > PDFExtractor


1 package org.jahia.utils.fileparsers;
2
3 import java.io.*;
4
5 import org.jahia.utils.*;
6 import org.pdfbox.util.PDFTextStripper;
7 import org.pdfbox.pdmodel.PDDocument;
8
9 /**
10  * Created by IntelliJ IDEA.
11  * User: toto
12  * Date: Aug 23, 2003
13  * Time: 3:52:46 AM
14  * To change this template use Options | File Templates.
15  */

16 public class PDFExtractor implements FileExtractor {
17
18     private static org.apache.log4j.Logger logger =
19             org.apache.log4j.Logger.getLogger (PDFExtractor.class);
20
21     private String JavaDoc path = null;
22     private long lastModifed;
23
24     public PDFExtractor(){
25     }
26
27     /**
28      *
29      * @param path String
30      * @param lastModified long
31      * @param fileStream InputStream
32      * @throws Exception
33      * @return String
34      */

35     public String JavaDoc getContentAsString(String JavaDoc path, long lastModified,
36                                      InputStream fileStream)
37     throws Exception JavaDoc {
38        return getContentAsString(path, lastModified, fileStream, null);
39     }
40
41     /**
42      *
43      * @param path String
44      * @param lastModified long
45      * @param fileStream InputStream
46      * @param charSet String
47      * @throws Exception
48      * @return String
49      */

50     public String JavaDoc getContentAsString(String JavaDoc path, long lastModified,
51                                      InputStream fileStream,
52                                      String JavaDoc charSet) throws Exception JavaDoc {
53         this.path = path;
54         this.lastModifed = lastModified;
55         String JavaDoc strVal = null;
56
57         if (fileStream != null) {
58              Reader pdfReader = null;
59              try {
60                  long startTime = System.currentTimeMillis();
61                  pdfReader = this.getPDFReader(fileStream, charSet);
62                  long elapsedTime = System.currentTimeMillis() - startTime;
63                  logger.info("Finished pdf extraction with PDFBox in " +
64                              elapsedTime + "ms.");
65
66                  startTime = System.currentTimeMillis();
67                  strVal = FileUtils.readerToString(
68                      pdfReader);
69                  elapsedTime = System.currentTimeMillis() - startTime;
70                  logger.info("Finished reading pdf Reader to String in " +
71                              elapsedTime + "ms.");
72              }
73              catch (Throwable JavaDoc t) {
74                  logger.debug("Error extracting dpdf file " + this.path ,t);
75              }
76              finally {
77                  try {
78                      if (pdfReader != null) {
79                          pdfReader.close();
80                      }
81                  }
82                  catch (Throwable JavaDoc t) {
83                  }
84              }
85          }
86          return strVal;
87     }
88
89     public Reader getPDFReader(InputStream fileStream) throws IOException {
90         return getPDFReader(fileStream, null);
91     }
92
93     public Reader getPDFReader(InputStream fileStream,
94                                String JavaDoc charSet) throws IOException
95     {
96         Reader reader = null;
97         PDDocument pdfDocument = null;
98         try {
99             pdfDocument = PDDocument.load(fileStream);
100             if(pdfDocument.isEncrypted()) {
101                 //Just try using the default password and move on
102
pdfDocument.decrypt("");
103             }
104             //create a tmp output stream with the size of the content.
105
ByteArrayOutputStream out = new ByteArrayOutputStream();
106             OutputStreamWriter writer = new OutputStreamWriter(out);
107             PDFTextStripper stripper = new PDFTextStripper();
108             stripper.writeText(pdfDocument, writer);
109             writer.close();
110             byte[] contents = out.toByteArray();
111             if ( charSet != null ){
112                 reader = new InputStreamReader(new ByteArrayInputStream(contents),
113                                              charSet);
114             } else {
115                 reader = new InputStreamReader(new ByteArrayInputStream(contents));
116             }
117         }
118         catch( Throwable JavaDoc t )
119         {
120             logger.debug(t);
121             throw new IOException(" Exception occured parsing pdf :" + t);
122         }
123         finally
124         {
125             if( pdfDocument != null )
126             { try {
127                     pdfDocument.close();
128                 } catch ( Throwable JavaDoc t ){
129                 }
130             }
131         }
132         return reader;
133     }
134 }
135
Popular Tags