KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > jahia > services > fileextraction > PDFExtractor


1 package org.jahia.services.fileextraction;
2
3 import java.io.*;
4 import java.util.*;
5
6 import org.jahia.utils.*;
7 import org.jahia.services.sites.JahiaSitesSlideService;
8 import org.pdfbox.cos.COSDictionary;
9 import org.pdfbox.cos.COSName;
10 import org.pdfbox.cos.COSString;
11 import org.pdfbox.util.PDFTextStripper;
12 import org.pdfbox.util.*;
13 import org.pdfbox.pdmodel.PDDocument;
14 import org.pdfbox.pdmodel.PDDocumentInformation;
15
16 import org.apache.slide.util.conf.Configuration;
17 import org.apache.slide.util.conf.ConfigurationException;
18
19
20 /**
21  * Created by IntelliJ IDEA.
22  * User: toto
23  * Date: Aug 23, 2003
24  * Time: 3:52:46 AM
25  * To change this template use Options | File Templates.
26  */

27 public class PDFExtractor implements FileExtractor {
28
29     private static org.apache.log4j.Logger logger =
30             org.apache.log4j.Logger.getLogger (PDFExtractor.class);
31
32     private String JavaDoc path = null;
33     private long lastModifed;
34
35     protected List instructions = new ArrayList();
36     protected Map propertyMap = new HashMap();
37
38     public PDFExtractor(){
39     }
40
41     /**
42      *
43      * @param path String
44      * @param lastModified long
45      * @param fileStream InputStream
46      * @throws Exception
47      * @return String
48      */

49     public synchronized ExtractedDocument getExtractedDocument(
50                                             String JavaDoc path,
51                                             long lastModified,
52                                             InputStream fileStream,
53                                             String JavaDoc charSet)
54     throws Exception JavaDoc{
55
56         ExtractedDocumentImpl extDoc = new ExtractedDocumentImpl();
57
58         PDDocument pdfDocument = null;
59         try
60         {
61             pdfDocument = PDDocument.load( fileStream );
62
63             if( pdfDocument.isEncrypted() )
64             {
65                 //Just try using the default password and move on
66
pdfDocument.decrypt( "" );
67             }
68
69             //create a tmp output stream with the size of the content.
70
ByteArrayOutputStream out = new ByteArrayOutputStream();
71             OutputStreamWriter writer = null;
72             if ( charSet != null ) {
73                 writer = new OutputStreamWriter(out,charSet);
74             } else {
75                 writer = new OutputStreamWriter(out);
76             }
77             PDFTextStripper stripper = new PDFTextStripper();
78             stripper.writeText(pdfDocument, writer);
79             writer.close();
80
81             String JavaDoc content = out.toString(charSet);
82             if ( content == null ){
83                 content = "";
84             }
85             extDoc.setContent(content);
86
87             PDDocumentInformation info = pdfDocument.getDocumentInformation();
88             /*
89             extDoc.setProperty("Title",info.getTitle());
90             extDoc.setProperty("Author",info.getAuthor());
91             extDoc.setProperty("Creator",info.getCreator());
92             extDoc.setProperty("Subject",info.getSubject());
93             extDoc.setProperty("Keywords",info.getKeywords());
94             extDoc.setProperty("Producer",info.getProducer());
95             Calendar cal = info.getCreationDate();
96             if ( cal != null ){
97                 extDoc.setProperty("CreationDate",String.valueOf(cal.getTimeInMillis()));
98             }
99             cal = info.getModificationDate();
100             if ( cal != null ){
101                 extDoc.setProperty("ModdificationDate",String.valueOf(cal.getTimeInMillis()));
102             }
103             */

104             /*
105             int summarySize = Math.min( contents.length, 500 );
106             // Add the summary as an UnIndexed field, so that it is stored and returned
107             // with hit documents for display.
108             extDoc.setProperty("Summary", content.substring(0, summarySize) );
109             */

110
111             this.configure(((JahiaSitesSlideService)JahiaSitesSlideService.getInstance()).getConfiguration().getConfiguration("pdf-property-mapping"));
112
113             COSDictionary dict = info.getDictionary();
114             Iterator iterator = dict.keyList().iterator();
115             COSName key = null;
116             Calendar cal = null;
117             String JavaDoc val = null;
118             while ( iterator.hasNext() ){
119                 try {
120                     key = (COSName)iterator.next();
121                     logger.debug("Found Pdf property : key=" + key.getName());
122
123                     if ( !propertyMap.containsKey(key.getName()) ) {
124                         continue;
125                     }
126
127                     cal = this.getCalendar(info,key);
128                     if ( cal == null ){
129                         COSString value = (COSString)info.getDictionary().getDictionaryObject( key );
130                         if( value != null )
131                         {
132                             val = value.getString();
133                         }
134                     } else {
135                         val = String.valueOf(cal.getTimeInMillis());
136                     }
137                     logger.debug("Found Pdf property : value=" + val);
138                     extDoc.setProperty((String JavaDoc)propertyMap.get(key.getName()),val);
139                 } catch ( Throwable JavaDoc t) {
140                     logger.debug("Error handling pdf properties", t);
141                 }
142             }
143
144         } catch( Throwable JavaDoc t ) {
145             logger.debug(t);
146             throw new IOException(" Exception occured parsing pdf :" + t);
147         } finally {
148             if( pdfDocument != null )
149             { try {
150                     pdfDocument.close();
151                 } catch ( Throwable JavaDoc t ){
152                 }
153             }
154         }
155         return extDoc;
156     }
157
158     /**
159      *
160      * @param path String
161      * @param lastModified long
162      * @param fileStream InputStream
163      * @throws Exception
164      * @return String
165      */

166     public synchronized ExtractedDocument getExtractedDocument(
167                                              String JavaDoc path,
168                                              long lastModified,
169                                              InputStream fileStream)
170     throws Exception JavaDoc{
171         return this.getExtractedDocument(path, lastModified, fileStream, null);
172     }
173
174     /**
175      *
176      * @param path String
177      * @param lastModified long
178      * @param fileStream InputStream
179      * @throws Exception
180      * @return String
181      */

182     public String JavaDoc getContentAsString(String JavaDoc path, long lastModified,
183                                      InputStream fileStream)
184     throws Exception JavaDoc {
185        return getContentAsString(path, lastModified, fileStream, null);
186     }
187
188     /**
189      *
190      * @param path String
191      * @param lastModified long
192      * @param fileStream InputStream
193      * @param charSet String
194      * @throws Exception
195      * @return String
196      */

197     public String JavaDoc getContentAsString(String JavaDoc path, long lastModified,
198                                      InputStream fileStream,
199                                      String JavaDoc charSet) throws Exception JavaDoc {
200         this.path = path;
201         this.lastModifed = lastModified;
202         String JavaDoc strVal = null;
203
204         if (fileStream != null) {
205              Reader pdfReader = null;
206              try {
207                  long startTime = System.currentTimeMillis();
208                  pdfReader = this.getPDFReader(fileStream, charSet);
209                  long elapsedTime = System.currentTimeMillis() - startTime;
210                  logger.info("Finished pdf extraction with PDFBox in " +
211                              elapsedTime + "ms.");
212
213                  startTime = System.currentTimeMillis();
214                  strVal = FileUtils.readerToString(
215                      pdfReader);
216                  elapsedTime = System.currentTimeMillis() - startTime;
217                  logger.info("Finished reading pdf Reader to String in " +
218                              elapsedTime + "ms.");
219              }
220              catch (Throwable JavaDoc t) {
221                  logger.debug("Error extracting dpdf file " + this.path ,t);
222              }
223              finally {
224                  try {
225                      if (pdfReader != null) {
226                          pdfReader.close();
227                      }
228                  }
229                  catch (Throwable JavaDoc t) {
230                  }
231              }
232          }
233          return strVal;
234     }
235
236     public Reader getPDFReader(InputStream fileStream) throws IOException {
237         return getPDFReader(fileStream, null);
238     }
239
240     public Reader getPDFReader(InputStream fileStream,
241                                String JavaDoc charSet) throws IOException
242     {
243         Reader reader = null;
244         PDDocument pdfDocument = null;
245         try {
246             pdfDocument = PDDocument.load(fileStream);
247             if(pdfDocument.isEncrypted()) {
248                 //Just try using the default password and move on
249
pdfDocument.decrypt("");
250             }
251             //create a tmp output stream with the size of the content.
252
ByteArrayOutputStream out = new ByteArrayOutputStream();
253             OutputStreamWriter writer = new OutputStreamWriter(out);
254             PDFTextStripper stripper = new PDFTextStripper();
255             stripper.writeText(pdfDocument, writer);
256             writer.close();
257             byte[] contents = out.toByteArray();
258             if ( charSet != null ){
259                 reader = new InputStreamReader(new ByteArrayInputStream(contents),
260                                              charSet);
261             } else {
262                 reader = new InputStreamReader(new ByteArrayInputStream(contents));
263             }
264         }
265         catch( Throwable JavaDoc t )
266         {
267             logger.debug(t);
268             throw new IOException(" Exception occured parsing pdf :" + t);
269         }
270         finally
271         {
272             if( pdfDocument != null )
273             { try {
274                     pdfDocument.close();
275                 } catch ( Throwable JavaDoc t ){
276                 }
277             }
278         }
279         return reader;
280     }
281
282     public void configure(Configuration configuration) throws ConfigurationException {
283         Enumeration instructions = configuration.getConfigurations("instruction");
284         while (instructions.hasMoreElements()) {
285             Configuration extract = (Configuration)instructions.nextElement();
286             String JavaDoc property = extract.getAttribute("property");
287             String JavaDoc id = extract.getAttribute("id");
288             propertyMap.put(id, property);
289         }
290     }
291
292     /**
293      * This will get a date item from the dictionary.
294      *
295      * @param key The key to the date item.
296      *
297      * @return The value if it exists or null.
298      */

299     private Calendar getCalendar(PDDocumentInformation info, COSName key){
300         if ( info == null || key == null ){
301             return null;
302         }
303         Calendar retval = null;
304         COSString value = (COSString)info.getDictionary().getDictionaryObject( key );
305
306         if( value != null )
307         {
308             //lets first verify that the string is valid.
309
String JavaDoc strValue = value.getString();
310             int index = 0;
311             if( strValue.startsWith( "D:" ) )
312             {
313                 index = 2;
314             }
315             StringBuffer JavaDoc buff = new StringBuffer JavaDoc("D:");
316
317             //boolean validDate = true;
318
for( int i=index; i<strValue.length(); i++ )
319             {
320                 if ( Character.isDigit( strValue.charAt( i ) ) ){
321                     buff.append( strValue.charAt( i ) );
322                 } else {
323                     break;
324                 }
325                 //validDate = validDate && Character.isDigit( strValue.charAt( i ) );
326
}
327             //if( validDate )
328
//{
329
DateConverter converter = new DateConverter();
330                 try
331                 {
332                     retval = converter.toCalendar( buff.toString() );
333                 }
334                 catch( IOException e )
335                 {
336                     retval = null;
337                 }
338             //}
339
}
340         return retval;
341     }
342 }
343
Popular Tags