package org.jahia.services.fileextraction;

import java.io.*;
import java.util.*;

import org.jahia.utils.*;
import org.jahia.services.sites.JahiaSitesSlideService;
import org.pdfbox.cos.COSDictionary;
import org.pdfbox.cos.COSName;
import org.pdfbox.cos.COSString;
import org.pdfbox.util.PDFTextStripper;
import org.pdfbox.util.*;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;

import org.apache.slide.util.conf.Configuration;
import org.apache.slide.util.conf.ConfigurationException;

/**
 * {@link FileExtractor} implementation that uses PDFBox to pull the text
 * content, and a configurable subset of the document-information entries,
 * out of a PDF stream.
 *
 * <p>The two <code>getExtractedDocument</code> overloads are
 * <code>synchronized</code>; the other public methods are not.</p>
 */
public class PDFExtractor implements FileExtractor {

    private static final org.apache.log4j.Logger logger =
            org.apache.log4j.Logger.getLogger(PDFExtractor.class);

    /** Path passed to the most recent {@link #getContentAsString} call. */
    private String path = null;

    /** Last-modified value passed to the most recent {@link #getContentAsString} call. */
    private long lastModifed;

    protected List instructions = new ArrayList();

    /** Maps a PDF info-dictionary key name to the extracted-document property name. */
    protected Map propertyMap = new HashMap();

    public PDFExtractor() {
    }

    /**
     * Extracts the text and the mapped document-information properties of a PDF.
     *
     * <p>The property mapping is re-read on every call from the
     * <code>pdf-property-mapping</code> configuration of
     * {@link JahiaSitesSlideService}. Only info-dictionary keys present in
     * that mapping are copied to the result. Values that
     * {@link #getCalendar} can parse as a date are stored as epoch
     * milliseconds (as a string); all others are stored as their raw string.</p>
     *
     * @param path         not used by this method
     * @param lastModified not used by this method
     * @param fileStream   the PDF content
     * @param charSet      charset used to encode and decode the extracted
     *                     text, or <code>null</code> for the platform default
     * @return the extracted document
     * @throws Exception an {@link IOException} (carrying the original failure
     *                   as its cause) if anything goes wrong while parsing
     */
    public synchronized ExtractedDocument getExtractedDocument(
            String path,
            long lastModified,
            InputStream fileStream,
            String charSet)
            throws Exception {

        ExtractedDocumentImpl extDoc = new ExtractedDocumentImpl();

        PDDocument pdfDocument = null;
        try {
            pdfDocument = PDDocument.load(fileStream);

            if (pdfDocument.isEncrypted()) {
                // Try the empty user password.
                pdfDocument.decrypt("");
            }

            ByteArrayOutputStream out = new ByteArrayOutputStream();
            Writer writer = newWriter(out, charSet);
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText(pdfDocument, writer);
            writer.close();

            // toString(String) throws NullPointerException for a null charset
            // name, so fall back to the platform default like the writer does.
            String content = (charSet != null) ? out.toString(charSet) : out.toString();
            extDoc.setContent(content);

            PDDocumentInformation info = pdfDocument.getDocumentInformation();

            this.configure(((JahiaSitesSlideService) JahiaSitesSlideService.getInstance())
                    .getConfiguration().getConfiguration("pdf-property-mapping"));

            COSDictionary dict = info.getDictionary();
            Iterator iterator = dict.keyList().iterator();
            while (iterator.hasNext()) {
                try {
                    COSName key = (COSName) iterator.next();
                    logger.debug("Found Pdf property : key=" + key.getName());

                    if (!propertyMap.containsKey(key.getName())) {
                        continue;
                    }

                    // Scoped per iteration so a property without a value does
                    // not inherit the value of the previous property.
                    String val = null;
                    Calendar cal = this.getCalendar(info, key);
                    if (cal == null) {
                        COSString value =
                                (COSString) info.getDictionary().getDictionaryObject(key);
                        if (value != null) {
                            val = value.getString();
                        }
                    } else {
                        val = String.valueOf(cal.getTimeInMillis());
                    }
                    logger.debug("Found Pdf property : value=" + val);
                    extDoc.setProperty((String) propertyMap.get(key.getName()), val);
                } catch (Throwable t) {
                    // A single unreadable property must not abort the extraction.
                    logger.debug("Error handling pdf properties", t);
                }
            }

        } catch (Throwable t) {
            logger.debug(t);
            throw parseFailure(t);
        } finally {
            closeQuietly(pdfDocument);
        }
        return extDoc;
    }

    /**
     * Same as {@link #getExtractedDocument(String, long, InputStream, String)}
     * with a <code>null</code> charset (platform default).
     */
    public synchronized ExtractedDocument getExtractedDocument(
            String path,
            long lastModified,
            InputStream fileStream)
            throws Exception {
        return this.getExtractedDocument(path, lastModified, fileStream, null);
    }

    /**
     * Same as {@link #getContentAsString(String, long, InputStream, String)}
     * with a <code>null</code> charset (platform default).
     */
    public String getContentAsString(String path, long lastModified,
                                     InputStream fileStream)
            throws Exception {
        return getContentAsString(path, lastModified, fileStream, null);
    }

    /**
     * Extracts the text of a PDF as a single string.
     *
     * <p>Extraction failures are logged at debug level and not propagated.</p>
     *
     * @param path         path of the file, used only in the error log message
     * @param lastModified stored on the instance, otherwise unused here
     * @param fileStream   the PDF content; may be <code>null</code>
     * @param charSet      charset passed to {@link #getPDFReader(InputStream, String)},
     *                     or <code>null</code> for the platform default
     * @return the extracted text, or <code>null</code> if <code>fileStream</code>
     *         is <code>null</code> or the extraction failed
     */
    public String getContentAsString(String path, long lastModified,
                                     InputStream fileStream,
                                     String charSet) throws Exception {
        this.path = path;
        this.lastModifed = lastModified;
        String strVal = null;

        if (fileStream != null) {
            Reader pdfReader = null;
            try {
                long startTime = System.currentTimeMillis();
                pdfReader = this.getPDFReader(fileStream, charSet);
                long elapsedTime = System.currentTimeMillis() - startTime;
                logger.info("Finished pdf extraction with PDFBox in " +
                        elapsedTime + "ms.");

                startTime = System.currentTimeMillis();
                strVal = FileUtils.readerToString(pdfReader);
                elapsedTime = System.currentTimeMillis() - startTime;
                logger.info("Finished reading pdf Reader to String in " +
                        elapsedTime + "ms.");
            } catch (Throwable t) {
                // Log the parameter, not the field: this method is not
                // synchronized, so the field may already belong to another call.
                logger.debug("Error extracting dpdf file " + path, t);
            } finally {
                try {
                    if (pdfReader != null) {
                        pdfReader.close();
                    }
                } catch (Throwable t) {
                    // Best effort: the text has already been read (or has failed).
                    logger.debug("Error closing pdf reader", t);
                }
            }
        }
        return strVal;
    }

    /**
     * Same as {@link #getPDFReader(InputStream, String)} with a
     * <code>null</code> charset (platform default).
     */
    public Reader getPDFReader(InputStream fileStream) throws IOException {
        return getPDFReader(fileStream, null);
    }

    /**
     * Extracts the text of a PDF and returns a reader over it.
     *
     * <p>The whole text is extracted into memory before this method returns;
     * the PDF document is closed before the reader is handed out.</p>
     *
     * @param fileStream the PDF content
     * @param charSet    charset used for both encoding the extracted text and
     *                   decoding it in the returned reader, or
     *                   <code>null</code> for the platform default
     * @return a reader over the extracted text
     * @throws IOException (carrying the original failure as its cause) if
     *                     anything goes wrong while parsing
     */
    public Reader getPDFReader(InputStream fileStream,
                               String charSet) throws IOException {
        Reader reader = null;
        PDDocument pdfDocument = null;
        try {
            pdfDocument = PDDocument.load(fileStream);
            if (pdfDocument.isEncrypted()) {
                // Try the empty user password.
                pdfDocument.decrypt("");
            }
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            // Encode with the same charset the reader below decodes with;
            // otherwise non-ASCII text is corrupted when they differ.
            Writer writer = newWriter(out, charSet);
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText(pdfDocument, writer);
            writer.close();
            byte[] contents = out.toByteArray();
            if (charSet != null) {
                reader = new InputStreamReader(new ByteArrayInputStream(contents),
                        charSet);
            } else {
                reader = new InputStreamReader(new ByteArrayInputStream(contents));
            }
        } catch (Throwable t) {
            logger.debug(t);
            throw parseFailure(t);
        } finally {
            closeQuietly(pdfDocument);
        }
        return reader;
    }

    /**
     * Loads the property mapping: for every <code>instruction</code> child of
     * the given configuration, maps its <code>id</code> attribute to its
     * <code>property</code> attribute in {@link #propertyMap}. Existing
     * entries are kept unless overwritten by the same id.
     *
     * @param configuration the configuration holding the instructions
     * @throws ConfigurationException if a required attribute cannot be read
     */
    public void configure(Configuration configuration) throws ConfigurationException {
        Enumeration instructions = configuration.getConfigurations("instruction");
        while (instructions.hasMoreElements()) {
            Configuration extract = (Configuration) instructions.nextElement();
            String property = extract.getAttribute("property");
            String id = extract.getAttribute("id");
            propertyMap.put(id, property);
        }
    }

    /**
     * Tries to read an info-dictionary entry as a date.
     *
     * <p>Only the leading run of digits of the value (after an optional
     * <code>D:</code> prefix) is handed to {@link DateConverter}; anything
     * after the first non-digit character is ignored.</p>
     *
     * @param info the document information; may be <code>null</code>
     * @param key  the dictionary key; may be <code>null</code>
     * @return the parsed date, or <code>null</code> if an argument is
     *         <code>null</code>, the entry is absent, or the converter
     *         rejects the value
     */
    private Calendar getCalendar(PDDocumentInformation info, COSName key) {
        if (info == null || key == null) {
            return null;
        }
        Calendar retval = null;
        COSString value = (COSString) info.getDictionary().getDictionaryObject(key);

        if (value != null) {
            String strValue = value.getString();
            int index = 0;
            if (strValue.startsWith("D:")) {
                index = 2;
            }
            StringBuffer buff = new StringBuffer("D:");

            for (int i = index; i < strValue.length(); i++) {
                if (Character.isDigit(strValue.charAt(i))) {
                    buff.append(strValue.charAt(i));
                } else {
                    break;
                }
            }
            DateConverter converter = new DateConverter();
            try {
                retval = converter.toCalendar(buff.toString());
            } catch (IOException e) {
                // Not a date: the caller falls back to the raw string value.
                retval = null;
            }
        }
        return retval;
    }

    /** Creates a writer on <code>out</code> using <code>charSet</code>, or the platform default when it is null. */
    private Writer newWriter(OutputStream out, String charSet)
            throws UnsupportedEncodingException {
        if (charSet != null) {
            return new OutputStreamWriter(out, charSet);
        }
        return new OutputStreamWriter(out);
    }

    /** Builds the IOException reported for a parsing failure, keeping <code>t</code> as its cause. */
    private IOException parseFailure(Throwable t) {
        IOException failure = new IOException(" Exception occured parsing pdf :" + t);
        failure.initCause(t);
        return failure;
    }

    /** Closes the document if it is non-null; a failure to close is only logged. */
    private void closeQuietly(PDDocument pdfDocument) {
        if (pdfDocument != null) {
            try {
                pdfDocument.close();
            } catch (Throwable t) {
                logger.debug("Error closing pdf document", t);
            }
        }
    }
}