1 31 32 package org.opencms.search.extractors; 33 34 import org.opencms.util.CmsStringUtil; 35 36 import java.io.InputStream ; 37 import java.util.HashMap ; 38 import java.util.Map ; 39 40 import org.pdfbox.encryption.DocumentEncryption; 41 import org.pdfbox.pdfparser.PDFParser; 42 import org.pdfbox.pdmodel.PDDocument; 43 import org.pdfbox.pdmodel.PDDocumentInformation; 44 import org.pdfbox.util.PDFTextStripper; 45 46 55 public final class CmsExtractorPdf extends A_CmsTextExtractor { 56 57 58 private static final CmsExtractorPdf INSTANCE = new CmsExtractorPdf(); 59 60 63 private CmsExtractorPdf() { 64 65 } 67 68 73 public static I_CmsTextExtractor getExtractor() { 74 75 return INSTANCE; 76 } 77 78 81 public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception { 82 83 PDDocument pdfDocument = null; 84 85 try { 86 PDFParser parser = new PDFParser(in); 87 parser.parse(); 88 89 pdfDocument = parser.getPDDocument(); 90 91 if (pdfDocument.isEncrypted()) { 93 DocumentEncryption decryptor = new DocumentEncryption(pdfDocument); 94 decryptor.decryptDocument(""); 96 } 97 98 PDFTextStripper stripper = new PDFTextStripper(); 100 PDDocumentInformation info = pdfDocument.getDocumentInformation(); 101 102 Map metaInfo = new HashMap (); 103 String meta; 105 meta = info.getTitle(); 106 if (CmsStringUtil.isNotEmpty(meta)) { 107 metaInfo.put(I_CmsExtractionResult.META_TITLE, meta); 108 } 109 meta = info.getKeywords(); 110 if (CmsStringUtil.isNotEmpty(meta)) { 111 metaInfo.put(I_CmsExtractionResult.META_KEYWORDS, meta); 112 } 113 meta = info.getSubject(); 114 if (CmsStringUtil.isNotEmpty(meta)) { 115 metaInfo.put(I_CmsExtractionResult.META_SUBJECT, meta); 116 } 117 meta = info.getAuthor(); 119 if (CmsStringUtil.isNotEmpty(meta)) { 120 metaInfo.put(I_CmsExtractionResult.META_AUTHOR, meta); 121 } 122 meta = info.getCreator(); 123 if (CmsStringUtil.isNotEmpty(meta)) { 124 metaInfo.put(I_CmsExtractionResult.META_CREATOR, meta); 125 } 126 meta = info.getProducer(); 127 if (CmsStringUtil.isNotEmpty(meta)) { 128 metaInfo.put(I_CmsExtractionResult.META_PRODUCER, meta); 129 } 130 if (info.getCreationDate() != null) { 131 metaInfo.put(I_CmsExtractionResult.META_DATE_CREATED, info.getCreationDate().getTime()); 132 } 133 if (info.getModificationDate() != null) { 134 metaInfo.put(I_CmsExtractionResult.META_DATE_LASTMODIFIED, info.getModificationDate().getTime()); 135 } 136 137 String result = stripper.getText(pdfDocument); 139 140 stripper = null; 142 info = null; 143 144 return new CmsExtractionResult(result, metaInfo); 146 147 } finally { 148 if (pdfDocument != null) { 149 pdfDocument.close(); 150 } 151 } 152 } 153 } 154 | Popular Tags |