package org.opencms.search.extractors;

import org.opencms.util.CmsStringUtil;

import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.Section;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hpsf.wellknown.PropertyIDMap;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;

/**
 * Base class for text extractors that read MS Office (OLE 2 / POIFS) documents.<p>
 *
 * The class acts as a {@link POIFSReaderListener} that captures the summary and
 * document summary property sets of a document, and converts the captured values
 * into a map of meta information.<p>
 */
public abstract class A_CmsTextExtractorMsOfficeBase extends A_CmsTextExtractor implements POIFSReaderListener {

    /** Name of the Windows codepage 1252 encoding. */
    protected static final String ENCODING_CP1252 = "Cp1252";

    /** Name of the UTF-16 encoding. */
    protected static final String ENCODING_UTF16 = "UTF-16";

    /** Name of the POIFS reader event for a PowerPoint document stream. */
    protected static final String POWERPOINT_EVENT_NAME = "PowerPoint Document";

    /** Numeric id of the PowerPoint "text byte" atom (presumably a PPT record type - confirm against the subclass using it). */
    protected static final int PPT_TEXTBYTE_ATOM = 4008;

    /** Numeric id of the PowerPoint "text char" atom (presumably a PPT record type - confirm against the subclass using it). */
    protected static final int PPT_TEXTCHAR_ATOM = 4000;

    /** The document summary information captured from the reader events, or <code>null</code>. */
    private DocumentSummaryInformation m_documentSummary;

    /** The summary information captured from the reader events, or <code>null</code>. */
    private SummaryInformation m_summary;

    /**
     * Captures the summary and document summary property sets from the reader events.<p>
     *
     * Only the first matching stream of each kind is stored, all other events are ignored.
     * Reading the meta information is best-effort: a stream that can not be parsed as a
     * property set is skipped silently, so that it does not abort processing of the document.<p>
     *
     * @param event the POIFS reader event to process
     *
     * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent)
     */
    public void processPOIFSReaderEvent(POIFSReaderEvent event) {

        try {
            if ((m_summary == null) && event.getName().startsWith(SummaryInformation.DEFAULT_STREAM_NAME)) {
                // check the actual type instead of relying on a swallowed ClassCastException
                Object propertySet = PropertySetFactory.create(event.getStream());
                if (propertySet instanceof SummaryInformation) {
                    m_summary = (SummaryInformation)propertySet;
                }
                return;
            }
            if ((m_documentSummary == null)
                && event.getName().startsWith(DocumentSummaryInformation.DEFAULT_STREAM_NAME)) {
                Object propertySet = PropertySetFactory.create(event.getStream());
                if (propertySet instanceof DocumentSummaryInformation) {
                    m_documentSummary = (DocumentSummaryInformation)propertySet;
                }
                return;
            }
        } catch (Exception ignored) {
            // intentionally ignored: the meta information is optional, a broken
            // property set stream must not prevent processing of the document
        }
    }

    /**
     * Resets the captured summary and document summary information to <code>null</code>.<p>
     */
    protected void cleanup() {

        m_summary = null;
        m_documentSummary = null;
    }

    /**
     * Creates a map with the meta information read from the captured property sets.<p>
     *
     * The keys are the <code>META_*</code> constants from {@link I_CmsExtractionResult}.
     * Text properties are stored as {@link String} and only if they are not empty,
     * date properties are stored as {@link Date} and only if their time value is
     * greater than 0. A property set without a section, and properties with an
     * unexpected value type, are skipped.<p>
     *
     * @return the map with the extracted meta information, never <code>null</code>
     */
    protected Map extractMetaInformation() {

        Map metaInfo = new HashMap();

        if (m_summary != null) {
            Section section = getFirstSection(m_summary.getSections());
            if (section != null) {
                putString(metaInfo, I_CmsExtractionResult.META_TITLE, section, PropertyIDMap.PID_TITLE);
                putString(metaInfo, I_CmsExtractionResult.META_KEYWORDS, section, PropertyIDMap.PID_KEYWORDS);
                putString(metaInfo, I_CmsExtractionResult.META_SUBJECT, section, PropertyIDMap.PID_SUBJECT);
                putString(metaInfo, I_CmsExtractionResult.META_COMMENTS, section, PropertyIDMap.PID_COMMENTS);
                putString(metaInfo, I_CmsExtractionResult.META_AUTHOR, section, PropertyIDMap.PID_AUTHOR);
                putDate(metaInfo, I_CmsExtractionResult.META_DATE_CREATED, section, PropertyIDMap.PID_CREATE_DTM);
                putDate(
                    metaInfo,
                    I_CmsExtractionResult.META_DATE_LASTMODIFIED,
                    section,
                    PropertyIDMap.PID_LASTSAVE_DTM);
            }
        }

        if (m_documentSummary != null) {
            Section section = getFirstSection(m_documentSummary.getSections());
            if (section != null) {
                putString(metaInfo, I_CmsExtractionResult.META_COMPANY, section, PropertyIDMap.PID_COMPANY);
                putString(metaInfo, I_CmsExtractionResult.META_MANAGER, section, PropertyIDMap.PID_MANAGER);
                putString(metaInfo, I_CmsExtractionResult.META_CATEGORY, section, PropertyIDMap.PID_CATEGORY);
            }
        }

        return metaInfo;
    }

    /**
     * Returns the first section from the given list of sections.<p>
     *
     * @param sections the list of sections of a property set
     *
     * @return the first section, or <code>null</code> if the list is <code>null</code> or empty
     */
    private static Section getFirstSection(List sections) {

        if ((sections == null) || sections.isEmpty()) {
            return null;
        }
        return (Section)sections.get(0);
    }

    /**
     * Stores a date property of the section in the meta information map.<p>
     *
     * Nothing is stored if the property is missing, is not a {@link Date},
     * or has a time value that is not greater than 0.<p>
     *
     * @param metaInfo the map to store the value in
     * @param key the key to store the value with
     * @param section the section to read the property from
     * @param propertyId the id of the property to read
     */
    private static void putDate(Map metaInfo, String key, Section section, int propertyId) {

        Object value = section.getProperty(propertyId);
        if (value instanceof Date) {
            Date date = (Date)value;
            if (date.getTime() > 0) {
                metaInfo.put(key, date);
            }
        }
    }

    /**
     * Stores a text property of the section in the meta information map.<p>
     *
     * Nothing is stored if the property is missing, is not a {@link String}, or is empty.<p>
     *
     * @param metaInfo the map to store the value in
     * @param key the key to store the value with
     * @param section the section to read the property from
     * @param propertyId the id of the property to read
     */
    private static void putString(Map metaInfo, String key, Section section, int propertyId) {

        Object value = section.getProperty(propertyId);
        if (value instanceof String) {
            String meta = (String)value;
            if (CmsStringUtil.isNotEmpty(meta)) {
                metaInfo.put(key, meta);
            }
        }
    }
}