KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > opencms > search > extractors > A_CmsTextExtractorMsOfficeBase


1 /*
2  * File : $Source: /usr/local/cvs/opencms/src/org/opencms/search/extractors/A_CmsTextExtractorMsOfficeBase.java,v $
3  * Date : $Date: 2005/07/29 10:35:06 $
4  * Version: $Revision: 1.7 $
5  *
6  * This library is part of OpenCms -
7  * the Open Source Content Management System
8  *
9  * Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
10  *
11  * This library is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public
13  * License as published by the Free Software Foundation; either
14  * version 2.1 of the License, or (at your option) any later version.
15  *
16  * This library is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * Lesser General Public License for more details.
20  *
21  * For further information about Alkacon Software GmbH, please see the
22  * company website: http://www.alkacon.com
23  *
24  * For further information about OpenCms, please see the
25  * project website: http://www.opencms.org
26  *
27  * You should have received a copy of the GNU Lesser General Public
28  * License along with this library; if not, write to the Free Software
29  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30  */

31
32 package org.opencms.search.extractors;
33
34 import org.opencms.util.CmsStringUtil;
35
36 import java.util.Date JavaDoc;
37 import java.util.HashMap JavaDoc;
38 import java.util.Map JavaDoc;
39
40 import org.apache.poi.hpsf.DocumentSummaryInformation;
41 import org.apache.poi.hpsf.PropertySetFactory;
42 import org.apache.poi.hpsf.Section;
43 import org.apache.poi.hpsf.SummaryInformation;
44 import org.apache.poi.hpsf.wellknown.PropertyIDMap;
45 import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
46 import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
47
48 /**
49  * Base class to extract summary information from MS office documents.<p>
50  *
51  * @author Alexander Kandzior
52  *
53  * @version $Revision: 1.7 $
54  *
55  * @since 6.0.0
56  */

57 public abstract class A_CmsTextExtractorMsOfficeBase extends A_CmsTextExtractor implements POIFSReaderListener {
58
59     /** Windows Cp1252 endocing (western europe) is used as default for single byte fields. */
60     protected static final String JavaDoc ENCODING_CP1252 = "Cp1252";
61
62     /** UTF-16 encoding is used for double byte fields. */
63     protected static final String JavaDoc ENCODING_UTF16 = "UTF-16";
64
65     /** Event event name for a MS PowerPoint document. */
66     protected static final String JavaDoc POWERPOINT_EVENT_NAME = "PowerPoint Document";
67
68     /** PPT text byte atom. */
69     protected static final int PPT_TEXTBYTE_ATOM = 4008;
70
71     /** PPT text char atom. */
72     protected static final int PPT_TEXTCHAR_ATOM = 4000;
73
74     /** The summary of the POI document. */
75     private DocumentSummaryInformation m_documentSummary;
76
77     /** The summary of the POI document. */
78     private SummaryInformation m_summary;
79
80     /**
81      * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent)
82      */

83     public void processPOIFSReaderEvent(POIFSReaderEvent event) {
84
85         try {
86             if ((m_summary == null) && event.getName().startsWith(SummaryInformation.DEFAULT_STREAM_NAME)) {
87                 m_summary = (SummaryInformation)PropertySetFactory.create(event.getStream());
88                 return;
89             }
90             if ((m_documentSummary == null)
91                 && event.getName().startsWith(DocumentSummaryInformation.DEFAULT_STREAM_NAME)) {
92                 m_documentSummary = (DocumentSummaryInformation)PropertySetFactory.create(event.getStream());
93                 return;
94             }
95         } catch (Exception JavaDoc e) {
96             // ignore
97
}
98     }
99
100     /**
101      * Cleans up some internal memory.<p>
102      */

103     protected void cleanup() {
104
105         m_summary = null;
106         m_documentSummary = null;
107     }
108
109     /**
110      * Returns a map with the extracted meta information from the document.<p>
111      *
112      * @return a map with the extracted meta information from the document
113      */

114     protected Map JavaDoc extractMetaInformation() {
115
116         Map JavaDoc metaInfo = new HashMap JavaDoc();
117         String JavaDoc meta;
118         if (m_summary != null) {
119             // can't use convenience methods on summary since they can't deal with multiple sections
120
Section section = (Section)m_summary.getSections().get(0);
121
122             meta = (String JavaDoc)section.getProperty(PropertyIDMap.PID_TITLE);
123             if (CmsStringUtil.isNotEmpty(meta)) {
124                 metaInfo.put(I_CmsExtractionResult.META_TITLE, meta);
125             }
126             meta = (String JavaDoc)section.getProperty(PropertyIDMap.PID_KEYWORDS);
127             if (CmsStringUtil.isNotEmpty(meta)) {
128                 metaInfo.put(I_CmsExtractionResult.META_KEYWORDS, meta);
129             }
130             meta = (String JavaDoc)section.getProperty(PropertyIDMap.PID_SUBJECT);
131             if (CmsStringUtil.isNotEmpty(meta)) {
132                 metaInfo.put(I_CmsExtractionResult.META_SUBJECT, meta);
133             }
134             meta = (String JavaDoc)section.getProperty(PropertyIDMap.PID_COMMENTS);
135             if (CmsStringUtil.isNotEmpty(meta)) {
136                 metaInfo.put(I_CmsExtractionResult.META_COMMENTS, meta);
137             }
138             // extract other available meta information
139
meta = (String JavaDoc)section.getProperty(PropertyIDMap.PID_AUTHOR);
140             if (CmsStringUtil.isNotEmpty(meta)) {
141                 metaInfo.put(I_CmsExtractionResult.META_AUTHOR, meta);
142             }
143             Date JavaDoc date;
144             date = (Date JavaDoc)section.getProperty(PropertyIDMap.PID_CREATE_DTM);
145             if ((date != null) && (date.getTime() > 0)) {
146                 // it's unlikley any PowerPoint documents where created before 1970,
147
// and apparently POI contains an issue calculating the time correctly sometimes
148
metaInfo.put(I_CmsExtractionResult.META_DATE_CREATED, date);
149             }
150             date = (Date JavaDoc)section.getProperty(PropertyIDMap.PID_LASTSAVE_DTM);
151             if ((date != null) && (date.getTime() > 0)) {
152                 metaInfo.put(I_CmsExtractionResult.META_DATE_LASTMODIFIED, date);
153             }
154         }
155         if (m_documentSummary != null) {
156             // can't use convenience methods on document since they can't deal with multiple sections
157
Section section = (Section)m_documentSummary.getSections().get(0);
158
159             // extract available meta information from document summary
160
meta = (String JavaDoc)section.getProperty(PropertyIDMap.PID_COMPANY);
161             if (CmsStringUtil.isNotEmpty(meta)) {
162                 metaInfo.put(I_CmsExtractionResult.META_COMPANY, meta);
163             }
164             meta = (String JavaDoc)section.getProperty(PropertyIDMap.PID_MANAGER);
165             if (CmsStringUtil.isNotEmpty(meta)) {
166                 metaInfo.put(I_CmsExtractionResult.META_MANAGER, meta);
167             }
168             meta = (String JavaDoc)section.getProperty(PropertyIDMap.PID_CATEGORY);
169             if (CmsStringUtil.isNotEmpty(meta)) {
170                 metaInfo.put(I_CmsExtractionResult.META_CATEGORY, meta);
171             }
172         }
173
174         return metaInfo;
175     }
176 }
Popular Tags