KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > opencms > search > extractors > CmsExtractorMsPowerPoint


1 /*
2  * File : $Source: /usr/local/cvs/opencms/src/org/opencms/search/extractors/CmsExtractorMsPowerPoint.java,v $
3  * Date : $Date: 2006/03/27 14:53:01 $
4  * Version: $Revision: 1.8 $
5  *
6  * This library is part of OpenCms -
7  * the Open Source Content Management System
8  *
9  * Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
10  *
11  * This library is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public
13  * License as published by the Free Software Foundation; either
14  * version 2.1 of the License, or (at your option) any later version.
15  *
16  * This library is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * Lesser General Public License for more details.
20  *
21  * For further information about Alkacon Software GmbH, please see the
22  * company website: http://www.alkacon.com
23  *
24  * For further information about OpenCms, please see the
25  * project website: http://www.opencms.org
26  *
27  * You should have received a copy of the GNU Lesser General Public
28  * License along with this library; if not, write to the Free Software
29  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30  */

31
32 package org.opencms.search.extractors;
33
34 import org.opencms.i18n.CmsEncoder;
35
36 import java.io.InputStream JavaDoc;
37 import java.util.Map JavaDoc;
38
39 import org.apache.poi.poifs.eventfilesystem.POIFSReader;
40 import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
41 import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
42 import org.apache.poi.poifs.filesystem.DocumentInputStream;
43 import org.apache.poi.util.LittleEndian;
44
45 /**
46  * Extracts the text form an MS PowerPoint document.<p>
47  *
48  * @author Alexander Kandzior
49  *
50  * @version $Revision: 1.8 $
51  *
52  * @since 6.0.0
53  */

54 public final class CmsExtractorMsPowerPoint extends A_CmsTextExtractorMsOfficeBase implements POIFSReaderListener {
55
56     /** The buffer that is written with the content of the PPT. */
57     private StringBuffer JavaDoc m_buffer;
58
59     /**
60      * Hide the public constructor.<p>
61      */

62     private CmsExtractorMsPowerPoint() {
63
64         m_buffer = new StringBuffer JavaDoc(4096);
65     }
66
67     /**
68      * Returns an instance of this text extractor.<p>
69      *
70      * @return an instance of this text extractor
71      */

72     public static I_CmsTextExtractor getExtractor() {
73
74         // since this extractor requires a member variable we have no static instance
75
return new CmsExtractorMsPowerPoint();
76     }
77
78     /**
79      * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String)
80      */

81     public I_CmsExtractionResult extractText(InputStream JavaDoc in, String JavaDoc encoding) throws Exception JavaDoc {
82
83         POIFSReader reader = new POIFSReader();
84         reader.registerListener(this);
85         reader.read(in);
86         
87         // extract all information
88
Map JavaDoc metaInfo = extractMetaInformation();
89         String JavaDoc result = removeControlChars(m_buffer.toString());
90
91         // free some memory
92
m_buffer = new StringBuffer JavaDoc(4096);
93         cleanup();
94
95         // return the final result
96
return new CmsExtractionResult(result, metaInfo);
97     }
98
99     /**
100      * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent)
101      */

102     public void processPOIFSReaderEvent(POIFSReaderEvent event) {
103
104         try {
105
106             // super implementation handles document summary
107
super.processPOIFSReaderEvent(event);
108
109             // make sue this is a PPT document
110
if (!event.getName().startsWith(POWERPOINT_EVENT_NAME)) {
111                 return;
112             }
113
114             DocumentInputStream input = event.getStream();
115             byte[] buffer = new byte[input.available()];
116             input.read(buffer, 0, input.available());
117
118             for (int i = 0; i < buffer.length - 20; i++) {
119                 int type = LittleEndian.getUShort(buffer, i + 2);
120                 int size = (int)LittleEndian.getUInt(buffer, i + 4) + 3;
121
122                 String JavaDoc encoding = null;
123                 switch (type) {
124                     case PPT_TEXTBYTE_ATOM:
125                         // this pice is single-byte encoded, let's assume Cp1252 since this is most likley
126
// anyone who knows how to find out the "right" encoding - please email me
127
encoding = ENCODING_CP1252;
128                     case PPT_TEXTCHAR_ATOM:
129                         if (encoding == null) {
130                             // this piece is double-byte encoded, use UTF-16
131
encoding = ENCODING_UTF16;
132                         }
133                         int start = i + 4 + 1;
134                         int end = start + size;
135
136                         byte[] buf = new byte[size];
137                         System.arraycopy(buffer, start, buf, 0, buf.length);
138
139                         m_buffer.append(CmsEncoder.createString(buf, encoding));
140                         i = end;
141                     default:
142                 // noop
143
}
144             }
145         } catch (Exception JavaDoc e) {
146             // ignore
147
}
148     }
149 }
Popular Tags