KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > blandware > atleap > common > parsers > PlainTextExtractor


/*
 * Copyright 2005 Blandware (http://www.blandware.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.blandware.atleap.common.parsers;

import com.blandware.atleap.common.Constants;
import com.blandware.atleap.common.parsers.excel.ExcelPlainTextExtractor;
import com.blandware.atleap.common.parsers.exception.PlainTextExtractorException;
import com.blandware.atleap.common.parsers.exception.UnsupportedMimeTypeException;
import com.blandware.atleap.common.parsers.html.HTMLPlainTextExtractor;
import com.blandware.atleap.common.parsers.pdf.PDFPlainTextExtractor;
import com.blandware.atleap.common.parsers.ppt.PowerPointPlainTextExtractor;
import com.blandware.atleap.common.parsers.rtf.RTFPlainTextExtractor;
import com.blandware.atleap.common.parsers.txt.TXTPlainTextExtractor;
import com.blandware.atleap.common.parsers.word.WordPlainTextExtractor;
import com.blandware.atleap.common.parsers.xml.XMLPlainTextExtractor;

import java.io.*;
31
32 /**
33  * <p>
34  * Used to extract a plain text from formatted documents. Currently the
35  * following formats supported: MS Word, MS Excel Workbook, MS PowerPoint (97+),
36  * RTF, PDF, HTML, XML and plain text.
37  * </p>
38  * <p>
39  * To use, first instantiate with <code>new PlainTextExtractor()</code>, then
40  * call the <code>extract</code> method.
41  * </p>
42  * <p>
43  * One common ruleset is applied to figure out encoding to use during extracting
44  * (first matched rule is used to obtain encoding):
45  * <ol>
46  * <li>
47  * If encoding doesn't make sense for this format, it's totally ignored.
48  * </li>
49  * <li>
50  * If extractor finds out the encoding from document by itself (for example,
51  * HTML files often contain such information), it uses it.
52  * </li>
53  * <li>
54  * If encoding is given to extractor as a parameter to <code>extract</code>
55  * method, it's used.
56  * </li>
57  * <li>
58  * Otherwise default encoding is used (it's currently
59  * <code>Constants.DEFAULT_ENCODING</code>)
60  * </li>
61  * </ol>
62  * </p>
63  *
64  * @author Roman Puchkovskiy <a HREF="mailto:roman.puchkovskiy@blandware.com">
65  * &lt;roman.puchkovskiy@blandware.com&gt;</a>
66  * @version $Revision: 1.4 $ $Date: 2005/08/14 12:27:54 $
67  */

68 public class PlainTextExtractor {
69
70     /*
71      * Internal field used to store encoding that was used by extractor during
72      * extracting process
73      */

74     protected String JavaDoc usedEncoding = null;
75
76     /**
77      * Constructs new PlainTextExtractor instance
78      */

79     public PlainTextExtractor() {}
80
81     /**
82      * Extracts a plain text from a formatted document to a given writer.
83      *
84      * @param input the stream that supplies the document
85      * @param mimeType the mime type of the document
86      * @param output the writer which will accept the extracted text
87      * @param encoding the encoding of the document in the stream. If the
88      * <code>encoding</code> is <code>null</code>, then the extractor uses
89      * its default encoding (currently all extractors use
90      * <code>Constants.DEFAULT_ENCODING</code>).
91      * @throws UnsupportedMimeTypeException throwed when a given mime type is
92      * not supported
93      * @throws PlainTextExtractorException any other exception raised during
94      * extracting
95      */

96     public void extract(InputStream input, String JavaDoc mimeType, Writer output,
97                         String JavaDoc encoding) throws UnsupportedMimeTypeException,
98             PlainTextExtractorException {
99         SpecificPlainTextExtractor extractor = null;
100
101         if (mimeType == null) {
102             throw new IllegalArgumentException JavaDoc("mimeType parameter is null");
103         }
104         if (mimeType.equals("application/msword")) {
105             extractor = new WordPlainTextExtractor();
106         } else if (mimeType.equals("application/vnd.ms-excel")) {
107             extractor = new ExcelPlainTextExtractor();
108         } else if (mimeType.equals("application/vnd.ms-powerpoint")) {
109             extractor = new PowerPointPlainTextExtractor();
110         } else if (mimeType.equals("application/pdf")) {
111             extractor = new PDFPlainTextExtractor();
112         } else if (mimeType.equals("application/rtf")) {
113             extractor = new RTFPlainTextExtractor();
114         } else if (mimeType.equals("text/html")) {
115             extractor = new HTMLPlainTextExtractor();
116         } else if (mimeType.equals("application/xhtml+xml")) {
117             extractor = new HTMLPlainTextExtractor();
118         } else if (mimeType.equals("text/xml")) {
119             extractor = new XMLPlainTextExtractor();
120         } else if (mimeType.equals("text/plain")) {
121             extractor = new TXTPlainTextExtractor();
122         } else {
123             throw new UnsupportedMimeTypeException("This mimeType is not supported: "
124                                                    + mimeType);
125         }
126         if (extractor != null) {
127             extractor.extract(input, output, encoding);
128             usedEncoding = extractor.getUsedEncoding();
129         } else {
130             usedEncoding = null;
131         }
132     }
133
134     /**
135      * Extracts a plain text from a formatted document and returns it as a
136      * string.
137      *
138      * @param input the stream that supplies the document
139      * @param mimeType the mime type of the document
140      * @param encoding the encoding of the document in the stream. If the
141      * <code>encoding</code> is <code>null</code>, then the extractor uses
142      * its default encoding (currently all extractors use
143      * <code>Constants.DEFAULT_ENCODING</code>).
144      * @return the extracted text as a string
145      * @throws UnsupportedMimeTypeException throwed when a given mime type is
146      * not supported
147      * @throws PlainTextExtractorException any other exception raised during
148      * extracting
149      */

150     public String JavaDoc extract(InputStream input, String JavaDoc mimeType, String JavaDoc encoding)
151             throws UnsupportedMimeTypeException, PlainTextExtractorException {
152         StringWriter writer = new StringWriter();
153         extract(input, mimeType, writer, encoding);
154         return writer.toString();
155     }
156
157     /**
158      * Extracts a plain text from a formatted document to a given writer. The
159      * document is assumed to have the default encoding.
160      *
161      * @param input the stream that supplies the document
162      * @param mimeType the mime type of the document
163      * @param output the writer which will accept the extracted text
164      * @throws UnsupportedMimeTypeException throwed when a given mime type is
165      * not supported
166      * @throws PlainTextExtractorException any other exception raised during
167      * extracting
168      */

169     public void extract(InputStream input, String JavaDoc mimeType, Writer output)
170             throws UnsupportedMimeTypeException, PlainTextExtractorException {
171         extract(input, mimeType, output, null);
172     }
173
174     /**
175      * Extracts a plain text from a formatted document and returns it as a string.
176      * The document is assumed to have the default encoding.
177      *
178      * @param input the stream that supplies the document
179      * @param mimeType the mime type of the document
180      * @return the extracted text as a string
181      * @throws UnsupportedMimeTypeException throwed when a given mime type is
182      * not supported
183      * @throws PlainTextExtractorException any other exception raised during
184      * extracting
185      */

186     public String JavaDoc extract(InputStream input, String JavaDoc mimeType)
187             throws UnsupportedMimeTypeException, PlainTextExtractorException {
188         return extract(input, mimeType, (String JavaDoc)null);
189     }
190
191     /**
192      * Extracts a plain text from a formatted document to a given writer. The
193      * document is given as a <code>String</code>.
194      *
195      * @param input the string that supplies the document
196      * @param mimeType the mime type of the document
197      * @param output the writer which will accept the extracted text
198      * @param encoding the encoding of the document in the stream. If the
199      * <code>encoding</code> is <code>null</code>, then the extractor uses
200      * its default encoding (currently all extractors use
201      * <code>Constants.DEFAULT_ENCODING</code>). Also, that encoding is used to
202      * convert an input string to a byte stream (if not given, it is again
203      * assumed to be <code>Constants.DEFAULT_ENCODING</code>).
204      * @throws UnsupportedMimeTypeException throwed when a given mime type is
205      * not supported
206      * @throws PlainTextExtractorException any other exception raised during
207      * extracting
208      */

209     public void extract(String JavaDoc input, String JavaDoc mimeType, Writer output,
210                         String JavaDoc encoding) throws UnsupportedMimeTypeException,
211             PlainTextExtractorException {
212         try {
213             extract(stringToInputStream(input, encoding), mimeType,
214                     output, encoding);
215         } catch (UnsupportedEncodingException e) {
216             throw new PlainTextExtractorException(e);
217         }
218     }
219
220     /**
221      * Extracts a plain text from a formatted document and returns it as a
222      * string. The document is given as a <code>String</code>.
223      *
224      * @param input the string that supplies the document
225      * @param mimeType the mime type of the document
226      * @param encoding the encoding of the document in the stream. If the
227      * <code>encoding</code> is <code>null</code>, then the extractor uses
228      * its default encoding (currently all extractors use
229      * <code>Constants.DEFAULT_ENCODING</code>). Also, that encoding is used to
230      * convert an input string to a byte stream (if not given, it is again
231      * assumed to be <code>Constants.DEFAULT_ENCODING</code>).
232      * @return the extracted text as a string
233      * @throws UnsupportedMimeTypeException throwed when a given mime type is
234      * not supported
235      * @throws PlainTextExtractorException any other exception raised during
236      * extracting
237      */

238     public String JavaDoc extract(String JavaDoc input, String JavaDoc mimeType, String JavaDoc encoding)
239             throws UnsupportedMimeTypeException, PlainTextExtractorException {
240         try {
241             return extract(stringToInputStream(input, encoding), mimeType,
242                            encoding);
243         } catch (UnsupportedEncodingException e) {
244             throw new PlainTextExtractorException(e);
245         }
246     }
247
248     /**
249      * Extracts a plain text from a formatted document to a given writer. The
250      * document is assumed to have the default encoding. The document is given
251      * as a <code>String</code>, which is decoded to bytes using default
252      * encoding too.
253      *
254      * @param input the string that supplies the document
255      * @param mimeType the mime type of the document
256      * @param output the writer which will accept the extracted text
257      * @throws UnsupportedMimeTypeException throwed when a given mime type is
258      * not supported
259      * @throws PlainTextExtractorException any other exception raised during
260      * extracting
261      */

262     public void extract(String JavaDoc input, String JavaDoc mimeType, Writer output)
263             throws UnsupportedMimeTypeException, PlainTextExtractorException {
264         extract(input, mimeType, output, null);
265     }
266
267     /**
268      * Extracts a plain text from a formatted document and returns it as a string.
269      * The document is assumed to have the default encoding. The document
270      * is given as a <code>String</code>, which is decoded to bytes using
271      * default encoding too.
272      *
273      * @param input the string that supplies the document
274      * @param mimeType the mime type of the document
275      * @return the extracted text as a string
276      * @throws UnsupportedMimeTypeException throwed when a given mime type is
277      * not supported
278      * @throws PlainTextExtractorException any other exception raised during
279      * extracting
280      */

281     public String JavaDoc extract(String JavaDoc input, String JavaDoc mimeType)
282             throws UnsupportedMimeTypeException, PlainTextExtractorException {
283         return extract(input, mimeType, (String JavaDoc) null);
284     }
285
286     /**
287      * <p>
288      * Returns encoding that was used for extracting. If encoding has no sense
289      * for particular document format or it's unknown for extractor, returns
290      * <code>null</code>.
291      * </p>
292      * <p>
293      * This method should be called after calling <code>extract</code>; before
294      * it this method may return anything.
295      * </p>
296      *
297      * @return encoding used or <code>null</code>
298      */

299     public String JavaDoc getUsedEncoding() {
300         return usedEncoding;
301     }
302
303     /**
304      * Converts a <code>String</code> to an <code>InputStream</code> using given
305      * <code>encoding</code>. If the <code>encoding</code> is <code>null</code>,
306      * it's assumed to be a default encoding
307      * (<code>Constants.DEFAULT_ENCODING</code>).
308      * @param input the string to be converted
309      * @param encoding the encoding
310      * @return an InputStream that supplies bytes from the <code>string</code>
311      * @throws UnsupportedEncodingException
312      */

313     protected InputStream stringToInputStream(String JavaDoc input, String JavaDoc encoding)
314             throws UnsupportedEncodingException {
315         if (encoding == null || encoding.trim().length() == 0) {
316             encoding = Constants.DEFAULT_ENCODING;
317         }
318         return new ByteArrayInputStream(input.getBytes(encoding));
319     }
320 }
321
Popular Tags