PlainTextExtractor


1   /*
2    *  Copyright 2005 Blandware (http://www.blandware.com)
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   */
16  package com.blandware.atleap.common.parsers;
17  
18  import com.blandware.atleap.common.Constants;
19  import com.blandware.atleap.common.parsers.excel.ExcelPlainTextExtractor;
20  import com.blandware.atleap.common.parsers.exception.PlainTextExtractorException;
21  import com.blandware.atleap.common.parsers.exception.UnsupportedMimeTypeException;
22  import com.blandware.atleap.common.parsers.html.HTMLPlainTextExtractor;
23  import com.blandware.atleap.common.parsers.pdf.PDFPlainTextExtractor;
24  import com.blandware.atleap.common.parsers.ppt.PowerPointPlainTextExtractor;
25  import com.blandware.atleap.common.parsers.rtf.RTFPlainTextExtractor;
26  import com.blandware.atleap.common.parsers.txt.TXTPlainTextExtractor;
27  import com.blandware.atleap.common.parsers.word.WordPlainTextExtractor;
28  import com.blandware.atleap.common.parsers.xml.XMLPlainTextExtractor;
29  
30  import java.io.*;
31  
32  /**
33   * <p>
34   * Used to extract a plain text from formatted documents. Currently the
35   * following formats supported: MS Word, MS Excel Workbook, MS PowerPoint (97+),
36   * RTF, PDF, HTML, XML and plain text.
37   * </p>
38   * <p>
39   * To use, first instantiate with <code>new PlainTextExtractor()</code>, then
40   * call the <code>extract</code> method.
41   * </p>
42   * <p>
43   * One common ruleset is applied to figure out encoding to use during extracting
44   * (first matched rule is used to obtain encoding):
45   * <ol>
46   * <li>
47   * If encoding doesn't make sense for this format, it's totally ignored.
48   * </li>
49   * <li>
50   * If extractor finds out the encoding from document by itself (for example,
51   * HTML files often contain such information), it uses it.
52   * </li>
53   * <li>
54   * If encoding is given to extractor as a parameter to <code>extract</code>
55   * method, it's used.
56   * </li>
57   * <li>
58   * Otherwise default encoding is used (it's currently
59   * <code>Constants.DEFAULT_ENCODING</code>)
60   * </li>
61   * </ol>
62   * </p>
63   *
64   * @author Roman Puchkovskiy <a HREF="mailto:roman.puchkovskiy@blandware.com">
65   * &lt;roman.puchkovskiy@blandware.com&gt;</a>
66   * @version $Revision: 1.4 $ $Date: 2005/08/14 12:27:54 $
67   */
68  public class PlainTextExtractor {
69  
70      /*
71       * Internal field used to store encoding that was used by extractor during
72       * extracting process
73       */
74      protected String   usedEncoding = null;
75  
76      /**
77       * Constructs new PlainTextExtractor instance
78       */
79      public PlainTextExtractor() {}
80  
81      /**
82       * Extracts a plain text from a formatted document to a given writer.
83       *
84       * @param input the stream that supplies the document
85       * @param mimeType the mime type of the document
86       * @param output the writer which will accept the extracted text
87       * @param encoding the encoding of the document in the stream. If the
88       * <code>encoding</code> is <code>null</code>, then the extractor uses
89       * its default encoding (currently all extractors use
90       * <code>Constants.DEFAULT_ENCODING</code>).
91       * @throws UnsupportedMimeTypeException throwed when a given mime type is
92       * not supported
93       * @throws PlainTextExtractorException any other exception raised during
94       * extracting
95       */
96      public void extract(InputStream input, String   mimeType, Writer output,
97                          String   encoding) throws UnsupportedMimeTypeException,
98              PlainTextExtractorException {
99          SpecificPlainTextExtractor extractor = null;
100 
101         if (mimeType == null) {
102             throw new IllegalArgumentException  ("mimeType parameter is null");
103         }
104         if (mimeType.equals("application/msword")) {
105             extractor = new WordPlainTextExtractor();
106         } else if (mimeType.equals("application/vnd.ms-excel")) {
107             extractor = new ExcelPlainTextExtractor();
108         } else if (mimeType.equals("application/vnd.ms-powerpoint")) {
109             extractor = new PowerPointPlainTextExtractor();
110         } else if (mimeType.equals("application/pdf")) {
111             extractor = new PDFPlainTextExtractor();
112         } else if (mimeType.equals("application/rtf")) {
113             extractor = new RTFPlainTextExtractor();
114         } else if (mimeType.equals("text/html")) {
115             extractor = new HTMLPlainTextExtractor();
116         } else if (mimeType.equals("application/xhtml+xml")) {
117             extractor = new HTMLPlainTextExtractor();
118         } else if (mimeType.equals("text/xml")) {
119             extractor = new XMLPlainTextExtractor();
120         } else if (mimeType.equals("text/plain")) {
121             extractor = new TXTPlainTextExtractor();
122         } else {
123             throw new UnsupportedMimeTypeException("This mimeType is not supported: "
124                                                    + mimeType);
125         }
126         if (extractor != null) {
127             extractor.extract(input, output, encoding);
128             usedEncoding = extractor.getUsedEncoding();
129         } else {
130             usedEncoding = null;
131         }
132     }
133 
134     /**
135      * Extracts a plain text from a formatted document and returns it as a
136      * string.
137      *
138      * @param input the stream that supplies the document
139      * @param mimeType the mime type of the document
140      * @param encoding the encoding of the document in the stream. If the
141      * <code>encoding</code> is <code>null</code>, then the extractor uses
142      * its default encoding (currently all extractors use
143      * <code>Constants.DEFAULT_ENCODING</code>).
144      * @return the extracted text as a string
145      * @throws UnsupportedMimeTypeException throwed when a given mime type is
146      * not supported
147      * @throws PlainTextExtractorException any other exception raised during
148      * extracting
149      */
150     public String   extract(InputStream input, String   mimeType, String   encoding)
151             throws UnsupportedMimeTypeException, PlainTextExtractorException {
152         StringWriter writer = new StringWriter();
153         extract(input, mimeType, writer, encoding);
154         return writer.toString();
155     }
156 
157     /**
158      * Extracts a plain text from a formatted document to a given writer. The
159      * document is assumed to have the default encoding.
160      *
161      * @param input the stream that supplies the document
162      * @param mimeType the mime type of the document
163      * @param output the writer which will accept the extracted text
164      * @throws UnsupportedMimeTypeException throwed when a given mime type is
165      * not supported
166      * @throws PlainTextExtractorException any other exception raised during
167      * extracting
168      */
169     public void extract(InputStream input, String   mimeType, Writer output)
170             throws UnsupportedMimeTypeException, PlainTextExtractorException {
171         extract(input, mimeType, output, null);
172     }
173 
174     /**
175      * Extracts a plain text from a formatted document and returns it as a string.
176      * The document is assumed to have the default encoding.
177      *
178      * @param input the stream that supplies the document
179      * @param mimeType the mime type of the document
180      * @return the extracted text as a string
181      * @throws UnsupportedMimeTypeException throwed when a given mime type is
182      * not supported
183      * @throws PlainTextExtractorException any other exception raised during
184      * extracting
185      */
186     public String   extract(InputStream input, String   mimeType)
187             throws UnsupportedMimeTypeException, PlainTextExtractorException {
188         return extract(input, mimeType, (String  )null);
189     }
190 
191     /**
192      * Extracts a plain text from a formatted document to a given writer. The
193      * document is given as a <code>String</code>.
194      *
195      * @param input the string that supplies the document
196      * @param mimeType the mime type of the document
197      * @param output the writer which will accept the extracted text
198      * @param encoding the encoding of the document in the stream. If the
199      * <code>encoding</code> is <code>null</code>, then the extractor uses
200      * its default encoding (currently all extractors use
201      * <code>Constants.DEFAULT_ENCODING</code>). Also, that encoding is used to
202      * convert an input string to a byte stream (if not given, it is again
203      * assumed to be <code>Constants.DEFAULT_ENCODING</code>).
204      * @throws UnsupportedMimeTypeException throwed when a given mime type is
205      * not supported
206      * @throws PlainTextExtractorException any other exception raised during
207      * extracting
208      */
209     public void extract(String   input, String   mimeType, Writer output,
210                         String   encoding) throws UnsupportedMimeTypeException,
211             PlainTextExtractorException {
212         try {
213             extract(stringToInputStream(input, encoding), mimeType,
214                     output, encoding);
215         } catch (UnsupportedEncodingException e) {
216             throw new PlainTextExtractorException(e);
217         }
218     }
219 
220     /**
221      * Extracts a plain text from a formatted document and returns it as a
222      * string. The document is given as a <code>String</code>.
223      *
224      * @param input the string that supplies the document
225      * @param mimeType the mime type of the document
226      * @param encoding the encoding of the document in the stream. If the
227      * <code>encoding</code> is <code>null</code>, then the extractor uses
228      * its default encoding (currently all extractors use
229      * <code>Constants.DEFAULT_ENCODING</code>). Also, that encoding is used to
230      * convert an input string to a byte stream (if not given, it is again
231      * assumed to be <code>Constants.DEFAULT_ENCODING</code>).
232      * @return the extracted text as a string
233      * @throws UnsupportedMimeTypeException throwed when a given mime type is
234      * not supported
235      * @throws PlainTextExtractorException any other exception raised during
236      * extracting
237      */
238     public String   extract(String   input, String   mimeType, String   encoding)
239             throws UnsupportedMimeTypeException, PlainTextExtractorException {
240         try {
241             return extract(stringToInputStream(input, encoding), mimeType,
242                            encoding);
243         } catch (UnsupportedEncodingException e) {
244             throw new PlainTextExtractorException(e);
245         }
246     }
247 
248     /**
249      * Extracts a plain text from a formatted document to a given writer. The
250      * document is assumed to have the default encoding. The document is given
251      * as a <code>String</code>, which is decoded to bytes using default
252      * encoding too.
253      *
254      * @param input the string that supplies the document
255      * @param mimeType the mime type of the document
256      * @param output the writer which will accept the extracted text
257      * @throws UnsupportedMimeTypeException throwed when a given mime type is
258      * not supported
259      * @throws PlainTextExtractorException any other exception raised during
260      * extracting
261      */
262     public void extract(String   input, String   mimeType, Writer output)
263             throws UnsupportedMimeTypeException, PlainTextExtractorException {
264         extract(input, mimeType, output, null);
265     }
266 
267     /**
268      * Extracts a plain text from a formatted document and returns it as a string.
269      * The document is assumed to have the default encoding. The document
270      * is given as a <code>String</code>, which is decoded to bytes using
271      * default encoding too.
272      *
273      * @param input the string that supplies the document
274      * @param mimeType the mime type of the document
275      * @return the extracted text as a string
276      * @throws UnsupportedMimeTypeException throwed when a given mime type is
277      * not supported
278      * @throws PlainTextExtractorException any other exception raised during
279      * extracting
280      */
281     public String   extract(String   input, String   mimeType)
282             throws UnsupportedMimeTypeException, PlainTextExtractorException {
283         return extract(input, mimeType, (String  ) null);
284     }
285 
286     /**
287      * <p>
288      * Returns encoding that was used for extracting. If encoding has no sense
289      * for particular document format or it's unknown for extractor, returns
290      * <code>null</code>.
291      * </p>
292      * <p>
293      * This method should be called after calling <code>extract</code>; before
294      * it this method may return anything.
295      * </p>
296      *
297      * @return encoding used or <code>null</code>
298      */
299     public String   getUsedEncoding() {
300         return usedEncoding;
301     }
302 
303     /**
304      * Converts a <code>String</code> to an <code>InputStream</code> using given
305      * <code>encoding</code>. If the <code>encoding</code> is <code>null</code>,
306      * it's assumed to be a default encoding
307      * (<code>Constants.DEFAULT_ENCODING</code>).
308      * @param input the string to be converted
309      * @param encoding the encoding
310      * @return an InputStream that supplies bytes from the <code>string</code>
311      * @throws UnsupportedEncodingException
312      */
313     protected InputStream stringToInputStream(String   input, String   encoding)
314             throws UnsupportedEncodingException {
315         if (encoding == null || encoding.trim().length() == 0) {
316             encoding = Constants.DEFAULT_ENCODING;
317         }
318         return new ByteArrayInputStream(input.getBytes(encoding));
319     }
320 }
321
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags