KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > blandware > atleap > common > parsers > pdf > PDFPlainTextExtractor


1 /*
2  * Copyright 2005 Blandware (http://www.blandware.com)
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16 package com.blandware.atleap.common.parsers.pdf;
17
18 import com.blandware.atleap.common.parsers.SpecificPlainTextExtractor;
19 import com.blandware.atleap.common.parsers.exception.PlainTextExtractorException;
20 import org.pdfbox.pdmodel.PDDocument;
21 import org.pdfbox.util.PDFTextStripper;
22
23 import java.io.InputStream JavaDoc;
24 import java.io.Writer JavaDoc;
25
26 /**
27  * An extractor that extracts a plain text from PDF documents.
28  *
29  * @see SpecificPlainTextExtractor
30  * @author Roman Puchkovskiy <a HREF="mailto:roman.puchkovskiy@blandware.com">
31  * &lt;roman.puchkovskiy@blandware.com&gt;</a>
32  * @version $Revision: 1.3 $ $Date: 2005/08/14 12:27:55 $
33  */

34 public class PDFPlainTextExtractor implements SpecificPlainTextExtractor {
35     public PDFPlainTextExtractor() {
36     }
37
38     /**
39      * Extracts a plain text from a PDF document.
40      *
41      * @param input the input stream that supplies a PDF document for extraction
42      * @param output the writer that will accept the extracted text
43      * @param encoding ignored
44      * @throws PlainTextExtractorException throwed on exception raised during
45      * extracting
46      */

47     public void extract(InputStream JavaDoc input, Writer JavaDoc output, String JavaDoc encoding)
48             throws PlainTextExtractorException {
49         // TODO: Find out why the stripper hangs on some PDFs
50
// TODO: Workaround encoding problems
51
try {
52             PDDocument document = null;
53             PDFTextStripper stripper = new PDFTextStripper();
54
55             document = PDDocument.load(input);
56             if (document.isEncrypted()) {
57                 // Trying do decrypt with an empty password
58
document.decrypt("");
59             }
60             stripper.writeText(document, output);
61             document.close();
62         } catch (Throwable JavaDoc e) {
63             throw new PlainTextExtractorException(e);
64         }
65     }
66
67     /**
68      * @see com.blandware.atleap.common.parsers.SpecificPlainTextExtractor#getUsedEncoding()
69      */

70     public String JavaDoc getUsedEncoding() {
71         return null;
72     }
73 }
74
Popular Tags