KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > outerj > daisy > textextraction > impl > PDFTextExtractor


1 /*
2  * Copyright 2004 Outerthought bvba and Schaubroeck nv
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16 package org.outerj.daisy.textextraction.impl;
17
18 import org.pdfbox.pdmodel.PDDocument;
19 import org.pdfbox.pdfparser.PDFParser;
20 import org.pdfbox.util.PDFTextStripper;
21
22 import java.io.CharArrayWriter JavaDoc;
23 import java.io.InputStream JavaDoc;
24
25 public class PDFTextExtractor implements MimetypeTextExtractor {
26     public String JavaDoc getText(InputStream JavaDoc is) throws Exception JavaDoc {
27         PDDocument pdfDocument = null;
28         try {
29             PDFParser parser = new PDFParser(is);
30             parser.parse();
31
32             pdfDocument = parser.getPDDocument();
33
34             CharArrayWriter JavaDoc writer = new CharArrayWriter JavaDoc();
35             PDFTextStripper stripper = new PDFTextStripper();
36             stripper.writeText(pdfDocument, writer);
37
38             return writer.toString();
39         } finally {
40             if( pdfDocument != null )
41                 pdfDocument.close();
42         }
43     }
44 }
45
Popular Tags