KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > alfresco > repo > content > transform > PdfBoxContentTransformer


1 /*
2  * Copyright (C) 2005 Alfresco, Inc.
3  *
4  * Licensed under the Mozilla Public License version 1.1
5  * with a permitted attribution clause. You may obtain a
6  * copy of the License at
7  *
8  * http://www.alfresco.org/legal/license.txt
9  *
10  * Unless required by applicable law or agreed to in writing,
11  * software distributed under the License is distributed on an
12  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
13  * either express or implied. See the License for the specific
14  * language governing permissions and limitations under the
15  * License.
16  */

17 package org.alfresco.repo.content.transform;
18
19 import java.io.InputStream JavaDoc;
20 import java.util.Map JavaDoc;
21
22 import org.alfresco.repo.content.MimetypeMap;
23 import org.alfresco.service.cmr.repository.ContentReader;
24 import org.alfresco.service.cmr.repository.ContentWriter;
25 import org.pdfbox.pdmodel.PDDocument;
26 import org.pdfbox.util.PDFTextStripper;
27
28 /**
29  * Makes use of the {@link http://www.pdfbox.org/ PDFBox} library to
30  * perform conversions from PDF files to text.
31  *
32  * @author Derek Hulley
33  */

34 public class PdfBoxContentTransformer extends AbstractContentTransformer
35 {
36     /**
37      * Currently the only transformation performed is that of text extraction from PDF documents.
38      */

39     public double getReliability(String JavaDoc sourceMimetype, String JavaDoc targetMimetype)
40     {
41         // TODO: Expand PDFBox usage to convert images to PDF and investigate other conversions
42

43         if (!MimetypeMap.MIMETYPE_PDF.equals(sourceMimetype) ||
44                 !MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
45         {
46             // only support PDF -> Text
47
return 0.0;
48         }
49         else
50         {
51             return 1.0;
52         }
53     }
54
55     protected void transformInternal(
56             ContentReader reader,
57             ContentWriter writer,
58             Map JavaDoc<String JavaDoc, Object JavaDoc> options) throws Exception JavaDoc
59     {
60         PDDocument pdf = null;
61         InputStream JavaDoc is = null;
62         try
63         {
64             is = reader.getContentInputStream();
65             // stream the document in
66
pdf = PDDocument.load(is);
67             // strip the text out
68
PDFTextStripper stripper = new PDFTextStripper();
69             String JavaDoc text = stripper.getText(pdf);
70             
71             // dump it all to the writer
72
writer.putContent(text);
73         }
74         finally
75         {
76             if (pdf != null)
77             {
78                 try { pdf.close(); } catch (Throwable JavaDoc e) {e.printStackTrace(); }
79             }
80             if (is != null)
81             {
82                 try { is.close(); } catch (Throwable JavaDoc e) {e.printStackTrace(); }
83             }
84         }
85     }
86 }
87
Popular Tags