KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > dspace > app > mediafilter > PDFFilter


1 /*
2  * PDFFilter.java
3  *
4  * Version: $Revision: 1.9 $
5  *
6  * Date: $Date: 2006/11/01 22:16:10 $
7  *
8  * Copyright (c) 2002-2005, Hewlett-Packard Company and Massachusetts
9  * Institute of Technology. All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions are
13  * met:
14  *
15  * - Redistributions of source code must retain the above copyright
16  * notice, this list of conditions and the following disclaimer.
17  *
18  * - Redistributions in binary form must reproduce the above copyright
19  * notice, this list of conditions and the following disclaimer in the
20  * documentation and/or other materials provided with the distribution.
21  *
22  * - Neither the name of the Hewlett-Packard Company nor the name of the
23  * Massachusetts Institute of Technology nor the names of their
24  * contributors may be used to endorse or promote products derived from
25  * this software without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37  * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
38  * DAMAGE.
39  */

40 package org.dspace.app.mediafilter;
41
42 import java.io.ByteArrayInputStream JavaDoc;
43 import java.io.InputStream JavaDoc;
44
45 import org.apache.log4j.Logger;
46 import org.pdfbox.pdfparser.PDFParser;
47 import org.pdfbox.pdmodel.PDDocument;
48 import org.pdfbox.util.PDFTextStripper;
49
50 /*
51  *
52  * to do: helpful error messages - can't find mediafilter.cfg - can't
53  * instantiate filter - bitstream format doesn't exist
54  *
55  */

56 public class PDFFilter extends MediaFilter
57 {
58
59     private static Logger log = Logger.getLogger(PDFFilter.class);
60
61     public String JavaDoc getFilteredName(String JavaDoc oldFilename)
62     {
63         return oldFilename + ".txt";
64     }
65
66     /**
67      * @return String bundle name
68      *
69      */

70     public String JavaDoc getBundleName()
71     {
72         return "TEXT";
73     }
74
75     /**
76      * @return String bitstreamformat
77      */

78     public String JavaDoc getFormatString()
79     {
80         return "Text";
81     }
82
83     /**
84      * @return String description
85      */

86     public String JavaDoc getDescription()
87     {
88         return "Extracted text";
89     }
90
91     /**
92      * @param source
93      * source input stream
94      *
95      * @return InputStream the resulting input stream
96      */

97     public InputStream JavaDoc getDestinationStream(InputStream JavaDoc source)
98             throws Exception JavaDoc
99     {
100         // get input stream from bitstream
101
// pass to filter, get string back
102
PDFTextStripper pts = new PDFTextStripper();
103         PDFParser parser = null;
104         String JavaDoc extractedText = null;
105
106         try
107         {
108             parser = new PDFParser(source);
109             parser.parse();
110             extractedText = pts.getText(new PDDocument(parser.getDocument()));
111         }
112         finally
113         {
114             try
115             {
116                 parser.getDocument().close();
117             }
118             catch(Exception JavaDoc e)
119             {
120                log.error("Error closing temporary PDF file: " + e.getMessage(), e);
121             }
122         }
123
124         // if verbose flag is set, print out extracted text
125
// to STDOUT
126
if (MediaFilterManager.isVerbose)
127         {
128             System.out.println(extractedText);
129         }
130
131
132         // generate an input stream with the extracted text
133
byte[] textBytes = extractedText.getBytes();
134         ByteArrayInputStream JavaDoc bais = new ByteArrayInputStream JavaDoc(textBytes);
135
136         return bais; // will this work? or will the byte array be out of scope?
137

138
139     }
140 }
141
Popular Tags