KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > contineo > core > text > parser > PDFParser


1 package org.contineo.core.text.parser;
2
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.text.DateFormat;
import java.util.Calendar;
import java.util.Date;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.contineo.core.LoggingManager;
import org.pdfbox.encryption.DocumentEncryption;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.util.PDFTextStripper;
18
19 /**
20  * Parses a PDF document and provides the information. For parsing an external library is used.
21  * Created on 4. November 2003, 18:09
22  * @author Michael Scholz
23  */

24 public class PDFParser implements Parser {
25     
26     /**
27      * @uml.property name="content"
28      */

29     private StringBuffer JavaDoc content = new StringBuffer JavaDoc("");
30     private String JavaDoc author;
31     private String JavaDoc title;
32     private String JavaDoc sourceDate;
33     private String JavaDoc keywords;
34     private Logger logger;
35     
36     
37     /** Creates a new instance of PDFParser */
38     public PDFParser(File JavaDoc file) {
39         author = "";
40         title = "";
41         sourceDate = "";
42         keywords = "";
43         logger = LoggingManager.getLogger(this.getClass());
44         init(file);
45     }
46     
47     protected void init(File JavaDoc file) {
48         PDDocument pdfDocument = null;
49         try {
50                 InputStream JavaDoc is = new FileInputStream JavaDoc(file);
51                 org.pdfbox.pdfparser.PDFParser parser = new org.pdfbox.pdfparser.PDFParser( is );
52                 if (parser != null)
53                     parser.parse();
54                 else throw new Exception JavaDoc("Can not parse pdf file " + file.getName());
55
56                 pdfDocument = parser.getPDDocument();
57                 if (pdfDocument == null)
58                     throw new Exception JavaDoc("Can not get pdf document " + file.getName() + " for parsing");
59
60                 try {
61                     PDDocumentInformation information = pdfDocument.getDocumentInformation();
62                     if (information == null)
63                         throw new Exception JavaDoc("Can not get information from pdf document " + file.getName());
64                     author = information.getAuthor();
65                     if (author == null)
66                         author = "";
67                     title = information.getTitle();
68                     if (title == null)
69                         title = "";
70                     Calendar JavaDoc calendar = information.getCreationDate();
71                     Date JavaDoc date = null;
72                     if (calendar != null)
73                         date = calendar.getTime();
74                     if (date != null)
75                         sourceDate = DateFormat.getDateInstance().format(date);
76
77                     keywords = information.getKeywords();
78                     if (keywords == null)
79                         keywords = "";
80                 } catch (Exception JavaDoc e) {
81                     if (logger.isEnabledFor(Level.ERROR))
82                         logger.error(e.getMessage());
83                 }
84
85                 if( pdfDocument.isEncrypted() ) {
86                     DocumentEncryption decryptor = new DocumentEncryption( pdfDocument );
87                     //Just try using the default password and move on
88
decryptor.decryptDocument( "" );
89                 }
90
91                 //create a tmp output stream with the size of the content.
92
ByteArrayOutputStream JavaDoc out = new ByteArrayOutputStream JavaDoc();
93                 OutputStreamWriter JavaDoc writer = new OutputStreamWriter JavaDoc( out );
94                 PDFTextStripper stripper = new PDFTextStripper();
95                 stripper.writeText( pdfDocument, writer );
96                 writer.close();
97                 content = new StringBuffer JavaDoc(out.toString());
98                 is.close();
99                 out.close();
100         } catch (Exception JavaDoc ex) {
101             if (logger.isEnabledFor(Level.ERROR))
102                 logger.error(ex.getMessage());
103         } finally {
104             try {
105                 if (pdfDocument != null) pdfDocument.close();
106             } catch (Exception JavaDoc e) {
107                 if (logger.isEnabledFor(Level.ERROR))
108                     logger.fatal(e.getMessage());
109             }
110         }
111     }
112
113     /**
114      *
115      * @uml.property name="content"
116      */

117     public StringBuffer JavaDoc getContent() {
118         return content;
119     }
120
121     
122     public String JavaDoc getVersion() {
123         return "";
124     }
125
126     /* (non-Javadoc)
127      * @see org.contineo.core.text.parser.Parser#getAuthor()
128      */

129     /**
130      * @return Returns the author.
131      * @uml.property name="author"
132      */

133     public String JavaDoc getAuthor() {
134         return author;
135     }
136
137     /* (non-Javadoc)
138      * @see org.contineo.core.text.parser.Parser#getSourceDate()
139      */

140     /**
141      * @return Returns the sourceDate.
142      * @uml.property name="sourceDate"
143      */

144     public String JavaDoc getSourceDate() {
145         return sourceDate;
146     }
147
148     /* (non-Javadoc)
149      * @see org.contineo.core.text.parser.Parser#getKeywords()
150      */

151     /**
152      * @return Returns the keywords.
153      * @uml.property name="keywords"
154      */

155     public String JavaDoc getKeywords() {
156         return keywords;
157     }
158
159     /* (non-Javadoc)
160      * @see org.contineo.core.text.parser.Parser#getTitle()
161      */

162     /**
163      * @return Returns the title.
164      * @uml.property name="title"
165      */

166     public String JavaDoc getTitle() {
167         return title;
168     }
169     
170 }
171
Popular Tags