// KickJava - Java API By Example, From Geeks To Geeks.
//
// Java > Open Source Codes > org > pdfbox > searchengine > lucene > LucenePDFDocument


/**
 * Copyright (c) 2003, www.pdfbox.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of pdfbox; nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * http://www.pdfbox.org
 *
 */

package org.pdfbox.searchengine.lucene;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.StringWriter;

import java.net.URL;
import java.net.URLConnection;

import java.util.Calendar;
import java.util.Date;

import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;

import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;

import org.pdfbox.util.PDFTextStripper;

56
/**
 * This class is used to create a document for the lucene search engine.
 * This should easily plug into the IndexHTML or IndexFiles that comes with
 * the lucene project. This class will populate the following fields.
 * <table>
 * <tr>
 * <td>Lucene Field Name</td>
 * <td>Description</td>
 * </tr>
 * <tr>
 * <td>path</td>
 * <td>File system path if loaded from a file</td>
 * </tr>
 * <tr>
 * <td>url</td>
 * <td>URL to PDF document</td>
 * </tr>
 * <tr>
 * <td>contents</td>
 * <td>Entire contents of PDF document, indexed but not stored</td>
 * </tr>
 * <tr>
 * <td>summary</td>
 * <td>First 500 characters of content</td>
 * </tr>
 * <tr>
 * <td>modified</td>
 * <td>The modified date/time according to the url or path</td>
 * </tr>
 * <tr>
 * <td>uid</td>
 * <td>A unique identifier for the Lucene document.</td>
 * </tr>
 * <tr>
 * <td>Author</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * <tr>
 * <td>CreationDate</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * <tr>
 * <td>Creator</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * <tr>
 * <td>Keywords</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * <tr>
 * <td>ModificationDate</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * <tr>
 * <td>Producer</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * <tr>
 * <td>Subject</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * <tr>
 * <td>Title</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * <tr>
 * <td>Trapped</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * </table>
 *
 * @author Ben Litchfield
 * @version $Revision: 1.18 $
 */

123 public final class LucenePDFDocument
124 {
125     private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0);
126
127
128     /**
129      * private constructor because there are only static methods.
130      */

131     private LucenePDFDocument()
132     {
133         //utility class should not be instantiated
134
}
135     
136     /**
137      * This will get a lucene document from a PDF file.
138      *
139      * @param is The stream to read the PDF from.
140      *
141      * @return The lucene document.
142      *
143      * @throws IOException If there is an error parsing or indexing the document.
144      */

145     public static Document getDocument( InputStream JavaDoc is ) throws IOException JavaDoc
146     {
147         Document document = new Document();
148         addContent( document, is, "<inputstream>" );
149         return document;
150     }
151
152     /**
153      * This will get a lucene document from a PDF file.
154      *
155      * @param file The file to get the document for.
156      *
157      * @return The lucene document.
158      *
159      * @throws IOException If there is an error parsing or indexing the document.
160      */

161     public static Document getDocument( File JavaDoc file ) throws IOException JavaDoc
162     {
163         Document document = new Document();
164
165         // Add the url as a field named "url". Use an UnIndexed field, so
166
// that the url is just stored with the document, but is not searchable.
167
document.add( Field.UnIndexed("path", file.getPath() ) );
168         document.add(Field.UnIndexed("url", file.getPath().replace(FILE_SEPARATOR, '/')));
169
170         // Add the last modified date of the file a field named "modified". Use a
171
// Keyword field, so that it's searchable, but so that no attempt is made
172
// to tokenize the field into words.
173
document.add(Field.Keyword("modified", DateField.timeToString( file.lastModified() )));
174
175         String JavaDoc uid = file.getPath().replace(FILE_SEPARATOR, '\u0000') + "\u0000" +
176                DateField.timeToString(file.lastModified() );
177
178         // Add the uid as a field, so that index can be incrementally maintained.
179
// This field is not stored with document, it is indexed, but it is not
180
// tokenized prior to indexing.
181
document.add(new Field("uid", uid, false, true, false));
182
183         FileInputStream JavaDoc input = null;
184         try
185         {
186             input = new FileInputStream JavaDoc( file );
187             addContent( document, input, file.getPath() );
188         }
189         finally
190         {
191             if( input != null )
192             {
193                 input.close();
194             }
195         }
196
197
198         // return the document
199

200         return document;
201     }
202
203     /**
204      * This will get a lucene document from a PDF file.
205      *
206      * @param url The file to get the document for.
207      *
208      * @return The lucene document.
209      *
210      * @throws IOException If there is an error parsing or indexing the document.
211      */

212     public static Document getDocument( URL JavaDoc url ) throws IOException JavaDoc
213     {
214         Document document = new Document();
215         URLConnection JavaDoc connection = url.openConnection();
216         connection.connect();
217         // Add the url as a field named "url". Use an UnIndexed field, so
218
// that the url is just stored with the document, but is not searchable.
219
document.add( Field.UnIndexed("url", url.toExternalForm() ) );
220
221         // Add the last modified date of the file a field named "modified". Use a
222
// Keyword field, so that it's searchable, but so that no attempt is made
223
// to tokenize the field into words.
224
document.add(Field.Keyword("modified", DateField.timeToString( connection.getLastModified())));
225
226         String JavaDoc uid = url.toExternalForm().replace(FILE_SEPARATOR, '\u0000') + "\u0000" +
227                DateField.timeToString( connection.getLastModified() );
228
229         // Add the uid as a field, so that index can be incrementally maintained.
230
// This field is not stored with document, it is indexed, but it is not
231
// tokenized prior to indexing.
232
document.add(new Field("uid", uid, false, true, false));
233
234         InputStream JavaDoc input = null;
235         try
236         {
237             input = connection.getInputStream();
238             addContent( document, input,url.toExternalForm() );
239         }
240         finally
241         {
242             if( input != null )
243             {
244                 input.close();
245             }
246         }
247
248         // return the document
249
return document;
250     }
251
252     /**
253      * This will add the contents to the lucene document.
254      *
255      * @param document The document to add the contents to.
256      * @param is The stream to get the contents from.
257      * @param documentLocation The location of the document, used just for debug messages.
258      *
259      * @throws IOException If there is an error parsing the document.
260      */

261     private static void addContent( Document document, InputStream JavaDoc is, String JavaDoc documentLocation ) throws IOException JavaDoc
262     {
263         PDDocument pdfDocument = null;
264         try
265         {
266             pdfDocument = PDDocument.load( is );
267
268
269             if( pdfDocument.isEncrypted() )
270             {
271                 //Just try using the default password and move on
272
pdfDocument.decrypt( "" );
273             }
274
275             //create a writer where to append the text content.
276
StringWriter JavaDoc writer = new StringWriter JavaDoc();
277             PDFTextStripper stripper = new PDFTextStripper();
278             stripper.writeText( pdfDocument, writer );
279
280             // Note: the buffer to string operation is costless;
281
// the char array value of the writer buffer and the content string
282
// is shared as long as the buffer content is not modified, which will
283
// not occur here.
284
String JavaDoc contents = writer.getBuffer().toString();
285
286             StringReader JavaDoc reader = new StringReader JavaDoc( contents );
287
288             // Add the tag-stripped contents as a Reader-valued Text field so it will
289
// get tokenized and indexed.
290
document.add( Field.Text( "contents", reader ) );
291
292             PDDocumentInformation info = pdfDocument.getDocumentInformation();
293             if( info.getAuthor() != null )
294             {
295                 document.add(Field.Text( "Author", info.getAuthor() ) );
296             }
297             if( info.getCreationDate() != null )
298             {
299                 Date JavaDoc date = info.getCreationDate().getTime();
300                 //for some reason lucene cannot handle dates before the epoch
301
//and throws a nasty RuntimeException, so we will check and
302
//verify that this does not happen
303
if( date.getTime() >= 0 )
304                 {
305                     document.add(Field.Text("CreationDate", DateField.dateToString( date ) ) );
306                 }
307             }
308             if( info.getCreator() != null )
309             {
310                 document.add( Field.Text( "Creator", info.getCreator() ) );
311             }
312             if( info.getKeywords() != null )
313             {
314                 document.add( Field.Text( "Keywords", info.getKeywords() ) );
315             }
316             if( info.getModificationDate() != null )
317             {
318                 Date JavaDoc date = info.getModificationDate().getTime();
319                 //for some reason lucene cannot handle dates before the epoch
320
//and throws a nasty RuntimeException, so we will check and
321
//verify that this does not happen
322
if( date.getTime() >= 0 )
323                 {
324                     document.add(Field.Text("ModificationDate", DateField.dateToString( date ) ) );
325                 }
326             }
327             if( info.getProducer() != null )
328             {
329                 document.add( Field.Text( "Producer", info.getProducer() ) );
330             }
331             if( info.getSubject() != null )
332             {
333                 document.add( Field.Text( "Subject", info.getSubject() ) );
334             }
335             if( info.getTitle() != null )
336             {
337                 document.add( Field.Text( "Title", info.getTitle() ) );
338             }
339             if( info.getTrapped() != null )
340             {
341                 document.add( Field.Text( "Trapped", info.getTrapped() ) );
342             }
343
344             int summarySize = Math.min( contents.length(), 500 );
345             String JavaDoc summary = contents.substring( 0, summarySize );
346             // Add the summary as an UnIndexed field, so that it is stored and returned
347
// with hit documents for display.
348
document.add( Field.UnIndexed( "summary", summary ) );
349         }
350         catch( CryptographyException e )
351         {
352             throw new IOException JavaDoc( "Error decrypting document(" + documentLocation + "): " + e );
353         }
354         catch( InvalidPasswordException e )
355         {
356             //they didn't suppply a password and the default of "" was wrong.
357
throw new IOException JavaDoc( "Error: The document(" + documentLocation +
358                                     ") is encrypted and will not be indexed." );
359         }
360         finally
361         {
362             if( pdfDocument != null )
363             {
364                 pdfDocument.close();
365             }
366         }
367     }
368
369     /**
370      * This will test creating a document.
371      *
372      * usage: java pdfparser.searchengine.lucene.LucenePDFDocument &lt;pdf-document&gt;
373      *
374      * @param args command line arguments.
375      *
376      * @throws IOException If there is an error.
377      */

378     public static void main( String JavaDoc[] args ) throws IOException JavaDoc
379     {
380         if( args.length != 1 )
381         {
382             System.err.println( "usage: java org.pdfbox.searchengine.lucene.LucenePDFDocument <pdf-document>" );
383             System.exit( 1 );
384         }
385         System.out.println( "Document=" + getDocument( new File JavaDoc( args[0] ) ) );
386     }
387 }
// Popular Tags