LucenePDFDocument


1   /**
2    * Copyright (c) 2003, www.pdfbox.org
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions are met:
7    *
8    * 1. Redistributions of source code must retain the above copyright notice,
9    *    this list of conditions and the following disclaimer.
10   * 2. Redistributions in binary form must reproduce the above copyright notice,
11   *    this list of conditions and the following disclaimer in the documentation
12   *    and/or other materials provided with the distribution.
13   * 3. Neither the name of pdfbox; nor the names of its
14   *    contributors may be used to endorse or promote products derived from this
15   *    software without specific prior written permission.
16   *
17   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20   * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
21   * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
24   * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27   *
28   * http://www.pdfbox.org
29   *
30   */
31  package org.pdfbox.searchengine.lucene;
32  
33  import java.io.File  ;
34  import java.io.FileInputStream  ;
35  import java.io.InputStream  ;
36  import java.io.IOException  ;
37  import java.io.StringReader  ;
38  import java.io.StringWriter  ;
39  
40  import java.net.URL  ;
41  import java.net.URLConnection  ;
42  
43  import java.util.Date  ;
44  
45  import org.apache.lucene.document.DateField;
46  import org.apache.lucene.document.Document;
47  import org.apache.lucene.document.Field;
48  
49  import org.pdfbox.pdmodel.PDDocument;
50  import org.pdfbox.pdmodel.PDDocumentInformation;
51  
52  import org.pdfbox.exceptions.CryptographyException;
53  import org.pdfbox.exceptions.InvalidPasswordException;
54  
55  import org.pdfbox.util.PDFTextStripper;
56  
57  /**
58   * This class is used to create a document for the lucene search engine.
59   * This should easily plug into the IndexHTML or IndexFiles that comes with
60   * the lucene project.  This class will populate the following fields.
61   * <table>
62   *      <tr>
63   *          <td>Lucene Field Name</td>
64   *          <td>Description</td>
65   *      </tr>
66   *      <tr>
67   *          <td>path</td>
68   *          <td>File system path if loaded from a file</td>
69   *      </tr>
70   *      <tr>
71   *          <td>url</td>
72   *          <td>URL to PDF document</td>
73   *      </tr>
74   *      <tr>
75   *          <td>contents</td>
76   *          <td>Entire contents of PDF document, indexed but not stored</td>
77   *      </tr>
78   *      <tr>
79   *          <td>summary</td>
80   *          <td>First 500 characters of content</td>
81   *      </tr>
82   *      <tr>
83   *          <td>modified</td>
84   *          <td>The modified date/time according to the url or path</td>
85   *      </tr>
86   *      <tr>
87   *          <td>uid</td>
88   *          <td>A unique identifier for the Lucene document.</td>
89   *      </tr>
90   *      <tr>
91   *          <td>CreationDate</td>
92   *          <td>From PDF meta-data if available</td>
93   *      </tr>
94   *      <tr>
95   *          <td>Creator</td>
96   *          <td>From PDF meta-data if available</td>
97   *      </tr>
98   *      <tr>
99   *          <td>Keywords</td>
100  *          <td>From PDF meta-data if available</td>
101  *      </tr>
102  *      <tr>
103  *          <td>ModificationDate</td>
104  *          <td>From PDF meta-data if available</td>
105  *      </tr>
106  *      <tr>
107  *          <td>Producer</td>
108  *          <td>From PDF meta-data if available</td>
109  *      </tr>
110  *      <tr>
111  *          <td>Subject</td>
112  *          <td>From PDF meta-data if available</td>
113  *      </tr>
114  *      <tr>
115  *          <td>Trapped</td>
116  *          <td>From PDF meta-data if available</td>
117  *      </tr>
118  * </table>
119  *
120  * @author  Ben Litchfield
121  * @version $Revision: 1.18 $
122  */
123 public final class LucenePDFDocument
124 {
125     private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0);
126 
127 
128     /**
129      * private constructor because there are only static methods.
130      */
131     private LucenePDFDocument()
132     {
133         //utility class should not be instantiated
134     }
135     
136     /**
137      * This will get a lucene document from a PDF file.
138      *
139      * @param is The stream to read the PDF from.
140      *
141      * @return The lucene document.
142      *
143      * @throws IOException If there is an error parsing or indexing the document.
144      */
145     public static Document getDocument( InputStream   is ) throws IOException  
146     {
147         Document document = new Document();
148         addContent( document, is, "<inputstream>" );
149         return document;
150     }
151 
152     /**
153      * This will get a lucene document from a PDF file.
154      *
155      * @param file The file to get the document for.
156      *
157      * @return The lucene document.
158      *
159      * @throws IOException If there is an error parsing or indexing the document.
160      */
161     public static Document getDocument( File   file ) throws IOException  
162     {
163         Document document = new Document();
164 
165         // Add the url as a field named "url".  Use an UnIndexed field, so
166         // that the url is just stored with the document, but is not searchable.
167         document.add( Field.UnIndexed("path", file.getPath() ) );
168         document.add(Field.UnIndexed("url", file.getPath().replace(FILE_SEPARATOR, '/')));
169 
170         // Add the last modified date of the file a field named "modified".  Use a
171         // Keyword field, so that it's searchable, but so that no attempt is made
172         // to tokenize the field into words.
173         document.add(Field.Keyword("modified", DateField.timeToString( file.lastModified() )));
174 
175         String   uid = file.getPath().replace(FILE_SEPARATOR, '\u0000') + "\u0000" +
176                DateField.timeToString(file.lastModified() );
177 
178         // Add the uid as a field, so that index can be incrementally maintained.
179         // This field is not stored with document, it is indexed, but it is not
180         // tokenized prior to indexing.
181         document.add(new Field("uid", uid, false, true, false));
182 
183         FileInputStream   input = null;
184         try
185         {
186             input = new FileInputStream  ( file );
187             addContent( document, input, file.getPath() );
188         }
189         finally
190         {
191             if( input != null )
192             {
193                 input.close();
194             }
195         }
196 
197 
198         // return the document
199 
200         return document;
201     }
202 
203     /**
204      * This will get a lucene document from a PDF file.
205      *
206      * @param url The file to get the document for.
207      *
208      * @return The lucene document.
209      *
210      * @throws IOException If there is an error parsing or indexing the document.
211      */
212     public static Document getDocument( URL   url ) throws IOException  
213     {
214         Document document = new Document();
215         URLConnection   connection = url.openConnection();
216         connection.connect();
217         // Add the url as a field named "url".  Use an UnIndexed field, so
218         // that the url is just stored with the document, but is not searchable.
219         document.add( Field.UnIndexed("url", url.toExternalForm() ) );
220 
221         // Add the last modified date of the file a field named "modified".  Use a
222         // Keyword field, so that it's searchable, but so that no attempt is made
223         // to tokenize the field into words.
224         document.add(Field.Keyword("modified", DateField.timeToString( connection.getLastModified())));
225 
226         String   uid = url.toExternalForm().replace(FILE_SEPARATOR, '\u0000') + "\u0000" +
227                DateField.timeToString( connection.getLastModified() );
228 
229         // Add the uid as a field, so that index can be incrementally maintained.
230         // This field is not stored with document, it is indexed, but it is not
231         // tokenized prior to indexing.
232         document.add(new Field("uid", uid, false, true, false));
233 
234         InputStream   input = null;
235         try
236         {
237             input = connection.getInputStream();
238             addContent( document, input,url.toExternalForm() );
239         }
240         finally
241         {
242             if( input != null )
243             {
244                 input.close();
245             }
246         }
247 
248         // return the document
249         return document;
250     }
251 
252     /**
253      * This will add the contents to the lucene document.
254      *
255      * @param document The document to add the contents to.
256      * @param is The stream to get the contents from.
257      * @param documentLocation The location of the document, used just for debug messages.
258      *
259      * @throws IOException If there is an error parsing the document.
260      */
261     private static void addContent( Document document, InputStream   is, String   documentLocation ) throws IOException  
262     {
263         PDDocument pdfDocument = null;
264         try
265         {
266             pdfDocument = PDDocument.load( is );
267 
268 
269             if( pdfDocument.isEncrypted() )
270             {
271                 //Just try using the default password and move on
272                 pdfDocument.decrypt( "" );
273             }
274 
275             //create a writer where to append the text content.
276             StringWriter   writer = new StringWriter  ();
277             PDFTextStripper stripper = new PDFTextStripper();
278             stripper.writeText( pdfDocument, writer );
279 
280             // Note: the buffer to string operation is costless;
281             // the char array value of the writer buffer and the content string
282             // is shared as long as the buffer content is not modified, which will
283             // not occur here.
284             String   contents = writer.getBuffer().toString();
285 
286             StringReader   reader = new StringReader  ( contents );
287 
288             // Add the tag-stripped contents as a Reader-valued Text field so it will
289             // get tokenized and indexed.
290             document.add( Field.Text( "contents", reader ) );
291 
292             PDDocumentInformation info = pdfDocument.getDocumentInformation();
293             if( info.getAuthor() != null )
294             {
295                 document.add(Field.Text( "Author", info.getAuthor() ) );
296             }
297             if( info.getCreationDate() != null )
298             {
299                 Date   date = info.getCreationDate().getTime();
300                 //for some reason lucene cannot handle dates before the epoch
301                 //and throws a nasty RuntimeException, so we will check and
302                 //verify that this does not happen
303                 if( date.getTime() >= 0 )
304                 {
305                     document.add(Field.Text("CreationDate", DateField.dateToString( date ) ) );
306                 }
307             }
308             if( info.getCreator() != null )
309             {
310                 document.add( Field.Text( "Creator", info.getCreator() ) );
311             }
312             if( info.getKeywords() != null )
313             {
314                 document.add( Field.Text( "Keywords", info.getKeywords() ) );
315             }
316             if( info.getModificationDate() != null )
317             {
318                 Date   date = info.getModificationDate().getTime();
319                 //for some reason lucene cannot handle dates before the epoch
320                 //and throws a nasty RuntimeException, so we will check and
321                 //verify that this does not happen
322                 if( date.getTime() >= 0 )
323                 {
324                     document.add(Field.Text("ModificationDate", DateField.dateToString( date ) ) );
325                 }
326             }
327             if( info.getProducer() != null )
328             {
329                 document.add( Field.Text( "Producer", info.getProducer() ) );
330             }
331             if( info.getSubject() != null )
332             {
333                 document.add( Field.Text( "Subject", info.getSubject() ) );
334             }
335             if( info.getTitle() != null )
336             {
337                 document.add( Field.Text( "Title", info.getTitle() ) );
338             }
339             if( info.getTrapped() != null )
340             {
341                 document.add( Field.Text( "Trapped", info.getTrapped() ) );
342             }
343 
344             int summarySize = Math.min( contents.length(), 500 );
345             String   summary = contents.substring( 0, summarySize );
346             // Add the summary as an UnIndexed field, so that it is stored and returned
347             // with hit documents for display.
348             document.add( Field.UnIndexed( "summary", summary ) );
349         }
350         catch( CryptographyException e )
351         {
352             throw new IOException  ( "Error decrypting document(" + documentLocation + "): " + e );
353         }
354         catch( InvalidPasswordException e )
355         {
356             //they didn't suppply a password and the default of "" was wrong.
357             throw new IOException  ( "Error: The document(" + documentLocation +
358                                     ") is encrypted and will not be indexed." );
359         }
360         finally
361         {
362             if( pdfDocument != null )
363             {
364                 pdfDocument.close();
365             }
366         }
367     }
368 
369     /**
370      * This will test creating a document.
371      *
372      * usage: java pdfparser.searchengine.lucene.LucenePDFDocument &lt;pdf-document&gt;
373      *
374      * @param args command line arguments.
375      *
376      * @throws IOException If there is an error.
377      */
378     public static void main( String  [] args ) throws IOException  
379     {
380         if( args.length != 1 )
381         {
382             System.err.println( "usage: java org.pdfbox.searchengine.lucene.LucenePDFDocument <pdf-document>" );
383             System.exit( 1 );
384         }
385         System.out.println( "Document=" + getDocument( new File  ( args[0] ) ) );
386     }
387 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags