1 31 package org.pdfbox.searchengine.lucene; 32 33 import java.io.File ; 34 import java.io.FileInputStream ; 35 import java.io.InputStream ; 36 import java.io.IOException ; 37 import java.io.StringReader ; 38 import java.io.StringWriter ; 39 40 import java.net.URL ; 41 import java.net.URLConnection ; 42 43 import java.util.Date ; 44 45 import org.apache.lucene.document.DateField; 46 import org.apache.lucene.document.Document; 47 import org.apache.lucene.document.Field; 48 49 import org.pdfbox.pdmodel.PDDocument; 50 import org.pdfbox.pdmodel.PDDocumentInformation; 51 52 import org.pdfbox.exceptions.CryptographyException; 53 import org.pdfbox.exceptions.InvalidPasswordException; 54 55 import org.pdfbox.util.PDFTextStripper; 56 57 123 public final class LucenePDFDocument 124 { 125 private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0); 126 127 128 131 private LucenePDFDocument() 132 { 133 } 135 136 145 public static Document getDocument( InputStream is ) throws IOException 146 { 147 Document document = new Document(); 148 addContent( document, is, "<inputstream>" ); 149 return document; 150 } 151 152 161 public static Document getDocument( File file ) throws IOException 162 { 163 Document document = new Document(); 164 165 document.add( Field.UnIndexed("path", file.getPath() ) ); 168 document.add(Field.UnIndexed("url", file.getPath().replace(FILE_SEPARATOR, '/'))); 169 170 document.add(Field.Keyword("modified", DateField.timeToString( file.lastModified() ))); 174 175 String uid = file.getPath().replace(FILE_SEPARATOR, '\u0000') + "\u0000" + 176 DateField.timeToString(file.lastModified() ); 177 178 document.add(new Field("uid", uid, false, true, false)); 182 183 FileInputStream input = null; 184 try 185 { 186 input = new FileInputStream ( file ); 187 addContent( document, input, file.getPath() ); 188 } 189 finally 190 { 191 if( input != null ) 192 { 193 input.close(); 194 } 195 } 196 197 198 200 return document; 201 } 202 203 212 public static Document getDocument( URL url ) throws IOException 213 { 214 Document document = new Document(); 215 URLConnection connection = url.openConnection(); 216 connection.connect(); 217 document.add( Field.UnIndexed("url", url.toExternalForm() ) ); 220 221 document.add(Field.Keyword("modified", DateField.timeToString( connection.getLastModified()))); 225 226 String uid = url.toExternalForm().replace(FILE_SEPARATOR, '\u0000') + "\u0000" + 227 DateField.timeToString( connection.getLastModified() ); 228 229 document.add(new Field("uid", uid, false, true, false)); 233 234 InputStream input = null; 235 try 236 { 237 input = connection.getInputStream(); 238 addContent( document, input,url.toExternalForm() ); 239 } 240 finally 241 { 242 if( input != null ) 243 { 244 input.close(); 245 } 246 } 247 248 return document; 250 } 251 252 261 private static void addContent( Document document, InputStream is, String documentLocation ) throws IOException 262 { 263 PDDocument pdfDocument = null; 264 try 265 { 266 pdfDocument = PDDocument.load( is ); 267 268 269 if( pdfDocument.isEncrypted() ) 270 { 271 pdfDocument.decrypt( "" ); 273 } 274 275 StringWriter writer = new StringWriter (); 277 PDFTextStripper stripper = new PDFTextStripper(); 278 stripper.writeText( pdfDocument, writer ); 279 280 String contents = writer.getBuffer().toString(); 285 286 StringReader reader = new StringReader ( contents ); 287 288 document.add( Field.Text( "contents", reader ) ); 291 292 PDDocumentInformation info = pdfDocument.getDocumentInformation(); 293 if( info.getAuthor() != null ) 294 { 295 document.add(Field.Text( "Author", info.getAuthor() ) ); 296 } 297 if( info.getCreationDate() != null ) 298 { 299 Date date = info.getCreationDate().getTime(); 300 if( date.getTime() >= 0 ) 304 { 305 document.add(Field.Text("CreationDate", DateField.dateToString( date ) ) ); 306 } 307 } 308 if( info.getCreator() != null ) 309 { 310 document.add( Field.Text( "Creator", info.getCreator() ) ); 311 } 312 if( info.getKeywords() != null ) 313 { 314 document.add( Field.Text( "Keywords", info.getKeywords() ) ); 315 } 316 if( info.getModificationDate() != null ) 317 { 318 Date date = info.getModificationDate().getTime(); 319 if( date.getTime() >= 0 ) 323 { 324 document.add(Field.Text("ModificationDate", DateField.dateToString( date ) ) ); 325 } 326 } 327 if( info.getProducer() != null ) 328 { 329 document.add( Field.Text( "Producer", info.getProducer() ) ); 330 } 331 if( info.getSubject() != null ) 332 { 333 document.add( Field.Text( "Subject", info.getSubject() ) ); 334 } 335 if( info.getTitle() != null ) 336 { 337 document.add( Field.Text( "Title", info.getTitle() ) ); 338 } 339 if( info.getTrapped() != null ) 340 { 341 document.add( Field.Text( "Trapped", info.getTrapped() ) ); 342 } 343 344 int summarySize = Math.min( contents.length(), 500 ); 345 String summary = contents.substring( 0, summarySize ); 346 document.add( Field.UnIndexed( "summary", summary ) ); 349 } 350 catch( CryptographyException e ) 351 { 352 throw new IOException ( "Error decrypting document(" + documentLocation + "): " + e ); 353 } 354 catch( InvalidPasswordException e ) 355 { 356 throw new IOException ( "Error: The document(" + documentLocation + 358 ") is encrypted and will not be indexed." ); 359 } 360 finally 361 { 362 if( pdfDocument != null ) 363 { 364 pdfDocument.close(); 365 } 366 } 367 } 368 369 378 public static void main( String [] args ) throws IOException 379 { 380 if( args.length != 1 ) 381 { 382 System.err.println( "usage: java org.pdfbox.searchengine.lucene.LucenePDFDocument <pdf-document>" ); 383 System.exit( 1 ); 384 } 385 System.out.println( "Document=" + getDocument( new File ( args[0] ) ) ); 386 } 387 } | Popular Tags |