IndexFiles


1   package org.pdfbox.searchengine.lucene;
2   
3   /*
4    * This source was originally written as an example for the lucene project.
5    * It has been modified to use PDFBox as a  lucene document creator.
6    * -Ben Litchfield
7    *
8    *====================================================================
9    * The Apache Software License, Version 1.1
10   *
11   * Copyright (c) 2001 The Apache Software Foundation.  All rights
12   * reserved.
13   *
14   * Redistribution and use in source and binary forms, with or without
15   * modification, are permitted provided that the following conditions
16   * are met:
17   *
18   * 1. Redistributions of source code must retain the above copyright
19   *    notice, this list of conditions and the following disclaimer.
20   *
21   * 2. Redistributions in binary form must reproduce the above copyright
22   *    notice, this list of conditions and the following disclaimer in
23   *    the documentation and/or other materials provided with the
24   *    distribution.
25   *
26   * 3. The end-user documentation included with the redistribution,
27   *    if any, must include the following acknowledgment:
28   *       "This product includes software developed by the
29   *        Apache Software Foundation (http://www.apache.org/)."
30   *    Alternately, this acknowledgment may appear in the software itself,
31   *    if and wherever such third-party acknowledgments normally appear.
32   *
33   * 4. The names "Apache" and "Apache Software Foundation" and
34   *    "Apache Lucene" must not be used to endorse or promote products
35   *    derived from this software without prior written permission. For
36   *    written permission, please contact apache@apache.org.
37   *
38   * 5. Products derived from this software may not be called "Apache",
39   *    "Apache Lucene", nor may "Apache" appear in their name, without
40   *    prior written permission of the Apache Software Foundation.
41   *
42   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
43   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
44   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
45   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
46   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
48   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
49   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
50   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
51   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
52   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53   * SUCH DAMAGE.
54   * ====================================================================
55   *
56   * This software consists of voluntary contributions made by many
57   * individuals on behalf of the Apache Software Foundation.  For more
58   * information on the Apache Software Foundation, please see
59   * <http://www.apache.org/>.
60   */
61  
62  import org.apache.lucene.analysis.standard.StandardAnalyzer;
63  
64  import org.apache.lucene.demo.HTMLDocument;
65  
66  import org.apache.lucene.document.Document;
67  
68  import org.apache.lucene.index.IndexReader;
69  import org.apache.lucene.index.IndexWriter;
70  import org.apache.lucene.index.Term;
71  import org.apache.lucene.index.TermEnum;
72  
73  import java.util.Arrays  ;
74  
75  
76  import java.io.File  ;
77  import java.io.IOException  ;
78  
79  import java.util.Date  ;
80  
81  
82  /**
83   * This is a class that will index some files on a local filesystem.  This code
84   * was modified from a demo that comes with the lucene search engine.
85   *
86   * @author Lucene team
87   * @author <a HREF="mailto:ben@benlitchfield.com">Ben Litchfield</a>
88   *
89   * @version $Revision: 1.8 $
90   */
91  public class IndexFiles
92  {
93      private boolean deleting = false;     // true during deletion pass
94      private IndexReader reader;       // existing index
95      private IndexWriter writer;       // new index being built
96      private TermEnum uidIter;         // document id iterator
97  
98      /**
99       * This is the main entry point for the indexer.
100      *
101      * @param argv The command line arguments.
102      */
103     public static void main(String  [] argv)
104     {
105 
106         String   index = "index";
107         boolean create = false;
108         File   root = null;
109 
110         String   usage = "org.pdfbox.searchengine.lucene.IndexFiles [-create] [-index <index>] <root_directory>";
111 
112         if (argv.length == 0)
113         {
114             System.err.println("Usage: " + usage);
115             return;
116         }
117 
118         for (int i = 0; i < argv.length; i++)
119         {
120             if (argv[i].equals("-index"))
121             {         // parse -index option
122                 index = argv[++i];
123             }
124             else if (argv[i].equals("-create"))
125             {     // parse -create option
126                 create = true;
127             }
128             else if (i != argv.length-1)
129             {
130                 System.err.println("Usage: " + usage);
131                 return;
132             }
133             else
134             {
135                 System.out.println( "root=" +argv[i] );
136                 root = new File  (argv[i]);
137             }
138         }
139         IndexFiles indexer = new IndexFiles();
140         indexer.index( root, create, index );
141     }
142 
143     /**
144      * This will index a directory.
145      *
146      * @param root The root directory to start indexing.
147      * @param create Should we create a new index?
148      * @param index The name of the index.
149      */
150     public void index( File   root, boolean create, String   index )
151     {
152 
153         try
154         {
155             Date   start = new Date  ();
156 
157             writer = new IndexWriter(index, new StandardAnalyzer(), create);
158 
159             if (!create)
160             {                 // delete stale docs
161                 deleting = true;
162                 indexDocs(root, index, create);
163             }
164 
165             indexDocs(root, index, create);       // add new docs
166 
167             System.out.println("Optimizing index...");
168             writer.optimize();
169             writer.close();
170 
171             Date   end = new Date  ();
172 
173             System.out.print(end.getTime() - start.getTime());
174             System.out.println(" total milliseconds");
175 
176         }
177         catch( Exception   e )
178         {
179             e.printStackTrace();
180         }
181     }
182 
183     /**
184      * Walk directory hierarchy in uid order, while keeping uid iterator from
185      * existing index in sync.  Mismatches indicate one of: (a) old documents to
186      * be deleted; (b) unchanged documents, to be left alone; or (c) new
187      * documents, to be indexed.
188      *
189      * @param file The directory to index.
190      * @param index The index to add the file to.
191      * @param create A flag telling if we should create the index.
192      *
193      * @throws Exception If there is any error indexing the directory.
194      */
195     private void indexDocs(File   file, String   index, boolean create) throws Exception  
196     {
197         if (!create)
198         {                 // incrementally update
199 
200             reader = IndexReader.open(index);         // open existing index
201             uidIter = reader.terms(new Term("uid", "")); // init uid iterator
202 
203             indexDocs(file);
204 
205             if (deleting)
206             {                 // delete rest of stale docs
207                 while (uidIter.term() != null && uidIter.term().field().equals( "uid" ) )
208                 {
209                     System.out.println("deleting " +
210                     HTMLDocument.uid2url(uidIter.term().text()));
211                     reader.deleteDocuments(uidIter.term());
212                     uidIter.next();
213                 }
214                 deleting = false;
215             }
216 
217             uidIter.close();                  // close uid iterator
218             reader.close();               // close existing index
219 
220         }
221         else
222         {
223             indexDocs(file);
224         }
225     }
226 
227 
228     private void indexDocs(File   file) throws Exception  
229     {
230         if (file.isDirectory())
231         {             // if a directory
232             String  [] files = file.list();         // list its files
233             Arrays.sort(files);           // sort the files
234             for (int i = 0; i < files.length; i++)    // recursively index them
235             {
236                 indexDocs(new File  (file, files[i]));
237             }
238         }
239         else
240         {
241             if (uidIter != null)
242             {
243                 String   uid = HTMLDocument.uid(file);      // construct uid for doc
244 
245                 while( uidIter.term() != null &&
246                 uidIter.term().field().equals( "uid" ) &&
247                 uidIter.term().text().compareTo(uid) < 0)
248                 {
249                     if (deleting)
250                     {             // delete stale docs
251                         System.out.println("deleting " +
252                         HTMLDocument.uid2url(uidIter.term().text()));
253                         reader.deleteDocuments(uidIter.term());
254                     }
255                     uidIter.next();
256                 }
257                 if( uidIter.term() != null &&
258                 uidIter.term().field().equals( "uid" ) &&
259                 uidIter.term().text().compareTo(uid) == 0)
260                 {
261                     System.out.println( "Next uid=" +uidIter );
262                     uidIter.next();           // keep matching docs
263                 }
264             }
265             else
266             {
267                 try
268                 {
269                     addDocument( file );
270                 }
271                 catch( IOException   e )
272                 {
273                     //catch exception and move onto the next document
274                     System.out.println( e.getMessage() );
275                 }
276             }
277         }
278     }
279 
280     private void addDocument( File   file ) throws IOException  , InterruptedException  
281     {
282         String   path = file.getName().toUpperCase();
283         Document doc = null;
284         //Gee, this would be a great place for a command pattern
285         if( path.endsWith(".HTML") || // index .html files
286             path.endsWith(".HTM") || // index .htm files
287             path.endsWith(".TXT"))
288         {
289             System.out.println( "Indexing Text document: " + file );
290             doc = HTMLDocument.Document(file);
291         }
292         else if( path.endsWith( ".PDF" ) )
293         {
294             System.out.println( "Indexing PDF document: " + file );
295             doc = LucenePDFDocument.getDocument( file );
296         }
297         else
298         {
299             System.out.println( "Skipping " + file );
300         }
301 
302         if( doc != null )
303         {
304             writer.addDocument(doc);
305         }
306     }
307 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags