KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > pdfbox > searchengine > lucene > IndexFiles


1 package org.pdfbox.searchengine.lucene;
2
3 /*
4  * This source was originally written as an example for the lucene project.
5  * It has been modified to use PDFBox as a lucene document creator.
6  * -Ben Litchfield
7  *
8  *====================================================================
9  * The Apache Software License, Version 1.1
10  *
11  * Copyright (c) 2001 The Apache Software Foundation. All rights
12  * reserved.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  *
18  * 1. Redistributions of source code must retain the above copyright
19  * notice, this list of conditions and the following disclaimer.
20  *
21  * 2. Redistributions in binary form must reproduce the above copyright
22  * notice, this list of conditions and the following disclaimer in
23  * the documentation and/or other materials provided with the
24  * distribution.
25  *
26  * 3. The end-user documentation included with the redistribution,
27  * if any, must include the following acknowledgment:
28  * "This product includes software developed by the
29  * Apache Software Foundation (http://www.apache.org/)."
30  * Alternately, this acknowledgment may appear in the software itself,
31  * if and wherever such third-party acknowledgments normally appear.
32  *
33  * 4. The names "Apache" and "Apache Software Foundation" and
34  * "Apache Lucene" must not be used to endorse or promote products
35  * derived from this software without prior written permission. For
36  * written permission, please contact apache@apache.org.
37  *
38  * 5. Products derived from this software may not be called "Apache",
39  * "Apache Lucene", nor may "Apache" appear in their name, without
40  * prior written permission of the Apache Software Foundation.
41  *
42  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
43  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
44  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
45  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
46  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
48  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
49  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
50  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
51  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
52  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53  * SUCH DAMAGE.
54  * ====================================================================
55  *
56  * This software consists of voluntary contributions made by many
57  * individuals on behalf of the Apache Software Foundation. For more
58  * information on the Apache Software Foundation, please see
59  * <http://www.apache.org/>.
60  */

61
62 import org.apache.lucene.analysis.standard.StandardAnalyzer;
63
64 import org.apache.lucene.demo.HTMLDocument;
65
66 import org.apache.lucene.document.Document;
67
68 import org.apache.lucene.index.IndexReader;
69 import org.apache.lucene.index.IndexWriter;
70 import org.apache.lucene.index.Term;
71 import org.apache.lucene.index.TermEnum;
72
73 import java.util.Arrays JavaDoc;
74
75
76 import java.io.File JavaDoc;
77 import java.io.IOException JavaDoc;
78
79 import java.util.Date JavaDoc;
80
81
82 /**
83  * This is a class that will index some files on a local filesystem. This code
84  * was modified from a demo that comes with the lucene search engine.
85  *
86  * @author Lucene team
87  * @author <a HREF="mailto:ben@benlitchfield.com">Ben Litchfield</a>
88  *
89  * @version $Revision: 1.8 $
90  */

91 public class IndexFiles
92 {
93     private boolean deleting = false; // true during deletion pass
94
private IndexReader reader; // existing index
95
private IndexWriter writer; // new index being built
96
private TermEnum uidIter; // document id iterator
97

98     /**
99      * This is the main entry point for the indexer.
100      *
101      * @param argv The command line arguments.
102      */

103     public static void main(String JavaDoc[] argv)
104     {
105
106         String JavaDoc index = "index";
107         boolean create = false;
108         File JavaDoc root = null;
109
110         String JavaDoc usage = "org.pdfbox.searchengine.lucene.IndexFiles [-create] [-index <index>] <root_directory>";
111
112         if (argv.length == 0)
113         {
114             System.err.println("Usage: " + usage);
115             return;
116         }
117
118         for (int i = 0; i < argv.length; i++)
119         {
120             if (argv[i].equals("-index"))
121             { // parse -index option
122
index = argv[++i];
123             }
124             else if (argv[i].equals("-create"))
125             { // parse -create option
126
create = true;
127             }
128             else if (i != argv.length-1)
129             {
130                 System.err.println("Usage: " + usage);
131                 return;
132             }
133             else
134             {
135                 System.out.println( "root=" +argv[i] );
136                 root = new File JavaDoc(argv[i]);
137             }
138         }
139         IndexFiles indexer = new IndexFiles();
140         indexer.index( root, create, index );
141     }
142
143     /**
144      * This will index a directory.
145      *
146      * @param root The root directory to start indexing.
147      * @param create Should we create a new index?
148      * @param index The name of the index.
149      */

150     public void index( File JavaDoc root, boolean create, String JavaDoc index )
151     {
152
153         try
154         {
155             Date JavaDoc start = new Date JavaDoc();
156
157             writer = new IndexWriter(index, new StandardAnalyzer(), create);
158
159             if (!create)
160             { // delete stale docs
161
deleting = true;
162                 indexDocs(root, index, create);
163             }
164
165             indexDocs(root, index, create); // add new docs
166

167             System.out.println("Optimizing index...");
168             writer.optimize();
169             writer.close();
170
171             Date JavaDoc end = new Date JavaDoc();
172
173             System.out.print(end.getTime() - start.getTime());
174             System.out.println(" total milliseconds");
175
176         }
177         catch( Exception JavaDoc e )
178         {
179             e.printStackTrace();
180         }
181     }
182
183     /**
184      * Walk directory hierarchy in uid order, while keeping uid iterator from
185      * existing index in sync. Mismatches indicate one of: (a) old documents to
186      * be deleted; (b) unchanged documents, to be left alone; or (c) new
187      * documents, to be indexed.
188      *
189      * @param file The directory to index.
190      * @param index The index to add the file to.
191      * @param create A flag telling if we should create the index.
192      *
193      * @throws Exception If there is any error indexing the directory.
194      */

195     private void indexDocs(File JavaDoc file, String JavaDoc index, boolean create) throws Exception JavaDoc
196     {
197         if (!create)
198         { // incrementally update
199

200             reader = IndexReader.open(index); // open existing index
201
uidIter = reader.terms(new Term("uid", "")); // init uid iterator
202

203             indexDocs(file);
204
205             if (deleting)
206             { // delete rest of stale docs
207
while (uidIter.term() != null && uidIter.term().field().equals( "uid" ) )
208                 {
209                     System.out.println("deleting " +
210                     HTMLDocument.uid2url(uidIter.term().text()));
211                     reader.deleteDocuments(uidIter.term());
212                     uidIter.next();
213                 }
214                 deleting = false;
215             }
216
217             uidIter.close(); // close uid iterator
218
reader.close(); // close existing index
219

220         }
221         else
222         {
223             indexDocs(file);
224         }
225     }
226
227
228     private void indexDocs(File JavaDoc file) throws Exception JavaDoc
229     {
230         if (file.isDirectory())
231         { // if a directory
232
String JavaDoc[] files = file.list(); // list its files
233
Arrays.sort(files); // sort the files
234
for (int i = 0; i < files.length; i++) // recursively index them
235
{
236                 indexDocs(new File JavaDoc(file, files[i]));
237             }
238         }
239         else
240         {
241             if (uidIter != null)
242             {
243                 String JavaDoc uid = HTMLDocument.uid(file); // construct uid for doc
244

245                 while( uidIter.term() != null &&
246                 uidIter.term().field().equals( "uid" ) &&
247                 uidIter.term().text().compareTo(uid) < 0)
248                 {
249                     if (deleting)
250                     { // delete stale docs
251
System.out.println("deleting " +
252                         HTMLDocument.uid2url(uidIter.term().text()));
253                         reader.deleteDocuments(uidIter.term());
254                     }
255                     uidIter.next();
256                 }
257                 if( uidIter.term() != null &&
258                 uidIter.term().field().equals( "uid" ) &&
259                 uidIter.term().text().compareTo(uid) == 0)
260                 {
261                     System.out.println( "Next uid=" +uidIter );
262                     uidIter.next(); // keep matching docs
263
}
264             }
265             else
266             {
267                 try
268                 {
269                     addDocument( file );
270                 }
271                 catch( IOException JavaDoc e )
272                 {
273                     //catch exception and move onto the next document
274
System.out.println( e.getMessage() );
275                 }
276             }
277         }
278     }
279
280     private void addDocument( File JavaDoc file ) throws IOException JavaDoc, InterruptedException JavaDoc
281     {
282         String JavaDoc path = file.getName().toUpperCase();
283         Document doc = null;
284         //Gee, this would be a great place for a command pattern
285
if( path.endsWith(".HTML") || // index .html files
286
path.endsWith(".HTM") || // index .htm files
287
path.endsWith(".TXT"))
288         {
289             System.out.println( "Indexing Text document: " + file );
290             doc = HTMLDocument.Document(file);
291         }
292         else if( path.endsWith( ".PDF" ) )
293         {
294             System.out.println( "Indexing PDF document: " + file );
295             doc = LucenePDFDocument.getDocument( file );
296         }
297         else
298         {
299             System.out.println( "Skipping " + file );
300         }
301
302         if( doc != null )
303         {
304             writer.addDocument(doc);
305         }
306     }
307 }
Popular Tags