KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lenya > lucene > index > AbstractIndexer


1 /*
2  * Copyright 1999-2004 The Apache Software Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  */

17
18 /* $Id: AbstractIndexer.java 43174 2004-08-02 06:28:44Z michi $ */
19
20 package org.apache.lenya.lucene.index;
21
22 import java.io.File JavaDoc;
23 import java.io.FileFilter JavaDoc;
24 import java.io.IOException JavaDoc;
25 import java.util.Arrays JavaDoc;
26
27 import org.apache.log4j.Category;
28 import org.apache.lenya.lucene.IndexConfiguration;
29 import org.apache.lucene.analysis.standard.StandardAnalyzer;
30 import org.apache.lucene.document.Document;
31 import org.apache.lucene.index.IndexReader;
32 import org.apache.lucene.index.IndexWriter;
33 import org.apache.lucene.index.Term;
34
35 import org.w3c.dom.Element JavaDoc;
36
37 /**
38  * Abstract base class for indexers.
39  * The factory method {@link #getDocumentCreator(String[])} is used to create a
40  * DocumentCreator from the command-line arguments.
41  */

42 public abstract class AbstractIndexer implements Indexer {
43     private static Category log = Category.getInstance(AbstractIndexer.class);
44     
45     private DocumentCreator documentCreator;
46     private Element JavaDoc indexer;
47     private String JavaDoc configFileName;
48
49     /**
50      * Creates a new instance of AbstractIndexer
51      */

52     public AbstractIndexer() {
53     }
54
55     /**
56      * Returns the DocumentCreator of this indexer.
57      */

58     protected DocumentCreator getDocumentCreator() {
59         return documentCreator;
60     }
61
62     /**
63      * Initializes this indexer with command-line parameters.
64      */

65     public void configure(Element JavaDoc indexer, String JavaDoc configFileName) throws Exception JavaDoc {
66         documentCreator = createDocumentCreator(indexer, configFileName);
67         this.indexer = indexer;
68         this.configFileName = configFileName;
69     }
70
71     /**
72      * DOCUMENT ME!
73      *
74      * @param element DOCUMENT ME!
75      *
76      * @return DOCUMENT ME!
77      *
78      * @throws Exception DOCUMENT ME!
79      */

80     public abstract DocumentCreator createDocumentCreator(Element JavaDoc indexer, String JavaDoc configFileName) throws Exception JavaDoc;
81
82     /**
83      * Updates the index incrementally.
84      * Walk directory hierarchy in uid order, while keeping uid iterator from
85      * existing index in sync. Mismatches indicate one of:
86      * <ol>
87      * <li>old documents to be deleted</li>
88      * <li>unchanged documents, to be left alone, or</li>
89      * <li>new documents, to be indexed.</li>
90      * </ol>
91      */

92     public void updateIndex(File JavaDoc dumpDirectory, File JavaDoc index) throws Exception JavaDoc {
93         deleteStaleDocuments(dumpDirectory, index);
94         doIndex(dumpDirectory, index, false);
95     }
96
97     /**
98      * Updates the index re one document
99      *
100      * <ol>
101      * <li>old documents to be deleted</li>
102      * <li>unchanged documents, to be left alone, or</li>
103      * <li>new documents, to be indexed.</li>
104      * </ol>
105      */

106     public void indexDocument(File JavaDoc file) throws Exception JavaDoc {
107         IndexConfiguration config = new IndexConfiguration(configFileName);
108         log.debug("File: " + file);
109
110         File JavaDoc dumpDir = new File JavaDoc(config.resolvePath(config.getHTDocsDumpDir()));
111         log.debug("Dump dir: " + dumpDir);
112
113         File JavaDoc indexDir = new File JavaDoc(config.resolvePath(config.getIndexDir()));
114         log.debug("Index dir: " + indexDir);
115
116
117     String JavaDoc id = IndexIterator.createID(file, dumpDir);
118
119     boolean createNewIndex = false;
120         if (!IndexReader.indexExists(indexDir)) {
121             log.warn("Index does not exist yet: " + indexDir);
122             createNewIndex = true;
123         } else {
124         // Delete from index
125
IndexReader reader = IndexReader.open(indexDir.getAbsolutePath());
126         Term term = new Term("id", id);
127             log.debug(term.toString());
128             int numberOfDeletedDocuments = reader.delete(term);
129             if (numberOfDeletedDocuments == 1) {
130                 log.info("Document has been deleted: " + term);
131             } else {
132                 log.warn("No such document found in this index: " + term);
133             }
134             //log.debug("Number of deleted documents: " + numberOfDeletedDocuments);
135
//log.debug("Current number of documents in this index: " + reader.numDocs());
136
reader.close();
137         }
138
139     // Append to index
140
Document doc = getDocumentCreator().getDocument(new File JavaDoc(dumpDir, id), dumpDir);
141         IndexWriter writer = new IndexWriter(indexDir, new StandardAnalyzer(), createNewIndex);
142         writer.maxFieldLength = 1000000;
143         writer.addDocument(doc);
144         //log.debug("Document has been added: " + doc);
145
log.info("Document has been added: " + id);
146         writer.optimize();
147         writer.close();
148     }
149
150     /**
151      * Creates a new index.
152      */

153     public void createIndex(File JavaDoc dumpDirectory, File JavaDoc index)
154         throws Exception JavaDoc {
155         doIndex(dumpDirectory, index, true);
156     }
157
158     /**
159      * Index files
160      *
161      * @param dumpDirectory Directory where the files to be indexed are located
162      * @param index Directory where the index shall be located
163      * @param create <strong>true</strong> means the index will be created from scratch, <strong>false</strong> means it will be indexed incrementally
164      */

165     public void doIndex(File JavaDoc dumpDirectory, File JavaDoc index, boolean create) {
166         if (!index.isDirectory()) {
167             index.mkdirs();
168             log.warn("Directory has been created: " + index.getAbsolutePath());
169         }
170         try {
171             IndexWriter writer = new IndexWriter(index.getAbsolutePath(), new StandardAnalyzer(), create);
172             writer.maxFieldLength = 1000000;
173
174             IndexInformation info = new IndexInformation(index.getAbsolutePath(), dumpDirectory, getFilter(indexer, configFileName), create);
175
176             IndexHandler handler;
177
178             if (create) {
179                 handler = new CreateIndexHandler(dumpDirectory, info, writer);
180             } else {
181                 handler = new UpdateIndexHandler(dumpDirectory, info, writer);
182             }
183
184             IndexIterator iterator = new IndexIterator(index.getAbsolutePath(), getFilter(indexer, configFileName));
185             iterator.addHandler(handler);
186             iterator.iterate(dumpDirectory);
187
188             writer.optimize();
189             writer.close();
190         } catch (IOException JavaDoc e) {
191             log.error(e);
192         }
193     }
194
195     /**
196      * Delete the stale documents.
197      */

198     protected void deleteStaleDocuments(File JavaDoc dumpDirectory, File JavaDoc index)
199         throws Exception JavaDoc {
200         log.debug("Deleting stale documents");
201
202         IndexIterator iterator = new IndexIterator(index.getAbsolutePath(), getFilter(indexer, configFileName));
203         iterator.addHandler(new DeleteHandler());
204         iterator.iterate(dumpDirectory);
205         log.debug("Deleting stale documents finished");
206     }
207
208     /**
209      * Returns the filter used to receive the indexable files. Might be overwritten by inherited class.
210      */

211     public FileFilter JavaDoc getFilter(Element JavaDoc indexer, String JavaDoc configFileName) {
212         String JavaDoc[] indexableExtensions = { "html", "htm", "txt" };
213         return new AbstractIndexer.DefaultIndexFilter(indexableExtensions);
214     }
215
216     /**
217      * FileFilter used to obtain the files to index.
218      */

219     public class DefaultIndexFilter implements FileFilter JavaDoc {
220         protected String JavaDoc[] indexableExtensions;
221
222         /**
223          * Default indexable extensions: html, htm, txt
224          */

225         public DefaultIndexFilter() {
226             String JavaDoc[] iE = { "html", "htm", "txt" };
227             indexableExtensions = iE;
228         }
229
230         /**
231          *
232          */

233         public DefaultIndexFilter(String JavaDoc[] indexableExtensions) {
234             this.indexableExtensions = indexableExtensions;
235         }
236
237         /** Tests whether or not the specified abstract pathname should be
238          * included in a pathname list.
239          *
240          * @param pathname The abstract pathname to be tested
241          * @return <code>true</code> if and only if <code>pathname</code> should be included
242          *
243          */

244         public boolean accept(File JavaDoc file) {
245             boolean accept;
246
247             if (file.isDirectory()) {
248                 accept = true;
249             } else {
250                 String JavaDoc fileName = file.getName();
251                 String JavaDoc extension = fileName.substring(fileName.lastIndexOf(".") + 1);
252                 accept = Arrays.asList(indexableExtensions).contains(extension);
253             }
254
255             return accept;
256         }
257     }
258
259     /**
260      * Deletes all stale documents up to the document representing the next file.
261      * The following documents are deleted:
262      * <ul>
263      * <li>representing files that where removed</li>
264      * <li>representing the same file but are older than the current file</li>
265      * </ul>
266      */

267     public class DeleteHandler extends AbstractIndexIteratorHandler {
268         /** Handles a stale document.
269          *
270          */

271         public void handleStaleDocument(IndexReader reader, Term term) {
272             log.debug("deleting " +
273                 IndexIterator.uid2url(term.text()));
274
275             try {
276                 int deletedDocuments = reader.delete(term);
277                 log.debug("deleted " + deletedDocuments +
278                     " documents.");
279             } catch (IOException JavaDoc e) {
280                 log.error(e);
281             }
282         }
283     }
284
285     /**
286      * DOCUMENT ME!
287      */

288     public class IndexHandler extends AbstractIndexIteratorHandler {
289         /**
290          * Creates a new IndexHandler object.
291          *
292          * @param dumpDirectory DOCUMENT ME!
293          * @param info DOCUMENT ME!
294          * @param writer DOCUMENT ME!
295          */

296         public IndexHandler(File JavaDoc dumpDirectory, IndexInformation info, IndexWriter writer) {
297             this.info = info;
298             this.dumpDirectory = dumpDirectory;
299             this.writer = writer;
300         }
301
302         private IndexInformation info;
303
304         protected IndexInformation getInformation() {
305             return info;
306         }
307
308         private File JavaDoc dumpDirectory;
309
310         protected File JavaDoc getDumpDirectory() {
311             return dumpDirectory;
312         }
313
314         private IndexWriter writer;
315
316         protected IndexWriter getWriter() {
317             return writer;
318         }
319
320         /**
321      * Add document to index
322      */

323         protected void addFile(File JavaDoc file) {
324             log.debug("adding document: " + file.getAbsolutePath());
325
326             try {
327                 Document doc = getDocumentCreator().getDocument(file, dumpDirectory);
328                 writer.addDocument(doc);
329             } catch (Exception JavaDoc e) {
330                 log.error(e);
331             }
332
333             info.increase();
334             log.info(info.printProgress());
335         }
336     }
337
338     /**
339      * DOCUMENT ME!
340      */

341     public class CreateIndexHandler extends IndexHandler {
342         /**
343          * Creates a new CreateIndexHandler object.
344          *
345          * @param dumpDirectory DOCUMENT ME!
346          * @param info DOCUMENT ME!
347          * @param writer DOCUMENT ME!
348          */

349         public CreateIndexHandler(File JavaDoc dumpDirectory, IndexInformation info, IndexWriter writer) {
350             super(dumpDirectory, info, writer);
351         }
352
353         /**
354          * Handles a file. Used when creating a new index.
355          */

356         public void handleFile(IndexReader reader, File JavaDoc file) {
357             addFile(file);
358         }
359     }
360
361     /**
362      * DOCUMENT ME!
363      */

364     public class UpdateIndexHandler extends IndexHandler {
365         /**
366          * Creates a new UpdateIndexHandler object.
367          *
368          * @param dumpDirectory DOCUMENT ME!
369          * @param info DOCUMENT ME!
370          * @param writer DOCUMENT ME!
371          */

372         public UpdateIndexHandler(File JavaDoc dumpDirectory, IndexInformation info, IndexWriter writer) {
373             super(dumpDirectory, info, writer);
374         }
375
376         /**
377          * Handles a new document. Used when updating the index.
378          */

379         public void handleNewDocument(IndexReader reader, Term term, File JavaDoc file) {
380             addFile(file);
381         }
382     }
383 }
384
Popular Tags