SimpleLuceneCocoonIndexerImpl


1   /*
2    * Copyright 1999-2004 The Apache Software Foundation.
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package org.apache.cocoon.components.search;
17  
18  import org.apache.avalon.framework.activity.Disposable;
19  import org.apache.avalon.framework.configuration.Configurable;
20  import org.apache.avalon.framework.configuration.Configuration;
21  import org.apache.avalon.framework.configuration.ConfigurationException;
22  import org.apache.avalon.framework.logger.AbstractLogEnabled;
23  import org.apache.avalon.framework.service.ServiceException;
24  import org.apache.avalon.framework.service.ServiceManager;
25  import org.apache.avalon.framework.service.Serviceable;
26  import org.apache.cocoon.ProcessingException;
27  import org.apache.cocoon.components.crawler.CocoonCrawler;
28  import org.apache.lucene.analysis.Analyzer;
29  import org.apache.lucene.document.Document;
30  import org.apache.lucene.index.IndexReader;
31  import org.apache.lucene.index.IndexWriter;
32  import org.apache.lucene.index.Term;
33  import org.apache.lucene.index.TermEnum;
34  import org.apache.lucene.store.Directory;
35  
36  import java.io.IOException  ;
37  import java.net.URL  ;
38  import java.util.Iterator  ;
39  
40  /**
41   * A lucene indexer.
42   *
43   * <p>
44   *  XML documents are indexed using lucene.
45   *  Links to XML documents are supplied by
46   *  a crawler, requesting links of documents by specifying a cocoon-view, and
47   *  HTTP protocol.
48   * </p>
49   *
50   * @author <a HREF="mailto:berni_huber@a1.net">Bernhard Huber</a>
51   * @version CVS $Id: SimpleLuceneCocoonIndexerImpl.java 124698 2005-01-09 01:57:13Z antonio $
52   */
53  public class SimpleLuceneCocoonIndexerImpl extends AbstractLogEnabled
54           implements LuceneCocoonIndexer, Configurable, Serviceable, Disposable
55  {
56  
57      /**
58       * configuration tagname for specifying the analyzer class
59       */
60      public final static String   ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
61      
62      /**
63       * configuration default analyzer class
64       */
65      public final static String   ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";
66  
67      /**
68       * configuration tagname for specifying lucene's index directory
69       */
70      public final static String   DIRECTORY_CONFIG = "directory";
71      
72      /**
73       * configuration default directory, ie. no default.
74       */
75      public final static String   DIRECTORY_DEFAULT = null;
76  
77      /**
78       * configuration tagname for specifying lucene's merge factor.
79       */
80      public final static String   MERGE_FACTOR_CONFIG = "merge-factor";
81  
82      /**
83       * configuration default value for
84       * <a HREF="http://www.mail-archive.com/lucene-user@jakarta.apache.org/msg00373.html">lucene's merge factor</a>.
85       */
86      public final static int MERGE_FACTOR_DEFAULT = 10;
87  
88      /**
89       * The service manager for looking up components used.
90       */
91      protected ServiceManager manager = null;
92  
93      protected Analyzer analyzer;
94  //    private String analyzerClassnameDefault = ANALYZER_CLASSNAME_DEFAULT;
95      private int mergeFactor = MERGE_FACTOR_DEFAULT;
96  
97  
98      /**
99       *Sets the analyzer attribute of the SimpleLuceneCocoonIndexerImpl object
100      *
101      * @param  analyzer  The new analyzer value
102      */
103     public void setAnalyzer(Analyzer analyzer) {
104         this.analyzer = analyzer;
105     }
106 
107 
108     /**
109      * Configure this component.
110      *
111      * @param  conf                        is the configuration
112      * @exception  ConfigurationException  is thrown if configuring fails
113      */
114     public void configure(Configuration conf) throws ConfigurationException {
115         Configuration child;
116 
117 /*        child = conf.getChild(ANALYZER_CLASSNAME_CONFIG, false);
118         if (child != null) {
119             // fix Bugzilla Bug 25277, use child.getValue
120             // and in all following blocks
121             String value = child.getValue(ANALYZER_CLASSNAME_DEFAULT);
122             if (value != null) {
123                 analyzerClassnameDefault = value;
124             }
125         }
126 */
127         child = conf.getChild(MERGE_FACTOR_CONFIG, false);
128         if (child != null) {
129             // fix Bugzilla Bug 25277, use child instead of conf
130             int int_value = child.getValueAsInteger(MERGE_FACTOR_DEFAULT);
131             mergeFactor = int_value;
132         }
133     }
134 
135 
136     /**
137      * Set the current <code>ServiceManager</code> instance used by this
138      * <code>Serviceable</code>.
139      *
140      * @param  manager                 used by this component
141      * @exception  ServiceException  is never thrown
142      */
143     public void service(ServiceManager manager) throws ServiceException {
144         this.manager = manager;
145     }
146 
147 
148     /**
149      * Dispose this component.
150      */
151     public void dispose() { }
152 
153 
154     /**
155      * index content of base_url, index content of links from base_url.
156      *
157      * @param  index                    the lucene store to write the index to
158      * @param  create                   if true create, or overwrite existing index, else
159      *   update existing index.
160      * @param  base_url                 index content of base_url, and crawl through all its
161      *   links recursivly.
162      * @exception  ProcessingException  is thrown if indexing fails
163      */
164     public void index(Directory index, boolean create, URL   base_url)
165              throws ProcessingException {
166 
167         IndexWriter writer = null;
168         LuceneXMLIndexer lxi = null;
169         CocoonCrawler cocoonCrawler = null;
170 
171         try {
172             lxi = (LuceneXMLIndexer) manager.lookup(LuceneXMLIndexer.ROLE);
173 
174             writer = new IndexWriter(index, analyzer, create);
175             writer.mergeFactor = this.mergeFactor;
176 
177             cocoonCrawler = (CocoonCrawler) manager.lookup(CocoonCrawler.ROLE);
178             cocoonCrawler.crawl(base_url);
179 
180             Iterator   cocoonCrawlerIterator = cocoonCrawler.iterator();
181             while (cocoonCrawlerIterator.hasNext()) {
182                 URL   crawl_url = (URL  ) cocoonCrawlerIterator.next();
183                 // result of fix Bugzilla Bug 25270, in SimpleCocoonCrawlerImpl
184                 // check if crawl_url is null
185                 if (crawl_url == null) {
186                     continue;
187                 } else if (!crawl_url.getHost().equals(base_url.getHost()) ||
188                         crawl_url.getPort() != base_url.getPort()) {
189 
190                     // skip urls using different host, or port than host,
191                     // or port of base url
192                     if (getLogger().isDebugEnabled()) {
193                         getLogger().debug("Skipping crawling URL " + crawl_url.toString() +
194                             " as base_url is " + base_url.toString());
195                     }
196                     continue;
197                 }
198 
199                 // build lucene documents from the content of the crawl_url
200                 Iterator   i = lxi.build(crawl_url).iterator();
201 
202                 // add all built lucene documents
203                 while (i.hasNext()) {
204                     writer.addDocument((Document) i.next());
205                 }
206             }
207             // optimize it
208             writer.optimize();
209         } catch (IOException   ioe) {
210             throw new ProcessingException("IOException in index()", ioe);
211         } catch (ServiceException se) {
212             throw new ProcessingException("Could not lookup service in index()", se);
213         } finally {
214             if (writer != null) {
215                 try {
216                     writer.close();
217                 } catch (IOException   ioe) {
218                 }
219                 writer = null;
220             }
221 
222             if (lxi != null) {
223                 manager.release(lxi);
224                 lxi = null;
225             }
226             if (cocoonCrawler != null) {
227                 manager.release(cocoonCrawler);
228                 cocoonCrawler = null;
229             }
230         }
231     }
232 
233 
234     /**
235      * A document iterator deleting "old" documents form the index.
236      * 
237      * TODO: use this class before indexing, in non-creating mode.
238      */
239     static class DocumentDeletableIterator {
240         private IndexReader reader;
241         // existing index
242         private TermEnum uidIter;
243 
244         // document id iterator
245 
246 
247         /**
248          *Constructor for the DocumentDeletableIterator object
249          *
250          * @param  directory        Description of Parameter
251          * @exception  IOException  Description of Exception
252          */
253         public DocumentDeletableIterator(Directory directory) throws IOException   {
254             reader = IndexReader.open(directory);
255             // open existing index
256             uidIter = reader.terms(new Term("uid", ""));
257             // init uid iterator
258         }
259 
260 
261         /**
262          *Description of the Method
263          *
264          * @exception  IOException  Description of Exception
265          */
266         public void deleteAllStaleDocuments() throws IOException   {
267             while (uidIter.term() != null && uidIter.term().field().equals("uid")) {
268                 reader.delete(uidIter.term());
269                 uidIter.next();
270             }
271         }
272 
273 
274         /**
275          *Description of the Method
276          *
277          * @param  uid              Description of Parameter
278          * @exception  IOException  Description of Exception
279          */
280         public void deleteModifiedDocuments(String   uid) throws IOException   {
281             while (documentHasBeenModified(uidIter.term(), uid)) {
282                 reader.delete(uidIter.term());
283                 uidIter.next();
284             }
285             if (documentHasNotBeenModified(uidIter.term(), uid)) {
286                 uidIter.next();
287             }
288         }
289 
290 
291         /**
292          *Description of the Method
293          *
294          * @exception  Throwable  Description of Exception
295          */
296         protected void finalize() throws Throwable   {
297             super.finalize();
298             if (uidIter != null) {
299                 uidIter.close();
300                 // close uid iterator
301                 uidIter = null;
302             }
303             if (reader != null) {
304                 reader.close();
305                 // close existing index
306                 reader = null;
307             }
308         }
309 
310 
311         /**
312          *Description of the Method
313          *
314          * @param  term  Description of Parameter
315          * @return       Description of the Returned Value
316          */
317         boolean documentIsDeletable(Term term) {
318             return term != null && term.field() == "uid";
319         }
320 
321 
322         /**
323          *Description of the Method
324          *
325          * @param  term  Description of Parameter
326          * @param  uid   Description of Parameter
327          * @return       Description of the Returned Value
328          */
329         boolean documentHasBeenModified(Term term, String   uid) {
330             return documentIsDeletable(term) &&
331                     term.text().compareTo(uid) < 0;
332         }
333 
334 
335         /**
336          *Description of the Method
337          *
338          * @param  term  Description of Parameter
339          * @param  uid   Description of Parameter
340          * @return       Description of the Returned Value
341          */
342         boolean documentHasNotBeenModified(Term term, String   uid) {
343             return documentIsDeletable(term) &&
344                     term.text().compareTo(uid) == 0;
345         }
346     }
347 }
348 
349
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags