KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > slide > index > TextContentIndexer


1 /*
2  * $Header: /home/cvs/jakarta-slide/src/stores/org/apache/slide/index/TextContentIndexer.java,v 1.5.2.2 2004/09/29 15:01:26 unico Exp $
3  * $Revision: 1.5.2.2 $
4  * $Date: 2004/09/29 15:01:26 $
5  *
6  * ====================================================================
7  *
8  * Copyright 2004 The Apache Software Foundation
9  *
10  * Licensed under the Apache License, Version 2.0 (the "License");
11  * you may not use this file except in compliance with the License.
12  * You may obtain a copy of the License at
13  *
14  * http://www.apache.org/licenses/LICENSE-2.0
15  *
16  * Unless required by applicable law or agreed to in writing, software
17  * distributed under the License is distributed on an "AS IS" BASIS,
18  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19  * See the License for the specific language governing permissions and
20  * limitations under the License.
21  *
22  */

23
24 package org.apache.slide.index;
25
26 import org.apache.slide.search.IndexException;
27 import org.apache.slide.search.basic.IBasicExpressionFactory;
28 import org.apache.slide.util.logger.Logger;
29 import org.apache.slide.common.*;
30 import org.apache.slide.content.NodeRevisionNumber;
31 import org.apache.slide.content.NodeRevisionDescriptor;
32 import org.apache.slide.content.NodeRevisionContent;
33 import org.apache.slide.store.IndexStore;
34 import org.apache.slide.extractor.ExtractorManager;
35 import org.apache.slide.extractor.ExtractorException;
36 import org.apache.slide.extractor.ContentExtractor;
37 import org.apache.lucene.index.IndexWriter;
38 import org.apache.lucene.index.IndexReader;
39 import org.apache.lucene.index.Term;
40 import org.apache.lucene.analysis.standard.StandardAnalyzer;
41 import org.apache.lucene.analysis.Analyzer;
42 import org.apache.lucene.document.Document;
43 import org.apache.lucene.document.Field;
44
45 import java.io.IOException JavaDoc;
46 import java.io.CharArrayReader JavaDoc;
47 import java.io.ByteArrayInputStream JavaDoc;
48 import java.io.Reader JavaDoc;
49 import java.util.ArrayList JavaDoc;
50 import java.util.Collection JavaDoc;
51 import java.util.Hashtable JavaDoc;
52 import java.util.Iterator JavaDoc;
53 import java.util.StringTokenizer JavaDoc;
54
55 /**
56  * Lucene based IndexStore for indexing content.
57  * Apart from indexing the content as text field it adds
58  * indexes using the registered content extractor.
59  */

60 public class TextContentIndexer extends XAServiceBase implements IndexStore {
61
62     private static final String JavaDoc INDEX_PATH = "indexpath";
63     private static final String JavaDoc INCLUDES = "includes";
64     private static final String JavaDoc ANALYZER = "analyzer";
65     
66     public static final String JavaDoc URI_FIELD = "uri";
67     public static final String JavaDoc CONTENT_TEXT = "content";
68
69     private String JavaDoc indexpath = "";
70     private Collection JavaDoc includes;
71     private String JavaDoc analyzerClassName;
72     private Analyzer analyzer;
73     private boolean started = false;
74
75   /**
76     * Create Index, if not yet done.
77     *
78     * @param token a NamespaceAccessToken
79     *
80     * @throws org.apache.slide.common.ServiceInitializationFailedException
81     *
82     */

83     public void initialize(NamespaceAccessToken token)
84         throws ServiceInitializationFailedException
85    {
86       initAnalyzer();
87
88       IndexWriter indexWriter = null;
89       try
90       {
91          indexWriter = new IndexWriter(indexpath, analyzer, false);
92       }
93       // will fail, if not yet exists
94
catch (IOException JavaDoc e)
95       {
96          try
97          {
98             // create index
99
indexWriter = new IndexWriter(indexpath, analyzer, true);
100          }
101          catch (IOException JavaDoc ex)
102          {
103             getLogger().log("Error while initializing the Lucene index " + e.getMessage(), LOG_CHANNEL, Logger.ERROR);
104             throw new ServiceInitializationFailedException(this, ex);
105          }
106       }
107
108       try
109       {
110          indexWriter.close();
111       }
112       catch (IOException JavaDoc e)
113       {
114           getLogger().log("Error while initializing the Lucene index " + e.getMessage(), LOG_CHANNEL, Logger.ERROR);
115             throw new ServiceInitializationFailedException (this, e);
116
117       }
118       getLogger().log("Lucene is correctly initialized", LOG_CHANNEL, Logger.INFO);
119     }
120
121     /**
122      * Index an object content.
123      *
124      * @param uri Uri
125      * @exception IndexException Error accessing the Data Source
126      */

127     synchronized public void createIndex (Uri uri,
128                                           NodeRevisionDescriptor revisionDescriptor,
129                                           NodeRevisionContent revisionContent)
130         throws IndexException
131     {
132       if (!isIncluded(uri.toString())) return;
133       IndexWriter indexWriter = null;
134       try
135       {
136          indexWriter = new IndexWriter(indexpath, analyzer, false);
137
138          // Create document
139
Document doc = new Document();
140
141          doc.add(Field.Keyword(URI_FIELD, uri.toString()));
142          doc.add(Field.Text(CONTENT_TEXT, readContent(revisionDescriptor, revisionContent)));
143
144          if ( revisionContent != null && revisionDescriptor != null ) {
145             ContentExtractor[] extractor = ExtractorManager.getInstance().getContentExtractors(uri.getNamespace().getName(), null, revisionDescriptor);
146                  for ( int i = 0; i < extractor.length; i++ ) {
147                       Reader reader = extractor[i].extract(new ByteArrayInputStream JavaDoc(revisionContent.getContentBytes()));
148                       doc.add(Field.Text(CONTENT_TEXT, reader));
149                  }
150             }
151
152             indexWriter.addDocument(doc);
153             indexWriter.optimize();
154
155             getLogger().log(
156                  "Added '" + uri.toString() + " - " + revisionDescriptor.getRevisionNumber().toString() + "' to index",
157                  LOG_CHANNEL,
158                  Logger.INFO);
159        }
160       catch (IOException JavaDoc e)
161       {
162          getLogger().log(
163                 "Error creating an index with " + uri.toString() + " - " + revisionDescriptor.getRevisionNumber(),
164                 LOG_CHANNEL,
165                 Logger.ERROR);
166       }
167       catch( ExtractorException e)
168       {
169          getLogger().log(
170             "Error extracting content from " + uri.toString() + " - " + revisionDescriptor.getRevisionNumber(),
171             LOG_CHANNEL,
172             Logger.ERROR);
173       }
174       finally
175       {
176           try
177           {
178               if(indexWriter != null)
179                  indexWriter.close();
180           }
181           catch(IOException JavaDoc ioe ) {}
182       }
183     }
184
185     /**
186      * Method updateIndex
187      *
188      * @param uri an Uri
189      * @param revisionDescriptor a NodeRevisionDescriptor
190      * @param revisionContent a NodeRevisionContent
191      *
192      * @throws IndexException
193      *
194      */

195     synchronized public void updateIndex(Uri uri,
196                                          NodeRevisionDescriptor revisionDescriptor,
197                                          NodeRevisionContent revisionContent)
198       throws IndexException
199     {
200         if (!isIncluded(uri.toString())) return;
201         IndexWriter indexWriter = null;
202         try
203         {
204             // Delete entries from index
205
IndexReader indexReader = IndexReader.open(indexpath);
206             Term term = new Term(URI_FIELD, uri.toString());
207
208             indexReader.delete(term);
209             indexReader.close();
210
211             indexWriter = new IndexWriter(indexpath, analyzer, false);
212
213             // Create document
214
Document doc = new Document();
215
216             doc.add(Field.Keyword(URI_FIELD, uri.toString()));
217             doc.add(Field.Text(CONTENT_TEXT, readContent(revisionDescriptor, revisionContent)));
218
219             if ( revisionContent != null && revisionDescriptor != null ) {
220                  ContentExtractor[] extractor = ExtractorManager.getInstance().getContentExtractors(uri.getNamespace().getName(), null, revisionDescriptor);
221                  for ( int i = 0; i < extractor.length; i++ ) {
222                       Reader reader = extractor[i].extract(new ByteArrayInputStream JavaDoc(revisionContent.getContentBytes()));
223                       doc.add(Field.Text(CONTENT_TEXT, reader));
224                  }
225             }
226
227             indexWriter.addDocument(doc);
228             indexWriter.optimize();
229             
230             if (getLogger().isEnabled(Logger.DEBUG)) {
231                 getLogger().log(
232                      "Updated '" + uri + " - " + revisionDescriptor.getRevisionNumber() + "' to index",
233                      LOG_CHANNEL,
234                      Logger.DEBUG);
235             }
236         }
237         catch (IOException JavaDoc e)
238         {
239             getLogger().log(
240                  "Error updating the index with " + uri + " - " + revisionDescriptor.getRevisionNumber(),
241                  LOG_CHANNEL,
242                  Logger.ERROR);
243         }
244         catch( ExtractorException e)
245         {
246             getLogger().log(
247                  "Error extracting content from " + uri + " - " + revisionDescriptor.getRevisionNumber(),
248                  LOG_CHANNEL,
249                  Logger.ERROR);
250         }
251         finally
252        {
253            try
254            {
255                if(indexWriter != null)
256                   indexWriter.close();
257            }
258            catch(IOException JavaDoc ioe ) {}
259        }
260     }
261
262     /**
263      * Drop an object revision from the index.
264      *
265      * @param uri Uri
266      * @exception IndexException
267      */

268     synchronized public void dropIndex(Uri uri, NodeRevisionNumber number)
269       throws IndexException
270     {
271         if (!isIncluded(uri.toString())) return;
272         if (number == NodeRevisionNumber.HIDDEN_0_0) return;
273
274         IndexWriter indexWriter = null;
275         try
276         {
277             IndexReader indexReader = IndexReader.open(indexpath);
278             Term term = new Term(URI_FIELD, uri.toString());
279
280             indexReader.delete(term);
281             indexReader.close();
282
283             indexWriter = new IndexWriter(indexpath, analyzer, false);
284             indexWriter.optimize();
285
286             if (getLogger().isEnabled(Logger.DEBUG)) {
287                 getLogger().log(
288                      "Deleted '" + uri + "' from the index",
289                      LOG_CHANNEL,
290                      Logger.DEBUG);
291             }
292         }
293         catch (IOException JavaDoc e)
294         {
295             getLogger().log("Impossible to delete " + uri + " - " + number + " from the Lucene index");
296         }
297         finally
298         {
299             try
300             {
301                 if(indexWriter != null)
302                    indexWriter.close();
303             }
304             catch(IOException JavaDoc ioe ) {}
305         }
306     }
307
308
309     /**
310      * Method getFactory
311      *
312      * @return an IBasicExpressionFactory
313      *
314      */

315    public IBasicExpressionFactory getBasicExpressionFactory()
316    {
317       return new TextContainsExpressionFactory(indexpath, analyzer);
318    }
319
320
321   /**
322     * Connects to the underlying data source (if any is needed).
323     *
324     * @exception ServiceConnectionFailedException Connection failed
325     */

326     public void connect() throws ServiceConnectionFailedException
327     {
328         getLogger().log(
329              "TextContentIndexer: connect",
330              LOG_CHANNEL,
331              Logger.INFO);
332         started = true;
333     }
334
335   /**
336     * This function tells whether or not the service is connected.
337     *
338     * @return boolean true if we are connected
339     * @exception ServiceAccessException Service access error
340     */

341    public boolean isConnected() throws ServiceAccessException
342    {
343        return started;
344    }
345
346     /**
347      * Parametrize the service. This index store expects a parameter
348      * "indexpath" to contain the path to the directory to store the index.
349      * Another optional parameter "includes" lists the paths of resources
350      * that are to be indexed in a comma-separated format.
351      * Everything under an included path is indexed. If not specified all
352      * resources will be indexed.
353      *
354      * @param parameters Hashtable containing the parameters' names
355      * and associated values
356      * @exception ServiceParameterErrorException Incorrect service parameter
357      * @exception ServiceParameterMissingException Service parameter missing
358      */

359    public void setParameters (Hashtable JavaDoc parameters) throws ServiceParameterErrorException, ServiceParameterMissingException
360    {
361         indexpath = (String JavaDoc)parameters.get (INDEX_PATH);
362         if (indexpath == null || indexpath.length() == 0) {
363             throw new ServiceParameterMissingException (this, INDEX_PATH);
364         }
365         String JavaDoc includes = (String JavaDoc) parameters.get(INCLUDES);
366         if (includes != null && includes.length() > 0) {
367             StringTokenizer JavaDoc tokenizer = new StringTokenizer JavaDoc(includes, ",");
368             this.includes = new ArrayList JavaDoc(tokenizer.countTokens());
369             while (tokenizer.hasMoreTokens()) {
370                 this.includes.add(tokenizer.nextToken());
371             }
372         }
373         analyzerClassName = (String JavaDoc)parameters.get (ANALYZER);
374    }
375
376     /**
377      * Disconnects from the underlying data source.
378      *
379      * @exception ServiceDisconnectionFailedException Disconnection failed
380      */

381     public void disconnect() throws ServiceDisconnectionFailedException
382     {
383         getLogger().log(
384              "TextContentIndexer: disconnect",
385              LOG_CHANNEL,
386              Logger.INFO);
387         started = false;
388     }
389
390     /**
391      * Deletes service underlying data source, if possible (and meaningful).
392      *
393      * @exception ServiceResetFailedException Reset failed
394      */

395     public void reset() throws ServiceResetFailedException
396     {
397         getLogger().log(
398              "TextContentIndexer: reset",
399              LOG_CHANNEL,
400              Logger.INFO);
401    }
402
403     protected Reader readContent(NodeRevisionDescriptor revisionDescriptor,
404                                  NodeRevisionContent revisionContent) throws IOException JavaDoc {
405         return new CharArrayReader JavaDoc (revisionContent.getContent());
406     }
407     
408     protected boolean isIncluded(String JavaDoc uri) {
409         if (includes == null) return true;
410         Iterator JavaDoc iter = includes.iterator();
411         while (iter.hasNext()) {
412             if (uri.startsWith((String JavaDoc) iter.next())) {
413                 return true;
414             }
415         }
416         return false;
417     }
418
419
420     protected void initAnalyzer() throws ServiceInitializationFailedException {
421
422         if (analyzerClassName == null || analyzerClassName.length() == 0) {
423             getLogger().log("using Lucene StandardAnalyzer", LOG_CHANNEL, Logger.INFO);
424             analyzer = new StandardAnalyzer();
425
426         } else {
427             getLogger().log("loading Lucene analyzer: " + analyzerClassName, LOG_CHANNEL, Logger.INFO);
428
429             try {
430                 Class JavaDoc analyzerClazz = Class.forName(analyzerClassName);
431                 analyzer = (Analyzer)analyzerClazz.newInstance();
432
433             } catch (ClassNotFoundException JavaDoc cnfe) {
434                 getLogger().log("Error while instantiating analyzer " +
435                                 analyzerClassName + cnfe.getMessage(), LOG_CHANNEL, Logger.ERROR);
436                 throw new ServiceInitializationFailedException(this, cnfe);
437
438             } catch (InstantiationException JavaDoc ie) {
439                 getLogger().log("Error while instantiating analyzer " +
440                                 analyzerClassName + ie.getMessage(), LOG_CHANNEL, Logger.ERROR);
441                 throw new ServiceInitializationFailedException(this, ie);
442
443             } catch (IllegalAccessException JavaDoc iae) {
444                 getLogger().log("Error while instantiating analyzer " +
445                                 analyzerClassName + iae.getMessage(), LOG_CHANNEL, Logger.ERROR);
446                 throw new ServiceInitializationFailedException(this, iae);
447             }
448         }
449     }
450
451 }
Popular Tags