TextContentIndexer


1   /*
2    * $Header: /home/cvs/jakarta-slide/src/stores/org/apache/slide/index/TextContentIndexer.java,v 1.5.2.2 2004/09/29 15:01:26 unico Exp $
3    * $Revision: 1.5.2.2 $
4    * $Date: 2004/09/29 15:01:26 $
5    *
6    * ====================================================================
7    *
8    * Copyright 2004 The Apache Software Foundation
9    *
10   * Licensed under the Apache License, Version 2.0 (the "License");
11   * you may not use this file except in compliance with the License.
12   * You may obtain a copy of the License at
13   *
14   *     http://www.apache.org/licenses/LICENSE-2.0
15   *
16   * Unless required by applicable law or agreed to in writing, software
17   * distributed under the License is distributed on an "AS IS" BASIS,
18   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19   * See the License for the specific language governing permissions and
20   * limitations under the License.
21   *
22   */
23  
24  package org.apache.slide.index;
25  
26  import org.apache.slide.search.IndexException;
27  import org.apache.slide.search.basic.IBasicExpressionFactory;
28  import org.apache.slide.util.logger.Logger;
29  import org.apache.slide.common.*;
30  import org.apache.slide.content.NodeRevisionNumber;
31  import org.apache.slide.content.NodeRevisionDescriptor;
32  import org.apache.slide.content.NodeRevisionContent;
33  import org.apache.slide.store.IndexStore;
34  import org.apache.slide.extractor.ExtractorManager;
35  import org.apache.slide.extractor.ExtractorException;
36  import org.apache.slide.extractor.ContentExtractor;
37  import org.apache.lucene.index.IndexWriter;
38  import org.apache.lucene.index.IndexReader;
39  import org.apache.lucene.index.Term;
40  import org.apache.lucene.analysis.standard.StandardAnalyzer;
41  import org.apache.lucene.analysis.Analyzer;
42  import org.apache.lucene.document.Document;
43  import org.apache.lucene.document.Field;
44  
45  import java.io.IOException  ;
46  import java.io.CharArrayReader  ;
47  import java.io.ByteArrayInputStream  ;
48  import java.io.Reader  ;
49  import java.util.ArrayList  ;
50  import java.util.Collection  ;
51  import java.util.Hashtable  ;
52  import java.util.Iterator  ;
53  import java.util.StringTokenizer  ;
54  
55  /**
56   * Lucene based IndexStore for indexing content. 
57   * Apart from indexing the content as text field it adds
58   * indexes using the registered content extractor.
59   */
60  public class TextContentIndexer extends XAServiceBase implements IndexStore {
61  
62      private static final String   INDEX_PATH = "indexpath";
63      private static final String   INCLUDES = "includes";
64      private static final String   ANALYZER = "analyzer";
65      
66      public static final String   URI_FIELD = "uri";
67      public static final String   CONTENT_TEXT = "content";
68  
69      private String   indexpath = "";
70      private Collection   includes;
71      private String   analyzerClassName;
72      private Analyzer analyzer;
73      private boolean started = false;
74  
75    /**
76      * Create Index, if not yet done.
77      *
78      * @param    token               a  NamespaceAccessToken
79      *
80      * @throws   org.apache.slide.common.ServiceInitializationFailedException
81      *
82      */
83      public void initialize(NamespaceAccessToken token)
84          throws ServiceInitializationFailedException
85     {
86        initAnalyzer();
87  
88        IndexWriter indexWriter = null;
89        try
90        {
91           indexWriter = new IndexWriter(indexpath, analyzer, false);
92        }
93        // will fail, if not yet exists
94        catch (IOException   e)
95        {
96           try
97           {
98              // create index
99              indexWriter = new IndexWriter(indexpath, analyzer, true);
100          }
101          catch (IOException   ex)
102          {
103             getLogger().log("Error while initializing the Lucene index " + e.getMessage(), LOG_CHANNEL, Logger.ERROR);
104             throw new ServiceInitializationFailedException(this, ex);
105          }
106       }
107 
108       try
109       {
110          indexWriter.close();
111       }
112       catch (IOException   e)
113       {
114           getLogger().log("Error while initializing the Lucene index " + e.getMessage(), LOG_CHANNEL, Logger.ERROR);
115             throw new ServiceInitializationFailedException (this, e);
116 
117       }
118       getLogger().log("Lucene is correctly initialized", LOG_CHANNEL, Logger.INFO);
119     }
120 
121     /**
122      * Index an object content.
123      *
124      * @param uri Uri
125      * @exception IndexException Error accessing the Data Source
126      */
127     synchronized public void createIndex (Uri uri,
128                                           NodeRevisionDescriptor revisionDescriptor,
129                                           NodeRevisionContent revisionContent)
130         throws IndexException
131     {
132       if (!isIncluded(uri.toString())) return;
133       IndexWriter indexWriter = null;
134       try
135       {
136          indexWriter = new IndexWriter(indexpath, analyzer, false);
137 
138          // Create document
139          Document doc = new Document();
140 
141          doc.add(Field.Keyword(URI_FIELD, uri.toString()));
142          doc.add(Field.Text(CONTENT_TEXT, readContent(revisionDescriptor, revisionContent)));
143 
144          if ( revisionContent != null && revisionDescriptor != null ) {
145             ContentExtractor[] extractor = ExtractorManager.getInstance().getContentExtractors(uri.getNamespace().getName(), null, revisionDescriptor);
146                  for ( int i = 0; i < extractor.length; i++ ) {
147                       Reader reader = extractor[i].extract(new ByteArrayInputStream  (revisionContent.getContentBytes()));
148                       doc.add(Field.Text(CONTENT_TEXT, reader));
149                  }
150             }
151 
152             indexWriter.addDocument(doc);
153             indexWriter.optimize();
154 
155             getLogger().log(
156                  "Added '" + uri.toString() + " - " + revisionDescriptor.getRevisionNumber().toString() + "' to index",
157                  LOG_CHANNEL,
158                  Logger.INFO);
159        }
160       catch (IOException   e)
161       {
162          getLogger().log(
163                 "Error creating an index with " + uri.toString() + " - " + revisionDescriptor.getRevisionNumber(),
164                 LOG_CHANNEL,
165                 Logger.ERROR);
166       }
167       catch( ExtractorException e)
168       {
169          getLogger().log(
170             "Error extracting content from " + uri.toString() + " - " + revisionDescriptor.getRevisionNumber(),
171             LOG_CHANNEL,
172             Logger.ERROR);
173       }
174       finally
175       {
176           try
177           {
178               if(indexWriter != null)
179                  indexWriter.close();
180           }
181           catch(IOException   ioe ) {}
182       }
183     }
184 
185     /**
186      * Method updateIndex
187      *
188      * @param    uri                 an Uri
189      * @param    revisionDescriptor  a  NodeRevisionDescriptor
190      * @param    revisionContent     a  NodeRevisionContent
191      *
192      * @throws   IndexException
193      *
194      */
195     synchronized public void updateIndex(Uri uri,
196                                          NodeRevisionDescriptor revisionDescriptor,
197                                          NodeRevisionContent revisionContent)
198       throws IndexException
199     {
200         if (!isIncluded(uri.toString())) return;
201         IndexWriter indexWriter = null;
202         try
203         {
204             // Delete entries from index
205             IndexReader indexReader = IndexReader.open(indexpath);
206             Term term = new Term(URI_FIELD, uri.toString());
207 
208             indexReader.delete(term);
209             indexReader.close();
210 
211             indexWriter = new IndexWriter(indexpath, analyzer, false);
212 
213             // Create document
214             Document doc = new Document();
215 
216             doc.add(Field.Keyword(URI_FIELD, uri.toString()));
217             doc.add(Field.Text(CONTENT_TEXT, readContent(revisionDescriptor, revisionContent)));
218 
219             if ( revisionContent != null && revisionDescriptor != null ) {
220                  ContentExtractor[] extractor = ExtractorManager.getInstance().getContentExtractors(uri.getNamespace().getName(), null, revisionDescriptor);
221                  for ( int i = 0; i < extractor.length; i++ ) {
222                       Reader reader = extractor[i].extract(new ByteArrayInputStream  (revisionContent.getContentBytes()));
223                       doc.add(Field.Text(CONTENT_TEXT, reader));
224                  }
225             }
226 
227             indexWriter.addDocument(doc);
228             indexWriter.optimize();
229             
230             if (getLogger().isEnabled(Logger.DEBUG)) {
231                 getLogger().log(
232                      "Updated '" + uri + " - " + revisionDescriptor.getRevisionNumber() + "' to index",
233                      LOG_CHANNEL,
234                      Logger.DEBUG);
235             }
236         }
237         catch (IOException   e)
238         {
239             getLogger().log(
240                  "Error updating the index with " + uri + " - " + revisionDescriptor.getRevisionNumber(),
241                  LOG_CHANNEL,
242                  Logger.ERROR);
243         }
244         catch( ExtractorException e)
245         {
246             getLogger().log(
247                  "Error extracting content from " + uri + " - " + revisionDescriptor.getRevisionNumber(),
248                  LOG_CHANNEL,
249                  Logger.ERROR);
250         }
251         finally
252        {
253            try
254            {
255                if(indexWriter != null)
256                   indexWriter.close();
257            }
258            catch(IOException   ioe ) {}
259        }
260     }
261 
262     /**
263      * Drop an object revision from the index.
264      *
265      * @param uri Uri
266      * @exception IndexException
267      */
268     synchronized public void dropIndex(Uri uri, NodeRevisionNumber number)
269       throws IndexException
270     {
271         if (!isIncluded(uri.toString())) return;
272         if (number == NodeRevisionNumber.HIDDEN_0_0) return;
273 
274         IndexWriter indexWriter = null;
275         try
276         {
277             IndexReader indexReader = IndexReader.open(indexpath);
278             Term term = new Term(URI_FIELD, uri.toString());
279 
280             indexReader.delete(term);
281             indexReader.close();
282 
283             indexWriter = new IndexWriter(indexpath, analyzer, false);
284             indexWriter.optimize();
285 
286             if (getLogger().isEnabled(Logger.DEBUG)) {
287                 getLogger().log(
288                      "Deleted '" + uri + "' from the index",
289                      LOG_CHANNEL,
290                      Logger.DEBUG);
291             }
292         }
293         catch (IOException   e)
294         {
295             getLogger().log("Impossible to delete " + uri + " - " + number + " from the Lucene index");
296         }
297         finally
298         {
299             try
300             {
301                 if(indexWriter != null)
302                    indexWriter.close();
303             }
304             catch(IOException   ioe )  {}
305         }
306     }
307 
308 
309     /**
310      * Method getFactory
311      *
312      * @return   an IBasicExpressionFactory
313      *
314      */
315    public IBasicExpressionFactory getBasicExpressionFactory()
316    {
317       return new TextContainsExpressionFactory(indexpath, analyzer);
318    }
319 
320 
321   /**
322     * Connects to the underlying data source (if any is needed).
323     *
324     * @exception ServiceConnectionFailedException Connection failed
325     */
326     public void connect() throws ServiceConnectionFailedException
327     {
328         getLogger().log(
329              "TextContentIndexer:  connect",
330              LOG_CHANNEL,
331              Logger.INFO);
332         started = true;
333     }
334 
335   /**
336     * This function tells whether or not the service is connected.
337     *
338     * @return boolean true if we are connected
339     * @exception ServiceAccessException Service access error
340     */
341    public boolean isConnected() throws ServiceAccessException
342    {
343        return started;
344    }
345 
346     /**
347      * Parametrize the service. This index store expects a parameter
348      * "indexpath" to contain the path to the directory to store the index.
349      * Another optional parameter "includes" lists the paths of resources 
350      * that are to be indexed in a comma-separated format. 
351      * Everything under an included path is indexed. If not specified all 
352      * resources will be indexed.
353      * 
354      * @param parameters Hashtable containing the parameters' names
355      * and associated values
356      * @exception ServiceParameterErrorException Incorrect service parameter
357      * @exception ServiceParameterMissingException Service parameter missing
358      */
359    public void setParameters (Hashtable   parameters) throws ServiceParameterErrorException, ServiceParameterMissingException
360    {
361         indexpath = (String  )parameters.get (INDEX_PATH);
362         if (indexpath == null || indexpath.length() == 0) {
363             throw new ServiceParameterMissingException (this, INDEX_PATH);
364         }
365         String   includes = (String  ) parameters.get(INCLUDES);
366         if (includes != null && includes.length() > 0) {
367             StringTokenizer   tokenizer = new StringTokenizer  (includes, ",");
368             this.includes = new ArrayList  (tokenizer.countTokens());
369             while (tokenizer.hasMoreTokens()) {
370                 this.includes.add(tokenizer.nextToken());
371             }
372         }
373         analyzerClassName = (String  )parameters.get (ANALYZER);
374    }
375 
376     /**
377      * Disconnects from the underlying data source.
378      *
379      * @exception ServiceDisconnectionFailedException Disconnection failed
380      */
381     public void disconnect() throws ServiceDisconnectionFailedException
382     {
383         getLogger().log(
384              "TextContentIndexer:  disconnect",
385              LOG_CHANNEL,
386              Logger.INFO);
387         started = false;
388     }
389 
390     /**
391      * Deletes service underlying data source, if possible (and meaningful).
392      *
393      * @exception ServiceResetFailedException Reset failed
394      */
395     public void reset() throws ServiceResetFailedException
396     {
397         getLogger().log(
398              "TextContentIndexer:  reset",
399              LOG_CHANNEL,
400              Logger.INFO);
401    }
402 
403     protected Reader readContent(NodeRevisionDescriptor revisionDescriptor, 
404                                  NodeRevisionContent revisionContent) throws IOException   {
405         return new CharArrayReader   (revisionContent.getContent());
406     }
407     
408     protected boolean isIncluded(String   uri) {
409         if (includes == null) return true;
410         Iterator   iter = includes.iterator();
411         while (iter.hasNext()) {
412             if (uri.startsWith((String  ) iter.next())) {
413                 return true;
414             }
415         }
416         return false;
417     }
418 
419 
420     protected void initAnalyzer() throws ServiceInitializationFailedException {
421 
422         if (analyzerClassName == null || analyzerClassName.length() == 0) {
423             getLogger().log("using Lucene StandardAnalyzer", LOG_CHANNEL, Logger.INFO);
424             analyzer = new StandardAnalyzer();
425 
426         } else {
427             getLogger().log("loading Lucene analyzer: " + analyzerClassName, LOG_CHANNEL, Logger.INFO);
428 
429             try {
430                 Class   analyzerClazz = Class.forName(analyzerClassName);
431                 analyzer = (Analyzer)analyzerClazz.newInstance();
432 
433             } catch (ClassNotFoundException   cnfe) {
434                 getLogger().log("Error while instantiating analyzer " + 
435                                 analyzerClassName + cnfe.getMessage(), LOG_CHANNEL, Logger.ERROR);
436                 throw new ServiceInitializationFailedException(this, cnfe);
437 
438             } catch (InstantiationException   ie) {
439                 getLogger().log("Error while instantiating analyzer " + 
440                                 analyzerClassName + ie.getMessage(), LOG_CHANNEL, Logger.ERROR);
441                 throw new ServiceInitializationFailedException(this, ie);
442 
443             } catch (IllegalAccessException   iae) {
444                 getLogger().log("Error while instantiating analyzer " + 
445                                 analyzerClassName + iae.getMessage(), LOG_CHANNEL, Logger.ERROR);
446                 throw new ServiceInitializationFailedException(this, iae);
447             }
448         }
449     }
450 
451 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags