LuceneSearchIndex


1   package org.tigris.scarab.util.word;
2   
3   /* ================================================================
4    * Copyright (c) 2001 Collab.Net.  All rights reserved.
5    *
6    * Redistribution and use in source and binary forms, with or without
7    * modification, are permitted provided that the following conditions are
8    * met:
9    *
10   * 1. Redistributions of source code must retain the above copyright
11   * notice, this list of conditions and the following disclaimer.
12   *
13   * 2. Redistributions in binary form must reproduce the above copyright
14   * notice, this list of conditions and the following disclaimer in the
15   * documentation and/or other materials provided with the distribution.
16   *
17   * 3. The end-user documentation included with the redistribution, if
18   * any, must include the following acknowlegement: "This product includes
19   * software developed by Collab.Net <http://www.Collab.Net/>."
20   * Alternately, this acknowlegement may appear in the software itself, if
21   * and wherever such third-party acknowlegements normally appear.
22   *
23   * 4. The hosted project names must not be used to endorse or promote
24   * products derived from this software without prior written
25   * permission. For written permission, please contact info@collab.net.
26   *
27   * 5. Products derived from this software may not use the "Tigris" or
28   * "Scarab" names nor may "Tigris" or "Scarab" appear in their names without
29   * prior written permission of Collab.Net.
30   *
31   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
32   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
33   * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
34   * IN NO EVENT SHALL COLLAB.NET OR ITS CONTRIBUTORS BE LIABLE FOR ANY
35   * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36   * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
37   * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
38   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
39   * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
40   * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
41   * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42   *
43   * ====================================================================
44   *
45   * This software consists of voluntary contributions made by many
46   * individuals on behalf of Collab.Net.
47   */
48  
49  // JDK classes
50  import java.io.File  ;
51  import java.io.IOException  ;
52  import java.util.ArrayList  ;
53  import java.util.HashMap  ;
54  import java.util.Iterator  ;
55  import java.util.List  ;
56  import java.util.Map  ;
57  
58  import org.apache.avalon.framework.activity.Initializable;
59  import org.apache.avalon.framework.configuration.Configurable;
60  import org.apache.avalon.framework.configuration.Configuration;
61  import org.apache.avalon.framework.context.Context;
62  import org.apache.avalon.framework.context.ContextException;
63  import org.apache.avalon.framework.context.Contextualizable;
64  import org.apache.lucene.document.Document;
65  import org.apache.lucene.document.Field;
66  import org.apache.lucene.index.IndexReader;
67  import org.apache.lucene.index.IndexWriter;
68  import org.apache.lucene.index.Term;
69  import org.apache.lucene.queryParser.QueryParser;
70  import org.apache.lucene.search.Hits;
71  import org.apache.lucene.search.IndexSearcher;
72  import org.apache.lucene.search.Query;
73  import org.apache.torque.util.Criteria;
74  import org.tigris.scarab.om.Attachment;
75  import org.tigris.scarab.om.AttachmentPeer;
76  import org.tigris.scarab.om.AttributeValue;
77  import org.tigris.scarab.om.AttributeValuePeer;
78  import org.tigris.scarab.om.IssuePeer;
79  import org.tigris.scarab.tools.localization.L10NKeySet;
80  import org.tigris.scarab.tools.localization.L10NMessage;
81  import org.tigris.scarab.tools.localization.Localizable;
82  import org.tigris.scarab.util.Log;
83  import org.tigris.scarab.util.ScarabException;
84  
85  import com.workingdogs.village.Record;
86  
87  /**
88   * Support for searching/indexing text
89   *
90   * @author <a HREF="mailto:jmcnally@collab.net">John McNally</a>
91   * @version $Id: LuceneSearchIndex.java 9381 2005-01-09 15:28:09Z dabbous $
92   */
93  public class LuceneSearchIndex 
94      implements SearchIndex, Configurable,Contextualizable,Initializable
95  {
96      private String   applicationRoot;
97      // used to occasionally optimize the index
98      private static int counter = 0;
99  
100     /** the location of the index */
101     private String   path;
102 
103     /** the attributes that will be searched */
104     private List   attributeIds;
105 
106     /** the words and boolean operators */
107     private List   queryText;
108 
109     /** the attachments that will be searched */
110     private List   attachmentIds;
111 
112     /** the words and boolean operators */
113     private List   attachmentQueryText;
114 
115     /**
116      * Ctor.  Sets up an index directory if one does not yet exist in the
117      * path specified by searchindex.path property in Scarab.properties.
118      */
119     public LuceneSearchIndex()
120         throws IOException  
121     {
122         
123         
124     }
125 
126     public void addQuery(Integer  [] ids, String   text)
127     {
128         attributeIds.add(ids);
129         queryText.add(text);
130     }
131 
132     public void addAttachmentQuery(Integer  [] ids, String   text)
133     {
134         attachmentIds.add(ids);
135         attachmentQueryText.add(text);
136     }
137 
138     public Long  [] getRelatedIssues()
139     throws Exception  
140     {
141         return getRelatedIssues(false); // perform AND operation
142     }
143 
144     /**
145      *  returns a list of related issue IDs sorted by relevance descending.
146      *  Should return an empty/length=0 array if search returns no results.
147      *  If mergeResults==true, internally merges results of partial queries,
148      *  otherwise performs an implicit AND operation on partial queries.
149      */
150     public Long  [] getRelatedIssues(boolean mergeResults) 
151         throws Exception  
152     {
153         Long  [] result;
154         List   issueIds = null; 
155         // if there are no words to search for return no results 
156         if (queryText.size() != 0 || attachmentQueryText.size() != 0)
157         {
158             // attributes
159             for (int j=attributeIds.size()-1; j>=0; j--) 
160             {
161                 Integer  [] ids = (Integer  [])attributeIds.get(j);
162                 String   query = (String  ) queryText.get(j);
163                 issueIds = performPartialQuery(ATTRIBUTE_ID, 
164                                                ids, query, issueIds,
165                                                mergeResults);
166             }
167 
168             // attachments
169             for (int j=attachmentIds.size()-1; j>=0; j--) 
170             {
171                 Integer  [] ids = (Integer  [])attachmentIds.get(j);
172                 String   query = (String  ) attachmentQueryText.get(j);
173                 issueIds = performPartialQuery(ATTACHMENT_TYPE_ID, 
174                                                ids, query, issueIds,
175                                                mergeResults);
176             }
177 
178             // put results into final form
179             result = new Long  [issueIds.size()];
180             for (int i=0; i<issueIds.size(); i++) 
181             {
182                 result[i] = (Long  )issueIds.get(i);
183             }
184         }
185         else
186         {
187             result = EMPTY_LIST; 
188         }
189         
190         return result;
191     }
192 
193     private List   performPartialQuery(String   key, Integer  [] ids, 
194                                      String   query, List   issueIds,
195                                      boolean mergeResults)
196         throws ScarabException, IOException  
197     {
198         StringBuffer   fullQuery = new StringBuffer  (query.length()+100);
199         
200         if (query.length() > 0)
201         {
202             query.trim();
203         }
204         
205                 if (ids != null && ids.length != 0) 
206                 {
207                     fullQuery.append("+((");
208                     for (int i=ids.length-1; i>=0; i--) 
209                     {
210                         fullQuery.append(key)
211                             .append(':')
212                             .append(ids[i].toString());
213                         if (i != 0) 
214                         {
215                             fullQuery.append(" OR ");
216                         }
217                     }
218                     fullQuery.append(") AND (")
219                         .append(query)
220                         .append("))");            
221                 }
222                 else
223                 {
224                     fullQuery
225                         .append("+(")
226                         .append(query)
227                         .append(')');
228                 }
229                 
230                 Query q = null;
231                 try
232                 {
233                     Log.get().debug("Querybefore=" + fullQuery);
234                     q = QueryParser.parse(fullQuery.toString(), TEXT, 
235                                           new PorterStemAnalyzer());
236                     Log.get().debug("Queryafter=" + q.toString("text"));
237                 }
238                 catch (Throwable   t)
239                 {
240                     throw new ScarabException(
241                             L10NKeySet.ExceptionParseError,
242                             fullQuery,
243                             t);
244                 }
245                 
246                 IndexSearcher is = new IndexSearcher(path); 
247                 Hits hits = is.search(q);
248                 // remove duplicates
249                 Map   deduper = new HashMap  ((int)(1.25*hits.length()+1));
250                 for (int i=0; i<hits.length(); i++) 
251                 {
252                     deduper.put(hits.doc(i).get(ISSUE_ID), null);
253                     Log.get().debug("Possible issueId from search: " + 
254                                   hits.doc(i).get(ISSUE_ID));
255                 }
256                 is.close();
257                 
258                 if (issueIds == null) 
259                 {
260                     issueIds = new ArrayList  (deduper.size());
261                     Iterator   iter = deduper.keySet().iterator();
262                     while (iter.hasNext()) 
263                     {
264                         issueIds.add(new Long  ((String  )iter.next()));
265                         Log.get().debug("Adding issueId from search: " + 
266                                   issueIds.get(issueIds.size()-1));
267                     }
268                 }
269                 else 
270                 {
271                     if (mergeResults)
272                     {
273                         // perform OR operation
274                         mergeResults(issueIds, deduper);
275                     }
276                     else
277                     {
278                         // perform an AND operation
279                         removeUniqueElements(issueIds, deduper);
280                     }
281                 }
282         return issueIds;
283     }
284 
285     /**
286      * Elements from the list that are not in map are removed from the list
287      */
288     private void removeUniqueElements(List   list, Map   map)
289     {
290         for (int i=list.size()-1; i>=0; i--) 
291         {
292             Object   obj = list.get(i);
293             if (!map.containsKey(obj.toString())) 
294             {
295                 Log.get().debug("removing issueId from search: " + obj);
296                 list.remove(i);
297             }
298         }
299     }
300 
301     /**
302      * Elements from the map, which are not in list are added to the list
303      */
304     private void mergeResults(List   list, Map   map)
305     {
306         for (int i=list.size()-1; i>=0; i--) 
307         {
308             Long   issueId = (Long  )list.get(i);
309             String   id = issueId.toString();
310             if (map.containsKey(id)) 
311             {
312                 map.remove(id);
313                 Log.get().debug("removed duplicate issueId from map: " + id);
314             }
315         }
316         Iterator   iter = map.keySet().iterator();
317         while(iter.hasNext())
318         {
319             String   id = (String  )iter.next();
320             list.add(new Long  (Long.parseLong(id)));
321             Log.get().debug("Add issueId from map to List: " + id);
322         }
323     }
324 
325 
326     /**
327      * Store index information for an AttributeValue
328      */
329     public void index(AttributeValue attributeValue)
330         throws Exception  
331     {
332         String   valId = attributeValue.getValueId().toString();
333 
334         // make sure any old data stored for this attribute value is deleted.
335         Term term = new Term(VALUE_ID, valId);
336         int deletedDocs = 0;
337         try
338         {
339             synchronized (getClass())
340             {
341                 IndexReader reader = null;
342                 try
343                 {
344                     reader = IndexReader.open(path);
345                     deletedDocs = reader.delete(term);
346                 }
347                 finally
348                 {
349                     if (reader != null) 
350                     {
351                         reader.close();
352                     }
353                 }
354             }
355         }
356         catch (NullPointerException   npe)
357         {
358             /* Lucene is throwing npe in reader.delete, so have to explicitely
359                search.  Not sure if the npe will be thrown in the 
360                case where the attribute has previously been indexed, so
361                test whether the npe is harmful.
362             */
363             IndexSearcher is = new IndexSearcher(path); 
364             Query q = QueryParser.parse("+" + VALUE_ID + ":" + valId, TEXT, 
365                                         new PorterStemAnalyzer());
366             Hits hits = is.search(q);
367             if (hits.length() > 0) 
368             {
369                 Localizable l10nInstance = new L10NMessage(L10NKeySet.ExceptionLucene, valId, npe);
370                 Log.get().debug(l10nInstance.getMessage());//[HD: create english message for logging!
371                 throw new ScarabException(l10nInstance);
372             }
373         }
374         if (deletedDocs > 1) 
375         {
376             throw new ScarabException(L10NKeySet.ExceptionMultipleAttValues,
377                                       valId);
378         }
379         /*
380         System.out.println("deleting valId: " + valId);
381         IndexSearcher is = new IndexSearcher(path); 
382         Hits hits = is.search("+" + VALUE_ID + ":" + valId);
383         System.out.println("deleting previous: " + hits.length());
384         if (hits.length() > 1) 
385         {
386             throw new ScarabException("Multiple AttributeValues in Lucene" +
387                                       "index with same ValueId: " + valId);
388         }
389         Document doc = hits.doc(0);
390         */
391 
392         if (attributeValue.getValue() == null) 
393         {
394             Log.get().warn("Attribute value pk=" + valId + 
395                            " has a null value.");
396         }
397         else 
398         {
399             Document doc = new Document();
400             Field valueId = Field.Keyword(VALUE_ID, valId);
401             Field issueId = Field.UnIndexed(ISSUE_ID, 
402                 attributeValue.getIssueId().toString());
403             Field attributeId = Field.Keyword(ATTRIBUTE_ID, 
404                 attributeValue.getAttributeId().toString());
405             Field text = Field.UnStored(TEXT, attributeValue.getValue());
406             doc.add(valueId);
407             doc.add(issueId);
408             doc.add(attributeId);
409             doc.add(text);
410             addDoc(doc);
411         }    
412     }
413 
414     private void addDoc(Document doc)
415         throws IOException  
416     {
417         synchronized (getClass())
418         {
419             IndexWriter indexer = null;
420             try
421             {
422                 indexer = new IndexWriter(path, 
423                                           new PorterStemAnalyzer(), false);
424                 indexer.addDocument(doc);
425                 
426                 if (++counter % 100 == 0) 
427                 {
428                     indexer.optimize();
429                 }
430             }
431             finally
432             {
433                 if (indexer != null) 
434                 {
435                     indexer.close();                    
436                 }
437             }
438         }
439     }        
440 
441     /**
442      * Store index information for an Attachment
443      */
444     public void index(Attachment attachment)
445         throws Exception  
446     {
447         String   attId = attachment.getAttachmentId().toString();
448 
449         // make sure any old data stored for this attribute value is deleted.
450         Term term = new Term(ATTACHMENT_ID, attId);
451         int deletedDocs = 0;
452         try
453         {
454             synchronized (getClass())
455             {
456                 IndexReader reader = null;
457                 try
458                 {
459                     reader = IndexReader.open(path);
460                     deletedDocs = reader.delete(term);
461                 }
462                 finally
463                 {
464                     if (reader != null) 
465                     {
466                         reader.close();
467                     }
468                 }
469             }
470         }
471         catch (NullPointerException   npe)
472         {
473             /* Lucene is throwing npe in reader.delete, so have to explicitely
474                search.  Not sure if the npe will be thrown in the 
475                case where the attribute has previously been indexed, so
476                test whether the npe is harmful.
477             */
478             IndexSearcher is = new IndexSearcher(path); 
479             Query q = QueryParser.parse("+" + ATTACHMENT_ID + ":" + attId, 
480                                         TEXT, new PorterStemAnalyzer());
481             Hits hits = is.search(q);
482             if (hits.length() > 0) 
483             {
484                 Localizable l10nInstance = new L10NMessage(L10NKeySet.ExceptionLucene, attId, npe);
485                 Log.get().debug(l10nInstance.getMessage());//[HD: create english message for logging!
486                 throw new ScarabException(l10nInstance);
487             }
488         }
489         if (deletedDocs > 1) 
490         {
491             throw new ScarabException(L10NKeySet.ExceptionMultipleAttachements,
492                                       attId);
493         }
494 
495 
496         if (attachment.getData() == null) 
497         {
498             Log.get().warn("Attachment pk=" + attId + " has a null data.");
499         }
500         else 
501         {
502             Document doc = new Document();
503             Field attachmentId = Field.Keyword(ATTACHMENT_ID, attId);
504             Field issueId = Field.UnIndexed(ISSUE_ID, 
505                 attachment.getIssueId().toString());
506             Field typeId = Field.Keyword(ATTACHMENT_TYPE_ID, 
507                 attachment.getTypeId().toString());
508             Field text = Field.UnStored(TEXT, attachment.getData());
509             doc.add(attachmentId);
510             doc.add(issueId);
511             doc.add(typeId);
512             doc.add(text);
513             addDoc(doc);
514         }            
515     }
516 
517     /**
518      * update the index for all entities that currently exist
519      */
520     public void updateIndex()
521         throws Exception  
522     {
523         // find estimate of max id
524         Criteria crit = new Criteria();
525         crit.addSelectColumn("max(" + AttributeValuePeer.VALUE_ID + ")");
526         List   records = AttributeValuePeer.doSelectVillageRecords(crit);
527         long max = ((Record)records.get(0)).getValue(1).asLong();
528         
529         long i = 0L;
530         List   avs = null;
531         do
532         {
533             crit = new Criteria();
534             Criteria.Criterion low = crit.getNewCriterion(
535                  AttributeValuePeer.VALUE_ID, 
536                  new Long  (i), Criteria.GREATER_THAN);
537             i += 100L;
538             Criteria.Criterion high = crit.getNewCriterion(
539                 AttributeValuePeer.VALUE_ID, 
540                 new Long  (i), Criteria.LESS_EQUAL);
541             crit.add(low.and(high));
542             crit.add(AttributeValuePeer.DELETED, false);
543             // don't index issues that have been deleted
544             crit.addJoin(AttributeValuePeer.ISSUE_ID, IssuePeer.ISSUE_ID);
545             crit.add(IssuePeer.DELETED, false);
546             avs = AttributeValuePeer.doSelect(crit);
547             if (!avs.isEmpty()) 
548             {
549                 Iterator   avi = avs.iterator();
550                 while (avi.hasNext()) 
551                 {
552                     AttributeValue av = (AttributeValue)avi.next();
553                     index(av);
554                 }
555                 if (Log.get().isDebugEnabled()) 
556                 {
557                     Log.get().debug("Updated index for attribute values (" + 
558                         (i-100L) + "-" + i + "]");                    
559                     Log.debugMemory();
560                 }                
561             }  
562         }
563         while (i<max || !avs.isEmpty());
564 
565         // Attachments
566 
567         crit = new Criteria();
568         crit.addSelectColumn("max(" + AttachmentPeer.ATTACHMENT_ID + ")");
569         records = AttachmentPeer.doSelectVillageRecords(crit);
570         max = ((Record)records.get(0)).getValue(1).asLong();
571         i = 0L;
572         List   atts = null;
573         do
574         {
575             crit = new Criteria();
576             Criteria.Criterion low = crit.getNewCriterion(
577                  AttachmentPeer.ATTACHMENT_ID, 
578                  new Long  (i), Criteria.GREATER_THAN);
579             i += 100L;
580             Criteria.Criterion high = crit.getNewCriterion(
581                 AttachmentPeer.ATTACHMENT_ID, 
582                 new Long  (i), Criteria.LESS_EQUAL);
583             crit.add(low.and(high));
584             crit.add(AttachmentPeer.DELETED, false);
585             // don't index issues that have been deleted
586             crit.addJoin(AttachmentPeer.ISSUE_ID, IssuePeer.ISSUE_ID);
587             crit.add(IssuePeer.DELETED, false);
588             atts = AttachmentPeer.doSelect(crit);
589             if (!atts.isEmpty()) 
590             {
591                 Iterator   atti = atts.iterator();
592                 while (atti.hasNext()) 
593                 {
594                     Attachment att = (Attachment)atti.next();
595                     if (att.getData() != null && att.getData().length() > 0 &&
596                         att.getIssueId() != null && att.getTypeId() != null) 
597                     {
598                         index(att);
599                     }                    
600                 }
601                 
602                 if (Log.get().isDebugEnabled()) 
603                 {
604                     Log.get().debug("Updated index for attachments (" + 
605                         (i-100L) + "-" + i + "]");                    
606                     Log.debugMemory();
607                 }                
608             }  
609         }
610         while (i<max || !atts.isEmpty());
611 
612         // finish off with an optimized index
613         synchronized (getClass())
614         {
615             IndexWriter indexer = null;
616             try
617             {
618                 indexer = new IndexWriter(path, 
619                                           new PorterStemAnalyzer(), false);
620                 indexer.optimize();
621             }
622             finally
623             {
624                 if (indexer != null) 
625                 {
626                     indexer.close();                    
627                 }
628             }
629         }
630     }
631     
632     // ---------------- Avalon Lifecycle Methods ---------------------
633     /**
634      * Avalon component lifecycle method
635      */
636     public void configure(Configuration conf) 
637     {
638         path = conf.getAttribute(INDEX_PATH, null);
639         
640       
641         
642     }
643     
644     /**
645      * @see org.apache.avalon.framework.context.Contextualizable
646      * @avalon.entry key="urn:avalon:home" type="java.io.File"
647      */    
648     public void contextualize(Context context) throws ContextException
649     {
650         this.applicationRoot = context.get( "urn:avalon:home" ).toString();
651     }    
652     
653     /**
654      * Avalon component lifecycle method
655      * Initializes the service by loading default class loaders
656      * and customized object factories.
657      *
658      * @throws InitializationException if initialization fails.
659      */
660     public void initialize() throws Exception  
661     {
662 
663    
664         File   indexDir = new File  (path);
665         if (!indexDir.isAbsolute()) 
666         {
667             path = getRealPath(path);
668             indexDir = new File  (path);
669         }          
670 
671         boolean createIndex = false;
672         if (indexDir.exists()) 
673         {
674             if (indexDir.listFiles().length == 0) 
675             {
676                 createIndex = true;
677             }       
678         }
679         else 
680         {
681             indexDir.mkdirs();
682             createIndex = true;
683         }
684         
685         if (createIndex)
686         {
687             Log.get().info("Creating index at '" + path + '\'');
688             synchronized (getClass())
689             {
690                 IndexWriter indexer = null;
691                 try
692                 {
693                     indexer = 
694                         new IndexWriter(path, new PorterStemAnalyzer(), true);
695                 }
696                 finally
697                 {
698                     if (indexer != null) 
699                     {
700                         indexer.close();                           
701                     }
702                 }
703             }
704         }        
705 
706         clear();
707     }
708     
709     private String   getRealPath(String   path)
710     {
711         String   absolutePath = null;
712         if (applicationRoot == null)
713         {
714             absolutePath = new File  (path).getAbsolutePath();
715         }
716         else
717         {
718             absolutePath = new File  (applicationRoot, path).getAbsolutePath();
719         }
720         return absolutePath;
721     }
722 
723     /* (non-Javadoc)
724      * @see org.tigris.scarab.util.word.SearchIndex#clear()
725      */
726     public void clear()
727     {
728         attributeIds        = new ArrayList  (5);
729         queryText           = new ArrayList  (5);
730         attachmentIds       = new ArrayList  (2);
731         attachmentQueryText = new ArrayList  (2);
732     }
733     
734 }
735
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags