KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > tigris > scarab > util > word > LuceneSearchIndex


1 package org.tigris.scarab.util.word;
2
3 /* ================================================================
4  * Copyright (c) 2001 Collab.Net. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are
8  * met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in the
15  * documentation and/or other materials provided with the distribution.
16  *
17  * 3. The end-user documentation included with the redistribution, if
18  * any, must include the following acknowlegement: "This product includes
19  * software developed by Collab.Net <http://www.Collab.Net/>."
20  * Alternately, this acknowlegement may appear in the software itself, if
21  * and wherever such third-party acknowlegements normally appear.
22  *
23  * 4. The hosted project names must not be used to endorse or promote
24  * products derived from this software without prior written
25  * permission. For written permission, please contact info@collab.net.
26  *
27  * 5. Products derived from this software may not use the "Tigris" or
28  * "Scarab" names nor may "Tigris" or "Scarab" appear in their names without
29  * prior written permission of Collab.Net.
30  *
31  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
32  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
33  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
34  * IN NO EVENT SHALL COLLAB.NET OR ITS CONTRIBUTORS BE LIABLE FOR ANY
35  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
37  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
38  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
39  * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
40  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
41  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42  *
43  * ====================================================================
44  *
45  * This software consists of voluntary contributions made by many
46  * individuals on behalf of Collab.Net.
47  */

48
49 // JDK classes
50
import java.io.File JavaDoc;
51 import java.io.IOException JavaDoc;
52 import java.util.ArrayList JavaDoc;
53 import java.util.HashMap JavaDoc;
54 import java.util.Iterator JavaDoc;
55 import java.util.List JavaDoc;
56 import java.util.Map JavaDoc;
57
58 import org.apache.avalon.framework.activity.Initializable;
59 import org.apache.avalon.framework.configuration.Configurable;
60 import org.apache.avalon.framework.configuration.Configuration;
61 import org.apache.avalon.framework.context.Context;
62 import org.apache.avalon.framework.context.ContextException;
63 import org.apache.avalon.framework.context.Contextualizable;
64 import org.apache.lucene.document.Document;
65 import org.apache.lucene.document.Field;
66 import org.apache.lucene.index.IndexReader;
67 import org.apache.lucene.index.IndexWriter;
68 import org.apache.lucene.index.Term;
69 import org.apache.lucene.queryParser.QueryParser;
70 import org.apache.lucene.search.Hits;
71 import org.apache.lucene.search.IndexSearcher;
72 import org.apache.lucene.search.Query;
73 import org.apache.torque.util.Criteria;
74 import org.tigris.scarab.om.Attachment;
75 import org.tigris.scarab.om.AttachmentPeer;
76 import org.tigris.scarab.om.AttributeValue;
77 import org.tigris.scarab.om.AttributeValuePeer;
78 import org.tigris.scarab.om.IssuePeer;
79 import org.tigris.scarab.tools.localization.L10NKeySet;
80 import org.tigris.scarab.tools.localization.L10NMessage;
81 import org.tigris.scarab.tools.localization.Localizable;
82 import org.tigris.scarab.util.Log;
83 import org.tigris.scarab.util.ScarabException;
84
85 import com.workingdogs.village.Record;
86
87 /**
88  * Support for searching/indexing text
89  *
90  * @author <a HREF="mailto:jmcnally@collab.net">John McNally</a>
91  * @version $Id: LuceneSearchIndex.java 9381 2005-01-09 15:28:09Z dabbous $
92  */

93 public class LuceneSearchIndex
94     implements SearchIndex, Configurable,Contextualizable,Initializable
95 {
96     private String JavaDoc applicationRoot;
97     // used to occasionally optimize the index
98
private static int counter = 0;
99
100     /** the location of the index */
101     private String JavaDoc path;
102
103     /** the attributes that will be searched */
104     private List JavaDoc attributeIds;
105
106     /** the words and boolean operators */
107     private List JavaDoc queryText;
108
109     /** the attachments that will be searched */
110     private List JavaDoc attachmentIds;
111
112     /** the words and boolean operators */
113     private List JavaDoc attachmentQueryText;
114
115     /**
116      * Ctor. Sets up an index directory if one does not yet exist in the
117      * path specified by searchindex.path property in Scarab.properties.
118      */

119     public LuceneSearchIndex()
120         throws IOException JavaDoc
121     {
122         
123         
124     }
125
126     public void addQuery(Integer JavaDoc[] ids, String JavaDoc text)
127     {
128         attributeIds.add(ids);
129         queryText.add(text);
130     }
131
132     public void addAttachmentQuery(Integer JavaDoc[] ids, String JavaDoc text)
133     {
134         attachmentIds.add(ids);
135         attachmentQueryText.add(text);
136     }
137
138     public Long JavaDoc[] getRelatedIssues()
139     throws Exception JavaDoc
140     {
141         return getRelatedIssues(false); // perform AND operation
142
}
143
144     /**
145      * returns a list of related issue IDs sorted by relevance descending.
146      * Should return an empty/length=0 array if search returns no results.
147      * If mergeResults==true, internally merges results of partial queries,
148      * otherwise performs an implicit AND operation on partial queries.
149      */

150     public Long JavaDoc[] getRelatedIssues(boolean mergeResults)
151         throws Exception JavaDoc
152     {
153         Long JavaDoc[] result;
154         List JavaDoc issueIds = null;
155         // if there are no words to search for return no results
156
if (queryText.size() != 0 || attachmentQueryText.size() != 0)
157         {
158             // attributes
159
for (int j=attributeIds.size()-1; j>=0; j--)
160             {
161                 Integer JavaDoc[] ids = (Integer JavaDoc[])attributeIds.get(j);
162                 String JavaDoc query = (String JavaDoc) queryText.get(j);
163                 issueIds = performPartialQuery(ATTRIBUTE_ID,
164                                                ids, query, issueIds,
165                                                mergeResults);
166             }
167
168             // attachments
169
for (int j=attachmentIds.size()-1; j>=0; j--)
170             {
171                 Integer JavaDoc[] ids = (Integer JavaDoc[])attachmentIds.get(j);
172                 String JavaDoc query = (String JavaDoc) attachmentQueryText.get(j);
173                 issueIds = performPartialQuery(ATTACHMENT_TYPE_ID,
174                                                ids, query, issueIds,
175                                                mergeResults);
176             }
177
178             // put results into final form
179
result = new Long JavaDoc[issueIds.size()];
180             for (int i=0; i<issueIds.size(); i++)
181             {
182                 result[i] = (Long JavaDoc)issueIds.get(i);
183             }
184         }
185         else
186         {
187             result = EMPTY_LIST;
188         }
189         
190         return result;
191     }
192
193     private List JavaDoc performPartialQuery(String JavaDoc key, Integer JavaDoc[] ids,
194                                      String JavaDoc query, List JavaDoc issueIds,
195                                      boolean mergeResults)
196         throws ScarabException, IOException JavaDoc
197     {
198         StringBuffer JavaDoc fullQuery = new StringBuffer JavaDoc(query.length()+100);
199         
200         if (query.length() > 0)
201         {
202             query.trim();
203         }
204         
205                 if (ids != null && ids.length != 0)
206                 {
207                     fullQuery.append("+((");
208                     for (int i=ids.length-1; i>=0; i--)
209                     {
210                         fullQuery.append(key)
211                             .append(':')
212                             .append(ids[i].toString());
213                         if (i != 0)
214                         {
215                             fullQuery.append(" OR ");
216                         }
217                     }
218                     fullQuery.append(") AND (")
219                         .append(query)
220                         .append("))");
221                 }
222                 else
223                 {
224                     fullQuery
225                         .append("+(")
226                         .append(query)
227                         .append(')');
228                 }
229                 
230                 Query q = null;
231                 try
232                 {
233                     Log.get().debug("Querybefore=" + fullQuery);
234                     q = QueryParser.parse(fullQuery.toString(), TEXT,
235                                           new PorterStemAnalyzer());
236                     Log.get().debug("Queryafter=" + q.toString("text"));
237                 }
238                 catch (Throwable JavaDoc t)
239                 {
240                     throw new ScarabException(
241                             L10NKeySet.ExceptionParseError,
242                             fullQuery,
243                             t);
244                 }
245                 
246                 IndexSearcher is = new IndexSearcher(path);
247                 Hits hits = is.search(q);
248                 // remove duplicates
249
Map JavaDoc deduper = new HashMap JavaDoc((int)(1.25*hits.length()+1));
250                 for (int i=0; i<hits.length(); i++)
251                 {
252                     deduper.put(hits.doc(i).get(ISSUE_ID), null);
253                     Log.get().debug("Possible issueId from search: " +
254                                   hits.doc(i).get(ISSUE_ID));
255                 }
256                 is.close();
257                 
258                 if (issueIds == null)
259                 {
260                     issueIds = new ArrayList JavaDoc(deduper.size());
261                     Iterator JavaDoc iter = deduper.keySet().iterator();
262                     while (iter.hasNext())
263                     {
264                         issueIds.add(new Long JavaDoc((String JavaDoc)iter.next()));
265                         Log.get().debug("Adding issueId from search: " +
266                                   issueIds.get(issueIds.size()-1));
267                     }
268                 }
269                 else
270                 {
271                     if (mergeResults)
272                     {
273                         // perform OR operation
274
mergeResults(issueIds, deduper);
275                     }
276                     else
277                     {
278                         // perform an AND operation
279
removeUniqueElements(issueIds, deduper);
280                     }
281                 }
282         return issueIds;
283     }
284
285     /**
286      * Elements from the list that are not in map are removed from the list
287      */

288     private void removeUniqueElements(List JavaDoc list, Map JavaDoc map)
289     {
290         for (int i=list.size()-1; i>=0; i--)
291         {
292             Object JavaDoc obj = list.get(i);
293             if (!map.containsKey(obj.toString()))
294             {
295                 Log.get().debug("removing issueId from search: " + obj);
296                 list.remove(i);
297             }
298         }
299     }
300
301     /**
302      * Elements from the map, which are not in list are added to the list
303      */

304     private void mergeResults(List JavaDoc list, Map JavaDoc map)
305     {
306         for (int i=list.size()-1; i>=0; i--)
307         {
308             Long JavaDoc issueId = (Long JavaDoc)list.get(i);
309             String JavaDoc id = issueId.toString();
310             if (map.containsKey(id))
311             {
312                 map.remove(id);
313                 Log.get().debug("removed duplicate issueId from map: " + id);
314             }
315         }
316         Iterator JavaDoc iter = map.keySet().iterator();
317         while(iter.hasNext())
318         {
319             String JavaDoc id = (String JavaDoc)iter.next();
320             list.add(new Long JavaDoc(Long.parseLong(id)));
321             Log.get().debug("Add issueId from map to List: " + id);
322         }
323     }
324
325
326     /**
327      * Store index information for an AttributeValue
328      */

329     public void index(AttributeValue attributeValue)
330         throws Exception JavaDoc
331     {
332         String JavaDoc valId = attributeValue.getValueId().toString();
333
334         // make sure any old data stored for this attribute value is deleted.
335
Term term = new Term(VALUE_ID, valId);
336         int deletedDocs = 0;
337         try
338         {
339             synchronized (getClass())
340             {
341                 IndexReader reader = null;
342                 try
343                 {
344                     reader = IndexReader.open(path);
345                     deletedDocs = reader.delete(term);
346                 }
347                 finally
348                 {
349                     if (reader != null)
350                     {
351                         reader.close();
352                     }
353                 }
354             }
355         }
356         catch (NullPointerException JavaDoc npe)
357         {
358             /* Lucene is throwing npe in reader.delete, so have to explicitely
359                search. Not sure if the npe will be thrown in the
360                case where the attribute has previously been indexed, so
361                test whether the npe is harmful.
362             */

363             IndexSearcher is = new IndexSearcher(path);
364             Query q = QueryParser.parse("+" + VALUE_ID + ":" + valId, TEXT,
365                                         new PorterStemAnalyzer());
366             Hits hits = is.search(q);
367             if (hits.length() > 0)
368             {
369                 Localizable l10nInstance = new L10NMessage(L10NKeySet.ExceptionLucene, valId, npe);
370                 Log.get().debug(l10nInstance.getMessage());//[HD: create english message for logging!
371
throw new ScarabException(l10nInstance);
372             }
373         }
374         if (deletedDocs > 1)
375         {
376             throw new ScarabException(L10NKeySet.ExceptionMultipleAttValues,
377                                       valId);
378         }
379         /*
380         System.out.println("deleting valId: " + valId);
381         IndexSearcher is = new IndexSearcher(path);
382         Hits hits = is.search("+" + VALUE_ID + ":" + valId);
383         System.out.println("deleting previous: " + hits.length());
384         if (hits.length() > 1)
385         {
386             throw new ScarabException("Multiple AttributeValues in Lucene" +
387                                       "index with same ValueId: " + valId);
388         }
389         Document doc = hits.doc(0);
390         */

391
392         if (attributeValue.getValue() == null)
393         {
394             Log.get().warn("Attribute value pk=" + valId +
395                            " has a null value.");
396         }
397         else
398         {
399             Document doc = new Document();
400             Field valueId = Field.Keyword(VALUE_ID, valId);
401             Field issueId = Field.UnIndexed(ISSUE_ID,
402                 attributeValue.getIssueId().toString());
403             Field attributeId = Field.Keyword(ATTRIBUTE_ID,
404                 attributeValue.getAttributeId().toString());
405             Field text = Field.UnStored(TEXT, attributeValue.getValue());
406             doc.add(valueId);
407             doc.add(issueId);
408             doc.add(attributeId);
409             doc.add(text);
410             addDoc(doc);
411         }
412     }
413
414     private void addDoc(Document doc)
415         throws IOException JavaDoc
416     {
417         synchronized (getClass())
418         {
419             IndexWriter indexer = null;
420             try
421             {
422                 indexer = new IndexWriter(path,
423                                           new PorterStemAnalyzer(), false);
424                 indexer.addDocument(doc);
425                 
426                 if (++counter % 100 == 0)
427                 {
428                     indexer.optimize();
429                 }
430             }
431             finally
432             {
433                 if (indexer != null)
434                 {
435                     indexer.close();
436                 }
437             }
438         }
439     }
440
441     /**
442      * Store index information for an Attachment
443      */

444     public void index(Attachment attachment)
445         throws Exception JavaDoc
446     {
447         String JavaDoc attId = attachment.getAttachmentId().toString();
448
449         // make sure any old data stored for this attribute value is deleted.
450
Term term = new Term(ATTACHMENT_ID, attId);
451         int deletedDocs = 0;
452         try
453         {
454             synchronized (getClass())
455             {
456                 IndexReader reader = null;
457                 try
458                 {
459                     reader = IndexReader.open(path);
460                     deletedDocs = reader.delete(term);
461                 }
462                 finally
463                 {
464                     if (reader != null)
465                     {
466                         reader.close();
467                     }
468                 }
469             }
470         }
471         catch (NullPointerException JavaDoc npe)
472         {
473             /* Lucene is throwing npe in reader.delete, so have to explicitely
474                search. Not sure if the npe will be thrown in the
475                case where the attribute has previously been indexed, so
476                test whether the npe is harmful.
477             */

478             IndexSearcher is = new IndexSearcher(path);
479             Query q = QueryParser.parse("+" + ATTACHMENT_ID + ":" + attId,
480                                         TEXT, new PorterStemAnalyzer());
481             Hits hits = is.search(q);
482             if (hits.length() > 0)
483             {
484                 Localizable l10nInstance = new L10NMessage(L10NKeySet.ExceptionLucene, attId, npe);
485                 Log.get().debug(l10nInstance.getMessage());//[HD: create english message for logging!
486
throw new ScarabException(l10nInstance);
487             }
488         }
489         if (deletedDocs > 1)
490         {
491             throw new ScarabException(L10NKeySet.ExceptionMultipleAttachements,
492                                       attId);
493         }
494
495
496         if (attachment.getData() == null)
497         {
498             Log.get().warn("Attachment pk=" + attId + " has a null data.");
499         }
500         else
501         {
502             Document doc = new Document();
503             Field attachmentId = Field.Keyword(ATTACHMENT_ID, attId);
504             Field issueId = Field.UnIndexed(ISSUE_ID,
505                 attachment.getIssueId().toString());
506             Field typeId = Field.Keyword(ATTACHMENT_TYPE_ID,
507                 attachment.getTypeId().toString());
508             Field text = Field.UnStored(TEXT, attachment.getData());
509             doc.add(attachmentId);
510             doc.add(issueId);
511             doc.add(typeId);
512             doc.add(text);
513             addDoc(doc);
514         }
515     }
516
517     /**
518      * update the index for all entities that currently exist
519      */

520     public void updateIndex()
521         throws Exception JavaDoc
522     {
523         // find estimate of max id
524
Criteria crit = new Criteria();
525         crit.addSelectColumn("max(" + AttributeValuePeer.VALUE_ID + ")");
526         List JavaDoc records = AttributeValuePeer.doSelectVillageRecords(crit);
527         long max = ((Record)records.get(0)).getValue(1).asLong();
528         
529         long i = 0L;
530         List JavaDoc avs = null;
531         do
532         {
533             crit = new Criteria();
534             Criteria.Criterion low = crit.getNewCriterion(
535                  AttributeValuePeer.VALUE_ID,
536                  new Long JavaDoc(i), Criteria.GREATER_THAN);
537             i += 100L;
538             Criteria.Criterion high = crit.getNewCriterion(
539                 AttributeValuePeer.VALUE_ID,
540                 new Long JavaDoc(i), Criteria.LESS_EQUAL);
541             crit.add(low.and(high));
542             crit.add(AttributeValuePeer.DELETED, false);
543             // don't index issues that have been deleted
544
crit.addJoin(AttributeValuePeer.ISSUE_ID, IssuePeer.ISSUE_ID);
545             crit.add(IssuePeer.DELETED, false);
546             avs = AttributeValuePeer.doSelect(crit);
547             if (!avs.isEmpty())
548             {
549                 Iterator JavaDoc avi = avs.iterator();
550                 while (avi.hasNext())
551                 {
552                     AttributeValue av = (AttributeValue)avi.next();
553                     index(av);
554                 }
555                 if (Log.get().isDebugEnabled())
556                 {
557                     Log.get().debug("Updated index for attribute values (" +
558                         (i-100L) + "-" + i + "]");
559                     Log.debugMemory();
560                 }
561             }
562         }
563         while (i<max || !avs.isEmpty());
564
565         // Attachments
566

567         crit = new Criteria();
568         crit.addSelectColumn("max(" + AttachmentPeer.ATTACHMENT_ID + ")");
569         records = AttachmentPeer.doSelectVillageRecords(crit);
570         max = ((Record)records.get(0)).getValue(1).asLong();
571         i = 0L;
572         List JavaDoc atts = null;
573         do
574         {
575             crit = new Criteria();
576             Criteria.Criterion low = crit.getNewCriterion(
577                  AttachmentPeer.ATTACHMENT_ID,
578                  new Long JavaDoc(i), Criteria.GREATER_THAN);
579             i += 100L;
580             Criteria.Criterion high = crit.getNewCriterion(
581                 AttachmentPeer.ATTACHMENT_ID,
582                 new Long JavaDoc(i), Criteria.LESS_EQUAL);
583             crit.add(low.and(high));
584             crit.add(AttachmentPeer.DELETED, false);
585             // don't index issues that have been deleted
586
crit.addJoin(AttachmentPeer.ISSUE_ID, IssuePeer.ISSUE_ID);
587             crit.add(IssuePeer.DELETED, false);
588             atts = AttachmentPeer.doSelect(crit);
589             if (!atts.isEmpty())
590             {
591                 Iterator JavaDoc atti = atts.iterator();
592                 while (atti.hasNext())
593                 {
594                     Attachment att = (Attachment)atti.next();
595                     if (att.getData() != null && att.getData().length() > 0 &&
596                         att.getIssueId() != null && att.getTypeId() != null)
597                     {
598                         index(att);
599                     }
600                 }
601                 
602                 if (Log.get().isDebugEnabled())
603                 {
604                     Log.get().debug("Updated index for attachments (" +
605                         (i-100L) + "-" + i + "]");
606                     Log.debugMemory();
607                 }
608             }
609         }
610         while (i<max || !atts.isEmpty());
611
612         // finish off with an optimized index
613
synchronized (getClass())
614         {
615             IndexWriter indexer = null;
616             try
617             {
618                 indexer = new IndexWriter(path,
619                                           new PorterStemAnalyzer(), false);
620                 indexer.optimize();
621             }
622             finally
623             {
624                 if (indexer != null)
625                 {
626                     indexer.close();
627                 }
628             }
629         }
630     }
631     
632     // ---------------- Avalon Lifecycle Methods ---------------------
633
/**
634      * Avalon component lifecycle method
635      */

636     public void configure(Configuration conf)
637     {
638         path = conf.getAttribute(INDEX_PATH, null);
639         
640       
641         
642     }
643     
644     /**
645      * @see org.apache.avalon.framework.context.Contextualizable
646      * @avalon.entry key="urn:avalon:home" type="java.io.File"
647      */

648     public void contextualize(Context context) throws ContextException
649     {
650         this.applicationRoot = context.get( "urn:avalon:home" ).toString();
651     }
652     
653     /**
654      * Avalon component lifecycle method
655      * Initializes the service by loading default class loaders
656      * and customized object factories.
657      *
658      * @throws InitializationException if initialization fails.
659      */

660     public void initialize() throws Exception JavaDoc
661     {
662
663    
664         File JavaDoc indexDir = new File JavaDoc(path);
665         if (!indexDir.isAbsolute())
666         {
667             path = getRealPath(path);
668             indexDir = new File JavaDoc(path);
669         }
670
671         boolean createIndex = false;
672         if (indexDir.exists())
673         {
674             if (indexDir.listFiles().length == 0)
675             {
676                 createIndex = true;
677             }
678         }
679         else
680         {
681             indexDir.mkdirs();
682             createIndex = true;
683         }
684         
685         if (createIndex)
686         {
687             Log.get().info("Creating index at '" + path + '\'');
688             synchronized (getClass())
689             {
690                 IndexWriter indexer = null;
691                 try
692                 {
693                     indexer =
694                         new IndexWriter(path, new PorterStemAnalyzer(), true);
695                 }
696                 finally
697                 {
698                     if (indexer != null)
699                     {
700                         indexer.close();
701                     }
702                 }
703             }
704         }
705
706         clear();
707     }
708     
709     private String JavaDoc getRealPath(String JavaDoc path)
710     {
711         String JavaDoc absolutePath = null;
712         if (applicationRoot == null)
713         {
714             absolutePath = new File JavaDoc(path).getAbsolutePath();
715         }
716         else
717         {
718             absolutePath = new File JavaDoc(applicationRoot, path).getAbsolutePath();
719         }
720         return absolutePath;
721     }
722
723     /* (non-Javadoc)
724      * @see org.tigris.scarab.util.word.SearchIndex#clear()
725      */

726     public void clear()
727     {
728         attributeIds = new ArrayList JavaDoc(5);
729         queryText = new ArrayList JavaDoc(5);
730         attachmentIds = new ArrayList JavaDoc(2);
731         attachmentQueryText = new ArrayList JavaDoc(2);
732     }
733     
734 }
735
Popular Tags