KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > openharmonise > rm > search > HarmoniseIndexer


1 /*
2  * The contents of this file are subject to the
3  * Mozilla Public License Version 1.1 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at http://www.mozilla.org/MPL/
6  *
7  * Software distributed under the License is distributed on an "AS IS"
8  * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.
9  * See the License for the specific language governing rights and
10  * limitations under the License.
11  *
12  * The Initial Developer of the Original Code is Simulacra Media Ltd.
13  * Portions created by Simulacra Media Ltd are Copyright (C) Simulacra Media Ltd, 2004.
14  *
15  * All Rights Reserved.
16  *
17  * Contributor(s):
18  */

19
20 package org.openharmonise.rm.search;
21
22 import java.io.*;
23 import java.util.*;
24 import java.util.logging.*;
25
26 import javax.xml.transform.*;
27 import javax.xml.transform.dom.*;
28 import javax.xml.transform.stream.*;
29
30 import org.apache.lucene.analysis.*;
31 import org.apache.lucene.analysis.standard.*;
32 import org.apache.lucene.document.*;
33 import org.apache.lucene.document.Document;
34 import org.apache.lucene.index.*;
35 import org.apache.lucene.queryParser.*;
36 import org.apache.lucene.search.*;
37 import org.apache.lucene.store.*;
38 import org.openharmonise.commons.xml.*;
39 import org.openharmonise.rm.*;
40 import org.openharmonise.rm.config.*;
41 import org.openharmonise.rm.resources.*;
42 import org.openharmonise.rm.resources.content.*;
43 import org.pdfbox.pdfparser.*;
44 import org.pdfbox.pdmodel.*;
45 import org.pdfbox.util.*;
46 import EDU.oswego.cs.dl.util.concurrent.*;
47
48
49 /**
50  * Interface to the Lucene text searching and indexing API for use with objects within Harmonise.
51  *
52  * @author Michael Bell
53  * @author jejking
54  * @version $Revision: 1.4 $
55  *
56  */

57 public class HarmoniseIndexer {
58     
59     private static HarmoniseIndexer m_instance = null;
60     private static String JavaDoc m_indexHome = "";
61     private static Templates m_striptags_xsl = null;
62     private static ArrayList keywordFieldList = new ArrayList();
63     
64     private static final String JavaDoc FIELD_UNIQUEID = "uniqueid";
65     private static final String JavaDoc FIELD_ID = "id";
66     private static final String JavaDoc FIELD_NAME = "name";
67     private static final String JavaDoc FIELD_DISPLAY_NAME = "display_name";
68     private static final String JavaDoc FIELD_SUMMARY = "summary";
69     private static final String JavaDoc FIELD_GROUP = "group";
70     private static final String JavaDoc FIELD_CONTENTS = "contents";
71     private static final String JavaDoc FIELD_CLASS = "class";
72     private static final String JavaDoc INDEX_LOC_PROP = "INDEX_LOCATION";
73     public static final String JavaDoc TAG_INDEXER = "indexer";
74     public static final String JavaDoc TAG_INDEXABLE = "indexable";
75     public static final String JavaDoc TAG_TEMPLATE = "template";
76     public static final String JavaDoc TAG_COMPARISON = "comparison";
77     public static final String JavaDoc TAG_INDEX = "index";
78     public static final String JavaDoc ATTRIB_CLASSNAME = "classname";
79     private static final String JavaDoc PNAME_STRIPTAGS_XSL = "STRIPTAGS";
80     
81     private Executor executor;
82     
83     /**
84      * Logger.
85      */

86     private static Logger m_logger = Logger.getLogger(HarmoniseIndexer.class.getName());
87     
88     static {
89         //initialise array list of key word fields
90
keywordFieldList.add(FIELD_UNIQUEID);
91         keywordFieldList.add(FIELD_ID);
92         keywordFieldList.add(FIELD_GROUP);
93     }
94     
/**
 * Default constructor. Reads the index location from the configuration property
 * <code>INDEX_LOCATION</code> and creates the executor that index updates are handed to.
 *
 * @throws HarmoniseIndexerException if the index location is missing or empty, or if
 *         anything else fails during set-up
 */
private HarmoniseIndexer() throws HarmoniseIndexerException {
    try {
        m_indexHome = ConfigSettings.getProperty(INDEX_LOC_PROP);
        if ((m_indexHome == null) || (m_indexHome.length() == 0)) {
            throw new HarmoniseIndexerException("Index location is not defined!!");
        }
        executor = new QueuedExecutor();
    }
    catch (HarmoniseIndexerException e) {
        // Raised by the check above: log it and rethrow as-is rather than letting the
        // generic handler below wrap it inside a second HarmoniseIndexerException.
        m_logger.log(Level.SEVERE, "Could not instantiate HarmoniseIndexer", e);
        throw e;
    }
    catch (Exception e) {
        m_logger.log(Level.SEVERE, "Could not instantiate HarmoniseIndexer", e);
        throw new HarmoniseIndexerException(e.getMessage(), e);
    }
}
113     
114     /**
115      * Returns singleton instance of <code>HarmoniseIndexer</code>.
116      *
117      * @return instance of <code>HarmoniseIndexer</code>.
118      * @throws HarmoniseIndexerException
119      */

120     public static HarmoniseIndexer getInstance() throws HarmoniseIndexerException {
121         if (m_instance == null) {
122             m_instance = new HarmoniseIndexer();
123         }
124         return m_instance;
125     }
126     
127     public static HarmoniseIndexer getIndexer(String JavaDoc indexHome) throws HarmoniseIndexerException {
128         if (m_instance == null) {
129             m_instance = new HarmoniseIndexer();
130         }
131         HarmoniseIndexer.m_indexHome = indexHome;
132         return m_instance;
133     }
134
135     /**
136      * Returns <code>true</code> if the given object is indexed.
137      *
138      * @param xobj
139      * @return
140      * @throws HarmoniseIndexerException
141      */

142     public static boolean isIndexed(AbstractObject xobj) throws HarmoniseIndexerException {
143         boolean bExists = false;
144
145         try {
146             Directory directory = FSDirectory.getDirectory(HarmoniseIndexer.m_indexHome, false);
147             IndexReader reader = IndexReader.open(directory);
148             Term term = new Term(HarmoniseIndexer.FIELD_UNIQUEID, xobj.getClass().getName() + String.valueOf(xobj.getId()));
149
150             if (reader.docFreq(term) > 0) {
151                 bExists = true;
152             }
153             reader.close();
154         }
155         catch (FileNotFoundException e) {
156             bExists = false;
157         }
158         catch (Exception JavaDoc e) {
159             m_logger.log(Level.WARNING, e.getLocalizedMessage(), e);
160             throw new HarmoniseIndexerException(e.getMessage(), e);
161         }
162
163         return bExists;
164     }
165
166     /**
167      * Indexes the given object.
168      *
169      * @param pObj
170      * @throws HarmoniseIndexerException
171      */

172     public void indexObject(AbstractObject pObj) throws HarmoniseIndexerException {
173         if (pObj == null || (pObj instanceof AbstractObject) == false) {
174             throw new HarmoniseIndexerException("Object must be AbstractObject - " + pObj.getClass().getName());
175         }
176         IndexRunnable indexer = new IndexRunnable(pObj);
177         try {
178             executor.execute(indexer); // hands off to single background thread
179
}
180         catch (InterruptedException JavaDoc e) {
181             throw new HarmoniseIndexerException("Problem running indexer asynchronously", e);
182         }
183     }
184
185
186     /**
187      * Searches the index for objects of the type given by the <code>Class</code> and
188      * fulfilling the conditions given by the other arguments and returns a <code>List</code>
189      * of object IDs.
190      *
191      * @param xobj
192      * @param groupIds
193      * @param sName
194      * @param sSummary
195      * @param sContent
196      * @return
197      * @throws HarmoniseIndexerException
198      */

199     public List searchContents(Class JavaDoc xobjClass, Vector groupIds, String JavaDoc sName, String JavaDoc sSummary, String JavaDoc sContent) throws HarmoniseIndexerException {
200         return searchContents(getQuery(xobjClass, groupIds, sName, null, sSummary, sContent));
201     }
202
203     /**
204      * Returns the Lucene query string built from the conditions given for 'name', 'summary', etc.
205      *
206      * @param xobjClass
207      * @param groupIds
208      * @param sName
209      * @param sDisplayName
210      * @param sSummary
211      * @param sContent
212      *
213      * @return
214      */

215     public String JavaDoc getQuery(Class JavaDoc xobjClass, Vector groupIds, String JavaDoc sName, String JavaDoc sDisplayName, String JavaDoc sSummary, String JavaDoc sContent) {
216         StringBuffer JavaDoc sQuery = new StringBuffer JavaDoc();
217
218         sQuery.append(HarmoniseIndexer.FIELD_CLASS).append(":").append(xobjClass.getName());
219
220         if ((groupIds != null) && (groupIds.size() > 0)) {
221             sQuery.append(" AND (");
222
223             for (int i = 0; i < groupIds.size(); i++) {
224                 if (i > 0) {
225                     sQuery.append(" OR ");
226                 }
227
228                 sQuery.append(HarmoniseIndexer.FIELD_GROUP).append(":").append(groupIds.elementAt(i));
229             }
230
231             sQuery.append(") ");
232         }
233
234         sQuery.append(" AND (");
235
236         boolean bOR = false;
237
238         // Process Name, if it has been submitted
239
if ((sName != null) && (sName.length() > 0)) {
240             buildFieldQueryString(sQuery, FIELD_NAME, sName);
241             bOR = true;
242         }
243         
244         if(sDisplayName != null) {
245             if (bOR) {
246                 sQuery.append(" OR ");
247             }
248             buildFieldQueryString(sQuery, FIELD_DISPLAY_NAME, sDisplayName);
249             
250             bOR = true;
251         }
252
253         // Process Summary, it it has been submitted
254
if ((sSummary != null) && (sSummary.length() > 0)) {
255             if (bOR) {
256                 sQuery.append(" OR ");
257             }
258
259             buildFieldQueryString(sQuery, FIELD_SUMMARY, sSummary);
260             bOR = true;
261         }
262
263         // Process Content, if it is been submitted
264
if ((sContent != null) && (sContent.length() > 0)) {
265             if (bOR) {
266                 sQuery.append(" OR ");
267             }
268
269             buildFieldQueryString(sQuery, FIELD_CONTENTS, sContent);
270         }
271
272         sQuery.append(")");
273         
274         return sQuery.toString();
275     }
276
277     /**
278      * Runs the given query against the index and returns a <code>List</code> of object
279      * IDs.
280      *
281      * @param queryString
282      * @return List of hits
283      * @throws HarmoniseIndexerException
284      */

285     public List searchContents(String JavaDoc queryString) throws HarmoniseIndexerException {
286         Vector vec = new Vector();
287
288         if(m_logger.getLevel() == Level.FINE) {
289             m_logger.log(Level.FINE, "Lucene query - " + queryString);
290         }
291         
292         try {
293             Searcher searcher = new IndexSearcher(m_indexHome);
294             StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
295             
296             //need a PerFieldAnalyzerWrapper so that our PorterStem
297
//analyzer isn't applied to the keywords we've set
298
PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new HarmoniseAnalyzer());
299             
300             Iterator iter = keywordFieldList.iterator();
301             
302             while(iter.hasNext()) {
303                 String JavaDoc field = (String JavaDoc) iter.next();
304                 analyzer.addAnalyzer(field,standardAnalyzer);
305             }
306
307             Query query = QueryParser.parse(queryString, FIELD_CONTENTS,analyzer);
308
309             Hits hits = searcher.search(query);
310             if(m_logger.getLevel() == Level.FINE) {
311                 m_logger.log(Level.FINE, "Lucene query found " + hits.length() + " hits in the index");
312             }
313
314             for (int i = 0; i < hits.length(); i++) {
315                 vec.addElement(hits.doc(i).get(FIELD_ID));
316             }
317
318             searcher.close();
319         }
320         catch (Exception JavaDoc e) {
321             m_logger.log(Level.WARNING, e.getLocalizedMessage(), e);
322         }
323         // if we encounter any errors, rather than propagating the exception up
324
// we'll just return the vector, but it'll be empty
325
return vec;
326     }
327
328     /**
329      * Utility method to query the Lucene index independently of the Harmonise API.
330      *
331      * @param queryString a correctly formatted Lucene query to be parsed.
332      * @return List of Strings containing some summary info about the Hits returned
333      * @throws HarmoniseIndexerException
334      */

335     public List search(String JavaDoc queryString) throws HarmoniseIndexerException {
336         List hitsList = new ArrayList();
337         Hits hits = null;
338         try {
339             Searcher searcher = new IndexSearcher(m_indexHome);
340             Query query = QueryParser.parse(queryString, FIELD_CONTENTS, new HarmoniseAnalyzer());
341             hits = searcher.search(query);
342             // iterate through hits and build a list to return
343

344             for (int i = 0; i < hits.length(); i++) {
345                 StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
346                 org.apache.lucene.document.Document doc = hits.doc(i);
347                 sb.append("Unique id: " + doc.get(FIELD_UNIQUEID));
348                 sb.append("\n");
349                 sb.append("Summary: " + doc.get(FIELD_SUMMARY));
350                 sb.append("\n");
351                 sb.append("Name: " + doc.get(FIELD_NAME));
352                 sb.append("\n");
353                 sb.append("Class: " + doc.get(FIELD_CLASS));
354                 hitsList.add(sb.toString());
355             }
356             searcher.close();
357         }
358         catch (Exception JavaDoc e) {
359             throw new HarmoniseIndexerException(e.getMessage(), e);
360         }
361         return hitsList;
362     }
363
364     /**
365      * Deletes the given object from the index.
366      *
367      * @param xobj
368      * @throws HarmoniseIndexerException
369      */

370     public void deleteFromIndex(AbstractObject xobj) throws HarmoniseIndexerException {
371         DeleterRunnable deleter = new DeleterRunnable(xobj);
372         try {
373             executor.execute(deleter);
374         }
375         catch (InterruptedException JavaDoc e) {
376             throw new HarmoniseIndexerException("Problem running delete asynchronously", e);
377         }
378     }
379
380
381     /**
382      * Utility method to process boolean operators in the raw query string correctly into the
383      * so that the terms are associated with the correct Lucene field name.
384      *
385      * @param queryBuf StringBuffer being used to assemble the final Lucene query
386      * @param fieldName name of the field, so we can prepend it so that Lucene field is searched
387      * @param inputString the raw input, to be processed to make it field specific
388      */

389     private void buildFieldQueryString(StringBuffer JavaDoc queryBuf, String JavaDoc fieldName, String JavaDoc inputString) {
390         //just tokenise on white space as per default
391
StringTokenizer tokeniser = new StringTokenizer(inputString);
392         
393         // ** Use the following code to build simple phrases
394
// ** when this is used, make sure that two successive quotes "" are replaced with
395
// ** one single quote or the parser falls over
396
boolean buildingPhrase = false;
397         while (tokeniser.hasMoreTokens()) {
398             String JavaDoc token = tokeniser.nextToken();
399             token = token.replaceAll("\"\"", "\"");
400             if (token.equals("AND") || token.equals("OR") || token.equals("NOT")) {
401                 queryBuf.append(token + " "); // it's an operator, just append it raw
402
}
403             else {
404                 // are we building a phrase?
405
if (buildingPhrase == true) {
406                     queryBuf.append(token + " "); // no need to prepend field:
407
// do we need to stop building the phrase/
408
if (token.endsWith("\"")) {
409                         buildingPhrase = false;
410                     }
411                     continue;
412                 }
413                 queryBuf.append(fieldName + ":" + token + " ");
414                 // are we going to start building a phrase ?
415
if (token.startsWith("\"")) {
416                     buildingPhrase = true;
417                 }
418             }
419         }
420     }
421     
422     /**
423      * Utility to add objects to the Lucene index. Extracts the indexable fields, including contents for
424      * PDF and XML docments and writes them to the index.
425      *
426      * @author John King
427      */

428     private class IndexRunnable implements Runnable JavaDoc {
429         
430         private AbstractObject obj;
431         private String JavaDoc contents;
432                 
433         public IndexRunnable(AbstractObject obj) {
434             this.obj = obj;
435         }
436         
437         /* (non-Javadoc)
438          * @see java.lang.Runnable#run()
439          */

440         public void run() {
441             contents = getContents();
442             Document doc = new Document(); // new Lucene document to hold details we're indexing
443

444             String JavaDoc classname = obj.getClass().getName();
445             
446             doc.add(Field.Keyword(FIELD_UNIQUEID, classname + String.valueOf(obj.getId())));
447             doc.add(Field.UnIndexed(FIELD_ID, String.valueOf(obj.getId())));
448             
449             try {
450                 AbstractParentObject grp = ((AbstractChildObject) obj).getRealParent();
451                 if (grp != null) {
452                     doc.add(Field.Keyword(FIELD_GROUP, String.valueOf(grp.getId())));
453                 }
454                 
455                 doc.add(Field.Text(FIELD_CLASS, classname));
456                 doc.add(Field.Text(FIELD_NAME, obj.getName()));
457                 if (obj.getSummary() != null) {
458                     doc.add(Field.Text(FIELD_SUMMARY, obj.getSummary()));
459                 }
460                 
461                 if(obj instanceof AbstractEditableObject) {
462                     AbstractEditableObject edObj = (AbstractEditableObject) obj;
463                     String JavaDoc sDispName = edObj.getDisplayName();
464                     
465                     if (sDispName != null) {
466                         doc.add(Field.Text(FIELD_DISPLAY_NAME, sDispName));
467                     }
468                 }
469
470                 if (contents != null) {
471                     doc.add(Field.Text(FIELD_CONTENTS, new StringReader(contents)));
472                 }
473                 
474                 
475                 if (HarmoniseIndexer.isIndexed(obj) == true) {
476                     Directory directory = FSDirectory.getDirectory(HarmoniseIndexer.m_indexHome, false);
477                     
478                     if (IndexReader.indexExists(directory)) {
479                         IndexReader reader = IndexReader.open(directory);
480                         Term term = new Term(HarmoniseIndexer.FIELD_UNIQUEID, obj.getClass().getName() + String.valueOf(obj.getId()));
481                         reader.delete(term);
482                         reader.close();
483                     }
484                 }
485                 IndexWriter writer = null;
486                 try {
487                     writer = new IndexWriter(HarmoniseIndexer.m_indexHome, new HarmoniseAnalyzer(), false);
488                 } catch (FileNotFoundException e) {
489                     writer = new IndexWriter(HarmoniseIndexer.m_indexHome, new HarmoniseAnalyzer(), true);
490                 }
491                 
492                 writer.addDocument(doc);
493                 writer.optimize();
494                 writer.close();
495                 HarmoniseIndexer.m_logger.log(Level.INFO, "indexed " + obj.getType() + ", ID: " + obj.getId());
496             }
497             catch (DataAccessException e) {
498                 HarmoniseIndexer.m_logger.log(Level.WARNING, "Data Access Exception", e);
499             }
500             catch (IOException e) {
501                 HarmoniseIndexer.m_logger.log(Level.WARNING, "IOException", e);
502             }
503             catch (HarmoniseIndexerException e) {
504                 HarmoniseIndexer.m_logger.log(Level.WARNING, "Harmonise Indexer Exception", e);
505             }
506         }
507         
508         private String JavaDoc getContents() {
509             
510             String JavaDoc objContents = null;
511             
512             try {
513                 if (obj instanceof org.openharmonise.rm.resources.content.Document) {
514                     org.openharmonise.rm.resources.content.Document doc = (org.openharmonise.rm.resources.content.Document) obj;
515                     org.w3c.dom.Document JavaDoc xmlcontent = XMLDocument.getXMLDocumentFromString(doc.getContent());
516                     objContents = getStringFromXML(xmlcontent);
517                 }
518                 else if (obj instanceof Asset) {
519                     Asset asset = (Asset) obj;
520                     if (asset.getContentType().equalsIgnoreCase("application/pdf")) {
521                         objContents = getStringFromPDF(asset.getContentFile());
522                     }
523                 }
524             }
525             catch (Exception JavaDoc e) {
526                 HarmoniseIndexer.m_logger.log(Level.WARNING, "Exception", e);
527             }
528             return objContents;
529         }
530         
531         /**
532          * Returns the text content of an XML document.
533          *
534          * @param xml
535          * @return String representing content of document once tags have been stripped off.
536          * @throws HarmoniseIndexerException
537          */

538         private String JavaDoc getStringFromXML(org.w3c.dom.Document JavaDoc xml) throws HarmoniseIndexerException {
539             
540             String JavaDoc sResult = "";
541             try {
542                 if(HarmoniseIndexer.m_striptags_xsl == null) {
543                     //get strip tags xsl if not already created
544
String JavaDoc stripFileName = ConfigSettings.getProperty(PNAME_STRIPTAGS_XSL);
545                     
546                     if(stripFileName != null && stripFileName.length()>0) {
547                         StreamSource ssource = new StreamSource(new File(stripFileName));
548                         HarmoniseIndexer.m_striptags_xsl = (Templates) org.apache.xalan.xsltc.trax.TransformerFactoryImpl.newInstance().newTemplates(ssource);
549                     }
550                 }
551                 //if m_striptags_xsl is null here don't do anything
552
if(m_striptags_xsl != null) {
553                     Transformer trans = HarmoniseIndexer.m_striptags_xsl.newTransformer();
554                     DOMSource ds = new DOMSource(xml.getDocumentElement());
555                     StringWriter sw = new StringWriter();
556                     StreamResult res = new StreamResult(sw);
557                     trans.transform(ds, res);
558                     sResult = sw.toString();
559                     sw.close();
560                 }
561             } catch (ConfigException e) {
562                 throw new HarmoniseIndexerException("Config error", e);
563             } catch (TransformerConfigurationException e) {
564                 throw new HarmoniseIndexerException("Transformer Configuration Exception", e);
565             } catch (TransformerFactoryConfigurationError e) {
566                 throw new HarmoniseIndexerException("Transformer Factory Configuration error", e);
567             } catch (TransformerException e) {
568                 throw new HarmoniseIndexerException("Transformer error", e);
569             } catch (IOException e) {
570                 throw new HarmoniseIndexerException("IO error", e);
571             }
572             
573             return sResult;
574         }
575         
576         /**
577          * Returns the text content of a PDF file as a String.
578          *
579          * @param pdfFile
580          * @return
581          * @throws HarmoniseIndexerException
582          */

583         private String JavaDoc getStringFromPDF(File pdfFile) throws HarmoniseIndexerException {
584             String JavaDoc sText = "";
585             
586             try {
587                 FileInputStream pdfStream = new FileInputStream(pdfFile);
588                 PDFParser pdfParser = new PDFParser(pdfStream);
589                 pdfParser.parse();
590                 PDDocument pdf = pdfParser.getPDDocument();
591                 PDFTextStripper textstripper = new PDFTextStripper();
592                 sText = textstripper.getText(pdf);
593                 HarmoniseIndexer.m_logger.log(Level.FINEST, sText);
594                 pdf.close();
595             } catch (FileNotFoundException e) {
596                 throw new HarmoniseIndexerException("File not found", e);
597             } catch (IOException e) {
598                 throw new HarmoniseIndexerException("IO exception", e);
599             }
600
601             return sText;
602         }
603     }
604     
605     /**
606      * Deletion utility to remove objects from the Lucene index.
607      *
608      * @author jejking
609      */

610     private class DeleterRunnable implements Runnable JavaDoc {
611
612         private AbstractObject obj;
613         
614         /**
615          * @param obj the Harmonise object to delete
616          */

617         public DeleterRunnable(AbstractObject obj) {
618             if (obj == null) {
619                 throw new NullPointerException JavaDoc("obj cannot be null");
620             }
621             this.obj = obj;
622         }
623
624         /* (non-Javadoc)
625          * @see java.lang.Runnable#run()
626          */

627         public void run() {
628             try {
629                 Directory directory = FSDirectory.getDirectory(HarmoniseIndexer.m_indexHome, false);
630     
631                 if (IndexReader.indexExists(directory)) {
632                     IndexReader reader = IndexReader.open(directory);
633                     Term term = new Term(HarmoniseIndexer.FIELD_UNIQUEID, obj.getClass().getName() + String.valueOf(obj.getId()));
634                     reader.delete(term);
635                     reader.close();
636                     HarmoniseIndexer.m_logger.log(Level.FINE, "deleted " + obj.getType() + ", ID: " + obj.getId() + " from index");
637                 }
638             }
639             catch (Exception JavaDoc e) {
640                 HarmoniseIndexer.m_logger.log(Level.WARNING, "problem deleting object", e);
641             }
642         }
643     }
644
645 }
Popular Tags