DSIndexer


1   /*
2    * DSIndexer.java
3    *
4    * Version: $Revision: 1.41 $
5    *
6    * Date: $Date: 2006/11/03 05:01:31 $
7    *
8    * Copyright (c) 2002-2005, Hewlett-Packard Company and Massachusetts
9    * Institute of Technology.  All rights reserved.
10   *
11   * Redistribution and use in source and binary forms, with or without
12   * modification, are permitted provided that the following conditions are
13   * met:
14   *
15   * - Redistributions of source code must retain the above copyright
16   * notice, this list of conditions and the following disclaimer.
17   *
18   * - Redistributions in binary form must reproduce the above copyright
19   * notice, this list of conditions and the following disclaimer in the
20   * documentation and/or other materials provided with the distribution.
21   *
22   * - Neither the name of the Hewlett-Packard Company nor the name of the
23   * Massachusetts Institute of Technology nor the names of their
24   * contributors may be used to endorse or promote products derived from
25   * this software without specific prior written permission.
26   *
27   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28   * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30   * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31   * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32   * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33   * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34   * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36   * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37   * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
38   * DAMAGE.
39   */
40  package org.dspace.search;
41  
42  import java.io.IOException  ;
43  import java.io.InputStreamReader  ;
44  import java.sql.SQLException  ;
45  import java.util.ArrayList  ;
46  import java.util.HashMap  ;
47  import java.util.Iterator  ;
48  
49  import org.apache.log4j.Logger;
50  import org.apache.lucene.analysis.Analyzer;
51  import org.apache.lucene.document.Document;
52  import org.apache.lucene.document.Field;
53  import org.apache.lucene.index.IndexReader;
54  import org.apache.lucene.index.IndexWriter;
55  import org.apache.lucene.index.Term;
56  import org.dspace.authorize.AuthorizeException;
57  import org.dspace.content.Bitstream;
58  import org.dspace.content.Bundle;
59  import org.dspace.content.Collection;
60  import org.dspace.content.Community;
61  import org.dspace.content.DCValue;
62  import org.dspace.content.DSpaceObject;
63  import org.dspace.content.Item;
64  import org.dspace.content.ItemIterator;
65  import org.dspace.content.MetadataSchema;
66  import org.dspace.core.ConfigurationManager;
67  import org.dspace.core.Constants;
68  import org.dspace.core.Context;
69  import org.dspace.core.LogManager;
70  import org.dspace.handle.HandleManager;
71  
72  /**
73   * DSIndexer contains the methods that index Items and their metadata,
74   * collections, communities, etc. It is meant to either be invoked from the
75   * command line (see dspace/bin/index-all) or via the indexContent() methods
76   * within DSpace.
77   */
78  public class DSIndexer
79  {
80      private static final Logger log = Logger.getLogger(DSIndexer.class);
81  
82      // TODO: Support for analyzers per language, or multiple indices
83      /** The analyzer for this DSpace instance */
84      private static Analyzer analyzer = null;
85      
86      /**
87       * IndexItem() adds a single item to the index
88       */
89      public static void indexContent(Context c, DSpaceObject dso)
90              throws SQLException  , IOException  
91      {
92          IndexWriter writer = openIndex(c, false);
93  
94          try
95          {
96              switch (dso.getType())
97              {
98              case Constants.ITEM:
99                  writeItemIndex(c, writer, (Item) dso);
100 
101                 break;
102 
103             case Constants.COLLECTION:
104                 writeCollectionIndex(c, writer, (Collection) dso);
105 
106                 break;
107 
108             case Constants.COMMUNITY:
109                 writeCommunityIndex(c, writer, (Community) dso);
110 
111                 break;
112 
113             // FIXME: should probably default unknown type exception
114             }
115         }
116         finally
117         {
118             closeIndex(c, writer);
119         }
120     }
121 
122     /**
123      * unIndex removes an Item, Collection, or Community only works if the
124      * DSpaceObject has a handle (uses the handle for its unique ID)
125      * 
126      * @param dso
127      *            DSpace Object, can be Community, Item, or Collection
128      */
129     public static void unIndexContent(Context c, DSpaceObject dso)
130             throws SQLException  , IOException  
131     {
132         String   h = HandleManager.findHandle(c, dso);
133 
134         unIndexContent(c, h);
135     }
136 
137     public static void unIndexContent(Context c, String   myhandle)
138             throws SQLException  , IOException  
139     {
140         String   index_directory = ConfigurationManager.getProperty("search.dir");
141         IndexReader ir = IndexReader.open(index_directory);
142 
143         try
144         {
145             if (myhandle != null)
146             {
147                 // we have a handle (our unique ID, so remove)
148                 Term t = new Term("handle", myhandle);
149                 ir.deleteDocuments(t);
150             }
151             else
152             {
153                 log.warn("unindex of content with null handle attempted");
154 
155                 // FIXME: no handle, fail quietly - should log failure
156                 //System.out.println("Error in unIndexContent: Object had no
157                 // handle!");
158             }
159         }
160         finally
161         {
162             ir.close();
163         }
164     }
165 
166     /**
167      * reIndexContent removes something from the index, then re-indexes it
168      * 
169      * @param c context object
170      * @param dso  object to re-index
171      */
172     public static void reIndexContent(Context c, DSpaceObject dso)
173             throws SQLException  , IOException  
174     {
175         unIndexContent(c, dso);
176         indexContent(c, dso);
177     }
178 
179     /**
180      * create full index - wiping old index
181      * 
182      * @param c   context to use
183      */
184     public static void createIndex(Context c) throws SQLException  , IOException  
185     {
186         IndexWriter writer = openIndex(c, true);
187 
188         try
189         {
190             indexAllCommunities(c, writer);
191             indexAllCollections(c, writer);
192             indexAllItems(c, writer);
193 
194             // optimize the index - important to do regularly to reduce
195             // filehandle
196             // usage
197             // and keep performance fast!
198             writer.optimize();
199         }
200         finally
201         {
202             closeIndex(c, writer);
203         }
204     }
205 
206     /**
207      * When invoked as a command-line tool, (re)-builds the whole index
208      * 
209      * @param args
210      *            the command-line arguments, none used
211      */
212     public static void main(String  [] args) throws Exception  
213     {
214         Context c = new Context();
215 
216         // for testing, pass in a handle of something to remove...
217         if ((args.length == 2) && (args[0].equals("remove")))
218         {
219             unIndexContent(c, args[1]);
220         }
221         else
222         {
223             c.setIgnoreAuthorization(true);
224 
225             createIndex(c);
226 
227             System.out.println("Done with indexing");
228         }
229     }
230 
231     /**
232      * Get the Lucene analyzer to use according to current configuration (or
233      * default). TODO: Should have multiple analyzers (and maybe indices?) for
234      * multi-lingual DSpaces.
235      * 
236      * @return <code>Analyzer</code> to use
237      * @throws IllegalStateException
238      *             if the configured analyzer can't be instantiated
239      */
240     static Analyzer getAnalyzer() throws IllegalStateException  
241     {
242         if (analyzer == null)
243         {
244             // We need to find the analyzer class from the configuration
245             String   analyzerClassName = ConfigurationManager
246                     .getProperty("search.analyzer");
247 
248             if (analyzerClassName == null)
249             {
250                 // Use default
251                 analyzerClassName = "org.dspace.search.DSAnalyzer";
252             }
253 
254             try
255             {
256                 Class   analyzerClass = Class.forName(analyzerClassName);
257                 analyzer = (Analyzer) analyzerClass.newInstance();
258             }
259             catch (Exception   e)
260             {
261                 log.fatal(LogManager.getHeader(null, "no_search_analyzer",
262                         "search.analyzer=" + analyzerClassName), e);
263 
264                 throw new IllegalStateException  (e.toString());
265             }
266         }
267 
268         return analyzer;
269     }
270     
271     
272     ////////////////////////////////////
273     //      Private
274     ////////////////////////////////////
275 
276     /**
277      * prepare index, opening writer, and wiping out existing index if necessary
278      */
279     private static IndexWriter openIndex(Context c, boolean wipe_existing)
280             throws IOException  
281     {
282         IndexWriter writer;
283 
284         String   index_directory = ConfigurationManager.getProperty("search.dir");
285 
286         writer = new IndexWriter(index_directory, getAnalyzer(),
287                 wipe_existing);
288 
289         /* Set maximum number of terms to index if present in dspace.cfg */
290         if (ConfigurationManager.getProperty("search.maxfieldlength") != null)
291         {
292             int maxfieldlength = ConfigurationManager
293                     .getIntProperty("search.maxfieldlength");
294             if (maxfieldlength == -1)
295             {
296                 writer.setMaxFieldLength(Integer.MAX_VALUE);
297             }
298             else
299             {
300                 writer.setMaxFieldLength(maxfieldlength);
301             }
302         }
303 
304         return writer;
305     }
306 
307     /**
308      * close up the indexing engine
309      */
310     private static void closeIndex(Context c, IndexWriter writer)
311             throws IOException  
312     {
313         if (writer != null)
314         {
315             writer.close();
316         }
317     }
318 
319     private static String   buildItemLocationString(Context c, Item myitem)
320             throws SQLException  
321     {
322         // build list of community ids
323         Community[] communities = myitem.getCommunities();
324 
325         // build list of collection ids
326         Collection[] collections = myitem.getCollections();
327 
328         // now put those into strings
329         String   location = "";
330         int i = 0;
331 
332         for (i = 0; i < communities.length; i++)
333             location = new String  (location + " m" + communities[i].getID());
334 
335         for (i = 0; i < collections.length; i++)
336             location = new String  (location + " l" + collections[i].getID());
337 
338         return location;
339     }
340 
341     private static String   buildCollectionLocationString(Context c,
342             Collection target) throws SQLException  
343     {
344         // build list of community ids
345         Community[] communities = target.getCommunities();
346 
347         // now put those into strings
348         String   location = "";
349         int i = 0;
350 
351         for (i = 0; i < communities.length; i++)
352             location = new String  (location + " m" + communities[i].getID());
353 
354         return location;
355     }
356 
357     /**
358      * iterate through the communities, and index each one
359      */
360     private static void indexAllCommunities(Context c, IndexWriter writer)
361             throws SQLException  , IOException  
362     {
363         Community[] targets = Community.findAll(c);
364 
365         int i;
366 
367         for (i = 0; i < targets.length; i++)
368             writeCommunityIndex(c, writer, targets[i]);
369     }
370 
371     /**
372      * iterate through collections, indexing each one
373      */
374     private static void indexAllCollections(Context c, IndexWriter writer)
375             throws SQLException  , IOException  
376     {
377         Collection[] targets = Collection.findAll(c);
378 
379         int i;
380 
381         for (i = 0; i < targets.length; i++)
382             writeCollectionIndex(c, writer, targets[i]);
383     }
384 
385     /**
386      * iterate through all items, indexing each one
387      */
388     private static void indexAllItems(Context c, IndexWriter writer)
389             throws SQLException  , IOException  
390     {
391         ItemIterator i = Item.findAll(c);
392 
393         while (i.hasNext())
394         {
395             Item target = (Item) i.next();
396 
397             writeItemIndex(c, writer, target);
398         }
399     }
400 
401     /**
402      * write index record for a community
403      */
404     private static void writeCommunityIndex(Context c, IndexWriter writer,
405             Community target) throws SQLException  , IOException  
406     {
407         // build a hash for the metadata
408         HashMap   textvalues = new HashMap  ();
409 
410         // get the handle
411         String   myhandle = HandleManager.findHandle(c, target);
412 
413         // and populate it
414         String   name = target.getMetadata("name");
415 
416         //        String description = target.getMetadata("short_description");
417         //        String intro_text = target.getMetadata("introductory_text");
418         textvalues.put("name", name);
419 
420         //        textvalues.put("description", description);
421         //        textvalues.put("intro_text", intro_text );
422         textvalues.put("handletext", myhandle);
423 
424         writeIndexRecord(writer, Constants.COMMUNITY, myhandle, textvalues, "");
425     }
426 
427     /**
428      * write an index record for a collection
429      */
430     private static void writeCollectionIndex(Context c, IndexWriter writer,
431             Collection target) throws SQLException  , IOException  
432     {
433         String   location_text = buildCollectionLocationString(c, target);
434 
435         // get the handle
436         String   myhandle = HandleManager.findHandle(c, target);
437 
438         // build a hash for the metadata
439         HashMap   textvalues = new HashMap  ();
440 
441         // and populate it
442         String   name = target.getMetadata("name");
443 
444         //        String description = target.getMetadata("short_description");
445         //        String intro_text = target.getMetadata("introductory_text");
446         textvalues.put("name", name);
447 
448         //        textvalues.put("description",description );
449         //        textvalues.put("intro_text", intro_text );
450         textvalues.put("location", location_text);
451         textvalues.put("handletext", myhandle);
452 
453         writeIndexRecord(writer, Constants.COLLECTION, myhandle, textvalues, "");
454     }
455 
456     /**
457      * writes an index record - the index record is a set of name/value hashes,
458      * which are sent to Lucene.
459      */
460     private static void writeItemIndex(Context c, IndexWriter writer,
461             Item myitem) throws SQLException  , IOException  
462     {
463         // FIXME: config reading should happen just once & be cached?  
464         
465         // get the location string (for searching by collection & community)
466         String   location_text = buildItemLocationString(c, myitem);
467 
468         // read in indexes from the config
469         ArrayList   indexes = new ArrayList  ();
470 
471         // read in search.index.1, search.index.2....
472         for (int i = 1; ConfigurationManager.getProperty("search.index." + i) != null; i++)
473         {
474             indexes.add(ConfigurationManager.getProperty("search.index." + i));
475         }
476 
477         int j;
478         int k = 0;
479 
480         // initialize hash to be built
481         HashMap   textvalues = new HashMap  ();
482 
483         if (indexes.size() > 0)
484         {
485             ArrayList   fields = new ArrayList  ();
486             ArrayList   content = new ArrayList  ();
487             DCValue[] mydc;
488 
489             for (int i = 0; i < indexes.size(); i++)
490             {
491                 String   index = (String  ) indexes.get(i);
492 
493                 String  [] configLine = index.split(":");
494                 String   indexName = configLine[0];
495 
496                 String   schema;
497                 String   element;
498                 String   qualifier = null;
499 
500                 // Get the schema, element and qualifier for the index
501                 // TODO: Should check valid schema, element, qualifier?
502                 String  [] parts = configLine[1].split("\\.");
503                 
504                 switch (parts.length)
505                 {
506                 case 3:
507                     qualifier = parts[2];
508                 case 2:
509                     schema = parts[0];
510                     element = parts[1];
511                     break;
512                 default:
513                     log.warn("Malformed configuration line: search.index." + i);
514                     // FIXME: Can't proceed here, no suitable exception to throw
515                     throw new RuntimeException  (
516                             "Malformed configuration line: search.index." + i);
517                 }
518                 
519                 // extract metadata (ANY is wildcard from Item class)
520                 if (qualifier!= null && qualifier.equals("*"))
521                 {
522                     mydc = myitem.getMetadata(schema, element, Item.ANY, Item.ANY);
523                 }
524                 else
525                 {
526                     mydc = myitem.getMetadata(schema, element, qualifier, Item.ANY);
527                 }
528 
529                 // put them all from an array of strings to one string for
530                 // writing out pack all of the arrays of DCValues into plain
531                 // text strings for the indexer
532                 String   content_text = "";
533 
534                 for (j = 0; j < mydc.length; j++)
535                 {
536                     content_text = new String  (content_text + mydc[j].value
537                             + " ");
538                 }
539 
540                 // arranges content with fields in ArrayLists with same index to
541                 // put
542                 // into hash later
543                 k = fields.indexOf(indexName);
544 
545                 if (k < 0)
546                 {
547                     fields.add(indexName);
548                     content.add(content_text);
549                 }
550                 else
551                 {
552                     content_text = new String  (content_text
553                             + (String  ) content.get(k) + " ");
554                     content.set(k, content_text);
555                 }
556             }
557 
558             // build the hash
559             for (int i = 0; i < fields.size(); i++)
560             {
561                 textvalues.put((String  ) fields.get(i), (String  ) content.get(i));
562             }
563 
564             textvalues.put("location", location_text);
565         }
566         else
567         // if no search indexes found in cfg file, for backward compatibility
568         {
569             // extract metadata (ANY is wildcard from Item class)
570             DCValue[] authors = myitem.getDC("contributor", Item.ANY, Item.ANY);
571             DCValue[] creators = myitem.getDC("creator", Item.ANY, Item.ANY);
572             DCValue[] titles = myitem.getDC("title", Item.ANY, Item.ANY);
573             DCValue[] keywords = myitem.getDC("subject", Item.ANY, Item.ANY);
574 
575             DCValue[] abstracts = myitem.getDC("description", "abstract",
576                     Item.ANY);
577             DCValue[] sors = myitem.getDC("description",
578                     "statementofresponsibility", Item.ANY);
579             DCValue[] series = myitem.getDC("relation", "ispartofseries",
580                     Item.ANY);
581             DCValue[] tocs = myitem.getDC("description", "tableofcontents",
582                     Item.ANY);
583             DCValue[] mimetypes = myitem.getDC("format", "mimetype", Item.ANY);
584             DCValue[] sponsors = myitem.getDC("description", "sponsorship",
585                     Item.ANY);
586             DCValue[] identifiers = myitem.getDC("identifier", Item.ANY,
587                     Item.ANY);
588 
589             // put them all from an array of strings to one string for writing
590             // out
591             String   author_text = "";
592             String   title_text = "";
593             String   keyword_text = "";
594 
595             String   abstract_text = "";
596             String   series_text = "";
597             String   mime_text = "";
598             String   sponsor_text = "";
599             String   id_text = "";
600 
601             // pack all of the arrays of DCValues into plain text strings for
602             // the
603             // indexer
604             for (j = 0; j < authors.length; j++)
605             {
606                 author_text = new String  (author_text + authors[j].value + " ");
607             }
608 
609             for (j = 0; j < creators.length; j++) //also authors
610             {
611                 author_text = new String  (author_text + creators[j].value + " ");
612             }
613 
614             for (j = 0; j < sors.length; j++) //also authors
615             {
616                 author_text = new String  (author_text + sors[j].value + " ");
617             }
618 
619             for (j = 0; j < titles.length; j++)
620             {
621                 title_text = new String  (title_text + titles[j].value + " ");
622             }
623 
624             for (j = 0; j < keywords.length; j++)
625             {
626                 keyword_text = new String  (keyword_text + keywords[j].value
627                         + " ");
628             }
629 
630             for (j = 0; j < abstracts.length; j++)
631             {
632                 abstract_text = new String  (abstract_text + abstracts[j].value
633                         + " ");
634             }
635 
636             for (j = 0; j < tocs.length; j++)
637             {
638                 abstract_text = new String  (abstract_text + tocs[j].value + " ");
639             }
640 
641             for (j = 0; j < series.length; j++)
642             {
643                 series_text = new String  (series_text + series[j].value + " ");
644             }
645 
646             for (j = 0; j < mimetypes.length; j++)
647             {
648                 mime_text = new String  (mime_text + mimetypes[j].value + " ");
649             }
650 
651             for (j = 0; j < sponsors.length; j++)
652             {
653                 sponsor_text = new String  (sponsor_text + sponsors[j].value
654                         + " ");
655             }
656 
657             for (j = 0; j < identifiers.length; j++)
658             {
659                 id_text = new String  (id_text + identifiers[j].value + " ");
660             }
661 
662             // build the hash
663             textvalues.put("author", author_text);
664             textvalues.put("title", title_text);
665             textvalues.put("keyword", keyword_text);
666             textvalues.put("location", location_text);
667             textvalues.put("abstract", abstract_text);
668 
669             textvalues.put("series", series_text);
670             textvalues.put("mimetype", mime_text);
671             textvalues.put("sponsor", sponsor_text);
672             textvalues.put("identifier", id_text);
673         }
674 
675         // now get full text of any bitstreams in the TEXT bundle
676         String   extractedText = "";
677 
678         // trundle through the bundles
679         Bundle[] myBundles = myitem.getBundles();
680 
681         for (int i = 0; i < myBundles.length; i++)
682         {
683             if ((myBundles[i].getName() != null)
684                     && myBundles[i].getName().equals("TEXT"))
685             {
686                 // a-ha! grab the text out of the bitstreams
687                 Bitstream[] myBitstreams = myBundles[i].getBitstreams();
688 
689                 for (j = 0; j < myBitstreams.length; j++)
690                 {
691                     try
692                     {
693                         InputStreamReader   is = new InputStreamReader  (
694                                 myBitstreams[j].retrieve()); // get input
695                         // stream
696                         StringBuffer   sb = new StringBuffer  ();
697                         char[] charBuffer = new char[1024];
698 
699                         while (true)
700                         {
701                             int bytesIn = is.read(charBuffer);
702 
703                             if (bytesIn == -1)
704                             {
705                                 break;
706                             }
707 
708                             if (bytesIn > 0)
709                             {
710                                 sb.append(charBuffer, 0, bytesIn);
711                             }
712                         }
713 
714                         // now sb has the full text - tack on to fullText string
715                         extractedText = extractedText.concat(new String  (sb));
716 
717                         //                        System.out.println("Found extracted text!\n" + new
718                         // String(sb));
719                     }
720                     catch (AuthorizeException e)
721                     {
722                         // this will never happen, but compiler is now happy.
723                     }
724                 }
725             }
726         }
727 
728         // lastly, get the handle
729         String   itemhandle = HandleManager.findHandle(c, myitem);
730         textvalues.put("handletext", itemhandle);
731 
732         if (log.isDebugEnabled())
733         {
734             log.debug(LogManager.getHeader(c, "write_index", "handle=" +itemhandle));
735             log.debug(textvalues.toString());
736         }
737 
738         // write out the metatdata (for scalability, using hash instead of
739         // individual strings)
740         writeIndexRecord(writer, Constants.ITEM, itemhandle, textvalues,
741                 extractedText);
742     }
743 
744     /**
745      * writeIndexRecord() creates a document from its args and writes it out to
746      * the index that is opened
747      */
748     private static void writeIndexRecord(IndexWriter iw, int type,
749             String   handle, HashMap   textvalues, String   extractedText)
750             throws IOException  
751     {
752         Document doc = new Document();
753         Integer   ty = new Integer  (type);
754         String   fulltext = "";
755 
756         // do id, type, handle first
757         doc.add(new Field("type", ty.toString(), Field.Store.YES, Field.Index.NO));
758 
759         // want to be able to search for handle, so use keyword
760         // (not tokenized, but it is indexed)
761         if (handle != null)
762         {
763             doc.add(new Field("handle", handle, Field.Store.YES, Field.Index.UN_TOKENIZED));
764         }
765 
766         // now iterate through the hash, building full text string
767         // and index all values
768         Iterator   i = textvalues.keySet().iterator();
769 
770         while (i.hasNext())
771         {
772             String   key = (String  ) i.next();
773             String   value = (String  ) textvalues.get(key);
774 
775             fulltext = fulltext + " " + value;
776 
777             if (value != null)
778             {
779                 doc.add(new Field(key, value, Field.Store.YES, Field.Index.TOKENIZED));
780             }
781         }
782 
783         fulltext = fulltext.concat(extractedText);
784 
785         //        System.out.println("Full Text:\n" + fulltext + "------------\n\n");
786         // add the full text
787         doc.add(new Field("default", fulltext, Field.Store.YES, Field.Index.TOKENIZED));
788 
789         // index the document
790         iw.addDocument(doc);
791     }
792 }
793
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags