KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > contineo > searchengine > crawler > Indexer


1 package org.contineo.searchengine.crawler;
2
3 import java.io.File JavaDoc;
4 import java.io.IOException JavaDoc;
5 import org.apache.log4j.Level;
6 import org.apache.log4j.Logger;
7 import org.apache.lucene.analysis.Analyzer;
8 import org.apache.lucene.analysis.de.GermanAnalyzer;
9 import org.apache.lucene.analysis.fr.FrenchAnalyzer;
10 import org.apache.lucene.analysis.standard.StandardAnalyzer;
11 import org.apache.lucene.document.Document;
12 import org.apache.lucene.index.IndexReader;
13 import org.apache.lucene.index.IndexWriter;
14 import org.apache.lucene.index.MultiReader;
15 import org.apache.lucene.index.Term;
16 import org.apache.lucene.store.FSDirectory;
17 import org.contineo.core.LoggingManager;
18 import org.contineo.core.config.SettingConfigurator;
19 import org.contineo.core.text.AnalyzeText;
20 import org.contineo.core.text.analyze.Stopwords;
21 import org.contineo.core.text.lili.LanguageIdentifier;
22 import org.contineo.core.text.parser.Parser;
23 import org.contineo.core.text.parser.ParserFactory;
24 import org.contineo.searchengine.util.SquareSimilarity;
25
26 /**
27  * Class for indexing a given file.
28  * Created on 5. November 2003, 16:45
29  * @author Michael Scholz
30  */

31 public class Indexer {
32
33     /**
34      * @uml.property name="writer"
35      * @uml.associationEnd
36      */

37     private IndexWriter writer;
38
39     /**
40      * @uml.property name="logger"
41      * @uml.associationEnd
42      */

43     private Logger logger;
44
45     /**
46      *
47      * @uml.property name="conf"
48      * @uml.associationEnd
49      * @uml.property name="conf" multiplicity="(1 1)"
50      */

51     private SettingConfigurator conf;
52
53     /**
54      * Creates a new instance of Indexer.
55      * @param lang - Language which the indexer should use.
56      */

57     public Indexer() {
58         logger = LoggingManager.getLogger(this.getClass());
59         conf = new SettingConfigurator();
60     }
61
62     public synchronized int addFile(File JavaDoc file, org.contineo.documan.Document d, StringBuffer JavaDoc content, String JavaDoc language) throws Exception JavaDoc {
63         String JavaDoc name = file.getName();
64         int testversion = -1;
65         int result = -1;
66         name = name.substring(name.lastIndexOf(".") + 1);
67         try {
68             testversion = Integer.parseInt(name);
69         } catch (Exception JavaDoc e) {
70         }
71         if (testversion == -1) {
72             LuceneDocument lDoc = new LuceneDocument(d);
73             try {
74                 Document doc = lDoc.getDocument(file, content);
75                 result = addDocument(doc, language);
76             } catch (Exception JavaDoc e) {
77                  if (logger.isEnabledFor(Level.ERROR))
78                     logger.error(e.getMessage());
79             }
80             try {
81                 AnalyzeText aText = new AnalyzeText();
82                 aText.storeTerms(d.getMenuId(), content.toString(), language);
83             } catch (Exception JavaDoc e) {
84                  if (logger.isEnabledFor(Level.ERROR))
85                     logger.error(e.getMessage());
86             }
87         }
88         return result;
89     }
90     
91     /**
92      * Adds a LuceneDocument to the index.
93      * @param doc
94      */

95     public int addDocument(Document doc, String JavaDoc language) {
96         try {
97             Analyzer analyzer;
98             String JavaDoc dir;
99             if (language.equals("de")) {
100                 analyzer = new GermanAnalyzer(Stopwords.getStopwords("de"));
101                 dir = "german";
102             } else if (language.equals("fr")) {
103                 analyzer = new FrenchAnalyzer(Stopwords.getStopwords("fr"));
104                 dir = "french";
105             } else {
106                 analyzer = new StandardAnalyzer(Stopwords.getStopwords("en"));
107                 dir = "english";
108             }
109             String JavaDoc path = conf.getValue("indexdir");
110             if (!path.endsWith(File.pathSeparator))
111                     path += "/";
112             path += dir + "/";
113             try {
114                 writer = new IndexWriter(path ,analyzer, false);
115             } catch (IOException JavaDoc ioe) {
116                 writer = new IndexWriter(path ,analyzer, true);
117                 //if (logger.isEnabledFor(Level.ERROR))
118
//logger.error(ioe.getMessage());
119
}
120             writer.setSimilarity(new SquareSimilarity());
121             writer.addDocument(doc);
122             writer.optimize();
123             writer.close();
124             return writer.docCount() - 1;
125         } catch (Exception JavaDoc e) {
126             if (logger.isEnabledFor(Level.ERROR))
127                 logger.error(e.getMessage());
128             return -1;
129         }
130     }
131     
132     /**
133      * Adds all documents of a given directory to the index of the search engine.
134      * @param file - Path of the directory.
135      * @throws Exception
136      */

137     public void addDirectory(File JavaDoc file, org.contineo.documan.Document doc) throws Exception JavaDoc {
138         if (file.isDirectory()){
139             String JavaDoc[] subitems = file.list();
140             for (int i=0; i<subitems.length; i++)
141             {
142                 addDirectory(new File JavaDoc(file,subitems[i]), doc);
143             }
144         } else{
145             try {
146                 Parser parser = ParserFactory.getParser(file);
147                 if (parser == null)
148                     return;
149                 StringBuffer JavaDoc content = parser.getContent();
150                 LanguageIdentifier lili = new LanguageIdentifier();
151                 String JavaDoc language = lili.identify(content.toString());
152                 if (language == null || language.equals(""))
153                     language = "en";
154                 addFile(file, doc, content, language);
155             } catch (Exception JavaDoc e) {
156                 if (logger.isEnabledFor(Level.ERROR))
157                     logger.error(e.getMessage());
158             }
159         }
160     }
161     
162     protected void optimize() {
163         String JavaDoc path = conf.getValue("indexdir");
164         if (!path.endsWith(File.pathSeparator))
165             path += "/";
166         try {
167             writer = new IndexWriter(path + "english/",new StandardAnalyzer(), false);
168             writer.optimize();
169             writer.close();
170             writer = new IndexWriter(path + "french/",new FrenchAnalyzer(), false);
171             writer.optimize();
172             writer.close();
173             writer = new IndexWriter(path + "german/",new GermanAnalyzer(), false);
174             writer.optimize();
175             writer.close();
176         } catch (Exception JavaDoc e) {
177             if (logger.isEnabledFor(Level.ERROR))
178                 logger.error(e.getMessage());
179         }
180     }
181     
182     /**
183      * Deletes the entries of a document in the index of the search engine.
184      * @param menuid - MenuID of the document.
185      */

186     public void deleteFile(String JavaDoc menuid) {
187         String JavaDoc path = conf.getValue("indexdir");
188         if (!path.endsWith(File.pathSeparator))
189             path += "/";
190         try {
191             IndexReader enreader = IndexReader.open(path + "english/");
192             IndexReader frreader = IndexReader.open(path + "french/");
193             IndexReader dereader = IndexReader.open(path + "german/");
194             IndexReader[] readers = new IndexReader[]{enreader,frreader,dereader};
195             MultiReader reader = new MultiReader(readers);
196             reader.delete(new Term("menuid",menuid));
197             reader.close();
198             optimize();
199         } catch (IOException JavaDoc ioe) {
200             if (logger.isEnabledFor(Level.ERROR))
201                 logger.error(ioe.getMessage());
202         }
203     }
204     
205     public Document getDocument(int luceneid) {
206         String JavaDoc path = conf.getValue("indexdir");
207         if (!path.endsWith(File.pathSeparator))
208             path += "/";
209         try {
210             IndexReader enreader = IndexReader.open(path + "english/");
211             IndexReader frreader = IndexReader.open(path + "french/");
212             IndexReader dereader = IndexReader.open(path + "german/");
213             IndexReader[] readers = new IndexReader[]{enreader,frreader,dereader};
214             MultiReader reader = new MultiReader(readers);
215             Document doc = reader.document(luceneid);
216             reader.close();
217             return doc;
218         } catch (Exception JavaDoc e) {
219             if (logger.isEnabledFor(Level.ERROR))
220                 logger.error(e.getMessage());
221             return null;
222         }
223     }
224     
225     /**
226      * This method can unlock a locked index.
227      */

228     public void unlock() {
229         String JavaDoc path = conf.getValue("indexdir");
230         if (!path.endsWith(File.pathSeparator))
231             path += "/";
232         try {
233             FSDirectory enfsdir = FSDirectory.getDirectory(path + "english/",false);
234             IndexReader reader = IndexReader.open(enfsdir);
235             IndexReader.unlock(enfsdir);
236             reader.close();
237             FSDirectory frfsdir = FSDirectory.getDirectory(path + "french/",false);
238             reader = IndexReader.open(frfsdir);
239             IndexReader.unlock(frfsdir);
240             reader.close();
241             FSDirectory defsdir = FSDirectory.getDirectory(path + "german/",false);
242             reader = IndexReader.open(defsdir);
243             IndexReader.unlock(defsdir);
244             reader.close();
245         } catch (Exception JavaDoc e) {
246             if (logger.isEnabledFor(Level.ERROR))
247                 logger.error(e.getMessage());
248         }
249     }
250     
251     public boolean isLocked() {
252         boolean result = false;
253         String JavaDoc path = conf.getValue("indexdir");
254         if (!path.endsWith(File.pathSeparator))
255             path += "/";
256         try {
257             FSDirectory enfsdir = FSDirectory.getDirectory(path + "english/",false);
258             IndexReader reader = IndexReader.open(enfsdir);
259             if (IndexReader.isLocked(enfsdir))
260                 result = true;
261             reader.close();
262             FSDirectory frfsdir = FSDirectory.getDirectory(path + "french/",false);
263             reader = IndexReader.open(frfsdir);
264             if (IndexReader.isLocked(frfsdir))
265                 result = true;
266             reader.close();
267             FSDirectory defsdir = FSDirectory.getDirectory(path + "german/",false);
268             reader = IndexReader.open(defsdir);
269             if (IndexReader.isLocked(defsdir))
270                 result = true;
271             reader.close();
272         } catch (Exception JavaDoc e) {
273             if (logger.isEnabledFor(Level.ERROR))
274                 logger.error(e.getMessage());
275         }
276         return result;
277     }
278     
279     /**
280      * Returns the number of indexed documents in all indexes.
281      * Used for statistical output.
282      * @return
283      */

284     public int getCount() {
285         int count = 0;
286         String JavaDoc path = conf.getValue("indexdir");
287         if (!path.endsWith(File.pathSeparator))
288             path += "/";
289         try {
290             IndexReader enreader = IndexReader.open(path + "english/");
291             IndexReader frreader = IndexReader.open(path + "french/");
292             IndexReader dereader = IndexReader.open(path + "german/");
293             count = enreader.numDocs();
294             count += frreader.numDocs();
295             count += dereader.numDocs();
296             enreader.close();
297             frreader.close();
298             dereader.close();
299         } catch (Exception JavaDoc e) {
300             if (logger.isEnabledFor(Level.ERROR))
301                 logger.error(e.getMessage());
302         }
303         return count;
304     }
305 }
306
Popular Tags