KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > cocoon > transformation > LuceneIndexTransformer


/*
 * Copyright 1999-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.cocoon.transformation;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
import java.util.Stack;

import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.context.Context;
import org.apache.avalon.framework.context.ContextException;
import org.apache.avalon.framework.context.Contextualizable;
import org.apache.avalon.framework.parameters.Parameters;

import org.apache.cocoon.Constants;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.caching.CacheableProcessingComponent;
import org.apache.cocoon.components.search.LuceneCocoonHelper;
import org.apache.cocoon.components.search.LuceneXMLIndexer;
import org.apache.cocoon.environment.SourceResolver;
import org.apache.commons.lang.BooleanUtils;
import org.apache.excalibur.source.SourceValidity;
import org.apache.excalibur.source.impl.validity.NOPValidity;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

53 /**
54  * A lucene index creation transformer.
55  * <p>See <a HREF="http://wiki.cocoondev.org/Wiki.jsp?page=LuceneIndexTransformer">LuceneIndexTransformer</a>
56  * documentation on the Cocoon Wiki.</p>
57  * <p>TODO: Write more documentation.</p>
58  *
59  * @author <a HREF="mailto:vgritsenko@apache.org">Vadim Gritsenko</a>
60  * @author <a HREF="mailto:conal@nzetc.org">Conal Tuohy</a>
61  * @version CVS $Id: LuceneIndexTransformer.java 124685 2005-01-08 22:20:56Z antonio $
62  */

63 public class LuceneIndexTransformer extends AbstractTransformer
64     implements CacheableProcessingComponent, Configurable, Contextualizable {
65
66     public static final String JavaDoc ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
67     public static final String JavaDoc ANALYZER_CLASSNAME_PARAMETER = "analyzer-classname";
68     public static final String JavaDoc ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";
69     public static final String JavaDoc DIRECTORY_CONFIG = "directory";
70     public static final String JavaDoc DIRECTORY_PARAMETER = "directory";
71     public static final String JavaDoc DIRECTORY_DEFAULT = "index";
72     public static final String JavaDoc MERGE_FACTOR_CONFIG = "merge-factor";
73     public static final String JavaDoc MERGE_FACTOR_PARAMETER = "merge-factor";
74     public static final int MERGE_FACTOR_DEFAULT = 20;
75
76     public static final String JavaDoc LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";
77     public static final String JavaDoc LUCENE_QUERY_ELEMENT = "index";
78     public static final String JavaDoc LUCENE_QUERY_ANALYZER_ATTRIBUTE = "analyzer";
79     public static final String JavaDoc LUCENE_QUERY_DIRECTORY_ATTRIBUTE = "directory";
80     public static final String JavaDoc LUCENE_QUERY_CREATE_ATTRIBUTE = "create";
81     public static final String JavaDoc LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE = "merge-factor";
82     public static final String JavaDoc LUCENE_DOCUMENT_ELEMENT = "document";
83     public static final String JavaDoc LUCENE_DOCUMENT_URL_ATTRIBUTE = "url";
84     public static final String JavaDoc LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE = "text-attr";
85     public static final String JavaDoc LUCENE_ELEMENT_ATTR_STORE_VALUE = "store";
86     public static final String JavaDoc LUCENE_ELAPSED_TIME_ATTRIBUTE = "elapsed-time";
87     public static final String JavaDoc CDATA = "CDATA";
88     
89     // The 3 states of the state machine
90
private static final int STATE_GROUND = 0; // initial or "ground" state
91
private static final int STATE_QUERY = 1; // processing a lucene:index (Query) element
92
private static final int STATE_DOCUMENT = 2; // processing a lucene:document element
93

94     // Initialization time variables
95
protected File JavaDoc workDir = null;
96
97     // Declaration time parameters values (specified in sitemap component config)
98
private IndexerConfiguration configureConfiguration;
99     // Invocation time parameters values (specified in sitemap transform parameters)
100
private IndexerConfiguration setupConfiguration;
101     // Parameters specified in the input document
102
private IndexerConfiguration queryConfiguration;
103
104     // Runtime variables
105
private int processing;
106     private boolean createIndex = false;
107     private IndexWriter writer;
108     private StringBuffer JavaDoc bodyText;
109     private Document bodyDocument;
110     private String JavaDoc bodyDocumentURL;
111     private Stack JavaDoc elementStack = new Stack JavaDoc();
112     /**
113      * Storage for the document element's attributes until the document
114      * has been indexed, so that they can be copied to the output
115      * along with a boolean <code>indexed</code> attribute.
116      */

117     private AttributesImpl JavaDoc documentAttributes;
118     private long documentStartTime;
119
120
121     private static String JavaDoc uid(String JavaDoc url) {
122         return url.replace('/', '\u0000'); // + "\u0000" + DateField.timeToString(urlConnection.getLastModified());
123
}
124
125
126     /**
127      * Configure the transformer. The configuration parameters are stored as
128      * general defaults, which may be over-ridden by parameters specified as
129      * parameters in the sitemap pipeline, or by attributes of the query
130      * element(s) in the XML input document.
131      */

132     public void configure(Configuration conf) throws ConfigurationException {
133         this.configureConfiguration = new IndexerConfiguration(
134             conf.getChild(ANALYZER_CLASSNAME_CONFIG).getValue(ANALYZER_CLASSNAME_DEFAULT),
135             conf.getChild(DIRECTORY_CONFIG).getValue(DIRECTORY_DEFAULT),
136             conf.getChild(MERGE_FACTOR_CONFIG).getValueAsInteger(MERGE_FACTOR_DEFAULT)
137         );
138     }
139
140     /**
141      * Setup the transformer.
142      * Called when the pipeline is assembled.
143      * The parameters are those specified as child elements of the
144      * <code>&lt;map:transform&gt;</code> element in the sitemap.
145      * These parameters are optional:
146      * If no parameters are specified here then the defaults are
147      * supplied by the component configuration.
148      * Any parameters specified here may be over-ridden by attributes
149      * of the lucene:index element in the input document.
150      */

151     public void setup(SourceResolver resolver, Map JavaDoc objectModel, String JavaDoc src, Parameters parameters)
152     throws ProcessingException, SAXException JavaDoc, IOException JavaDoc {
153         setupConfiguration = new IndexerConfiguration(
154             parameters.getParameter(ANALYZER_CLASSNAME_PARAMETER, configureConfiguration.analyzerClassname),
155             parameters.getParameter(DIRECTORY_PARAMETER, configureConfiguration.indexDirectory),
156             parameters.getParameterAsInteger(MERGE_FACTOR_PARAMETER, configureConfiguration.mergeFactor)
157         );
158     }
159
160     /**
161      * Contextualize this class
162      */

163     public void contextualize(Context context) throws ContextException {
164         this.workDir = (File JavaDoc) context.get(Constants.CONTEXT_WORK_DIR);
165     }
166
167     public void recycle() {
168         this.processing = STATE_GROUND;
169         if (this.writer != null) {
170             try { this.writer.close(); } catch (IOException JavaDoc ioe) { }
171             this.writer = null;
172         }
173         this.bodyText = null;
174         this.bodyDocument = null;
175         this.bodyDocumentURL = null;
176         this.elementStack.clear();
177         super.recycle();
178     }
179
180     /**
181      * Generate the unique key.
182      * This key must be unique inside the space of this component.
183      *
184      * @return The generated key
185      */

186     public Serializable JavaDoc getKey() {
187         return "1";
188     }
189
190     /**
191      * Generate the validity object.
192      *
193      * @return The generated validity object or <code>null</code> if the
194      * component is currently not cacheable.
195      */

196     public SourceValidity getValidity() {
197         return NOPValidity.SHARED_INSTANCE;
198     }
199
200
201     public void startDocument() throws SAXException JavaDoc {
202         super.startDocument();
203     }
204
205     public void endDocument() throws SAXException JavaDoc {
206         super.endDocument();
207     }
208
209     /**
210      * Begin the scope of a prefix-URI Namespace mapping.
211      *
212      * @param prefix The Namespace prefix being declared.
213      * @param uri The Namespace URI the prefix is mapped to.
214      */

215     public void startPrefixMapping(String JavaDoc prefix, String JavaDoc uri) throws SAXException JavaDoc {
216         if (processing == STATE_GROUND) {
217             super.startPrefixMapping(prefix,uri);
218         }
219     }
220
221     /**
222      * End the scope of a prefix-URI mapping.
223      *
224      * @param prefix The prefix that was being mapping.
225      */

226     public void endPrefixMapping(String JavaDoc prefix) throws SAXException JavaDoc {
227         if (processing == STATE_GROUND) {
228             super.endPrefixMapping(prefix);
229         }
230     }
231
232     public void startElement(String JavaDoc namespaceURI, String JavaDoc localName, String JavaDoc qName, Attributes JavaDoc atts)
233         throws SAXException JavaDoc {
234
235         if (processing == STATE_GROUND) {
236             if (LUCENE_URI.equals(namespaceURI) && LUCENE_QUERY_ELEMENT.equals(localName)){
237                 String JavaDoc sCreate = atts.getValue(LUCENE_QUERY_CREATE_ATTRIBUTE);
238                 createIndex = BooleanUtils.toBoolean(sCreate);
239
240                 String JavaDoc analyzerClassname = atts.getValue(LUCENE_QUERY_ANALYZER_ATTRIBUTE);
241                 String JavaDoc indexDirectory = atts.getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE);
242                 String JavaDoc mergeFactor = atts.getValue(LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE);
243
244                 queryConfiguration = new IndexerConfiguration(
245                     analyzerClassname != null ? analyzerClassname : setupConfiguration.analyzerClassname,
246                     indexDirectory != null ? indexDirectory : setupConfiguration.indexDirectory,
247                     mergeFactor != null ? Integer.parseInt(mergeFactor) : setupConfiguration.mergeFactor
248                 );
249
250                 if (!createIndex) {
251                     // Not asked to create the index - but check if this is necessary anyway:
252
try {
253                         IndexReader reader = openReader();
254                         reader.close();
255                     } catch (IOException JavaDoc ioe) {
256                         // couldn't open the index - so recreate it
257
createIndex = true;
258                     }
259                 }
260                 // propagate the lucene:index to the next stage in the pipeline
261
super.startElement(namespaceURI, localName, qName, atts);
262                 processing = STATE_QUERY;
263             } else {
264                 super.startElement(namespaceURI, localName, qName, atts);
265             }
266         } else if (processing == STATE_QUERY) {
267             // processing a lucene:index - expecting a lucene:document
268
if (LUCENE_URI.equals(namespaceURI) && LUCENE_DOCUMENT_ELEMENT.equals(localName)){
269                 this.bodyDocumentURL = atts.getValue(LUCENE_DOCUMENT_URL_ATTRIBUTE);
270                 if (this.bodyDocumentURL == null) {
271                     throw new SAXException JavaDoc("<lucene:document> must have @url attribute");
272                 }
273
274                 // Remember the time the document indexing began
275
this.documentStartTime = System.currentTimeMillis();
276                 // remember these attributes so they can be passed on to the next stage in the pipeline,
277
// when this document element is ended.
278
this.documentAttributes = new AttributesImpl JavaDoc(atts);
279                 this.bodyText = new StringBuffer JavaDoc();
280                 this.bodyDocument = new Document();
281                 this.elementStack.clear();
282                 processing = STATE_DOCUMENT;
283             } else {
284                 throw new SAXException JavaDoc("<lucene:index> element can contain only <lucene:document> elements!");
285             }
286         } else if (processing == STATE_DOCUMENT) {
287             elementStack.push(new IndexHelperField(localName, new AttributesImpl JavaDoc(atts)));
288         }
289     }
290
291     public void endElement(String JavaDoc namespaceURI, String JavaDoc localName, String JavaDoc qName)
292         throws SAXException JavaDoc {
293
294         if (processing == STATE_QUERY) {
295             if (LUCENE_URI.equals(namespaceURI) && LUCENE_QUERY_ELEMENT.equals(localName)) {
296                 // End query processing
297
try {
298                     if (this.writer == null) {
299                         openWriter();
300                     }
301                     this.writer.optimize();
302                     this.writer.close();
303                     this.writer = null;
304                 } catch (IOException JavaDoc e) {
305                     throw new SAXException JavaDoc(e);
306                 }
307                 // propagate the query element to the next stage in the pipeline
308
super.endElement(namespaceURI, localName, qName);
309                 this.processing = STATE_GROUND;
310             } else {
311                 throw new SAXException JavaDoc("</lucene:index> was expected!");
312             }
313         } else if (processing == STATE_DOCUMENT) {
314             if (LUCENE_URI.equals(namespaceURI) && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
315                 // End document processing
316
this.bodyDocument.add(Field.UnStored(LuceneXMLIndexer.BODY_FIELD, this.bodyText.toString()));
317                 this.bodyText = null;
318
319                 this.bodyDocument.add(Field.UnIndexed(LuceneXMLIndexer.URL_FIELD, this.bodyDocumentURL));
320                 // store: false, index: true, tokenize: false
321
this.bodyDocument.add(new Field(LuceneXMLIndexer.UID_FIELD, uid(this.bodyDocumentURL), false, true, false));
322                 try {
323                     reindexDocument();
324                 } catch (IOException JavaDoc e) {
325                     throw new SAXException JavaDoc(e);
326                 }
327                 this.bodyDocumentURL = null;
328
329                 // propagate the lucene:document element to the next stage in the pipeline
330
long elapsedTime = System.currentTimeMillis() - this.documentStartTime;
331                 //documentAttributes = new AttributesImpl();
332
this.documentAttributes.addAttribute(
333                     "",
334                     LUCENE_ELAPSED_TIME_ATTRIBUTE,
335                     LUCENE_ELAPSED_TIME_ATTRIBUTE,
336                     CDATA,
337                     String.valueOf(elapsedTime)
338                 );
339                 super.startElement(namespaceURI, localName, qName, this.documentAttributes);
340                 super.endElement(namespaceURI, localName, qName);
341                 this.processing = STATE_QUERY;
342             } else {
343                 // End element processing
344
IndexHelperField tos = (IndexHelperField) elementStack.pop();
345                 StringBuffer JavaDoc text = tos.getText();
346
347                 Attributes JavaDoc atts = tos.getAttributes();
348                 boolean attributesToText = atts.getIndex(LUCENE_URI, LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE) != -1;
349                 for (int i = 0; i < atts.getLength(); i++) {
350                     // Ignore Lucene attributes
351
if (LUCENE_URI.equals(atts.getURI(i)))
352                         continue;
353
354                     String JavaDoc atts_lname = atts.getLocalName(i);
355                     String JavaDoc atts_value = atts.getValue(i);
356                     bodyDocument.add(Field.UnStored(localName + "@" + atts_lname, atts_value));
357                     if (attributesToText) {
358                         text.append(atts_value);
359                         text.append(' ');
360                         bodyText.append(atts_value);
361                         bodyText.append(' ');
362                     }
363                 }
364
365                 boolean store = atts.getIndex(LUCENE_URI, LUCENE_ELEMENT_ATTR_STORE_VALUE) != -1;
366                 if (text != null && text.length() > 0) {
367                     if (store) {
368                         bodyDocument.add(Field.Text(localName, text.toString()));
369                     } else {
370                         bodyDocument.add(Field.UnStored(localName, text.toString()));
371                     }
372                 }
373             }
374         } else {
375             // All other tags
376
super.endElement(namespaceURI, localName, qName);
377         }
378     }
379
380     public void characters(char[] ch, int start, int length)
381         throws SAXException JavaDoc {
382
383         if (processing == STATE_DOCUMENT && ch.length > 0 && start >= 0 && length > 1 && elementStack.size() > 0) {
384             String JavaDoc text = new String JavaDoc(ch, start, length);
385             ((IndexHelperField) elementStack.peek()).append(text);
386             bodyText.append(text);
387             bodyText.append(' ');
388         } else if (processing == STATE_GROUND) {
389             super.characters(ch, start, length);
390         }
391     }
392
393     private void openWriter() throws IOException JavaDoc {
394             File JavaDoc indexDirectory = new File JavaDoc(queryConfiguration.indexDirectory);
395         if (!indexDirectory.isAbsolute()) {
396             indexDirectory = new File JavaDoc(workDir, queryConfiguration.indexDirectory);
397         }
398
399         // If the index directory doesn't exist, then always create it.
400
boolean indexExists = IndexReader.indexExists(indexDirectory);
401         if (!indexExists) {
402             createIndex = true;
403         }
404         
405         // Get the index directory, creating it if necessary
406
Directory directory = LuceneCocoonHelper.getDirectory(indexDirectory, createIndex);
407         Analyzer analyzer = LuceneCocoonHelper.getAnalyzer(queryConfiguration.analyzerClassname);
408         this.writer = new IndexWriter(directory, analyzer, createIndex);
409         this.writer.mergeFactor = queryConfiguration.mergeFactor;
410     }
411     
412     private IndexReader openReader() throws IOException JavaDoc {
413             File JavaDoc indexDirectory = new File JavaDoc(queryConfiguration.indexDirectory);
414         if (!indexDirectory.isAbsolute()) {
415             indexDirectory = new File JavaDoc(workDir, queryConfiguration.indexDirectory);
416         }
417     
418         Directory directory = LuceneCocoonHelper.getDirectory(indexDirectory, createIndex);
419         IndexReader reader = IndexReader.open(directory);
420         return reader;
421     }
422
423      private void reindexDocument() throws IOException JavaDoc {
424         if (this.createIndex) {
425             // The index is being created, so there's no need to delete the doc from an existing index.
426
// This means we can keep a single IndexWriter open throughout the process.
427
if (this.writer == null)
428                 openWriter();
429             this.writer.addDocument(this.bodyDocument);
430         } else {
431             // This is an incremental reindex, so the document should be removed from the index before adding it
432
try {
433                 IndexReader reader = openReader();
434                 reader.delete(new Term(LuceneXMLIndexer.UID_FIELD, uid(this.bodyDocumentURL)));
435                 reader.close();
436             } catch (IOException JavaDoc e) { /* ignore */ }
437             openWriter();
438             this.writer.addDocument(this.bodyDocument);
439             this.writer.close();
440             this.writer = null;
441         }
442         this.bodyDocument = null;
443      }
444
445     static class IndexHelperField {
446         String JavaDoc localName;
447         StringBuffer JavaDoc text;
448         Attributes JavaDoc attributes;
449
450         IndexHelperField(String JavaDoc localName, Attributes JavaDoc atts) {
451             this.localName = localName;
452             this.attributes = atts;
453             this.text = new StringBuffer JavaDoc();
454         }
455
456         public Attributes JavaDoc getAttributes() {
457             return attributes;
458         }
459
460         public StringBuffer JavaDoc getText() {
461             return text;
462         }
463
464         public void append(String JavaDoc text) {
465             this.text.append(text);
466         }
467
468         public void append(char[] str, int offset, int length) {
469             this.text.append(str, offset, length);
470         }
471     }
472
473     static class IndexerConfiguration {
474         String JavaDoc analyzerClassname;
475         String JavaDoc indexDirectory;
476         int mergeFactor;
477
478         public IndexerConfiguration(String JavaDoc analyzerClassname,
479                                     String JavaDoc indexDirectory,
480                                     int mergeFactor)
481         {
482             this.analyzerClassname = analyzerClassname;
483             this.indexDirectory = indexDirectory;
484             this.mergeFactor = mergeFactor;
485         }
486     }
487
488 }
489
Popular Tags