1 16 package org.apache.cocoon.transformation; 17 18 import java.io.File ; 19 import java.io.IOException ; 20 import java.io.Serializable ; 21 import java.util.Map ; 22 import java.util.Stack ; 23 24 import org.apache.avalon.framework.configuration.Configurable; 25 import org.apache.avalon.framework.configuration.Configuration; 26 import org.apache.avalon.framework.configuration.ConfigurationException; 27 import org.apache.avalon.framework.context.Context; 28 import org.apache.avalon.framework.context.ContextException; 29 import org.apache.avalon.framework.context.Contextualizable; 30 import org.apache.avalon.framework.parameters.Parameters; 31 32 import org.apache.cocoon.Constants; 33 import org.apache.cocoon.ProcessingException; 34 import org.apache.cocoon.caching.CacheableProcessingComponent; 35 import org.apache.cocoon.components.search.LuceneCocoonHelper; 36 import org.apache.cocoon.components.search.LuceneXMLIndexer; 37 import org.apache.cocoon.environment.SourceResolver; 38 import org.apache.commons.lang.BooleanUtils; 39 import org.apache.excalibur.source.SourceValidity; 40 import org.apache.excalibur.source.impl.validity.NOPValidity; 41 42 import org.apache.lucene.analysis.Analyzer; 43 import org.apache.lucene.document.Document; 44 import org.apache.lucene.document.Field; 45 import org.apache.lucene.index.IndexWriter; 46 import org.apache.lucene.index.IndexReader; 47 import org.apache.lucene.index.Term; 48 import org.apache.lucene.store.Directory; 49 import org.xml.sax.Attributes ; 50 import org.xml.sax.SAXException ; 51 import org.xml.sax.helpers.AttributesImpl ; 52 53 63 public class LuceneIndexTransformer extends AbstractTransformer 64 implements CacheableProcessingComponent, Configurable, Contextualizable { 65 66 public static final String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname"; 67 public static final String ANALYZER_CLASSNAME_PARAMETER = "analyzer-classname"; 68 public static final String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer"; 69 public static final String DIRECTORY_CONFIG = "directory"; 70 public static final String DIRECTORY_PARAMETER = "directory"; 71 public static final String DIRECTORY_DEFAULT = "index"; 72 public static final String MERGE_FACTOR_CONFIG = "merge-factor"; 73 public static final String MERGE_FACTOR_PARAMETER = "merge-factor"; 74 public static final int MERGE_FACTOR_DEFAULT = 20; 75 76 public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0"; 77 public static final String LUCENE_QUERY_ELEMENT = "index"; 78 public static final String LUCENE_QUERY_ANALYZER_ATTRIBUTE = "analyzer"; 79 public static final String LUCENE_QUERY_DIRECTORY_ATTRIBUTE = "directory"; 80 public static final String LUCENE_QUERY_CREATE_ATTRIBUTE = "create"; 81 public static final String LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE = "merge-factor"; 82 public static final String LUCENE_DOCUMENT_ELEMENT = "document"; 83 public static final String LUCENE_DOCUMENT_URL_ATTRIBUTE = "url"; 84 public static final String LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE = "text-attr"; 85 public static final String LUCENE_ELEMENT_ATTR_STORE_VALUE = "store"; 86 public static final String LUCENE_ELAPSED_TIME_ATTRIBUTE = "elapsed-time"; 87 public static final String CDATA = "CDATA"; 88 89 private static final int STATE_GROUND = 0; private static final int STATE_QUERY = 1; private static final int STATE_DOCUMENT = 2; 94 protected File workDir = null; 96 97 private IndexerConfiguration configureConfiguration; 99 private IndexerConfiguration setupConfiguration; 101 private IndexerConfiguration queryConfiguration; 103 104 private int processing; 106 private boolean createIndex = false; 107 private IndexWriter writer; 108 private StringBuffer bodyText; 109 private Document bodyDocument; 110 private String bodyDocumentURL; 111 private Stack elementStack = new Stack (); 112 117 private AttributesImpl documentAttributes; 118 private long documentStartTime; 119 120 121 private static String uid(String url) { 122 return url.replace('/', '\u0000'); } 124 125 126 132 public void configure(Configuration conf) throws ConfigurationException { 133 this.configureConfiguration = new IndexerConfiguration( 134 conf.getChild(ANALYZER_CLASSNAME_CONFIG).getValue(ANALYZER_CLASSNAME_DEFAULT), 135 conf.getChild(DIRECTORY_CONFIG).getValue(DIRECTORY_DEFAULT), 136 conf.getChild(MERGE_FACTOR_CONFIG).getValueAsInteger(MERGE_FACTOR_DEFAULT) 137 ); 138 } 139 140 151 public void setup(SourceResolver resolver, Map objectModel, String src, Parameters parameters) 152 throws ProcessingException, SAXException , IOException { 153 setupConfiguration = new IndexerConfiguration( 154 parameters.getParameter(ANALYZER_CLASSNAME_PARAMETER, configureConfiguration.analyzerClassname), 155 parameters.getParameter(DIRECTORY_PARAMETER, configureConfiguration.indexDirectory), 156 parameters.getParameterAsInteger(MERGE_FACTOR_PARAMETER, configureConfiguration.mergeFactor) 157 ); 158 } 159 160 163 public void contextualize(Context context) throws ContextException { 164 this.workDir = (File ) context.get(Constants.CONTEXT_WORK_DIR); 165 } 166 167 public void recycle() { 168 this.processing = STATE_GROUND; 169 if (this.writer != null) { 170 try { this.writer.close(); } catch (IOException ioe) { } 171 this.writer = null; 172 } 173 this.bodyText = null; 174 this.bodyDocument = null; 175 this.bodyDocumentURL = null; 176 this.elementStack.clear(); 177 super.recycle(); 178 } 179 180 186 public Serializable getKey() { 187 return "1"; 188 } 189 190 196 public SourceValidity getValidity() { 197 return NOPValidity.SHARED_INSTANCE; 198 } 199 200 201 public void startDocument() throws SAXException { 202 super.startDocument(); 203 } 204 205 public void endDocument() throws SAXException { 206 super.endDocument(); 207 } 208 209 215 public void startPrefixMapping(String prefix, String uri) throws SAXException { 216 if (processing == STATE_GROUND) { 217 super.startPrefixMapping(prefix,uri); 218 } 219 } 220 221 226 public void endPrefixMapping(String prefix) throws SAXException { 227 if (processing == STATE_GROUND) { 228 super.endPrefixMapping(prefix); 229 } 230 } 231 232 public void startElement(String namespaceURI, String localName, String qName, Attributes atts) 233 throws SAXException { 234 235 if (processing == STATE_GROUND) { 236 if (LUCENE_URI.equals(namespaceURI) && LUCENE_QUERY_ELEMENT.equals(localName)){ 237 String sCreate = atts.getValue(LUCENE_QUERY_CREATE_ATTRIBUTE); 238 createIndex = BooleanUtils.toBoolean(sCreate); 239 240 String analyzerClassname = atts.getValue(LUCENE_QUERY_ANALYZER_ATTRIBUTE); 241 String indexDirectory = atts.getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE); 242 String mergeFactor = atts.getValue(LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE); 243 244 queryConfiguration = new IndexerConfiguration( 245 analyzerClassname != null ? analyzerClassname : setupConfiguration.analyzerClassname, 246 indexDirectory != null ? indexDirectory : setupConfiguration.indexDirectory, 247 mergeFactor != null ? Integer.parseInt(mergeFactor) : setupConfiguration.mergeFactor 248 ); 249 250 if (!createIndex) { 251 try { 253 IndexReader reader = openReader(); 254 reader.close(); 255 } catch (IOException ioe) { 256 createIndex = true; 258 } 259 } 260 super.startElement(namespaceURI, localName, qName, atts); 262 processing = STATE_QUERY; 263 } else { 264 super.startElement(namespaceURI, localName, qName, atts); 265 } 266 } else if (processing == STATE_QUERY) { 267 if (LUCENE_URI.equals(namespaceURI) && LUCENE_DOCUMENT_ELEMENT.equals(localName)){ 269 this.bodyDocumentURL = atts.getValue(LUCENE_DOCUMENT_URL_ATTRIBUTE); 270 if (this.bodyDocumentURL == null) { 271 throw new SAXException ("<lucene:document> must have @url attribute"); 272 } 273 274 this.documentStartTime = System.currentTimeMillis(); 276 this.documentAttributes = new AttributesImpl (atts); 279 this.bodyText = new StringBuffer (); 280 this.bodyDocument = new Document(); 281 this.elementStack.clear(); 282 processing = STATE_DOCUMENT; 283 } else { 284 throw new SAXException ("<lucene:index> element can contain only <lucene:document> elements!"); 285 } 286 } else if (processing == STATE_DOCUMENT) { 287 elementStack.push(new IndexHelperField(localName, new AttributesImpl (atts))); 288 } 289 } 290 291 public void endElement(String namespaceURI, String localName, String qName) 292 throws SAXException { 293 294 if (processing == STATE_QUERY) { 295 if (LUCENE_URI.equals(namespaceURI) && LUCENE_QUERY_ELEMENT.equals(localName)) { 296 try { 298 if (this.writer == null) { 299 openWriter(); 300 } 301 this.writer.optimize(); 302 this.writer.close(); 303 this.writer = null; 304 } catch (IOException e) { 305 throw new SAXException (e); 306 } 307 super.endElement(namespaceURI, localName, qName); 309 this.processing = STATE_GROUND; 310 } else { 311 throw new SAXException ("</lucene:index> was expected!"); 312 } 313 } else if (processing == STATE_DOCUMENT) { 314 if (LUCENE_URI.equals(namespaceURI) && LUCENE_DOCUMENT_ELEMENT.equals(localName)) { 315 this.bodyDocument.add(Field.UnStored(LuceneXMLIndexer.BODY_FIELD, this.bodyText.toString())); 317 this.bodyText = null; 318 319 this.bodyDocument.add(Field.UnIndexed(LuceneXMLIndexer.URL_FIELD, this.bodyDocumentURL)); 320 this.bodyDocument.add(new Field(LuceneXMLIndexer.UID_FIELD, uid(this.bodyDocumentURL), false, true, false)); 322 try { 323 reindexDocument(); 324 } catch (IOException e) { 325 throw new SAXException (e); 326 } 327 this.bodyDocumentURL = null; 328 329 long elapsedTime = System.currentTimeMillis() - this.documentStartTime; 331 this.documentAttributes.addAttribute( 333 "", 334 LUCENE_ELAPSED_TIME_ATTRIBUTE, 335 LUCENE_ELAPSED_TIME_ATTRIBUTE, 336 CDATA, 337 String.valueOf(elapsedTime) 338 ); 339 super.startElement(namespaceURI, localName, qName, this.documentAttributes); 340 super.endElement(namespaceURI, localName, qName); 341 this.processing = STATE_QUERY; 342 } else { 343 IndexHelperField tos = (IndexHelperField) elementStack.pop(); 345 StringBuffer text = tos.getText(); 346 347 Attributes atts = tos.getAttributes(); 348 boolean attributesToText = atts.getIndex(LUCENE_URI, LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE) != -1; 349 for (int i = 0; i < atts.getLength(); i++) { 350 if (LUCENE_URI.equals(atts.getURI(i))) 352 continue; 353 354 String atts_lname = atts.getLocalName(i); 355 String atts_value = atts.getValue(i); 356 bodyDocument.add(Field.UnStored(localName + "@" + atts_lname, atts_value)); 357 if (attributesToText) { 358 text.append(atts_value); 359 text.append(' '); 360 bodyText.append(atts_value); 361 bodyText.append(' '); 362 } 363 } 364 365 boolean store = atts.getIndex(LUCENE_URI, LUCENE_ELEMENT_ATTR_STORE_VALUE) != -1; 366 if (text != null && text.length() > 0) { 367 if (store) { 368 bodyDocument.add(Field.Text(localName, text.toString())); 369 } else { 370 bodyDocument.add(Field.UnStored(localName, text.toString())); 371 } 372 } 373 } 374 } else { 375 super.endElement(namespaceURI, localName, qName); 377 } 378 } 379 380 public void characters(char[] ch, int start, int length) 381 throws SAXException { 382 383 if (processing == STATE_DOCUMENT && ch.length > 0 && start >= 0 && length > 1 && elementStack.size() > 0) { 384 String text = new String (ch, start, length); 385 ((IndexHelperField) elementStack.peek()).append(text); 386 bodyText.append(text); 387 bodyText.append(' '); 388 } else if (processing == STATE_GROUND) { 389 super.characters(ch, start, length); 390 } 391 } 392 393 private void openWriter() throws IOException { 394 File indexDirectory = new File (queryConfiguration.indexDirectory); 395 if (!indexDirectory.isAbsolute()) { 396 indexDirectory = new File (workDir, queryConfiguration.indexDirectory); 397 } 398 399 boolean indexExists = IndexReader.indexExists(indexDirectory); 401 if (!indexExists) { 402 createIndex = true; 403 } 404 405 Directory directory = LuceneCocoonHelper.getDirectory(indexDirectory, createIndex); 407 Analyzer analyzer = LuceneCocoonHelper.getAnalyzer(queryConfiguration.analyzerClassname); 408 this.writer = new IndexWriter(directory, analyzer, createIndex); 409 this.writer.mergeFactor = queryConfiguration.mergeFactor; 410 } 411 412 private IndexReader openReader() throws IOException { 413 File indexDirectory = new File (queryConfiguration.indexDirectory); 414 if (!indexDirectory.isAbsolute()) { 415 indexDirectory = new File (workDir, queryConfiguration.indexDirectory); 416 } 417 418 Directory directory = LuceneCocoonHelper.getDirectory(indexDirectory, createIndex); 419 IndexReader reader = IndexReader.open(directory); 420 return reader; 421 } 422 423 private void reindexDocument() throws IOException { 424 if (this.createIndex) { 425 if (this.writer == null) 428 openWriter(); 429 this.writer.addDocument(this.bodyDocument); 430 } else { 431 try { 433 IndexReader reader = openReader(); 434 reader.delete(new Term(LuceneXMLIndexer.UID_FIELD, uid(this.bodyDocumentURL))); 435 reader.close(); 436 } catch (IOException e) { } 437 openWriter(); 438 this.writer.addDocument(this.bodyDocument); 439 this.writer.close(); 440 this.writer = null; 441 } 442 this.bodyDocument = null; 443 } 444 445 static class IndexHelperField { 446 String localName; 447 StringBuffer text; 448 Attributes attributes; 449 450 IndexHelperField(String localName, Attributes atts) { 451 this.localName = localName; 452 this.attributes = atts; 453 this.text = new StringBuffer (); 454 } 455 456 public Attributes getAttributes() { 457 return attributes; 458 } 459 460 public StringBuffer getText() { 461 return text; 462 } 463 464 public void append(String text) { 465 this.text.append(text); 466 } 467 468 public void append(char[] str, int offset, int length) { 469 this.text.append(str, offset, length); 470 } 471 } 472 473 static class IndexerConfiguration { 474 String analyzerClassname; 475 String indexDirectory; 476 int mergeFactor; 477 478 public IndexerConfiguration(String analyzerClassname, 479 String indexDirectory, 480 int mergeFactor) 481 { 482 this.analyzerClassname = analyzerClassname; 483 this.indexDirectory = indexDirectory; 484 this.mergeFactor = mergeFactor; 485 } 486 } 487 488 } 489 | Popular Tags |