package org.apache.cocoon.components.search;

import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.logger.AbstractLogEnabled;
import org.apache.avalon.framework.service.ServiceException;
import org.apache.avalon.framework.service.ServiceManager;
import org.apache.avalon.framework.service.Serviceable;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.components.crawler.CocoonCrawler;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.Directory;

import java.io.IOException;
import java.net.URL;
import java.util.Iterator;

/**
 * A {@link LuceneCocoonIndexer} that crawls a site with a
 * {@link CocoonCrawler}, converts every crawled URL into Lucene documents
 * with a {@link LuceneXMLIndexer}, and writes those documents into a Lucene
 * index.
 *
 * <p>The analyzer must be supplied through {@link #setAnalyzer(Analyzer)}
 * before {@link #index(Directory, boolean, URL)} is called;
 * {@link #configure(Configuration)} only reads the merge factor.</p>
 */
public class SimpleLuceneCocoonIndexerImpl extends AbstractLogEnabled
    implements LuceneCocoonIndexer, Configurable, Serviceable, Disposable
{

    /**
     * Configuration element name for the analyzer class name.
     * Note: {@link #configure(Configuration)} does not read this element.
     */
    public final static String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";

    /**
     * Default analyzer class name.
     * Note: {@link #configure(Configuration)} does not apply this default.
     */
    public final static String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";

    /**
     * Configuration element name for the index directory.
     * Note: {@link #configure(Configuration)} does not read this element.
     */
    public final static String DIRECTORY_CONFIG = "directory";

    /**
     * Default index directory; there is none.
     */
    public final static String DIRECTORY_DEFAULT = null;

    /**
     * Configuration element name for the Lucene merge factor.
     */
    public final static String MERGE_FACTOR_CONFIG = "merge-factor";

    /**
     * Merge factor used when the configuration does not provide one.
     */
    public final static int MERGE_FACTOR_DEFAULT = 10;

    /**
     * The service manager used to look up the crawler and the XML indexer.
     */
    protected ServiceManager manager = null;

    /** The analyzer handed to every {@link IndexWriter} created by this class. */
    protected Analyzer analyzer;

    /** The merge factor applied to every {@link IndexWriter} created by this class. */
    private int mergeFactor = MERGE_FACTOR_DEFAULT;


    /**
     * Sets the analyzer used for indexing.
     *
     * @param analyzer the analyzer to pass to the index writer
     */
    public void setAnalyzer(Analyzer analyzer) {
        this.analyzer = analyzer;
    }


    /**
     * Reads the optional {@link #MERGE_FACTOR_CONFIG} child element.
     * When the element is absent the current merge factor is kept; when it
     * is present but holds no usable integer, {@link #MERGE_FACTOR_DEFAULT}
     * is used.
     *
     * @param conf the component configuration
     * @throws ConfigurationException declared by the {@link Configurable} contract
     */
    public void configure(Configuration conf) throws ConfigurationException {
        Configuration child = conf.getChild(MERGE_FACTOR_CONFIG, false);
        if (child != null) {
            mergeFactor = child.getValueAsInteger(MERGE_FACTOR_DEFAULT);
        }
    }


    /**
     * Stores the service manager for later component lookups.
     *
     * @param manager the service manager of this component
     * @throws ServiceException declared by the {@link Serviceable} contract
     */
    public void service(ServiceManager manager) throws ServiceException {
        this.manager = manager;
    }


    /**
     * Disposes this component. Nothing is held between calls, so there is
     * nothing to release here.
     */
    public void dispose() { }


    /**
     * Crawls the site starting at <code>base_url</code> and adds the Lucene
     * documents built for every crawled URL to the given index.
     *
     * <p>Crawled URLs whose host or port differ from those of
     * <code>base_url</code> are skipped. The index is optimized once all
     * documents have been added. The writer and the looked-up components
     * are always released, whether or not indexing succeeded.</p>
     *
     * <p>Note: no check is made that an analyzer has been set; the analyzer
     * field is passed to the writer as is.</p>
     *
     * @param index    the Lucene directory to write to
     * @param create   passed to the {@link IndexWriter} constructor; whether
     *                 to create a new index instead of appending
     * @param base_url the URL at which crawling starts
     * @throws ProcessingException if a component lookup fails, an I/O error
     *                             occurs, or building documents fails
     */
    public void index(Directory index, boolean create, URL base_url)
        throws ProcessingException {

        IndexWriter writer = null;
        LuceneXMLIndexer lxi = null;
        CocoonCrawler cocoonCrawler = null;

        try {
            lxi = (LuceneXMLIndexer) manager.lookup(LuceneXMLIndexer.ROLE);

            writer = new IndexWriter(index, analyzer, create);
            writer.mergeFactor = this.mergeFactor;

            cocoonCrawler = (CocoonCrawler) manager.lookup(CocoonCrawler.ROLE);
            cocoonCrawler.crawl(base_url);

            Iterator cocoonCrawlerIterator = cocoonCrawler.iterator();
            while (cocoonCrawlerIterator.hasNext()) {
                URL crawl_url = (URL) cocoonCrawlerIterator.next();
                if (crawl_url == null) {
                    continue;
                }

                // Stay on the site being indexed: same host and same port.
                if (!crawl_url.getHost().equals(base_url.getHost()) ||
                    crawl_url.getPort() != base_url.getPort()) {

                    if (getLogger().isDebugEnabled()) {
                        getLogger().debug("Skipping crawling URL " + crawl_url.toString() +
                            " as base_url is " + base_url.toString());
                    }
                    continue;
                }

                Iterator i = lxi.build(crawl_url).iterator();
                while (i.hasNext()) {
                    writer.addDocument((Document) i.next());
                }
            }
            writer.optimize();
        } catch (IOException ioe) {
            throw new ProcessingException("IOException in index()", ioe);
        } catch (ServiceException se) {
            throw new ProcessingException("Could not lookup service in index()", se);
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException ioe) {
                    // Best effort: do not mask an exception thrown from the
                    // try block, but do not lose the failure either.
                    if (getLogger().isWarnEnabled()) {
                        getLogger().warn("Could not close index writer", ioe);
                    }
                }
                writer = null;
            }

            if (lxi != null) {
                manager.release(lxi);
                lxi = null;
            }
            if (cocoonCrawler != null) {
                manager.release(cocoonCrawler);
                cocoonCrawler = null;
            }
        }
    }


    /**
     * Walks the <code>uid</code> terms of an index and deletes the documents
     * belonging to them.
     *
     * <p>Call {@link #close()} when done; the instance must not be used
     * afterwards. {@link #finalize()} closes the resources only as a last
     * resort.</p>
     */
    static class DocumentDeletableIterator {
        /** Name of the field whose terms identify documents. */
        private static final String UID_FIELD = "uid";

        private IndexReader reader;
        private TermEnum uidIter;


        /**
         * Opens a reader on the directory and positions the term enumeration
         * at the first <code>uid</code> term.
         *
         * @param directory the index directory to read
         * @throws IOException if the index cannot be opened or enumerated
         */
        public DocumentDeletableIterator(Directory directory) throws IOException {
            reader = IndexReader.open(directory);
            boolean positioned = false;
            try {
                uidIter = reader.terms(new Term(UID_FIELD, ""));
                positioned = true;
            } finally {
                if (!positioned) {
                    // Do not leak the reader when the enumeration cannot be opened.
                    try {
                        reader.close();
                    } catch (IOException ignored) {
                        // The exception from reader.terms() is the one to report.
                    }
                    reader = null;
                }
            }
        }


        /**
         * Deletes the documents of every remaining <code>uid</code> term,
         * starting at the current position of the enumeration.
         *
         * @throws IOException if deleting or advancing fails
         */
        public void deleteAllStaleDocuments() throws IOException {
            while (documentIsDeletable(uidIter.term())) {
                reader.delete(uidIter.term());
                uidIter.next();
            }
        }


        /**
         * Deletes the documents of every <code>uid</code> term that sorts
         * before <code>uid</code>, then steps over the term equal to
         * <code>uid</code> if the enumeration is positioned on it.
         *
         * @param uid the uid to compare the enumerated terms against
         * @throws IOException if deleting or advancing fails
         */
        public void deleteModifiedDocuments(String uid) throws IOException {
            while (documentHasBeenModified(uidIter.term(), uid)) {
                reader.delete(uidIter.term());
                uidIter.next();
            }
            if (documentHasNotBeenModified(uidIter.term(), uid)) {
                uidIter.next();
            }
        }


        /**
         * Closes the term enumeration and the index reader. Calling this
         * method more than once is harmless.
         *
         * @throws IOException if closing either resource fails
         */
        public void close() throws IOException {
            // Clear the fields first so a failed close is not retried.
            TermEnum terms = uidIter;
            IndexReader openReader = reader;
            uidIter = null;
            reader = null;
            try {
                if (terms != null) {
                    terms.close();
                }
            } finally {
                if (openReader != null) {
                    openReader.close();
                }
            }
        }


        /**
         * Safety net that closes the resources if {@link #close()} was never
         * called.
         *
         * @throws Throwable if closing or the superclass finalizer fails
         */
        protected void finalize() throws Throwable {
            try {
                close();
            } finally {
                super.finalize();
            }
        }


        /**
         * Tells whether the term is a <code>uid</code> term.
         *
         * @param term the term to test, may be <code>null</code>
         * @return <code>true</code> if the term is not null and its field is
         *         <code>uid</code>
         */
        boolean documentIsDeletable(Term term) {
            // equals(), not ==: do not depend on the field name being interned.
            return term != null && UID_FIELD.equals(term.field());
        }


        /**
         * Tells whether the term is a <code>uid</code> term whose text sorts
         * before <code>uid</code>.
         *
         * @param term the term to test, may be <code>null</code>
         * @param uid  the uid to compare against
         * @return <code>true</code> if the term text is less than <code>uid</code>
         */
        boolean documentHasBeenModified(Term term, String uid) {
            return documentIsDeletable(term) &&
                term.text().compareTo(uid) < 0;
        }


        /**
         * Tells whether the term is a <code>uid</code> term whose text equals
         * <code>uid</code>.
         *
         * @param term the term to test, may be <code>null</code>
         * @param uid  the uid to compare against
         * @return <code>true</code> if the term text equals <code>uid</code>
         */
        boolean documentHasNotBeenModified(Term term, String uid) {
            return documentIsDeletable(term) &&
                term.text().compareTo(uid) == 0;
        }
    }
}