package org.apache.lenya.lucene.index;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.Arrays;

import org.apache.log4j.Category;
import org.apache.lenya.lucene.IndexConfiguration;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;

import org.w3c.dom.Element;

/**
 * Base class for indexers that build and maintain a Lucene index from a
 * directory of dumped files.
 *
 * Subclasses supply the {@link DocumentCreator} that turns a file into a
 * Lucene {@link Document}; they may also override
 * {@link #getFilter(Element, String)} to change which files are considered.
 */
public abstract class AbstractIndexer implements Indexer {
    private static final Category log = Category.getInstance(AbstractIndexer.class);

    /** Value assigned to <code>IndexWriter.maxFieldLength</code> for every writer opened here. */
    private static final int MAX_FIELD_LENGTH = 1000000;

    private DocumentCreator documentCreator;
    private Element indexer;
    private String configFileName;

    /**
     * Creates a new, unconfigured indexer. Call
     * {@link #configure(Element, String)} before using it.
     */
    public AbstractIndexer() {
    }

    /**
     * Returns the document creator built by {@link #configure(Element, String)}.
     *
     * @return the document creator, or <code>null</code> if the indexer has
     *         not been configured yet
     */
    protected DocumentCreator getDocumentCreator() {
        return documentCreator;
    }

    /**
     * Configures this indexer: builds the document creator and remembers the
     * indexer element and the configuration file name for later use.
     *
     * @param indexer the indexer configuration element
     * @param configFileName the name of the index configuration file
     * @throws Exception if the document creator cannot be created
     */
    public void configure(Element indexer, String configFileName) throws Exception {
        documentCreator = createDocumentCreator(indexer, configFileName);
        this.indexer = indexer;
        this.configFileName = configFileName;
    }

    /**
     * Creates the document creator used to convert files into Lucene documents.
     *
     * @param indexer the indexer configuration element
     * @param configFileName the name of the index configuration file
     * @return the document creator
     * @throws Exception if the document creator cannot be created
     */
    public abstract DocumentCreator createDocumentCreator(Element indexer, String configFileName) throws Exception;

    /**
     * Updates an existing index: first lets a {@link DeleteHandler} remove the
     * documents reported as stale, then runs {@link #doIndex(File, File, boolean)}
     * in update (non-create) mode.
     *
     * @param dumpDirectory the directory containing the files to index
     * @param index the index directory
     * @throws Exception if deleting the stale documents fails
     */
    public void updateIndex(File dumpDirectory, File index) throws Exception {
        deleteStaleDocuments(dumpDirectory, index);
        doIndex(dumpDirectory, index, false);
    }

    /**
     * Re-indexes a single file. The dump directory and the index directory are
     * read from the index configuration file passed to
     * {@link #configure(Element, String)}. If the index already exists, any
     * document with the same id is deleted before the new one is added;
     * otherwise a new index is created.
     *
     * @param file the file to (re-)index
     * @throws Exception if the configuration cannot be read, the document
     *         cannot be created, or the index cannot be read or written
     */
    public void indexDocument(File file) throws Exception {
        IndexConfiguration config = new IndexConfiguration(configFileName);
        log.debug("File: " + file);

        File dumpDir = new File(config.resolvePath(config.getHTDocsDumpDir()));
        log.debug("Dump dir: " + dumpDir);

        File indexDir = new File(config.resolvePath(config.getIndexDir()));
        log.debug("Index dir: " + indexDir);

        String id = IndexIterator.createID(file, dumpDir);

        boolean createNewIndex = false;
        if (!IndexReader.indexExists(indexDir)) {
            log.warn("Index does not exist yet: " + indexDir);
            createNewIndex = true;
        } else {
            IndexReader reader = IndexReader.open(indexDir.getAbsolutePath());
            try {
                Term term = new Term("id", id);
                log.debug(term.toString());

                int numberOfDeletedDocuments = reader.delete(term);
                if (numberOfDeletedDocuments == 1) {
                    log.info("Document has been deleted: " + term);
                } else if (numberOfDeletedDocuments == 0) {
                    log.warn("No such document found in this index: " + term);
                } else {
                    // An id is expected to match at most one document; report the anomaly.
                    log.warn(numberOfDeletedDocuments + " documents have been deleted: " + term);
                }
            } finally {
                // Always release the reader, even if the deletion failed.
                reader.close();
            }
        }

        Document doc = getDocumentCreator().getDocument(new File(dumpDir, id), dumpDir);

        IndexWriter writer = new IndexWriter(indexDir, new StandardAnalyzer(), createNewIndex);
        try {
            writer.maxFieldLength = MAX_FIELD_LENGTH;
            writer.addDocument(doc);
            log.info("Document has been added: " + id);
            writer.optimize();
        } finally {
            // Always release the writer so a failure does not leave it open.
            writer.close();
        }
    }

    /**
     * Creates a new index from the files in the dump directory by running
     * {@link #doIndex(File, File, boolean)} in create mode.
     *
     * @param dumpDirectory the directory containing the files to index
     * @param index the index directory
     * @throws Exception declared for subclasses and callers; this
     *         implementation delegates to <code>doIndex</code>, which logs
     *         I/O errors instead of throwing them
     */
    public void createIndex(File dumpDirectory, File index)
        throws Exception {
        doIndex(dumpDirectory, index, true);
    }

    /**
     * Iterates over the dump directory and adds documents to the index.
     * In create mode a {@link CreateIndexHandler} is used, otherwise an
     * {@link UpdateIndexHandler}. The index directory is created if it does
     * not exist. I/O errors are logged, not propagated.
     *
     * @param dumpDirectory the directory containing the files to index
     * @param index the index directory
     * @param create <code>true</code> to create a new index,
     *        <code>false</code> to update an existing one
     */
    public void doIndex(File dumpDirectory, File index, boolean create) {
        if (!index.isDirectory()) {
            if (index.mkdirs()) {
                log.warn("Directory has been created: " + index.getAbsolutePath());
            } else {
                log.error("Directory could not be created: " + index.getAbsolutePath());
            }
        }

        try {
            IndexWriter writer = new IndexWriter(index.getAbsolutePath(), new StandardAnalyzer(), create);
            try {
                writer.maxFieldLength = MAX_FIELD_LENGTH;

                IndexInformation info = new IndexInformation(index.getAbsolutePath(), dumpDirectory,
                        getFilter(indexer, configFileName), create);

                IndexHandler handler;
                if (create) {
                    handler = new CreateIndexHandler(dumpDirectory, info, writer);
                } else {
                    handler = new UpdateIndexHandler(dumpDirectory, info, writer);
                }

                IndexIterator iterator = new IndexIterator(index.getAbsolutePath(),
                        getFilter(indexer, configFileName));
                iterator.addHandler(handler);
                iterator.iterate(dumpDirectory);

                writer.optimize();
            } finally {
                // Close the writer even when iteration or optimization fails.
                writer.close();
            }
        } catch (IOException e) {
            log.error("Indexing " + dumpDirectory + " into " + index.getAbsolutePath() + " failed", e);
        }
    }

    /**
     * Iterates over the dump directory with a {@link DeleteHandler}, which
     * deletes every document the iterator reports as stale.
     *
     * @param dumpDirectory the directory containing the files to index
     * @param index the index directory
     * @throws Exception if the iteration fails
     */
    protected void deleteStaleDocuments(File dumpDirectory, File index)
        throws Exception {
        log.debug("Deleting stale documents");

        IndexIterator iterator = new IndexIterator(index.getAbsolutePath(), getFilter(indexer, configFileName));
        iterator.addHandler(new DeleteHandler());
        iterator.iterate(dumpDirectory);
        log.debug("Deleting stale documents finished");
    }

    /**
     * Returns the filter that selects the files to index. This default
     * implementation ignores both arguments and accepts directories plus
     * files with the extension <code>html</code>, <code>htm</code> or
     * <code>txt</code>.
     *
     * @param indexer the indexer configuration element (unused here)
     * @param configFileName the name of the index configuration file (unused here)
     * @return the file filter
     */
    public FileFilter getFilter(Element indexer, String configFileName) {
        String[] indexableExtensions = { "html", "htm", "txt" };
        return new AbstractIndexer.DefaultIndexFilter(indexableExtensions);
    }

    /**
     * File filter that accepts every directory and every file whose extension
     * is contained in a list of indexable extensions.
     */
    public class DefaultIndexFilter implements FileFilter {
        protected String[] indexableExtensions;

        /**
         * Creates a filter for the extensions <code>html</code>,
         * <code>htm</code> and <code>txt</code>.
         */
        public DefaultIndexFilter() {
            String[] iE = { "html", "htm", "txt" };
            indexableExtensions = iE;
        }

        /**
         * Creates a filter for the given extensions.
         *
         * @param indexableExtensions the accepted extensions, without the
         *        leading dot
         */
        public DefaultIndexFilter(String[] indexableExtensions) {
            this.indexableExtensions = indexableExtensions;
        }

        /**
         * Decides whether a file is accepted. Directories are always
         * accepted. A regular file is accepted if the text after the last
         * <code>.</code> of its name equals (case-sensitively) one of the
         * indexable extensions; a name without any <code>.</code> has no
         * extension and is rejected.
         *
         * @param file the file to test
         * @return <code>true</code> if the file is accepted
         */
        public boolean accept(File file) {
            if (file.isDirectory()) {
                return true;
            }

            String fileName = file.getName();
            int dot = fileName.lastIndexOf('.');
            if (dot < 0) {
                // No extension at all: do not mistake the whole name for one.
                return false;
            }

            String extension = fileName.substring(dot + 1);
            return Arrays.asList(indexableExtensions).contains(extension);
        }
    }

    /**
     * Iterator handler that deletes stale documents from the index.
     */
    public class DeleteHandler extends AbstractIndexIteratorHandler {
        /**
         * Deletes the documents matching the term of a stale document.
         * I/O errors are logged, not propagated.
         *
         * @param reader the index reader used for the deletion
         * @param term the term identifying the stale document
         */
        public void handleStaleDocument(IndexReader reader, Term term) {
            log.debug("deleting " +
                IndexIterator.uid2url(term.text()));

            try {
                int deletedDocuments = reader.delete(term);
                log.debug("deleted " + deletedDocuments +
                    " documents.");
            } catch (IOException e) {
                log.error("Deleting stale document failed: " + term, e);
            }
        }
    }

    /**
     * Base class for the iterator handlers that add files to the index.
     */
    public class IndexHandler extends AbstractIndexIteratorHandler {
        private IndexInformation info;
        private File dumpDirectory;
        private IndexWriter writer;

        /**
         * Creates a new handler.
         *
         * @param dumpDirectory the directory containing the files to index
         * @param info the index information used for progress reporting
         * @param writer the writer the documents are added to
         */
        public IndexHandler(File dumpDirectory, IndexInformation info, IndexWriter writer) {
            this.info = info;
            this.dumpDirectory = dumpDirectory;
            this.writer = writer;
        }

        /**
         * @return the index information passed to the constructor
         */
        protected IndexInformation getInformation() {
            return info;
        }

        /**
         * @return the dump directory passed to the constructor
         */
        protected File getDumpDirectory() {
            return dumpDirectory;
        }

        /**
         * @return the index writer passed to the constructor
         */
        protected IndexWriter getWriter() {
            return writer;
        }

        /**
         * Creates a document for the file and adds it to the index. Failures
         * are logged and do not abort the iteration; the progress counter is
         * increased in either case.
         *
         * @param file the file to add
         */
        protected void addFile(File file) {
            log.debug("adding document: " + file.getAbsolutePath());

            try {
                Document doc = getDocumentCreator().getDocument(file, dumpDirectory);
                writer.addDocument(doc);
            } catch (Exception e) {
                log.error("Adding document failed: " + file.getAbsolutePath(), e);
            }

            info.increase();
            log.info(info.printProgress());
        }
    }

    /**
     * Handler used when a new index is created: every file handed to
     * {@link #handleFile(IndexReader, File)} is added.
     */
    public class CreateIndexHandler extends IndexHandler {
        /**
         * Creates a new handler.
         *
         * @param dumpDirectory the directory containing the files to index
         * @param info the index information used for progress reporting
         * @param writer the writer the documents are added to
         */
        public CreateIndexHandler(File dumpDirectory, IndexInformation info, IndexWriter writer) {
            super(dumpDirectory, info, writer);
        }

        /**
         * Adds the file to the index.
         *
         * @param reader the index reader (unused here)
         * @param file the file to add
         */
        public void handleFile(IndexReader reader, File file) {
            addFile(file);
        }
    }

    /**
     * Handler used when an existing index is updated: only files handed to
     * {@link #handleNewDocument(IndexReader, Term, File)} are added.
     */
    public class UpdateIndexHandler extends IndexHandler {
        /**
         * Creates a new handler.
         *
         * @param dumpDirectory the directory containing the files to index
         * @param info the index information used for progress reporting
         * @param writer the writer the documents are added to
         */
        public UpdateIndexHandler(File dumpDirectory, IndexInformation info, IndexWriter writer) {
            super(dumpDirectory, info, writer);
        }

        /**
         * Adds the new document's file to the index.
         *
         * @param reader the index reader (unused here)
         * @param term the term identifying the new document (unused here)
         * @param file the file to add
         */
        public void handleNewDocument(IndexReader reader, Term term, File file) {
            addFile(file);
        }
    }
}