1 package org.pdfbox.searchengine.lucene; 2 3 61 62 import org.apache.lucene.analysis.standard.StandardAnalyzer; 63 64 import org.apache.lucene.demo.HTMLDocument; 65 66 import org.apache.lucene.document.Document; 67 68 import org.apache.lucene.index.IndexReader; 69 import org.apache.lucene.index.IndexWriter; 70 import org.apache.lucene.index.Term; 71 import org.apache.lucene.index.TermEnum; 72 73 import java.util.Arrays ; 74 75 76 import java.io.File ; 77 import java.io.IOException ; 78 79 import java.util.Date ; 80 81 82 91 public class IndexFiles 92 { 93 private boolean deleting = false; private IndexReader reader; private IndexWriter writer; private TermEnum uidIter; 98 103 public static void main(String [] argv) 104 { 105 106 String index = "index"; 107 boolean create = false; 108 File root = null; 109 110 String usage = "org.pdfbox.searchengine.lucene.IndexFiles [-create] [-index <index>] <root_directory>"; 111 112 if (argv.length == 0) 113 { 114 System.err.println("Usage: " + usage); 115 return; 116 } 117 118 for (int i = 0; i < argv.length; i++) 119 { 120 if (argv[i].equals("-index")) 121 { index = argv[++i]; 123 } 124 else if (argv[i].equals("-create")) 125 { create = true; 127 } 128 else if (i != argv.length-1) 129 { 130 System.err.println("Usage: " + usage); 131 return; 132 } 133 else 134 { 135 System.out.println( "root=" +argv[i] ); 136 root = new File (argv[i]); 137 } 138 } 139 IndexFiles indexer = new IndexFiles(); 140 indexer.index( root, create, index ); 141 } 142 143 150 public void index( File root, boolean create, String index ) 151 { 152 153 try 154 { 155 Date start = new Date (); 156 157 writer = new IndexWriter(index, new StandardAnalyzer(), create); 158 159 if (!create) 160 { deleting = true; 162 indexDocs(root, index, create); 163 } 164 165 indexDocs(root, index, create); 167 System.out.println("Optimizing index..."); 168 writer.optimize(); 169 writer.close(); 170 171 Date end = new Date (); 172 173 System.out.print(end.getTime() - start.getTime()); 174 System.out.println(" total milliseconds"); 175 176 } 177 catch( Exception e ) 178 { 179 e.printStackTrace(); 180 } 181 } 182 183 195 private void indexDocs(File file, String index, boolean create) throws Exception 196 { 197 if (!create) 198 { 200 reader = IndexReader.open(index); uidIter = reader.terms(new Term("uid", "")); 203 indexDocs(file); 204 205 if (deleting) 206 { while (uidIter.term() != null && uidIter.term().field().equals( "uid" ) ) 208 { 209 System.out.println("deleting " + 210 HTMLDocument.uid2url(uidIter.term().text())); 211 reader.deleteDocuments(uidIter.term()); 212 uidIter.next(); 213 } 214 deleting = false; 215 } 216 217 uidIter.close(); reader.close(); 220 } 221 else 222 { 223 indexDocs(file); 224 } 225 } 226 227 228 private void indexDocs(File file) throws Exception 229 { 230 if (file.isDirectory()) 231 { String [] files = file.list(); Arrays.sort(files); for (int i = 0; i < files.length; i++) { 236 indexDocs(new File (file, files[i])); 237 } 238 } 239 else 240 { 241 if (uidIter != null) 242 { 243 String uid = HTMLDocument.uid(file); 245 while( uidIter.term() != null && 246 uidIter.term().field().equals( "uid" ) && 247 uidIter.term().text().compareTo(uid) < 0) 248 { 249 if (deleting) 250 { System.out.println("deleting " + 252 HTMLDocument.uid2url(uidIter.term().text())); 253 reader.deleteDocuments(uidIter.term()); 254 } 255 uidIter.next(); 256 } 257 if( uidIter.term() != null && 258 uidIter.term().field().equals( "uid" ) && 259 uidIter.term().text().compareTo(uid) == 0) 260 { 261 System.out.println( "Next uid=" +uidIter ); 262 uidIter.next(); } 264 } 265 else 266 { 267 try 268 { 269 addDocument( file ); 270 } 271 catch( IOException e ) 272 { 273 System.out.println( e.getMessage() ); 275 } 276 } 277 } 278 } 279 280 private void addDocument( File file ) throws IOException , InterruptedException 281 { 282 String path = file.getName().toUpperCase(); 283 Document doc = null; 284 if( path.endsWith(".HTML") || path.endsWith(".HTM") || path.endsWith(".TXT")) 288 { 289 System.out.println( "Indexing Text document: " + file ); 290 doc = HTMLDocument.Document(file); 291 } 292 else if( path.endsWith( ".PDF" ) ) 293 { 294 System.out.println( "Indexing PDF document: " + file ); 295 doc = LucenePDFDocument.getDocument( file ); 296 } 297 else 298 { 299 System.out.println( "Skipping " + file ); 300 } 301 302 if( doc != null ) 303 { 304 writer.addDocument(doc); 305 } 306 } 307 } | Popular Tags |