package org.apache.lucene.demo;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import java.io.File;
import java.util.Date;
import java.util.Arrays;

/**
 * Command-line tool that walks a directory tree and indexes every
 * <code>.html</code>, <code>.htm</code> and <code>.txt</code> file it finds.
 *
 * <p>Usage: <code>IndexHTML [-create] [-index &lt;index&gt;] &lt;root_directory&gt;</code>
 *
 * <p>With <code>-create</code> a fresh index is built and every matching file
 * is added. Without it the existing index is updated in two passes over the
 * tree: the first pass (with {@link #deleting} set) deletes indexed documents
 * whose "uid" term does not match any file currently on disk, and the second
 * pass adds the files whose uid is not yet in the index.
 *
 * <p>All state is held in static fields, so this class is not safe for
 * concurrent use.
 */
class IndexHTML {
  /** Name of the index field that carries each document's unique id. */
  private static final String UID_FIELD = "uid";

  /** True during the first (deletion) pass of an incremental update. */
  private static boolean deleting = false;

  /** Reader over the existing index; only open during an incremental pass. */
  private static IndexReader reader;

  /** Writer used to add documents; null during the deletion pass. */
  private static IndexWriter writer;

  /** Enumeration of the existing "uid" terms; null when creating a new index. */
  private static TermEnum uidIter;

  /**
   * Parses the command line, runs the indexing passes and reports the
   * elapsed time. Any exception is reported on standard output rather than
   * propagated.
   *
   * @param argv command-line arguments, see the class description
   */
  public static void main(String[] argv) {
    try {
      String index = "index";
      boolean create = false;
      File root = null;

      String usage = "IndexHTML [-create] [-index <index>] <root_directory>";

      if (argv.length == 0) {
        System.err.println("Usage: " + usage);
        return;
      }

      for (int i = 0; i < argv.length; i++) {
        if (argv[i].equals("-index")) {
          // "-index" needs a value; without this check argv[++i] runs off
          // the end of the array.
          if (i == argv.length - 1) {
            System.err.println("Usage: " + usage);
            return;
          }
          index = argv[++i];
        } else if (argv[i].equals("-create")) {
          create = true;
        } else if (i != argv.length - 1) {
          // Only the final argument may be a non-option (the root directory).
          System.err.println("Usage: " + usage);
          return;
        } else {
          root = new File(argv[i]);
        }
      }

      // Options alone (e.g. just "-create") leave no root directory to index.
      if (root == null) {
        System.err.println("Usage: " + usage);
        return;
      }

      Date start = new Date();

      if (!create) {
        // Incremental update: first pass only removes stale documents.
        deleting = true;
        indexDocs(root, index, create);
      }

      writer = new IndexWriter(index, new StandardAnalyzer(), create);
      try {
        writer.maxFieldLength = 1000000;

        // Second pass (or the only pass when creating): add documents.
        indexDocs(root, index, create);

        System.out.println("Optimizing index...");
        writer.optimize();
      } finally {
        // Close the writer even when indexing or optimizing fails.
        writer.close();
      }

      Date end = new Date();

      System.out.print(end.getTime() - start.getTime());
      System.out.println(" total milliseconds");

    } catch (Exception e) {
      System.out.println(" caught a " + e.getClass() +
                         "\n with message: " + e.getMessage());
    }
  }

  /**
   * Runs one pass over the tree rooted at <code>file</code>.
   *
   * <p>When <code>create</code> is false the existing index is opened and its
   * "uid" terms are enumerated alongside the directory walk. During the
   * deletion pass, any uid terms left over once the walk has finished are
   * deleted too, and {@link #deleting} is then reset.
   *
   * @param file   root file or directory to walk
   * @param index  location of the index
   * @param create true when building a new index from scratch
   * @throws Exception if reading the index or a file fails
   */
  private static void indexDocs(File file, String index, boolean create)
      throws Exception {
    if (create) {
      indexDocs(file);
      return;
    }

    reader = IndexReader.open(index);
    try {
      uidIter = reader.terms(new Term(UID_FIELD, ""));
      try {
        indexDocs(file);

        if (deleting) {
          // Terms remaining after the walk match no file that was visited.
          while (atUidTerm()) {
            System.out.println("deleting " +
                HTMLDocument.uid2url(uidIter.term().text()));
            reader.delete(uidIter.term());
            uidIter.next();
          }
          deleting = false;
        }
      } finally {
        uidIter.close();
      }
    } finally {
      reader.close();
    }
  }

  /**
   * Recursively processes <code>file</code>: directories are descended into
   * in sorted name order, and files ending in <code>.html</code>,
   * <code>.htm</code> or <code>.txt</code> are indexed. Other files are
   * ignored.
   *
   * @param file file or directory to process
   * @throws Exception if reading the index or a file fails
   */
  private static void indexDocs(File file) throws Exception {
    if (file.isDirectory()) {
      String[] files = file.list();
      if (files == null) {
        // File.list() returns null when the directory cannot be read.
        System.err.println("skipping unreadable directory " + file.getPath());
        return;
      }
      // Visit entries in sorted order; the merge against uidIter below relies
      // on files arriving in the same order as the uid terms (presumably the
      // uid sorts like the path -- confirm against HTMLDocument.uid).
      Arrays.sort(files);
      for (int i = 0; i < files.length; i++) {
        indexDocs(new File(file, files[i]));
      }
      return;
    }

    String path = file.getPath();
    if (!(path.endsWith(".html") || path.endsWith(".htm") || path.endsWith(".txt"))) {
      return;
    }

    if (uidIter == null) {
      // Creating a new index: nothing to compare against, just add.
      addDocument(file);
      return;
    }

    String uid = HTMLDocument.uid(file);

    // Skip over indexed uids that sort before this file's uid; during the
    // deletion pass they are removed from the index.
    while (atUidTerm() && uidIter.term().text().compareTo(uid) < 0) {
      if (deleting) {
        System.out.println("deleting " +
            HTMLDocument.uid2url(uidIter.term().text()));
        reader.delete(uidIter.term());
      }
      uidIter.next();
    }

    if (atUidTerm() && uidIter.term().text().compareTo(uid) == 0) {
      // Already indexed under the same uid: keep it and move on.
      uidIter.next();
    } else if (!deleting) {
      // Not in the index: add it, but only during the add pass.
      addDocument(file);
    }
  }

  /**
   * Reports whether {@link #uidIter} is positioned on a term of the "uid"
   * field. Uses <code>equals</code> rather than <code>==</code> so the check
   * does not depend on the field name being an interned string.
   */
  private static boolean atUidTerm() throws Exception {
    Term term = uidIter.term();
    return term != null && UID_FIELD.equals(term.field());
  }

  /** Builds a document for <code>file</code> and adds it through {@link #writer}. */
  private static void addDocument(File file) throws Exception {
    Document doc = HTMLDocument.Document(file);
    System.out.println("adding " + doc.get("url"));
    writer.addDocument(doc);
  }
}