1 17 18 19 20 package org.apache.lenya.lucene; 21 22 import java.io.File ; 23 import java.io.IOException ; 24 25 import org.apache.lenya.lucene.html.HTMLParser; 26 import org.apache.lucene.document.DateField; 27 import org.apache.lucene.document.Document; 28 import org.apache.lucene.document.Field; 29 30 31 34 public class HTMLDocument { 35 static char dirSep = System.getProperty("file.separator").charAt(0); 36 37 private HTMLDocument() { 38 } 39 40 50 public static String uid(File f, File htdocsDumpDir) { 51 String requestURI = f.getPath().substring(htdocsDumpDir.getPath().length()); 52 String uid = requestURI.replace(dirSep, '\u0000') + "\u0000" + 53 DateField.timeToString(f.lastModified()); 54 55 return uid; 56 } 57 58 65 public static String uid2url(String uid) { 66 String url = uid.replace('\u0000', '/'); 68 return url.substring(0, url.lastIndexOf('/')); } 70 71 82 public static Document Document(File f, File htdocsDumpDir) 83 throws IOException , InterruptedException { 84 System.out.println("HTMLDocument.Document(File,File): " + f); 85 86 Document doc = new Document(); 88 89 String requestURI = f.getPath().replace(dirSep, '/').substring(htdocsDumpDir.getPath() 92 .length()); 93 if (requestURI.substring(requestURI.length() - 8).equals(".pdf.txt")) { 94 requestURI = requestURI.substring(0, requestURI.length() - 4); } 96 97 doc.add(Field.UnIndexed("url", requestURI)); 98 99 if (requestURI.substring(requestURI.length() - 5).equals(".html")) { 101 doc.add(Field.UnIndexed("mime-type", "text/html")); 102 } else if (requestURI.substring(requestURI.length() - 4).equals(".txt")) { 103 doc.add(Field.UnIndexed("mime-type", "text/plain")); 104 } else if (requestURI.substring(requestURI.length() - 4).equals(".pdf")) { 105 doc.add(Field.UnIndexed("mime-type", "application/pdf")); 106 } else { 107 doc.add(Field.UnIndexed("mime-type", "null")); 108 } 109 110 doc.add(Field.Keyword("modified", DateField.timeToString(f.lastModified()))); 114 115 doc.add(new Field("uid", uid(f, htdocsDumpDir), false, true, false)); 119 120 HTMLParser parser = new HTMLParser(f); 122 123 135 doc.add(Field.Text("title", parser.getTitle())); 136 137 150 doc.add(Field.Text("contents", parser.getReader())); 151 152 return doc; 153 } 154 } 155 | Popular Tags |