KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > demo > HTMLDocument


1 package org.apache.lucene.demo;
2
3 /**
4  * Copyright 2004 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
19 import java.io.*;
20 import org.apache.lucene.document.*;
21 import org.apache.lucene.demo.html.HTMLParser;
22
23 /** A utility for making Lucene Documents for HTML documents. */
24
25 public class HTMLDocument {
26   static char dirSep = System.getProperty("file.separator").charAt(0);
27
28   public static String JavaDoc uid(File f) {
29     // Append path and date into a string in such a way that lexicographic
30
// sorting gives the same results as a walk of the file hierarchy. Thus
31
// null (\u0000) is used both to separate directory components and to
32
// separate the path from the date.
33
return f.getPath().replace(dirSep, '\u0000') +
34       "\u0000" +
35       DateField.timeToString(f.lastModified());
36   }
37
38   public static String JavaDoc uid2url(String JavaDoc uid) {
39     String JavaDoc url = uid.replace('\u0000', '/'); // replace nulls with slashes
40
return url.substring(0, url.lastIndexOf('/')); // remove date from end
41
}
42
43   public static Document Document(File f)
44        throws IOException, InterruptedException JavaDoc {
45     // make a new, empty document
46
Document doc = new Document();
47
48     // Add the url as a field named "url". Use an UnIndexed field, so
49
// that the url is just stored with the document, but is not searchable.
50
doc.add(Field.UnIndexed("url", f.getPath().replace(dirSep, '/')));
51
52     // Add the last modified date of the file a field named "modified". Use a
53
// Keyword field, so that it's searchable, but so that no attempt is made
54
// to tokenize the field into words.
55
doc.add(Field.Keyword("modified",
56               DateField.timeToString(f.lastModified())));
57
58     // Add the uid as a field, so that index can be incrementally maintained.
59
// This field is not stored with document, it is indexed, but it is not
60
// tokenized prior to indexing.
61
doc.add(new Field("uid", uid(f), false, true, false));
62
63     HTMLParser parser = new HTMLParser(f);
64
65     // Add the tag-stripped contents as a Reader-valued Text field so it will
66
// get tokenized and indexed.
67
doc.add(Field.Text("contents", parser.getReader()));
68
69     // Add the summary as an UnIndexed field, so that it is stored and returned
70
// with hit documents for display.
71
doc.add(Field.UnIndexed("summary", parser.getSummary()));
72
73     // Add the title as a separate Text field, so that it can be searched
74
// separately.
75
doc.add(Field.Text("title", parser.getTitle()));
76
77     // return the document
78
return doc;
79   }
80
81   private HTMLDocument() {}
82 }
83     
84
Popular Tags