KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lenya > lucene > HTMLDocument


1 /*
2  * Copyright 1999-2004 The Apache Software Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  */

17
18 /* $Id: HTMLDocument.java 42598 2004-03-01 16:18:28Z gregor $ */
19
20 package org.apache.lenya.lucene;
21
22 import java.io.File JavaDoc;
23 import java.io.IOException JavaDoc;
24
25 import org.apache.lenya.lucene.html.HTMLParser;
26 import org.apache.lucene.document.DateField;
27 import org.apache.lucene.document.Document;
28 import org.apache.lucene.document.Field;
29
30
31 /**
32  * A utility for making Lucene Documents for HTML documents.
33  */

34 public class HTMLDocument {
35     static char dirSep = System.getProperty("file.separator").charAt(0);
36
37     private HTMLDocument() {
38     }
39
40     /**
41      * Append path and date into a string in such a way that lexicographic sorting gives the same
42      * results as a walk of the file hierarchy. Thus null () is used both to separate
43      * directory components and to separate the path from the date.
44      *
45      * @param f DOCUMENT ME!
46      * @param htdocsDumpDir DOCUMENT ME!
47      *
48      * @return DOCUMENT ME!
49      */

50     public static String JavaDoc uid(File JavaDoc f, File JavaDoc htdocsDumpDir) {
51         String JavaDoc requestURI = f.getPath().substring(htdocsDumpDir.getPath().length());
52         String JavaDoc uid = requestURI.replace(dirSep, '\u0000') + "\u0000" +
53             DateField.timeToString(f.lastModified());
54
55         return uid;
56     }
57
58     /**
59      * DOCUMENT ME!
60      *
61      * @param uid DOCUMENT ME!
62      *
63      * @return DOCUMENT ME!
64      */

65     public static String JavaDoc uid2url(String JavaDoc uid) {
66         String JavaDoc url = uid.replace('\u0000', '/'); // replace nulls with slashes
67

68         return url.substring(0, url.lastIndexOf('/')); // remove date from end
69
}
70
71     /**
72      * DOCUMENT ME!
73      *
74      * @param f DOCUMENT ME!
75      * @param htdocsDumpDir DOCUMENT ME!
76      *
77      * @return org.apache.lucene.document.Document
78      *
79      * @throws IOException DOCUMENT ME!
80      * @throws InterruptedException DOCUMENT ME!
81      */

82     public static Document Document(File JavaDoc f, File JavaDoc htdocsDumpDir)
83         throws IOException JavaDoc, InterruptedException JavaDoc {
84         System.out.println("HTMLDocument.Document(File,File): " + f);
85
86         // make a new, empty document
87
Document doc = new Document();
88
89         // Add the url as a field named "url". Use an UnIndexed field, so
90
// that the url is just stored with the document, but is not searchable.
91
String JavaDoc requestURI = f.getPath().replace(dirSep, '/').substring(htdocsDumpDir.getPath()
92                                                                                     .length());
93         if (requestURI.substring(requestURI.length() - 8).equals(".pdf.txt")) {
94             requestURI = requestURI.substring(0, requestURI.length() - 4); // Remove .txt extension from PDF text file
95
}
96
97         doc.add(Field.UnIndexed("url", requestURI));
98
99         // Add the mime-type as a field named "mime-type"
100
if (requestURI.substring(requestURI.length() - 5).equals(".html")) {
101             doc.add(Field.UnIndexed("mime-type", "text/html"));
102         } else if (requestURI.substring(requestURI.length() - 4).equals(".txt")) {
103             doc.add(Field.UnIndexed("mime-type", "text/plain"));
104         } else if (requestURI.substring(requestURI.length() - 4).equals(".pdf")) {
105             doc.add(Field.UnIndexed("mime-type", "application/pdf"));
106         } else {
107             doc.add(Field.UnIndexed("mime-type", "null"));
108         }
109
110         // Add the last modified date of the file a field named "modified". Use a
111
// Keyword field, so that it's searchable, but so that no attempt is made
112
// to tokenize the field into words.
113
doc.add(Field.Keyword("modified", DateField.timeToString(f.lastModified())));
114
115         // Add the uid as a field, so that index can be incrementally maintained.
116
// This field is not stored with document, it is indexed, but it is not
117
// tokenized prior to indexing.
118
doc.add(new Field("uid", uid(f, htdocsDumpDir), false, true, false));
119
120         //HtmlDocument htmlDoc = new HtmlDocument(f);
121
HTMLParser parser = new HTMLParser(f);
122
123         // Add the summary as an UnIndexed field, so that it is stored and returned
124
// with hit documents for display.
125
// Add the title as a separate Text field, so that it can be searched separately.
126
/*
127                 String title = htmlDoc.getTitle();
128
129                 if (title != null) {
130                     doc.add(Field.Text("title", title));
131                 } else {
132                     doc.add(Field.Text("title", ""));
133                 }
134         */

135         doc.add(Field.Text("title", parser.getTitle()));
136
137         //System.out.println("HTMLDocument.getLuceneDocument(): title field added: " + title);
138
// Add the tag-stripped contents as a Reader-valued Text field so it will get tokenized and indexed.
139
/*
140                 String body = htmlDoc.getBody();
141                 String contents = "";
142
143                 if ((body != null) && (title != null)) {
144                     contents = title + " " + body;
145                     doc.add(Field.Text("contents", title + body));
146                 }
147
148                 doc.add(Field.Text("contents", contents));
149         */

150         doc.add(Field.Text("contents", parser.getReader()));
151
152         return doc;
153     }
154 }
155
Popular Tags