KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lenya > lucene > index > AbstractDocumentCreator


1 /*
2  * Copyright 1999-2004 The Apache Software Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  */

17
18 /* $Id: AbstractDocumentCreator.java 43169 2004-08-02 00:19:42Z michi $ */
19
20 package org.apache.lenya.lucene.index;
21
22 import java.io.File JavaDoc;
23
24 import org.apache.lucene.document.DateField;
25 import org.apache.lucene.document.Document;
26 import org.apache.lucene.document.Field;
27
28 import org.apache.log4j.Category;
29
30 public class AbstractDocumentCreator implements DocumentCreator {
31     Category log = Category.getInstance(AbstractDocumentCreator.class);
32
33     /** Creates a new instance of AbstractDocumentCreator */
34     public AbstractDocumentCreator() {
35     }
36
37     /**
38      * DOCUMENT ME!
39      *
40      * @param file DOCUMENT ME!
41      * @param htdocsDumpDir DOCUMENT ME!
42      *
43      * @return DOCUMENT ME!
44      *
45      * @throws Exception DOCUMENT ME!
46      */

47     public Document getDocument(File JavaDoc file, File JavaDoc htdocsDumpDir)
48         throws Exception JavaDoc {
49         // make a new, empty document
50
Document doc = new Document();
51
52         // Add the url as a field named "url". Use an UnIndexed field, so
53
// that the url is just stored with the document, but is not searchable.
54
String JavaDoc requestURI = file.getPath().replace(File.separatorChar, '/').substring(htdocsDumpDir.getPath()
55                                                                                                    .length());
56         if (requestURI.substring(requestURI.length() - 8).equals(".pdf.txt")) {
57             requestURI = requestURI.substring(0, requestURI.length() - 4); // Remove .txt extension from PDF text file
58
}
59
60         doc.add(Field.UnIndexed("url", requestURI));
61
62         // Add the mime-type as a field named "mime-type"
63
if (requestURI.substring(requestURI.length() - 5).equals(".html")) {
64             doc.add(Field.UnIndexed("mime-type", "text/html"));
65         } else if (requestURI.substring(requestURI.length() - 4).equals(".txt")) {
66             doc.add(Field.UnIndexed("mime-type", "text/plain"));
67         } else if (requestURI.substring(requestURI.length() - 4).equals(".pdf")) {
68             doc.add(Field.UnIndexed("mime-type", "application/pdf"));
69         } else {
70             // Don't add any mime-type field
71
//doc.add(Field.UnIndexed("mime-type", "null"));
72
}
73
74         // Add the last modified date of the file a field named "modified". Use a
75
// Keyword field, so that it's searchable, but so that no attempt is made
76
// to tokenize the field into words.
77
doc.add(Field.Keyword("modified", DateField.timeToString(file.lastModified())));
78
79         // Add the id as a field, so that index can be incrementally maintained.
80
String JavaDoc id = IndexIterator.createID(file, htdocsDumpDir);
81         log.debug(id);
82         doc.add(Field.Keyword("id", id));
83
84         // Add the uid as a field, so that index can be incrementally maintained.
85
// This field is not stored with document, it is indexed, but it is not
86
// tokenized prior to indexing.
87
String JavaDoc uid = IndexIterator.createUID(file, htdocsDumpDir);
88         log.debug(uid);
89         doc.add(new Field("uid", uid, false, true, false));
90
91         return doc;
92     }
93 }
94
Popular Tags