AbstractDocumentCreator


1   /*
2    * Copyright  1999-2004 The Apache Software Foundation
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *
16   */
17  
18  /* $Id: AbstractDocumentCreator.java 43169 2004-08-02 00:19:42Z michi $  */
19  
20  package org.apache.lenya.lucene.index;
21  
22  import java.io.File  ;
23  
24  import org.apache.lucene.document.DateField;
25  import org.apache.lucene.document.Document;
26  import org.apache.lucene.document.Field;
27  
28  import org.apache.log4j.Category;
29  
30  public class AbstractDocumentCreator implements DocumentCreator {
31      Category log = Category.getInstance(AbstractDocumentCreator.class);
32  
33      /** Creates a new instance of AbstractDocumentCreator */
34      public AbstractDocumentCreator() {
35      }
36  
37      /**
38       * DOCUMENT ME!
39       *
40       * @param file DOCUMENT ME!
41       * @param htdocsDumpDir DOCUMENT ME!
42       *
43       * @return DOCUMENT ME!
44       *
45       * @throws Exception DOCUMENT ME!
46       */
47      public Document getDocument(File   file, File   htdocsDumpDir)
48          throws Exception   {
49          // make a new, empty document
50          Document doc = new Document();
51  
52          // Add the url as a field named "url".  Use an UnIndexed field, so
53          // that the url is just stored with the document, but is not searchable.
54          String   requestURI = file.getPath().replace(File.separatorChar, '/').substring(htdocsDumpDir.getPath()
55                                                                                                     .length());
56          if (requestURI.substring(requestURI.length() - 8).equals(".pdf.txt")) {
57              requestURI = requestURI.substring(0, requestURI.length() - 4); // Remove .txt extension from PDF text file
58          }
59  
60          doc.add(Field.UnIndexed("url", requestURI));
61  
62          // Add the mime-type as a field named "mime-type"
63          if (requestURI.substring(requestURI.length() - 5).equals(".html")) {
64              doc.add(Field.UnIndexed("mime-type", "text/html"));
65          } else if (requestURI.substring(requestURI.length() - 4).equals(".txt")) {
66              doc.add(Field.UnIndexed("mime-type", "text/plain"));
67          } else if (requestURI.substring(requestURI.length() - 4).equals(".pdf")) {
68              doc.add(Field.UnIndexed("mime-type", "application/pdf"));
69          } else {
70              // Don't add any mime-type field
71              //doc.add(Field.UnIndexed("mime-type", "null"));
72          }
73  
74          // Add the last modified date of the file a field named "modified".  Use a
75          // Keyword field, so that it's searchable, but so that no attempt is made
76          // to tokenize the field into words.
77          doc.add(Field.Keyword("modified", DateField.timeToString(file.lastModified())));
78  
79          // Add the id as a field, so that index can be incrementally maintained.
80      String   id = IndexIterator.createID(file, htdocsDumpDir);
81          log.debug(id);
82          doc.add(Field.Keyword("id", id));
83  
84          // Add the uid as a field, so that index can be incrementally maintained.
85          // This field is not stored with document, it is indexed, but it is not
86          // tokenized prior to indexing.
87      String   uid = IndexIterator.createUID(file, htdocsDumpDir);
88          log.debug(uid);
89          doc.add(new Field("uid", uid, false, true, false));
90  
91          return doc;
92      }
93  }
94
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags