HTMLDocument


1   /*
2    * Copyright  1999-2004 The Apache Software Foundation
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *
16   */
17  
18  /* $Id: HTMLDocument.java 42598 2004-03-01 16:18:28Z gregor $  */
19  
20  package org.apache.lenya.lucene;
21  
22  import java.io.File  ;
23  import java.io.IOException  ;
24  
25  import org.apache.lenya.lucene.html.HTMLParser;
26  import org.apache.lucene.document.DateField;
27  import org.apache.lucene.document.Document;
28  import org.apache.lucene.document.Field;
29  
30  
31  /**
32   * A utility for making Lucene Documents for HTML documents.
33   */
34  public class HTMLDocument {
35      static char dirSep = System.getProperty("file.separator").charAt(0);
36  
37      private HTMLDocument() {
38      }
39  
40      /**
41       * Append path and date into a string in such a way that lexicographic sorting gives the same
42       * results as a walk of the file hierarchy.  Thus null () is used both to separate
43       * directory components and to separate the path from the date.
44       *
45       * @param f DOCUMENT ME!
46       * @param htdocsDumpDir DOCUMENT ME!
47       *
48       * @return DOCUMENT ME!
49       */
50      public static String   uid(File   f, File   htdocsDumpDir) {
51          String   requestURI = f.getPath().substring(htdocsDumpDir.getPath().length());
52          String   uid = requestURI.replace(dirSep, '\u0000') + "\u0000" +
53              DateField.timeToString(f.lastModified());
54  
55          return uid;
56      }
57  
58      /**
59       * DOCUMENT ME!
60       *
61       * @param uid DOCUMENT ME!
62       *
63       * @return DOCUMENT ME!
64       */
65      public static String   uid2url(String   uid) {
66          String   url = uid.replace('\u0000', '/'); // replace nulls with slashes
67  
68          return url.substring(0, url.lastIndexOf('/')); // remove date from end
69      }
70  
71      /**
72       * DOCUMENT ME!
73       *
74       * @param f DOCUMENT ME!
75       * @param htdocsDumpDir DOCUMENT ME!
76       *
77       * @return org.apache.lucene.document.Document
78       *
79       * @throws IOException DOCUMENT ME!
80       * @throws InterruptedException DOCUMENT ME!
81       */
82      public static Document Document(File   f, File   htdocsDumpDir)
83          throws IOException  , InterruptedException   {
84          System.out.println("HTMLDocument.Document(File,File): " + f);
85  
86          // make a new, empty document
87          Document doc = new Document();
88  
89          // Add the url as a field named "url".  Use an UnIndexed field, so
90          // that the url is just stored with the document, but is not searchable.
91          String   requestURI = f.getPath().replace(dirSep, '/').substring(htdocsDumpDir.getPath()
92                                                                                      .length());
93          if (requestURI.substring(requestURI.length() - 8).equals(".pdf.txt")) {
94              requestURI = requestURI.substring(0, requestURI.length() - 4); // Remove .txt extension from PDF text file
95          }
96  
97          doc.add(Field.UnIndexed("url", requestURI));
98  
99          // Add the mime-type as a field named "mime-type"
100         if (requestURI.substring(requestURI.length() - 5).equals(".html")) {
101             doc.add(Field.UnIndexed("mime-type", "text/html"));
102         } else if (requestURI.substring(requestURI.length() - 4).equals(".txt")) {
103             doc.add(Field.UnIndexed("mime-type", "text/plain"));
104         } else if (requestURI.substring(requestURI.length() - 4).equals(".pdf")) {
105             doc.add(Field.UnIndexed("mime-type", "application/pdf"));
106         } else {
107             doc.add(Field.UnIndexed("mime-type", "null"));
108         }
109 
110         // Add the last modified date of the file a field named "modified".  Use a
111         // Keyword field, so that it's searchable, but so that no attempt is made
112         // to tokenize the field into words.
113         doc.add(Field.Keyword("modified", DateField.timeToString(f.lastModified())));
114 
115         // Add the uid as a field, so that index can be incrementally maintained.
116         // This field is not stored with document, it is indexed, but it is not
117         // tokenized prior to indexing.
118         doc.add(new Field("uid", uid(f, htdocsDumpDir), false, true, false));
119 
120         //HtmlDocument htmlDoc = new HtmlDocument(f);
121         HTMLParser parser = new HTMLParser(f);
122 
123         // Add the summary as an UnIndexed field, so that it is stored and returned
124         // with hit documents for display.
125         // Add the title as a separate Text field, so that it can be searched separately.
126         /*
127                 String title = htmlDoc.getTitle();
128 
129                 if (title != null) {
130                     doc.add(Field.Text("title", title));
131                 } else {
132                     doc.add(Field.Text("title", ""));
133                 }
134         */
135         doc.add(Field.Text("title", parser.getTitle()));
136 
137         //System.out.println("HTMLDocument.getLuceneDocument(): title field added: " + title);
138         // Add the tag-stripped contents as a Reader-valued Text field so it will get tokenized and indexed.
139         /*
140                 String body = htmlDoc.getBody();
141                 String contents = "";
142 
143                 if ((body != null) && (title != null)) {
144                     contents = title + " " + body;
145                     doc.add(Field.Text("contents", title + body));
146                 }
147 
148                 doc.add(Field.Text("contents", contents));
149         */
150         doc.add(Field.Text("contents", parser.getReader()));
151 
152         return doc;
153     }
154 }
155
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags