HTMLDocument


1   package org.apache.lucene.demo;
2   
3   /**
4    * Copyright 2004 The Apache Software Foundation
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  import java.io.*;
20  import org.apache.lucene.document.*;
21  import org.apache.lucene.demo.html.HTMLParser;
22  
23  /** A utility for making Lucene Documents for HTML documents. */
24  
25  public class HTMLDocument {
26    static char dirSep = System.getProperty("file.separator").charAt(0);
27  
28    public static String   uid(File f) {
29      // Append path and date into a string in such a way that lexicographic
30      // sorting gives the same results as a walk of the file hierarchy.  Thus
31      // null (\u0000) is used both to separate directory components and to
32      // separate the path from the date.
33      return f.getPath().replace(dirSep, '\u0000') +
34        "\u0000" +
35        DateField.timeToString(f.lastModified());
36    }
37  
38    public static String   uid2url(String   uid) {
39      String   url = uid.replace('\u0000', '/');      // replace nulls with slashes
40      return url.substring(0, url.lastIndexOf('/')); // remove date from end
41    }
42  
43    public static Document Document(File f)
44         throws IOException, InterruptedException    {
45      // make a new, empty document
46      Document doc = new Document();
47  
48      // Add the url as a field named "url".  Use an UnIndexed field, so
49      // that the url is just stored with the document, but is not searchable.
50      doc.add(Field.UnIndexed("url", f.getPath().replace(dirSep, '/')));
51  
52      // Add the last modified date of the file a field named "modified".  Use a
53      // Keyword field, so that it's searchable, but so that no attempt is made
54      // to tokenize the field into words.
55      doc.add(Field.Keyword("modified",
56                DateField.timeToString(f.lastModified())));
57  
58      // Add the uid as a field, so that index can be incrementally maintained.
59      // This field is not stored with document, it is indexed, but it is not
60      // tokenized prior to indexing.
61      doc.add(new Field("uid", uid(f), false, true, false));
62  
63      HTMLParser parser = new HTMLParser(f);
64  
65      // Add the tag-stripped contents as a Reader-valued Text field so it will
66      // get tokenized and indexed.
67      doc.add(Field.Text("contents", parser.getReader()));
68  
69      // Add the summary as an UnIndexed field, so that it is stored and returned
70      // with hit documents for display.
71      doc.add(Field.UnIndexed("summary", parser.getSummary()));
72  
73      // Add the title as a separate Text field, so that it can be searched
74      // separately.
75      doc.add(Field.Text("title", parser.getTitle()));
76  
77      // return the document
78      return doc;
79    }
80  
81    private HTMLDocument() {}
82  }
83      
84
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags