HtmlDocument


1   /*
2    * Copyright  1999-2004 The Apache Software Foundation
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *
16   */
17  
18  /* $Id: HtmlDocument.java 42598 2004-03-01 16:18:28Z gregor $  */
19  
20  package org.apache.lenya.lucene.html;
21  
22  
23  // org.w3c.dom.Document and org.apache.lucene.document.Document are deliberately
24  // NOT imported: their simple names clash, so both are referenced by their fully
25  // qualified class names in the code. All other types are imported normally.
26  import java.io.BufferedReader  ;
27  import java.io.File  ;
28  import java.io.FileInputStream  ;
29  import java.io.FileReader  ;
30  import java.io.IOException  ;
31  import java.io.InputStream  ;
32  import java.io.StringWriter  ;
33  
34  import org.apache.lucene.document.Field;
35  import org.w3c.dom.Attr  ;
36  import org.w3c.dom.Element  ;
37  import org.w3c.dom.Node  ;
38  import org.w3c.dom.NodeList  ;
39  import org.w3c.dom.Text  ;
40  import org.w3c.tidy.Tidy;
41  
42  
43  /**
44   * The <code>HtmlDocument</code> class creates a Lucene {@link org.apache.lucene.document.Document}
45   * from an HTML document.
46   *
47   * <P>
48   * It does this by using JTidy package. It can take input input from {@link java.io.File} or {@link
49   * java.io.InputStream}.
50   * </p>
51   */
52  public class HtmlDocument {
53      private Element   rawDoc;
54      private String   luceneTagName = null;
55      private String   luceneClassValue = null;
56  
57      /**
58       * Constructs an <code>HtmlDocument</code> from a {@link java.io.File}.
59       *
60       * @param file the <code>File</code> containing the HTML to parse
61       * @exception IOException if an I/O exception occurs
62       */
63      public HtmlDocument(File   file) throws IOException   {
64          Tidy tidy = new Tidy();
65          tidy.setQuiet(true);
66          tidy.setShowWarnings(false);
67  
68          org.w3c.dom.Document   root = tidy.parseDOM(new FileInputStream  (file), null);
69          rawDoc = root.getDocumentElement();
70      }
71  
72      /**
73       * Constructs an <code>HtmlDocument</code> from an {@link java.io.InputStream}.
74       *
75       * @param is the <code>InputStream</code> containing the HTML
76       * @exception IOException if I/O exception occurs
77       */
78      public HtmlDocument(InputStream   is) throws IOException   {
79          Tidy tidy = new Tidy();
80          tidy.setQuiet(true);
81          tidy.setShowWarnings(false);
82  
83          org.w3c.dom.Document   root = tidy.parseDOM(is, null);
84          rawDoc = root.getDocumentElement();
85      }
86  
87      /**
88       * Creates a Lucene <code>Document</code> from an {@link java.io.InputStream}.
89       *
90       * @param is
91       * @return org.apache.lucene.document.Document
92       * @exception IOException
93       */
94      public static org.apache.lucene.document.Document getDocument(InputStream   is)
95          throws IOException   {
96          HtmlDocument htmlDoc = new HtmlDocument(is);
97          org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
98  
99          luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
100         luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
101 
102         return luceneDoc;
103     }
104 
105     /**
106      * Creates a Lucene <code>Document</code> from a {@link java.io.File}.
107      *
108      * @param file
109      * @return org.apache.lucene.document.Document
110      * @exception IOException
111      */
112     public static org.apache.lucene.document.Document Document(File   file)
113         throws IOException   {
114         HtmlDocument htmlDoc = new HtmlDocument(file);
115         org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
116 
117         luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
118         luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
119 
120         String   contents = null;
121         BufferedReader   br = new BufferedReader  (new FileReader  (file));
122         StringWriter   sw = new StringWriter  ();
123         String   line = br.readLine();
124 
125         while (line != null) {
126             sw.write(line);
127             line = br.readLine();
128         }
129 
130         br.close();
131         contents = sw.toString();
132         sw.close();
133 
134         luceneDoc.add(Field.UnIndexed("rawcontents", contents));
135 
136         return luceneDoc;
137     }
138 
139     /**
140      * Gets the title attribute of the <code>HtmlDocument</code> object.
141      *
142      * @return the title value
143      */
144     public String   getTitle() {
145         if (rawDoc == null) {
146             return null;
147         }
148 
149         String   title = "";
150 
151         NodeList   nl = rawDoc.getElementsByTagName("title");
152 
153         if (nl.getLength() > 0) {
154             Element   titleElement = ((Element  ) nl.item(0));
155             Text   text = (Text  ) titleElement.getFirstChild();
156 
157             if (text != null) {
158                 title = text.getData();
159             }
160         }
161 
162         return title;
163     }
164 
165     /**
166      * Gets the body text attribute of the <code>HtmlDocument</code> object.
167      *
168      * @return the body text value
169      */
170     public String   getBody() {
171         if (rawDoc == null) {
172             return null;
173         }
174 
175         // NOTE: JTidy will insert a meta tag: <meta name="generator" content="HTML Tidy, see www.w3.org" />
176         //       This means that getLength is always greater than 0
177         NodeList   metaNL = rawDoc.getElementsByTagName("meta");
178 
179         for (int i = 0; i < metaNL.getLength(); i++) {
180             Element   metaElement = (Element  ) metaNL.item(i);
181             Attr   nameAttr = metaElement.getAttributeNode("name");
182             Attr   valueAttr = metaElement.getAttributeNode("value");
183 
184             if ((nameAttr != null) && (valueAttr != null)) {
185                 if (nameAttr.getValue().equals("lucene-tag-name")) {
186                     luceneTagName = valueAttr.getValue();
187                 }
188 
189                 if (nameAttr.getValue().equals("lucene-class-value")) {
190                     luceneClassValue = valueAttr.getValue();
191                 }
192             }
193         }
194 
195         boolean indexByLucene = true;
196 
197         if ((luceneTagName != null) && (luceneClassValue != null)) {
198             indexByLucene = false;
199         }
200 
201         System.out.println("HtmlDocument.getBody(): Index By Lucene (Default): " + indexByLucene);
202 
203         String   body = "";
204         NodeList   nl = rawDoc.getElementsByTagName("body");
205 
206         if (nl.getLength() > 0) {
207             body = getBodyText(nl.item(0), indexByLucene);
208         }
209 
210         return body;
211     }
212 
213     /**
214      * Gets the bodyText attribute of the <code>HtmlDocument</code> object.
215      *
216      * @param node a DOM Node
217      * @param indexByLucene DOCUMENT ME!
218      * @return The bodyText value
219      */
220     private String   getBodyText(Node   node, boolean indexByLucene) {
221         NodeList   nl = node.getChildNodes();
222         StringBuffer   buffer = new StringBuffer  ();
223 
224         for (int i = 0; i < nl.getLength(); i++) {
225             boolean index = indexByLucene;
226             Node   child = nl.item(i);
227 
228             switch (child.getNodeType()) {
229             case Node.ELEMENT_NODE:
230 
231                 if ((luceneTagName != null) && (luceneClassValue != null)) {
232                     if (child.getNodeName().equals(luceneTagName)) {
233                         Attr   attribute = ((Element  ) child).getAttributeNode("class");
234 
235                         if (attribute != null) {
236                             if (attribute.getValue().equals(luceneClassValue)) {
237                                 System.out.println("HtmlDocument.getBodyText(): <" + luceneTagName +
238                                     " class=\"" + luceneClassValue + "\"> found!");
239                                 index = true;
240                             }
241 
242                         }
243                     }
244                 }
245 
246                 buffer.append(getBodyText(child, index));
247 
248                 if (index) {
249                     buffer.append(" ");
250                 }
251 
252                 break;
253 
254             case Node.TEXT_NODE:
255 
256                 if (indexByLucene) {
257                     buffer.append(((Text  ) child).getData());
258                 }
259 
260                 break;
261             }
262         }
263 
264         return buffer.toString();
265     }
266 }
267
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags