KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lenya > lucene > html > HtmlDocument


/*
 * Copyright 1999-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

17
18 /* $Id: HtmlDocument.java 42598 2004-03-01 16:18:28Z gregor $ */
19
20 package org.apache.lenya.lucene.html;
21
22
// The two Document classes (org.apache.lucene.document.Document and
// org.w3c.dom.Document) are not imported since there is a name clash;
// fully qualified class names are used for them in the code.
26
import java.io.BufferedReader JavaDoc;
27 import java.io.File JavaDoc;
28 import java.io.FileInputStream JavaDoc;
29 import java.io.FileReader JavaDoc;
30 import java.io.IOException JavaDoc;
31 import java.io.InputStream JavaDoc;
32 import java.io.StringWriter JavaDoc;
33
34 import org.apache.lucene.document.Field;
35 import org.w3c.dom.Attr JavaDoc;
36 import org.w3c.dom.Element JavaDoc;
37 import org.w3c.dom.Node JavaDoc;
38 import org.w3c.dom.NodeList JavaDoc;
39 import org.w3c.dom.Text JavaDoc;
40 import org.w3c.tidy.Tidy;
41
42
43 /**
44  * The <code>HtmlDocument</code> class creates a Lucene {@link org.apache.lucene.document.Document}
45  * from an HTML document.
46  *
47  * <P>
48  * It does this by using JTidy package. It can take input input from {@link java.io.File} or {@link
49  * java.io.InputStream}.
50  * </p>
51  */

52 public class HtmlDocument {
53     private Element JavaDoc rawDoc;
54     private String JavaDoc luceneTagName = null;
55     private String JavaDoc luceneClassValue = null;
56
57     /**
58      * Constructs an <code>HtmlDocument</code> from a {@link java.io.File}.
59      *
60      * @param file the <code>File</code> containing the HTML to parse
61      * @exception IOException if an I/O exception occurs
62      */

63     public HtmlDocument(File JavaDoc file) throws IOException JavaDoc {
64         Tidy tidy = new Tidy();
65         tidy.setQuiet(true);
66         tidy.setShowWarnings(false);
67
68         org.w3c.dom.Document JavaDoc root = tidy.parseDOM(new FileInputStream JavaDoc(file), null);
69         rawDoc = root.getDocumentElement();
70     }
71
72     /**
73      * Constructs an <code>HtmlDocument</code> from an {@link java.io.InputStream}.
74      *
75      * @param is the <code>InputStream</code> containing the HTML
76      * @exception IOException if I/O exception occurs
77      */

78     public HtmlDocument(InputStream JavaDoc is) throws IOException JavaDoc {
79         Tidy tidy = new Tidy();
80         tidy.setQuiet(true);
81         tidy.setShowWarnings(false);
82
83         org.w3c.dom.Document JavaDoc root = tidy.parseDOM(is, null);
84         rawDoc = root.getDocumentElement();
85     }
86
87     /**
88      * Creates a Lucene <code>Document</code> from an {@link java.io.InputStream}.
89      *
90      * @param is
91      * @return org.apache.lucene.document.Document
92      * @exception IOException
93      */

94     public static org.apache.lucene.document.Document getDocument(InputStream JavaDoc is)
95         throws IOException JavaDoc {
96         HtmlDocument htmlDoc = new HtmlDocument(is);
97         org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
98
99         luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
100         luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
101
102         return luceneDoc;
103     }
104
105     /**
106      * Creates a Lucene <code>Document</code> from a {@link java.io.File}.
107      *
108      * @param file
109      * @return org.apache.lucene.document.Document
110      * @exception IOException
111      */

112     public static org.apache.lucene.document.Document Document(File JavaDoc file)
113         throws IOException JavaDoc {
114         HtmlDocument htmlDoc = new HtmlDocument(file);
115         org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
116
117         luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
118         luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
119
120         String JavaDoc contents = null;
121         BufferedReader JavaDoc br = new BufferedReader JavaDoc(new FileReader JavaDoc(file));
122         StringWriter JavaDoc sw = new StringWriter JavaDoc();
123         String JavaDoc line = br.readLine();
124
125         while (line != null) {
126             sw.write(line);
127             line = br.readLine();
128         }
129
130         br.close();
131         contents = sw.toString();
132         sw.close();
133
134         luceneDoc.add(Field.UnIndexed("rawcontents", contents));
135
136         return luceneDoc;
137     }
138
139     /**
140      * Gets the title attribute of the <code>HtmlDocument</code> object.
141      *
142      * @return the title value
143      */

144     public String JavaDoc getTitle() {
145         if (rawDoc == null) {
146             return null;
147         }
148
149         String JavaDoc title = "";
150
151         NodeList JavaDoc nl = rawDoc.getElementsByTagName("title");
152
153         if (nl.getLength() > 0) {
154             Element JavaDoc titleElement = ((Element JavaDoc) nl.item(0));
155             Text JavaDoc text = (Text JavaDoc) titleElement.getFirstChild();
156
157             if (text != null) {
158                 title = text.getData();
159             }
160         }
161
162         return title;
163     }
164
165     /**
166      * Gets the body text attribute of the <code>HtmlDocument</code> object.
167      *
168      * @return the body text value
169      */

170     public String JavaDoc getBody() {
171         if (rawDoc == null) {
172             return null;
173         }
174
175         // NOTE: JTidy will insert a meta tag: <meta name="generator" content="HTML Tidy, see www.w3.org" />
176
// This means that getLength is always greater than 0
177
NodeList JavaDoc metaNL = rawDoc.getElementsByTagName("meta");
178
179         for (int i = 0; i < metaNL.getLength(); i++) {
180             Element JavaDoc metaElement = (Element JavaDoc) metaNL.item(i);
181             Attr JavaDoc nameAttr = metaElement.getAttributeNode("name");
182             Attr JavaDoc valueAttr = metaElement.getAttributeNode("value");
183
184             if ((nameAttr != null) && (valueAttr != null)) {
185                 if (nameAttr.getValue().equals("lucene-tag-name")) {
186                     luceneTagName = valueAttr.getValue();
187                 }
188
189                 if (nameAttr.getValue().equals("lucene-class-value")) {
190                     luceneClassValue = valueAttr.getValue();
191                 }
192             }
193         }
194
195         boolean indexByLucene = true;
196
197         if ((luceneTagName != null) && (luceneClassValue != null)) {
198             indexByLucene = false;
199         }
200
201         System.out.println("HtmlDocument.getBody(): Index By Lucene (Default): " + indexByLucene);
202
203         String JavaDoc body = "";
204         NodeList JavaDoc nl = rawDoc.getElementsByTagName("body");
205
206         if (nl.getLength() > 0) {
207             body = getBodyText(nl.item(0), indexByLucene);
208         }
209
210         return body;
211     }
212
213     /**
214      * Gets the bodyText attribute of the <code>HtmlDocument</code> object.
215      *
216      * @param node a DOM Node
217      * @param indexByLucene DOCUMENT ME!
218      * @return The bodyText value
219      */

220     private String JavaDoc getBodyText(Node JavaDoc node, boolean indexByLucene) {
221         NodeList JavaDoc nl = node.getChildNodes();
222         StringBuffer JavaDoc buffer = new StringBuffer JavaDoc();
223
224         for (int i = 0; i < nl.getLength(); i++) {
225             boolean index = indexByLucene;
226             Node JavaDoc child = nl.item(i);
227
228             switch (child.getNodeType()) {
229             case Node.ELEMENT_NODE:
230
231                 if ((luceneTagName != null) && (luceneClassValue != null)) {
232                     if (child.getNodeName().equals(luceneTagName)) {
233                         Attr JavaDoc attribute = ((Element JavaDoc) child).getAttributeNode("class");
234
235                         if (attribute != null) {
236                             if (attribute.getValue().equals(luceneClassValue)) {
237                                 System.out.println("HtmlDocument.getBodyText(): <" + luceneTagName +
238                                     " class=\"" + luceneClassValue + "\"> found!");
239                                 index = true;
240                             }
241
242                         }
243                     }
244                 }
245
246                 buffer.append(getBodyText(child, index));
247
248                 if (index) {
249                     buffer.append(" ");
250                 }
251
252                 break;
253
254             case Node.TEXT_NODE:
255
256                 if (indexByLucene) {
257                     buffer.append(((Text JavaDoc) child).getData());
258                 }
259
260                 break;
261             }
262         }
263
264         return buffer.toString();
265     }
266 }
267
Popular Tags