package org.apache.lenya.lucene.html;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;

import org.apache.lucene.document.Field;
import org.w3c.dom.Attr;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.w3c.tidy.Tidy;

/**
 * Parses an HTML document with JTidy and exposes its title and body text,
 * optionally packaged as a Lucene document.
 *
 * <p>Body extraction can be restricted to selected elements: if the HTML
 * contains <code>&lt;meta name="lucene-tag-name" value="..."&gt;</code> and
 * <code>&lt;meta name="lucene-class-value" value="..."&gt;</code>, only text
 * below elements with that tag name and that exact <code>class</code>
 * attribute value is returned by {@link #getBody()}.</p>
 */
public class HtmlDocument {
    /** Root element of the parsed DOM; the accessors treat null as "no document". */
    private Element rawDoc;

    /** Tag name read from the "lucene-tag-name" meta element, set by {@link #getBody()}. */
    private String luceneTagName = null;

    /** Class value read from the "lucene-class-value" meta element, set by {@link #getBody()}. */
    private String luceneClassValue = null;

    /**
     * Parses the given HTML file.
     *
     * @param file the HTML file to parse
     * @throws IOException if the file cannot be opened or closed
     */
    public HtmlDocument(File file) throws IOException {
        InputStream in = new FileInputStream(file);

        try {
            rawDoc = parse(in);
        } finally {
            // This class opened the stream, so it is responsible for releasing it.
            in.close();
        }
    }

    /**
     * Parses HTML from the given stream. The stream is not closed here;
     * it remains owned by the caller.
     *
     * @param is the stream to read the HTML from
     * @throws IOException declared for callers; kept for interface compatibility
     */
    public HtmlDocument(InputStream is) throws IOException {
        rawDoc = parse(is);
    }

    /**
     * Runs JTidy (quiet, warnings suppressed) over the stream and returns the
     * document element of the resulting DOM.
     */
    private static Element parse(InputStream is) {
        Tidy tidy = new Tidy();
        tidy.setQuiet(true);
        tidy.setShowWarnings(false);

        org.w3c.dom.Document root = tidy.parseDOM(is, null);

        return root.getDocumentElement();
    }

    /**
     * Builds a Lucene document with the text fields "title" and "contents"
     * from the HTML read from the given stream.
     *
     * @param is the stream to read the HTML from
     * @return the Lucene document
     * @throws IOException if the document cannot be constructed
     */
    public static org.apache.lucene.document.Document getDocument(InputStream is)
        throws IOException {
        HtmlDocument htmlDoc = new HtmlDocument(is);
        org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();

        luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
        luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));

        return luceneDoc;
    }

    /**
     * Builds a Lucene document with the text fields "title" and "contents"
     * plus an unindexed "rawcontents" field holding the file's text.
     *
     * <p>The method name starts with an upper-case letter; it is kept as is
     * so that existing callers continue to compile.</p>
     *
     * @param file the HTML file to read
     * @return the Lucene document
     * @throws IOException if the file cannot be read
     */
    public static org.apache.lucene.document.Document Document(File file)
        throws IOException {
        HtmlDocument htmlDoc = new HtmlDocument(file);
        org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();

        luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
        luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
        luceneDoc.add(Field.UnIndexed("rawcontents", readRawContents(file)));

        return luceneDoc;
    }

    /**
     * Reads the file line by line and concatenates the lines. Line terminators
     * are dropped (no separator is inserted between lines), matching the
     * long-standing content of the "rawcontents" field.
     */
    private static String readRawContents(File file) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(file));

        try {
            StringWriter writer = new StringWriter();

            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                writer.write(line);
            }

            return writer.toString();
        } finally {
            // Release the file handle even if reading fails part-way through.
            reader.close();
        }
    }

    /**
     * Returns the text of the first <code>title</code> element.
     *
     * @return the title text; the empty string if there is no title element
     *         or its first child is not a text node; null if no document
     *         element was parsed
     */
    public String getTitle() {
        if (rawDoc == null) {
            return null;
        }

        String title = "";

        NodeList nl = rawDoc.getElementsByTagName("title");

        if (nl.getLength() > 0) {
            Node firstChild = nl.item(0).getFirstChild();

            // Guard the cast: the first child may be absent or not a text node.
            if (firstChild instanceof Text) {
                title = ((Text) firstChild).getData();
            }
        }

        return title;
    }

    /**
     * Returns the text of the first <code>body</code> element.
     *
     * <p>All <code>meta</code> elements are scanned first. When both
     * "lucene-tag-name" and "lucene-class-value" are present, text is only
     * collected below elements matching that tag name and class value;
     * otherwise all body text is collected. The values found are stored in
     * this instance's fields.</p>
     *
     * @return the collected body text; the empty string if there is no body
     *         element; null if no document element was parsed
     */
    public String getBody() {
        if (rawDoc == null) {
            return null;
        }

        NodeList metaNL = rawDoc.getElementsByTagName("meta");

        for (int i = 0; i < metaNL.getLength(); i++) {
            Element metaElement = (Element) metaNL.item(i);
            Attr nameAttr = metaElement.getAttributeNode("name");
            Attr valueAttr = metaElement.getAttributeNode("value");

            if ((nameAttr != null) && (valueAttr != null)) {
                if (nameAttr.getValue().equals("lucene-tag-name")) {
                    luceneTagName = valueAttr.getValue();
                }

                if (nameAttr.getValue().equals("lucene-class-value")) {
                    luceneClassValue = valueAttr.getValue();
                }
            }
        }

        // Index everything by default; once a tag/class filter is configured,
        // indexing is off until a matching element is reached.
        boolean indexByLucene = true;

        if ((luceneTagName != null) && (luceneClassValue != null)) {
            indexByLucene = false;
        }

        System.out.println("HtmlDocument.getBody(): Index By Lucene (Default): " + indexByLucene);

        String body = "";
        NodeList nl = rawDoc.getElementsByTagName("body");

        if (nl.getLength() > 0) {
            body = getBodyText(nl.item(0), indexByLucene);
        }

        return body;
    }

    /**
     * Recursively collects the text below the given node.
     *
     * @param node the node whose children are visited
     * @param indexByLucene whether text nodes that are direct children of
     *        <code>node</code> are collected
     * @return the collected text, with a single space appended after each
     *         child element that was being indexed
     */
    private String getBodyText(Node node, boolean indexByLucene) {
        NodeList nl = node.getChildNodes();
        StringBuffer buffer = new StringBuffer();

        for (int i = 0; i < nl.getLength(); i++) {
            // Each child starts from the parent's flag; a matching element
            // switches indexing on for its own subtree only.
            boolean index = indexByLucene;
            Node child = nl.item(i);

            switch (child.getNodeType()) {
            case Node.ELEMENT_NODE:

                if ((luceneTagName != null) && (luceneClassValue != null)) {
                    if (child.getNodeName().equals(luceneTagName)) {
                        Attr attribute = ((Element) child).getAttributeNode("class");

                        if (attribute != null) {
                            if (attribute.getValue().equals(luceneClassValue)) {
                                System.out.println("HtmlDocument.getBodyText(): <" + luceneTagName +
                                    " class=\"" + luceneClassValue + "\"> found!");
                                index = true;
                            }
                        }
                    }
                }

                buffer.append(getBodyText(child, index));

                if (index) {
                    buffer.append(" ");
                }

                break;

            case Node.TEXT_NODE:

                if (indexByLucene) {
                    buffer.append(((Text) child).getData());
                }

                break;
            }
        }

        return buffer.toString();
    }
}