1 package net.matuschek.html; 2 3 6 7 import java.net.MalformedURLException ; 8 import java.net.URL ; 9 import java.util.Vector ; 10 import java.util.StringTokenizer ; 11 import java.io.*; 12 13 import org.w3c.dom.Document ; 14 import org.w3c.dom.Element ; 15 import org.w3c.dom.NodeList ; 16 import org.w3c.tidy.Tidy; 17 18 import org.apache.log4j.Category; 19 20 import net.matuschek.util.AttribValuePair; 21 22 31 public class HtmlDocument 32 { 33 34 35 private URL url = null; 36 37 38 private byte[] content = null; 39 40 41 private Document domDoc = null; 42 43 44 private Category log; 45 46 47 private String encoding; 48 49 50 private URL baseURL=null; 51 52 53 Vector <URL > links; 54 55 56 59 private HtmlDocument(URL url) { 60 log = Category.getInstance(getClass().getName()); 61 this.url = url; 62 } 63 64 65 71 public HtmlDocument(URL url, byte[] content) { 72 this(url); 73 this.content = content; 74 parse(); 75 } 76 77 84 public HtmlDocument(URL url, byte[] content, String newEncoding) { 85 this(url); 86 this.content = content; 87 encoding = newEncoding; 88 parse(); 89 } 90 91 92 96 public HtmlDocument(URL url, String contentStr) { 97 this(url); 98 this.content = new byte[contentStr.length()+1]; 99 for (int i=0; i<contentStr.length(); i++) { 100 this.content[i] = (byte)contentStr.charAt(i); 101 } 102 parse(); 103 } 104 105 106 107 112 private void parse() { 113 if (domDoc == null) { 114 parseToDOM(); 115 } 116 this.links = new Vector <URL >(); 117 extractLinks(domDoc.getDocumentElement(),links); 118 } 119 120 public Vector <URL > getLinks() { 121 return this.links; 122 } 123 124 125 130 public Vector getImageLinks() { 131 if (domDoc == null) { 132 parseToDOM(); 133 } 134 Vector <URL > links = new Vector <URL >(); 135 extractImageLinks(domDoc.getDocumentElement(),links); 136 137 return links; 138 } 139 140 141 147 public Vector getElements(String type) { 148 if (domDoc == null) { 149 parseToDOM(); 150 } 151 152 Vector <Element >links = new Vector <Element >(); 153 extractElements(domDoc.getDocumentElement(),type,links); 154 155 return links; 156 } 157 158 159 166 protected void extractLinks(Element element, Vector <URL >links) { 167 168 if (element==null) { 170 log.error("got a null element"); 171 return; 172 } 173 174 String name = element.getNodeName().toLowerCase(); 175 176 if (name.equals("a")) { 177 178 addLink(element.getAttribute("href"),links); 180 181 } else if (name.equals("base")) { 182 183 try { 185 this.baseURL = new URL (element.getAttribute("href")); 186 log.info("baseUR="+baseURL); 187 } catch (MalformedURLException e) { } 188 189 } else if (name.equals("frame")) { 190 191 addLink(element.getAttribute("src"),links); 193 194 } else if (name.equals("iframe")) { 196 197 addLink(element.getAttribute("src"),links); 199 200 } else if (name.equals("image")) { 201 202 addLink(element.getAttribute("src"),links); 204 205 } else if (name.equals("img")) { 206 207 addLink(element.getAttribute("src"),links); 209 210 } else if (name.equals("area")) { 211 212 addLink(element.getAttribute("href"),links); 214 215 } else if (name.equals("meta")) { 216 217 String equiv=element.getAttribute("http-equiv"); 219 if ((equiv != null) && (equiv.equalsIgnoreCase("refresh"))) { 220 String refreshcontent=element.getAttribute("content"); 221 if (refreshcontent == null) { refreshcontent=""; } 222 223 StringTokenizer st=new StringTokenizer (refreshcontent,";"); 224 while (st.hasMoreTokens()) { 225 String token=st.nextToken().trim(); 226 AttribValuePair av = new AttribValuePair(token); 227 if (av.getAttrib().equals("url")) { 228 addLink(av.getValue(),links); 229 } 230 } 231 } 232 233 } else if (name.equals("body")) { 234 String background = element.getAttribute("background"); 236 if ( ! ( background == null) || 237 ( background.equals("") ) ) { 238 addLink(background,links); 239 } 240 241 } else { 242 log.info("Ignore tag name: "+name); 243 } 244 245 246 NodeList childs = element.getChildNodes(); 248 249 for (int i=0; i<childs.getLength(); i++) { 250 if (childs.item(i) instanceof Element ) { 251 extractLinks((Element )childs.item(i),links); 252 } 253 } 254 255 } 256 257 258 265 protected void extractImageLinks(Element element, Vector <URL > links) { 266 267 if (element==null) { 269 log.error("got a null element"); 270 return; 271 } 272 273 String name = element.getNodeName(); 274 275 if (name.equals("img")) { 276 addLink(element.getAttribute("src"),links); 278 } 279 280 if (name.equals("image")) { 281 addLink(element.getAttribute("src"),links); 283 } 284 285 NodeList childs = element.getChildNodes(); 287 288 for (int i=0; i<childs.getLength(); i++) { 289 if (childs.item(i) instanceof Element ) { 290 extractImageLinks((Element )childs.item(i),links); 291 } 292 } 293 294 } 295 296 297 305 protected void extractElements(Element element, 306 String type, 307 Vector <Element >elementList) { 308 309 if (element==null) { 311 log.error("got a null element"); 312 return; 313 } 314 315 String name = element.getNodeName(); 316 317 if (name.equals(type)) { 318 elementList.add(element); 319 } 320 321 322 NodeList childs = element.getChildNodes(); 324 325 for (int i=0; i<childs.getLength(); i++) { 326 if (childs.item(i) instanceof Element ) { 327 extractElements((Element )childs.item(i),type,elementList); 328 } 329 } 330 331 } 332 333 334 337 private void parseToDOM() { 338 ByteArrayInputStream is = new ByteArrayInputStream(content); 339 340 Tidy tidy = new Tidy(); 342 tidy.setUpperCaseTags(false); 343 tidy.setUpperCaseAttrs(false); 344 tidy.setErrout(new PrintWriter(System.err)); 345 346 domDoc = tidy.parseDOM(is,null); 347 } 348 349 350 353 private void addLink(String newURL, Vector <URL > links) { 354 355 if ((newURL == null) || (newURL.equals(""))) return; 358 int pos = newURL.indexOf("#"); 359 if (pos >=0 ) { 360 newURL = newURL.substring(0,pos); 361 } 362 363 if (encoding != null) { 364 try { 365 newURL = new String (newURL.getBytes(), encoding); 366 } catch (UnsupportedEncodingException e) { 367 } 368 } else { 369 try { 370 newURL = new String (newURL.getBytes(), "ISO-8859-1"); 371 } catch (UnsupportedEncodingException e) { 372 } 373 } 374 375 try { 376 URL u = null; 377 if (this.baseURL != null) { 378 u = new URL (this.baseURL,newURL); 379 } else { 380 u = new URL (url,newURL); 381 } 382 links.add(u); 383 } catch (Exception e) { 384 log.debug("error during link extraction: "+e.getMessage()+" "+newURL); 385 } 386 } 387 388 389 public URL getBaseURL() { 390 return baseURL; 391 } 392 393 394 395 } 396 | Popular Tags |