package net.nutch.parse.html;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Locale;
import java.util.regex.Pattern;

import net.nutch.parse.Outlink;

import org.w3c.dom.*;

/**
 * Static helpers that walk a DOM tree to pull out plain text, the document
 * title, the {@code <base href>} URL and the outgoing links.
 */
public class DOMContentUtils {

  /**
   * Describes one link-bearing element: its name, the attribute that holds
   * the link target, and the number of children such an element is expected
   * to have (0 means "normally empty").
   */
  public static class LinkParams {
    public String elName;
    public String attrName;
    public int childLen;

    public LinkParams(String elName, String attrName, int childLen) {
      this.elName = elName;
      this.attrName = attrName;
      this.childLen = childLen;
    }

    public String toString() {
      return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
    }
  }

  /** Maps a lower-case element name to its {@link LinkParams}. */
  public static HashMap linkParams = new HashMap();

  static {
    linkParams.put("a", new LinkParams("a", "href", 1));
    linkParams.put("area", new LinkParams("area", "href", 0));
    linkParams.put("frame", new LinkParams("frame", "src", 0));
    linkParams.put("iframe", new LinkParams("iframe", "src", 0));
  }

  /** Matches one or more whitespace characters; compiled once and reused. */
  private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

  /**
   * Appends the text found under <code>node</code> to <code>sb</code>.
   * Content of <code>script</code> and <code>style</code> elements and of
   * comment nodes is skipped. Runs of whitespace inside a text node are
   * collapsed to a single space, and text pieces are separated by one space.
   *
   * @param sb buffer that receives the text
   * @param node root of the subtree to read
   * @param abortOnNestedAnchors if true, stop as soon as a second level of
   *        <code>a</code> element is entered
   * @return true if the walk was stopped because of a nested anchor; text
   *         gathered before that point stays in <code>sb</code>
   */
  public static final boolean getText(StringBuffer sb, Node node,
                                      boolean abortOnNestedAnchors) {
    return getTextHelper(sb, node, abortOnNestedAnchors, 0);
  }

  /**
   * Same as {@link #getText(StringBuffer, Node, boolean)} with
   * <code>abortOnNestedAnchors</code> set to false.
   */
  public static final void getText(StringBuffer sb, Node node) {
    getText(sb, node, false);
  }

  /**
   * Recursive worker for {@link #getText(StringBuffer, Node, boolean)}.
   *
   * @param anchorDepth number of <code>a</code> elements already entered on
   *        the path from the starting node (only counted when
   *        <code>abortOnNestedAnchors</code> is true)
   * @return true if a nested anchor was hit and the walk must stop
   */
  private static final boolean getTextHelper(StringBuffer sb, Node node,
                                             boolean abortOnNestedAnchors,
                                             int anchorDepth) {
    String name = node.getNodeName();
    if ("script".equalsIgnoreCase(name) || "style".equalsIgnoreCase(name)) {
      return false;
    }
    if (abortOnNestedAnchors && "a".equalsIgnoreCase(name)) {
      anchorDepth++;
      if (anchorDepth > 1) {
        return true;
      }
    }
    short type = node.getNodeType();
    if (type == Node.COMMENT_NODE) {
      return false;
    }
    if (type == Node.TEXT_NODE) {
      // Collapse whitespace runs, then drop leading/trailing whitespace.
      String text =
        WHITESPACE_RUN.matcher(node.getNodeValue()).replaceAll(" ").trim();
      if (text.length() > 0) {
        if (sb.length() > 0) {
          sb.append(' ');
        }
        sb.append(text);
      }
    }
    NodeList children = node.getChildNodes();
    if (children != null) {
      int len = children.getLength();
      for (int i = 0; i < len; i++) {
        if (getTextHelper(sb, children.item(i),
                          abortOnNestedAnchors, anchorDepth)) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Searches the tree depth-first for the first <code>title</code> element
   * and appends its text to <code>sb</code>.
   *
   * @return true if a <code>title</code> element was found
   */
  public static final boolean getTitle(StringBuffer sb, Node node) {
    if (node.getNodeType() == Node.ELEMENT_NODE
        && "title".equalsIgnoreCase(node.getNodeName())) {
      getText(sb, node);
      return true;
    }
    NodeList children = node.getChildNodes();
    if (children != null) {
      int len = children.getLength();
      for (int i = 0; i < len; i++) {
        if (getTitle(sb, children.item(i))) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Searches the tree depth-first for a <code>base</code> element whose
   * <code>href</code> attribute parses as a URL.
   *
   * @return the first such URL, or null if none is found
   */
  public static final URL getBase(Node node) {
    if (node.getNodeType() == Node.ELEMENT_NODE
        && "base".equalsIgnoreCase(node.getNodeName())) {
      NamedNodeMap attrs = node.getAttributes();
      for (int i = 0; i < attrs.getLength(); i++) {
        Node attr = attrs.item(i);
        if ("href".equalsIgnoreCase(attr.getNodeName())) {
          try {
            return new URL(attr.getNodeValue());
          } catch (MalformedURLException e) {
            // Unparseable href: ignore it and keep searching.
          }
        }
      }
    }

    NodeList children = node.getChildNodes();
    if (children != null) {
      int len = children.getLength();
      for (int i = 0; i < len; i++) {
        URL base = getBase(children.item(i));
        if (base != null) {
          return base;
        }
      }
    }
    return null;
  }

  /** Returns true if every character of the node's value is whitespace. */
  private static boolean hasOnlyWhiteSpace(Node node) {
    String val = node.getNodeValue();
    for (int i = 0; i < val.length(); i++) {
      if (!Character.isWhitespace(val.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /** Returns true if <code>n</code> is an element named like the link element. */
  private static boolean isSameElement(Node n, LinkParams params) {
    return n.getNodeType() == Node.ELEMENT_NODE
      && params.elName.equalsIgnoreCase(n.getNodeName());
  }

  /** Returns true if <code>n</code> is a text node holding only whitespace. */
  private static boolean isBlankText(Node n) {
    return n.getNodeType() == Node.TEXT_NODE && hasOnlyWhiteSpace(n);
  }

  /**
   * Decides whether a link element should be ignored. A link is thrown away
   * when:
   * <ul>
   * <li>it has no children although its type is expected to have some;</li>
   * <li>its only child is an element of the same name;</li>
   * <li>it has two children: an element of the same name and a
   *     whitespace-only text node, in either order;</li>
   * <li>it has three children: an element of the same name surrounded by
   *     two whitespace-only text nodes.</li>
   * </ul>
   */
  private static boolean shouldThrowAwayLink(Node node, NodeList children,
                                             int childLen, LinkParams params) {
    switch (childLen) {
    case 0:
      // Empty is fine only for elements that are normally empty.
      return params.childLen != 0;
    case 1:
      return isSameElement(children.item(0), params);
    case 2: {
      Node c0 = children.item(0);
      Node c1 = children.item(1);
      return (isSameElement(c0, params) && isBlankText(c1))
        || (isSameElement(c1, params) && isBlankText(c0));
    }
    case 3: {
      Node c0 = children.item(0);
      Node c1 = children.item(1);
      Node c2 = children.item(2);
      return isSameElement(c1, params)
        && isBlankText(c0)
        && isBlankText(c2);
    }
    default:
      return false;
    }
  }

  /**
   * Walks the tree under <code>node</code> and adds an {@link Outlink} to
   * <code>outlinks</code> for every element listed in {@link #linkParams}
   * that carries its link attribute and is not rejected by
   * <code>shouldThrowAwayLink</code>. The link target is resolved against
   * <code>base</code>; the anchor text is the element's text, cut short at
   * the first nested anchor. Targets that do not form a valid URL are
   * skipped. Children of link elements that are expected to be empty are not
   * visited.
   *
   * @param base URL used to resolve relative targets
   * @param outlinks list that receives the {@link Outlink} objects
   * @param node root of the subtree to scan
   */
  public static final void getOutlinks(URL base, ArrayList outlinks,
                                       Node node) {
    NodeList children = node.getChildNodes();
    int childLen = (children != null) ? children.getLength() : 0;

    if (node.getNodeType() == Node.ELEMENT_NODE) {
      // Fixed locale: the default one may map 'I' to a dotless 'i' (e.g.
      // Turkish), which would make the "iframe" lookup fail.
      String key = node.getNodeName().toLowerCase(Locale.ENGLISH);
      LinkParams params = (LinkParams) linkParams.get(key);
      if (params != null) {
        if (!shouldThrowAwayLink(node, children, childLen, params)) {
          StringBuffer linkText = new StringBuffer();
          getText(linkText, node, true);

          NamedNodeMap attrs = node.getAttributes();
          String target = null;
          for (int i = 0; i < attrs.getLength(); i++) {
            Node attr = attrs.item(i);
            if (params.attrName.equalsIgnoreCase(attr.getNodeName())) {
              target = attr.getNodeValue();
              break;
            }
          }
          if (target != null) {
            try {
              URL url = new URL(base, target);
              outlinks.add(new Outlink(url.toString(),
                                       linkText.toString().trim()));
            } catch (MalformedURLException e) {
              // Not a usable URL: skip this link.
            }
          }
        }
        // Elements that are normally empty have nothing worth descending into.
        if (params.childLen == 0) {
          return;
        }
      }
    }
    for (int i = 0; i < childLen; i++) {
      getOutlinks(base, outlinks, children.item(i));
    }
  }

}