1 2 3 4 package net.nutch.parse.html; 5 6 import java.net.URL ; 7 8 import org.w3c.dom.*; 9 import org.w3c.dom.html.*; 10 import org.apache.html.dom.*; 11 12 18 public class RobotsMetaProcessor { 19 20 24 public static class RobotsMetaIndicator { 25 private boolean noIndex= false; 26 private boolean noFollow= false; 27 private boolean noCache= false; 28 private URL baseHref= null; 29 30 34 public void reset() { 35 noIndex= false; 36 noFollow= false; 37 noCache= false; 38 baseHref= null; 39 } 40 41 44 public void setNoFollow() { 45 noFollow= true; 46 } 47 48 51 public void setNoIndex() { 52 noIndex= true; 53 } 54 55 58 public void setNoCache() { 59 noCache= true; 60 } 61 62 65 public void setBaseHref(URL baseHref) { 66 this.baseHref= baseHref; 67 } 68 69 72 public boolean getNoIndex() { 73 return noIndex; 74 } 75 76 79 public boolean getNoFollow() { 80 return noFollow; 81 } 82 83 86 public boolean getNoCache() { 87 return noCache; 88 } 89 90 94 public URL getBaseHref() { 95 return baseHref; 96 } 97 98 } 99 100 105 public static final void getRobotsMetaDirectives( 106 RobotsMetaIndicator robotsMeta, Node node, URL currURL) { 107 108 robotsMeta.reset(); 109 getRobotsMetaDirectivesHelper(robotsMeta, node, currURL); 110 } 111 112 private static final void getRobotsMetaDirectivesHelper( 113 RobotsMetaIndicator robotsMeta, Node node, URL currURL) { 114 115 if (node.getNodeType() == Node.ELEMENT_NODE) { 116 117 if ("BODY".equals(node.getNodeName())) { 118 return; 120 } 121 122 if ("META".equals(node.getNodeName())) { 123 NamedNodeMap attrs= node.getAttributes(); 124 Node nameNode= attrs.getNamedItem("name"); 125 126 if (nameNode != null) { 127 if ("robots".equalsIgnoreCase(nameNode.getNodeValue())) { 128 Node contentNode= attrs.getNamedItem("content"); 129 130 if (contentNode != null) { 131 String directives= 132 contentNode.getNodeValue().toLowerCase(); 133 int index= directives.indexOf("none"); 134 135 if (index >= 0) { 136 robotsMeta.setNoIndex(); 137 robotsMeta.setNoFollow(); 138 } 139 140 index= directives.indexOf("all"); 141 if (index >= 0) { 142 } 144 145 index= directives.indexOf("noindex"); 146 if (index >= 0) { 147 robotsMeta.setNoIndex(); 148 } 149 150 index= directives.indexOf("nofollow"); 151 if (index >= 0) { 152 robotsMeta.setNoFollow(); 153 } 154 } 155 156 } } 159 Node HTTPEquivNode= attrs.getNamedItem("http-equiv"); 160 161 if ( (HTTPEquivNode != null) 162 && ("Pragma".equalsIgnoreCase(HTTPEquivNode.getNodeValue())) ) { 163 Node contentNode= attrs.getNamedItem("content"); 164 165 if (contentNode != null) { 166 String content= contentNode.getNodeValue().toLowerCase(); 167 int index= content.indexOf("no-cache"); 168 if (index >= 0) 169 robotsMeta.setNoCache(); 170 } 171 172 } 173 174 } else if ("BASE".equalsIgnoreCase(node.getNodeName())) { 175 NamedNodeMap attrs= node.getAttributes(); 176 Node hrefNode= attrs.getNamedItem("href"); 177 178 if (hrefNode != null) { 179 String urlString= hrefNode.getNodeValue(); 180 181 URL url= null; 182 try { 183 if (currURL == null) 184 url= new URL (urlString); 185 else 186 url= new URL (currURL, urlString); 187 } catch (Exception e) { 188 ; 189 } 190 191 if (url != null) 192 robotsMeta.setBaseHref(url); 193 } 194 195 } 196 197 } 198 199 NodeList children = node.getChildNodes(); 200 if ( children != null ) { 201 int len = children.getLength(); 202 for ( int i = 0; i < len; i++ ) { 203 getRobotsMetaDirectivesHelper(robotsMeta, children.item(i), currURL); 204 } 205 } 206 } 207 208 } 209 | Popular Tags |