1 27 31 32 package net.matuschek.spider; 33 34 import java.io.*; 35 import java.net.*; 36 import java.util.*; 37 38 import org.apache.log4j.Category; 39 40 import net.matuschek.http.*; 41 42 58 public class NoRobots { 59 60 Category log = Category.getInstance(getClass().getName()); 61 62 private static final String robotFile = "/robots.txt"; 64 65 private String robotName; 67 68 private Hashtable servers = new net.matuschek.util.LruHashtable(500); 70 71 private HttpTool httpTool; 73 private boolean ignore = false; 74 75 80 public NoRobots(String robotName, HttpTool inhttpTool) { 81 this.robotName = robotName; 82 this.httpTool = inhttpTool; 83 92 } 93 94 102 public boolean ok(URL url) { 103 if (ignore) { 105 return true; 106 } 107 108 String protocol = url.getProtocol(); 109 String host = url.getHost(); 110 int port = url.getPort(); 111 if (port == -1) { 112 port = 80; 113 } 114 115 String file = url.getFile(); 116 117 Vector disallows = getDisallows(protocol, host, port); 118 Enumeration en = disallows.elements(); 119 while (en.hasMoreElements()) { 120 String pattern = (String ) en.nextElement(); 121 if (file.startsWith(pattern)) 122 return false; 123 } 124 return true; 125 } 126 127 136 private Vector getDisallows(String protocol, String host, int port) { 137 String key = protocol + "://" + host + ":" + port; 138 Vector disallows = (Vector) servers.get(key); 139 if (disallows != null) 140 return disallows; 141 142 disallows = new Vector(); 143 try { 144 URL robotUrl = new URL(protocol, host, port, robotFile); 145 try { 146 147 log.debug("Retrieving robot file '" + robotUrl + "'."); 149 httpTool.setReferer("-"); 150 String robotsFile = ""; 151 try { 152 HttpDoc doc = 153 httpTool.retrieveDocument( 154 robotUrl, 155 HttpConstants.GET, 156 ""); 157 if (doc != null && doc.isOk()) { 159 robotsFile = new String (doc.getContent()); 160 } 161 } catch (HttpException e) { 162 log.info("Cannot read robots.txt: " + e.getMessage()); 164 } 165 166 BufferedReader robotReader = 167 new BufferedReader(new StringReader(robotsFile)); 168 boolean userAgentIsMe = false; 169 while (true) { 170 String line = robotReader.readLine(); 171 if (line == null) 172 break; 173 line = line.trim(); 174 175 if (line.startsWith("#")) 178 continue; 179 180 int cmt = line.indexOf('#'); 182 if (cmt != -1) 183 line = line.substring(0, cmt).trim(); 184 185 if (line.length() == 0) 186 userAgentIsMe = false; 187 else if (line.toLowerCase().startsWith("user-agent:")) { 188 if (!userAgentIsMe) { 189 String value = line.substring(11).trim(); 190 if (match(value, robotName)) 191 userAgentIsMe = true; 192 } 193 } else if (line.toLowerCase().startsWith("disallow:")) { 194 if (userAgentIsMe) { 195 String value = line.substring(9).trim(); 196 disallows.addElement(value); 197 } 198 } 199 } 200 } catch (IOException ignore) { 201 } 202 } catch (MalformedURLException ignore) { 203 } 204 205 servers.put(key, disallows); 206 return disallows; 207 } 208 209 217 protected static boolean match(String pattern, String string) { 218 for (int p = 0;; ++p) { 219 for (int s = 0;; ++p, ++s) { 220 boolean sEnd = (s >= string.length()); 221 boolean pEnd = 222 (p >= pattern.length() || pattern.charAt(p) == '|'); 223 if (sEnd && pEnd) 224 return true; 225 if (sEnd || pEnd) 226 break; 227 if (pattern.charAt(p) == '?') 228 continue; 229 if (pattern.charAt(p) == '*') { 230 int i; 231 ++p; 232 for (i = string.length(); i >= s; --i) 233 if (match(pattern.substring(p), 234 string.substring(i))) 235 return true; 236 break; 237 } 238 if (pattern.charAt(p) != string.charAt(s)) 239 break; 240 } 241 p = pattern.indexOf('|', p); 242 if (p == -1) 243 return false; 244 } 245 } 246 247 252 public boolean getIgnore() { 253 return ignore; 254 } 255 256 262 public void setIgnore(boolean ignore) { 263 this.ignore = ignore; 264 } 265 266 269 public void finish() { 270 if (httpTool != null) { 271 httpTool.finish(); 272 } 273 } 274 } 275 | Popular Tags |