1 24 package org.archive.crawler.datamodel; 25 26 import java.io.BufferedReader ; 27 import java.io.IOException ; 28 import java.io.InputStreamReader ; 29 import java.io.ObjectInputStream ; 30 import java.io.Serializable ; 31 import java.io.StringReader ; 32 import java.util.HashSet ; 33 import java.util.Set ; 34 import java.util.zip.Checksum ; 35 36 import org.apache.commons.httpclient.URIException; 37 import org.archive.crawler.datamodel.credential.CredentialAvatar; 38 import org.archive.crawler.framework.Checkpointer; 39 import org.archive.crawler.framework.ToeThread; 40 import org.archive.crawler.settings.CrawlerSettings; 41 import org.archive.crawler.settings.SettingsHandler; 42 import org.archive.io.ReplayInputStream; 43 import org.archive.net.UURIFactory; 44 45 53 public class CrawlServer implements Serializable , CrawlSubstats.HasCrawlSubstats { 54 55 private static final long serialVersionUID = -989714570750970369L; 56 57 public static final long ROBOTS_NOT_FETCHED = -1; 58 60 public static final long MIN_ROBOTS_RETRIES = 2; 61 62 private final String server; private int port; 64 private transient SettingsHandler settingsHandler; 65 private RobotsExclusionPolicy robots; 66 long robotsFetched = ROBOTS_NOT_FETCHED; 67 boolean validRobots = false; 68 Checksum robotstxtChecksum; 69 CrawlSubstats substats = new CrawlSubstats(); 70 71 protected int consecutiveConnectionErrors = 0; 75 76 79 private transient Set <CredentialAvatar> avatars = null; 80 81 86 public CrawlServer(String h) { 87 server = h; 89 int colonIndex = server.lastIndexOf(":"); 90 if (colonIndex < 0) { 91 port = -1; 92 } else { 93 try { 94 port = Integer.parseInt(server.substring(colonIndex + 1)); 95 } catch (NumberFormatException e) { 96 port = -1; 97 } 98 } 99 } 100 101 105 public RobotsExclusionPolicy getRobots() { 106 return robots; 107 } 108 109 113 public void setRobots(RobotsExclusionPolicy policy) { 114 robots = policy; 115 } 116 117 public String toString() { 118 return "CrawlServer("+server+")"; 119 } 120 121 126 public void updateRobots(CrawlURI curi) { 127 RobotsHonoringPolicy honoringPolicy = 128 settingsHandler.getOrder().getRobotsHonoringPolicy(); 129 130 robotsFetched = System.currentTimeMillis(); 131 132 boolean gotSomething = curi.getFetchStatus() > 0 133 && curi.isHttpTransaction(); 134 if (!gotSomething && curi.getFetchAttempts() < MIN_ROBOTS_RETRIES) { 135 validRobots = false; 137 return; 138 } 139 140 CrawlerSettings settings = getSettings(curi); 141 int type = honoringPolicy.getType(settings); 142 if (type == RobotsHonoringPolicy.IGNORE) { 143 robots = RobotsExclusionPolicy.ALLOWALL; 145 validRobots = true; 146 return; 147 } 148 149 if(!gotSomething) { 150 validRobots = false; 152 return; 153 } 154 155 if (!curi.is2XXSuccess()) { 156 robots = RobotsExclusionPolicy.ALLOWALL; 164 validRobots = true; 165 return; 166 } 167 168 ReplayInputStream contentBodyStream = null; 169 try { 170 try { 171 BufferedReader reader; 172 if (type == RobotsHonoringPolicy.CUSTOM) { 173 reader = new BufferedReader (new StringReader (honoringPolicy 174 .getCustomRobots(settings))); 175 } else { 176 contentBodyStream = curi.getHttpRecorder() 177 .getRecordedInput().getContentReplayInputStream(); 178 179 contentBodyStream.setToResponseBodyStart(); 180 reader = new BufferedReader (new InputStreamReader ( 181 contentBodyStream)); 182 } 183 robots = RobotsExclusionPolicy.policyFor(settings, 184 reader, honoringPolicy); 185 validRobots = true; 186 } finally { 187 if (contentBodyStream != null) { 188 contentBodyStream.close(); 189 } 190 } 191 } catch (IOException e) { 192 robots = RobotsExclusionPolicy.ALLOWALL; 193 validRobots = true; 194 curi.addLocalizedError(getName(), e, 195 "robots.txt parsing IOException"); 196 } 197 } 198 199 202 public long getRobotsFetchedTime() { 203 return robotsFetched; 204 } 205 206 209 public String getName() { 210 return server; 211 } 212 213 217 public int getPort() { 218 return port; 219 } 220 221 232 private void readObject(ObjectInputStream stream) 233 throws IOException , ClassNotFoundException { 234 stream.defaultReadObject(); 235 Thread t = Thread.currentThread(); 236 if (t instanceof Checkpointer.CheckpointingThread) { 237 settingsHandler = ((Checkpointer.CheckpointingThread)t) 238 .getController().getSettingsHandler(); 239 } else if (t instanceof ToeThread) { 240 settingsHandler = ((ToeThread) Thread.currentThread()) 241 .getController().getSettingsHandler(); 242 } else { 243 throw new RuntimeException ("CrawlServer must deserialize " + 246 "in a ToeThread or CheckpointingThread"); 247 } 248 postDeserialize(); 249 } 250 251 private void postDeserialize() { 252 if (this.robots != null) { 253 RobotsHonoringPolicy honoringPolicy = 254 settingsHandler.getOrder().getRobotsHonoringPolicy(); 255 this.robots.honoringPolicy = honoringPolicy; 256 } 257 } 258 259 263 public SettingsHandler getSettingsHandler() { 264 return this.settingsHandler; 265 } 266 267 273 private CrawlerSettings getSettings(CandidateURI curi) { 274 try { 275 return this.settingsHandler. 276 getSettings(curi.getUURI().getReferencedHost(), 277 curi.getUURI()); 278 } catch (URIException e) { 279 return null; 280 } 281 } 282 283 287 public void setSettingsHandler(SettingsHandler settingsHandler) { 288 this.settingsHandler = settingsHandler; 289 } 290 291 public void incrementConsecutiveConnectionErrors() { 292 this.consecutiveConnectionErrors++; 293 } 294 295 public void resetConsecutiveConnectionErrors() { 296 this.consecutiveConnectionErrors = 0; 297 } 298 299 302 public Set getCredentialAvatars() { 303 return this.avatars; 304 } 305 306 309 public boolean hasCredentialAvatars() { 310 return this.avatars != null && this.avatars.size() > 0; 311 } 312 313 318 public void addCredentialAvatar(CredentialAvatar ca) { 319 if (this.avatars == null) { 320 this.avatars = new HashSet <CredentialAvatar>(); 321 } 322 this.avatars.add(ca); 323 } 324 325 332 public boolean isValidRobots() { 333 return validRobots; 334 } 335 336 342 public static String getServerKey(CandidateURI cauri) 343 throws URIException { 344 String key = cauri.getUURI().getAuthorityMinusUserinfo(); 348 if (key == null) { 349 key = cauri.getUURI().getCurrentHierPath(); 353 if(key != null && !key.matches("[-_\\w\\.:]+")) { 354 key = null; 357 } 358 } 359 if (key != null && 360 cauri.getUURI().getScheme().equals(UURIFactory.HTTPS)) { 361 if (!key.matches(".+:[0-9]+")) { 364 key += ":" + UURIFactory.HTTPS_PORT; 365 } 366 } 367 return key; 368 } 369 370 373 public CrawlSubstats getSubstats() { 374 return substats; 375 } 376 } 377 | Popular Tags |