package org.archive.crawler.postprocessor;


import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.framework.Frontier.FrontierGroup;


/**
 * Processor that folds the outcome of a processed {@link CrawlURI} back into
 * the crawl's bookkeeping objects.
 *
 * <p>For each URI it tallies the URI into the substats of the matching
 * {@link CrawlServer}, {@link CrawlHost} and {@link FrontierGroup}. For
 * http/https URIs that have a server record it additionally maintains the
 * server's consecutive-connection-error counter and hands
 * <code>/robots.txt</code> results to the server.
 */
public class CrawlStateUpdater extends Processor implements
        CoreAttributeConstants, FetchStatusCodes {

    private static final long serialVersionUID = -1072728147960180091L;

    private static final Logger logger =
        Logger.getLogger(CrawlStateUpdater.class.getName());

    /**
     * Creates the processor.
     *
     * @param name name passed through to the {@link Processor} superclass,
     *             together with the fixed description "Crawl state updater"
     */
    public CrawlStateUpdater(String name) {
        super(name, "Crawl state updater");
    }

    /**
     * Updates per-server, per-host and per-frontier-group state from the
     * given URI.
     *
     * @param curi the URI whose result is being recorded
     */
    protected void innerProcess(CrawlURI curi) {
        // Tally the URI against every bookkeeping object that exists for it.
        // Server and host lookups may yield null, so both are guarded.
        CrawlServer server =
            getController().getServerCache().getServerFor(curi);
        if (server != null) {
            server.getSubstats().tally(curi);
        }
        CrawlHost host =
            getController().getServerCache().getHostFor(curi);
        if (host != null) {
            host.getSubstats().tally(curi);
        }
        FrontierGroup group =
            getController().getFrontier().getGroup(curi);
        group.getSubstats().tally(curi);

        String scheme = curi.getUURI().getScheme().toLowerCase();
        boolean isHttpFamily =
            scheme.equals("http") || scheme.equals("https");
        // The scheme test must be evaluated as a unit before the null guard.
        // Written as `a || b && c`, `&&` binds tighter than `||`, so an
        // "http" URI without a server record skipped the guard and the
        // server calls below threw NullPointerException.
        if (isHttpFamily && server != null) {
            int fetchStatus = curi.getFetchStatus();
            if (fetchStatus == S_CONNECT_FAILED) {
                // Another failed connection in a row for this server.
                server.incrementConsecutiveConnectionErrors();
            } else if (fetchStatus > 0) {
                // A positive status ends the run of connection errors.
                server.resetConsecutiveConnectionErrors();
            }

            try {
                // Read the path once; getPath() is the call that can throw.
                String path = curi.getUURI().getPath();
                if (path != null && path.equals("/robots.txt")) {
                    // Let the server update its robots state from this URI.
                    server.updateRobots(curi);
                }
            } catch (URIException e) {
                // Keep the cause so the failure can be diagnosed from logs.
                logger.log(Level.SEVERE,
                    "Failed get path on " + curi.getUURI(), e);
            }
        }
    }
}