1 23 package org.archive.crawler.prefetch; 24 25 import java.util.logging.Level ; 26 import java.util.logging.Logger ; 27 28 import org.archive.crawler.datamodel.CoreAttributeConstants; 29 import org.archive.crawler.datamodel.CrawlSubstats; 30 import org.archive.crawler.datamodel.CrawlURI; 31 import org.archive.crawler.datamodel.FetchStatusCodes; 32 import org.archive.crawler.framework.Processor; 33 import org.archive.crawler.settings.SimpleType; 34 35 43 public class QuotaEnforcer extends Processor implements FetchStatusCodes { 44 45 private static final long serialVersionUID = 6091720623469404595L; 46 47 private final Logger LOGGER = Logger.getLogger(this.getClass().getName()); 48 49 protected static final int SERVER = 0; 51 protected static final int HOST = 1; 52 protected static final int GROUP = 2; 53 protected static final int NAME = 0; 54 protected static final int SUCCESSES = 1; 55 protected static final int SUCCESS_KB = 2; 56 protected static final int RESPONSES = 3; 57 protected static final int RESPONSE_KB = 4; 58 protected static final String [][] keys = new String [][] { 59 { 60 "server", 61 "server-max-fetch-successes", 62 "server-max-success-kb", 63 "server-max-fetch-responses", 64 "server-max-all-kb" 65 }, 66 { 67 "host", 68 "host-max-fetch-successes", 69 "host-max-success-kb", 70 "host-max-fetch-responses", 71 "host-max-all-kb" 72 }, 73 { 74 "group", 75 "group-max-fetch-successes", 76 "group-max-success-kb", 77 "group-max-fetch-responses", 78 "group-max-all-kb" 79 } 80 }; 81 82 85 protected static final String ATTR_SERVER_MAX_FETCH_SUCCESSES = 86 keys[SERVER][SUCCESSES]; 87 protected static final Long DEFAULT_SERVER_MAX_FETCH_SUCCESSES = 88 new Long (-1); 89 90 protected static final String ATTR_SERVER_MAX_SUCCESS_KB = 91 keys[SERVER][SUCCESS_KB];; 92 protected static final Long DEFAULT_SERVER_MAX_SUCCESS_KB = 93 new Long (-1); 94 96 protected static final String ATTR_SERVER_MAX_FETCH_RESPONSES = 97 keys[SERVER][RESPONSES]; 98 protected static final Long DEFAULT_SERVER_MAX_FETCH_RESPONSES = 99 new Long (-1); 100 101 protected static final String ATTR_SERVER_MAX_ALL_KB = 102 keys[SERVER][RESPONSE_KB]; 103 protected static final Long DEFAULT_SERVER_MAX_ALL_KB = 104 new Long (-1); 105 106 109 protected static final String ATTR_HOST_MAX_FETCH_SUCCESSES = 110 keys[HOST][SUCCESSES];; 111 protected static final Long DEFAULT_HOST_MAX_FETCH_SUCCESSES = 112 new Long (-1); 113 114 protected static final String ATTR_HOST_MAX_SUCCESS_KB = 115 keys[HOST][SUCCESS_KB];; 116 protected static final Long DEFAULT_HOST_MAX_SUCCESS_KB = 117 new Long (-1); 118 120 protected static final String ATTR_HOST_MAX_FETCH_RESPONSES = 121 keys[HOST][RESPONSES]; 122 protected static final Long DEFAULT_HOST_MAX_FETCH_RESPONSES = 123 new Long (-1); 124 125 protected static final String ATTR_HOST_MAX_ALL_KB = 126 keys[HOST][RESPONSE_KB]; 127 protected static final Long DEFAULT_HOST_MAX_ALL_KB = 128 new Long (-1); 129 130 133 protected static final String ATTR_GROUP_MAX_FETCH_SUCCESSES = 134 keys[GROUP][SUCCESSES]; 135 protected static final Long DEFAULT_GROUP_MAX_FETCH_SUCCESSES = 136 new Long (-1); 137 138 protected static final String ATTR_GROUP_MAX_SUCCESS_KB = 139 keys[GROUP][SUCCESS_KB]; 140 protected static final Long DEFAULT_GROUP_MAX_SUCCESS_KB = 141 new Long (-1); 142 144 protected static final String ATTR_GROUP_MAX_FETCH_RESPONSES = 145 keys[GROUP][RESPONSES]; 146 protected static final Long DEFAULT_GROUP_MAX_FETCH_RESPONSES = 147 new Long (-1); 148 149 protected static final String ATTR_GROUP_MAX_ALL_KB = 150 keys[GROUP][RESPONSE_KB]; 151 protected static final Long DEFAULT_GROUP_MAX_ALL_KB = 152 new Long (-1); 153 154 155 protected static final String ATTR_FORCE_RETIRE = 156 "force-retire"; 157 protected static final Boolean DEFAULT_FORCE_RETIRE = true; 158 159 163 public QuotaEnforcer(String name) { 164 super(name, "QuotaEnforcer."); 165 166 addElementToDefinition(new SimpleType(ATTR_FORCE_RETIRE, 167 "Whether an over-quota situation should result in the " + 168 "containing queue being force-retired (if the Frontier " + 169 "supports this). Note that if your queues combine URIs " + 170 "that are different with regard to the quota category, " + 171 "the retirement may hold back URIs not in the same " + 172 "quota category. " + 173 "Default is false.", 174 DEFAULT_FORCE_RETIRE)); 175 176 String maxFetchSuccessesDesc = "Maximum number of fetch successes " + 177 "(e.g. 200 responses) to collect from one $CATEGORY. " + 178 "Default is -1, meaning no limit."; 179 String maxSuccessKbDesc = "Maximum amount of fetch success content " + 180 "(e.g. 200 responses) in KB to collect from one $CATEGORY. " + 181 "Default is -1, meaning no limit."; 182 String maxFetchResponsesDesc = "Maximum number of fetch responses " + 183 "(incl. error responses) to collect from one $CATEGORY. " + 184 "Default is -1, meaning no limit."; 185 String maxAllKbDesc = "Maximum amount of response content " + 186 "(incl. error responses) in KB to collect from one $CATEGORY. " + 187 "Default is -1, meaning no limit."; 188 addElementToDefinition(new SimpleType(ATTR_SERVER_MAX_FETCH_SUCCESSES, 190 maxFetchSuccessesDesc.replaceAll("$CATEGORY","server"), 191 DEFAULT_SERVER_MAX_FETCH_SUCCESSES)); 192 addElementToDefinition(new SimpleType(ATTR_SERVER_MAX_SUCCESS_KB, 193 maxSuccessKbDesc.replaceAll("$CATEGORY","server"), 194 DEFAULT_SERVER_MAX_SUCCESS_KB)); 195 addElementToDefinition(new SimpleType(ATTR_SERVER_MAX_FETCH_RESPONSES, 197 maxFetchResponsesDesc.replaceAll("$CATEGORY","server"), 198 DEFAULT_SERVER_MAX_FETCH_RESPONSES)); 199 addElementToDefinition(new SimpleType(ATTR_SERVER_MAX_ALL_KB, 200 maxAllKbDesc.replaceAll("$CATEGORY","server"), 201 DEFAULT_SERVER_MAX_ALL_KB)); 202 addElementToDefinition(new SimpleType(ATTR_HOST_MAX_FETCH_SUCCESSES, 204 maxFetchSuccessesDesc.replaceAll("$CATEGORY","host"), 205 DEFAULT_HOST_MAX_FETCH_SUCCESSES)); 206 addElementToDefinition(new SimpleType(ATTR_HOST_MAX_SUCCESS_KB, 207 maxSuccessKbDesc.replaceAll("$CATEGORY","host"), 208 DEFAULT_HOST_MAX_SUCCESS_KB)); 209 addElementToDefinition(new SimpleType(ATTR_HOST_MAX_FETCH_RESPONSES, 211 maxFetchResponsesDesc.replaceAll("$CATEGORY","host"), 212 DEFAULT_HOST_MAX_FETCH_RESPONSES)); 213 addElementToDefinition(new SimpleType(ATTR_HOST_MAX_ALL_KB, 214 maxAllKbDesc.replaceAll("$CATEGORY","host"), 215 DEFAULT_HOST_MAX_ALL_KB)); 216 addElementToDefinition(new SimpleType(ATTR_GROUP_MAX_FETCH_SUCCESSES, 218 maxFetchSuccessesDesc.replaceAll("$CATEGORY","group (queue)"), 219 DEFAULT_GROUP_MAX_FETCH_SUCCESSES)); 220 addElementToDefinition(new SimpleType(ATTR_GROUP_MAX_SUCCESS_KB, 221 maxSuccessKbDesc.replaceAll("$CATEGORY","group (queue)"), 222 DEFAULT_GROUP_MAX_SUCCESS_KB)); 223 addElementToDefinition(new SimpleType(ATTR_GROUP_MAX_FETCH_RESPONSES, 225 maxFetchResponsesDesc.replaceAll("$CATEGORY","group (queue)"), 226 DEFAULT_GROUP_MAX_FETCH_RESPONSES)); 227 addElementToDefinition(new SimpleType(ATTR_GROUP_MAX_ALL_KB, 228 maxAllKbDesc.replaceAll("$CATEGORY","group (queue)"), 229 DEFAULT_GROUP_MAX_ALL_KB)); 230 231 } 232 233 protected void innerProcess(CrawlURI curi) { 234 CrawlSubstats.HasCrawlSubstats[] haveStats = 235 new CrawlSubstats.HasCrawlSubstats[] { 236 getController().getServerCache().getServerFor(curi), getController().getServerCache().getHostFor(curi), getController().getFrontier().getGroup(curi) }; 240 241 for(int cat = SERVER; cat <= GROUP; cat++) { 242 if (checkQuotas(curi, haveStats[cat], cat)) { 243 return; 244 } 245 } 246 } 247 248 257 protected boolean checkQuotas(final CrawlURI curi, 258 final CrawlSubstats.HasCrawlSubstats hasStats, 259 final int CAT) { 260 if (hasStats == null) { 261 if (LOGGER.isLoggable(Level.FINE)) { 262 LOGGER.fine(curi.toString() + " null stats category: " + CAT); 263 } 264 return false; 265 } 266 CrawlSubstats substats = hasStats.getSubstats(); 267 long[] actuals = new long[] { 268 -1, substats.getFetchSuccesses(), 270 substats.getSuccessBytes()/1024, 271 substats.getFetchResponses(), 272 substats.getTotalBytes()/1024, 273 }; 274 for(int q = SUCCESSES; q <= RESPONSE_KB; q++) { 275 if(applyQuota(curi, keys[CAT][q], actuals[q])) { 276 return true; 277 } 278 } 279 return false; 280 } 281 282 293 protected boolean applyQuota(CrawlURI curi, String quotaKey, long actual) { 294 long quota = ((Long )getUncheckedAttribute(curi, quotaKey)).longValue(); 295 if (quota >= 0 && actual >= quota) { 296 curi.setFetchStatus(S_BLOCKED_BY_QUOTA); 297 curi.addAnnotation("Q:"+quotaKey); 298 curi.skipToProcessorChain(getController().getPostprocessorChain()); 299 if((Boolean )getUncheckedAttribute(curi,ATTR_FORCE_RETIRE)) { 300 curi.putObject(CoreAttributeConstants.A_FORCE_RETIRE, (Boolean ) true); 301 } 302 return true; 303 } 304 return false; 305 } 306 } 307 | Popular Tags |