1 23 package org.archive.crawler.prefetch; 24 25 import java.util.logging.Level ; 26 import java.util.logging.Logger ; 27 28 import org.archive.crawler.admin.CrawlJob; 29 import org.archive.crawler.datamodel.CrawlURI; 30 import org.archive.crawler.datamodel.FetchStatusCodes; 31 import org.archive.crawler.framework.Processor; 32 import org.archive.crawler.settings.SimpleType; 33 import org.archive.crawler.settings.Type; 34 35 62 public class RuntimeLimitEnforcer 63 extends Processor implements FetchStatusCodes { 64 65 private static final long serialVersionUID = 1L; 66 67 protected Logger logger = Logger.getLogger( 68 RuntimeLimitEnforcer.class.getName()); 69 70 public static final String ATTR_RUNTIME_SECONDS = "runtime-sec".intern(); 71 protected static final long DEFAULT_RUNTIME_SECONDS = 86400; 73 public static final String ATTR_END_OPERATION = "end-operation".intern(); 74 protected static final String OP_PAUSE = "Pause job".intern(); 75 protected static final String OP_TERMINATE = "Terminate job".intern(); 76 protected static final String OP_BLOCK_URIS = "Block URIs".intern(); 77 protected static final String DEFAULT_END_OPERATION = OP_PAUSE; 78 protected static final String [] AVAILABLE_END_OPERATIONS = { 79 OP_PAUSE, OP_TERMINATE, OP_BLOCK_URIS}; 80 81 public RuntimeLimitEnforcer(String name) { 82 super(name, "A processor that halts further progress once a fixed " + 83 "amount of time has elapsed since the start of a crawl. " + 84 "It is possible to configure this processor per host, but " + 85 "it should be noted that Heritrix does not track runtime " + 86 "per host seperately. Especially when using facilities " + 87 "like the BdbFrontier's hold-queues, the actual amount of " + 88 "time spent crawling a host may have little relevance to " + 89 "total elapsed time. Note however that using overrides " + 90 "and/or refinements only makes sense when using the " + 91 "'Block URIs' end operation. The pause and terminate " + 92 "operations have global impact once encountered."); 93 Type t = new SimpleType( 94 ATTR_RUNTIME_SECONDS, 95 "The amount of time, in seconds, that the crawl will be " + 96 "allowed to run before this processor performs it's 'end " + 97 "operation.'", 98 DEFAULT_RUNTIME_SECONDS); 99 addElementToDefinition(t); 100 t = new SimpleType( 101 ATTR_END_OPERATION, 102 "The action that the processor takes once the runtime has " + 103 "elapsed.\n " + 104 "Operation: Pause job - Pauses the crawl. A change " + 105 "(increase) to the runtime duration will " + 106 "make it pausible to resume the crawl. Attempts to resume " + 107 "the crawl without modifying the run time will cause it to " + 108 "be immediately paused again.\n " + 109 "Operation: Terminate job - Terminates the job. Equivalent " + 110 "to using the max-time setting on the CrawlController.\n " + 111 "Operation: Block URIs - Blocks each URI with an -5002 " + 112 "(blocked by custom processor) fetch status code. This will " + 113 "cause all the URIs queued to wind up in the crawl.log.", 114 DEFAULT_END_OPERATION, 115 AVAILABLE_END_OPERATIONS); 116 addElementToDefinition(t); 117 } 118 119 protected void innerProcess(CrawlURI curi) throws InterruptedException { 120 long allowedRuntime = getRuntime(curi); 121 long currentRuntime = getController().getStatistics().crawlDuration(); 122 if(currentRuntime > allowedRuntime){ 123 String op = (String )getUncheckedAttribute(curi,ATTR_END_OPERATION); 124 if(op != null){ 125 if(op.equals(OP_PAUSE)){ 126 getController().requestCrawlPause(); 127 } else if(op.equals(OP_TERMINATE)){ 128 getController().requestCrawlStop( 129 CrawlJob.STATUS_FINISHED_TIME_LIMIT); 130 } else if(op.equals(OP_BLOCK_URIS)){ 131 curi.setFetchStatus(S_BLOCKED_BY_RUNTIME_LIMIT); 132 curi.addAnnotation("Runtime exceeded " + allowedRuntime + 133 "ms"); 134 curi.skipToProcessorChain( 135 getController().getPostprocessorChain()); 136 } 137 } else { 138 logger.log(Level.SEVERE,"Null value for " + ATTR_END_OPERATION + 139 " when processing " + curi.toString()); 140 } 141 } 142 } 143 144 149 protected long getRuntime(CrawlURI curi){ 150 Object o = getUncheckedAttribute(curi,ATTR_RUNTIME_SECONDS); 151 if(o == null){ 152 logger.log(Level.SEVERE,"Null value for " + ATTR_RUNTIME_SECONDS + 153 " when processing " + curi.toString()); 154 return Long.MAX_VALUE; 155 } 156 return ((Long )o).longValue()*1000; } 158 159 } 160 | Popular Tags |