KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > prefetch > RuntimeLimitEnforcer


1 /* RuntimeLimitEnforcer
2  *
3  * Created on July 7, 2006
4  *
5  * Copyright (C) 2005 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.crawler.prefetch;
24
25 import java.util.logging.Level JavaDoc;
26 import java.util.logging.Logger JavaDoc;
27
28 import org.archive.crawler.admin.CrawlJob;
29 import org.archive.crawler.datamodel.CrawlURI;
30 import org.archive.crawler.datamodel.FetchStatusCodes;
31 import org.archive.crawler.framework.Processor;
32 import org.archive.crawler.settings.SimpleType;
33 import org.archive.crawler.settings.Type;
34
35 /**
36  * A processor to enforce runtime limits on crawls.
37  * <p>
38  * This processor extends and improves on the 'max-time' capability of Heritrix.
39  * Essentially, the 'Terminate job' option functions the same way as 'max-time'.
40  * The processor however also enables pausing when the runtime is exceeded and
41  * the blocking of all URIs.
42  * <p>
43  * <ol>
44  * <li>Pause job - Pauses the crawl. A change (increase) to the
45  * runtime duration will make it pausible to resume the crawl.
46  * Attempts to resume the crawl without modifying the run time
47  * will cause it to be immediately paused again.</li>
48  * <li>Terminate job - Terminates the job. Equivalent
49  * to using the max-time setting on the CrawlController.</li>
50  * <li>Block URIs - Blocks each URI with an -5002
51  * (blocked by custom processor) fetch status code. This will
52  * cause all the URIs queued to wind up in the crawl.log.</li>
53  * <ol>
54  * <p>
55  * The processor allows variable runtime based on host (or other
56  * override/refinement criteria) however using such overrides only makes sense
57  * when using 'Block URIs' as pause and terminate will have global impact once
58  * encountered anywhere.
59  *
60  * @author Kristinn Sigur&eth;sson
61  */

62 public class RuntimeLimitEnforcer
63                 extends Processor implements FetchStatusCodes {
64
65     private static final long serialVersionUID = 1L;
66     
67     protected Logger JavaDoc logger = Logger.getLogger(
68             RuntimeLimitEnforcer.class.getName());
69     
70     public static final String JavaDoc ATTR_RUNTIME_SECONDS = "runtime-sec".intern();
71     protected static final long DEFAULT_RUNTIME_SECONDS = 86400; // 1 day
72

73     public static final String JavaDoc ATTR_END_OPERATION = "end-operation".intern();
74     protected static final String JavaDoc OP_PAUSE = "Pause job".intern();
75     protected static final String JavaDoc OP_TERMINATE = "Terminate job".intern();
76     protected static final String JavaDoc OP_BLOCK_URIS = "Block URIs".intern();
77     protected static final String JavaDoc DEFAULT_END_OPERATION = OP_PAUSE;
78     protected static final String JavaDoc[] AVAILABLE_END_OPERATIONS = {
79         OP_PAUSE, OP_TERMINATE, OP_BLOCK_URIS};
80     
81     public RuntimeLimitEnforcer(String JavaDoc name) {
82         super(name, "A processor that halts further progress once a fixed " +
83                 "amount of time has elapsed since the start of a crawl. " +
84                 "It is possible to configure this processor per host, but " +
85                 "it should be noted that Heritrix does not track runtime " +
86                 "per host seperately. Especially when using facilities " +
87                 "like the BdbFrontier's hold-queues, the actual amount of " +
88                 "time spent crawling a host may have little relevance to " +
89                 "total elapsed time. Note however that using overrides " +
90                 "and/or refinements only makes sense when using the " +
91                 "'Block URIs' end operation. The pause and terminate " +
92                 "operations have global impact once encountered.");
93         Type t = new SimpleType(
94                 ATTR_RUNTIME_SECONDS,
95                 "The amount of time, in seconds, that the crawl will be " +
96                 "allowed to run before this processor performs it's 'end " +
97                 "operation.'",
98                 DEFAULT_RUNTIME_SECONDS);
99         addElementToDefinition(t);
100         t = new SimpleType(
101                 ATTR_END_OPERATION,
102                 "The action that the processor takes once the runtime has " +
103                 "elapsed.\n " +
104                 "Operation: Pause job - Pauses the crawl. A change " +
105                 "(increase) to the runtime duration will " +
106                 "make it pausible to resume the crawl. Attempts to resume " +
107                 "the crawl without modifying the run time will cause it to " +
108                 "be immediately paused again.\n " +
109                 "Operation: Terminate job - Terminates the job. Equivalent " +
110                 "to using the max-time setting on the CrawlController.\n " +
111                 "Operation: Block URIs - Blocks each URI with an -5002 " +
112                 "(blocked by custom processor) fetch status code. This will " +
113                 "cause all the URIs queued to wind up in the crawl.log.",
114                 DEFAULT_END_OPERATION,
115                 AVAILABLE_END_OPERATIONS);
116         addElementToDefinition(t);
117     }
118
119     protected void innerProcess(CrawlURI curi) throws InterruptedException JavaDoc {
120         long allowedRuntime = getRuntime(curi);
121         long currentRuntime = getController().getStatistics().crawlDuration();
122         if(currentRuntime > allowedRuntime){
123             String JavaDoc op = (String JavaDoc)getUncheckedAttribute(curi,ATTR_END_OPERATION);
124             if(op != null){
125                 if(op.equals(OP_PAUSE)){
126                     getController().requestCrawlPause();
127                 } else if(op.equals(OP_TERMINATE)){
128                     getController().requestCrawlStop(
129                             CrawlJob.STATUS_FINISHED_TIME_LIMIT);
130                 } else if(op.equals(OP_BLOCK_URIS)){
131                     curi.setFetchStatus(S_BLOCKED_BY_RUNTIME_LIMIT);
132                     curi.addAnnotation("Runtime exceeded " + allowedRuntime +
133                             "ms");
134                     curi.skipToProcessorChain(
135                             getController().getPostprocessorChain());
136                 }
137             } else {
138                 logger.log(Level.SEVERE,"Null value for " + ATTR_END_OPERATION +
139                         " when processing " + curi.toString());
140             }
141         }
142     }
143     
144     /**
145      * Returns the amount of time to allow the crawl to run before this
146      * processor interrupts.
147      * @return the amount of time in milliseconds.
148      */

149     protected long getRuntime(CrawlURI curi){
150         Object JavaDoc o = getUncheckedAttribute(curi,ATTR_RUNTIME_SECONDS);
151         if(o == null){
152             logger.log(Level.SEVERE,"Null value for " + ATTR_RUNTIME_SECONDS +
153                     " when processing " + curi.toString());
154             return Long.MAX_VALUE;
155         }
156         return ((Long JavaDoc)o).longValue()*1000; //extract value and convert to ms.
157
}
158     
159 }
160
Popular Tags