KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > postprocessor > WaitEvaluator


/* WaitEvaluator
 *
 * $Id: WaitEvaluator.java,v 1.5.18.1 2007/01/13 01:31:24 stack-sf Exp $
 *
 * Created on 26.11.2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

25 package org.archive.crawler.postprocessor;
26
27 import java.util.logging.Level JavaDoc;
28 import java.util.logging.Logger JavaDoc;
29
30 import javax.management.AttributeNotFoundException JavaDoc;
31
32 import org.archive.crawler.datamodel.CrawlURI;
33 import org.archive.crawler.framework.Processor;
34 import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
35 import org.archive.crawler.settings.SimpleType;
36
37 /**
38  * A processor that determines when a URI should be revisited next. Does
39  * <b>not</b> account for DNS and robots.txt expiration. That should be
40  * handled seperately by the Frontiers.
41  *
42  * @author Kristinn Sigurdsson
43  */

44 public class WaitEvaluator extends Processor
45 implements AdaptiveRevisitAttributeConstants {
46     
47     private static final long serialVersionUID = 7452762726125458413L;
48
49     Logger JavaDoc logger = Logger.getLogger(WaitEvaluator.class.getName());
50     
51     /** Default wait time after initial visit. */
52     public final static String JavaDoc ATTR_INITIAL_WAIT_INTERVAL =
53         "initial-wait-interval-seconds";
54     protected final static Long JavaDoc DEFAULT_INITIAL_WAIT_INTERVAL =
55         new Long JavaDoc(86400); // 1 day
56
/** Maximum wait between visits */
57     public final static String JavaDoc ATTR_MAX_WAIT_INTERVAL =
58         "max-wait-interval-seconds";
59     protected final static Long JavaDoc DEFAULT_MAX_WAIT_INTERVAL =
60         new Long JavaDoc(2419200); // 4 weeks
61
/** Minimum wait between visits */
62     public final static String JavaDoc ATTR_MIN_WAIT_INTERVAL =
63         "min-wait-interval-seconds";
64     protected final static Long JavaDoc DEFAULT_MIN_WAIT_INTERVAL =
65         new Long JavaDoc(3600); // 1 hour
66
/** Factor increase on wait when unchanged */
67     public final static String JavaDoc ATTR_UNCHANGED_FACTOR = "unchanged-factor";
68     protected final static Double JavaDoc DEFAULT_UNCHANGED_FACTOR = new Double JavaDoc(1.5);
69     /** Factor decrease on wait when changed */
70     public final static String JavaDoc ATTR_CHANGED_FACTOR = "changed-factor";
71     protected final static Double JavaDoc DEFAULT_CHANGED_FACTOR = new Double JavaDoc(1.5);
72     /** Fixed wait time for 'unknown' change status. I.e. wait time for URIs
73      * whose content change detection is not available. */

74     public final static String JavaDoc ATTR_DEFAULT_WAIT_INTERVAL =
75         "default-wait-interval-seconds";
76     protected final static Long JavaDoc DEFAULT_DEFAULT_WAIT_INTERVAL =
77         new Long JavaDoc(259200); // 3 days
78
/** Indicates if the amount of time the URI was overdue should be added
79      * to the wait time before the new wait time is calculated. */

80     public final static String JavaDoc ATTR_USE_OVERDUE_TIME = "use-overdue-time";
81     protected final static Boolean JavaDoc DEFAULT_USE_OVERDUE_TIME = new Boolean JavaDoc(false);
82
83     /**
84      * Constructor
85      *
86      * @param name The name of the module
87      */

88     public WaitEvaluator(String JavaDoc name) {
89         this(name,
90                 "Evaluates how long to wait before fetching a URI again. " +
91                 "Typically, this processor should be in the post processing " +
92                 "chain. It will pass if another wait evaluator has already " +
93                 "processed the CrawlURI.",
94                 DEFAULT_INITIAL_WAIT_INTERVAL,
95                 DEFAULT_MAX_WAIT_INTERVAL,
96                 DEFAULT_MIN_WAIT_INTERVAL,
97                 DEFAULT_UNCHANGED_FACTOR,
98                 DEFAULT_CHANGED_FACTOR);
99     }
100
101     /**
102      * Constructor
103      *
104      * @param name The name of the module
105      * @param description Description of the module
106      * @param default_inital_wait_interval The default value for initial wait
107      * time
108      * @param default_max_wait_interval The maximum value for wait time
109      * @param default_min_wait_interval The minimum value for wait time
110      * @param default_unchanged_factor The factor for changing wait times of
111      * unchanged documents (will be multiplied by this value)
112      * @param default_changed_factor The factor for changing wait times of
113      * changed documents (will be divided by this value)
114      */

115     public WaitEvaluator(String JavaDoc name, String JavaDoc description,
116             Long JavaDoc default_inital_wait_interval,
117             Long JavaDoc default_max_wait_interval,
118             Long JavaDoc default_min_wait_interval,
119             Double JavaDoc default_unchanged_factor,
120             Double JavaDoc default_changed_factor){
121         super(name, description);
122         
123         addElementToDefinition(new SimpleType(ATTR_INITIAL_WAIT_INTERVAL,
124                 "The initial wait time between revisits. Will then be " +
125                 "updated according to crawler experiance. I.e. shorter " +
126                 "wait, visit more often, if document has changed between " +
127                 "visits, and vica versa.",
128                 default_inital_wait_interval));
129         addElementToDefinition(new SimpleType(ATTR_MAX_WAIT_INTERVAL,
130                 "The maximum settable wait time between revisits. Once a " +
131                 "URIs wait time reaches this value, it will not grow " +
132                 "further, regardless of subsequent visits that discover " +
133                 "no changes. Note that this does not ensure that the URI " +
134                 "does not wait any longer, since the crawler might be " +
135                 "'behind,' forcing a URI to wait until other URIs, " +
136                 "scheduled for earlier are completed..",
137                 default_max_wait_interval));
138         addElementToDefinition(new SimpleType(ATTR_MIN_WAIT_INTERVAL,
139                 "The minum settable wait time between revisits. Once a " +
140                 "URIs wait time reaches this value, it will not be shortened " +
141                 "further, regardlesss of subsequent visits that discover " +
142                 "changes.",
143                 default_min_wait_interval));
144         addElementToDefinition(new SimpleType(ATTR_DEFAULT_WAIT_INTERVAL,
145                 "Fixed wait time for 'unknown' change status. I.e. wait time " +
146                 "for URIs whose content change detection is not available.",
147                 DEFAULT_DEFAULT_WAIT_INTERVAL));
148         addElementToDefinition(new SimpleType(ATTR_UNCHANGED_FACTOR,
149                 "The factor by which a URIs wait time is increased when a " +
150                 "revisit reveals an unchanged document. A value of 1 will " +
151                 "leave it unchanged, a value of 2 will double it etc.",
152                 default_unchanged_factor));
153         addElementToDefinition(new SimpleType(ATTR_CHANGED_FACTOR,
154                 "The factor by which a URIs wait time is decreased when a " +
155                 "revisit reveals a changed document. A value of 1 will leave " +
156                 "it unchanged, a value of two will half it etc.",
157                 default_changed_factor));
158         addElementToDefinition(new SimpleType(ATTR_USE_OVERDUE_TIME,
159                 "Indicates if the amount of time the URI was overdue should " +
160                 "be added to the wait time before the new wait time is " +
161                 "calculated.",
162                 DEFAULT_USE_OVERDUE_TIME));
163
164         // Register persistent CrawlURI items
165
CrawlURI.addAlistPersistentMember(A_WAIT_INTERVAL);
166     }
167
168     protected void innerProcess(CrawlURI curi) throws InterruptedException JavaDoc {
169         
170         if(curi.isSuccess()==false){
171             // If the URI was not crawled successfully, we can not reevaluate
172
// the wait interval.
173
return;
174         }
175         
176         if(curi.containsKey(A_WAIT_REEVALUATED) &&
177                 ((Boolean JavaDoc)curi.getObject(A_WAIT_REEVALUATED)).booleanValue()){
178             // This CrawlURIs wait interval has already been reevaluted during
179
// this processing round.
180
return;
181         }
182             
183         long min;
184         try {
185             min = ((Long JavaDoc)getAttribute(curi, ATTR_MIN_WAIT_INTERVAL)).
186                 longValue() * 1000;
187         } catch (AttributeNotFoundException JavaDoc e1) {
188             min = DEFAULT_MIN_WAIT_INTERVAL.longValue();
189             logger.fine("Unable to load minimum wait interval for " +
190                     curi.toString());
191         }
192
193         long max;
194         try {
195             max = ((Long JavaDoc)getAttribute(curi, ATTR_MAX_WAIT_INTERVAL)).
196                 longValue() * 1000;
197         } catch (AttributeNotFoundException JavaDoc e1) {
198             max = DEFAULT_MAX_WAIT_INTERVAL.longValue();
199             logger.fine("Unable to load maximum wait interval for " +
200                     curi.toString());
201         }
202
203         
204         long waitInterval;
205         if (!curi.containsKey(A_CONTENT_STATE_KEY) ||
206                 curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_UNKNOWN) {
207             try {
208                 waitInterval = ((Long JavaDoc)getAttribute(curi,
209                         ATTR_DEFAULT_WAIT_INTERVAL)).longValue() * 1000;
210             } catch (AttributeNotFoundException JavaDoc e1) {
211                 waitInterval = DEFAULT_DEFAULT_WAIT_INTERVAL.longValue();
212                 logger.fine("Unable to load default wait interval for "
213                         + curi.toString());
214             }
215         } else {
216             /* Calculate curi's time of next processing */
217             waitInterval = DEFAULT_INITIAL_WAIT_INTERVAL.longValue()*1000;
218
219             // Retrieve wait interval
220
if(curi.containsKey(A_WAIT_INTERVAL)){
221                 waitInterval = curi.getLong(A_WAIT_INTERVAL);
222
223                 // Should override time be taken into account?
224
boolean useOverrideTime = DEFAULT_USE_OVERDUE_TIME.booleanValue();
225                 try {
226                     useOverrideTime = ((Boolean JavaDoc)getAttribute(
227                             curi,ATTR_USE_OVERDUE_TIME)).booleanValue();
228                 } catch (AttributeNotFoundException JavaDoc e1) {
229                     useOverrideTime = DEFAULT_USE_OVERDUE_TIME.booleanValue();
230                     logger.fine("Unable to load use-overdue-time for " +
231                             curi.toString());
232                 }
233                 
234                 if(useOverrideTime){
235                     waitInterval += curi.getLong(A_FETCH_OVERDUE);
236                 }
237
238                 // Revise the wait interval
239
if(curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_CHANGED){
240                     // Had changed. Decrease wait interval time.
241
double factor;
242                     try {
243                         factor = ((Double JavaDoc)getAttribute(
244                                 curi,ATTR_CHANGED_FACTOR)).doubleValue();
245                     } catch (AttributeNotFoundException JavaDoc e2) {
246                         factor = DEFAULT_CHANGED_FACTOR.doubleValue();
247                         logger.fine("Unable to load changed factor for " +
248                                 curi.toString());
249                     }
250                     waitInterval = (long)(waitInterval / factor);
251                 } else if(curi.getInt(A_CONTENT_STATE_KEY) ==
252                         CONTENT_UNCHANGED) {
253                     // Had not changed. Increase wait interval time
254
double factor;
255                     try {
256                         factor = ((Double JavaDoc)getAttribute(
257                                 curi,ATTR_UNCHANGED_FACTOR)).doubleValue();
258                     } catch (AttributeNotFoundException JavaDoc e2) {
259                         factor = DEFAULT_UNCHANGED_FACTOR.doubleValue();
260                         logger.fine("Unable to load unchanged factor for " +
261                                 curi.toString());
262                     }
263                     waitInterval = (long)(waitInterval*factor);
264                 }
265             } else {
266                 // If wait element not found, use initial wait interval
267
try {
268                     waitInterval = ((Long JavaDoc)getAttribute(
269                             curi,ATTR_INITIAL_WAIT_INTERVAL)).longValue()*1000;
270                 } catch (AttributeNotFoundException JavaDoc e1) {
271                     // If this fails use default (already set) and log error.
272
logger.fine("Unable to load initial wait interval for " +
273                             curi.toString());
274                 }
275             }
276         }
277         
278         if(waitInterval < min){
279             waitInterval = min;
280         } else if(waitInterval > max){
281             waitInterval = max;
282         }
283         
284         if (logger.isLoggable(Level.FINE)) {
285             logger.fine("URI " + curi.toString() + ", change: "
286                     + curi.getInt(A_CONTENT_STATE_KEY) + " new wait interval: "
287                     + waitInterval);
288         }
289         // Update wait interval
290
curi.putLong(A_WAIT_INTERVAL,waitInterval);
291         curi.putObject(A_WAIT_REEVALUATED,new Boolean JavaDoc(true));
292     }
293 }
294
Popular Tags