package org.archive.crawler.postprocessor;

import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
import org.archive.crawler.settings.SimpleType;

/**
 * Processor that computes how long to wait before a URI is fetched again and
 * stores the result on the {@link CrawlURI} under <code>A_WAIT_INTERVAL</code>.
 *
 * <p>The interval settings are configured in seconds (see the attribute
 * names) and are multiplied by 1000 before use, so the stored wait interval
 * is presumably in milliseconds. The new interval is derived from the URI's
 * content state (<code>A_CONTENT_STATE_KEY</code>):
 * <ul>
 * <li>unknown or missing state: the fixed default wait interval;</li>
 * <li>no previous wait interval recorded: the initial wait interval;</li>
 * <li>changed: previous interval divided by the changed factor;</li>
 * <li>unchanged: previous interval multiplied by the unchanged factor.</li>
 * </ul>
 * The result is always clamped to the configured minimum and maximum.
 *
 * <p>A URI that was not fetched successfully, or that is already flagged with
 * <code>A_WAIT_REEVALUATED</code>, is left untouched.
 */
public class WaitEvaluator extends Processor
implements AdaptiveRevisitAttributeConstants {

    private static final long serialVersionUID = 7452762726125458413L;

    Logger logger = Logger.getLogger(WaitEvaluator.class.getName());

    /** Setting name: wait interval for a URI without a previous interval. */
    public final static String ATTR_INITIAL_WAIT_INTERVAL =
        "initial-wait-interval-seconds";
    /** Default initial wait interval, in seconds (86400 s = 1 day). */
    protected final static Long DEFAULT_INITIAL_WAIT_INTERVAL =
        new Long(86400);

    /** Setting name: upper bound for the computed wait interval. */
    public final static String ATTR_MAX_WAIT_INTERVAL =
        "max-wait-interval-seconds";
    /** Default maximum wait interval, in seconds (2419200 s = 28 days). */
    protected final static Long DEFAULT_MAX_WAIT_INTERVAL =
        new Long(2419200);

    /** Setting name: lower bound for the computed wait interval. */
    public final static String ATTR_MIN_WAIT_INTERVAL =
        "min-wait-interval-seconds";
    /** Default minimum wait interval, in seconds (3600 s = 1 hour). */
    protected final static Long DEFAULT_MIN_WAIT_INTERVAL =
        new Long(3600);

    /** Setting name: multiplier applied when the document is unchanged. */
    public final static String ATTR_UNCHANGED_FACTOR = "unchanged-factor";
    /** Default unchanged factor. */
    protected final static Double DEFAULT_UNCHANGED_FACTOR = new Double(1.5);

    /** Setting name: divisor applied when the document has changed. */
    public final static String ATTR_CHANGED_FACTOR = "changed-factor";
    /** Default changed factor. */
    protected final static Double DEFAULT_CHANGED_FACTOR = new Double(1.5);

    /** Setting name: fixed wait interval for unknown content state. */
    public final static String ATTR_DEFAULT_WAIT_INTERVAL =
        "default-wait-interval-seconds";
    /** Default fixed wait interval, in seconds (259200 s = 3 days). */
    protected final static Long DEFAULT_DEFAULT_WAIT_INTERVAL =
        new Long(259200);

    /** Setting name: whether overdue time is added to the old interval. */
    public final static String ATTR_USE_OVERDUE_TIME = "use-overdue-time";
    /** Default for {@link #ATTR_USE_OVERDUE_TIME}. */
    protected final static Boolean DEFAULT_USE_OVERDUE_TIME = Boolean.FALSE;

    /**
     * Creates a wait evaluator with the standard description and the
     * class-level default values for all settings.
     *
     * @param name the name of the processor
     */
    public WaitEvaluator(String name) {
        this(name,
            "Evaluates how long to wait before fetching a URI again. " +
            "Typically, this processor should be in the post processing " +
            "chain. It will pass if another wait evaluator has already " +
            "processed the CrawlURI.",
            DEFAULT_INITIAL_WAIT_INTERVAL,
            DEFAULT_MAX_WAIT_INTERVAL,
            DEFAULT_MIN_WAIT_INTERVAL,
            DEFAULT_UNCHANGED_FACTOR,
            DEFAULT_CHANGED_FACTOR);
    }

    /**
     * Creates a wait evaluator with caller-supplied defaults for the settings
     * it registers. The default wait interval and the use-overdue-time
     * settings always use the class-level defaults.
     *
     * @param name the name of the processor
     * @param description the description of the processor
     * @param default_inital_wait_interval default for
     *            {@link #ATTR_INITIAL_WAIT_INTERVAL}, in seconds
     * @param default_max_wait_interval default for
     *            {@link #ATTR_MAX_WAIT_INTERVAL}, in seconds
     * @param default_min_wait_interval default for
     *            {@link #ATTR_MIN_WAIT_INTERVAL}, in seconds
     * @param default_unchanged_factor default for
     *            {@link #ATTR_UNCHANGED_FACTOR}
     * @param default_changed_factor default for
     *            {@link #ATTR_CHANGED_FACTOR}
     */
    public WaitEvaluator(String name, String description,
            Long default_inital_wait_interval,
            Long default_max_wait_interval,
            Long default_min_wait_interval,
            Double default_unchanged_factor,
            Double default_changed_factor){
        super(name, description);

        addElementToDefinition(new SimpleType(ATTR_INITIAL_WAIT_INTERVAL,
            "The initial wait time between revisits. Will then be " +
            "updated according to crawler experiance. I.e. shorter " +
            "wait, visit more often, if document has changed between " +
            "visits, and vica versa.",
            default_inital_wait_interval));
        addElementToDefinition(new SimpleType(ATTR_MAX_WAIT_INTERVAL,
            "The maximum settable wait time between revisits. Once a " +
            "URIs wait time reaches this value, it will not grow " +
            "further, regardless of subsequent visits that discover " +
            "no changes. Note that this does not ensure that the URI " +
            "does not wait any longer, since the crawler might be " +
            "'behind,' forcing a URI to wait until other URIs, " +
            "scheduled for earlier are completed..",
            default_max_wait_interval));
        addElementToDefinition(new SimpleType(ATTR_MIN_WAIT_INTERVAL,
            "The minum settable wait time between revisits. Once a " +
            "URIs wait time reaches this value, it will not be shortened " +
            "further, regardlesss of subsequent visits that discover " +
            "changes.",
            default_min_wait_interval));
        addElementToDefinition(new SimpleType(ATTR_DEFAULT_WAIT_INTERVAL,
            "Fixed wait time for 'unknown' change status. I.e. wait time " +
            "for URIs whose content change detection is not available.",
            DEFAULT_DEFAULT_WAIT_INTERVAL));
        addElementToDefinition(new SimpleType(ATTR_UNCHANGED_FACTOR,
            "The factor by which a URIs wait time is increased when a " +
            "revisit reveals an unchanged document. A value of 1 will " +
            "leave it unchanged, a value of 2 will double it etc.",
            default_unchanged_factor));
        addElementToDefinition(new SimpleType(ATTR_CHANGED_FACTOR,
            "The factor by which a URIs wait time is decreased when a " +
            "revisit reveals a changed document. A value of 1 will leave " +
            "it unchanged, a value of two will half it etc.",
            default_changed_factor));
        addElementToDefinition(new SimpleType(ATTR_USE_OVERDUE_TIME,
            "Indicates if the amount of time the URI was overdue should " +
            "be added to the wait time before the new wait time is " +
            "calculated.",
            DEFAULT_USE_OVERDUE_TIME));

        // Presumably makes the wait interval survive between visits of the
        // same URI; see CrawlURI.addAlistPersistentMember to confirm.
        CrawlURI.addAlistPersistentMember(A_WAIT_INTERVAL);
    }

    /**
     * Computes the new wait interval for <code>curi</code>, stores it under
     * <code>A_WAIT_INTERVAL</code> and sets <code>A_WAIT_REEVALUATED</code>
     * to true.
     *
     * <p>Does nothing if the URI was not fetched successfully or if
     * <code>A_WAIT_REEVALUATED</code> is already true on it.
     *
     * @param curi the URI being processed
     * @throws InterruptedException declared by the overridden method; not
     *             thrown by this implementation itself
     */
    protected void innerProcess(CrawlURI curi) throws InterruptedException {
        if (!curi.isSuccess()) {
            return;
        }

        if (curi.containsKey(A_WAIT_REEVALUATED) &&
                ((Boolean) curi.getObject(A_WAIT_REEVALUATED))
                    .booleanValue()) {
            // Already handled, e.g. by another wait evaluator.
            return;
        }

        long min = loadIntervalMillis(curi, ATTR_MIN_WAIT_INTERVAL,
            DEFAULT_MIN_WAIT_INTERVAL, "minimum wait interval");
        long max = loadIntervalMillis(curi, ATTR_MAX_WAIT_INTERVAL,
            DEFAULT_MAX_WAIT_INTERVAL, "maximum wait interval");

        // Read the content state once. A missing key is treated exactly like
        // CONTENT_UNKNOWN, and the value is reused for logging below so the
        // key is never looked up when it is known to be absent.
        int contentState = curi.containsKey(A_CONTENT_STATE_KEY)
            ? curi.getInt(A_CONTENT_STATE_KEY)
            : CONTENT_UNKNOWN;

        long waitInterval;
        if (contentState == CONTENT_UNKNOWN) {
            // No change detection available: use the fixed default interval.
            waitInterval = loadIntervalMillis(curi,
                ATTR_DEFAULT_WAIT_INTERVAL, DEFAULT_DEFAULT_WAIT_INTERVAL,
                "default wait interval");
        } else if (curi.containsKey(A_WAIT_INTERVAL)) {
            // Revisit: adapt the previously stored interval.
            waitInterval = curi.getLong(A_WAIT_INTERVAL);

            if (loadUseOverdueTime(curi)) {
                waitInterval += curi.getLong(A_FETCH_OVERDUE);
            }

            if (contentState == CONTENT_CHANGED) {
                double factor = loadFactor(curi, ATTR_CHANGED_FACTOR,
                    DEFAULT_CHANGED_FACTOR, "changed factor");
                waitInterval = (long) (waitInterval / factor);
            } else if (contentState == CONTENT_UNCHANGED) {
                double factor = loadFactor(curi, ATTR_UNCHANGED_FACTOR,
                    DEFAULT_UNCHANGED_FACTOR, "unchanged factor");
                waitInterval = (long) (waitInterval * factor);
            }
        } else {
            // No previous interval recorded: start with the initial one.
            waitInterval = loadIntervalMillis(curi,
                ATTR_INITIAL_WAIT_INTERVAL, DEFAULT_INITIAL_WAIT_INTERVAL,
                "initial wait interval");
        }

        // Clamp to the configured bounds.
        if (waitInterval < min) {
            waitInterval = min;
        } else if (waitInterval > max) {
            waitInterval = max;
        }

        if (logger.isLoggable(Level.FINE)) {
            logger.fine("URI " + curi.toString() + ", change: "
                + contentState + " new wait interval: "
                + waitInterval);
        }
        curi.putLong(A_WAIT_INTERVAL, waitInterval);
        curi.putObject(A_WAIT_REEVALUATED, Boolean.TRUE);
    }

    /**
     * Reads an interval setting given in seconds and returns it multiplied
     * by 1000. If the attribute cannot be found, the supplied default is
     * used and scaled the same way, so the fallback is always in the same
     * unit as a successfully read value.
     *
     * @param curi the URI used as the settings context
     * @param attributeName name of the setting to read
     * @param defaultSeconds fallback value, in seconds
     * @param label human readable setting name for the log message
     * @return the setting (or the fallback) multiplied by 1000
     */
    private long loadIntervalMillis(CrawlURI curi, String attributeName,
            Long defaultSeconds, String label) {
        long seconds;
        try {
            seconds = ((Long) getAttribute(curi, attributeName)).longValue();
        } catch (AttributeNotFoundException e) {
            seconds = defaultSeconds.longValue();
            logger.fine("Unable to load " + label + " for " +
                curi.toString());
        }
        return seconds * 1000;
    }

    /**
     * Reads a factor setting, falling back to the supplied default if the
     * attribute cannot be found.
     *
     * @param curi the URI used as the settings context
     * @param attributeName name of the setting to read
     * @param defaultFactor fallback value
     * @param label human readable setting name for the log message
     * @return the configured factor or the fallback
     */
    private double loadFactor(CrawlURI curi, String attributeName,
            Double defaultFactor, String label) {
        try {
            return ((Double) getAttribute(curi, attributeName))
                .doubleValue();
        } catch (AttributeNotFoundException e) {
            logger.fine("Unable to load " + label + " for " +
                curi.toString());
            return defaultFactor.doubleValue();
        }
    }

    /**
     * Reads the {@link #ATTR_USE_OVERDUE_TIME} setting, falling back to
     * {@link #DEFAULT_USE_OVERDUE_TIME} if the attribute cannot be found.
     *
     * @param curi the URI used as the settings context
     * @return true if the overdue time should be added to the old interval
     */
    private boolean loadUseOverdueTime(CrawlURI curi) {
        try {
            return ((Boolean) getAttribute(curi, ATTR_USE_OVERDUE_TIME))
                .booleanValue();
        } catch (AttributeNotFoundException e) {
            logger.fine("Unable to load use-overdue-time for " +
                curi.toString());
            return DEFAULT_USE_OVERDUE_TIME.booleanValue();
        }
    }
}