1 25 package org.archive.crawler.postprocessor; 26 27 import javax.management.AttributeNotFoundException ; 28 29 import org.archive.crawler.datamodel.CrawlURI; 30 import org.archive.crawler.settings.SimpleType; 31 import org.archive.util.TextUtils; 32 33 42 public class ContentBasedWaitEvaluator extends WaitEvaluator { 43 44 private static final long serialVersionUID = 1623347208782997347L; 45 46 47 public final static String ATTR_CONTENT_REGEXPR = 48 "content-regular-expression"; 49 protected final static String DEFAULT_CONTENT_REGEXPR = "^.*$"; 51 56 public ContentBasedWaitEvaluator(String name) { 57 this(name,"Evaluates how long to wait before fetching a URI again. " + 58 "Only handles CrawlURIs whose content type matches the " + 59 "regular expression set. " + 60 "Typically, this processor should be in the post processing " + 61 "chain. It will pass if another wait evaluator has already " + 62 "processed the CrawlURI.", DEFAULT_CONTENT_REGEXPR, 63 DEFAULT_INITIAL_WAIT_INTERVAL, 64 DEFAULT_MAX_WAIT_INTERVAL, 65 DEFAULT_MIN_WAIT_INTERVAL, 66 DEFAULT_UNCHANGED_FACTOR, 67 DEFAULT_CHANGED_FACTOR); 68 } 69 70 84 public ContentBasedWaitEvaluator(String name, String description, 85 String defaultRegExpr, 86 Long default_inital_wait_interval, 87 Long default_max_wait_interval, 88 Long default_min_wait_interval, 89 Double default_unchanged_factor, 90 Double default_changed_factor){ 91 super(name,description, 92 default_inital_wait_interval, 93 default_max_wait_interval, 94 default_min_wait_interval, 95 default_unchanged_factor, 96 default_changed_factor); 97 98 addElementToDefinition(new SimpleType(ATTR_CONTENT_REGEXPR, 99 "Only URIs whose content type matches this regular " + 100 "expression will be evaluated.", 101 defaultRegExpr)); 102 103 } 104 105 protected void innerProcess(CrawlURI curi) throws InterruptedException { 106 String content_type = curi.getContentType(); 108 if(content_type==null){ 109 return; 111 } 112 String regexpr; 113 try { 114 regexpr = (String )getAttribute(curi,ATTR_CONTENT_REGEXPR); 115 } catch (AttributeNotFoundException e) { 116 logger.warning("Regular expression for content type not found"); 117 return; 118 } 119 120 if(TextUtils.matches(regexpr, content_type) == false){ 121 return; 123 } 124 126 super.innerProcess(curi); 127 } 128 } 129 | Popular Tags |