1 24 package org.archive.crawler.prefetch; 25 26 import javax.management.AttributeNotFoundException ; 27 28 import org.archive.crawler.datamodel.CrawlURI; 29 import org.archive.crawler.datamodel.FetchStatusCodes; 30 import org.archive.crawler.framework.Scoper; 31 import org.archive.crawler.settings.SimpleType; 32 import org.archive.crawler.settings.Type; 33 import org.archive.util.TextUtils; 34 35 45 public class Preselector extends Scoper 46 implements FetchStatusCodes { 47 48 private static final long serialVersionUID = 3738560264369561017L; 49 50 51 public static final String ATTR_RECHECK_SCOPE = "recheck-scope"; 52 54 public static final String ATTR_BLOCK_ALL = "block-all"; 55 56 public static final String ATTR_BLOCK_BY_REGEXP = "block-by-regexp"; 57 58 public static final String ATTR_ALLOW_BY_REGEXP = "allow-by-regexp"; 59 60 64 public Preselector(String name) { 65 super(name, "Preselector. Does one last bit of checking to make " + 66 "sure that the current URI should be fetched."); 67 Type e; 68 e = addElementToDefinition(new SimpleType(ATTR_RECHECK_SCOPE, 69 "Recheck if uri is in scope. This is meaningful if the scope" + 70 " is altered during a crawl. URIs are checked against the" + 71 " scope when they are added to queues. Setting this value to" + 72 " true forces the URI to be checked against the scope when it" + 73 " is comming out of the queue, possibly after the scope is" + 74 " altered.", new Boolean (false))); 75 e.setExpertSetting(true); 76 77 e = addElementToDefinition(new SimpleType(ATTR_BLOCK_ALL, 78 "Block all URIs from being processed. This is most likely to" + 79 " be used in overrides to easily reject certain hosts from" + 80 " being processed.", new Boolean (false))); 81 e.setExpertSetting(true); 82 83 e = addElementToDefinition(new SimpleType(ATTR_BLOCK_BY_REGEXP, 84 "Block all URIs matching the regular expression from being" + 85 " processed.", "")); 86 e.setExpertSetting(true); 87 88 e = addElementToDefinition(new SimpleType(ATTR_ALLOW_BY_REGEXP, 89 "Allow only URIs matching the regular expression to be" + 90 " processed.", "")); 91 e.setExpertSetting(true); 92 } 93 94 protected void innerProcess(CrawlURI curi) { 95 try { 97 if (((Boolean ) getAttribute(ATTR_BLOCK_ALL, curi)).booleanValue()) { 98 curi.setFetchStatus(S_BLOCKED_BY_USER); 99 curi.skipToProcessorChain(getController(). 100 getPostprocessorChain()); 101 } 102 } catch (AttributeNotFoundException e) { 103 } 105 106 try { 108 String regexp = (String ) getAttribute(ATTR_ALLOW_BY_REGEXP, curi); 109 if (regexp != null && !regexp.equals("")) { 110 if (!TextUtils.matches(regexp, curi.toString())) { 111 curi.setFetchStatus(S_BLOCKED_BY_USER); 112 curi.skipToProcessorChain(getController(). 113 getPostprocessorChain()); 114 } 115 } 116 } catch (AttributeNotFoundException e) { 117 } 119 120 121 try { 123 String regexp = (String ) getAttribute(ATTR_BLOCK_BY_REGEXP, curi); 124 if (regexp != null && !regexp.equals("")) { 125 if (TextUtils.matches(regexp, curi.toString())) { 126 curi.setFetchStatus(S_BLOCKED_BY_USER); 127 curi.skipToProcessorChain(getController(). 128 getPostprocessorChain()); 129 } 130 } 131 } catch (AttributeNotFoundException e) { 132 } 134 135 try { 137 if (((Boolean ) getAttribute(ATTR_RECHECK_SCOPE, curi)). 138 booleanValue()) { 139 if (!isInScope(curi)) { 140 curi.setFetchStatus(S_OUT_OF_SCOPE); 142 curi.skipToProcessorChain(getController(). 143 getPostprocessorChain()); 144 } 145 } 146 } catch (AttributeNotFoundException e) { 147 } 149 } 150 } 151 | Popular Tags |