package org.archive.crawler.frontier;

import java.io.IOException;
import java.util.Hashtable;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.event.CrawlURIDispositionListener;
import org.archive.crawler.filter.OrFilter;
import org.archive.crawler.filter.URIRegExpFilter;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.prefetch.QuotaEnforcer;
import org.archive.crawler.scope.ClassicScope;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;

/**
 * A {@link BdbFrontier} subclass that enforces a maximum number of
 * successfully downloaded documents per override, per host, or per domain
 * (selected by the {@code counter-mode} attribute). Once a limit is reached,
 * the matching work queue is emptied and an exclude filter is added to the
 * crawl scope so no further URIs for that scope are admitted.
 * <p>
 * Expects an 'exclude-filter' ({@link OrFilter}) to be part of the crawl
 * scope (see {@link ClassicScope#ATTR_EXCLUDE_FILTER}).
 *
 * @deprecated Use {@link BdbFrontier} together with {@link QuotaEnforcer}
 *             instead.
 */
public class DomainSensitiveFrontier extends BdbFrontier
implements CrawlURIDispositionListener {

    private static final long serialVersionUID = -3330190056282726202L;

    private static final Logger logger =
        Logger.getLogger(DomainSensitiveFrontier.class.getName());

    /** Attribute name: maximum documents per host/domain/override (0 = no limit). */
    public static final String ATTR_MAX_DOCS = "max-docs";
    /** Attribute name: which counting mode is in effect. */
    public static final String ATTR_COUNTER_MODE = "counter-mode";
    public static final String COUNT_OVERRIDE = "count-per-override";
    public static final String COUNT_HOST = "count-per-host";
    public static final String COUNT_DOMAIN = "count-per-domain";
    public static final String[] ATTR_AVAILABLE_MODES = new String[] {
        COUNT_OVERRIDE, COUNT_HOST, COUNT_DOMAIN };
    public static final String DEFAULT_MODE = COUNT_OVERRIDE;

    // Download tallies keyed by scope key (override name, host, or domain).
    // Hashtable is synchronized, but callers also synchronize on `this` so
    // the read-then-put sequences below stay atomic.
    private Hashtable<String, Long> hostCounters =
        new Hashtable<String, Long>();
    // True in COUNT_OVERRIDE mode: walk up the settings-override chain and
    // count against every level; false in host/domain modes (single level).
    private boolean countPerOverride = true;
    // One of ATTR_AVAILABLE_MODES; read once in initialize().
    private String counterMode;

    /**
     * Constructor; registers this module's configurable attributes.
     *
     * @param name ignored by this implementation (ATTR_NAME is used instead,
     *     mirroring the superclass convention).
     */
    public DomainSensitiveFrontier(String name) {
        super(ATTR_NAME, "DomainSensitiveFrontier. *Deprecated* Use " +
            "BdbFrontier+QuotaEnforcer instead. " +
            "Overrides BdbFrontier to add specification of number of " +
            "documents to download (Expects 'exclude-filter' " +
            "to be part of CrawlScope).");
        Type e = addElementToDefinition(new SimpleType(ATTR_MAX_DOCS,
            "Maximum number of documents to download for host or domain" +
            " (Zero means no limit).", Long.valueOf(0L)));
        e.setOverrideable(true);
        e = addElementToDefinition(new SimpleType(ATTR_COUNTER_MODE,
            "If " + COUNT_OVERRIDE + ", acts like the crawl " +
            "order maximum download count and the crawler will download " +
            "this total amount of docs only. Override to change the max " +
            "count for the overridden domain or host. " +
            "Else if " + COUNT_HOST + " the crawler will download " +
            ATTR_MAX_DOCS + " per host. Add an override to change " +
            "max count on a per-domain or a per-host basis.For " +
            "example, if you set " + ATTR_MAX_DOCS + " to 30 in " +
            "this mode, the crawler will download 30 docs from " +
            "each host in scope. If you override for kb.se setting " +
            ATTR_MAX_DOCS +
            " to 20, it will instead download only 20 docs from each " +
            "host of kb.se. (It can be a larger as well as a smaller " +
            "value here.). " +
            "Finally " + COUNT_DOMAIN + " behaves similar to " +
            COUNT_HOST +
            ", but instead sets max on a per-domain basis." +
            "Here you can do overrides on the domain-level, but " +
            "not on the host-level. So if you here set " +
            ATTR_MAX_DOCS +
            " to 30 the crawler will download 30 docs from each " +
            "domain in scope. If you override for kb.se setting " +
            ATTR_MAX_DOCS + " to 20, it will instead download only " +
            "20 docs in total from the whole kb.se domain. (It can be " +
            "a larger as well as a smaller value here.)",
            DEFAULT_MODE, ATTR_AVAILABLE_MODES));
        e.setOverrideable(false);
    }

    /**
     * Initializes the frontier, registers as a disposition listener, and
     * caches the configured counter mode.
     *
     * @param c the crawl controller
     * @throws FatalConfigurationException on fatal configuration problems
     *     from the superclass
     * @throws IOException on I/O problems from the superclass
     */
    public void initialize(CrawlController c)
    throws FatalConfigurationException, IOException {
        super.initialize(c);
        this.controller.addCrawlURIDispositionListener(this);
        try {
            counterMode = (String) getAttribute(ATTR_COUNTER_MODE);
            // Only COUNT_OVERRIDE walks the whole settings-override chain.
            countPerOverride = !(counterMode.equalsIgnoreCase(COUNT_DOMAIN)
                || counterMode.equalsIgnoreCase(COUNT_HOST));
        } catch (AttributeNotFoundException e) {
            // Should not happen: the attribute has a default. Log with the
            // stack trace preserved instead of printStackTrace().
            logger.log(Level.SEVERE, "Failed to read " + ATTR_COUNTER_MODE, e);
        } catch (MBeanException e) {
            logger.log(Level.SEVERE, "Failed to read " + ATTR_COUNTER_MODE, e);
        } catch (ReflectionException e) {
            logger.log(Level.SEVERE, "Failed to read " + ATTR_COUNTER_MODE, e);
        }
    }

    /**
     * Derives the counter key for the given settings level and host,
     * according to the configured counter mode:
     * override mode uses the settings scope (or "root" at the top level),
     * host mode uses the host name, and domain mode uses the last two
     * labels of the host name (e.g. "www.kb.se" -> "kb.se").
     *
     * @param cs current settings level (used only in override mode)
     * @param host the URI's host name
     * @return the key under which downloads are counted
     */
    private String scopeKeyFor(CrawlerSettings cs, String host) {
        if (counterMode.equalsIgnoreCase(COUNT_OVERRIDE)) {
            return cs.getScope() != null ? cs.getScope() : "root";
        }
        if (counterMode.equalsIgnoreCase(COUNT_HOST)) {
            return host;
        }
        // COUNT_DOMAIN: strip everything before the second-to-last dot.
        // With zero or one dot both lastIndexOf calls yield -1 and the
        // whole host is returned unchanged.
        int i = host.lastIndexOf(".");
        i = host.lastIndexOf(".", i - 1);
        return host.substring(i + 1);
    }

    /**
     * Checks whether the limit for this URI's scope has been reached and, if
     * so, empties the matching work queue and adds an exclude filter to the
     * crawl scope so no further URIs for that scope are admitted.
     *
     * @param curi the just-disposed URI
     * @return true if a limit was hit and the scope was excluded
     */
    private synchronized boolean checkDownloadLimits(CrawlURI curi) {
        long thisMaxDocs = 0;
        long thisCounter = 0;
        boolean discarded = false;
        boolean retVal = false;
        if (curi.getUURI().getScheme().equals("dns")) {
            // DNS lookups never count against download limits.
            return false;
        }
        try {
            String host = curi.getUURI().getHost();
            CrawlerSettings cs = controller.getSettingsHandler().
                getSettings(host);
            // In override mode, check every level of the override chain;
            // otherwise run this loop exactly once.
            do {
                String scope = scopeKeyFor(cs, host);
                thisMaxDocs =
                    ((Long) getAttribute(cs, ATTR_MAX_DOCS)).longValue();
                Long seen = this.hostCounters.get(scope);
                thisCounter = seen != null ? seen.longValue() : 0;
                if (thisMaxDocs > 0 && thisCounter >= thisMaxDocs) {
                    logger.fine("Discarding Queue: " + host + " ");
                    curi.addAnnotation("dsfLimit");
                    if (!discarded) {
                        // Drop everything still queued for this URI's queue
                        // and fix up the frontier's queued-count bookkeeping.
                        long count = 0;
                        WorkQueue wq = getQueueFor(curi);
                        wq.unpeek();
                        count += wq.deleteMatching(this, ".*");
                        decrementQueuedCount(count);
                        discarded = true;
                    }
                    // Keep future URIs for this scope out by extending the
                    // scope's exclude filter.
                    OrFilter or = (OrFilter) this.controller.getScope()
                        .getAttribute(ClassicScope.ATTR_EXCLUDE_FILTER);
                    // "root" means the whole crawl is done: exclude all.
                    String filter = scope.equalsIgnoreCase("root") ?
                        ".*" : "^((https?://)?[a-zA-Z0-9\\.]*)" + scope +
                        "($|/.*)";
                    logger.fine("Adding filter: [" + filter + "].");
                    URIRegExpFilter urf =
                        new URIRegExpFilter(curi.toString(), filter);
                    or.addFilter(this.controller.getSettingsHandler().
                        getSettings(null), urf);
                    thisMaxDocs = 0;
                    thisCounter = 0;
                    retVal = true;
                }
            } while ((cs = cs.getParent()) != null && countPerOverride);
        } catch (Exception e) {
            // Log with the full stack trace attached rather than
            // printStackTrace() to stderr.
            logger.log(Level.SEVERE, "checkDownloadLimits() failed while "
                + "processing {" + curi + "}", e);
        }
        return retVal;
    }

    /**
     * Bumps the download tally for every scope key this URI counts against
     * (all override levels in override mode, otherwise just one key).
     *
     * @param curi the successfully downloaded URI
     */
    protected synchronized void incrementHostCounters(CrawlURI curi) {
        if (!curi.getUURI().toString().startsWith("dns:")) {
            try {
                String host = curi.getUURI().getHost();
                CrawlerSettings cs =
                    controller.getSettingsHandler().getSettings(host);
                do {
                    String scope = scopeKeyFor(cs, host);
                    Long seen = this.hostCounters.get(scope);
                    long counter = seen != null ? seen.longValue() : 0;
                    this.hostCounters.put(scope, Long.valueOf(counter + 1));
                } while ((cs = cs.getParent()) != null && countPerOverride);
            } catch (Exception e) {
                logger.log(Level.SEVERE,
                    "incrementHostCounters() failed for {" + curi + "}", e);
            }
        }
    }

    /** Counts the success, then enforces limits. */
    public void crawledURISuccessful(CrawlURI curi) {
        incrementHostCounters(curi);
        checkDownloadLimits(curi);
    }

    /** Retries do not count against limits. */
    public void crawledURINeedRetry(CrawlURI curi) {
    }

    /** Disregarded URIs do not count against limits. */
    public void crawledURIDisregard(CrawlURI curi) {
    }

    /** Failures do not count against limits. */
    public void crawledURIFailure(CrawlURI curi) {
    }
}