1 24 package org.archive.crawler.scope; 25 26 import javax.management.AttributeNotFoundException ; 27 28 import org.archive.crawler.datamodel.CandidateURI; 29 import org.archive.crawler.extractor.Link; 30 import org.archive.crawler.framework.CrawlScope; 32 import org.archive.crawler.settings.SimpleType; 33 34 58 public class ClassicScope extends CrawlScope { 59 60 private static final long serialVersionUID = 4494905304855590002L; 61 62 65 public static final String ATTR_EXCLUDE_FILTER = "exclude-filter"; 66 public static final String ATTR_FORCE_ACCEPT_FILTER = "force-accept-filter"; 67 68 public static final String ATTR_MAX_LINK_HOPS = "max-link-hops"; 69 70 public static final String ATTR_MAX_TRANS_HOPS = "max-trans-hops"; 71 72 74 @SuppressWarnings ("deprecation") 75 private org.archive.crawler.filter.OrFilter excludeFilter; 76 @SuppressWarnings ("deprecation") 77 private org.archive.crawler.filter.OrFilter forceAcceptFilter; 78 79 83 @SuppressWarnings ("deprecation") 84 public ClassicScope(String name) { 85 super(name); 86 addElementToDefinition(new SimpleType(ATTR_MAX_LINK_HOPS, 87 "Max link hops to include. URIs more than this number " 88 + "of links from a seed will not be ruled in-scope. (Such " 89 + "determination does not preclude later inclusion if a " 90 + "shorter path is later discovered.)", new Integer (25))); 91 addElementToDefinition(new SimpleType(ATTR_MAX_TRANS_HOPS, 92 "Max transitive hops (embeds, referrals, preconditions) to " + 93 "include. URIs reached by more than this number of transitive " + 94 "hops will not be ruled in-scope, even if otherwise on an " + 95 "in-focus site. (Such determination does not preclude later " + 96 " inclusion if a shorter path is later discovered.)", 97 new Integer (5))); 98 this.excludeFilter = (org.archive.crawler.filter.OrFilter) 99 addElementToDefinition(new org.archive.crawler.filter.OrFilter( 100 ATTR_EXCLUDE_FILTER)); 101 this.forceAcceptFilter = (org.archive.crawler.filter.OrFilter) 102 addElementToDefinition( 103 new org.archive.crawler.filter.OrFilter( 104 ATTR_FORCE_ACCEPT_FILTER)); 105 this.forceAcceptFilter.setExpertSetting(true); 106 107 setPreservedFields(new String [] { ATTR_SEEDS, ATTR_MAX_LINK_HOPS, 110 ATTR_MAX_TRANS_HOPS, ATTR_EXCLUDE_FILTER, 111 ATTR_FORCE_ACCEPT_FILTER }); 112 } 113 114 117 public ClassicScope() { 118 this(CrawlScope.ATTR_NAME); 119 } 120 121 130 protected final boolean innerAccepts(Object o) { 131 return forceAccepts(o) || (((isSeed(o) || focusAccepts(o)) || 132 additionalFocusAccepts(o) || transitiveAccepts(o)) && 133 !excludeAccepts(o)); 134 } 135 136 145 protected boolean additionalFocusAccepts(Object o) { 146 return false; 147 } 148 149 154 protected boolean transitiveAccepts(Object o) { 155 return false; 156 } 157 158 162 protected boolean forceAccepts(Object o) { 163 return false; 164 } 165 166 175 protected boolean focusAccepts(Object o) { 176 return false; 178 } 179 180 187 @SuppressWarnings ("deprecation") 188 protected boolean excludeAccepts(Object o) { 189 return (this.excludeFilter.isEmpty(o)) ? exceedsMaxHops(o) 190 : this.excludeFilter.accepts(o) || exceedsMaxHops(o); 191 } 192 193 200 protected boolean exceedsMaxHops(Object o) { 201 if (!(o instanceof CandidateURI)) { 202 return false; 203 } 204 205 int maxLinkHops = 0; 206 208 try { 209 maxLinkHops = ((Integer ) getAttribute(o, ATTR_MAX_LINK_HOPS)) 210 .intValue(); 211 } catch (AttributeNotFoundException e) { 214 e.printStackTrace(); 216 } 217 218 CandidateURI cand = (CandidateURI) o; 219 220 String path = cand.getPathFromSeed(); 221 int linkCount = 0; 222 int transCount = 0; 223 for (int i = path.length() - 1; i >= 0; i--) { 224 if (path.charAt(i) == Link.NAVLINK_HOP) { 225 linkCount++; 226 } else if (linkCount == 0) { 227 transCount++; 228 } 229 } 230 return (linkCount > maxLinkHops); 233 } 234 235 239 @SuppressWarnings ("deprecation") 240 public void kickUpdate() { 241 super.kickUpdate(); 242 excludeFilter.kickUpdate(); 243 } 244 } 245 | Popular Tags |