1 24 package org.archive.crawler.scope; 25 26 import java.util.Iterator ; 27 28 import org.archive.crawler.deciderules.DecidingScope; 29 import org.archive.crawler.filter.FilePatternFilter; 30 import org.archive.crawler.filter.TransclusionFilter; 31 import org.archive.crawler.framework.Filter; 32 import org.archive.net.UURI; 33 34 67 public class HostScope extends SeedCachingScope { 68 69 private static final long serialVersionUID = -6257664892667267266L; 70 71 public static final String ATTR_TRANSITIVE_FILTER = "transitiveFilter"; 72 public static final String ATTR_ADDITIONAL_FOCUS_FILTER = 73 "additionalScopeFocus"; 74 75 Filter additionalFocusFilter; 76 Filter transitiveFilter; 77 78 public HostScope(String name) { 79 super(name); 80 setDescription( 81 "HostScope: A scope for host crawls *Deprecated* Use " + 82 "DecidingScope instead. Crawls made with this scope" + 83 " will be limited to the hosts its seeds. Thus if one of" + 84 " the seeds is 'archive.org' the subdomain" + 85 " 'crawler.archive.org' will not be crawled." + 86 " 'www.host' is considered to be the same as host."); 87 additionalFocusFilter = (Filter) addElementToDefinition( 88 new FilePatternFilter(ATTR_ADDITIONAL_FOCUS_FILTER)); 89 this.transitiveFilter = (Filter) addElementToDefinition( 90 new TransclusionFilter(ATTR_TRANSITIVE_FILTER)); 91 } 92 93 97 protected boolean transitiveAccepts(Object o) { 98 if (this.transitiveFilter == null) { 99 return true; 100 } 101 return this.transitiveFilter.accepts(o); 102 } 103 104 108 protected boolean focusAccepts(Object o) { 109 UURI u = UURI.from(o); 110 if (u == null) { 111 return false; 112 } 113 Iterator iter = seedsIterator(); 115 while(iter.hasNext()) { 116 if (isSameHost((UURI)iter.next(), u)) { 117 checkClose(iter); 118 return true; 119 } 120 } 121 checkClose(iter); 123 return false; 124 } 125 126 127 @Override 129 protected boolean additionalFocusAccepts(Object o) { 130 return additionalFocusFilter.accepts(o); 131 } 132 133 } 134 | Popular Tags |