package org.archive.crawler.scope;

import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.deciderules.DecidingScope;
import org.archive.crawler.filter.FilePatternFilter;
import org.archive.crawler.filter.TransclusionFilter;
import org.archive.crawler.framework.Filter;
import org.archive.net.UURI;

/**
 * A crawl scope that limits a crawl to the domains of its seeds.
 *
 * <p>A candidate URI is in focus when its host basename equals the host
 * basename of some seed, or ends with {@code "." + seedBasename} (that is,
 * it is a subdomain of a seed's domain). The scope's own description marks
 * it as deprecated in favour of {@link DecidingScope}.
 *
 * <p>Two sub-filters are registered with the settings definition in the
 * constructor: an "additional focus" {@link FilePatternFilter} and a
 * "transitive" {@link TransclusionFilter}.
 *
 * @see DecidingScope
 */
public class DomainScope extends SeedCachingScope {

    private static final long serialVersionUID = 648062105277258820L;

    private static final Logger logger =
        Logger.getLogger(DomainScope.class.getName());

    /** Settings attribute name of the transitive filter. */
    public static final String ATTR_TRANSITIVE_FILTER = "transitiveFilter";

    /** Settings attribute name of the additional focus filter. */
    public static final String ATTR_ADDITIONAL_FOCUS_FILTER =
        "additionalScopeFocus";

    /** Separator placed before a seed domain when testing for subdomains. */
    public static final String DOT = ".";

    Filter additionalFocusFilter;
    Filter transitiveFilter;

    /**
     * Creates the scope and registers its two sub-filters with the settings
     * definition.
     *
     * @param name name handed to the superclass constructor
     */
    public DomainScope(String name) {
        super(name);
        setDescription(
            "DomainScope: A scope for domain crawls *Deprecated* Use " +
            "DecidingScope instead. Crawls made with this" +
            " scope will be limited to the domain of its seeds. It will" +
            " however reach subdomains of the seeds' original domains." +
            " www[#].host is considered to be the same as host.");
        this.additionalFocusFilter = (Filter) addElementToDefinition(
            new FilePatternFilter(ATTR_ADDITIONAL_FOCUS_FILTER));
        this.transitiveFilter = (Filter) addElementToDefinition(
            new TransclusionFilter(ATTR_TRANSITIVE_FILTER));
    }

    /**
     * Delegates the decision to the transitive filter.
     *
     * @param o candidate object to test
     * @return whatever the transitive filter's {@code accepts} returns
     */
    protected boolean transitiveAccepts(Object o) {
        return this.transitiveFilter.accepts(o);
    }

    /**
     * Tests whether the candidate belongs to the domain of any seed.
     *
     * <p>The candidate is accepted when its host basename equals a seed's
     * host basename, or ends with {@code DOT + seedBasename}. Seeds whose
     * basename cannot be determined are skipped. The seed iterator is
     * always handed to {@code checkClose}, even if iteration fails.
     *
     * @param o candidate object; converted with {@code UURI.from(o)}
     * @return {@code true} if the candidate matches a seed's domain;
     *         {@code false} if it does not, if it cannot be converted to a
     *         UURI, or if its host basename is unavailable
     */
    protected boolean focusAccepts(Object o) {
        UURI u = UURI.from(o);
        if (u == null) {
            return false;
        }

        String candidateDomain = hostBasenameOrNull(
            u, "UURI getHostBasename failed for candidate URI: ");
        if (candidateDomain == null) {
            return false;
        }

        Iterator iter = seedsIterator();
        try {
            while (iter.hasNext()) {
                UURI s = (UURI) iter.next();
                // Scoped per iteration so that a failing seed can never be
                // matched using the previous seed's (already dot-prefixed)
                // domain.
                String seedDomain = hostBasenameOrNull(
                    s, "UURI getHostBasename failed for seed: ");
                if (seedDomain == null) {
                    continue;
                }
                if (seedDomain.equals(candidateDomain)) {
                    return true;
                }
                // Subdomain check: candidate must end with ".<seedDomain>".
                if (candidateDomain.endsWith(DOT + seedDomain)) {
                    return true;
                }
            }
            return false;
        } finally {
            // Release the iterator on every exit path, including failures.
            checkClose(iter);
        }
    }

    /**
     * Delegates the decision to the additional focus filter.
     *
     * @param o candidate object to test
     * @return whatever the additional focus filter's {@code accepts} returns
     */
    protected boolean additionalFocusAccepts(Object o) {
        return additionalFocusFilter.accepts(o);
    }

    /**
     * Fetches the host basename of a UURI, logging instead of propagating a
     * {@link URIException}.
     *
     * @param uuri URI to inspect
     * @param failurePrefix log message prefix; the URI is appended to it
     * @return the host basename, or {@code null} if it is unavailable or
     *         the lookup threw
     */
    private static String hostBasenameOrNull(UURI uuri, String failurePrefix) {
        try {
            return uuri.getHostBasename();
        } catch (URIException e) {
            logger.log(Level.SEVERE, failurePrefix + uuri, e);
            return null;
        }
    }
}