package org.archive.crawler.scope;

import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.deciderules.DecidingScope;
import org.archive.crawler.filter.FilePatternFilter;
import org.archive.crawler.filter.TransclusionFilter;
import org.archive.crawler.framework.Filter;
import org.archive.net.UURI;

/**
 * A crawl scope that keeps a crawl inside the path portion of its seeds.
 *
 * <p>A candidate URI is in focus when it is on the same host as some seed and
 * its path starts with that seed's path up to (but not including) the seed
 * path's last {@code '/'}.
 *
 * <p>The description registered by the constructor marks this scope as
 * deprecated in favour of {@link DecidingScope}.
 */
public class PathScope extends SeedCachingScope {

    private static final long serialVersionUID = -2217024073240277527L;

    // The logger name is kept exactly as it was so existing logging
    // configuration keyed on it continues to apply.
    private static final Logger logger =
        Logger.getLogger("org.archive.crawler.basic.PathScope");

    public static final String ATTR_TRANSITIVE_FILTER = "transitiveFilter";
    public static final String ATTR_ADDITIONAL_FOCUS_FILTER =
        "additionalScopeFocus";

    Filter additionalFocusFilter;
    Filter transitiveFilter;

    /**
     * Creates the scope, sets its description and registers its two filters
     * (a {@link FilePatternFilter} for the additional focus and a
     * {@link TransclusionFilter} for transitive inclusion) in the definition.
     *
     * @param name name passed through to the superclass constructor
     */
    public PathScope(String name) {
        super(name);
        setDescription(
            "PathScope: A scope for path crawls *Deprecated* Use " +
            "DecidingScope instead. Crawls made with this scope" +
            " will be limited to a specific portion of the hosts its seeds" +
            " provide. More specifically the paths those seeds provide." +
            " For example if one of the seeds is 'archive.org/example/'" +
            " all URIs under the path 'examples' will be crawled (like" +
            " 'archive.org/examples/hello.html') but not URIs in other" +
            " paths or root (i.e. 'archive.org/index.html).");
        this.additionalFocusFilter = (Filter) addElementToDefinition(
                new FilePatternFilter(ATTR_ADDITIONAL_FOCUS_FILTER));
        this.transitiveFilter = (Filter) addElementToDefinition(
                new TransclusionFilter(ATTR_TRANSITIVE_FILTER));
    }

    /**
     * Asks the transitive filter whether {@code o} is accepted.
     *
     * @param o object to test
     * @return {@code true} when no transitive filter is set, otherwise the
     *         filter's verdict
     */
    protected boolean transitiveAccepts(Object o) {
        if (this.transitiveFilter == null) {
            return true;
        }
        return this.transitiveFilter.accepts(o);
    }

    /**
     * Checks whether {@code o} lies under the path of a seed on the same host.
     *
     * <p>Each seed that {@code isSameHost} reports as sharing the candidate's
     * host is compared by path; seeds or candidates with a {@code null} path
     * are skipped. A {@link URIException} while reading a path is logged once
     * and the next seed is tried. The seed iterator is always handed to
     * {@code checkClose}, including when an unexpected exception escapes.
     *
     * @param o object to test; converted with {@code UURI.from(o)}
     * @return {@code true} on the first matching seed; {@code false} when
     *         {@code o} cannot be converted to a {@link UURI} or no seed
     *         matches
     */
    protected boolean focusAccepts(Object o) {
        UURI u = UURI.from(o);
        if (u == null) {
            return false;
        }
        // Left as a raw Iterator: the declared types of seedsIterator() and
        // checkClose() are not visible from this file.
        Iterator iter = seedsIterator();
        try {
            while (iter.hasNext()) {
                UURI s = (UURI) iter.next();
                if (!isSameHost(s, u)) {
                    continue;
                }
                try {
                    // Read each path once; a single try block means a failure
                    // is reported once rather than once per access.
                    String seedPath = s.getPath();
                    String candidatePath = u.getPath();
                    if (seedPath == null || candidatePath == null) {
                        continue;
                    }
                    if (startsWithSeedDirectory(seedPath, candidatePath)) {
                        return true;
                    }
                } catch (URIException e) {
                    logger.log(Level.SEVERE, "Failed get path on " + u
                        + " or " + s + ": " + e.getMessage(), e);
                }
            }
            return false;
        } finally {
            checkClose(iter);
        }
    }

    /**
     * Tests whether {@code candidatePath} begins with {@code seedPath} cut at
     * its last {@code '/'} (the slash itself is not part of the prefix).
     *
     * <p>NOTE(review): when {@code seedPath} contains no {@code '/'} the
     * prefix length is -1; per the {@code String.regionMatches} contract a
     * negative length appears to yield {@code true}, which matches what the
     * previous inline expression did. Confirm this is the intended outcome.
     */
    private static boolean startsWithSeedDirectory(String seedPath,
            String candidatePath) {
        int prefixLength = seedPath.lastIndexOf('/');
        return seedPath.regionMatches(0, candidatePath, 0, prefixLength);
    }

    /**
     * Asks the additional focus filter whether {@code o} is accepted.
     *
     * @param o object to test
     * @return the verdict of {@code additionalFocusFilter}
     */
    @Override
    protected boolean additionalFocusAccepts(Object o) {
        return this.additionalFocusFilter.accepts(o);
    }

}