1 26 package org.archive.crawler.postprocessor; 27 28 import java.util.Collection ; 29 import java.util.HashSet ; 30 import java.util.Iterator ; 31 import java.util.logging.Level ; 32 import java.util.logging.Logger ; 33 34 import org.apache.commons.httpclient.URIException; 35 import org.archive.crawler.datamodel.CandidateURI; 36 import org.archive.crawler.datamodel.CrawlURI; 37 import org.archive.crawler.datamodel.FetchStatusCodes; 38 import org.archive.crawler.extractor.Link; 39 import org.archive.crawler.framework.Filter; 40 import org.archive.crawler.framework.Scoper; 41 import org.archive.crawler.settings.MapType; 42 import org.archive.crawler.settings.SimpleType; 43 import org.archive.crawler.settings.Type; 44 45 59 public class LinksScoper extends Scoper 60 implements FetchStatusCodes { 61 62 private static final long serialVersionUID = -4074442117992496793L; 63 64 private static Logger LOGGER = 65 Logger.getLogger(LinksScoper.class.getName()); 66 67 private final static String ATTR_SEED_REDIRECTS_NEW_SEEDS = 68 "seed-redirects-new-seed"; 69 70 private final static Boolean DEFAULT_SEED_REDIRECTS_NEW_SEEDS = 71 new Boolean (true); 72 73 public static final String ATTR_LOG_REJECT_FILTERS = 74 "scope-rejected-url-filters"; 75 76 public static final String ATTR_PREFERENCE_DEPTH_HOPS = 77 "preference-depth-hops"; 78 79 private final static Integer DEFAULT_PREFERENCE_DEPTH_HOPS = 80 new Integer (-1); 81 82 85 private MapType rejectLogFilters = null; 86 87 90 public LinksScoper(String name) { 91 super(name, "LinksScoper. Rules on which extracted links " + 92 "are within configured scope."); 93 94 Type t; 95 t = addElementToDefinition( 96 new SimpleType(ATTR_SEED_REDIRECTS_NEW_SEEDS, 97 "If enabled, any URL found because a seed redirected to it " + 98 "(original seed returned 301 or 302), will also be treated " + 99 "as a seed.", DEFAULT_SEED_REDIRECTS_NEW_SEEDS)); 100 t.setExpertSetting(true); 101 102 t = addElementToDefinition(new SimpleType(ATTR_PREFERENCE_DEPTH_HOPS, 103 "Number of hops (of any sort) from a seed up to which a URI has higher " + 104 "priority scheduling than any remaining seed. For example, if set to 1 items one " + 105 "hop (link, embed, redirect, etc.) away from a seed will be scheduled " + 106 "with HIGH priority. If set to -1, no " + 107 "preferencing will occur, and a breadth-first search with seeds " + 108 "processed before discovered links will proceed. If set to zero, a " + 109 "purely depth-first search will proceed, with all discovered links processed " + 110 "before remaining seeds. Seed redirects are treated as one hop from a seed.", 111 DEFAULT_PREFERENCE_DEPTH_HOPS)); 112 t.setExpertSetting(true); 113 114 this.rejectLogFilters = (MapType)addElementToDefinition( 115 new MapType(ATTR_LOG_REJECT_FILTERS, "Filters applied after " + 116 "an URI has been rejected. If filter return " + 117 "TRUE, the URI is logged (if the logging level is INFO). " + 118 "Depends on " + ATTR_OVERRIDE_LOGGER_ENABLED + 119 " being enabled.", Filter.class)); 120 this.rejectLogFilters.setExpertSetting(true); 121 } 122 123 protected void innerProcess(final CrawlURI curi) { 124 if (LOGGER.isLoggable(Level.FINEST)) { 125 LOGGER.finest(getName() + " processing " + curi); 126 } 127 128 if (curi.hasPrerequisiteUri()) { 130 handlePrerequisite(curi); 131 return; 132 } 133 134 if (curi.getFetchStatus() < 200 || curi.getFetchStatus() >= 400) { 136 curi.clearOutlinks(); 137 return; 138 } 139 140 if (curi.outlinksSize() <= 0) { 141 return; 143 } 144 145 final boolean redirectsNewSeeds = ((Boolean )getUncheckedAttribute(curi, 146 ATTR_SEED_REDIRECTS_NEW_SEEDS)).booleanValue(); 147 int preferenceDepthHops = ((Integer )getUncheckedAttribute(curi, 148 ATTR_PREFERENCE_DEPTH_HOPS)).intValue(); 149 Collection <CandidateURI> inScopeLinks = new HashSet <CandidateURI>(); 150 for (final Iterator i = curi.getOutObjects().iterator(); i.hasNext();) { 151 Object o = i.next(); 152 if(o instanceof Link){ 153 final Link wref = (Link)o; 154 try { 155 final int directive = getSchedulingFor(curi, wref, 156 preferenceDepthHops); 157 final CandidateURI caURI = 158 curi.createCandidateURI(curi.getBaseURI(), wref, 159 directive, 160 considerAsSeed(curi, wref, redirectsNewSeeds)); 161 if (isInScope(caURI)) { 162 inScopeLinks.add(caURI); 163 } 164 } catch (URIException e) { 165 getController().logUriError(e, curi.getUURI(), 166 wref.getDestination().toString()); 167 } 168 } else if(o instanceof CandidateURI){ 169 CandidateURI caURI = (CandidateURI)o; 170 if(isInScope(caURI)){ 171 inScopeLinks.add(caURI); 172 } 173 } else { 174 LOGGER.severe("Unexpected type: " + o); 175 } 176 } 177 curi.replaceOutlinks(inScopeLinks); 180 } 181 182 187 protected void handlePrerequisite(CrawlURI curi) { 188 try { 189 CandidateURI caUri = 191 curi.createCandidateURI(curi.getBaseURI(), 192 (Link) curi.getPrerequisiteUri()); 193 int prereqPriority = curi.getSchedulingDirective() - 1; 194 if (prereqPriority < 0) { 195 prereqPriority = 0; 196 LOGGER.severe("Unable to promote prerequisite " + caUri + 197 " above " + curi); 198 } 199 caUri.setSchedulingDirective(prereqPriority); 200 caUri.setForceFetch(true); 201 if(isInScope(caUri)) { 202 curi.setPrerequisiteUri(caUri); 204 } else { 205 curi.setFetchStatus(S_PREREQUISITE_UNSCHEDULABLE_FAILURE); 208 } 209 } catch (URIException ex) { 210 Object [] array = {curi, curi.getPrerequisiteUri()}; 211 getController().uriErrors.log(Level.INFO,ex.getMessage(), array); 212 } catch (NumberFormatException e) { 213 Object [] array = {curi, curi.getPrerequisiteUri()}; 215 getController().uriErrors.log(Level.INFO,e.getMessage(), array); 216 } 217 } 218 219 protected void outOfScope(CandidateURI caUri) { 220 super.outOfScope(caUri); 221 if (!LOGGER.isLoggable(Level.INFO)) { 222 return; 223 } 224 CrawlURI curi = (caUri instanceof CrawlURI)? 226 (CrawlURI)caUri: 227 new CrawlURI(caUri.getUURI()); 228 if (filtersAccept(this.rejectLogFilters, curi)) { 229 LOGGER.info(curi.getUURI().toString()); 230 } 231 } 232 233 private boolean considerAsSeed(final CrawlURI curi, final Link wref, 234 final boolean redirectsNewSeeds) { 235 if (curi.isSeed() 237 && (curi.getFetchStatus() == 301 || 238 curi.getFetchStatus() == 302) 239 && wref.getHopType() == Link.REFER_HOP) { 240 if (redirectsNewSeeds) { 242 return true; 243 } 244 } 245 return false; 246 } 247 248 255 protected int getSchedulingFor(final CrawlURI curi, final Link wref, 256 final int preferenceDepthHops) { 257 final char c = wref.getHopType(); 258 if (LOGGER.isLoggable(Level.FINEST)) { 259 LOGGER.finest(curi + " with path=" + curi.getPathFromSeed() + 260 " isSeed=" + curi.isSeed() + " with fetchStatus=" + 261 curi.getFetchStatus() + " -> " + wref.getDestination() + 262 " type " + c + " with context=" + wref.getContext()); 263 } 264 265 switch (c) { 266 case Link.REFER_HOP: 267 return (preferenceDepthHops >= 0 ? CandidateURI.HIGH : 270 CandidateURI.MEDIUM); 271 default: 272 if (preferenceDepthHops == 0) 273 return CandidateURI.HIGH; 274 if (preferenceDepthHops > 0 && 279 curi.getPathFromSeed().length() + 1 <= preferenceDepthHops) 280 return CandidateURI.HIGH; 281 return CandidateURI.NORMAL; 283 } 284 } 285 } 286 | Popular Tags |