package org.archive.crawler.postprocessor;

import java.util.Collection;
import java.util.HashSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Filter;
import org.archive.crawler.framework.Scoper;
import org.archive.crawler.settings.MapType;

/**
 * Supplementary scoping pass over the outlinks carried by a {@link CrawlURI}.
 *
 * <p>Each out-candidate is run through the filters configured under
 * {@link #ATTR_LINK_FILTERS}; candidates the filters reject are dropped from
 * the URI's outlinks. Per this processor's own description it is meant to run
 * after LinkScoper and before FrontierScheduler. Rejected links are logged at
 * INFO level and accepted ones at FINER level.
 */
public class SupplementaryLinksScoper extends Scoper {

    private static final long serialVersionUID = -775819977752790418L;

    /** Class logger; immutable so it cannot be swapped out at runtime. */
    private static final Logger LOGGER =
        Logger.getLogger(SupplementaryLinksScoper.class.getName());

    /** Name of the settings element that holds the link filters. */
    public static final String ATTR_LINK_FILTERS = "link-filters";

    /** Filters applied to each link; registered in the constructor. */
    private MapType filters;

    /**
     * Creates the processor and registers its filter map as an expert setting.
     *
     * @param name name handed to the superclass constructor
     */
    public SupplementaryLinksScoper(String name) {
        super(name, "SupplementaryLinksScoper. Use to do supplementary " +
            "processing of in-scope links. Will run each link through " +
            "configured filters. Must be run after LinkScoper and " +
            "before FrontierScheduler. " +
            "Optionally logs rejected links (Enable " +
            ATTR_OVERRIDE_LOGGER_ENABLED + " and set logger level " +
            "at INFO or above).");

        this.filters = (MapType)addElementToDefinition(
            new MapType(ATTR_LINK_FILTERS, "Filters to apply to each " +
                "link carried by the passed CrawlURI.", Filter.class));
        this.filters.setExpertSetting(true);
    }

    /**
     * Replaces the outlinks of {@code curi} with the subset accepted by
     * {@link #isInScope(CandidateURI)}.
     *
     * @param curi URI whose out-candidates are to be filtered
     */
    protected void innerProcess(final CrawlURI curi) {
        // Nothing to do when the URI has a prerequisite URI or no outlinks.
        if (curi.hasPrerequisiteUri() || curi.outlinksSize() <= 0) {
            return;
        }

        final Collection<CandidateURI> inScopeLinks =
            new HashSet<CandidateURI>();
        for (CandidateURI cauri : curi.getOutCandidates()) {
            if (isInScope(cauri)) {
                inScopeLinks.add(cauri);
            }
        }
        // The replacement may be an empty collection when every link was
        // rejected.
        curi.replaceOutlinks(inScopeLinks);
    }

    /**
     * Tests a single candidate against the configured filters.
     *
     * @param caUri candidate link to test
     * @return {@code true} if the filters accept the candidate; {@code false}
     *         otherwise, in which case {@link #outOfScope(CandidateURI)} has
     *         been called for it
     */
    protected boolean isInScope(CandidateURI caUri) {
        // filtersAccept is invoked with a CrawlURI, so a plain CandidateURI
        // is wrapped in a new CrawlURI built from its UURI alone.
        final CrawlURI curi = (caUri instanceof CrawlURI)
            ? (CrawlURI) caUri
            : new CrawlURI(caUri.getUURI());

        if (!filtersAccept(this.filters, curi)) {
            outOfScope(caUri);
            return false;
        }

        if (LOGGER.isLoggable(Level.FINER)) {
            LOGGER.finer("Accepted: " + caUri);
        }
        return true;
    }

    /**
     * Called when a candidate is ruled out of scope; logs its UURI at INFO
     * level when that level is enabled.
     *
     * @param caUri candidate that was rejected
     */
    protected void outOfScope(CandidateURI caUri) {
        if (!LOGGER.isLoggable(Level.INFO)) {
            return;
        }
        LOGGER.info(caUri.getUURI().toString());
    }
}