package org.archive.crawler.filter;

import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.deciderules.DecideRule;
import org.archive.crawler.deciderules.DecidingFilter;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;

/**
 * Regular-expression filter aimed at crawler traps in which the same path
 * segment is repeated over and over (for example
 * <code>/a/b/b/b/b/</code>).
 *
 * <p>The regular expression is not configured directly; it is generated
 * from the {@link #ATTR_REPETITIONS} setting each time
 * {@link #getRegexp(Object)} is called.
 *
 * @deprecated Use {@link DecidingFilter} with an equivalent
 *             {@link DecideRule} instead (as stated in this filter's own
 *             description text).
 */
public class PathologicalPathFilter extends URIRegExpFilter {

    private static final long serialVersionUID = 2797805167250054353L;

    private static final Logger logger =
        Logger.getLogger(PathologicalPathFilter.class.getName());

    /** Name of the setting holding the repetition limit. */
    public static final String ATTR_REPETITIONS = "repetitions";

    /** Default value registered for {@link #ATTR_REPETITIONS}. */
    public static final Integer DEFAULT_REPETITIONS = Integer.valueOf(3);

    /*
     * The generated expression has the shape
     *     .*?/(.*?/)\1{N,}.*
     * i.e. a '/'-terminated segment that is immediately followed by at
     * least N further copies of itself (back-reference \1).
     */
    private static final String REGEX_PREFIX = ".*?/(.*?/)\\1{";
    private static final String REGEX_SUFFIX = ",}.*";

    /**
     * Creates the filter, sets its description and registers the
     * {@link #ATTR_REPETITIONS} setting with its default value.
     *
     * @param name the name handed to the superclass constructor.
     */
    public PathologicalPathFilter(String name) {
        super(name);
        setDescription("Pathological path filter *Deprecated* Use " +
                "DecidingFilter and equivalent DecideRule instead. " +
                "The Pathologicalpath filter" +
                " is used to avoid crawler traps by adding a constraint on" +
                " how many times a pattern in the URI could be repeated." +
                " Returns false if the path is NOT pathological (There" +
                " are no subpath reptitions or reptitions are less than" +
                " the '" + ATTR_REPETITIONS + "' limit).");

        // The inherited match-return-value and regexp settings are flagged
        // transient. NOTE(review): presumably because the regexp is
        // generated here rather than user-supplied -- confirm against
        // URIRegExpFilter / Type.setTransient.
        Type type = getElementFromDefinition(ATTR_MATCH_RETURN_VALUE);
        type.setTransient(true);

        type = getElementFromDefinition(ATTR_REGEXP);
        type.setTransient(true);

        addElementToDefinition(new SimpleType(ATTR_REPETITIONS,
                "Number of times the pattern should be allowed to occur. \n" +
                "This filter returns true if number of repetitions of a" +
                " pattern exceeds this value",
                DEFAULT_REPETITIONS));
    }

    /**
     * Builds the regular expression from the current
     * {@link #ATTR_REPETITIONS} value.
     *
     * @param o context object passed through to <code>getAttribute</code>
     *          when looking up the setting.
     * @return the generated expression, or <code>null</code> when the
     *         setting cannot be read or is not a positive number.
     */
    protected String getRegexp(Object o) {
        int rep = 0;
        try {
            rep = ((Integer) getAttribute(o, ATTR_REPETITIONS)).intValue();
        } catch (AttributeNotFoundException e) {
            // Keep rep at 0 so that no expression is produced, but log the
            // exception itself so the stack trace is not lost.
            logger.log(Level.SEVERE, "Unable to read attribute '"
                    + ATTR_REPETITIONS + "': " + e.getMessage(), e);
        }
        if (rep <= 0) {
            // Zero means "no expression"; a negative value would yield an
            // invalid quantifier such as {-2,}, so it is handled the same.
            return null;
        }
        // The captured segment is the first occurrence, hence (rep - 1)
        // additional copies are required by the back-reference quantifier.
        return REGEX_PREFIX + (rep - 1) + REGEX_SUFFIX;
    }

    /**
     * Always returns <code>false</code>.
     * NOTE(review): presumably the result reported when the filter is
     * switched off -- confirm against the superclass contract.
     *
     * @param curi the URI under consideration (unused here).
     * @return <code>false</code>.
     */
    protected boolean getFilterOffPosition(CrawlURI curi) {
        return false;
    }
}