1 25 package org.archive.crawler.deciderules; 26 27 import java.util.logging.Logger ; 28 29 import javax.management.AttributeNotFoundException ; 30 31 import org.archive.crawler.settings.SimpleType; 32 import org.archive.crawler.settings.Type; 33 34 35 36 43 public class PathologicalPathDecideRule extends MatchesRegExpDecideRule { 44 45 private static final long serialVersionUID = -1803997581321178499L; 46 47 private static final Logger logger = 48 Logger.getLogger(PathologicalPathDecideRule.class.getName()); 49 50 public static final String ATTR_REPETITIONS = "max-repetitions"; 51 52 56 static final Integer DEFAULT_REPETITIONS = new Integer (2); 57 58 protected String constructedRegexp; 59 60 64 public PathologicalPathDecideRule(String name) { 65 super(name); 66 setDescription("PathologicalPathDecideRule. This rule" + 67 " is used to avoid crawler traps by adding a constraint on" + 68 " how many times a path-segment pattern in the URI may be" + 69 " repeated. A URI will be REJECTed if the same path-segment" + 70 " repeats more than '" + ATTR_REPETITIONS + "' in a row."); 71 72 Type type = addElementToDefinition(new SimpleType(ATTR_DECISION, 74 "Decision to be applied", REJECT, ALLOWED_TYPES)); 75 type.setTransient(true); 76 77 type = getElementFromDefinition(ATTR_REGEXP); 79 type.setTransient(true); 80 81 type = addElementToDefinition(new SimpleType(ATTR_REPETITIONS, 82 "Number of times the pattern should be allowed to occur. " + 83 "This rule returns its decision (usually REJECT) if a " + 84 "path-segment is repeated more than number of times.", 85 DEFAULT_REPETITIONS)); 86 type.setOverrideable(false); 88 } 89 90 95 protected String getRegexp(Object o) { 96 if (constructedRegexp == null) { 97 constructedRegexp = constructRegexp(); 99 } 100 return constructedRegexp; 101 } 102 103 protected String constructRegexp() { 104 int rep = 0; 105 try { 106 rep = ((Integer ) getAttribute(null, ATTR_REPETITIONS)).intValue(); 107 } catch (AttributeNotFoundException e) { 108 logger.severe(e.getMessage()); 109 } 110 return (rep == 0) ? null : ".*?/(.*?/)\\1{" + rep + ",}.*"; 111 } 112 113 114 119 public void kickUpdate() { 120 super.kickUpdate(); 121 constructedRegexp = constructRegexp(); 122 } 123 } 124 | Popular Tags |