1 25 package org.archive.crawler.deciderules; 26 27 import org.archive.crawler.datamodel.CandidateURI; 28 import org.archive.crawler.settings.SimpleType; 29 import org.archive.crawler.settings.Type; 30 31 32 33 40 public class TooManyPathSegmentsDecideRule extends PredicatedDecideRule { 41 42 private static final long serialVersionUID = 147079100367815075L; 43 44 public static final String ATTR_MAX_PATH_DEPTH = "max-path-depth"; 45 46 50 static final Integer DEFAULT_MAX_PATH_DEPTH = new Integer (20); 51 52 56 public TooManyPathSegmentsDecideRule(String name) { 57 super(name); 58 setDescription("TooManyPathSegmentsDecideRule. REJECTs URIs with " + 59 "more total path-segments (as indicated by '/' characters) " + 60 "than the configured '" + ATTR_MAX_PATH_DEPTH + "'."); 61 62 Type type = addElementToDefinition(new SimpleType(ATTR_DECISION, 64 "Decision to be applied", REJECT, ALLOWED_TYPES)); 65 type.setTransient(true); 66 67 addElementToDefinition(new SimpleType(ATTR_MAX_PATH_DEPTH, "Number of" + 68 " path segments beyond which this rule will reject URIs.", 69 DEFAULT_MAX_PATH_DEPTH)); 70 71 } 72 73 80 protected boolean evaluate(Object object) { 81 boolean result = false; 82 CandidateURI curi = null; 83 try { 84 curi = (CandidateURI)object; 85 } catch (ClassCastException e) { 86 return result; 88 } 89 String uri = curi.toString(); 90 int count = 0; 91 int threshold = getThresholdSegments(object); 92 for (int i = 0; i < uri.length(); i++) { 93 if (uri.charAt(i) == '/') { 94 count++; 95 } 96 if (count > threshold) { 97 result = true; 98 break; 99 } 100 } 101 return result; 102 } 103 104 108 private int getThresholdSegments(Object obj) { 109 return ((Integer ) getUncheckedAttribute(obj, ATTR_MAX_PATH_DEPTH)) 111 .intValue() + 2; 112 } 113 } 114 | Popular Tags |