| 1 23 package org.archive.crawler.deciderules; 24 25 import java.util.logging.Level ; 26 import java.util.logging.Logger ; 27 28 import org.apache.commons.httpclient.HttpMethod; 29 import org.archive.crawler.datamodel.CoreAttributeConstants; 30 import org.archive.crawler.datamodel.CrawlURI; 31 import org.archive.crawler.settings.SimpleType; 32 33 public class NotExceedsDocumentLengthTresholdDecideRule 34 extends PredicatedDecideRule implements CoreAttributeConstants { 35 36 37 private static final long serialVersionUID = -8774160016195991876L; 38 39 private static final Logger logger = Logger. 40 getLogger(NotExceedsDocumentLengthTresholdDecideRule.class.getName()); 41 public static final String ATTR_CONTENT_LENGTH_TRESHOLD = 42 "content-length-treshold"; 43 static final Integer DEFAULT_CONTENT_LENGTH_TRESHOLD = -1; 44 public static final String ATTR_USE_AS_MIDFETCH = "use-as-midfetch-filter"; 45 static final Boolean DEFAULT_USE_AS_MIDFETCH = new Boolean (true); 46 47 48 public static final int HEADER_PREDICTS_MISSING = -1; 50 51 public NotExceedsDocumentLengthTresholdDecideRule(String name){ 52 super(name); 53 setDescription("NotExceedsDocumentLengthTresholdDecideRule. " + 54 "REJECTs URIs "+ 55 "with content length exceeding a given treshold. "+ 56 "Either examines HTTP header content length or " + 57 "actual downloaded content length and returns false " + 58 "for documents exceeding a given length treshold."); 59 60 addElementToDefinition(new SimpleType(ATTR_USE_AS_MIDFETCH, 61 "Shall this rule be used as a midfetch rule? If true, " + 62 "this rule will determine content length based on HTTP " + 63 "header information, otherwise the size of the already " + 64 "downloaded content will be used.", 65 DEFAULT_USE_AS_MIDFETCH)); 66 67 addElementToDefinition(new SimpleType(ATTR_CONTENT_LENGTH_TRESHOLD, 68 "Max " + 69 "content-length this filter will allow to pass through. If -1, " + 70 "then no limit.", 71 DEFAULT_CONTENT_LENGTH_TRESHOLD)); 72 } 73 74 protected boolean evaluate(Object object) { 75 try { 76 CrawlURI curi = (CrawlURI)object; 77 78 int contentlength = HEADER_PREDICTS_MISSING; 79 80 if (getIsMidfetchRule(object)){ 82 83 if(curi.containsKey(A_HTTP_TRANSACTION) == false){ 84 if (logger.isLoggable(Level.INFO)) { 86 logger.info("Error: Missing HttpMethod object in " + 87 "CrawlURI. " + curi.toString()); 88 } 89 return false; 90 } 91 92 HttpMethod method = 94 (HttpMethod)curi.getObject(A_HTTP_TRANSACTION); 95 96 String newContentlength = null; 98 if (method.getResponseHeader("content-length") != null) { 99 newContentlength = method. 100 getResponseHeader("content-length").getValue(); 101 } 102 103 if (newContentlength != null && 104 newContentlength.length() > 0) { 105 try { 106 contentlength = Integer.parseInt(newContentlength); 107 } catch (NumberFormatException nfe) { 108 } 110 } 111 112 if (contentlength == HEADER_PREDICTS_MISSING) { 115 return false; 116 } 117 } else { 118 contentlength = (int)curi.getContentSize(); 119 } 120 121 return makeDecision(contentlength, object); 122 123 } catch (ClassCastException e) { 124 return false; 126 } 127 } 128 129 134 protected Boolean makeDecision(int contentLength, Object obj) { 135 return contentLength < getContentLengthTreshold(obj); 136 } 137 138 142 protected int getContentLengthTreshold(Object obj) { 143 int len = ((Integer )getUncheckedAttribute(obj, 144 ATTR_CONTENT_LENGTH_TRESHOLD)).intValue(); 145 return len == -1? Integer.MAX_VALUE: len; 146 } 147 148 152 private Boolean getIsMidfetchRule(Object obj) { 153 return ((Boolean )getUncheckedAttribute(obj,ATTR_USE_AS_MIDFETCH)). 154 booleanValue(); 155 } 156 } | Popular Tags |