package org.archive.crawler.filter;

import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.HttpMethod;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Filter;
import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;

/**
 * Filter that rejects a CrawlURI when its HTTP response headers predict that
 * the document has not changed since the header values last stored on the
 * URI.
 *
 * <p>The <code>Last-Modified</code> and <code>ETag</code> response headers
 * are each compared with the value previously stored on the CrawlURI under
 * {@link #A_LAST_DATESTAMP} and {@link #A_LAST_ETAG} respectively. Every
 * non-empty header value seen is stored back on the CrawlURI under the same
 * key.
 *
 * <p>As the filter description states, it is intended for use in the
 * 'midfetch-filters' of the FetchHTTP processor, where the response headers
 * are available but the document body has not yet been examined.
 */
public class HTTPMidFetchUnchangedFilter extends Filter
implements AdaptiveRevisitAttributeConstants {

    private static final long serialVersionUID = -7416477243375196980L;

    private static final Logger logger =
        Logger.getLogger(HTTPMidFetchUnchangedFilter.class.getName());

    /** The header was absent or empty, so it offers no prediction. */
    public static final int HEADER_PREDICTS_MISSING = -1;
    /** The header value equals the one previously stored on the URI. */
    public static final int HEADER_PREDICTS_UNCHANGED = 0;
    /** The header value is new or differs from the one previously stored. */
    public static final int HEADER_PREDICTS_CHANGED = 1;

    /**
     * Creates the filter with its standard description and registers the
     * datestamp and etag keys as persistent CrawlURI attribute-list members.
     *
     * @param name the name of this filter
     */
    public HTTPMidFetchUnchangedFilter(String name) {
        this(name, "Filters out unchanged documents. " +
                "Examines HTTP Header timestamp and etags. " +
                "This filter should " +
                "only be used in the 'midfetch-filters' on the FetchHTTP " +
                "processor. Earlier then that, the headers are not available " +
                "and later, the entire document is available and examining " +
                "this will usually give better results then relying on HTTP " +
                "headers. See documentation for further details.");

        CrawlURI.addAlistPersistentMember(A_LAST_DATESTAMP);
        CrawlURI.addAlistPersistentMember(A_LAST_ETAG);
    }

    /**
     * Creates the filter with a caller-supplied description. Unlike the
     * single-argument constructor, this one does not register the persistent
     * attribute-list members.
     *
     * @param name the name of this filter
     * @param description the description of this filter
     */
    public HTTPMidFetchUnchangedFilter(String name, String description) {
        super(name, description);
    }

    /**
     * Decides whether the given object passes the filter.
     *
     * <p>Anything that cannot be evaluated (not a CrawlURI, not an HTTP
     * transaction, or no HttpMethod stored under
     * <code>A_HTTP_TRANSACTION</code>) is accepted and the problem is logged
     * at INFO level.
     *
     * <p>Otherwise the URI is rejected, and its
     * <code>A_CONTENT_STATE_KEY</code> set to <code>CONTENT_UNCHANGED</code>,
     * when at least one of the two headers predicts "unchanged" and neither
     * predicts "changed".
     *
     * @param o the object to evaluate, expected to be a CrawlURI
     * @return <code>false</code> if the headers predict an unchanged
     *         document, <code>true</code> otherwise
     */
    protected boolean innerAccepts(Object o) {
        if (!(o instanceof CrawlURI)) {
            if (logger.isLoggable(Level.INFO)) {
                // Plain concatenation instead of o.toString(): a null
                // argument also reaches this branch and must not throw.
                logger.info("Error: Object passed for evaluation was not a " +
                    "CrawlURI. " + o);
            }
            return true;
        }

        CrawlURI curi = (CrawlURI) o;

        if (!curi.isHttpTransaction()) {
            if (logger.isLoggable(Level.INFO)) {
                logger.info("Error: Non HTTP CrawlURI was passed for evalution. "
                    + curi.toString());
            }
            return true;
        }

        if (!curi.containsKey(A_HTTP_TRANSACTION)) {
            if (logger.isLoggable(Level.INFO)) {
                logger.info("Error: Missing HttpMethod object in CrawlURI. "
                    + curi.toString());
            }
            return true;
        }

        HttpMethod method = (HttpMethod) curi.getObject(A_HTTP_TRANSACTION);

        int datestamp = predictFromHeader(
                curi, method, "last-modified", A_LAST_DATESTAMP);
        // The entity-tag validator is sent in the "ETag" header. There is no
        // "last-etag" HTTP header, so looking that up never found a value.
        int etag = predictFromHeader(curi, method, "etag", A_LAST_ETAG);

        boolean anyChanged = datestamp == HEADER_PREDICTS_CHANGED
                || etag == HEADER_PREDICTS_CHANGED;
        boolean anyUnchanged = datestamp == HEADER_PREDICTS_UNCHANGED
                || etag == HEADER_PREDICTS_UNCHANGED;

        if (anyUnchanged && !anyChanged) {
            // Every header that is present agrees with the stored value.
            curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
            return false;
        }
        // A header changed, or neither header gave any information.
        return true;
    }

    /**
     * Compares one response header with the value previously stored on the
     * CrawlURI, then stores the current value under the same key.
     *
     * @param curi the URI carrying the previously stored value
     * @param method the HTTP transaction whose response header is read
     * @param headerName the name of the response header to examine
     * @param attributeKey the CrawlURI key holding the stored value
     * @return {@link #HEADER_PREDICTS_MISSING} if the header is absent or
     *         empty, {@link #HEADER_PREDICTS_UNCHANGED} if it equals the
     *         stored value, otherwise {@link #HEADER_PREDICTS_CHANGED}
     */
    private static int predictFromHeader(CrawlURI curi, HttpMethod method,
            String headerName, String attributeKey) {
        String current = null;
        if (method.getResponseHeader(headerName) != null) {
            current = method.getResponseHeader(headerName).getValue();
        }
        if (current == null || current.length() == 0) {
            // Nothing usable: leave any stored value untouched.
            return HEADER_PREDICTS_MISSING;
        }

        int prediction = HEADER_PREDICTS_CHANGED;
        if (curi.containsKey(attributeKey)
                && current.equals(curi.getString(attributeKey))) {
            prediction = HEADER_PREDICTS_UNCHANGED;
        }
        // Store the current value for the next comparison.
        curi.putString(attributeKey, current);
        return prediction;
    }
}