KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > filter > HTTPMidFetchUnchangedFilter


1 /* HTTPMidFetchUnchangedFilter
2  *
3  * $Id: HTTPMidFetchUnchangedFilter.java,v 1.3.18.1 2007/01/13 01:31:21 stack-sf Exp $
4  *
5  * Created on 4.2.2005
6  *
7  * Copyright (C) 2005 Kristinn Sigurðsson
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  */

25 package org.archive.crawler.filter;
26
27
28 import java.util.logging.Level JavaDoc;
29 import java.util.logging.Logger JavaDoc;
30
31 import org.apache.commons.httpclient.HttpMethod;
32 import org.archive.crawler.datamodel.CrawlURI;
33 import org.archive.crawler.framework.Filter;
34 import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
35
36 /**
37  * A mid fetch filter for HTTP fetcher processors. It will evaluate the HTTP
38  * header to try and predict if the document has changed since it last passed
39  * through this filter. It does this by comparing the last-modified and etag
40  * values with the same values stored during the last processing of the URI.
41  * <p>
42  * If both values are present, they must agree on predicting no change,
43  * otherwise a change is predicted (return true).
44  * <p>
45  * If only one of the values is present, it alone is used to predict if a
46  * change has occured.
47  * <p>
48  * If neither value is present the filter will return true (predict change)
49  *
50  * @author Kristinn Sigurdsson
51  */

52 public class HTTPMidFetchUnchangedFilter extends Filter
53 implements AdaptiveRevisitAttributeConstants {
54
55     private static final long serialVersionUID = -7416477243375196980L;
56
57     private static final Logger JavaDoc logger =
58         Logger.getLogger(HTTPMidFetchUnchangedFilter.class.getName());
59
60     // Header predictor state constants
61
public static final int HEADER_PREDICTS_MISSING = -1;
62     public static final int HEADER_PREDICTS_UNCHANGED = 0;
63     public static final int HEADER_PREDICTS_CHANGED = 1;
64     
65     /**
66      * Constructor
67      *
68      * @param name Module name
69      */

70     public HTTPMidFetchUnchangedFilter(String JavaDoc name){
71         this(name, "Filters out unchanged documents. " +
72                 "Examines HTTP Header timestamp and etags. " +
73                 "This filter should" +
74                 "only be used in the 'midfetch-filters' on the FetchHTTP " +
75                 "processor. Earlier then that, the headers are not available " +
76                 "and later, the entire document is available and examining " +
77                 "this will usually give better results then relying on HTTP " +
78                 "headers. See documentation for further details.");
79
80         // Register persistent CrawlURI items
81
CrawlURI.addAlistPersistentMember(A_LAST_DATESTAMP);
82         CrawlURI.addAlistPersistentMember(A_LAST_ETAG);
83     }
84     
85     /**
86      * Constructor
87      *
88      * @param name Module name
89      * @param description A description of the modules functions
90      */

91     public HTTPMidFetchUnchangedFilter(String JavaDoc name, String JavaDoc description) {
92         super(name, description);
93     }
94
95     protected boolean innerAccepts(Object JavaDoc o) {
96         // Return FALSE when the document has NOT changed!
97
// Return TRUE if the document has changed or we can't tell
98
if(o instanceof CrawlURI == false){
99             // Only handles CrawlURIs
100
if (logger.isLoggable(Level.INFO)) {
101                 logger.info("Error: Object passed for evaluation was not a " +
102                     "CrawlURI. " + o.toString());
103             }
104             return true;
105         }
106         
107         CrawlURI curi = (CrawlURI)o;
108         
109         if (curi.isHttpTransaction() == false) {
110             // Only handles HTTP
111
if (logger.isLoggable(Level.INFO)) {
112                 logger.info("Error: Non HTTP CrawlURI was passed for evalution. "
113                     + curi.toString());
114             }
115             return true;
116         }
117         
118         if(curi.containsKey(A_HTTP_TRANSACTION) == false){
119             // Missing header info, can't do anything.
120
if (logger.isLoggable(Level.INFO)) {
121                 logger.info("Error: Missing HttpMethod object in CrawlURI. "
122                         + curi.toString());
123             }
124             return true;
125         }
126         
127         // Intially assume header info is missing
128
int datestamp = HEADER_PREDICTS_MISSING;
129         int etag = HEADER_PREDICTS_MISSING;
130         HttpMethod method = (HttpMethod)curi.getObject(A_HTTP_TRANSACTION);
131
132         // Compare datestamps (last-modified)
133
String JavaDoc newDatestamp = null;
134         if (method.getResponseHeader("last-modified") != null) {
135             newDatestamp = method.getResponseHeader("last-modified").getValue();
136         }
137         
138         if (newDatestamp != null && newDatestamp.length() > 0) {
139             datestamp = HEADER_PREDICTS_CHANGED; // Not missing, assume change
140
if (curi.containsKey(A_LAST_DATESTAMP)) {
141                 if (newDatestamp.equals(curi.getString(A_LAST_DATESTAMP))) {
142                     // Both new and old are present and equal, datestamp
143
// predicts no change
144
datestamp = HEADER_PREDICTS_UNCHANGED;
145                 }
146             }
147             curi.putString(A_LAST_DATESTAMP, newDatestamp);
148         }
149         
150         // Compare ETags
151
String JavaDoc newETag = null;
152         if(method.getResponseHeader("last-etag") != null){
153             newETag = method.getResponseHeader("last-etag").getValue();
154         }
155         
156         if(newETag != null && newETag.length() > 0){
157             etag = HEADER_PREDICTS_CHANGED; // Not missing, assume change
158
if(curi.containsKey(A_LAST_ETAG)){
159                 if(newETag.equals(curi.getString(A_LAST_ETAG))){
160                     // Both new and old are present and equal, etag
161
// predicts no change
162
etag = HEADER_PREDICTS_UNCHANGED;
163                 }
164             }
165             curi.putString(A_LAST_ETAG, newETag);
166         }
167         
168         // If both are present, predict no change only if both agree
169
if (datestamp == HEADER_PREDICTS_UNCHANGED
170                 && etag == HEADER_PREDICTS_UNCHANGED) {
171             // Have both and they agree, no change
172
curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
173             return false;
174         }
175         // If one or the other is missing, trust the one that is present
176
if (datestamp == HEADER_PREDICTS_MISSING
177                 && etag == HEADER_PREDICTS_UNCHANGED) {
178             // Only have etag, and it predicts no change
179
curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
180             return false;
181         }
182         if (datestamp == HEADER_PREDICTS_UNCHANGED
183                 && etag == HEADER_PREDICTS_MISSING) {
184             // Only have last-modified, and it predicts no change
185
curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
186             return false;
187         }
188         return true; // Default, assume change.
189
}
190 }
191
Popular Tags