KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > extractor > ChangeEvaluator


1 /* ChangeEvaluator
2  *
3  * $Id: ChangeEvaluator.java,v 1.4.16.1 2007/01/13 01:31:15 stack-sf Exp $
4  *
5  * Created on 11.11.2004
6  *
7  * Copyright (C) 2004 Kristinn Sigurdsson.
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  */

25 package org.archive.crawler.extractor;
26
27 import java.util.logging.Level JavaDoc;
28 import java.util.logging.Logger JavaDoc;
29
30 import org.archive.crawler.datamodel.CrawlURI;
31 import org.archive.crawler.framework.Processor;
32 import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
33 import org.archive.util.Base32;
34
35 /**
36  * This processor compares the CrawlURI's current
37  * {@link org.archive.crawler.datamodel.CrawlURI#getContentDigest() content digest}
38  * with the one from a previous crawl. If they are equal, then further
39  * processing is skipped (going straight to the post processor chain) and the
40  * CrawlURI is marked appropriately.
41  *
42  * @author Kristinn Sigurdsson
43  */

44 public class ChangeEvaluator extends Processor
45 implements AdaptiveRevisitAttributeConstants {
46
47     private static final long serialVersionUID = 5547590621493534632L;
48     private static final Logger JavaDoc logger =
49         Logger.getLogger(ChangeEvaluator.class.getName());
50
51     /**
52      * Constructor
53      * @param name The name of the module
54      */

55     public ChangeEvaluator(String JavaDoc name) {
56         super(name, "Compares CrawlURI's current " +
57                 "content digest with digest from previous crawl. If " +
58                 "equal, further processing is skipped (going " +
59                 "straight to the post processor chain) and the CrawlURI is " +
60                 "marked appropriately. Should be located at the start of " +
61                 "the Extractor chain.");
62
63         // Register persistent CrawlURI items
64
CrawlURI.addAlistPersistentMember(A_LAST_CONTENT_DIGEST);
65         CrawlURI.addAlistPersistentMember(A_NUMBER_OF_VISITS);
66         CrawlURI.addAlistPersistentMember(A_NUMBER_OF_VERSIONS);
67     }
68
69     protected void innerProcess(CrawlURI curi) throws InterruptedException JavaDoc {
70         if (curi.isSuccess() == false) {
71             // Early return. No point in doing comparison on failed downloads.
72
if (logger.isLoggable(Level.FINEST)) {
73                 logger.finest("Not handling " + curi.toString()
74                         + ", did not " + "succeed.");
75             }
76             return;
77         }
78         
79         // If a mid fetch filter aborts the HTTP fetch because the headers
80
// predict no change, we can skip the whole comparing hashes.
81
if (!curi.containsKey(A_CONTENT_STATE_KEY) ||
82                 curi.getInt(A_CONTENT_STATE_KEY) != CONTENT_UNCHANGED) {
83             String JavaDoc currentDigest = null;
84             Object JavaDoc digest = curi.getContentDigest();
85             if (digest != null) {
86                 currentDigest = Base32.encode((byte[])digest);
87             }
88     
89             String JavaDoc oldDigest = null;
90             if (curi.containsKey(A_LAST_CONTENT_DIGEST)) {
91                 oldDigest = curi.getString(A_LAST_CONTENT_DIGEST);
92             }
93     
94             // Compare the String representation of the byte arrays.
95
if (currentDigest == null && oldDigest == null) {
96                 // Both are null, can't do a thing
97
if (logger.isLoggable(Level.FINER)) {
98                     logger.finer("On " + curi.toString()
99                             + " both digest are null");
100                 }
101                 // NOTE! RETURN!
102
return;
103             }
104             
105             if (currentDigest != null && oldDigest != null
106                     && currentDigest.equals(oldDigest)) {
107                 // If equal, we have just downloaded a duplicate.
108
if (logger.isLoggable(Level.FINER)) {
109                     logger.finer("On " + curi.toString()
110                             + " both digest are " + "equal. Old: " + oldDigest
111                             + ", new: " + currentDigest);
112                 }
113                 curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
114                 // TODO: In the future processors should take note of the content
115
// state, removing the need for the following 'skip'
116
curi.skipToProcessorChain(getController().getPostprocessorChain());
117                 // Make not in log
118
curi.addAnnotation("unchanged");
119                 // Set content size to zero, we are not going to 'write it to disk'
120
curi.setContentSize(0);
121             } else {
122                 // Document has changed
123
if (logger.isLoggable(Level.FINER)) {
124                     logger.finer("On " + curi.toString()
125                             + " digest are not " + "equal. Old: "
126                             + (oldDigest == null? "null": oldDigest)
127                             + ", new: "
128                             + (currentDigest == null? "null": currentDigest));
129                 }
130                 // currentDigest may be null, that probably means a failed download
131
curi.putInt(A_CONTENT_STATE_KEY, CONTENT_CHANGED);
132                 curi.putString(A_LAST_CONTENT_DIGEST, currentDigest);
133             }
134         } else {
135             if (logger.isLoggable(Level.FINER)) {
136                 logger.finer("On " + curi.toString()
137                         + " content state was " + "already set as UNCHANGED.");
138             }
139         }
140         
141         // Update visit and version counters
142
int visits = 1;
143         if(curi.containsKey(A_NUMBER_OF_VISITS)) {
144             visits = curi.getInt(A_NUMBER_OF_VISITS) + 1;
145         }
146         curi.putInt(A_NUMBER_OF_VISITS, visits);
147
148         // Update versions.
149
if(curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_CHANGED) {
150             int versions = 1;
151             if(curi.containsKey(A_NUMBER_OF_VERSIONS)) {
152                 versions = curi.getInt(A_NUMBER_OF_VERSIONS) + 1;
153             }
154             curi.putInt(A_NUMBER_OF_VERSIONS,versions);
155         }
156     }
157 }
158
Popular Tags