package org.archive.crawler.extractor;

import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
import org.archive.util.Base32;

/**
 * Processor that compares the content digest of the current fetch of a
 * {@link CrawlURI} with the digest stored on it under
 * {@code A_LAST_CONTENT_DIGEST}, records the outcome under
 * {@code A_CONTENT_STATE_KEY}, and maintains the visit and version counters
 * ({@code A_NUMBER_OF_VISITS}, {@code A_NUMBER_OF_VERSIONS}).
 *
 * <p>When the two digests are equal the URI is marked
 * {@code CONTENT_UNCHANGED}, annotated with "unchanged", its content size is
 * set to zero and it is sent straight to the post-processor chain.
 */
public class ChangeEvaluator extends Processor
        implements AdaptiveRevisitAttributeConstants {

    private static final long serialVersionUID = 5547590621493534632L;

    private static final Logger logger =
            Logger.getLogger(ChangeEvaluator.class.getName());

    /**
     * Creates the processor and registers the attribute keys it maintains
     * through {@code CrawlURI.addAlistPersistentMember}.
     *
     * @param name the name handed to the {@link Processor} super constructor
     */
    public ChangeEvaluator(String name) {
        super(name, "Compares CrawlURI's current " +
                "content digest with digest from previous crawl. If " +
                "equal, further processing is skipped (going " +
                "straight to the post processor chain) and the CrawlURI is " +
                "marked appropriately. Should be located at the start of " +
                "the Extractor chain.");

        // Presumably this keeps the attributes across visits of the same
        // URI -- confirm against CrawlURI.addAlistPersistentMember.
        CrawlURI.addAlistPersistentMember(A_LAST_CONTENT_DIGEST);
        CrawlURI.addAlistPersistentMember(A_NUMBER_OF_VISITS);
        CrawlURI.addAlistPersistentMember(A_NUMBER_OF_VERSIONS);
    }

    /**
     * Evaluates whether the content of {@code curi} changed since the digest
     * stored on it, then updates the visit and version counters.
     *
     * <p>Nothing is done for URIs whose {@code isSuccess()} is false. If the
     * content state is already {@code CONTENT_UNCHANGED} the digest
     * comparison is skipped, but the visit counter is still incremented.
     *
     * @param curi the URI being processed
     * @throws InterruptedException declared by the processor contract
     */
    protected void innerProcess(CrawlURI curi) throws InterruptedException {
        // Guard: nothing to evaluate for unsuccessful URIs.
        if (!curi.isSuccess()) {
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("Not handling " + curi.toString()
                        + ", did not " + "succeed.");
            }
            return;
        }

        // An earlier step may already have flagged the content as unchanged;
        // in that case the digest comparison is not repeated.
        boolean alreadyUnchanged = curi.containsKey(A_CONTENT_STATE_KEY)
                && curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_UNCHANGED;

        if (alreadyUnchanged) {
            if (logger.isLoggable(Level.FINER)) {
                logger.finer("On " + curi.toString()
                        + " content state was " + "already set as UNCHANGED.");
            }
        } else {
            // Both digests are compared in their Base32 string form.
            Object rawDigest = curi.getContentDigest();
            String currentDigest = (rawDigest == null)
                    ? null
                    : Base32.encode((byte[]) rawDigest);
            String oldDigest = curi.containsKey(A_LAST_CONTENT_DIGEST)
                    ? curi.getString(A_LAST_CONTENT_DIGEST)
                    : null;

            if (currentDigest == null && oldDigest == null) {
                // Nothing to compare. Note that this returns before the
                // visit counter below is touched.
                if (logger.isLoggable(Level.FINER)) {
                    logger.finer("On " + curi.toString()
                            + " both digest are null");
                }
                return;
            }

            // String.equals(null) is false, so a missing old digest can
            // never count as a match.
            boolean sameContent = currentDigest != null
                    && currentDigest.equals(oldDigest);

            if (sameContent) {
                if (logger.isLoggable(Level.FINER)) {
                    logger.finer("On " + curi.toString()
                            + " both digest are " + "equal. Old: " + oldDigest
                            + ", new: " + currentDigest);
                }
                curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
                // Skip the remaining processors and go directly to the
                // post-processor chain.
                curi.skipToProcessorChain(
                        getController().getPostprocessorChain());
                curi.addAnnotation("unchanged");
                curi.setContentSize(0);
            } else {
                if (logger.isLoggable(Level.FINER)) {
                    logger.finer("On " + curi.toString()
                            + " digest are not " + "equal. Old: "
                            + (oldDigest == null ? "null" : oldDigest)
                            + ", new: "
                            + (currentDigest == null ? "null" : currentDigest));
                }
                curi.putInt(A_CONTENT_STATE_KEY, CONTENT_CHANGED);
                // NOTE(review): currentDigest can be null on this path (an
                // old digest exists but the current fetch produced none);
                // confirm that putString accepts a null value.
                curi.putString(A_LAST_CONTENT_DIGEST, currentDigest);
            }
        }

        // Visit counter: starts at 1, otherwise previous value plus one.
        int visits = curi.containsKey(A_NUMBER_OF_VISITS)
                ? curi.getInt(A_NUMBER_OF_VISITS) + 1
                : 1;
        curi.putInt(A_NUMBER_OF_VISITS, visits);

        // Version counter: advanced only when the content changed.
        if (curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_CHANGED) {
            int versions = curi.containsKey(A_NUMBER_OF_VERSIONS)
                    ? curi.getInt(A_NUMBER_OF_VERSIONS) + 1
                    : 1;
            curi.putInt(A_NUMBER_OF_VERSIONS, versions);
        }
    }
}