1 25 package org.archive.crawler.extractor; 26 27 import java.io.IOException ; 28 import java.security.MessageDigest ; 29 import java.security.NoSuchAlgorithmException ; 30 import java.util.logging.Level ; 31 import java.util.logging.Logger ; 32 import java.util.regex.Matcher ; 33 34 import javax.management.AttributeNotFoundException ; 35 36 import org.archive.crawler.datamodel.CrawlURI; 37 import org.archive.crawler.framework.Processor; 38 import org.archive.crawler.settings.SimpleType; 39 import org.archive.io.ReplayCharSequence; 40 import org.archive.util.Base32; 41 import org.archive.util.TextUtils; 42 43 69 public class HTTPContentDigest extends Processor { 70 71 private static final long serialVersionUID = 8055532198737384358L; 72 73 private static Logger logger = 74 Logger.getLogger(HTTPContentDigest.class.getName()); 75 76 77 public final static String ATTR_STRIP_REG_EXPR = "strip-reg-expr"; 78 protected final static String DEFAULT_STRIP_REG_EXPR = ""; 79 80 public final static String ATTR_MAX_SIZE_BYTES = "max-size-bytes"; 81 protected final static Long DEFAULT_MAX_SIZE_BYTES = new Long (1048576); 83 private static final String SHA1 = "SHA1"; 84 85 86 90 public HTTPContentDigest(String name) { 91 super(name, "Calculate custom - stripped - content digests. " + 92 "A processor for calculating custom HTTP content digests " + 93 "in place of the default (if any) computed by the HTTP " + 94 "fetcher processors. " + 95 "This processor enables you to specify a regular expression " + 96 "called strip-reg-expr. Any segment of a document (text " + 97 "only, binary files will be skipped) that matches this " + 98 "regular expression will be rewritten with the blank " + 99 "character (character 32 in the ANSI character set) FOR THE " + 100 "PURPOSE OF THE DIGEST, this has no effect on the document " + 101 "for subsequent processing or archiving. You can also " + 102 "specify a maximum length for documents being evaluated by " + 103 "this processor. Documents exceeding that length will be " + 104 "ignored. " + 105 "To further discriminate by file type or URL, you should use " + 106 "the override and refinement options (the processor can be " + 107 "disabled by default and only enabled as needed in overrides " + 108 "and refinements. " + 109 "It is generally recommended that this recalculation only be " + 110 "performed when absolutely needed (because of stripping data " + 111 "that changes automatically each time the URL is fetched) as " + 112 "this is an expensive operation."); 113 114 addElementToDefinition(new SimpleType(ATTR_STRIP_REG_EXPR, 115 "A regular expression that matches those portions of " + 116 "downloaded documents that need to be ignored when " + 117 "calculating the content digest. " + 118 "Segments matching this expression will be rewritten with " + 119 "the blank character for the content digest.", 120 DEFAULT_STRIP_REG_EXPR)); 121 addElementToDefinition(new SimpleType(ATTR_MAX_SIZE_BYTES, 122 "Maximum size of of documents to recalculate the digest for." + 123 " Documents that exceed this value (bytes) will be ignored." + 124 " Defaults to 1048576 bytes, or 1 MB. " + 125 "-1 denotes unlimited size. A setting of 0 will effectively " + 126 "disable the processor.", 127 DEFAULT_MAX_SIZE_BYTES)); 128 } 129 130 protected void innerProcess(CrawlURI curi) throws InterruptedException { 131 if (!curi.isHttpTransaction()){ 132 return; 134 } 135 if(!TextUtils.matches("^text.*$", curi.getContentType())){ 136 return; 138 } 139 long maxsize = DEFAULT_MAX_SIZE_BYTES.longValue(); 140 try { 141 maxsize = ((Long )getAttribute(curi,ATTR_MAX_SIZE_BYTES)).longValue(); 142 } catch (AttributeNotFoundException e) { 143 logger.severe("Missing max-size-bytes attribute when processing " + 144 curi.toString()); 145 } 146 if(maxsize < curi.getContentSize() && maxsize > -1){ 147 return; 149 } 150 151 String regexpr = ""; 154 try { 155 regexpr = (String )getAttribute(curi,ATTR_STRIP_REG_EXPR); 156 } catch (AttributeNotFoundException e2) { 157 logger.severe("Missing strip-reg-exp when processing " + 158 curi.toString()); 159 return; } 161 162 ReplayCharSequence cs = null; 164 165 try { 166 cs = curi.getHttpRecorder().getReplayCharSequence(); 167 } catch(Exception e) { 168 curi.addLocalizedError(this.getName(), e, 169 "Failed get of replay char sequence " + curi.toString() + 170 " " + e.getMessage()); 171 logger.warning("Failed get of replay char sequence " + 172 curi.toString() + " " + e.getMessage() + " " + 173 Thread.currentThread().getName()); 174 return; } 176 177 MessageDigest digest = null; 179 180 try { 183 try { 184 digest = MessageDigest.getInstance(SHA1); 185 } catch (NoSuchAlgorithmException e1) { 186 e1.printStackTrace(); 187 return; 188 } 189 190 digest.reset(); 191 192 String s = null; 193 194 if (regexpr.length() == 0) { 195 s = cs.toString(); 196 } else { 197 Matcher m = TextUtils.getMatcher(regexpr, cs); 199 s = m.replaceAll(" "); 200 TextUtils.recycleMatcher(m); 201 } 202 digest.update(s.getBytes()); 203 204 byte[] newDigestValue = digest.digest(); 206 207 if (logger.isLoggable(Level.FINEST)) { 209 logger.finest("Recalculated content digest for " 210 + curi.toString() + " old: " 211 + Base32.encode((byte[]) curi.getContentDigest()) 212 + ", new: " + Base32.encode(newDigestValue)); 213 } 214 curi.setContentDigest(SHA1, newDigestValue); 216 } finally { 217 if (cs != null) { 218 try { 219 cs.close(); 220 } catch (IOException ioe) { 221 logger.warning(TextUtils.exceptionToString( 222 "Failed close of ReplayCharSequence.", ioe)); 223 } 224 } 225 } 226 } 227 } | Popular Tags |