/* HTTPContentDigest
 *
 * $Id: HTTPContentDigest.java,v 1.6.4.1 2007/01/13 01:31:16 stack-sf Exp $
 *
 * Created on 5.1.2005
 *
 * Copyright (C) 2005 Kristinn Sigurðsson
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.crawler.extractor;

import java.io.IOException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.io.ReplayCharSequence;
import org.archive.util.Base32;
import org.archive.util.TextUtils;
/**
 * A processor for calculating custom HTTP content digests in place of the
 * default (if any) computed by the HTTP fetcher processors.
 * <p>
 * This processor allows the user to specify a regular expression called
 * <i>strip-reg-expr</i>. Any segment of a document (text only, binary files
 * will be skipped) that matches this regular expression will be rewritten with
 * the blank character (character 32 in the ANSI character set) <b>for the
 * purpose of the digest</b>; this has no effect on the document for subsequent
 * processing or archiving.
 * <p>
 * NOTE: The content digest only accounts for the document body, not headers.
 * <p>
 * The operator will also be able to specify a maximum length for documents
 * being evaluated by this processor. Documents exceeding that length will be
 * ignored.
 * <p>
 * To further discriminate by file type or URL, an operator should use the
 * override and refinement options.
 * <p>
 * It is generally recommended that this recalculation only be performed when
 * absolutely needed (because of stripping data that changes automatically each
 * time the URL is fetched), as this is an expensive operation.
 *
 * @author Kristinn Sigurdsson
 */

public class HTTPContentDigest extends Processor {

    private static final long serialVersionUID = 8055532198737384358L;

    private static Logger logger =
        Logger.getLogger(HTTPContentDigest.class.getName());

    /** A regular expression detailing elements to strip before making digest */
    public final static String ATTR_STRIP_REG_EXPR = "strip-reg-expr";
    protected final static String DEFAULT_STRIP_REG_EXPR = "";
    /** Maximum file size - longer files will be ignored. -1 = unlimited */
    public final static String ATTR_MAX_SIZE_BYTES = "max-size-bytes";
    protected final static Long DEFAULT_MAX_SIZE_BYTES = new Long(1048576); // 1 Megabyte

    private static final String SHA1 = "SHA1";

    /**
     * Constructor
     * @param name Processor name
     */

    public HTTPContentDigest(String name) {
        super(name, "Calculate custom - stripped - content digests. " +
                "A processor for calculating custom HTTP content digests " +
                "in place of the default (if any) computed by the HTTP " +
                "fetcher processors. " +
                "This processor enables you to specify a regular expression " +
                "called strip-reg-expr. Any segment of a document (text " +
                "only, binary files will be skipped) that matches this " +
                "regular expression will be rewritten with the blank " +
                "character (character 32 in the ANSI character set) FOR THE " +
                "PURPOSE OF THE DIGEST; this has no effect on the document " +
                "for subsequent processing or archiving. You can also " +
                "specify a maximum length for documents being evaluated by " +
                "this processor. Documents exceeding that length will be " +
                "ignored. " +
                "To further discriminate by file type or URL, you should use " +
                "the override and refinement options (the processor can be " +
                "disabled by default and only enabled as needed in overrides " +
                "and refinements). " +
                "It is generally recommended that this recalculation only be " +
                "performed when absolutely needed (because of stripping data " +
                "that changes automatically each time the URL is fetched) as " +
                "this is an expensive operation.");

        addElementToDefinition(new SimpleType(ATTR_STRIP_REG_EXPR,
                "A regular expression that matches those portions of " +
                "downloaded documents that need to be ignored when " +
                "calculating the content digest. " +
                "Segments matching this expression will be rewritten with " +
                "the blank character for the content digest.",
                DEFAULT_STRIP_REG_EXPR));
        addElementToDefinition(new SimpleType(ATTR_MAX_SIZE_BYTES,
                "Maximum size of documents to recalculate the digest for." +
                " Documents that exceed this value (bytes) will be ignored." +
                " Defaults to 1048576 bytes, or 1 MB. " +
                "-1 denotes unlimited size. A setting of 0 will effectively " +
                "disable the processor.",
                DEFAULT_MAX_SIZE_BYTES));
    }

    protected void innerProcess(CrawlURI curi) throws InterruptedException {
        if (!curi.isHttpTransaction()) {
            // Only handles HTTP documents.
            return;
        }
        if (!TextUtils.matches("^text.*$", curi.getContentType())) {
            // Only handles text based documents.
            return;
        }
        long maxsize = DEFAULT_MAX_SIZE_BYTES.longValue();
        try {
            maxsize = ((Long)getAttribute(curi, ATTR_MAX_SIZE_BYTES)).longValue();
        } catch (AttributeNotFoundException e) {
            logger.severe("Missing max-size-bytes attribute when processing " +
                    curi.toString());
        }
        if (maxsize < curi.getContentSize() && maxsize > -1) {
            // Document too big.
            return;
        }

        // Ok, if we got this far we need to calculate the content digest.
        // Get the regexpr
        String regexpr = "";
        try {
            regexpr = (String)getAttribute(curi, ATTR_STRIP_REG_EXPR);
        } catch (AttributeNotFoundException e2) {
            logger.severe("Missing strip-reg-expr when processing " +
                    curi.toString());
            return; // Can't do anything without it.
        }

        // Get a replay of the document character seq.
        ReplayCharSequence cs = null;
        try {
            cs = curi.getHttpRecorder().getReplayCharSequence();
        } catch (Exception e) {
            curi.addLocalizedError(this.getName(), e,
                "Failed get of replay char sequence " + curi.toString() +
                    " " + e.getMessage());
            logger.warning("Failed get of replay char sequence " +
                curi.toString() + " " + e.getMessage() + " " +
                Thread.currentThread().getName());
            return; // Can't proceed if this happens.
        }

        // Create a MessageDigest
        MessageDigest digest = null;

        // We have a ReplayCharSequence open. Wrap all in finally so we
        // for sure close it before we leave.
        try {
            try {
                digest = MessageDigest.getInstance(SHA1);
            } catch (NoSuchAlgorithmException e1) {
                e1.printStackTrace();
                return;
            }

            digest.reset();

            String s = null;

            if (regexpr.length() == 0) {
                s = cs.toString();
            } else {
                // Process the document
                Matcher m = TextUtils.getMatcher(regexpr, cs);
                s = m.replaceAll(" ");
                TextUtils.recycleMatcher(m);
            }
            digest.update(s.getBytes());

            // Get the new digest value
            byte[] newDigestValue = digest.digest();

            // Log if needed.
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("Recalculated content digest for "
                        + curi.toString() + " old: "
                        + Base32.encode((byte[]) curi.getContentDigest())
                        + ", new: " + Base32.encode(newDigestValue));
            }
            // Save new digest value
            curi.setContentDigest(SHA1, newDigestValue);
        } finally {
            if (cs != null) {
                try {
                    cs.close();
                } catch (IOException ioe) {
                    logger.warning(TextUtils.exceptionToString(
                            "Failed close of ReplayCharSequence.", ioe));
                }
            }
        }
    }
}
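
/*
 * Minimal illustrative sketch (not part of the Heritrix codebase; the class,
 * method, and sample data below are made up for illustration). The class
 * comment above describes blanking out regex matches before digesting so that
 * volatile fragments do not change the digest. This standalone class shows
 * that idea with only the JDK: two fetches of a page that differ solely in a
 * generated timestamp yield the same SHA-1 digest once the timestamp is
 * stripped. Like the processor, it digests String.getBytes() (platform
 * default charset) and asks for the "SHA1" algorithm.
 */
class StripDigestSketch {

    /**
     * Replace every match of stripRegex with a single blank character and
     * return the SHA-1 digest of the result. Hypothetical helper, for
     * illustration only.
     */
    static byte[] strippedDigest(String content, String stripRegex)
            throws NoSuchAlgorithmException {
        String stripped = (stripRegex == null || stripRegex.length() == 0)
                ? content
                : content.replaceAll(stripRegex, " ");
        MessageDigest sha1 = MessageDigest.getInstance("SHA1");
        // Mirrors HTTPContentDigest: digest the platform-default encoding.
        return sha1.digest(stripped.getBytes());
    }

    public static void main(String[] args) throws NoSuchAlgorithmException {
        // Two versions of the "same" page differing only in a generated timestamp.
        String pageA = "<html><body>news<!-- generated 2005-01-05 10:00 --></body></html>";
        String pageB = "<html><body>news<!-- generated 2005-01-05 11:30 --></body></html>";
        // An example of the kind of value one might give strip-reg-expr for such pages.
        String strip = "<!-- generated [^>]* -->";

        byte[] a = strippedDigest(pageA, strip);
        byte[] b = strippedDigest(pageB, strip);
        // The digests match because the only differing fragment was blanked out.
        System.out.println(java.util.Arrays.equals(a, b)); // prints: true
    }
}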