KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > writer > ExperimentalWARCWriterProcessor


1 /* $Id: ExperimentalWARCWriterProcessor.java,v 1.13.2.1 2007/01/13 01:31:30 stack-sf Exp $
2  *
3  * Created on August 1st, 2006.
4  *
5  * Copyright (C) 2006 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.crawler.writer;
24
25 import java.io.ByteArrayInputStream JavaDoc;
26 import java.io.IOException JavaDoc;
27 import java.net.URI JavaDoc;
28 import java.net.URISyntaxException JavaDoc;
29 import java.util.Collection JavaDoc;
30 import java.util.HashMap JavaDoc;
31 import java.util.List JavaDoc;
32 import java.util.Map JavaDoc;
33 import java.util.concurrent.atomic.AtomicInteger JavaDoc;
34 import java.util.logging.Level JavaDoc;
35 import java.util.logging.Logger JavaDoc;
36
37 import org.archive.crawler.datamodel.CoreAttributeConstants;
38 import org.archive.crawler.datamodel.CrawlURI;
39 import org.archive.crawler.datamodel.FetchStatusCodes;
40 import org.archive.crawler.event.CrawlStatusListener;
41 import org.archive.crawler.extractor.Link;
42 import org.archive.crawler.framework.WriterPoolProcessor;
43 import org.archive.io.WriterPoolMember;
44 import org.archive.io.WriterPoolSettings;
45 import org.archive.io.warc.ExperimentalWARCWriter;
46 import org.archive.io.warc.WARCConstants;
47 import org.archive.io.warc.WARCWriterPool;
48 import org.archive.uid.GeneratorFactory;
49 import org.archive.util.ArchiveUtils;
50 import org.archive.util.anvl.ANVLRecord;
51
52
53 /**
54  * Experimental WARCWriterProcessor.
55  *
56  * @author stack
57  */

58 public class ExperimentalWARCWriterProcessor extends WriterPoolProcessor
59 implements CoreAttributeConstants, CrawlStatusListener,
60 WriterPoolSettings, FetchStatusCodes, WARCConstants {
61
62     private static final long serialVersionUID = 188656957531675821L;
63
64     private final Logger JavaDoc logger = Logger.getLogger(this.getClass().getName());
65     
66     /**
67      * Default path list.
68      */

69     private static final String JavaDoc [] DEFAULT_PATH = {"warcs"};
70
71     protected String JavaDoc [] getDefaultPath() {
72         return DEFAULT_PATH;
73     }
74     
75     /**
76      * @param name Name of this writer.
77      */

78     public ExperimentalWARCWriterProcessor(String JavaDoc name) {
79         super(name, "Experimental WARCWriter processor");
80     }
81
82     protected void setupPool(final AtomicInteger JavaDoc serialNo) {
83         setPool(new WARCWriterPool(serialNo, this, getPoolMaximumActive(),
84             getPoolMaximumWait()));
85     }
86     
87     /**
88      * Writes a CrawlURI and its associated data to store file.
89      *
90      * Currently this method understands the following uri types: dns, http, and
91      * https.
92      *
93      * @param curi
94      * CrawlURI to process.
95      *
96      */

97     protected void innerProcess(CrawlURI curi) {
98         // If failure, or we haven't fetched the resource yet, return
99
if (curi.getFetchStatus() <= 0) {
100             return;
101         }
102         
103         // If no content, don't write record.
104
int recordLength = (int)curi.getContentSize();
105         if (recordLength <= 0) {
106             // Write nothing.
107
return;
108         }
109         
110         String JavaDoc scheme = curi.getUURI().getScheme().toLowerCase();
111         try {
112             if ((scheme.equals("dns") &&
113                     curi.getFetchStatus() == S_DNS_SUCCESS) ||
114                 ((scheme.equals("http") || scheme.equals("https")) &&
115                     curi.getFetchStatus() > 0 && curi.isHttpTransaction()) ||
116                 (scheme.equals("ftp") && curi.getFetchStatus() == 200)) {
117                 write(scheme, curi);
118             } else {
119                 logger.info("This writer does not write out scheme " +
120                         scheme + " content");
121             }
122         } catch (IOException JavaDoc e) {
123             curi.addLocalizedError(this.getName(), e, "WriteRecord: " +
124                 curi.toString());
125             logger.log(Level.SEVERE, "Failed write of Record: " +
126                 curi.toString(), e);
127         }
128     }
129     
130     protected void write(final String JavaDoc lowerCaseScheme, final CrawlURI curi)
131     throws IOException JavaDoc {
132         WriterPoolMember writer = getPool().borrowFile();
133         long position = writer.getPosition();
134         // See if we need to open a new file because we've exceeed maxBytes.
135
// Call to checkFileSize will open new file if we're at maximum for
136
// current file.
137
writer.checkSize();
138         if (writer.getPosition() != position) {
139             // We just closed the file because it was larger than maxBytes.
140
// Add to the totalBytesWritten the size of the first record
141
// in the file, if any.
142
setTotalBytesWritten(getTotalBytesWritten() +
143                 (writer.getPosition() - position));
144             position = writer.getPosition();
145         }
146         
147         ExperimentalWARCWriter w = (ExperimentalWARCWriter)writer;
148         try {
149             // Write a request, response, and metadata all in the one
150
// 'transaction'.
151
final URI JavaDoc baseid = getRecordID();
152             final String JavaDoc timestamp =
153                 ArchiveUtils.get14DigitDate(curi.getLong(A_FETCH_BEGAN_TIME));
154             if (lowerCaseScheme.startsWith("http")) {
155                 // Add named fields for ip, checksum, and relate the metadata
156
// and request to the resource field.
157
ANVLRecord r = new ANVLRecord();
158                 if (curi.getContentDigest() != null) {
159                     // TODO: This is digest for content -- doesn't include
160
// response headers.
161
r.addLabelValue(NAMED_FIELD_CHECKSUM_LABEL,
162                         curi.getContentDigestSchemeString());
163                 }
164                 r.addLabelValue(NAMED_FIELD_IP_LABEL, getHostAddress(curi));
165                 URI JavaDoc rid = writeResponse(w, timestamp, HTTP_RESPONSE_MIMETYPE,
166                     baseid, curi, r);
167                 r = new ANVLRecord(1);
168                 r.addLabelValue(NAMED_FIELD_RELATED_LABEL, rid.toString());
169                 writeRequest(w, timestamp, HTTP_REQUEST_MIMETYPE,
170                     baseid, curi, r);
171                 writeMetadata(w, timestamp, baseid, curi, r);
172             } else if (lowerCaseScheme.equals("dns")) {
173                 String JavaDoc ip = curi.getString(A_DNS_SERVER_IP_LABEL);
174                 ANVLRecord r = null;
175                 if (ip != null && ip.length() > 0) {
176                     r = new ANVLRecord();
177                     r.addLabelValue(NAMED_FIELD_IP_LABEL, ip);
178                 }
179                 writeResponse(w, timestamp, curi.getContentType(), baseid,
180                     curi, r);
181             } else {
182                 logger.warning("No handler for scheme " + lowerCaseScheme);
183             }
184         } catch (IOException JavaDoc e) {
185             // Invalidate this file (It gets a '.invalid' suffix).
186
getPool().invalidateFile(writer);
187             // Set the writer to null otherwise the pool accounting
188
// of how many active writers gets skewed if we subsequently
189
// do a returnWriter call on this object in the finally block.
190
writer = null;
191             throw e;
192         } finally {
193             if (writer != null) {
194                 setTotalBytesWritten(getTotalBytesWritten() +
195                      (writer.getPosition() - position));
196                 getPool().returnFile(writer);
197             }
198         }
199         checkBytesWritten();
200     }
201     
202     protected URI JavaDoc writeRequest(final ExperimentalWARCWriter w,
203             final String JavaDoc timestamp, final String JavaDoc mimetype,
204             final URI JavaDoc baseid, final CrawlURI curi,
205             final ANVLRecord namedFields)
206     throws IOException JavaDoc {
207         final URI JavaDoc uid = qualifyRecordID(baseid, TYPE, REQUEST);
208         w.writeRequestRecord(curi.toString(), timestamp, mimetype, uid,
209             namedFields,
210             curi.getHttpRecorder().getRecordedOutput().getReplayInputStream(),
211             curi.getHttpRecorder().getRecordedOutput().getSize());
212         return uid;
213     }
214     
215     protected URI JavaDoc writeResponse(final ExperimentalWARCWriter w,
216             final String JavaDoc timestamp, final String JavaDoc mimetype,
217             final URI JavaDoc baseid, final CrawlURI curi,
218             final ANVLRecord namedFields)
219     throws IOException JavaDoc {
220         w.writeResponseRecord(curi.toString(), timestamp, mimetype, baseid,
221             namedFields,
222             curi.getHttpRecorder().getRecordedInput().getReplayInputStream(),
223             curi.getHttpRecorder().getRecordedInput().getSize());
224         return baseid;
225     }
226     
227     protected URI JavaDoc writeMetadata(final ExperimentalWARCWriter w,
228             final String JavaDoc timestamp,
229             final URI JavaDoc baseid, final CrawlURI curi,
230             final ANVLRecord namedFields)
231     throws IOException JavaDoc {
232         final URI JavaDoc uid = qualifyRecordID(baseid, TYPE, METADATA);
233         // Get some metadata from the curi.
234
// TODO: Get all curi metadata.
235
ANVLRecord r = new ANVLRecord();
236         if (curi.isSeed()) {
237             r.addLabel("seed");
238         } else {
239             if (curi.forceFetch()) {
240                 r.addLabel("force-fetch");
241             }
242             r.addLabelValue("via", curi.flattenVia());
243             r.addLabelValue("pathFromSeed", curi.getPathFromSeed());
244         }
245         Collection JavaDoc<Link> links = curi.getOutLinks();
246         if (links != null || links.size() > 0) {
247             for (Link link: links) {
248                 r.addLabelValue("outlink", link.toString());
249             }
250         }
251         if (curi.isTruncatedFetch()) {
252             String JavaDoc value = curi.isTimeTruncatedFetch()?
253                     NAMED_FIELD_TRUNCATED_VALUE_TIME:
254                 curi.isLengthTruncatedFetch()?
255                         NAMED_FIELD_TRUNCATED_VALUE_LEN:
256                 curi.isHeaderTruncatedFetch()?
257                         NAMED_FIELD_TRUNCATED_VALUE_HEAD:
258                 NAMED_FIELD_TRUNCATED_VALUE_UNSPECIFIED;
259                       
260             r.addLabelValue(NAMED_FIELD_TRUNCATED, value);
261         }
262         
263         // TODO: Other curi fields to write to metadata.
264
//
265
// Credentials
266
//
267
// fetch-began-time: 1154569278774
268
// fetch-completed-time: 1154569281816
269
//
270
// Annotations.
271

272         byte [] b = r.getUTF8Bytes();
273         w.writeMetadataRecord(curi.toString(), timestamp, ANVLRecord.MIMETYPE,
274             uid, namedFields, new ByteArrayInputStream JavaDoc(b), b.length);
275         return uid;
276     }
277     
278     protected URI JavaDoc getRecordID() throws IOException JavaDoc {
279         URI JavaDoc result;
280         try {
281             result = GeneratorFactory.getFactory().getRecordID();
282         } catch (URISyntaxException JavaDoc e) {
283             throw new IOException JavaDoc(e.toString());
284         }
285         return result;
286     }
287     
288     protected URI JavaDoc qualifyRecordID(final URI JavaDoc base, final String JavaDoc key,
289             final String JavaDoc value)
290     throws IOException JavaDoc {
291         URI JavaDoc result;
292         Map JavaDoc<String JavaDoc, String JavaDoc> qualifiers = new HashMap JavaDoc<String JavaDoc, String JavaDoc>(1);
293         qualifiers.put(key, value);
294         try {
295             result = GeneratorFactory.getFactory().
296                 qualifyRecordID(base, qualifiers);
297         } catch (URISyntaxException JavaDoc e) {
298             throw new IOException JavaDoc(e.toString());
299         }
300         return result;
301     }
302
303     public List JavaDoc getMetadata() {
304         // TODO: As ANVL?
305
return null;
306     }
307 }
Popular Tags