KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > writer > ARCWriterProcessor


1 /*
2  * ARCWriter
3  *
4  * $Id: ARCWriterProcessor.java,v 1.54 2006/09/01 00:55:51 paul_jack Exp $
5  *
6  * Created on Jun 5, 2003
7  *
8  * Copyright (C) 2003 Internet Archive.
9  *
10  * This file is part of the Heritrix web crawler (crawler.archive.org).
11  *
12  * Heritrix is free software; you can redistribute it and/or modify
13  * it under the terms of the GNU Lesser Public License as published by
14  * the Free Software Foundation; either version 2.1 of the License, or
15  * any later version.
16  *
17  * Heritrix is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20  * GNU Lesser Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser Public License
23  * along with Heritrix; if not, write to the Free Software
24  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  */

26 package org.archive.crawler.writer;
27
28 import java.io.File JavaDoc;
29 import java.io.FileInputStream JavaDoc;
30 import java.io.FileNotFoundException JavaDoc;
31 import java.io.IOException JavaDoc;
32 import java.io.InputStream JavaDoc;
33 import java.io.StringWriter JavaDoc;
34 import java.net.InetAddress JavaDoc;
35 import java.net.UnknownHostException JavaDoc;
36 import java.util.ArrayList JavaDoc;
37 import java.util.List JavaDoc;
38 import java.util.concurrent.atomic.AtomicInteger JavaDoc;
39 import java.util.logging.Level JavaDoc;
40 import java.util.logging.Logger JavaDoc;
41
42 import javax.xml.transform.SourceLocator JavaDoc;
43 import javax.xml.transform.Templates JavaDoc;
44 import javax.xml.transform.Transformer JavaDoc;
45 import javax.xml.transform.TransformerConfigurationException JavaDoc;
46 import javax.xml.transform.TransformerException JavaDoc;
47 import javax.xml.transform.TransformerFactory JavaDoc;
48 import javax.xml.transform.stream.StreamResult JavaDoc;
49 import javax.xml.transform.stream.StreamSource JavaDoc;
50
51 import org.archive.crawler.Heritrix;
52 import org.archive.crawler.datamodel.CoreAttributeConstants;
53 import org.archive.crawler.datamodel.CrawlURI;
54 import org.archive.crawler.datamodel.FetchStatusCodes;
55 import org.archive.crawler.event.CrawlStatusListener;
56 import org.archive.crawler.framework.WriterPoolProcessor;
57 import org.archive.crawler.settings.XMLSettingsHandler;
58 import org.archive.io.ReplayInputStream;
59 import org.archive.io.WriterPoolMember;
60 import org.archive.io.WriterPoolSettings;
61 import org.archive.io.arc.ARCConstants;
62 import org.archive.io.arc.ARCWriter;
63 import org.archive.io.arc.ARCWriterPool;
64
65
66 /**
67  * Processor module for writing the results of successful fetches (and
68  * perhaps someday, certain kinds of network failures) to the Internet Archive
69  * ARC file format.
70  *
71  * Assumption is that there is only one of these ARCWriterProcessors per
72  * Heritrix instance.
73  *
74  * @author Parker Thompson
75  */

76 public class ARCWriterProcessor extends WriterPoolProcessor
77 implements CoreAttributeConstants, ARCConstants, CrawlStatusListener,
78 WriterPoolSettings, FetchStatusCodes {
79     private static final long serialVersionUID = 1957518408532644531L;
80
81     private final Logger JavaDoc logger = Logger.getLogger(this.getClass().getName());
82     
83     /**
84      * Default path list.
85      */

86     private static final String JavaDoc [] DEFAULT_PATH = {"arcs"};
87
88     /**
89      * Calculate metadata once only.
90      */

91     transient private List JavaDoc<String JavaDoc> cachedMetadata = null;
92
93     /**
94      * @param name Name of this writer.
95      */

96     public ARCWriterProcessor(String JavaDoc name) {
97         super(name, "ARCWriter processor");
98     }
99     
100     protected String JavaDoc [] getDefaultPath() {
101         return DEFAULT_PATH;
102     }
103
104     protected void setupPool(final AtomicInteger JavaDoc serialNo) {
105         setPool(new ARCWriterPool(serialNo, this, getPoolMaximumActive(),
106             getPoolMaximumWait()));
107     }
108     
109     /**
110      * Writes a CrawlURI and its associated data to store file.
111      *
112      * Currently this method understands the following uri types: dns, http,
113      * and https.
114      *
115      * @param curi CrawlURI to process.
116      */

117     protected void innerProcess(CrawlURI curi) {
118         // If failure, or we haven't fetched the resource yet, return
119
if (curi.getFetchStatus() <= 0) {
120             return;
121         }
122         
123         // If no content, don't write record.
124
int recordLength = (int)curi.getContentSize();
125         if (recordLength <= 0) {
126             // Write nothing.
127
return;
128         }
129         
130         String JavaDoc scheme = curi.getUURI().getScheme().toLowerCase();
131         try {
132             // TODO: Since we made FetchDNS work like FetchHTTP, IF we
133
// move test for success of different schemes -- DNS, HTTP(S) and
134
// soon FTP -- up into CrawlURI#isSuccess (Have it read list of
135
// supported schemes from heritrix.properties and cater to each's
136
// notions of 'success' appropriately), then we can collapse this
137
// if/else into a lone if (curi.isSuccess). See WARCWriter for
138
// an example.
139
if ((scheme.equals("dns") &&
140                     curi.getFetchStatus() == S_DNS_SUCCESS)) {
141                 InputStream JavaDoc is = curi.getHttpRecorder().getRecordedInput().
142                     getReplayInputStream();
143                 write(curi, recordLength, is,
144                     curi.getString(A_DNS_SERVER_IP_LABEL));
145             } else if ((scheme.equals("http") || scheme.equals("https")) &&
146                     curi.getFetchStatus() > 0 && curi.isHttpTransaction()) {
147                 InputStream JavaDoc is = curi.getHttpRecorder().getRecordedInput().
148                     getReplayInputStream();
149                 write(curi, recordLength, is, getHostAddress(curi));
150             } else if (scheme.equals("ftp") && (curi.getFetchStatus() == 200)) {
151                 InputStream JavaDoc is = curi.getHttpRecorder().getRecordedInput().
152                  getReplayInputStream();
153                 write(curi, recordLength, is, getHostAddress(curi));
154             } else {
155                 logger.info("This writer does not write out scheme " + scheme +
156                     " content");
157             }
158         } catch (IOException JavaDoc e) {
159             curi.addLocalizedError(this.getName(), e, "WriteRecord: " +
160                 curi.toString());
161             logger.log(Level.SEVERE, "Failed write of Record: " +
162                 curi.toString(), e);
163         }
164     }
165     
166     protected void write(CrawlURI curi, int recordLength, InputStream JavaDoc in,
167         String JavaDoc ip)
168     throws IOException JavaDoc {
169         WriterPoolMember writer = getPool().borrowFile();
170         long position = writer.getPosition();
171         // See if we need to open a new file because we've exceeed maxBytes.
172
// Call to checkFileSize will open new file if we're at maximum for
173
// current file.
174
writer.checkSize();
175         if (writer.getPosition() != position) {
176             // We just closed the file because it was larger than maxBytes.
177
// Add to the totalBytesWritten the size of the first record
178
// in the file, if any.
179
setTotalBytesWritten(getTotalBytesWritten() +
180                 (writer.getPosition() - position));
181             position = writer.getPosition();
182         }
183         
184         ARCWriter w = (ARCWriter)writer;
185         try {
186             if (in instanceof ReplayInputStream) {
187                 w.write(curi.toString(), curi.getContentType(),
188                     ip, curi.getLong(A_FETCH_BEGAN_TIME),
189                     recordLength, (ReplayInputStream)in);
190             } else {
191                 w.write(curi.toString(), curi.getContentType(),
192                     ip, curi.getLong(A_FETCH_BEGAN_TIME),
193                     recordLength, in);
194             }
195         } catch (IOException JavaDoc e) {
196             // Invalidate this file (It gets a '.invalid' suffix).
197
getPool().invalidateFile(writer);
198             // Set the writer to null otherwise the pool accounting
199
// of how many active writers gets skewed if we subsequently
200
// do a returnWriter call on this object in the finally block.
201
writer = null;
202             throw e;
203         } finally {
204             if (writer != null) {
205                 setTotalBytesWritten(getTotalBytesWritten() +
206                      (writer.getPosition() - position));
207                 getPool().returnFile(writer);
208             }
209         }
210         checkBytesWritten();
211     }
212
213     /**
214      * Return list of metadatas to add to first arc file metadata record.
215      *
216      * Get xml files from settingshandle. Currently order file is the
217      * only xml file. We're NOT adding seeds to meta data.
218      *
219      * @return List of strings and/or files to add to arc file as metadata or
220      * null.
221      */

222     public synchronized List JavaDoc<String JavaDoc> getMetadata() {
223         if (this.cachedMetadata != null) {
224             return this.cachedMetadata;
225         }
226         return cacheMetadata();
227     }
228     
229     protected synchronized List JavaDoc<String JavaDoc> cacheMetadata() {
230         if (this.cachedMetadata != null) {
231             return this.cachedMetadata;
232         }
233         
234         List JavaDoc<String JavaDoc> result = null;
235         if (!XMLSettingsHandler.class.isInstance(getSettingsHandler())) {
236             logger.warning("Expected xml settings handler (No arcmetadata).");
237             // Early return
238
return result;
239         }
240         
241         XMLSettingsHandler xsh = (XMLSettingsHandler)getSettingsHandler();
242         File JavaDoc orderFile = xsh.getOrderFile();
243         if (!orderFile.exists() || !orderFile.canRead()) {
244                 logger.severe("File " + orderFile.getAbsolutePath() +
245                     " is does not exist or is not readable.");
246         } else {
247             result = new ArrayList JavaDoc<String JavaDoc>(1);
248             result.add(getMetadataBody(orderFile));
249         }
250         this.cachedMetadata = result;
251         return this.cachedMetadata;
252     }
253
254     /**
255      * Write the arc metadata body content.
256      *
257      * Its based on the order xml file but into this base we'll add other info
258      * such as machine ip.
259      *
260      * @param orderFile Order file.
261      *
262      * @return String that holds the arc metaheader body.
263      */

264     protected String JavaDoc getMetadataBody(File JavaDoc orderFile) {
265         String JavaDoc result = null;
266         TransformerFactory JavaDoc factory = TransformerFactory.newInstance();
267         Templates JavaDoc templates = null;
268         Transformer JavaDoc xformer = null;
269         try {
270             templates = factory.newTemplates(new StreamSource JavaDoc(
271                 this.getClass().getResourceAsStream("/arcMetaheaderBody.xsl")));
272             xformer = templates.newTransformer();
273             // Below parameter names must match what is in the stylesheet.
274
xformer.setParameter("software", "Heritrix " +
275                 Heritrix.getVersion() + " http://crawler.archive.org");
276             xformer.setParameter("ip",
277                 InetAddress.getLocalHost().getHostAddress());
278             xformer.setParameter("hostname",
279                 InetAddress.getLocalHost().getHostName());
280             StreamSource JavaDoc source = new StreamSource JavaDoc(
281                 new FileInputStream JavaDoc(orderFile));
282             StringWriter JavaDoc writer = new StringWriter JavaDoc();
283             StreamResult JavaDoc target = new StreamResult JavaDoc(writer);
284             xformer.transform(source, target);
285             result= writer.toString();
286         } catch (TransformerConfigurationException JavaDoc e) {
287             logger.severe("Failed transform " + e);
288         } catch (FileNotFoundException JavaDoc e) {
289             logger.severe("Failed transform, file not found " + e);
290         } catch (UnknownHostException JavaDoc e) {
291             logger.severe("Failed transform, unknown host " + e);
292         } catch(TransformerException JavaDoc e) {
293             SourceLocator JavaDoc locator = e.getLocator();
294             int col = locator.getColumnNumber();
295             int line = locator.getLineNumber();
296             String JavaDoc publicId = locator.getPublicId();
297             String JavaDoc systemId = locator.getSystemId();
298             logger.severe("Transform error " + e + ", col " + col + ", line " +
299                 line + ", publicId " + publicId + ", systemId " + systemId);
300         }
301
302         return result;
303     }
304 }
Popular Tags