1 26 package org.archive.crawler.writer; 27 28 import java.io.File ; 29 import java.io.FileInputStream ; 30 import java.io.FileNotFoundException ; 31 import java.io.IOException ; 32 import java.io.InputStream ; 33 import java.io.StringWriter ; 34 import java.net.InetAddress ; 35 import java.net.UnknownHostException ; 36 import java.util.ArrayList ; 37 import java.util.List ; 38 import java.util.concurrent.atomic.AtomicInteger ; 39 import java.util.logging.Level ; 40 import java.util.logging.Logger ; 41 42 import javax.xml.transform.SourceLocator ; 43 import javax.xml.transform.Templates ; 44 import javax.xml.transform.Transformer ; 45 import javax.xml.transform.TransformerConfigurationException ; 46 import javax.xml.transform.TransformerException ; 47 import javax.xml.transform.TransformerFactory ; 48 import javax.xml.transform.stream.StreamResult ; 49 import javax.xml.transform.stream.StreamSource ; 50 51 import org.archive.crawler.Heritrix; 52 import org.archive.crawler.datamodel.CoreAttributeConstants; 53 import org.archive.crawler.datamodel.CrawlURI; 54 import org.archive.crawler.datamodel.FetchStatusCodes; 55 import org.archive.crawler.event.CrawlStatusListener; 56 import org.archive.crawler.framework.WriterPoolProcessor; 57 import org.archive.crawler.settings.XMLSettingsHandler; 58 import org.archive.io.ReplayInputStream; 59 import org.archive.io.WriterPoolMember; 60 import org.archive.io.WriterPoolSettings; 61 import org.archive.io.arc.ARCConstants; 62 import org.archive.io.arc.ARCWriter; 63 import org.archive.io.arc.ARCWriterPool; 64 65 66 76 public class ARCWriterProcessor extends WriterPoolProcessor 77 implements CoreAttributeConstants, ARCConstants, CrawlStatusListener, 78 WriterPoolSettings, FetchStatusCodes { 79 private static final long serialVersionUID = 1957518408532644531L; 80 81 private final Logger logger = Logger.getLogger(this.getClass().getName()); 82 83 86 private static final String [] DEFAULT_PATH = {"arcs"}; 87 88 91 transient private List <String > cachedMetadata = null; 92 93 96 public ARCWriterProcessor(String name) { 97 super(name, "ARCWriter processor"); 98 } 99 100 protected String [] getDefaultPath() { 101 return DEFAULT_PATH; 102 } 103 104 protected void setupPool(final AtomicInteger serialNo) { 105 setPool(new ARCWriterPool(serialNo, this, getPoolMaximumActive(), 106 getPoolMaximumWait())); 107 } 108 109 117 protected void innerProcess(CrawlURI curi) { 118 if (curi.getFetchStatus() <= 0) { 120 return; 121 } 122 123 int recordLength = (int)curi.getContentSize(); 125 if (recordLength <= 0) { 126 return; 128 } 129 130 String scheme = curi.getUURI().getScheme().toLowerCase(); 131 try { 132 if ((scheme.equals("dns") && 140 curi.getFetchStatus() == S_DNS_SUCCESS)) { 141 InputStream is = curi.getHttpRecorder().getRecordedInput(). 142 getReplayInputStream(); 143 write(curi, recordLength, is, 144 curi.getString(A_DNS_SERVER_IP_LABEL)); 145 } else if ((scheme.equals("http") || scheme.equals("https")) && 146 curi.getFetchStatus() > 0 && curi.isHttpTransaction()) { 147 InputStream is = curi.getHttpRecorder().getRecordedInput(). 148 getReplayInputStream(); 149 write(curi, recordLength, is, getHostAddress(curi)); 150 } else if (scheme.equals("ftp") && (curi.getFetchStatus() == 200)) { 151 InputStream is = curi.getHttpRecorder().getRecordedInput(). 152 getReplayInputStream(); 153 write(curi, recordLength, is, getHostAddress(curi)); 154 } else { 155 logger.info("This writer does not write out scheme " + scheme + 156 " content"); 157 } 158 } catch (IOException e) { 159 curi.addLocalizedError(this.getName(), e, "WriteRecord: " + 160 curi.toString()); 161 logger.log(Level.SEVERE, "Failed write of Record: " + 162 curi.toString(), e); 163 } 164 } 165 166 protected void write(CrawlURI curi, int recordLength, InputStream in, 167 String ip) 168 throws IOException { 169 WriterPoolMember writer = getPool().borrowFile(); 170 long position = writer.getPosition(); 171 writer.checkSize(); 175 if (writer.getPosition() != position) { 176 setTotalBytesWritten(getTotalBytesWritten() + 180 (writer.getPosition() - position)); 181 position = writer.getPosition(); 182 } 183 184 ARCWriter w = (ARCWriter)writer; 185 try { 186 if (in instanceof ReplayInputStream) { 187 w.write(curi.toString(), curi.getContentType(), 188 ip, curi.getLong(A_FETCH_BEGAN_TIME), 189 recordLength, (ReplayInputStream)in); 190 } else { 191 w.write(curi.toString(), curi.getContentType(), 192 ip, curi.getLong(A_FETCH_BEGAN_TIME), 193 recordLength, in); 194 } 195 } catch (IOException e) { 196 getPool().invalidateFile(writer); 198 writer = null; 202 throw e; 203 } finally { 204 if (writer != null) { 205 setTotalBytesWritten(getTotalBytesWritten() + 206 (writer.getPosition() - position)); 207 getPool().returnFile(writer); 208 } 209 } 210 checkBytesWritten(); 211 } 212 213 222 public synchronized List <String > getMetadata() { 223 if (this.cachedMetadata != null) { 224 return this.cachedMetadata; 225 } 226 return cacheMetadata(); 227 } 228 229 protected synchronized List <String > cacheMetadata() { 230 if (this.cachedMetadata != null) { 231 return this.cachedMetadata; 232 } 233 234 List <String > result = null; 235 if (!XMLSettingsHandler.class.isInstance(getSettingsHandler())) { 236 logger.warning("Expected xml settings handler (No arcmetadata)."); 237 return result; 239 } 240 241 XMLSettingsHandler xsh = (XMLSettingsHandler)getSettingsHandler(); 242 File orderFile = xsh.getOrderFile(); 243 if (!orderFile.exists() || !orderFile.canRead()) { 244 logger.severe("File " + orderFile.getAbsolutePath() + 245 " is does not exist or is not readable."); 246 } else { 247 result = new ArrayList <String >(1); 248 result.add(getMetadataBody(orderFile)); 249 } 250 this.cachedMetadata = result; 251 return this.cachedMetadata; 252 } 253 254 264 protected String getMetadataBody(File orderFile) { 265 String result = null; 266 TransformerFactory factory = TransformerFactory.newInstance(); 267 Templates templates = null; 268 Transformer xformer = null; 269 try { 270 templates = factory.newTemplates(new StreamSource ( 271 this.getClass().getResourceAsStream("/arcMetaheaderBody.xsl"))); 272 xformer = templates.newTransformer(); 273 xformer.setParameter("software", "Heritrix " + 275 Heritrix.getVersion() + " http://crawler.archive.org"); 276 xformer.setParameter("ip", 277 InetAddress.getLocalHost().getHostAddress()); 278 xformer.setParameter("hostname", 279 InetAddress.getLocalHost().getHostName()); 280 StreamSource source = new StreamSource ( 281 new FileInputStream (orderFile)); 282 StringWriter writer = new StringWriter (); 283 StreamResult target = new StreamResult (writer); 284 xformer.transform(source, target); 285 result= writer.toString(); 286 } catch (TransformerConfigurationException e) { 287 logger.severe("Failed transform " + e); 288 } catch (FileNotFoundException e) { 289 logger.severe("Failed transform, file not found " + e); 290 } catch (UnknownHostException e) { 291 logger.severe("Failed transform, unknown host " + e); 292 } catch(TransformerException e) { 293 SourceLocator locator = e.getLocator(); 294 int col = locator.getColumnNumber(); 295 int line = locator.getLineNumber(); 296 String publicId = locator.getPublicId(); 297 String systemId = locator.getSystemId(); 298 logger.severe("Transform error " + e + ", col " + col + ", line " + 299 line + ", publicId " + publicId + ", systemId " + systemId); 300 } 301 302 return result; 303 } 304 } | Popular Tags |