KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > arc > ARCWriter


/*
 * ARCWriter
 *
 * $Id: ARCWriter.java,v 1.54.2.2 2007/01/13 01:31:37 stack-sf Exp $
 *
 * Created on Jun 5, 2003
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.io.arc;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.archive.io.GzippedInputStream;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.util.ArchiveUtils;
import org.archive.util.DevUtils;
import org.archive.util.MimetypeUtils;
50
51 /**
52  * Write ARC files.
53  *
54  * Assumption is that the caller is managing access to this ARCWriter ensuring
55  * only one thread of control accessing this ARC file instance at any one time.
56  *
57  * <p>ARC files are described here:
58  * <a HREF="http://www.archive.org/web/researcher/ArcFileFormat.php">Arc
59  * File Format</a>. This class does version 1 of the ARC file format. It also
60  * writes version 1.1 which is version 1 with data stuffed into the body of the
61  * first arc record in the file, the arc file meta record itself.
62  *
63  * <p>An ARC file is three lines of meta data followed by an optional 'body' and
64  * then a couple of '\n' and then: record, '\n', record, '\n', record, etc.
65  * If we are writing compressed ARC files, then each of the ARC file records is
66  * individually gzipped and concatenated together to make up a single ARC file.
67  * In GZIP terms, each ARC record is a GZIP <i>member</i> of a total gzip'd
68  * file.
69  *
70  * <p>The GZIPping of the ARC file meta data is exceptional. It is GZIPped
71  * w/ an extra GZIP header, a special Internet Archive (IA) extra header field
72  * (e.g. FEXTRA is set in the GZIP header FLG field and an extra field is
73  * appended to the GZIP header). The extra field has little in it but its
74  * presence denotes this GZIP as an Internet Archive gzipped ARC. See RFC1952
75  * to learn about the GZIP header structure.
76  *
77  * <p>This class then does its GZIPping in the following fashion. Each GZIP
78  * member is written w/ a new instance of GZIPOutputStream -- actually
79  * ARCWriterGZIPOututStream so we can get access to the underlying stream.
80  * The underlying stream stays open across GZIPoutputStream instantiations.
81  * For the 'special' GZIPing of the ARC file meta data, we cheat by catching the
82  * GZIPOutputStream output into a byte array, manipulating it adding the
83  * IA GZIP header, before writing to the stream.
84  *
85  * <p>I tried writing a resettable GZIPOutputStream and could make it work w/
86  * the SUN JDK but the IBM JDK threw NPE inside in the deflate.reset -- its zlib
87  * native call doesn't seem to like the notion of resetting -- so I gave up on
88  * it.
89  *
90  * <p>Because of such as the above and troubles with GZIPInputStream, we should
91  * write our own GZIP*Streams, ones that resettable and consious of gzip
92  * members.
93  *
94  * <p>This class will write until we hit >= maxSize. The check is done at
95  * record boundary. Records do not span ARC files. We will then close current
96  * file and open another and then continue writing.
97  *
98  * <p><b>TESTING: </b>Here is how to test that produced ARC files are good
99  * using the
100  * <a HREF="http://www.archive.org/web/researcher/tool_documentation.php">alexa
101  * ARC c-tools</a>:
102  * <pre>
103  * % av_procarc hx20040109230030-0.arc.gz | av_ziparc > \
104  * /tmp/hx20040109230030-0.dat.gz
105  * % av_ripdat /tmp/hx20040109230030-0.dat.gz > /tmp/hx20040109230030-0.cdx
106  * </pre>
107  * Examine the produced cdx file to make sure it makes sense. Search
108  * for 'no-type 0'. If found, then we're opening a gzip record w/o data to
109  * write. This is bad.
110  *
111  * <p>You can also do <code>gzip -t FILENAME</code> and it will tell you if the
112  * ARC makes sense to GZIP.
113  *
114  * <p>While being written, ARCs have a '.open' suffix appended.
115  *
116  * @author stack
117  */

118 public class ARCWriter extends WriterPoolMember implements ARCConstants {
119     private static final Logger JavaDoc logger =
120         Logger.getLogger(ARCWriter.class.getName());
121     
122     /**
123      * Metadata line pattern.
124      */

125     private static final Pattern JavaDoc METADATA_LINE_PATTERN =
126         Pattern.compile("^\\S+ \\S+ \\S+ \\S+ \\S+(" + LINE_SEPARATOR + "?)$");
127
128     /**
129      * Buffer to reuse writing streams.
130      */

131     private final byte [] readbuffer = new byte[4 * 1024];
132     
133     private List JavaDoc metadata = null;
134     
135     
136     /**
137      * Constructor.
138      * Takes a stream. Use with caution. There is no upperbound check on size.
139      * Will just keep writing.
140      *
141      * @param serialNo used to generate unique file name sequences
142      * @param out Where to write.
143      * @param arc File the <code>out</code> is connected to.
144      * @param cmprs Compress the content written.
145      * @param metadata File meta data. Can be null. Is list of File and/or
146      * String objects.
147      * @param a14DigitDate If null, we'll write current time.
148      * @throws IOException
149      */

150     public ARCWriter(final AtomicInteger JavaDoc serialNo, final PrintStream JavaDoc out,
151         final File JavaDoc arc, final boolean cmprs, String JavaDoc a14DigitDate,
152         final List JavaDoc metadata)
153     throws IOException JavaDoc {
154         super(serialNo, out, arc, cmprs, a14DigitDate);
155         this.metadata = metadata;
156         writeFirstRecord(a14DigitDate);
157     }
158     
159     /**
160      * Constructor.
161      *
162      * @param serialNo used to generate unique file name sequences
163      * @param dirs Where to drop the ARC files.
164      * @param prefix ARC file prefix to use. If null, we use
165      * DEFAULT_ARC_FILE_PREFIX.
166      * @param cmprs Compress the ARC files written. The compression is done
167      * by individually gzipping each record added to the ARC file: i.e. the
168      * ARC file is a bunch of gzipped records concatenated together.
169      * @param maxSize Maximum size for ARC files written.
170      */

171     public ARCWriter(final AtomicInteger JavaDoc serialNo, final List JavaDoc<File JavaDoc> dirs,
172             final String JavaDoc prefix, final boolean cmprs, final int maxSize) {
173         this(serialNo, dirs, prefix, "", cmprs, maxSize, null);
174     }
175             
176     /**
177      * Constructor.
178      *
179      * @param serialNo used to generate unique file name sequences
180      * @param dirs Where to drop files.
181      * @param prefix File prefix to use.
182      * @param cmprs Compress the records written.
183      * @param maxSize Maximum size for ARC files written.
184      * @param suffix File tail to use. If null, unused.
185      * @param meta File meta data. Can be null. Is list of File and/or
186      * String objects.
187      */

188     public ARCWriter(final AtomicInteger JavaDoc serialNo, final List JavaDoc<File JavaDoc> dirs,
189             final String JavaDoc prefix, final String JavaDoc suffix, final boolean cmprs,
190             final int maxSize, final List JavaDoc meta) {
191         super(serialNo, dirs, prefix, suffix, cmprs, maxSize,
192             ARC_FILE_EXTENSION);
193         this.metadata = meta;
194     }
195
196     protected String JavaDoc createFile()
197     throws IOException JavaDoc {
198         String JavaDoc name = super.createFile();
199         writeFirstRecord(getCreateTimestamp());
200         return name;
201     }
202     
203     private void writeFirstRecord(final String JavaDoc ts)
204     throws IOException JavaDoc {
205         write(generateARCFileMetaData(ts));
206     }
207         
208     /**
209      * Write out the ARCMetaData.
210      *
211      * <p>Generate ARC file meta data. Currently we only do version 1 of the
212      * ARC file formats or version 1.1 when metadata has been supplied (We
213      * write it into the body of the first record in the arc file).
214      *
215      * <p>Version 1 metadata looks roughly like this:
216      *
217      * <pre>filedesc://testWriteRecord-JunitIAH20040110013326-2.arc 0.0.0.0 \\
218      * 20040110013326 text/plain 77
219      * 1 0 InternetArchive
220      * URL IP-address Archive-date Content-type Archive-length
221      * </pre>
222      *
223      * <p>If compress is set, then we generate a header that has been gzipped
224      * in the Internet Archive manner. Such a gzipping enables the FEXTRA
225      * flag in the FLG field of the gzip header. It then appends an extra
226      * header field: '8', '0', 'L', 'X', '0', '0', '0', '0'. The first two
227      * bytes are the length of the field and the last 6 bytes the Internet
228      * Archive header. To learn about GZIP format, see RFC1952. To learn
229      * about the Internet Archive extra header field, read the source for
230      * av_ziparc which can be found at
231      * <code>alexa/vista/alexa-tools-1.2/src/av_ziparc.cc</code>.
232      *
233      * <p>We do things in this roundabout manner because the java
234      * GZIPOutputStream does not give access to GZIP header fields.
235      *
236      * @param date Date to put into the ARC metadata.
237      *
238      * @return Byte array filled w/ the arc header.
239      * @throws IOException
240      */

241     private byte [] generateARCFileMetaData(String JavaDoc date)
242     throws IOException JavaDoc {
243         int metadataBodyLength = getMetadataLength();
244         // If metadata body, then the minor part of the version is '1' rather
245
// than '0'.
246
String JavaDoc metadataHeaderLinesTwoAndThree =
247             getMetadataHeaderLinesTwoAndThree("1 " +
248                 ((metadataBodyLength > 0)? "1": "0"));
249         int recordLength = metadataBodyLength +
250             metadataHeaderLinesTwoAndThree.getBytes(DEFAULT_ENCODING).length;
251         String JavaDoc metadataHeaderStr = ARC_MAGIC_NUMBER + getBaseFilename() +
252             " 0.0.0.0 " + date + " text/plain " + recordLength +
253             metadataHeaderLinesTwoAndThree;
254         ByteArrayOutputStream JavaDoc metabaos =
255             new ByteArrayOutputStream JavaDoc(recordLength);
256         // Write the metadata header.
257
metabaos.write(metadataHeaderStr.getBytes(DEFAULT_ENCODING));
258         // Write the metadata body, if anything to write.
259
if (metadataBodyLength > 0) {
260             writeMetaData(metabaos);
261         }
262         
263         // Write out a LINE_SEPARATORs to end this record.
264
metabaos.write(LINE_SEPARATOR);
265         
266         // Now get bytes of all just written and compress if flag set.
267
byte [] bytes = metabaos.toByteArray();
268         
269         if(isCompressed()) {
270             // GZIP the header but catch the gzipping into a byte array so we
271
// can add the special IA GZIP header to the product. After
272
// manipulations, write to the output stream (The JAVA GZIP
273
// implementation does not give access to GZIP header. It
274
// produces a 'default' header only). We can get away w/ these
275
// maniupulations because the GZIP 'default' header doesn't
276
// do the 'optional' CRC'ing of the header.
277
byte [] gzippedMetaData = GzippedInputStream.gzip(bytes);
278             if (gzippedMetaData[3] != 0) {
279                 throw new IOException JavaDoc("The GZIP FLG header is unexpectedly " +
280                     " non-zero. Need to add smarter code that can deal " +
281                     " when already extant extra GZIP header fields.");
282             }
283             // Set the GZIP FLG header to '4' which says that the GZIP header
284
// has extra fields. Then insert the alex {'L', 'X', '0', '0', '0,
285
// '0'} 'extra' field. The IA GZIP header will also set byte
286
// 9 (zero-based), the OS byte, to 3 (Unix). We'll do the same.
287
gzippedMetaData[3] = 4;
288             gzippedMetaData[9] = 3;
289             byte [] assemblyBuffer = new byte[gzippedMetaData.length +
290                 ARC_GZIP_EXTRA_FIELD.length];
291             // '10' in the below is a pointer past the following bytes of the
292
// GZIP header: ID1 ID2 CM FLG + MTIME(4-bytes) XFL OS. See
293
// RFC1952 for explaination of the abbreviations just used.
294
System.arraycopy(gzippedMetaData, 0, assemblyBuffer, 0, 10);
295             System.arraycopy(ARC_GZIP_EXTRA_FIELD, 0, assemblyBuffer, 10,
296                 ARC_GZIP_EXTRA_FIELD.length);
297             System.arraycopy(gzippedMetaData, 10, assemblyBuffer,
298                 10 + ARC_GZIP_EXTRA_FIELD.length, gzippedMetaData.length - 10);
299             bytes = assemblyBuffer;
300         }
301         return bytes;
302     }
303     
304     public String JavaDoc getMetadataHeaderLinesTwoAndThree(String JavaDoc version) {
305         StringBuffer JavaDoc buffer = new StringBuffer JavaDoc();
306         buffer.append(LINE_SEPARATOR);
307         buffer.append(version);
308         buffer.append(" InternetArchive");
309         buffer.append(LINE_SEPARATOR);
310         buffer.append("URL IP-address Archive-date Content-type Archive-length");
311         buffer.append(LINE_SEPARATOR);
312         return buffer.toString();
313     }
314
315     /**
316      * Write all metadata to passed <code>baos</code>.
317      *
318      * @param baos Byte array to write to.
319      * @throws UnsupportedEncodingException
320      * @throws IOException
321      */

322     private void writeMetaData(ByteArrayOutputStream JavaDoc baos)
323             throws UnsupportedEncodingException JavaDoc, IOException JavaDoc {
324         if (this.metadata == null) {
325             return;
326         }
327
328         for (Iterator JavaDoc i = this.metadata.iterator();
329                 i.hasNext();) {
330             Object JavaDoc obj = i.next();
331             if (obj instanceof String JavaDoc) {
332                 baos.write(((String JavaDoc)obj).getBytes(DEFAULT_ENCODING));
333             } else if (obj instanceof File JavaDoc) {
334                 InputStream JavaDoc is = null;
335                 try {
336                     is = new BufferedInputStream JavaDoc(
337                         new FileInputStream JavaDoc((File JavaDoc)obj));
338                     byte [] buffer = new byte[4096];
339                     for (int read = -1; (read = is.read(buffer)) != -1;) {
340                         baos.write(buffer, 0, read);
341                     }
342                 } finally {
343                     if (is != null) {
344                         is.close();
345                     }
346                 }
347             } else if (obj != null) {
348                 logger.severe("Unsupported metadata type: " + obj);
349             }
350         }
351         return;
352     }
353
354     /**
355      * @return Total length of metadata.
356      * @throws UnsupportedEncodingException
357      */

358     private int getMetadataLength()
359     throws UnsupportedEncodingException JavaDoc {
360         int result = -1;
361         if (this.metadata == null) {
362             result = 0;
363         } else {
364             for (Iterator JavaDoc i = this.metadata.iterator();
365                     i.hasNext();) {
366                 Object JavaDoc obj = i.next();
367                 if (obj instanceof String JavaDoc) {
368                     result += ((String JavaDoc)obj).getBytes(DEFAULT_ENCODING).length;
369                 } else if (obj instanceof File JavaDoc) {
370                     result += ((File JavaDoc)obj).length();
371                 } else {
372                     logger.severe("Unsupported metadata type: " + obj);
373                 }
374             }
375         }
376         return result;
377     }
378
379     public void write(String JavaDoc uri, String JavaDoc contentType, String JavaDoc hostIP,
380             long fetchBeginTimeStamp, int recordLength,
381             ByteArrayOutputStream JavaDoc baos)
382     throws IOException JavaDoc {
383         preWriteRecordTasks();
384         try {
385             write(getMetaLine(uri, contentType, hostIP,
386                 fetchBeginTimeStamp, recordLength).getBytes(UTF8));
387             baos.writeTo(getOutputStream());
388             write(LINE_SEPARATOR);
389         } finally {
390             postWriteRecordTasks();
391         }
392     }
393
394     public void write(String JavaDoc uri, String JavaDoc contentType, String JavaDoc hostIP,
395             long fetchBeginTimeStamp, int recordLength, InputStream JavaDoc in)
396     throws IOException JavaDoc {
397         preWriteRecordTasks();
398         try {
399             write(getMetaLine(uri, contentType, hostIP,
400                     fetchBeginTimeStamp, recordLength).getBytes(UTF8));
401             readFullyFrom(in, recordLength, this.readbuffer);
402             write(LINE_SEPARATOR);
403         } finally {
404             postWriteRecordTasks();
405         }
406     }
407
408     public void write(String JavaDoc uri, String JavaDoc contentType, String JavaDoc hostIP,
409             long fetchBeginTimeStamp, int recordLength,
410             ReplayInputStream ris)
411     throws IOException JavaDoc {
412         preWriteRecordTasks();
413         try {
414             write(getMetaLine(uri, contentType, hostIP,
415                     fetchBeginTimeStamp, recordLength).getBytes(UTF8));
416             try {
417                 ris.readFullyTo(getOutputStream());
418                 long remaining = ris.remaining();
419                 // Should be zero at this stage. If not, something is
420
// wrong.
421
if (remaining != 0) {
422                     String JavaDoc message = "Gap between expected and actual: " +
423                         remaining + LINE_SEPARATOR + DevUtils.extraInfo() +
424                         " writing arc " + this.getFile().getAbsolutePath();
425                     DevUtils.warnHandle(new Throwable JavaDoc(message), message);
426                     throw new IOException JavaDoc(message);
427                 }
428             } finally {
429                 ris.close();
430             }
431             
432             // Write out trailing newline
433
write(LINE_SEPARATOR);
434         } finally {
435             postWriteRecordTasks();
436         }
437     }
438     
439     /**
440      * @param uri
441      * @param contentType
442      * @param hostIP
443      * @param fetchBeginTimeStamp
444      * @param recordLength
445      * @return Metadata line for an ARCRecord made of passed components.
446      * @exception IOException
447      */

448     protected String JavaDoc getMetaLine(String JavaDoc uri, String JavaDoc contentType, String JavaDoc hostIP,
449         long fetchBeginTimeStamp, int recordLength)
450     throws IOException JavaDoc {
451         if (fetchBeginTimeStamp <= 0) {
452             throw new IOException JavaDoc("Bogus fetchBeginTimestamp: " +
453                 Long.toString(fetchBeginTimeStamp));
454         }
455
456         return validateMetaLine(createMetaline(uri, hostIP,
457             ArchiveUtils.get14DigitDate(fetchBeginTimeStamp),
458             MimetypeUtils.truncate(contentType),
459             Integer.toString(recordLength)));
460     }
461     
462     public String JavaDoc createMetaline(String JavaDoc uri, String JavaDoc hostIP,
463             String JavaDoc timeStamp, String JavaDoc mimetype, String JavaDoc recordLength) {
464         return uri + HEADER_FIELD_SEPARATOR + hostIP +
465             HEADER_FIELD_SEPARATOR + timeStamp +
466             HEADER_FIELD_SEPARATOR + mimetype +
467             HEADER_FIELD_SEPARATOR + recordLength + LINE_SEPARATOR;
468     }
469     
470     /**
471      * Test that the metadata line is valid before writing.
472      * @param metaLineStr
473      * @throws IOException
474      * @return The passed in metaline.
475      */

476     protected String JavaDoc validateMetaLine(String JavaDoc metaLineStr)
477     throws IOException JavaDoc {
478         if (metaLineStr.length() > MAX_METADATA_LINE_LENGTH) {
479             throw new IOException JavaDoc("Metadata line length is " +
480                 metaLineStr.length() + " which is > than maximum " +
481                 MAX_METADATA_LINE_LENGTH);
482         }
483         Matcher JavaDoc m = METADATA_LINE_PATTERN.matcher(metaLineStr);
484         if (!m.matches()) {
485             throw new IOException JavaDoc("Metadata line doesn't match expected" +
486                 " pattern: " + metaLineStr);
487         }
488         return metaLineStr;
489     }
490 }
491
Popular Tags