KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > warc > ExperimentalWARCWriter


1 /* $Id: ExperimentalWARCWriter.java,v 1.21 2006/09/06 05:38:18 stack-sf Exp $
2  *
3  * Created on July 27th, 2006
4  *
5  * Copyright (C) 2006 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.io.warc;
24
25 import java.io.ByteArrayInputStream JavaDoc;
26 import java.io.ByteArrayOutputStream JavaDoc;
27 import java.io.File JavaDoc;
28 import java.io.IOException JavaDoc;
29 import java.io.InputStream JavaDoc;
30 import java.io.OutputStream JavaDoc;
31 import java.net.URI JavaDoc;
32 import java.net.URISyntaxException JavaDoc;
33 import java.text.DecimalFormat JavaDoc;
34 import java.text.NumberFormat JavaDoc;
35 import java.util.Iterator JavaDoc;
36 import java.util.List JavaDoc;
37 import java.util.Map JavaDoc;
38 import java.util.concurrent.atomic.AtomicInteger JavaDoc;
39
40 import org.archive.io.UTF8Bytes;
41 import org.archive.io.WriterPoolMember;
42 import org.archive.uid.GeneratorFactory;
43 import org.archive.util.ArchiveUtils;
44 import org.archive.util.anvl.ANVLRecord;
45
46
47 /**
48  * <b>Experimental</b> WARC implementation.
49  *
50  * Based on unreleased version 0.9 of <a
51  * HREF="http://archive-access.sourceforge.net//warc/warc_file_format.html">WARC
52  * File Format</a> document. Specification and implementation subject to
53  * change.
54  *
55  * <p>Assumption is that the caller is managing access to this
56  * ExperimentalWARCWriter ensuring only one thread accessing this WARC instance
57  * at any one time.
58  *
59  * <p>While being written, WARCs have a '.open' suffix appended.
60  *
61  * @author stack
62  * @version $Revision: 1.21 $ $Date: 2006/09/06 05:38:18 $
63  */

64 public class ExperimentalWARCWriter extends WriterPoolMember
65 implements WARCConstants {
66     /**
67      * Buffer to reuse writing streams.
68      */

69     private final byte [] readbuffer = new byte[16 * 1024];
70     
/**
 * <code>CRLF</code> encoded as bytes using <code>DEFAULT_ENCODING</code>.
 * Written twice at the end of every record by <code>writeRecord</code>.
 */
public static byte[] CRLF_BYTES;
static {
    try {
        CRLF_BYTES = CRLF.getBytes(DEFAULT_ENCODING);
    } catch (Exception e) {
        // Do not carry on with a null constant: every later write of
        // CRLF_BYTES would fail far from the real cause. Abort class
        // initialization and keep the original exception attached.
        throw new ExceptionInInitializerError(e);
    }
}
82     
83     /**
84      * Formatter for the length.
85      */

86     private static NumberFormat JavaDoc RECORD_LENGTH_FORMATTER =
87         new DecimalFormat JavaDoc(PLACEHOLDER_RECORD_LENGTH_STRING);
88     
89     /**
90      * Metadata.
91      * TODO: Exploit writing warcinfo record. Currently unused.
92      */

93     private final List JavaDoc fileMetadata;
94     
95     
/**
 * Default-access constructor so an instance can be made to test the
 * utility methods.
 *
 * <p>Delegates to the directory-based constructor with no serial number,
 * no directories, empty prefix and suffix, compression enabled, a maximum
 * size of -1 and no warcinfo metadata.
 */
ExperimentalWARCWriter() {
    this(null, null, "", "", true, -1, null);
}
103     
/**
 * Constructor that writes to a caller-supplied stream.
 *
 * <p>Use with caution: per the original author there is no upper bound
 * check on size when writing to a stream, so only pass streams that are
 * bounded.
 *
 * @param serialNo Used to generate unique file name sequences.
 * @param out Where to write.
 * @param f File the <code>out</code> is connected to.
 * @param cmprs Compress the content written.
 * @param a14DigitDate If null, we'll write current time.
 * @param warcinfoData Metadata kept for the body of warcinfo records;
 * may be <code>null</code>.
 * @throws IOException
 */
public ExperimentalWARCWriter(
        final AtomicInteger serialNo,
        final OutputStream out,
        final File f,
        final boolean cmprs,
        final String a14DigitDate,
        final List warcinfoData)
throws IOException {
    super(serialNo, out, f, cmprs, a14DigitDate);
    this.fileMetadata = warcinfoData;
}
124             
/**
 * Constructor that creates its own files in the given directories.
 * Files are created with the <code>WARC_FILE_EXTENSION</code> extension.
 *
 * @param serialNo Used to generate unique file name sequences.
 * @param dirs Where to drop files.
 * @param prefix File prefix to use.
 * @param suffix File tail to use. If null, unused.
 * @param cmprs Compress the records written.
 * @param maxSize Maximum size for files written.
 * @param warcinfoData File metadata for warcinfo record; may be
 * <code>null</code>.
 */
public ExperimentalWARCWriter(
        final AtomicInteger serialNo,
        final List<File> dirs,
        final String prefix,
        final String suffix,
        final boolean cmprs,
        final int maxSize,
        final List warcinfoData) {
    super(serialNo, dirs, prefix, suffix, cmprs, maxSize,
        WARC_FILE_EXTENSION);
    this.fileMetadata = warcinfoData;
}
144     
145     @Override JavaDoc
146     protected String JavaDoc createFile(File JavaDoc file) throws IOException JavaDoc {
147         String JavaDoc filename = super.createFile(file);
148         writeWarcinfoRecord(filename);
149         return filename;
150     }
151     
152     protected void baseCharacterCheck(final char c, final String JavaDoc parameter)
153     throws IOException JavaDoc {
154         // TODO: Too strict? UNICODE control characters?
155
if (Character.isISOControl(c) || !Character.isValidCodePoint(c)) {
156             throw new IOException JavaDoc("Contains illegal character 0x" +
157                 Integer.toHexString(c) + ": " + parameter);
158         }
159     }
160     
161     protected String JavaDoc checkHeaderLineParameters(final String JavaDoc parameter)
162     throws IOException JavaDoc {
163         for (int i = 0; i < parameter.length(); i++) {
164             final char c = parameter.charAt(i);
165             baseCharacterCheck(c, parameter);
166             if (Character.isWhitespace(c)) {
167                 throw new IOException JavaDoc("Contains disallowed white space 0x" +
168                     Integer.toHexString(c) + ": " + parameter);
169             }
170         }
171         return parameter;
172     }
173     
174     protected String JavaDoc checkHeaderLineMimetypeParameter(final String JavaDoc parameter)
175     throws IOException JavaDoc {
176         StringBuilder JavaDoc sb = new StringBuilder JavaDoc(parameter.length());
177         boolean wasWhitespace = false;
178         for (int i = 0; i < parameter.length(); i++) {
179             char c = parameter.charAt(i);
180             if (Character.isWhitespace(c)) {
181                 // Map all to ' ' and collapse multiples into one.
182
// TODO: Make sure white space occurs in legal location --
183
// before parameter or inside quoted-string.
184
if (wasWhitespace) {
185                     continue;
186                 }
187                 wasWhitespace = true;
188                 c = ' ';
189             } else {
190                 wasWhitespace = false;
191                 baseCharacterCheck(c, parameter);
192             }
193             sb.append(c);
194         }
195         
196         return sb.toString();
197     }
198
199     protected byte [] createRecordHeaderline(final String JavaDoc type,
200             final String JavaDoc url, final String JavaDoc create14DigitDate,
201             final String JavaDoc mimetype, final URI JavaDoc recordId,
202             final int namedFieldsLength, final long contentLength)
203     throws IOException JavaDoc {
204         final StringBuilder JavaDoc sb =
205             new StringBuilder JavaDoc(2048/*A SWAG: TODO: Do analysis.*/);
206         sb.append(WARC_ID);
207         sb.append(HEADER_FIELD_SEPARATOR);
208         sb.append(PLACEHOLDER_RECORD_LENGTH_STRING);
209         sb.append(HEADER_FIELD_SEPARATOR);
210         sb.append(type);
211         sb.append(HEADER_FIELD_SEPARATOR);
212         sb.append(checkHeaderLineParameters(url));
213         sb.append(HEADER_FIELD_SEPARATOR);
214         sb.append(checkHeaderLineParameters(create14DigitDate));
215         sb.append(HEADER_FIELD_SEPARATOR);
216         // 0.9 of spec. has mimetype second-to-last and recordid last on
217
// header line. Here we swap their positions and allow writing
218
// of full mimetypes rather than the curtailed type we used write into
219
// ARCs. These two deviations to be proposed as amendments to spec 0.9.
220
sb.append(checkHeaderLineParameters(recordId.toString()));
221         sb.append(HEADER_FIELD_SEPARATOR);
222         sb.append(checkHeaderLineMimetypeParameter(mimetype));
223         // Add terminating CRLF.
224
sb.append(CRLF);
225         
226         long length = sb.length() + namedFieldsLength + contentLength;
227         
228         // Insert length and pad out to fixed width with zero prefix to
229
// highlight 'fixed-widthness' of length.
230
int start = WARC_ID.length() + 1 /*HEADER_FIELD_SEPARATOR */;
231         int end = start + PLACEHOLDER_RECORD_LENGTH_STRING.length();
232         String JavaDoc lenStr = RECORD_LENGTH_FORMATTER.format(length);
233         sb.replace(start, end, lenStr);
234
235         return sb.toString().getBytes(HEADER_LINE_ENCODING);
236     }
237
238     protected void writeRecord(final String JavaDoc type, final String JavaDoc url,
239             final String JavaDoc create14DigitDate, final String JavaDoc mimetype,
240             final URI JavaDoc recordId, ANVLRecord namedFields,
241             final InputStream JavaDoc contentStream, final long contentLength)
242     throws IOException JavaDoc {
243         if (!TYPES_LIST.contains(type)) {
244             throw new IllegalArgumentException JavaDoc("Unknown record type: " + type);
245         }
246         if (contentLength == 0 &&
247                 (namedFields == null || namedFields.size() <= 0)) {
248             throw new IllegalArgumentException JavaDoc("Cannot have a record made " +
249                 "of a Header line only (Content and Named Fields are empty).");
250         }
251         
252         preWriteRecordTasks();
253         try {
254             if (namedFields == null) {
255                 // Use the empty anvl record so the length of blank line on
256
// end gets counted as part of the record length.
257
namedFields = ANVLRecord.EMPTY_ANVL_RECORD;
258             }
259             
260             // Serialize metadata first so we have metadata length.
261
final byte [] namedFieldsBlock = namedFields.getUTF8Bytes();
262             // Now serialize the Header line.
263
final byte [] header = createRecordHeaderline(type, url,
264                 create14DigitDate, mimetype, recordId, namedFieldsBlock.length,
265                 contentLength);
266             write(header);
267             write(namedFieldsBlock);
268             if (contentStream != null && contentLength > 0) {
269                 readFullyFrom(contentStream, contentLength, this.readbuffer);
270             }
271             
272             // Write out the two blank lines at end of all records.
273
// TODO: Why? Messes up skipping through file. Also not in grammar.
274
write(CRLF_BYTES);
275             write(CRLF_BYTES);
276         } finally {
277             postWriteRecordTasks();
278         }
279     }
280     
281     protected URI JavaDoc generateRecordId(final Map JavaDoc<String JavaDoc, String JavaDoc> qualifiers)
282     throws IOException JavaDoc {
283         URI JavaDoc rid = null;
284         try {
285             rid = GeneratorFactory.getFactory().
286                 getQualifiedRecordID(qualifiers);
287         } catch (URISyntaxException JavaDoc e) {
288             // Convert to IOE so can let it out.
289
throw new IOException JavaDoc(e.getMessage());
290         }
291         return rid;
292     }
293     
294     protected URI JavaDoc generateRecordId(final String JavaDoc key, final String JavaDoc value)
295     throws IOException JavaDoc {
296         URI JavaDoc rid = null;
297         try {
298             rid = GeneratorFactory.getFactory().
299                 getQualifiedRecordID(key, value);
300         } catch (URISyntaxException JavaDoc e) {
301             // Convert to IOE so can let it out.
302
throw new IOException JavaDoc(e.getMessage());
303         }
304         return rid;
305     }
306     
307     public URI JavaDoc writeWarcinfoRecord(String JavaDoc filename)
308     throws IOException JavaDoc {
309         return writeWarcinfoRecord(filename, null);
310     }
311     
312     public URI JavaDoc writeWarcinfoRecord(String JavaDoc filename, final String JavaDoc description)
313             throws IOException JavaDoc {
314         // Strip .open suffix if present.
315
if (filename.endsWith(WriterPoolMember.OCCUPIED_SUFFIX)) {
316             filename = filename.substring(0,
317                 filename.length() - WriterPoolMember.OCCUPIED_SUFFIX.length());
318         }
319         ANVLRecord record = new ANVLRecord(2);
320         record.addLabelValue(NAMED_FIELD_WARCFILENAME, filename);
321         if (description != null && description.length() > 0) {
322             record.addLabelValue(NAMED_FIELD_DESCRIPTION, description);
323         }
324         // Add warcinfo body.
325
byte [] warcinfoBody = null;
326         if (this.fileMetadata == null) {
327             // TODO: What to write into a warcinfo? What to associate?
328
warcinfoBody = "TODO: Unimplemented".getBytes();
329         } else {
330             ByteArrayOutputStream JavaDoc baos = new ByteArrayOutputStream JavaDoc();
331             for (final Iterator JavaDoc i = this.fileMetadata.iterator();
332                     i.hasNext();) {
333                 baos.write(i.next().toString().getBytes(UTF8Bytes.UTF8));
334             }
335             warcinfoBody = baos.toByteArray();
336         }
337         URI JavaDoc uri = writeWarcinfoRecord("text/plain", record,
338             new ByteArrayInputStream JavaDoc(warcinfoBody), warcinfoBody.length);
339         // TODO: If at start of file, and we're writing compressed,
340
// write out our distinctive GZIP extensions.
341
return uri;
342     }
343     
344     /**
345      * Write a warcinfo to current file.
346      * TODO: Write crawl metadata or pointers to crawl description.
347      * @param mimetype Mimetype of the <code>fileMetadata</code> block.
348      * @param namedFields Named fields. Pass <code>null</code> if none.
349      * @param fileMetadata Metadata about this WARC as RDF, ANVL, etc.
350      * @param fileMetadataLength Length of <code>fileMetadata</code>.
351      * @throws IOException
352      * @return Generated record-id made with
353      * <a HREF="http://en.wikipedia.org/wiki/Data:_URL">data: scheme</a> and
354      * the current filename.
355      */

356     public URI JavaDoc writeWarcinfoRecord(final String JavaDoc mimetype,
357         final ANVLRecord namedFields, final InputStream JavaDoc fileMetadata,
358         final long fileMetadataLength)
359     throws IOException JavaDoc {
360         final URI JavaDoc recordid = generateRecordId(TYPE, WARCINFO);
361         writeWarcinfoRecord(ArchiveUtils.get14DigitDate(), mimetype, recordid,
362             namedFields, fileMetadata, fileMetadataLength);
363         return recordid;
364     }
365     
366     /**
367      * Write a <code>warcinfo</code> to current file.
368      * The <code>warcinfo</code> type uses its <code>recordId</code> as its URL.
369      * @param recordId URI to use for this warcinfo.
370      * @param create14DigitDate Record creation date as 14 digit date.
371      * @param mimetype Mimetype of the <code>fileMetadata</code>.
372      * @param namedFields Named fields.
373      * @param fileMetadata Metadata about this WARC as RDF, ANVL, etc.
374      * @param fileMetadataLength Length of <code>fileMetadata</code>.
375      * @throws IOException
376      */

377     public void writeWarcinfoRecord(final String JavaDoc create14DigitDate,
378         final String JavaDoc mimetype, final URI JavaDoc recordId, final ANVLRecord namedFields,
379         final InputStream JavaDoc fileMetadata, final long fileMetadataLength)
380     throws IOException JavaDoc {
381         writeRecord(WARCINFO, recordId.toString(), create14DigitDate, mimetype,
382             recordId, namedFields, fileMetadata, fileMetadataLength);
383     }
384     
385     public void writeRequestRecord(final String JavaDoc url,
386         final String JavaDoc create14DigitDate, final String JavaDoc mimetype,
387         final URI JavaDoc recordId,
388         final ANVLRecord namedFields, final InputStream JavaDoc request,
389         final long requestLength)
390     throws IOException JavaDoc {
391         writeRecord(REQUEST, url, create14DigitDate,
392             mimetype, recordId, namedFields, request,
393             requestLength);
394     }
395     
396     public void writeResourceRecord(final String JavaDoc url,
397             final String JavaDoc create14DigitDate, final String JavaDoc mimetype,
398             final ANVLRecord namedFields, final InputStream JavaDoc response,
399             final long responseLength)
400     throws IOException JavaDoc {
401         writeResourceRecord(url, create14DigitDate, mimetype, getRecordID(),
402                 namedFields, response, responseLength);
403     }
404     
405     public void writeResourceRecord(final String JavaDoc url,
406             final String JavaDoc create14DigitDate, final String JavaDoc mimetype,
407             final URI JavaDoc recordId,
408             final ANVLRecord namedFields, final InputStream JavaDoc response,
409             final long responseLength)
410     throws IOException JavaDoc {
411         writeRecord(RESOURCE, url, create14DigitDate,
412             mimetype, recordId, namedFields, response,
413             responseLength);
414     }
415
416     public void writeResponseRecord(final String JavaDoc url,
417             final String JavaDoc create14DigitDate, final String JavaDoc mimetype,
418             final URI JavaDoc recordId,
419             final ANVLRecord namedFields, final InputStream JavaDoc response,
420             final long responseLength)
421     throws IOException JavaDoc {
422         writeRecord(RESPONSE, url, create14DigitDate,
423             mimetype, recordId, namedFields, response,
424             responseLength);
425     }
426     
427     public void writeMetadataRecord(final String JavaDoc url,
428             final String JavaDoc create14DigitDate, final String JavaDoc mimetype,
429             final URI JavaDoc recordId,
430             final ANVLRecord namedFields, final InputStream JavaDoc metadata,
431             final long metadataLength)
432     throws IOException JavaDoc {
433         writeRecord(METADATA, url, create14DigitDate,
434             mimetype, recordId, namedFields, metadata,
435             metadataLength);
436     }
437     
438     /**
439      * Convenience method for getting Record-Ids.
440      * @return A record ID.
441      * @throws IOException
442      */

443     public static URI JavaDoc getRecordID() throws IOException JavaDoc {
444         URI JavaDoc result;
445         try {
446             result = GeneratorFactory.getFactory().getRecordID();
447         } catch (URISyntaxException JavaDoc e) {
448             throw new IOException JavaDoc(e.toString());
449         }
450         return result;
451     }
452 }
453
Popular Tags