KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > warc > ExperimentalWARCWriterTest


1 /*
2  * ExperimentalWARCWriterTest
3  *
4  * $Id: ExperimentalWARCWriterTest.java,v 1.12 2006/08/30 02:35:48 stack-sf Exp $
5  *
6  * Created on July 27th, 2006
7  *
8  * Copyright (C) 2006 Internet Archive.
9  *
10  * This file is part of the Heritrix web crawler (crawler.archive.org).
11  *
12  * Heritrix is free software; you can redistribute it and/or modify
13  * it under the terms of the GNU Lesser Public License as published by
14  * the Free Software Foundation; either version 2.1 of the License, or
15  * any later version.
16  *
17  * Heritrix is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20  * GNU Lesser Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser Public License
23  * along with Heritrix; if not, write to the Free Software
24  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  */

26 package org.archive.io.warc;
27
28 import java.io.ByteArrayInputStream JavaDoc;
29 import java.io.ByteArrayOutputStream JavaDoc;
30 import java.io.File JavaDoc;
31 import java.io.FileNotFoundException JavaDoc;
32 import java.io.IOException JavaDoc;
33 import java.net.URI JavaDoc;
34 import java.net.URISyntaxException JavaDoc;
35 import java.util.Arrays JavaDoc;
36 import java.util.Iterator JavaDoc;
37 import java.util.List JavaDoc;
38 import java.util.concurrent.atomic.AtomicInteger JavaDoc;
39
40 import org.archive.io.ArchiveRecord;
41 import org.archive.io.ArchiveRecordHeader;
42 import org.archive.io.UTF8Bytes;
43 import org.archive.io.WriterPoolMember;
44 import org.archive.uid.GeneratorFactory;
45 import org.archive.util.ArchiveUtils;
46 import org.archive.util.TmpDirTestCase;
47 import org.archive.util.anvl.ANVLRecord;
48
49 /**
50  * Test Writer and Reader.
51  * @author stack
52  * @version $Date: 2006/08/30 02:35:48 $ $Version$
53  */

54 public class ExperimentalWARCWriterTest
55 extends TmpDirTestCase implements WARCConstants {
56     private static final AtomicInteger JavaDoc SERIAL_NO = new AtomicInteger JavaDoc();
57     
58     /**
59      * Prefix to use for ARC files made by JUNIT.
60      */

61     private static final String JavaDoc PREFIX = "IAH";
62     
63     private static final String JavaDoc SOME_URL = "http://www.archive.org/test/";
64     
65     public void testCheckHeaderLineValue() throws Exception JavaDoc {
66         ExperimentalWARCWriter writer = new ExperimentalWARCWriter();
67         writer.checkHeaderLineParameters("one");
68         IOException JavaDoc exception = null;
69         try {
70             writer.checkHeaderLineParameters("with space");
71         } catch(IOException JavaDoc e) {
72             exception = e;
73         }
74        assertNotNull(exception);
75        exception = null;
76        try {
77            writer.checkHeaderLineParameters("with\0x0000controlcharacter");
78        } catch(IOException JavaDoc e) {
79            exception = e;
80        }
81       assertNotNull(exception);
82     }
83
84     public void testMimetypes() throws IOException JavaDoc {
85         ExperimentalWARCWriter writer = new ExperimentalWARCWriter();
86         writer.checkHeaderLineMimetypeParameter("text/xml");
87         writer.checkHeaderLineMimetypeParameter("text/xml+rdf");
88         writer.checkHeaderLineMimetypeParameter(
89             "text/plain; charset=SHIFT-JIS");
90         System.out.println(writer.checkHeaderLineMimetypeParameter(
91             "multipart/mixed; \r\n boundary=\"simple boundary\""));
92     }
93     
94     public void testWriteRecord() throws IOException JavaDoc {
95         File JavaDoc [] files = {getTmpDir()};
96         
97         // Write uncompressed.
98
ExperimentalWARCWriter writer =
99             new ExperimentalWARCWriter(SERIAL_NO, Arrays.asList(files),
100                     this.getClass().getName(), "suffix", false, -1, null);
101         writeFile(writer);
102         
103         // Write compressed.
104
writer = new ExperimentalWARCWriter(SERIAL_NO, Arrays.asList(files),
105                 this.getClass().getName(), "suffix", true, -1, null);
106         writeFile(writer);
107     }
108     
109     private void writeFile(final ExperimentalWARCWriter writer)
110     throws IOException JavaDoc {
111         try {
112             writeWarcinfoRecord(writer);
113             writeBasicRecords(writer);
114         } finally {
115             writer.close();
116             writer.getFile().delete();
117         }
118     }
119     
120     private void writeWarcinfoRecord(ExperimentalWARCWriter writer)
121     throws IOException JavaDoc {
122         ANVLRecord meta = new ANVLRecord();
123         meta.addLabelValue("size", "1G");
124         meta.addLabelValue("operator", "igor");
125         byte [] bytes = meta.getUTF8Bytes();
126         writer.writeWarcinfoRecord(ANVLRecord.MIMETYPE, null,
127             new ByteArrayInputStream JavaDoc(bytes), bytes.length);
128     }
129
130     protected void writeBasicRecords(final ExperimentalWARCWriter writer)
131     throws IOException JavaDoc {
132         ANVLRecord headerFields = new ANVLRecord();
133         headerFields.addLabelValue("x", "y");
134         headerFields.addLabelValue("a", "b");
135         
136         URI JavaDoc rid = null;
137         try {
138             rid = GeneratorFactory.getFactory().
139                 getQualifiedRecordID(TYPE, METADATA);
140         } catch (URISyntaxException JavaDoc e) {
141             // Convert to IOE so can let it out.
142
throw new IOException JavaDoc(e.getMessage());
143         }
144         final String JavaDoc content = "Any old content.";
145         for (int i = 0; i < 10; i++) {
146             String JavaDoc body = i + ". " + content;
147             byte [] bodyBytes = body.getBytes(UTF8Bytes.UTF8);
148             writer.writeRecord(METADATA, "http://www.archive.org/",
149                 ArchiveUtils.get14DigitDate(), "no/type",
150                 rid, headerFields, new ByteArrayInputStream JavaDoc(bodyBytes),
151                 (long)bodyBytes.length);
152         }
153     }
154
155     /**
156      * @return Generic HTML Content.
157      */

158     protected static String JavaDoc getContent() {
159         return getContent(null);
160     }
161     
162     /**
163      * @return Generic HTML Content with mention of passed <code>indexStr</code>
164      * in title and body.
165      */

166     protected static String JavaDoc getContent(String JavaDoc indexStr) {
167         String JavaDoc page = (indexStr != null)? "Page #" + indexStr: "Some Page";
168         return "HTTP/1.1 200 OK\r\n" +
169         "Content-Type: text/html\r\n\r\n" +
170         "<html><head><title>" + page +
171         "</title></head>" +
172         "<body>" + page +
173         "</body></html>";
174     }
175
176     /**
177      * Write random HTML Record.
178      * @param w Where to write.
179      * @param index An index to put into content.
180      * @return Length of record written.
181      * @throws IOException
182      */

183     protected int writeRandomHTTPRecord(ExperimentalWARCWriter w, int index)
184     throws IOException JavaDoc {
185         ByteArrayOutputStream JavaDoc baos = new ByteArrayOutputStream JavaDoc();
186         String JavaDoc indexStr = Integer.toString(index);
187         byte[] record = (getContent(indexStr)).getBytes();
188         int recordLength = record.length;
189         baos.write(record);
190         // Add named fields for ip, checksum, and relate the metadata
191
// and request to the resource field.
192
ANVLRecord r = new ANVLRecord(1);
193         r.addLabelValue(NAMED_FIELD_IP_LABEL, "127.0.0.1");
194         w.writeResourceRecord(
195             "http://www.one.net/id=" + indexStr,
196             ArchiveUtils.get14DigitDate(),
197             "text/html; charset=UTF-8",
198             r,
199             new ByteArrayInputStream JavaDoc(baos.toByteArray()),
200             recordLength);
201         return recordLength;
202     }
203
204     /**
205      * Fill a WARC with HTML Records.
206      * @param baseName WARC basename.
207      * @param compress Whether to compress or not.
208      * @param maxSize Maximum WARC size.
209      * @param recordCount How many records.
210      * @return The written file.
211      * @throws IOException
212      */

213     private File JavaDoc writeRecords(String JavaDoc baseName, boolean compress,
214         int maxSize, int recordCount)
215     throws IOException JavaDoc {
216         cleanUpOldFiles(baseName);
217         File JavaDoc [] files = {getTmpDir()};
218         ExperimentalWARCWriter w = new ExperimentalWARCWriter(SERIAL_NO,
219             Arrays.asList(files), baseName + '-' + PREFIX, "", compress,
220             maxSize, null);
221         assertNotNull(w);
222         for (int i = 0; i < recordCount; i++) {
223             writeRandomHTTPRecord(w, i);
224         }
225         w.close();
226         assertTrue("Doesn't exist: " + w.getFile().getAbsolutePath(),
227             w.getFile().exists());
228         return w.getFile();
229     }
230
231     /**
232      * Run validation of passed file.
233      * @param f File to validate.
234      * @param recordCount Expected count of records.
235      * @throws FileNotFoundException
236      * @throws IOException
237      */

238     private void validate(File JavaDoc f, int recordCount)
239     throws FileNotFoundException JavaDoc, IOException JavaDoc {
240         WARCReader reader = WARCReaderFactory.get(f);
241         assertNotNull(reader);
242         List JavaDoc headers = null;
243         if (recordCount == -1) {
244             headers = reader.validate();
245         } else {
246             headers = reader.validate(recordCount);
247         }
248         reader.close();
249         
250         // Now, run through each of the records doing absolute get going from
251
// the end to start. Reopen the arc so no context between this test
252
// and the previous.
253
reader = WARCReaderFactory.get(f);
254         for (int i = headers.size() - 1; i >= 0; i--) {
255             ArchiveRecordHeader h = (ArchiveRecordHeader)headers.get(i);
256             ArchiveRecord r = reader.get(h.getOffset());
257             String JavaDoc mimeType = r.getHeader().getMimetype();
258             assertTrue("Record is bogus",
259                 mimeType != null && mimeType.length() > 0);
260         }
261         reader.close();
262         
263         assertTrue("Metadatas not equal", headers.size() == recordCount);
264         for (Iterator JavaDoc i = headers.iterator(); i.hasNext();) {
265             ArchiveRecordHeader r = (ArchiveRecordHeader)i.next();
266             assertTrue("Record is empty", r.getLength() > 0);
267         }
268     }
269
270     public void testWriteRecords() throws IOException JavaDoc {
271         final int recordCount = 2;
272         File JavaDoc f = writeRecords("writeRecord", false, DEFAULT_MAX_WARC_FILE_SIZE,
273             recordCount);
274         validate(f, recordCount + 1); // Header record.
275
}
276
277     public void testRandomAccess() throws IOException JavaDoc {
278         final int recordCount = 3;
279         File JavaDoc f = writeRecords("writeRecord", true, DEFAULT_MAX_WARC_FILE_SIZE,
280             recordCount);
281         WARCReader reader = WARCReaderFactory.get(f);
282         // Get to second record. Get its offset for later use.
283
boolean readFirst = false;
284         String JavaDoc url = null;
285         long offset = -1;
286         long totalRecords = 0;
287         boolean readSecond = false;
288         for (final Iterator JavaDoc i = reader.iterator(); i.hasNext();
289                 totalRecords++) {
290             WARCRecord ar = (WARCRecord)i.next();
291             if (!readFirst) {
292                 readFirst = true;
293                 continue;
294             }
295             if (!readSecond) {
296                 url = ar.getHeader().getUrl();
297                 offset = ar.getHeader().getOffset();
298                 readSecond = true;
299             }
300         }
301         
302         reader = WARCReaderFactory.get(f, offset);
303         ArchiveRecord ar = reader.get();
304         assertEquals(ar.getHeader().getUrl(), url);
305         ar.close();
306         
307         // Get reader again. See how iterator works with offset
308
reader = WARCReaderFactory.get(f, offset);
309         int count = 0;
310         for (final Iterator JavaDoc i = reader.iterator(); i.hasNext(); i.next()) {
311             count++;
312         }
313         reader.close();
314         assertEquals(totalRecords - 1, count);
315     }
316     
317     public void testWriteRecordCompressed() throws IOException JavaDoc {
318         final int recordCount = 2;
319         File JavaDoc arcFile = writeRecords("writeRecordCompressed", true,
320             DEFAULT_MAX_WARC_FILE_SIZE, recordCount);
321         validate(arcFile, recordCount + 1 /*Header record*/);
322     }
323     
324     protected ExperimentalWARCWriter createWARCWriter(String JavaDoc NAME,
325             boolean compress) {
326         File JavaDoc [] files = {getTmpDir()};
327         return new ExperimentalWARCWriter(SERIAL_NO,
328             Arrays.asList(files), NAME, "",
329             compress, DEFAULT_MAX_WARC_FILE_SIZE, null);
330     }
331     
332     protected static ByteArrayOutputStream JavaDoc getBaos(String JavaDoc str)
333     throws IOException JavaDoc {
334         ByteArrayOutputStream JavaDoc baos = new ByteArrayOutputStream JavaDoc();
335         baos.write(str.getBytes());
336         return baos;
337     }
338     
339     protected static void writeRecord(ExperimentalWARCWriter w, String JavaDoc url,
340         String JavaDoc mimetype, int len, ByteArrayOutputStream JavaDoc baos)
341     throws IOException JavaDoc {
342         w.writeResourceRecord(url,
343             ArchiveUtils.get14DigitDate(),
344             mimetype,
345             null,
346             new ByteArrayInputStream JavaDoc(baos.toByteArray()),
347             len);
348     }
349     
350     protected int iterateRecords(WARCReader r)
351     throws IOException JavaDoc {
352         int count = 0;
353         for (Iterator JavaDoc<ArchiveRecord> i = r.iterator(); i.hasNext();) {
354             ArchiveRecord ar = i.next();
355             ar.close();
356             if (count != 0) {
357                 assertTrue("Unexpected URL " + ar.getHeader().getUrl(),
358                     ar.getHeader().getUrl().equals(SOME_URL));
359             }
360             count++;
361         }
362         return count;
363     }
364     
365     protected ExperimentalWARCWriter createWithOneRecord(String JavaDoc name,
366         boolean compressed)
367     throws IOException JavaDoc {
368         ExperimentalWARCWriter writer = createWARCWriter(name, compressed);
369         String JavaDoc content = getContent();
370         writeRecord(writer, SOME_URL, "text/html",
371             content.length(), getBaos(content));
372         return writer;
373     }
374     
375     public void testSpaceInURL() {
376         String JavaDoc eMessage = null;
377         try {
378             holeyUrl("testSpaceInURL-" + PREFIX, false, " ");
379         } catch (IOException JavaDoc e) {
380             eMessage = e.getMessage();
381         }
382         assertTrue("Didn't get expected exception: " + eMessage,
383             eMessage.startsWith("Contains disallowed"));
384     }
385
386     public void testTabInURL() {
387         String JavaDoc eMessage = null;
388         try {
389             holeyUrl("testTabInURL-" + PREFIX, false, "\t");
390         } catch (IOException JavaDoc e) {
391             eMessage = e.getMessage();
392         }
393         assertTrue("Didn't get expected exception: " + eMessage,
394             eMessage.startsWith("Contains illegal"));
395     }
396     
397     protected void holeyUrl(String JavaDoc name, boolean compress, String JavaDoc urlInsert)
398     throws IOException JavaDoc {
399         ExperimentalWARCWriter writer = createWithOneRecord(name, compress);
400         // Add some bytes on the end to mess up the record.
401
String JavaDoc content = getContent();
402         ByteArrayOutputStream JavaDoc baos = getBaos(content);
403         writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html",
404             content.length(), baos);
405         writer.close();
406     }
407     
408     /**
409      * Write an arc file for other tests to use.
410      * @param arcdir Directory to write to.
411      * @param compress True if file should be compressed.
412      * @return ARC written.
413      * @throws IOException
414      */

415     public static File JavaDoc createWARCFile(File JavaDoc arcdir, boolean compress)
416     throws IOException JavaDoc {
417         File JavaDoc [] files = {arcdir};
418         ExperimentalWARCWriter writer =
419             new ExperimentalWARCWriter(SERIAL_NO, Arrays.asList(files),
420             "test", "", compress, DEFAULT_MAX_WARC_FILE_SIZE, null);
421         String JavaDoc content = getContent();
422         writeRecord(writer, SOME_URL, "text/html", content.length(),
423             getBaos(content));
424         writer.close();
425         return writer.getFile();
426     }
427     
// Disabled speed test, kept for reference. It refers to ARC-era names
// (ARCWriter, createArcWithOneRecord) and to a logger, none of which are
// declared in this file, so it needs porting before it can be re-enabled.
//
// public void testSpeed() throws IOException {
//     ARCWriter writer = createArcWithOneRecord("speed", true);
//     // Add a record with a length that is too long.
//     String content = getContent();
//     final int count = 100000;
//     logger.info("Starting speed write of " + count + " records.");
//     for (int i = 0; i < count; i++) {
//         writeRecord(writer, SOME_URL, "text/html", content.length(),
//             getBaos(content));
//     }
//     writer.close();
//     logger.info("Finished speed write test.");
// }
441

442     public void testArcRecordOffsetReads() throws Exception JavaDoc {
443         // Get an ARC with one record.
444
WriterPoolMember w =
445             createWithOneRecord("testArcRecordInBufferStream", true);
446         w.close();
447         // Get reader on said ARC.
448
WARCReader r = WARCReaderFactory.get(w.getFile());
449         final Iterator JavaDoc<ArchiveRecord> i = r.iterator();
450         // Skip first ARC meta record.
451
ArchiveRecord ar = i.next();
452         i.hasNext();
453         // Now we're at first and only record in ARC.
454
ar = (WARCRecord) i.next();
455         // Now try getting some random set of bytes out of it
456
// at an odd offset (used to fail because we were
457
// doing bad math to find where in buffer to read).
458
final byte[] buffer = new byte[17];
459         final int maxRead = 4;
460         int totalRead = 0;
461         while (totalRead < maxRead) {
462             totalRead = totalRead
463                 + ar.read(buffer, 13 + totalRead, maxRead - totalRead);
464             assertTrue(totalRead > 0);
465         }
466     }
467 }
Popular Tags