KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > arc > ARCWriterTest


1 /* ARCWriterTest
2  *
3  * $Id: ARCWriterTest.java,v 1.37 2006/08/25 17:34:38 stack-sf Exp $
4  *
5  * Created on Dec 31, 2003.
6  *
7  * Copyright (C) 2003 Internet Archive.
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  */

25 package org.archive.io.arc;
26
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

import org.archive.io.ArchiveRecord;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.archive.util.TmpDirTestCase;
45
46
47 /**
48  * Test ARCWriter class.
49  *
50  * This code exercises ARCWriter AND ARCReader. First it writes ARCs w/
51  * ARCWriter. Then it validates what was written w/ ARCReader.
52  *
53  * @author stack
54  */

55 public class ARCWriterTest
56 extends TmpDirTestCase implements ARCConstants {
57     /**
58      * Prefix to use for ARC files made by JUNIT.
59      */

60     private static final String JavaDoc PREFIX =
61         /* TODO DEFAULT_ARC_FILE_PREFIX*/ "IAH";
62     
63     private static final String JavaDoc SOME_URL = "http://www.archive.org/test/";
64
65     
66     private static final AtomicInteger JavaDoc SERIAL_NO = new AtomicInteger JavaDoc();
67
68     /*
69      * @see TestCase#setUp()
70      */

71     protected void setUp() throws Exception JavaDoc {
72         super.setUp();
73     }
74
75     /*
76      * @see TestCase#tearDown()
77      */

78     protected void tearDown() throws Exception JavaDoc {
79         super.tearDown();
80     }
81     
82     protected static String JavaDoc getContent() {
83         return getContent(null);
84     }
85     
86     protected static String JavaDoc getContent(String JavaDoc indexStr) {
87         String JavaDoc page = (indexStr != null)? "Page #" + indexStr: "Some Page";
88         return "HTTP/1.1 200 OK\r\n" +
89         "Content-Type: text/html\r\n\r\n" +
90         "<html><head><title>" + page +
91         "</title></head>" +
92         "<body>" + page +
93         "</body></html>";
94     }
95
96     protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index)
97     throws IOException JavaDoc {
98         String JavaDoc indexStr = Integer.toString(index);
99         ByteArrayOutputStream JavaDoc baos = new ByteArrayOutputStream JavaDoc();
100         // Start the record with an arbitrary 14-digit date per RFC2540
101
String JavaDoc now = ArchiveUtils.get14DigitDate();
102         int recordLength = 0;
103         byte[] record = (getContent(indexStr)).getBytes();
104         recordLength += record.length;
105         baos.write(record);
106         // Add the newline between records back in
107
baos.write("\n".getBytes());
108         recordLength += 1;
109         arcWriter.write("http://www.one.net/id=" + indexStr, "text/html",
110             "0.1.2.3", Long.parseLong(now), recordLength, baos);
111         return recordLength;
112     }
113
114     private File JavaDoc writeRecords(String JavaDoc baseName, boolean compress,
115         int maxSize, int recordCount)
116     throws IOException JavaDoc {
117         cleanUpOldFiles(baseName);
118         File JavaDoc [] files = {getTmpDir()};
119         ARCWriter arcWriter = new ARCWriter(SERIAL_NO, Arrays.asList(files),
120             baseName + '-' + PREFIX, compress, maxSize);
121         assertNotNull(arcWriter);
122         for (int i = 0; i < recordCount; i++) {
123             writeRandomHTTPRecord(arcWriter, i);
124         }
125         arcWriter.close();
126         assertTrue("Doesn't exist: " +
127                 arcWriter.getFile().getAbsolutePath(),
128             arcWriter.getFile().exists());
129         return arcWriter.getFile();
130     }
131
132     private void validate(File JavaDoc arcFile, int recordCount)
133     throws FileNotFoundException JavaDoc, IOException JavaDoc {
134         ARCReader reader = ARCReaderFactory.get(arcFile);
135         assertNotNull(reader);
136         List JavaDoc metaDatas = null;
137         if (recordCount == -1) {
138             metaDatas = reader.validate();
139         } else {
140             metaDatas = reader.validate(recordCount);
141         }
142         reader.close();
143         // Now, run through each of the records doing absolute get going from
144
// the end to start. Reopen the arc so no context between this test
145
// and the previous.
146
reader = ARCReaderFactory.get(arcFile);
147         for (int i = metaDatas.size() - 1; i >= 0; i--) {
148             ARCRecordMetaData meta = (ARCRecordMetaData)metaDatas.get(i);
149             ArchiveRecord r = reader.get(meta.getOffset());
150             String JavaDoc mimeType = r.getHeader().getMimetype();
151             assertTrue("Record is bogus",
152                 mimeType != null && mimeType.length() > 0);
153         }
154         reader.close();
155         assertTrue("Metadatas not equal", metaDatas.size() == recordCount);
156         for (Iterator JavaDoc i = metaDatas.iterator(); i.hasNext();) {
157                 ARCRecordMetaData r = (ARCRecordMetaData)i.next();
158                 assertTrue("Record is empty", r.getLength() > 0);
159         }
160     }
161
162     public void testCheckARCFileSize()
163     throws IOException JavaDoc {
164         runCheckARCFileSizeTest("checkARCFileSize", false);
165     }
166
167     public void testCheckARCFileSizeCompressed()
168     throws IOException JavaDoc {
169         runCheckARCFileSizeTest("checkARCFileSize", true);
170     }
171
172     public void testWriteRecord() throws IOException JavaDoc {
173         final int recordCount = 2;
174         File JavaDoc arcFile = writeRecords("writeRecord", false,
175                 DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
176         validate(arcFile, recordCount + 1); // Header record.
177
}
178     
179     public void testRandomAccess() throws IOException JavaDoc {
180         final int recordCount = 3;
181         File JavaDoc arcFile = writeRecords("writeRecord", true,
182             DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
183         ARCReader reader = ARCReaderFactory.get(arcFile);
184         // Get to second record. Get its offset for later use.
185
boolean readFirst = false;
186         String JavaDoc url = null;
187         long offset = -1;
188         long totalRecords = 0;
189         boolean readSecond = false;
190         for (final Iterator JavaDoc i = reader.iterator(); i.hasNext(); totalRecords++) {
191             ARCRecord ar = (ARCRecord)i.next();
192             if (!readFirst) {
193                 readFirst = true;
194                 continue;
195             }
196             if (!readSecond) {
197                 url = ar.getMetaData().getUrl();
198                 offset = ar.getMetaData().getOffset();
199                 readSecond = true;
200             }
201         }
202         
203         reader = ARCReaderFactory.get(arcFile, offset);
204         ArchiveRecord ar = reader.get();
205         assertEquals(ar.getHeader().getUrl(), url);
206         ar.close();
207         
208         // Get reader again. See how iterator works with offset
209
reader = ARCReaderFactory.get(arcFile, offset);
210         int count = 0;
211         for (final Iterator JavaDoc i = reader.iterator(); i.hasNext(); i.next()) {
212             count++;
213         }
214         reader.close();
215         assertEquals(totalRecords - 1, count);
216     }
217
218     public void testWriteRecordCompressed() throws IOException JavaDoc {
219         final int recordCount = 2;
220         File JavaDoc arcFile = writeRecords("writeRecordCompressed", true,
221                 DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
222         validate(arcFile, recordCount + 1 /*Header record*/);
223     }
224     
225     private void runCheckARCFileSizeTest(String JavaDoc baseName, boolean compress)
226     throws FileNotFoundException JavaDoc, IOException JavaDoc {
227         writeRecords(baseName, compress, 1024, 15);
228         // Now validate all files just created.
229
File JavaDoc [] files = FileUtils.getFilesWithPrefix(getTmpDir(), PREFIX);
230         for (int i = 0; i < files.length; i++) {
231             validate(files[i], -1);
232         }
233     }
234     
235     protected ARCWriter createARCWriter(String JavaDoc NAME, boolean compress) {
236         File JavaDoc [] files = {getTmpDir()};
237         return new ARCWriter(SERIAL_NO, Arrays.asList(files), NAME,
238             compress, DEFAULT_MAX_ARC_FILE_SIZE);
239     }
240     
241     protected static ByteArrayOutputStream JavaDoc getBaos(String JavaDoc str)
242     throws IOException JavaDoc {
243         ByteArrayOutputStream JavaDoc baos = new ByteArrayOutputStream JavaDoc();
244         baos.write(str.getBytes());
245         return baos;
246     }
247     
248     protected static void writeRecord(ARCWriter writer, String JavaDoc url,
249         String JavaDoc type, int len, ByteArrayOutputStream JavaDoc baos)
250     throws IOException JavaDoc {
251         writer.write(url, type, "192.168.1.1", (new Date JavaDoc()).getTime(), len,
252             baos);
253     }
254     
255     protected int iterateRecords(ARCReader r)
256     throws IOException JavaDoc {
257         int count = 0;
258         for (Iterator JavaDoc i = r.iterator(); i.hasNext();) {
259             ARCRecord rec = (ARCRecord)i.next();
260             rec.close();
261             if (count != 0) {
262                 assertTrue("Unexpected URL " + rec.getMetaData().getUrl(),
263                     rec.getMetaData().getUrl().equals(SOME_URL));
264             }
265             count++;
266         }
267         return count;
268     }
269     
270     protected ARCWriter createArcWithOneRecord(String JavaDoc name,
271         boolean compressed)
272     throws IOException JavaDoc {
273         ARCWriter writer = createARCWriter(name, compressed);
274         String JavaDoc content = getContent();
275         writeRecord(writer, SOME_URL, "text/html",
276             content.length(), getBaos(content));
277         return writer;
278     }
279     
280     public void testSpaceInURL() {
281         String JavaDoc eMessage = null;
282         try {
283             holeyUrl("testSpaceInURL-" + PREFIX, false, " ");
284         } catch (IOException JavaDoc e) {
285             eMessage = e.getMessage();
286         }
287         assertTrue("Didn't get expected exception: " + eMessage,
288             eMessage.startsWith("Metadata line doesn't match"));
289     }
290
291     public void testTabInURL() {
292         String JavaDoc eMessage = null;
293         try {
294             holeyUrl("testTabInURL-" + PREFIX, false, "\t");
295         } catch (IOException JavaDoc e) {
296             eMessage = e.getMessage();
297         }
298         assertTrue("Didn't get expected exception: " + eMessage,
299             eMessage.startsWith("Metadata line doesn't match"));
300     }
301     
302     protected void holeyUrl(String JavaDoc name, boolean compress, String JavaDoc urlInsert)
303     throws IOException JavaDoc {
304         ARCWriter writer = createArcWithOneRecord(name, compress);
305         // Add some bytes on the end to mess up the record.
306
String JavaDoc content = getContent();
307         ByteArrayOutputStream JavaDoc baos = getBaos(content);
308         writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html",
309             content.length(), baos);
310         writer.close();
311     }
312     
//  If uncompressed, length has to be right or parse will fail.
//
//    public void testLengthTooShort() throws IOException {
//        lengthTooShort("testLengthTooShort-" + PREFIX, false);
//    }

319     public void testLengthTooShortCompressed() throws IOException JavaDoc {
320         lengthTooShort("testLengthTooShortCompressed-" + PREFIX, true, false);
321     }
322     
323     public void testLengthTooShortCompressedStrict()
324     throws IOException JavaDoc {
325         String JavaDoc eMessage = null;
326         try {
327             lengthTooShort("testLengthTooShortCompressedStrict-" + PREFIX,
328                 true, true);
329         } catch (RuntimeException JavaDoc e) {
330             eMessage = e.getMessage();
331         }
332         assertTrue("Didn't get expected exception: " + eMessage,
333             eMessage.startsWith("java.io.IOException: Record ENDING at"));
334     }
335      
336     protected void lengthTooShort(String JavaDoc name, boolean compress, boolean strict)
337     throws IOException JavaDoc {
338         ARCWriter writer = createArcWithOneRecord(name, compress);
339         // Add some bytes on the end to mess up the record.
340
String JavaDoc content = getContent();
341         ByteArrayOutputStream JavaDoc baos = getBaos(content);
342         baos.write("SOME TRAILING BYTES".getBytes());
343         writeRecord(writer, SOME_URL, "text/html",
344             content.length(), baos);
345         writeRecord(writer, SOME_URL, "text/html",
346             content.length(), getBaos(content));
347         writer.close();
348         
349         // Catch System.err into a byte stream.
350
ByteArrayOutputStream JavaDoc os = new ByteArrayOutputStream JavaDoc();
351         System.setErr(new PrintStream JavaDoc(os));
352         
353         ARCReader r = ARCReaderFactory.get(writer.getFile());
354         r.setStrict(strict);
355         int count = iterateRecords(r);
356         assertTrue("Count wrong " + count, count == 4);
357
358         // Make sure we get the warning string which complains about the
359
// trailing bytes.
360
String JavaDoc err = os.toString();
361         assertTrue("No message " + err, err.startsWith("WARNING") &&
362             (err.indexOf("Record ENDING at") > 0));
363     }
364     
//  If uncompressed, length has to be right or parse will fail.
//
//    public void testLengthTooLong()
//    throws IOException {
//        lengthTooLong("testLengthTooLongCompressed-" + PREFIX,
//            false, false);
//    }

373     public void testLengthTooLongCompressed()
374     throws IOException JavaDoc {
375         lengthTooLong("testLengthTooLongCompressed-" + PREFIX,
376             true, false);
377     }
378     
379     public void testLengthTooLongCompressedStrict() {
380         String JavaDoc eMessage = null;
381         try {
382             lengthTooLong("testLengthTooLongCompressed-" + PREFIX,
383                 true, true);
384         } catch (IOException JavaDoc e) {
385             eMessage = e.getMessage();
386         }
387         assertTrue("Didn't get expected exception: " + eMessage,
388             eMessage.startsWith("Premature EOF before end-of-record"));
389     }
390     
391     protected void lengthTooLong(String JavaDoc name, boolean compress,
392             boolean strict)
393     throws IOException JavaDoc {
394         ARCWriter writer = createArcWithOneRecord(name, compress);
395         // Add a record with a length that is too long.
396
String JavaDoc content = getContent();
397         writeRecord(writer, SOME_URL, "text/html",
398             content.length() + 10, getBaos(content));
399         writeRecord(writer, SOME_URL, "text/html",
400             content.length(), getBaos(content));
401         writer.close();
402         
403         // Catch System.err.
404
ByteArrayOutputStream JavaDoc os = new ByteArrayOutputStream JavaDoc();
405         System.setErr(new PrintStream JavaDoc(os));
406         
407         ARCReader r = ARCReaderFactory.get(writer.getFile());
408         r.setStrict(strict);
409         int count = iterateRecords(r);
410         assertTrue("Count wrong " + count, count == 4);
411         
412         // Make sure we get the warning string which complains about the
413
// trailing bytes.
414
String JavaDoc err = os.toString();
415         assertTrue("No message " + err,
416             err.startsWith("WARNING Premature EOF before end-of-record"));
417     }
418     
419     public void testGapError() throws IOException JavaDoc {
420         ARCWriter writer = createArcWithOneRecord("testGapError", true);
421         String JavaDoc content = getContent();
422         // Make a 'weird' RIS that returns bad 'remaining' length
423
// after the call to readFullyTo.
424
ReplayInputStream ris = new ReplayInputStream(content.getBytes(),
425                 content.length(), null) {
426             private boolean readFullyToCalled = false;
427             public void readFullyTo(OutputStream JavaDoc os)
428             throws IOException JavaDoc {
429                 super.readFullyTo(os);
430                 this.readFullyToCalled = true;
431             }
432             
433             public long remaining() {
434                 return (this.readFullyToCalled)? -1: super.remaining();
435             }
436         };
437         String JavaDoc message = null;
438         try {
439         writer.write(SOME_URL, "text/html", "192.168.1.1",
440             (new Date JavaDoc()).getTime(), content.length(), ris);
441         } catch (IOException JavaDoc e) {
442             message = e.getMessage();
443         }
444         writer.close();
445         assertTrue("No gap when should be",
446             message != null &&
447             message.indexOf("Gap between expected and actual") >= 0);
448     }
449     
450     /**
451      * Write an arc file for other tests to use.
452      * @param arcdir Directory to write to.
453      * @param compress True if file should be compressed.
454      * @return ARC written.
455      * @throws IOException
456      */

457     public static File JavaDoc createARCFile(File JavaDoc arcdir, boolean compress)
458     throws IOException JavaDoc {
459         File JavaDoc [] files = {arcdir};
460         ARCWriter writer = new ARCWriter(SERIAL_NO, Arrays.asList(files),
461             "test", compress, DEFAULT_MAX_ARC_FILE_SIZE);
462         String JavaDoc content = getContent();
463         writeRecord(writer, SOME_URL, "text/html", content.length(),
464             getBaos(content));
465         writer.close();
466         return writer.getFile();
467     }
468     
//    public void testSpeed() throws IOException {
//        ARCWriter writer = createArcWithOneRecord("speed", true);
//        // Add a record with a length that is too long.
//        String content = getContent();
//        final int count = 100000;
//        logger.info("Starting speed write of " + count + " records.");
//        for (int i = 0; i < count; i++) {
//            writeRecord(writer, SOME_URL, "text/html", content.length(),
//                getBaos(content));
//        }
//        writer.close();
//        logger.info("Finished speed write test.");
//    }

483     
484     public void testValidateMetaLine() throws Exception JavaDoc {
485         final String JavaDoc line = "http://www.aandw.net/images/walden2.png " +
486             "128.197.34.86 20060111174224 image/png 2160";
487         ARCWriter w = createARCWriter("testValidateMetaLine", true);
488         try {
489             w.validateMetaLine(line);
490             w.validateMetaLine(line + LINE_SEPARATOR);
491             w.validateMetaLine(line + "\\r\\n");
492         } finally {
493             w.close();
494         }
495     }
496     
497     public void testArcRecordOffsetReads() throws Exception JavaDoc {
498         // Get an ARC with one record.
499
WriterPoolMember w =
500             createArcWithOneRecord("testArcRecordInBufferStream", true);
501         w.close();
502         // Get reader on said ARC.
503
ARCReader r = ARCReaderFactory.get(w.getFile());
504         final Iterator JavaDoc i = r.iterator();
505         // Skip first ARC meta record.
506
ARCRecord ar = (ARCRecord) i.next();
507         i.hasNext();
508         // Now we're at first and only record in ARC.
509
ar = (ARCRecord) i.next();
510         // Now try getting some random set of bytes out of it
511
// at an odd offset (used to fail because we were
512
// doing bad math to find where in buffer to read).
513
final byte[] buffer = new byte[17];
514         final int maxRead = 4;
515         int totalRead = 0;
516         while (totalRead < maxRead) {
517             totalRead = totalRead
518                 + ar.read(buffer, 13 + totalRead, maxRead - totalRead);
519             assertTrue(totalRead > 0);
520         }
521     }
522 }
523
Popular Tags