1 26 package org.archive.io.warc; 27 28 import java.io.ByteArrayInputStream ; 29 import java.io.ByteArrayOutputStream ; 30 import java.io.File ; 31 import java.io.FileNotFoundException ; 32 import java.io.IOException ; 33 import java.net.URI ; 34 import java.net.URISyntaxException ; 35 import java.util.Arrays ; 36 import java.util.Iterator ; 37 import java.util.List ; 38 import java.util.concurrent.atomic.AtomicInteger ; 39 40 import org.archive.io.ArchiveRecord; 41 import org.archive.io.ArchiveRecordHeader; 42 import org.archive.io.UTF8Bytes; 43 import org.archive.io.WriterPoolMember; 44 import org.archive.uid.GeneratorFactory; 45 import org.archive.util.ArchiveUtils; 46 import org.archive.util.TmpDirTestCase; 47 import org.archive.util.anvl.ANVLRecord; 48 49 54 public class ExperimentalWARCWriterTest 55 extends TmpDirTestCase implements WARCConstants { 56 private static final AtomicInteger SERIAL_NO = new AtomicInteger (); 57 58 61 private static final String PREFIX = "IAH"; 62 63 private static final String SOME_URL = "http://www.archive.org/test/"; 64 65 public void testCheckHeaderLineValue() throws Exception { 66 ExperimentalWARCWriter writer = new ExperimentalWARCWriter(); 67 writer.checkHeaderLineParameters("one"); 68 IOException exception = null; 69 try { 70 writer.checkHeaderLineParameters("with space"); 71 } catch(IOException e) { 72 exception = e; 73 } 74 assertNotNull(exception); 75 exception = null; 76 try { 77 writer.checkHeaderLineParameters("with\0x0000controlcharacter"); 78 } catch(IOException e) { 79 exception = e; 80 } 81 assertNotNull(exception); 82 } 83 84 public void testMimetypes() throws IOException { 85 ExperimentalWARCWriter writer = new ExperimentalWARCWriter(); 86 writer.checkHeaderLineMimetypeParameter("text/xml"); 87 writer.checkHeaderLineMimetypeParameter("text/xml+rdf"); 88 writer.checkHeaderLineMimetypeParameter( 89 "text/plain; charset=SHIFT-JIS"); 90 System.out.println(writer.checkHeaderLineMimetypeParameter( 91 "multipart/mixed; \r\n boundary=\"simple boundary\"")); 92 } 93 94 public void testWriteRecord() throws IOException { 95 File [] files = {getTmpDir()}; 96 97 ExperimentalWARCWriter writer = 99 new ExperimentalWARCWriter(SERIAL_NO, Arrays.asList(files), 100 this.getClass().getName(), "suffix", false, -1, null); 101 writeFile(writer); 102 103 writer = new ExperimentalWARCWriter(SERIAL_NO, Arrays.asList(files), 105 this.getClass().getName(), "suffix", true, -1, null); 106 writeFile(writer); 107 } 108 109 private void writeFile(final ExperimentalWARCWriter writer) 110 throws IOException { 111 try { 112 writeWarcinfoRecord(writer); 113 writeBasicRecords(writer); 114 } finally { 115 writer.close(); 116 writer.getFile().delete(); 117 } 118 } 119 120 private void writeWarcinfoRecord(ExperimentalWARCWriter writer) 121 throws IOException { 122 ANVLRecord meta = new ANVLRecord(); 123 meta.addLabelValue("size", "1G"); 124 meta.addLabelValue("operator", "igor"); 125 byte [] bytes = meta.getUTF8Bytes(); 126 writer.writeWarcinfoRecord(ANVLRecord.MIMETYPE, null, 127 new ByteArrayInputStream (bytes), bytes.length); 128 } 129 130 protected void writeBasicRecords(final ExperimentalWARCWriter writer) 131 throws IOException { 132 ANVLRecord headerFields = new ANVLRecord(); 133 headerFields.addLabelValue("x", "y"); 134 headerFields.addLabelValue("a", "b"); 135 136 URI rid = null; 137 try { 138 rid = GeneratorFactory.getFactory(). 139 getQualifiedRecordID(TYPE, METADATA); 140 } catch (URISyntaxException e) { 141 throw new IOException (e.getMessage()); 143 } 144 final String content = "Any old content."; 145 for (int i = 0; i < 10; i++) { 146 String body = i + ". " + content; 147 byte [] bodyBytes = body.getBytes(UTF8Bytes.UTF8); 148 writer.writeRecord(METADATA, "http://www.archive.org/", 149 ArchiveUtils.get14DigitDate(), "no/type", 150 rid, headerFields, new ByteArrayInputStream (bodyBytes), 151 (long)bodyBytes.length); 152 } 153 } 154 155 158 protected static String getContent() { 159 return getContent(null); 160 } 161 162 166 protected static String getContent(String indexStr) { 167 String page = (indexStr != null)? "Page #" + indexStr: "Some Page"; 168 return "HTTP/1.1 200 OK\r\n" + 169 "Content-Type: text/html\r\n\r\n" + 170 "<html><head><title>" + page + 171 "</title></head>" + 172 "<body>" + page + 173 "</body></html>"; 174 } 175 176 183 protected int writeRandomHTTPRecord(ExperimentalWARCWriter w, int index) 184 throws IOException { 185 ByteArrayOutputStream baos = new ByteArrayOutputStream (); 186 String indexStr = Integer.toString(index); 187 byte[] record = (getContent(indexStr)).getBytes(); 188 int recordLength = record.length; 189 baos.write(record); 190 ANVLRecord r = new ANVLRecord(1); 193 r.addLabelValue(NAMED_FIELD_IP_LABEL, "127.0.0.1"); 194 w.writeResourceRecord( 195 "http://www.one.net/id=" + indexStr, 196 ArchiveUtils.get14DigitDate(), 197 "text/html; charset=UTF-8", 198 r, 199 new ByteArrayInputStream (baos.toByteArray()), 200 recordLength); 201 return recordLength; 202 } 203 204 213 private File writeRecords(String baseName, boolean compress, 214 int maxSize, int recordCount) 215 throws IOException { 216 cleanUpOldFiles(baseName); 217 File [] files = {getTmpDir()}; 218 ExperimentalWARCWriter w = new ExperimentalWARCWriter(SERIAL_NO, 219 Arrays.asList(files), baseName + '-' + PREFIX, "", compress, 220 maxSize, null); 221 assertNotNull(w); 222 for (int i = 0; i < recordCount; i++) { 223 writeRandomHTTPRecord(w, i); 224 } 225 w.close(); 226 assertTrue("Doesn't exist: " + w.getFile().getAbsolutePath(), 227 w.getFile().exists()); 228 return w.getFile(); 229 } 230 231 238 private void validate(File f, int recordCount) 239 throws FileNotFoundException , IOException { 240 WARCReader reader = WARCReaderFactory.get(f); 241 assertNotNull(reader); 242 List headers = null; 243 if (recordCount == -1) { 244 headers = reader.validate(); 245 } else { 246 headers = reader.validate(recordCount); 247 } 248 reader.close(); 249 250 reader = WARCReaderFactory.get(f); 254 for (int i = headers.size() - 1; i >= 0; i--) { 255 ArchiveRecordHeader h = (ArchiveRecordHeader)headers.get(i); 256 ArchiveRecord r = reader.get(h.getOffset()); 257 String mimeType = r.getHeader().getMimetype(); 258 assertTrue("Record is bogus", 259 mimeType != null && mimeType.length() > 0); 260 } 261 reader.close(); 262 263 assertTrue("Metadatas not equal", headers.size() == recordCount); 264 for (Iterator i = headers.iterator(); i.hasNext();) { 265 ArchiveRecordHeader r = (ArchiveRecordHeader)i.next(); 266 assertTrue("Record is empty", r.getLength() > 0); 267 } 268 } 269 270 public void testWriteRecords() throws IOException { 271 final int recordCount = 2; 272 File f = writeRecords("writeRecord", false, DEFAULT_MAX_WARC_FILE_SIZE, 273 recordCount); 274 validate(f, recordCount + 1); } 276 277 public void testRandomAccess() throws IOException { 278 final int recordCount = 3; 279 File f = writeRecords("writeRecord", true, DEFAULT_MAX_WARC_FILE_SIZE, 280 recordCount); 281 WARCReader reader = WARCReaderFactory.get(f); 282 boolean readFirst = false; 284 String url = null; 285 long offset = -1; 286 long totalRecords = 0; 287 boolean readSecond = false; 288 for (final Iterator i = reader.iterator(); i.hasNext(); 289 totalRecords++) { 290 WARCRecord ar = (WARCRecord)i.next(); 291 if (!readFirst) { 292 readFirst = true; 293 continue; 294 } 295 if (!readSecond) { 296 url = ar.getHeader().getUrl(); 297 offset = ar.getHeader().getOffset(); 298 readSecond = true; 299 } 300 } 301 302 reader = WARCReaderFactory.get(f, offset); 303 ArchiveRecord ar = reader.get(); 304 assertEquals(ar.getHeader().getUrl(), url); 305 ar.close(); 306 307 reader = WARCReaderFactory.get(f, offset); 309 int count = 0; 310 for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) { 311 count++; 312 } 313 reader.close(); 314 assertEquals(totalRecords - 1, count); 315 } 316 317 public void testWriteRecordCompressed() throws IOException { 318 final int recordCount = 2; 319 File arcFile = writeRecords("writeRecordCompressed", true, 320 DEFAULT_MAX_WARC_FILE_SIZE, recordCount); 321 validate(arcFile, recordCount + 1 ); 322 } 323 324 protected ExperimentalWARCWriter createWARCWriter(String NAME, 325 boolean compress) { 326 File [] files = {getTmpDir()}; 327 return new ExperimentalWARCWriter(SERIAL_NO, 328 Arrays.asList(files), NAME, "", 329 compress, DEFAULT_MAX_WARC_FILE_SIZE, null); 330 } 331 332 protected static ByteArrayOutputStream getBaos(String str) 333 throws IOException { 334 ByteArrayOutputStream baos = new ByteArrayOutputStream (); 335 baos.write(str.getBytes()); 336 return baos; 337 } 338 339 protected static void writeRecord(ExperimentalWARCWriter w, String url, 340 String mimetype, int len, ByteArrayOutputStream baos) 341 throws IOException { 342 w.writeResourceRecord(url, 343 ArchiveUtils.get14DigitDate(), 344 mimetype, 345 null, 346 new ByteArrayInputStream (baos.toByteArray()), 347 len); 348 } 349 350 protected int iterateRecords(WARCReader r) 351 throws IOException { 352 int count = 0; 353 for (Iterator <ArchiveRecord> i = r.iterator(); i.hasNext();) { 354 ArchiveRecord ar = i.next(); 355 ar.close(); 356 if (count != 0) { 357 assertTrue("Unexpected URL " + ar.getHeader().getUrl(), 358 ar.getHeader().getUrl().equals(SOME_URL)); 359 } 360 count++; 361 } 362 return count; 363 } 364 365 protected ExperimentalWARCWriter createWithOneRecord(String name, 366 boolean compressed) 367 throws IOException { 368 ExperimentalWARCWriter writer = createWARCWriter(name, compressed); 369 String content = getContent(); 370 writeRecord(writer, SOME_URL, "text/html", 371 content.length(), getBaos(content)); 372 return writer; 373 } 374 375 public void testSpaceInURL() { 376 String eMessage = null; 377 try { 378 holeyUrl("testSpaceInURL-" + PREFIX, false, " "); 379 } catch (IOException e) { 380 eMessage = e.getMessage(); 381 } 382 assertTrue("Didn't get expected exception: " + eMessage, 383 eMessage.startsWith("Contains disallowed")); 384 } 385 386 public void testTabInURL() { 387 String eMessage = null; 388 try { 389 holeyUrl("testTabInURL-" + PREFIX, false, "\t"); 390 } catch (IOException e) { 391 eMessage = e.getMessage(); 392 } 393 assertTrue("Didn't get expected exception: " + eMessage, 394 eMessage.startsWith("Contains illegal")); 395 } 396 397 protected void holeyUrl(String name, boolean compress, String urlInsert) 398 throws IOException { 399 ExperimentalWARCWriter writer = createWithOneRecord(name, compress); 400 String content = getContent(); 402 ByteArrayOutputStream baos = getBaos(content); 403 writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html", 404 content.length(), baos); 405 writer.close(); 406 } 407 408 415 public static File createWARCFile(File arcdir, boolean compress) 416 throws IOException { 417 File [] files = {arcdir}; 418 ExperimentalWARCWriter writer = 419 new ExperimentalWARCWriter(SERIAL_NO, Arrays.asList(files), 420 "test", "", compress, DEFAULT_MAX_WARC_FILE_SIZE, null); 421 String content = getContent(); 422 writeRecord(writer, SOME_URL, "text/html", content.length(), 423 getBaos(content)); 424 writer.close(); 425 return writer.getFile(); 426 } 427 428 442 public void testArcRecordOffsetReads() throws Exception { 443 WriterPoolMember w = 445 createWithOneRecord("testArcRecordInBufferStream", true); 446 w.close(); 447 WARCReader r = WARCReaderFactory.get(w.getFile()); 449 final Iterator <ArchiveRecord> i = r.iterator(); 450 ArchiveRecord ar = i.next(); 452 i.hasNext(); 453 ar = (WARCRecord) i.next(); 455 final byte[] buffer = new byte[17]; 459 final int maxRead = 4; 460 int totalRead = 0; 461 while (totalRead < maxRead) { 462 totalRead = totalRead 463 + ar.read(buffer, 13 + totalRead, maxRead - totalRead); 464 assertTrue(totalRead > 0); 465 } 466 } 467 } | Popular Tags |