1 26 package org.archive.io.arc; 27 28 import java.io.BufferedInputStream ; 29 import java.io.ByteArrayOutputStream ; 30 import java.io.File ; 31 import java.io.FileInputStream ; 32 import java.io.IOException ; 33 import java.io.InputStream ; 34 import java.io.PrintStream ; 35 import java.io.UnsupportedEncodingException ; 36 import java.util.Iterator ; 37 import java.util.List ; 38 import java.util.concurrent.atomic.AtomicInteger ; 39 import java.util.logging.Logger ; 40 import java.util.regex.Matcher ; 41 import java.util.regex.Pattern ; 42 43 import org.archive.io.GzippedInputStream; 44 import org.archive.io.ReplayInputStream; 45 import org.archive.io.WriterPoolMember; 46 import org.archive.util.ArchiveUtils; 47 import org.archive.util.DevUtils; 48 import org.archive.util.MimetypeUtils; 49 50 51 118 public class ARCWriter extends WriterPoolMember implements ARCConstants { 119 private static final Logger logger = 120 Logger.getLogger(ARCWriter.class.getName()); 121 122 125 private static final Pattern METADATA_LINE_PATTERN = 126 Pattern.compile("^\\S+ \\S+ \\S+ \\S+ \\S+(" + LINE_SEPARATOR + "?)$"); 127 128 131 private final byte [] readbuffer = new byte[4 * 1024]; 132 133 private List metadata = null; 134 135 136 150 public ARCWriter(final AtomicInteger serialNo, final PrintStream out, 151 final File arc, final boolean cmprs, String a14DigitDate, 152 final List metadata) 153 throws IOException { 154 super(serialNo, out, arc, cmprs, a14DigitDate); 155 this.metadata = metadata; 156 writeFirstRecord(a14DigitDate); 157 } 158 159 171 public ARCWriter(final AtomicInteger serialNo, final List <File > dirs, 172 final String prefix, final boolean cmprs, final int maxSize) { 173 this(serialNo, dirs, prefix, "", cmprs, maxSize, null); 174 } 175 176 188 public ARCWriter(final AtomicInteger serialNo, final List <File > dirs, 189 final String prefix, final String suffix, final boolean cmprs, 190 final int maxSize, final List meta) { 191 super(serialNo, dirs, prefix, suffix, cmprs, maxSize, 192 ARC_FILE_EXTENSION); 193 this.metadata = meta; 194 } 195 196 protected String createFile() 197 throws IOException { 198 String name = super.createFile(); 199 writeFirstRecord(getCreateTimestamp()); 200 return name; 201 } 202 203 private void writeFirstRecord(final String ts) 204 throws IOException { 205 write(generateARCFileMetaData(ts)); 206 } 207 208 241 private byte [] generateARCFileMetaData(String date) 242 throws IOException { 243 int metadataBodyLength = getMetadataLength(); 244 String metadataHeaderLinesTwoAndThree = 247 getMetadataHeaderLinesTwoAndThree("1 " + 248 ((metadataBodyLength > 0)? "1": "0")); 249 int recordLength = metadataBodyLength + 250 metadataHeaderLinesTwoAndThree.getBytes(DEFAULT_ENCODING).length; 251 String metadataHeaderStr = ARC_MAGIC_NUMBER + getBaseFilename() + 252 " 0.0.0.0 " + date + " text/plain " + recordLength + 253 metadataHeaderLinesTwoAndThree; 254 ByteArrayOutputStream metabaos = 255 new ByteArrayOutputStream (recordLength); 256 metabaos.write(metadataHeaderStr.getBytes(DEFAULT_ENCODING)); 258 if (metadataBodyLength > 0) { 260 writeMetaData(metabaos); 261 } 262 263 metabaos.write(LINE_SEPARATOR); 265 266 byte [] bytes = metabaos.toByteArray(); 268 269 if(isCompressed()) { 270 byte [] gzippedMetaData = GzippedInputStream.gzip(bytes); 278 if (gzippedMetaData[3] != 0) { 279 throw new IOException ("The GZIP FLG header is unexpectedly " + 280 " non-zero. Need to add smarter code that can deal " + 281 " when already extant extra GZIP header fields."); 282 } 283 gzippedMetaData[3] = 4; 288 gzippedMetaData[9] = 3; 289 byte [] assemblyBuffer = new byte[gzippedMetaData.length + 290 ARC_GZIP_EXTRA_FIELD.length]; 291 System.arraycopy(gzippedMetaData, 0, assemblyBuffer, 0, 10); 295 System.arraycopy(ARC_GZIP_EXTRA_FIELD, 0, assemblyBuffer, 10, 296 ARC_GZIP_EXTRA_FIELD.length); 297 System.arraycopy(gzippedMetaData, 10, assemblyBuffer, 298 10 + ARC_GZIP_EXTRA_FIELD.length, gzippedMetaData.length - 10); 299 bytes = assemblyBuffer; 300 } 301 return bytes; 302 } 303 304 public String getMetadataHeaderLinesTwoAndThree(String version) { 305 StringBuffer buffer = new StringBuffer (); 306 buffer.append(LINE_SEPARATOR); 307 buffer.append(version); 308 buffer.append(" InternetArchive"); 309 buffer.append(LINE_SEPARATOR); 310 buffer.append("URL IP-address Archive-date Content-type Archive-length"); 311 buffer.append(LINE_SEPARATOR); 312 return buffer.toString(); 313 } 314 315 322 private void writeMetaData(ByteArrayOutputStream baos) 323 throws UnsupportedEncodingException , IOException { 324 if (this.metadata == null) { 325 return; 326 } 327 328 for (Iterator i = this.metadata.iterator(); 329 i.hasNext();) { 330 Object obj = i.next(); 331 if (obj instanceof String ) { 332 baos.write(((String )obj).getBytes(DEFAULT_ENCODING)); 333 } else if (obj instanceof File ) { 334 InputStream is = null; 335 try { 336 is = new BufferedInputStream ( 337 new FileInputStream ((File )obj)); 338 byte [] buffer = new byte[4096]; 339 for (int read = -1; (read = is.read(buffer)) != -1;) { 340 baos.write(buffer, 0, read); 341 } 342 } finally { 343 if (is != null) { 344 is.close(); 345 } 346 } 347 } else if (obj != null) { 348 logger.severe("Unsupported metadata type: " + obj); 349 } 350 } 351 return; 352 } 353 354 358 private int getMetadataLength() 359 throws UnsupportedEncodingException { 360 int result = -1; 361 if (this.metadata == null) { 362 result = 0; 363 } else { 364 for (Iterator i = this.metadata.iterator(); 365 i.hasNext();) { 366 Object obj = i.next(); 367 if (obj instanceof String ) { 368 result += ((String )obj).getBytes(DEFAULT_ENCODING).length; 369 } else if (obj instanceof File ) { 370 result += ((File )obj).length(); 371 } else { 372 logger.severe("Unsupported metadata type: " + obj); 373 } 374 } 375 } 376 return result; 377 } 378 379 public void write(String uri, String contentType, String hostIP, 380 long fetchBeginTimeStamp, int recordLength, 381 ByteArrayOutputStream baos) 382 throws IOException { 383 preWriteRecordTasks(); 384 try { 385 write(getMetaLine(uri, contentType, hostIP, 386 fetchBeginTimeStamp, recordLength).getBytes(UTF8)); 387 baos.writeTo(getOutputStream()); 388 write(LINE_SEPARATOR); 389 } finally { 390 postWriteRecordTasks(); 391 } 392 } 393 394 public void write(String uri, String contentType, String hostIP, 395 long fetchBeginTimeStamp, int recordLength, InputStream in) 396 throws IOException { 397 preWriteRecordTasks(); 398 try { 399 write(getMetaLine(uri, contentType, hostIP, 400 fetchBeginTimeStamp, recordLength).getBytes(UTF8)); 401 readFullyFrom(in, recordLength, this.readbuffer); 402 write(LINE_SEPARATOR); 403 } finally { 404 postWriteRecordTasks(); 405 } 406 } 407 408 public void write(String uri, String contentType, String hostIP, 409 long fetchBeginTimeStamp, int recordLength, 410 ReplayInputStream ris) 411 throws IOException { 412 preWriteRecordTasks(); 413 try { 414 write(getMetaLine(uri, contentType, hostIP, 415 fetchBeginTimeStamp, recordLength).getBytes(UTF8)); 416 try { 417 ris.readFullyTo(getOutputStream()); 418 long remaining = ris.remaining(); 419 if (remaining != 0) { 422 String message = "Gap between expected and actual: " + 423 remaining + LINE_SEPARATOR + DevUtils.extraInfo() + 424 " writing arc " + this.getFile().getAbsolutePath(); 425 DevUtils.warnHandle(new Throwable (message), message); 426 throw new IOException (message); 427 } 428 } finally { 429 ris.close(); 430 } 431 432 write(LINE_SEPARATOR); 434 } finally { 435 postWriteRecordTasks(); 436 } 437 } 438 439 448 protected String getMetaLine(String uri, String contentType, String hostIP, 449 long fetchBeginTimeStamp, int recordLength) 450 throws IOException { 451 if (fetchBeginTimeStamp <= 0) { 452 throw new IOException ("Bogus fetchBeginTimestamp: " + 453 Long.toString(fetchBeginTimeStamp)); 454 } 455 456 return validateMetaLine(createMetaline(uri, hostIP, 457 ArchiveUtils.get14DigitDate(fetchBeginTimeStamp), 458 MimetypeUtils.truncate(contentType), 459 Integer.toString(recordLength))); 460 } 461 462 public String createMetaline(String uri, String hostIP, 463 String timeStamp, String mimetype, String recordLength) { 464 return uri + HEADER_FIELD_SEPARATOR + hostIP + 465 HEADER_FIELD_SEPARATOR + timeStamp + 466 HEADER_FIELD_SEPARATOR + mimetype + 467 HEADER_FIELD_SEPARATOR + recordLength + LINE_SEPARATOR; 468 } 469 470 476 protected String validateMetaLine(String metaLineStr) 477 throws IOException { 478 if (metaLineStr.length() > MAX_METADATA_LINE_LENGTH) { 479 throw new IOException ("Metadata line length is " + 480 metaLineStr.length() + " which is > than maximum " + 481 MAX_METADATA_LINE_LENGTH); 482 } 483 Matcher m = METADATA_LINE_PATTERN.matcher(metaLineStr); 484 if (!m.matches()) { 485 throw new IOException ("Metadata line doesn't match expected" + 486 " pattern: " + metaLineStr); 487 } 488 return metaLineStr; 489 } 490 } 491 | Popular Tags |