package org.archive.io.warc;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

import org.archive.io.UTF8Bytes;
import org.archive.io.WriterPoolMember;
import org.archive.uid.GeneratorFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.anvl.ANVLRecord;


/**
 * Writer of experimental WARC records.
 *
 * <p>Every record written by {@link #writeRecord} consists of a single
 * header line, a block of named fields, the optional content bytes and two
 * trailing CRLFs. A warcinfo record is written at the head of every file
 * created through {@link #createFile(File)}.
 */
public class ExperimentalWARCWriter extends WriterPoolMember
implements WARCConstants {
    /**
     * Scratch buffer handed to {@code readFullyFrom} when copying record
     * content.
     */
    private final byte[] readbuffer = new byte[16 * 1024];

    /**
     * {@code CRLF} encoded with {@code DEFAULT_ENCODING}.
     *
     * <p>NOTE: if the encoding step fails the stack trace is printed and this
     * field is left {@code null}; it is intentionally kept non-final and
     * public so existing users of the field are unaffected.
     */
    public static byte[] CRLF_BYTES;
    static {
        try {
            CRLF_BYTES = CRLF.getBytes(DEFAULT_ENCODING);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Formats the record length to the width of
     * {@code PLACEHOLDER_RECORD_LENGTH_STRING}.
     *
     * <p>{@link DecimalFormat} is not thread-safe and this instance is shared
     * by every writer, so all uses must synchronize on it.
     */
    private static final NumberFormat RECORD_LENGTH_FORMATTER =
        new DecimalFormat(PLACEHOLDER_RECORD_LENGTH_STRING);

    /**
     * Objects whose {@code toString()} output forms the body of the warcinfo
     * record written at the top of each file. May be {@code null}.
     */
    private final List<?> fileMetadata;


    /**
     * Package-private constructor. Delegates to the directory-based
     * constructor with no serial number, no directories, empty prefix and
     * suffix, compression on, a max size of -1 and no warcinfo metadata.
     */
    ExperimentalWARCWriter() {
        this(null, null, "", "", true, -1, null);
    }

    /**
     * Creates a writer on top of an already opened stream.
     *
     * @param serialNo passed through to the superclass.
     * @param out stream to write to; passed through to the superclass.
     * @param f file behind {@code out}; passed through to the superclass.
     * @param cmprs compression flag; passed through to the superclass.
     * @param a14DigitDate date string; passed through to the superclass.
     * @param warcinfoData items used as the body of the warcinfo record;
     * may be {@code null}.
     * @throws IOException declared by the superclass constructor.
     */
    public ExperimentalWARCWriter(final AtomicInteger serialNo,
            final OutputStream out, final File f,
            final boolean cmprs, final String a14DigitDate,
            final List<?> warcinfoData)
    throws IOException {
        super(serialNo, out, f, cmprs, a14DigitDate);
        this.fileMetadata = warcinfoData;
    }

    /**
     * Creates a writer that makes its own files; they are given the
     * {@code WARC_FILE_EXTENSION} extension.
     *
     * @param serialNo passed through to the superclass.
     * @param dirs candidate output directories; passed through to the
     * superclass.
     * @param prefix file name prefix; passed through to the superclass.
     * @param suffix file name suffix; passed through to the superclass.
     * @param cmprs compression flag; passed through to the superclass.
     * @param maxSize maximum file size; passed through to the superclass.
     * @param warcinfoData items used as the body of the warcinfo record;
     * may be {@code null}.
     */
    public ExperimentalWARCWriter(final AtomicInteger serialNo,
            final List<File> dirs, final String prefix,
            final String suffix, final boolean cmprs,
            final int maxSize, final List<?> warcinfoData) {
        super(serialNo, dirs, prefix, suffix, cmprs, maxSize,
            WARC_FILE_EXTENSION);
        this.fileMetadata = warcinfoData;
    }

    /**
     * Creates the file via the superclass and immediately writes a warcinfo
     * record naming it.
     *
     * @param file file to create.
     * @return the file name returned by the superclass.
     * @throws IOException if creating the file or writing the record fails.
     */
    @Override
    protected String createFile(File file) throws IOException {
        String filename = super.createFile(file);
        writeWarcinfoRecord(filename);
        return filename;
    }

    /**
     * Rejects characters that may never appear in a header line parameter.
     *
     * @param c character to test.
     * @param parameter the complete parameter, used in the error message.
     * @throws IOException if {@code c} is an ISO control character.
     */
    protected void baseCharacterCheck(final char c, final String parameter)
    throws IOException {
        // The code point test is kept from the original; every char value
        // widens to a valid code point, so only the control-character test
        // can actually fire.
        if (Character.isISOControl(c) || !Character.isValidCodePoint(c)) {
            throw new IOException("Contains illegal character 0x" +
                Integer.toHexString(c) + ": " + parameter);
        }
    }

    /**
     * Validates a header line parameter that must contain neither control
     * characters nor white space.
     *
     * @param parameter value to check.
     * @return {@code parameter}, unchanged.
     * @throws IOException if a disallowed character is found.
     */
    protected String checkHeaderLineParameters(final String parameter)
    throws IOException {
        for (int i = 0; i < parameter.length(); i++) {
            final char c = parameter.charAt(i);
            baseCharacterCheck(c, parameter);
            if (Character.isWhitespace(c)) {
                throw new IOException("Contains disallowed white space 0x" +
                    Integer.toHexString(c) + ": " + parameter);
            }
        }
        return parameter;
    }

    /**
     * Validates and normalizes a mimetype for the header line. Each run of
     * white space is collapsed to a single space character; all other
     * characters go through {@link #baseCharacterCheck(char, String)}.
     *
     * @param parameter mimetype to check.
     * @return the normalized mimetype.
     * @throws IOException if an illegal character is found.
     */
    protected String checkHeaderLineMimetypeParameter(final String parameter)
    throws IOException {
        StringBuilder sb = new StringBuilder(parameter.length());
        boolean wasWhitespace = false;
        for (int i = 0; i < parameter.length(); i++) {
            char c = parameter.charAt(i);
            if (Character.isWhitespace(c)) {
                if (wasWhitespace) {
                    // Already emitted a space for this run of white space.
                    continue;
                }
                wasWhitespace = true;
                c = ' ';
            } else {
                wasWhitespace = false;
                baseCharacterCheck(c, parameter);
            }
            sb.append(c);
        }

        return sb.toString();
    }

    /**
     * Builds the encoded header line of a record.
     *
     * <p>The line is first assembled with a fixed-width placeholder in the
     * length field; the placeholder is then overwritten with the real record
     * length, which is the byte length of the header line plus
     * {@code namedFieldsLength} plus {@code contentLength}.
     *
     * @param type record type.
     * @param url record subject URL; must pass
     * {@link #checkHeaderLineParameters(String)}.
     * @param create14DigitDate record date; must pass
     * {@link #checkHeaderLineParameters(String)}.
     * @param mimetype content mimetype; normalized with
     * {@link #checkHeaderLineMimetypeParameter(String)}.
     * @param recordId record identifier.
     * @param namedFieldsLength byte length of the named fields block.
     * @param contentLength byte length of the content.
     * @return the header line encoded with {@code HEADER_LINE_ENCODING}.
     * @throws IOException if a parameter contains an illegal character or the
     * record length does not fit the fixed-width length field.
     */
    protected byte[] createRecordHeaderline(final String type,
            final String url, final String create14DigitDate,
            final String mimetype, final URI recordId,
            final int namedFieldsLength, final long contentLength)
    throws IOException {
        final StringBuilder sb = new StringBuilder(2048);
        sb.append(WARC_ID);
        sb.append(HEADER_FIELD_SEPARATOR);
        sb.append(PLACEHOLDER_RECORD_LENGTH_STRING);
        sb.append(HEADER_FIELD_SEPARATOR);
        sb.append(type);
        sb.append(HEADER_FIELD_SEPARATOR);
        sb.append(checkHeaderLineParameters(url));
        sb.append(HEADER_FIELD_SEPARATOR);
        sb.append(checkHeaderLineParameters(create14DigitDate));
        sb.append(HEADER_FIELD_SEPARATOR);
        sb.append(checkHeaderLineParameters(recordId.toString()));
        sb.append(HEADER_FIELD_SEPARATOR);
        sb.append(checkHeaderLineMimetypeParameter(mimetype));
        sb.append(CRLF);

        // The other two terms are byte counts, so the header must be counted
        // in encoded bytes too; sb.length() is a char count and undercounts
        // whenever a character encodes to more than one byte.
        final int headerByteLength =
            sb.toString().getBytes(HEADER_LINE_ENCODING).length;
        final long length =
            (long) headerByteLength + namedFieldsLength + contentLength;

        final String lenStr;
        synchronized (RECORD_LENGTH_FORMATTER) {
            lenStr = RECORD_LENGTH_FORMATTER.format(length);
        }
        // The replacement below only keeps the computed length valid when the
        // formatted value is exactly as wide as the placeholder.
        if (lenStr.length() != PLACEHOLDER_RECORD_LENGTH_STRING.length()) {
            throw new IOException("Record length " + length +
                " does not fit the " +
                PLACEHOLDER_RECORD_LENGTH_STRING.length() +
                "-character length field");
        }

        // Length field starts right after WARC_ID and one separator
        // (assumes the separator is a single character -- TODO confirm).
        final int start = WARC_ID.length() + 1;
        final int end = start + PLACEHOLDER_RECORD_LENGTH_STRING.length();
        sb.replace(start, end, lenStr);

        return sb.toString().getBytes(HEADER_LINE_ENCODING);
    }

    /**
     * Writes one complete record: header line, named fields, content and two
     * trailing CRLFs, bracketed by {@code preWriteRecordTasks()} and
     * {@code postWriteRecordTasks()}.
     *
     * @param type record type; must be listed in {@code TYPES_LIST}.
     * @param url record subject URL.
     * @param create14DigitDate record date.
     * @param mimetype content mimetype.
     * @param recordId record identifier.
     * @param namedFields named fields; {@code null} is treated as empty.
     * @param contentStream content source; may be {@code null} only when
     * {@code contentLength} is not positive.
     * @param contentLength number of content bytes to copy.
     * @throws IOException if writing fails.
     * @throws IllegalArgumentException if the type is unknown, if the record
     * would have neither content nor named fields, or if content is declared
     * but no stream is supplied.
     */
    protected void writeRecord(final String type, final String url,
            final String create14DigitDate, final String mimetype,
            final URI recordId, ANVLRecord namedFields,
            final InputStream contentStream, final long contentLength)
    throws IOException {
        if (!TYPES_LIST.contains(type)) {
            throw new IllegalArgumentException("Unknown record type: " + type);
        }
        if (contentLength == 0 &&
                (namedFields == null || namedFields.size() <= 0)) {
            throw new IllegalArgumentException("Cannot have a record made " +
                "of a Header line only (Content and Named Fields are empty).");
        }
        if (contentLength > 0 && contentStream == null) {
            // Otherwise the header would announce bytes that never follow.
            throw new IllegalArgumentException("Content length is " +
                contentLength + " but no content stream was supplied.");
        }

        preWriteRecordTasks();
        try {
            if (namedFields == null) {
                namedFields = ANVLRecord.EMPTY_ANVL_RECORD;
            }

            // Serialize the named fields first: their byte length is needed
            // for the record length in the header line.
            final byte[] namedFieldsBlock = namedFields.getUTF8Bytes();
            final byte[] header = createRecordHeaderline(type, url,
                create14DigitDate, mimetype, recordId,
                namedFieldsBlock.length, contentLength);
            write(header);
            write(namedFieldsBlock);
            if (contentStream != null && contentLength > 0) {
                readFullyFrom(contentStream, contentLength, this.readbuffer);
            }

            // Two CRLFs terminate the record.
            write(CRLF_BYTES);
            write(CRLF_BYTES);
        } finally {
            postWriteRecordTasks();
        }
    }

    /**
     * Wraps a {@link URISyntaxException} in an {@link IOException} without
     * losing the original exception as the cause.
     */
    private static IOException asIOException(final String message,
            final URISyntaxException cause) {
        final IOException ioe = new IOException(message);
        ioe.initCause(cause);
        return ioe;
    }

    /**
     * Obtains a record id qualified by the given map from the
     * {@link GeneratorFactory}.
     *
     * @param qualifiers qualifiers handed to the generator.
     * @return the generated id.
     * @throws IOException if the generator reports a URI syntax problem.
     */
    protected URI generateRecordId(final Map<String, String> qualifiers)
    throws IOException {
        try {
            return GeneratorFactory.getFactory().
                getQualifiedRecordID(qualifiers);
        } catch (URISyntaxException e) {
            throw asIOException(e.getMessage(), e);
        }
    }

    /**
     * Obtains a record id qualified by a single key/value pair from the
     * {@link GeneratorFactory}.
     *
     * @param key qualifier name.
     * @param value qualifier value.
     * @return the generated id.
     * @throws IOException if the generator reports a URI syntax problem.
     */
    protected URI generateRecordId(final String key, final String value)
    throws IOException {
        try {
            return GeneratorFactory.getFactory().
                getQualifiedRecordID(key, value);
        } catch (URISyntaxException e) {
            throw asIOException(e.getMessage(), e);
        }
    }

    /**
     * Writes a warcinfo record for the named file, without a description.
     *
     * @param filename name of the file being written.
     * @return id of the written record.
     * @throws IOException if writing fails.
     */
    public URI writeWarcinfoRecord(String filename)
    throws IOException {
        return writeWarcinfoRecord(filename, null);
    }

    /**
     * Writes a warcinfo record for the named file. The record body is the
     * concatenated {@code toString()} output of the metadata items given at
     * construction, or a fixed placeholder text when none were given.
     *
     * @param filename name of the file being written; a trailing
     * {@code WriterPoolMember.OCCUPIED_SUFFIX} is stripped before use.
     * @param description optional description; skipped when {@code null} or
     * empty.
     * @return id of the written record.
     * @throws IOException if writing fails.
     */
    public URI writeWarcinfoRecord(String filename, final String description)
    throws IOException {
        if (filename.endsWith(WriterPoolMember.OCCUPIED_SUFFIX)) {
            filename = filename.substring(0,
                filename.length() - WriterPoolMember.OCCUPIED_SUFFIX.length());
        }
        ANVLRecord record = new ANVLRecord(2);
        record.addLabelValue(NAMED_FIELD_WARCFILENAME, filename);
        if (description != null && description.length() > 0) {
            record.addLabelValue(NAMED_FIELD_DESCRIPTION, description);
        }

        final byte[] warcinfoBody;
        if (this.fileMetadata == null) {
            // Same explicit charset as the branch below rather than the
            // platform default.
            warcinfoBody = "TODO: Unimplemented".getBytes(UTF8Bytes.UTF8);
        } else {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            for (final Object item : this.fileMetadata) {
                baos.write(item.toString().getBytes(UTF8Bytes.UTF8));
            }
            warcinfoBody = baos.toByteArray();
        }
        return writeWarcinfoRecord("text/plain", record,
            new ByteArrayInputStream(warcinfoBody), warcinfoBody.length);
    }

    /**
     * Writes a warcinfo record dated now, under a freshly generated record id
     * qualified with {@code TYPE}/{@code WARCINFO}.
     *
     * @param mimetype mimetype of the record body.
     * @param namedFields named fields of the record.
     * @param fileMetadata record body.
     * @param fileMetadataLength byte length of the record body.
     * @return the generated record id.
     * @throws IOException if id generation or writing fails.
     */
    public URI writeWarcinfoRecord(final String mimetype,
            final ANVLRecord namedFields, final InputStream fileMetadata,
            final long fileMetadataLength)
    throws IOException {
        final URI recordid = generateRecordId(TYPE, WARCINFO);
        writeWarcinfoRecord(ArchiveUtils.get14DigitDate(), mimetype, recordid,
            namedFields, fileMetadata, fileMetadataLength);
        return recordid;
    }

    /**
     * Writes a warcinfo record. The record id doubles as the record URL.
     *
     * @param create14DigitDate record date.
     * @param mimetype mimetype of the record body.
     * @param recordId record identifier.
     * @param namedFields named fields of the record.
     * @param fileMetadata record body.
     * @param fileMetadataLength byte length of the record body.
     * @throws IOException if writing fails.
     */
    public void writeWarcinfoRecord(final String create14DigitDate,
            final String mimetype, final URI recordId,
            final ANVLRecord namedFields,
            final InputStream fileMetadata, final long fileMetadataLength)
    throws IOException {
        writeRecord(WARCINFO, recordId.toString(), create14DigitDate, mimetype,
            recordId, namedFields, fileMetadata, fileMetadataLength);
    }

    /**
     * Writes a record of type {@code REQUEST}.
     *
     * @param url record subject URL.
     * @param create14DigitDate record date.
     * @param mimetype content mimetype.
     * @param recordId record identifier.
     * @param namedFields named fields of the record.
     * @param request record content.
     * @param requestLength byte length of the content.
     * @throws IOException if writing fails.
     */
    public void writeRequestRecord(final String url,
            final String create14DigitDate, final String mimetype,
            final URI recordId,
            final ANVLRecord namedFields, final InputStream request,
            final long requestLength)
    throws IOException {
        writeRecord(REQUEST, url, create14DigitDate,
            mimetype, recordId, namedFields, request,
            requestLength);
    }

    /**
     * Writes a record of type {@code RESOURCE} under an id obtained from
     * {@link #getRecordID()}.
     *
     * @param url record subject URL.
     * @param create14DigitDate record date.
     * @param mimetype content mimetype.
     * @param namedFields named fields of the record.
     * @param response record content.
     * @param responseLength byte length of the content.
     * @throws IOException if id generation or writing fails.
     */
    public void writeResourceRecord(final String url,
            final String create14DigitDate, final String mimetype,
            final ANVLRecord namedFields, final InputStream response,
            final long responseLength)
    throws IOException {
        writeResourceRecord(url, create14DigitDate, mimetype, getRecordID(),
            namedFields, response, responseLength);
    }

    /**
     * Writes a record of type {@code RESOURCE}.
     *
     * @param url record subject URL.
     * @param create14DigitDate record date.
     * @param mimetype content mimetype.
     * @param recordId record identifier.
     * @param namedFields named fields of the record.
     * @param response record content.
     * @param responseLength byte length of the content.
     * @throws IOException if writing fails.
     */
    public void writeResourceRecord(final String url,
            final String create14DigitDate, final String mimetype,
            final URI recordId,
            final ANVLRecord namedFields, final InputStream response,
            final long responseLength)
    throws IOException {
        writeRecord(RESOURCE, url, create14DigitDate,
            mimetype, recordId, namedFields, response,
            responseLength);
    }

    /**
     * Writes a record of type {@code RESPONSE}.
     *
     * @param url record subject URL.
     * @param create14DigitDate record date.
     * @param mimetype content mimetype.
     * @param recordId record identifier.
     * @param namedFields named fields of the record.
     * @param response record content.
     * @param responseLength byte length of the content.
     * @throws IOException if writing fails.
     */
    public void writeResponseRecord(final String url,
            final String create14DigitDate, final String mimetype,
            final URI recordId,
            final ANVLRecord namedFields, final InputStream response,
            final long responseLength)
    throws IOException {
        writeRecord(RESPONSE, url, create14DigitDate,
            mimetype, recordId, namedFields, response,
            responseLength);
    }

    /**
     * Writes a record of type {@code METADATA}.
     *
     * @param url record subject URL.
     * @param create14DigitDate record date.
     * @param mimetype content mimetype.
     * @param recordId record identifier.
     * @param namedFields named fields of the record.
     * @param metadata record content.
     * @param metadataLength byte length of the content.
     * @throws IOException if writing fails.
     */
    public void writeMetadataRecord(final String url,
            final String create14DigitDate, final String mimetype,
            final URI recordId,
            final ANVLRecord namedFields, final InputStream metadata,
            final long metadataLength)
    throws IOException {
        writeRecord(METADATA, url, create14DigitDate,
            mimetype, recordId, namedFields, metadata,
            metadataLength);
    }

    /**
     * Obtains an unqualified record id from the {@link GeneratorFactory}.
     *
     * @return the generated id.
     * @throws IOException if the generator reports a URI syntax problem.
     */
    public static URI getRecordID() throws IOException {
        try {
            return GeneratorFactory.getFactory().getRecordID();
        } catch (URISyntaxException e) {
            throw asIOException(e.toString(), e);
        }
    }
}