package org.archive.io.warc;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.util.LongWrapper;
import org.archive.util.anvl.ANVLRecord;


/**
 * A single WARC record read from an {@link InputStream}.
 *
 * <p>A record's header is made of one "header line" (matched by
 * {@link #HEADER_LINE}) followed by a block of named fields that is parsed
 * with {@link ANVLRecord#load(InputStream)}. Both parts are collected into a
 * single map that backs the {@link ArchiveRecordHeader} handed to the
 * superclass.
 */
public class WARCRecord extends ArchiveRecord implements WARCConstants {
    /**
     * Shape of the header line, without its terminating CRLF. The capture
     * groups are, in order: a version number after the literal {@code WARC/},
     * a run of decimal digits, one of the known record-type keywords, a
     * token free of tabs and spaces, a 14-digit run, another token free of
     * tabs and spaces, and finally the remainder of the line. Groups are
     * separated by one or more tabs or spaces.
     *
     * <p>Group {@code i + 1} is stored under {@code HEADER_FIELD_KEYS[i]} by
     * {@link #parseHeaderLine(InputStream, Map, boolean)}.
     */
    private final static Pattern HEADER_LINE = Pattern.compile(
        "^WARC/([0-9]+\\.[0-9]+(?:\\.[0-9]+)?)" +
        "[\\t ]+" +
        "([0-9]+)" +
        "[\\t ]+" +
        "(request|response|warcinfo|resource|metadata|" +
            "revisit|conversion)" +
        "[\\t ]+" +
        "([^\\t ]+)" +
        "[\\t ]+" +
        "([0-9]{14})" +
        "[\\t ]+" +
        "([^\\t ]+)" +
        "[\\t ]+" +
        "(.+)$");

    /**
     * Matches a single whitespace character. Compiled once and shared:
     * {@link Pattern} instances are immutable and safe to reuse.
     */
    private static final Pattern WHITESPACE = Pattern.compile("\\s");

    /**
     * Creates a record by parsing its header from {@code in}, with digesting
     * enabled and lenient (non-strict) header parsing.
     *
     * @param in stream positioned at the start of the record
     * @param identifier identifier of the reader, recorded in the header
     * @param offset absolute offset of the record, recorded in the header
     * @throws IOException if the header cannot be read or parsed
     */
    public WARCRecord(InputStream in, final String identifier,
            final long offset)
    throws IOException {
        this(in, identifier, offset, true, false);
    }

    /**
     * Creates a record from an already-built header; nothing is parsed from
     * {@code in} by this constructor.
     *
     * @param in stream to read the record from; presumably already
     *     positioned past the header (TODO confirm against callers)
     * @param headers header to associate with this record
     * @throws IOException if the superclass constructor fails
     */
    public WARCRecord(InputStream in, ArchiveRecordHeader headers)
    throws IOException {
        super(in, headers, 0, true, false);
    }

    /**
     * Creates a record by parsing its header from {@code in}.
     *
     * @param in stream positioned at the start of the record
     * @param identifier identifier of the reader, recorded in the header
     * @param offset absolute offset of the record, recorded in the header
     * @param digest passed through to the superclass constructor
     * @param strict passed through to the superclass constructor; when
     *     {@code false}, whitespace ahead of the header line is skipped
     * @throws IOException if the header cannot be read or parsed
     */
    public WARCRecord(final InputStream in, final String identifier,
            final long offset, boolean digest, boolean strict)
    throws IOException {
        super(in, null, 0, digest, strict);
        setHeader(parseHeaders(in, identifier, offset, strict));
    }

    /**
     * Reads the header line and the named fields that follow it, and wraps
     * the result in an {@link ArchiveRecordHeader}.
     *
     * <p>As a side effect the record position is advanced, via
     * {@code incrementPosition}, by the number of header bytes accounted for
     * (length of the returned header line plus bytes consumed by the
     * named-field parser).
     *
     * @param in stream positioned at the start of the record
     * @param identifier stored under {@code READER_IDENTIFIER_FIELD_KEY}
     * @param offset stored under {@code ABSOLUTE_OFFSET_KEY}
     * @param strict when {@code false}, leading whitespace is skipped
     * @return header view backed by the parsed fields
     * @throws IOException if the header cannot be read or parsed
     */
    protected ArchiveRecordHeader parseHeaders(final InputStream in,
            final String identifier, final long offset, final boolean strict)
    throws IOException {
        final Map<Object, Object> m = new HashMap<Object, Object>();
        m.put(ABSOLUTE_OFFSET_KEY, Long.valueOf(offset));
        m.put(READER_IDENTIFIER_FIELD_KEY, identifier);
        int headLineLength = parseHeaderLine(in, m, strict);

        // Count the bytes the named-field parser pulls from the stream so
        // that the start of the record content can be computed afterwards.
        final LongWrapper anvlParseLength = new LongWrapper(0);
        InputStream countingStream = new InputStream() {
            @Override
            public int read() throws IOException {
                int c = in.read();
                if (c != -1) {
                    anvlParseLength.longValue++;
                }
                return c;
            }
        };
        parseNamedFields(countingStream, m);

        final int contentOffset =
            (int)(headLineLength + anvlParseLength.longValue);
        incrementPosition(contentOffset);

        return new ArchiveRecordHeader() {
            private final Map<Object, Object> fields = m;
            private final int contentBegin = contentOffset;

            /** Returns the named field as a long, or -1 when it is absent. */
            private long longField(final Object key) {
                Object o = this.fields.get(key);
                if (o == null) {
                    return -1;
                }
                return ((Long)o).longValue();
            }

            public String getDate() {
                return (String)this.fields.get(DATE_FIELD_KEY);
            }

            public String getDigest() {
                return (String)this.fields.get(NAMED_FIELD_CHECKSUM_LABEL);
            }

            public String getReaderIdentifier() {
                return (String)this.fields.get(READER_IDENTIFIER_FIELD_KEY);
            }

            public Set getHeaderFieldKeys() {
                return this.fields.keySet();
            }

            public Map getHeaderFields() {
                return this.fields;
            }

            public Object getHeaderValue(String key) {
                return this.fields.get(key);
            }

            public long getLength() {
                return longField(LENGTH_FIELD_KEY);
            }

            public String getMimetype() {
                return (String)this.fields.get(MIMETYPE_FIELD_KEY);
            }

            public long getOffset() {
                return longField(ABSOLUTE_OFFSET_KEY);
            }

            public String getRecordIdentifier() {
                return (String)this.fields.get(RECORD_IDENTIFIER_FIELD_KEY);
            }

            public String getUrl() {
                return (String)this.fields.get(URL_FIELD_KEY);
            }

            public String getVersion() {
                return (String)this.fields.get(VERSION_FIELD_KEY);
            }

            public int getContentBegin() {
                return this.contentBegin;
            }

            @Override
            public String toString() {
                return this.fields.toString();
            }
        };
    }

    /**
     * Reads one CRLF-terminated line, matches it against
     * {@link #HEADER_LINE} and stores each capture group in {@code fields}
     * under the corresponding entry of {@code HEADER_FIELD_KEYS}. The second
     * group (index 1) is stored as a {@code Long}; all others as
     * {@code String}.
     *
     * <p>NOTE(review): the returned count does not include any leading
     * whitespace that {@link #readLine(InputStream, boolean)} skipped in
     * lenient mode, so position accounting in the caller may undercount by
     * that amount -- confirm whether this matters for callers.
     *
     * @param in stream positioned at the start of the header line
     * @param fields map that receives the parsed values
     * @param strict when {@code false}, leading whitespace is skipped
     * @return number of bytes in the header line, CRLF included
     * @throws IOException if no line is found or it does not match
     */
    protected int parseHeaderLine(final InputStream in,
            final Map<Object, Object> fields, final boolean strict)
    throws IOException {
        byte[] line = readLine(in, strict);
        if (line.length <= 2) {
            throw new IOException("No Header Line found");
        }
        // Drop the trailing CRLF before matching.
        String headerLine = new String(line, 0, line.length - 2,
            HEADER_LINE_ENCODING);
        Matcher m = HEADER_LINE.matcher(headerLine);
        if (!m.matches()) {
            throw new IOException("Failed parse of Header Line: " +
                headerLine);
        }
        for (int i = 0; i < HEADER_FIELD_KEYS.length; i++) {
            if (i == 1) {
                // Numeric field: keep it as a Long so the header's
                // long-valued getters can cast it directly.
                fields.put(HEADER_FIELD_KEYS[i],
                    Long.parseLong(m.group(i + 1)));
                continue;
            }
            fields.put(HEADER_FIELD_KEYS[i], m.group(i + 1));
        }

        return line.length;
    }

    /**
     * Reads bytes up to and including the first CRLF.
     *
     * <p>When {@code strict} is {@code false}, any whitespace ahead of the
     * first non-whitespace byte is discarded, including complete CRLF pairs
     * and lone CR or LF bytes; none of it appears in the returned array.
     * Once the line has started, a CR that is not immediately followed by
     * LF is rejected.
     *
     * @param in stream to read from
     * @param strict when {@code true}, the line starts at the very first
     *     byte read
     * @return the bytes of the line, terminating CRLF included
     * @throws IOException if the stream ends before a CRLF, a CR appears
     *     without a following LF, or {@code MAX_LINE_LENGTH} bytes are read
     *     without finding a CRLF
     */
    protected byte[] readLine(final InputStream in, final boolean strict)
    throws IOException {
        boolean recordStart = strict;
        int read = 0;
        // Last byte that was kept; stays -1 while leading whitespace is
        // being skipped so that a discarded CR is never mistaken for part
        // of the line terminator.
        int previous = -1;
        ByteArrayOutputStream baos = new ByteArrayOutputStream(1024);
        while (true) {
            if (read++ >= MAX_LINE_LENGTH) {
                throw new IOException("Read " + MAX_LINE_LENGTH +
                    " bytes without finding CRLF");
            }
            int c = in.read();
            if (c == -1) {
                throw new IOException("End-Of-Stream before CRLF:\n" +
                    new String(baos.toByteArray(), HEADER_LINE_ENCODING));
            }
            if (!recordStart) {
                // The skip must be tested before the CRLF check: otherwise
                // a leading blank line would end the read with a lone LF.
                if (Character.isWhitespace(c)) {
                    continue;
                }
                recordStart = true;
            }
            if (isLF((char)c) && isCR((char)previous)) {
                baos.write(c);
                return baos.toByteArray();
            }
            if (isCR((char)previous)) {
                // Previous byte was a CR and this one is not a LF.
                throw new IOException("CR in middle of Header:\n" +
                    new String(baos.toByteArray(), HEADER_LINE_ENCODING));
            }
            baos.write(c);
            previous = c;
        }
    }

    /**
     * Parses the named-field block with {@link ANVLRecord#load(InputStream)}
     * and copies every resulting entry into {@code fields}.
     *
     * @param in stream positioned just after the header line
     * @param fields map that receives the parsed entries
     * @throws IOException if the named fields cannot be read
     */
    protected void parseNamedFields(final InputStream in,
            final Map<Object, Object> fields)
    throws IOException {
        ANVLRecord r = ANVLRecord.load(in);
        fields.putAll(r.asMap());
    }

    /**
     * @param c character to test
     * @return true if {@code c} equals either character of {@code CRLF}
     */
    public static boolean isCROrLF(final char c) {
        return isCR(c) || isLF(c);
    }

    /**
     * @param c character to test
     * @return true if {@code c} equals the first character of {@code CRLF}
     */
    public static boolean isCR(final char c) {
        return c == CRLF.charAt(0);
    }

    /**
     * @param c character to test
     * @return true if {@code c} equals the second character of {@code CRLF}
     */
    public static boolean isLF(final char c) {
        return c == CRLF.charAt(1);
    }

    /**
     * Returns the superclass's CDX mimetype with every whitespace character
     * removed.
     *
     * @param h header to take the mimetype from
     * @return the mimetype with all whitespace stripped
     */
    @Override
    protected String getMimetype4Cdx(ArchiveRecordHeader h) {
        final String m = super.getMimetype4Cdx(h);
        Matcher matcher = WHITESPACE.matcher(m);
        return matcher.replaceAll("");
    }
}