WARCRecord


1   /* $Id: WARCRecord.java,v 1.6 2006/08/31 16:51:41 stack-sf Exp $
2    *
3    * Created on August 25th, 2006
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io.warc;
24  
25  import java.io.ByteArrayOutputStream  ;
26  import java.io.IOException  ;
27  import java.io.InputStream  ;
28  import java.util.HashMap  ;
29  import java.util.Map  ;
30  import java.util.Set  ;
31  import java.util.regex.Matcher  ;
32  import java.util.regex.Pattern  ;
33  
34  import org.archive.io.ArchiveRecord;
35  import org.archive.io.ArchiveRecordHeader;
36  import org.archive.util.LongWrapper;
37  import org.archive.util.anvl.ANVLRecord;
38  
39  
40  /**
41   * A WARC file Record.
42   *
43   * @author stack
44   */
45  public class WARCRecord extends ArchiveRecord implements WARCConstants {
46      /**
47       * Header-Line pattern;
48       * I heart http://www.fileformat.info/tool/regex.htm
49       */
50      private final static Pattern   HEADER_LINE = Pattern.compile(
51          "^WARC/([0-9]+\\.[0-9]+(?:\\.[0-9]+)?)" +// Regex group 1: WARC lead-in.
52          "[\\t ]+" +                 // Multiple tabs or spaces.
53          "([0-9]+)" +                // Regex group 2: Length.
54          "[\\t ]+" +                 // Multiple tabs or spaces.
55          "(request|response|warcinfo|resource|metadata|" +
56              "revisit|conversion)" + // Regex group 3: Type of WARC Record.
57          "[\\t ]+" +                 // Multiple tabs or spaces.
58          "([^\\t ]+)" +              // Regex group 4: Subject-uri.
59          "[\\t ]+" +                 // Multiple tabs or spaces.
60          "([0-9]{14})" +             // Regex group 5: Date
61          "[\\t ]+" +                 // Multiple tabs or spaces.
62          "([^\\t ]+)" +              // Regex group 6: Record-Id
63          "[\\t ]+" +                 // Multiple tabs or spaces.
64          "(.+)$");                   // Regex group 7: Mimetype.
65      
66  
67      private Pattern   WHITESPACE = Pattern.compile("\\s");
68      
69      /**
70       * Constructor.
71       *
72       * @param in Stream cue'd up to be at the start of the record this instance
73       * is to represent.
74       * @throws IOException
75       */
76      public WARCRecord(InputStream   in, final String   identifier,
77          final long offset)
78      throws IOException   {
79          this(in, identifier, offset, true, false);
80      }
81      
82      /**
83       * Constructor.
84       * @param in Stream cue'd up just past Header Line and Named Fields.
85       * @param headers Header Line and ANVL Named fields.
86       * @throws IOException
87       */
88      public WARCRecord(InputStream   in, ArchiveRecordHeader headers)
89              throws IOException   {
90          super(in, headers, 0, true, false);
91      }
92  
93      /**
94       * Constructor.
95       *
96       * @param in Stream cue'd up to be at the start of the record this instance
97       * is to represent or, if <code>headers</code> is not null, just past the
98       * Header Line and Named Fields.
99       * @param identifier Identifier for this the hosting Reader.
100      * @param offset Current offset into <code>in</code> (Used to keep
101      * <code>position</code> properly aligned).  Usually 0.
102      * @param digest True if we're to calculate digest for this record.  Not
103      * digesting saves about ~15% of cpu during parse.
104      * @param strict Be strict parsing (Parsing stops if file inproperly
105      * formatted).
106      * @throws IOException
107      */
108     public WARCRecord(final InputStream   in, final String   identifier,
109         final long offset, boolean digest, boolean strict) 
110     throws IOException   {
111         super(in, null, 0, digest, strict);
112         setHeader(parseHeaders(in, identifier, offset, strict));
113     }
114     
115     /**
116      * Parse WARC Header Line and Named Fields.
117      * @param in Stream to read.
118      * @param identifier Identifier for the hosting Reader.
119      * @param offset Absolute offset into Reader.
120      * @param strict Whether to be loose parsing or not.
121      * @return An ArchiveRecordHeader.
122      * @throws IOException 
123      */
124     protected ArchiveRecordHeader parseHeaders(final InputStream   in,
125         final String   identifier, final long offset, final boolean strict)
126     throws IOException   {
127         final Map  <Object  , Object  > m = new HashMap  <Object  , Object  >();
128         m.put(ABSOLUTE_OFFSET_KEY, new Long  (offset));
129         m.put(READER_IDENTIFIER_FIELD_KEY, identifier);
130         // Here we start reading off the inputstream but we're reading the
131         // stream direct rather than going via WARCRecord#read.  The latter will
132         // keep count of bytes read, digest and fail properly if EOR too soon...
133         // We don't want digesting while reading Header Line and Named Fields.
134         // 
135         // The returned length includes terminating CRLF.
136         int headLineLength = parseHeaderLine(in, m, strict);
137         
138         // Now, doing the ANVL parse, hard to know how many bytes have been
139         // read since passed Stream doesn't keep count and the ANVL parse can
140         // throw away bytes (e.g. if white space padding at start of a folded
141         // Value or if a Value has a newline in it and it gets converted to a
142         // CRNL in the ANVL representation).  Wrap the stream in a
143         // byte-counting stream.
144         //
145         // TODO: Buffering.  Currently, we rely on the deflate buffer when
146         // file is gzipped.  Otherwise, if uncompressed, no buffering.
147         final LongWrapper anvlParseLength = new LongWrapper(0);
148         InputStream   countingStream = new InputStream  () {
149             @Override  
150             public int read() throws IOException   {
151                 int c = in.read();
152                 if (c != -1) {
153                     anvlParseLength.longValue++;
154                 }
155                 return c;
156             }
157         };
158         parseNamedFields(countingStream, m);
159         // Set offset at which content begins. Its the Header Line length plus
160         // whatever we read parsing ANVL.
161         final int contentOffset =
162             (int)(headLineLength + anvlParseLength.longValue);
163         incrementPosition(contentOffset);
164    
165         return new ArchiveRecordHeader() {
166             private Map  <Object  , Object  > fields = m;
167             private int contentBegin = contentOffset;
168 
169             public String   getDate() {
170                 return (String  )this.fields.get(DATE_FIELD_KEY);
171             }
172 
173             public String   getDigest() {
174                 return (String  )this.fields.get(NAMED_FIELD_CHECKSUM_LABEL);
175             }
176 
177             public String   getReaderIdentifier() {
178                 return (String  )this.fields.get(READER_IDENTIFIER_FIELD_KEY);
179             }
180 
181             public Set   getHeaderFieldKeys() {
182                 return this.fields.keySet();
183             }
184 
185             public Map   getHeaderFields() {
186                 return this.fields;
187             }
188 
189             public Object   getHeaderValue(String   key) {
190                 return this.fields.get(key);
191             }
192 
193             public long getLength() {
194                 Object   o = this.fields.get(LENGTH_FIELD_KEY);
195                 if (o == null) {
196                     return -1;
197                 }
198                 return ((Long  )o).longValue();
199             }
200 
201             public String   getMimetype() {
202                 return (String  )this.fields.get(MIMETYPE_FIELD_KEY);
203             }
204 
205             public long getOffset() {
206                 Object   o = this.fields.get(ABSOLUTE_OFFSET_KEY);
207                 if (o == null) {
208                     return -1;
209                 }
210                 return ((Long  )o).longValue();
211             }
212 
213             public String   getRecordIdentifier() {
214                 return (String  )this.fields.get(RECORD_IDENTIFIER_FIELD_KEY);
215             }
216 
217             public String   getUrl() {
218                 return (String  )this.fields.get(URL_FIELD_KEY);
219             }
220 
221             public String   getVersion() {
222                 return (String  )this.fields.get(VERSION_FIELD_KEY);
223             }
224             
225             public int getContentBegin() {
226                 return this.contentBegin;
227             }
228             
229             @Override  
230             public String   toString() {
231                 return this.fields.toString();
232             }
233         };
234     }
235     
236     protected int parseHeaderLine(final InputStream   in,
237             final Map  <Object  , Object  > fields, final boolean strict) 
238     throws IOException   {
239         byte [] line = readLine(in, strict);
240         if (line.length <= 2) {
241             throw new IOException  ("No Header Line found");
242         }
243         // Strip the CRLF.
244         String   headerLine = new String  (line, 0, line.length - 2,
245             HEADER_LINE_ENCODING);
246         Matcher   m = HEADER_LINE.matcher(headerLine);
247         if (!m.matches()) {
248             throw new IOException  ("Failed parse of Header Line: " +
249                 headerLine);
250         }
251         for (int i = 0; i < HEADER_FIELD_KEYS.length; i++) {
252             if (i == 1) {
253                 // Do length of Record as a Long.
254                 fields.put(HEADER_FIELD_KEYS[i],
255                     Long.parseLong(m.group(i + 1)));
256                 continue;
257             }
258             fields.put(HEADER_FIELD_KEYS[i], m.group(i + 1));
259         }
260         
261         return line.length;
262     }
263 
264     /**
265      * Read a line.
266      * A 'line' in this context ends in CRLF and contains ascii-only and no
267      * control-characters.
268      * @param in InputStream to read.
269      * @param strict Strict parsing (If false, we'll eat whitespace before the
270      * record.
271      * @return All bytes in line including terminating CRLF.
272      * @throws IOException
273      */
274     protected byte [] readLine(final InputStream   in, final boolean strict) 
275     throws IOException   {
276         boolean done = false;
277         boolean recordStart = strict;
278         int read = 0;
279         ByteArrayOutputStream   baos = new ByteArrayOutputStream  (1024 /*SWAG*/);
280         for (int c  = -1, previousCharacter; !done;) {
281             if (read++ >= MAX_LINE_LENGTH) {
282                 throw new IOException  ("Read " + MAX_LINE_LENGTH +
283                     " bytes without finding CRLF");
284             }
285             previousCharacter = c;
286             c = in.read();
287             if (c == -1) {
288                 throw new IOException  ("End-Of-Stream before CRLF:\n" +
289                     new String  (baos.toByteArray()));
290             }
291             if (isLF((char)c) && isCR((char)previousCharacter)) {
292                 done = true;
293             } else if (!recordStart && Character.isWhitespace(c)) {
294                 // Skip any whitespace at start.
295                 continue;
296             } else {
297                 if (isCR((char)previousCharacter)) {
298                     // If previous character was a CR and this character is not
299                     // a LF, we tested above, thats illegal.
300                     throw new IOException  ("CR in middle of Header:\n" +
301                         new String  (baos.toByteArray()));
302                 }
303                 
304                 // Not whitespace so start record if we haven't already.
305                 if (!recordStart) {
306                     recordStart = true;
307                 }
308             }
309             baos.write(c);
310         }
311         return baos.toByteArray();
312     }
313  
314     protected void parseNamedFields(final InputStream   in,
315         final Map  <Object  , Object  > fields) 
316     throws IOException   {
317         ANVLRecord r = ANVLRecord.load(in);
318         fields.putAll(r.asMap());
319     }
320     
321     public static boolean isCROrLF(final char c) {
322         return isCR(c) || isLF(c);
323     }
324     
325     public static boolean isCR(final char c) {
326         return c == CRLF.charAt(0);
327     }
328     
329     public static boolean isLF(final char c) {
330         return c == CRLF.charAt(1);
331     }
332     
333     
334     @Override  
335     protected String   getMimetype4Cdx(ArchiveRecordHeader h) {
336         final String   m = super.getMimetype4Cdx(h);
337         // Mimetypes can have spaces in WARCs.  Emitting for CDX, just
338         // squash them for now.  Later, quote them since squashing spaces won't
339         // work for params that have quoted-string values.
340         Matcher   matcher = WHITESPACE.matcher(m);
341         return matcher.replaceAll("");
342     }
343 }
344
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags