KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > warc > WARCRecord


/* $Id: WARCRecord.java,v 1.6 2006/08/31 16:51:41 stack-sf Exp $
 *
 * Created on August 25th, 2006
 *
 * Copyright (C) 2006 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

23 package org.archive.io.warc;
24
25 import java.io.ByteArrayOutputStream JavaDoc;
26 import java.io.IOException JavaDoc;
27 import java.io.InputStream JavaDoc;
28 import java.util.HashMap JavaDoc;
29 import java.util.Map JavaDoc;
30 import java.util.Set JavaDoc;
31 import java.util.regex.Matcher JavaDoc;
32 import java.util.regex.Pattern JavaDoc;
33
34 import org.archive.io.ArchiveRecord;
35 import org.archive.io.ArchiveRecordHeader;
36 import org.archive.util.LongWrapper;
37 import org.archive.util.anvl.ANVLRecord;
38
39
40 /**
41  * A WARC file Record.
42  *
43  * @author stack
44  */

45 public class WARCRecord extends ArchiveRecord implements WARCConstants {
46     /**
47      * Header-Line pattern;
48      * I heart http://www.fileformat.info/tool/regex.htm
49      */

50     private final static Pattern JavaDoc HEADER_LINE = Pattern.compile(
51         "^WARC/([0-9]+\\.[0-9]+(?:\\.[0-9]+)?)" +// Regex group 1: WARC lead-in.
52
"[\\t ]+" + // Multiple tabs or spaces.
53
"([0-9]+)" + // Regex group 2: Length.
54
"[\\t ]+" + // Multiple tabs or spaces.
55
"(request|response|warcinfo|resource|metadata|" +
56             "revisit|conversion)" + // Regex group 3: Type of WARC Record.
57
"[\\t ]+" + // Multiple tabs or spaces.
58
"([^\\t ]+)" + // Regex group 4: Subject-uri.
59
"[\\t ]+" + // Multiple tabs or spaces.
60
"([0-9]{14})" + // Regex group 5: Date
61
"[\\t ]+" + // Multiple tabs or spaces.
62
"([^\\t ]+)" + // Regex group 6: Record-Id
63
"[\\t ]+" + // Multiple tabs or spaces.
64
"(.+)$"); // Regex group 7: Mimetype.
65

66
67     private Pattern JavaDoc WHITESPACE = Pattern.compile("\\s");
68     
69     /**
70      * Constructor.
71      *
72      * @param in Stream cue'd up to be at the start of the record this instance
73      * is to represent.
74      * @throws IOException
75      */

76     public WARCRecord(InputStream JavaDoc in, final String JavaDoc identifier,
77         final long offset)
78     throws IOException JavaDoc {
79         this(in, identifier, offset, true, false);
80     }
81     
82     /**
83      * Constructor.
84      * @param in Stream cue'd up just past Header Line and Named Fields.
85      * @param headers Header Line and ANVL Named fields.
86      * @throws IOException
87      */

88     public WARCRecord(InputStream JavaDoc in, ArchiveRecordHeader headers)
89             throws IOException JavaDoc {
90         super(in, headers, 0, true, false);
91     }
92
93     /**
94      * Constructor.
95      *
96      * @param in Stream cue'd up to be at the start of the record this instance
97      * is to represent or, if <code>headers</code> is not null, just past the
98      * Header Line and Named Fields.
99      * @param identifier Identifier for this the hosting Reader.
100      * @param offset Current offset into <code>in</code> (Used to keep
101      * <code>position</code> properly aligned). Usually 0.
102      * @param digest True if we're to calculate digest for this record. Not
103      * digesting saves about ~15% of cpu during parse.
104      * @param strict Be strict parsing (Parsing stops if file inproperly
105      * formatted).
106      * @throws IOException
107      */

108     public WARCRecord(final InputStream JavaDoc in, final String JavaDoc identifier,
109         final long offset, boolean digest, boolean strict)
110     throws IOException JavaDoc {
111         super(in, null, 0, digest, strict);
112         setHeader(parseHeaders(in, identifier, offset, strict));
113     }
114     
115     /**
116      * Parse WARC Header Line and Named Fields.
117      * @param in Stream to read.
118      * @param identifier Identifier for the hosting Reader.
119      * @param offset Absolute offset into Reader.
120      * @param strict Whether to be loose parsing or not.
121      * @return An ArchiveRecordHeader.
122      * @throws IOException
123      */

124     protected ArchiveRecordHeader parseHeaders(final InputStream JavaDoc in,
125         final String JavaDoc identifier, final long offset, final boolean strict)
126     throws IOException JavaDoc {
127         final Map JavaDoc<Object JavaDoc, Object JavaDoc> m = new HashMap JavaDoc<Object JavaDoc, Object JavaDoc>();
128         m.put(ABSOLUTE_OFFSET_KEY, new Long JavaDoc(offset));
129         m.put(READER_IDENTIFIER_FIELD_KEY, identifier);
130         // Here we start reading off the inputstream but we're reading the
131
// stream direct rather than going via WARCRecord#read. The latter will
132
// keep count of bytes read, digest and fail properly if EOR too soon...
133
// We don't want digesting while reading Header Line and Named Fields.
134
//
135
// The returned length includes terminating CRLF.
136
int headLineLength = parseHeaderLine(in, m, strict);
137         
138         // Now, doing the ANVL parse, hard to know how many bytes have been
139
// read since passed Stream doesn't keep count and the ANVL parse can
140
// throw away bytes (e.g. if white space padding at start of a folded
141
// Value or if a Value has a newline in it and it gets converted to a
142
// CRNL in the ANVL representation). Wrap the stream in a
143
// byte-counting stream.
144
//
145
// TODO: Buffering. Currently, we rely on the deflate buffer when
146
// file is gzipped. Otherwise, if uncompressed, no buffering.
147
final LongWrapper anvlParseLength = new LongWrapper(0);
148         InputStream JavaDoc countingStream = new InputStream JavaDoc() {
149             @Override JavaDoc
150             public int read() throws IOException JavaDoc {
151                 int c = in.read();
152                 if (c != -1) {
153                     anvlParseLength.longValue++;
154                 }
155                 return c;
156             }
157         };
158         parseNamedFields(countingStream, m);
159         // Set offset at which content begins. Its the Header Line length plus
160
// whatever we read parsing ANVL.
161
final int contentOffset =
162             (int)(headLineLength + anvlParseLength.longValue);
163         incrementPosition(contentOffset);
164    
165         return new ArchiveRecordHeader() {
166             private Map JavaDoc<Object JavaDoc, Object JavaDoc> fields = m;
167             private int contentBegin = contentOffset;
168
169             public String JavaDoc getDate() {
170                 return (String JavaDoc)this.fields.get(DATE_FIELD_KEY);
171             }
172
173             public String JavaDoc getDigest() {
174                 return (String JavaDoc)this.fields.get(NAMED_FIELD_CHECKSUM_LABEL);
175             }
176
177             public String JavaDoc getReaderIdentifier() {
178                 return (String JavaDoc)this.fields.get(READER_IDENTIFIER_FIELD_KEY);
179             }
180
181             public Set JavaDoc getHeaderFieldKeys() {
182                 return this.fields.keySet();
183             }
184
185             public Map JavaDoc getHeaderFields() {
186                 return this.fields;
187             }
188
189             public Object JavaDoc getHeaderValue(String JavaDoc key) {
190                 return this.fields.get(key);
191             }
192
193             public long getLength() {
194                 Object JavaDoc o = this.fields.get(LENGTH_FIELD_KEY);
195                 if (o == null) {
196                     return -1;
197                 }
198                 return ((Long JavaDoc)o).longValue();
199             }
200
201             public String JavaDoc getMimetype() {
202                 return (String JavaDoc)this.fields.get(MIMETYPE_FIELD_KEY);
203             }
204
205             public long getOffset() {
206                 Object JavaDoc o = this.fields.get(ABSOLUTE_OFFSET_KEY);
207                 if (o == null) {
208                     return -1;
209                 }
210                 return ((Long JavaDoc)o).longValue();
211             }
212
213             public String JavaDoc getRecordIdentifier() {
214                 return (String JavaDoc)this.fields.get(RECORD_IDENTIFIER_FIELD_KEY);
215             }
216
217             public String JavaDoc getUrl() {
218                 return (String JavaDoc)this.fields.get(URL_FIELD_KEY);
219             }
220
221             public String JavaDoc getVersion() {
222                 return (String JavaDoc)this.fields.get(VERSION_FIELD_KEY);
223             }
224             
225             public int getContentBegin() {
226                 return this.contentBegin;
227             }
228             
229             @Override JavaDoc
230             public String JavaDoc toString() {
231                 return this.fields.toString();
232             }
233         };
234     }
235     
236     protected int parseHeaderLine(final InputStream JavaDoc in,
237             final Map JavaDoc<Object JavaDoc, Object JavaDoc> fields, final boolean strict)
238     throws IOException JavaDoc {
239         byte [] line = readLine(in, strict);
240         if (line.length <= 2) {
241             throw new IOException JavaDoc("No Header Line found");
242         }
243         // Strip the CRLF.
244
String JavaDoc headerLine = new String JavaDoc(line, 0, line.length - 2,
245             HEADER_LINE_ENCODING);
246         Matcher JavaDoc m = HEADER_LINE.matcher(headerLine);
247         if (!m.matches()) {
248             throw new IOException JavaDoc("Failed parse of Header Line: " +
249                 headerLine);
250         }
251         for (int i = 0; i < HEADER_FIELD_KEYS.length; i++) {
252             if (i == 1) {
253                 // Do length of Record as a Long.
254
fields.put(HEADER_FIELD_KEYS[i],
255                     Long.parseLong(m.group(i + 1)));
256                 continue;
257             }
258             fields.put(HEADER_FIELD_KEYS[i], m.group(i + 1));
259         }
260         
261         return line.length;
262     }
263
264     /**
265      * Read a line.
266      * A 'line' in this context ends in CRLF and contains ascii-only and no
267      * control-characters.
268      * @param in InputStream to read.
269      * @param strict Strict parsing (If false, we'll eat whitespace before the
270      * record.
271      * @return All bytes in line including terminating CRLF.
272      * @throws IOException
273      */

274     protected byte [] readLine(final InputStream JavaDoc in, final boolean strict)
275     throws IOException JavaDoc {
276         boolean done = false;
277         boolean recordStart = strict;
278         int read = 0;
279         ByteArrayOutputStream JavaDoc baos = new ByteArrayOutputStream JavaDoc(1024 /*SWAG*/);
280         for (int c = -1, previousCharacter; !done;) {
281             if (read++ >= MAX_LINE_LENGTH) {
282                 throw new IOException JavaDoc("Read " + MAX_LINE_LENGTH +
283                     " bytes without finding CRLF");
284             }
285             previousCharacter = c;
286             c = in.read();
287             if (c == -1) {
288                 throw new IOException JavaDoc("End-Of-Stream before CRLF:\n" +
289                     new String JavaDoc(baos.toByteArray()));
290             }
291             if (isLF((char)c) && isCR((char)previousCharacter)) {
292                 done = true;
293             } else if (!recordStart && Character.isWhitespace(c)) {
294                 // Skip any whitespace at start.
295
continue;
296             } else {
297                 if (isCR((char)previousCharacter)) {
298                     // If previous character was a CR and this character is not
299
// a LF, we tested above, thats illegal.
300
throw new IOException JavaDoc("CR in middle of Header:\n" +
301                         new String JavaDoc(baos.toByteArray()));
302                 }
303                 
304                 // Not whitespace so start record if we haven't already.
305
if (!recordStart) {
306                     recordStart = true;
307                 }
308             }
309             baos.write(c);
310         }
311         return baos.toByteArray();
312     }
313  
314     protected void parseNamedFields(final InputStream JavaDoc in,
315         final Map JavaDoc<Object JavaDoc, Object JavaDoc> fields)
316     throws IOException JavaDoc {
317         ANVLRecord r = ANVLRecord.load(in);
318         fields.putAll(r.asMap());
319     }
320     
321     public static boolean isCROrLF(final char c) {
322         return isCR(c) || isLF(c);
323     }
324     
325     public static boolean isCR(final char c) {
326         return c == CRLF.charAt(0);
327     }
328     
329     public static boolean isLF(final char c) {
330         return c == CRLF.charAt(1);
331     }
332     
333     
334     @Override JavaDoc
335     protected String JavaDoc getMimetype4Cdx(ArchiveRecordHeader h) {
336         final String JavaDoc m = super.getMimetype4Cdx(h);
337         // Mimetypes can have spaces in WARCs. Emitting for CDX, just
338
// squash them for now. Later, quote them since squashing spaces won't
339
// work for params that have quoted-string values.
340
Matcher JavaDoc matcher = WHITESPACE.matcher(m);
341         return matcher.replaceAll("");
342     }
343 }
344
Popular Tags