KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > ArchiveRecord


1 /* $Id: ArchiveRecord.java,v 1.8.2.1 2007/01/13 01:31:31 stack-sf Exp $
2  *
3  * Created on August 21st, 2006
4  *
5  * Copyright (C) 2006 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.io;
24
25 import java.io.IOException JavaDoc;
26 import java.io.InputStream JavaDoc;
27 import java.io.OutputStream JavaDoc;
28 import java.security.MessageDigest JavaDoc;
29 import java.security.NoSuchAlgorithmException JavaDoc;
30 import java.util.logging.Level JavaDoc;
31
32 import org.archive.util.Base32;
33
34 /**
35  * Archive file Record.
36  * @author stack
37  * @version $Date: 2007/01/13 01:31:31 $ $Version$
38  */

39 public abstract class ArchiveRecord extends InputStream JavaDoc {
40     ArchiveRecordHeader header = null;
41
42     /**
43      * Stream to read this record from.
44      *
45      * Stream can only be read sequentially. Will only return this records'
46      * content returning a -1 if you try to read beyond the end of the current
47      * record.
48      *
49      * <p>Streams can be markable or not. If they are, we'll be able to roll
50      * back when we've read too far. If not markable, assumption is that
51      * the underlying stream is managing our not reading too much (This pertains
52      * to the skipping over the end of the ARCRecord. See {@link #skip()}.
53      */

54     InputStream JavaDoc in = null;
55
56     /**
57      * Position w/i the Record content, within <code>in</code>.
58      * This position is relative within this Record. Its not same as the
59      * Archive file position.
60      */

61     long position = 0;
62
63     /**
64      * Set flag when we've reached the end-of-record.
65      */

66     boolean eor = false;
67     
68     /**
69      * Compute digest on what we read and add to metadata when done.
70      *
71      * Currently hardcoded as sha-1. TODO: Remove when archive records
72      * digest or else, add a facility that allows the arc reader to
73      * compare the calculated digest to that which is recorded in
74      * the arc.
75      *
76      * <p>Protected instead of private so subclasses can update and complete
77      * the digest.
78      */

79     protected MessageDigest JavaDoc digest = null;
80     private String JavaDoc digestStr = null;
81
82     boolean strict = false;
83     
84     private ArchiveRecord() {
85         super();
86     }
87     
88     /**
89      * Constructor.
90      *
91      * @param in Stream cue'd up to be at the start of the record this instance
92      * is to represent.
93      * @throws IOException
94      */

95     public ArchiveRecord(InputStream JavaDoc in)
96             throws IOException JavaDoc {
97         this(in, null, 0, true, false);
98     }
99     
100     /**
101      * Constructor.
102      *
103      * @param in Stream cue'd up to be at the start of the record this instance
104      * is to represent.
105      * @param header Header data.
106      * @throws IOException
107      */

108     public ArchiveRecord(InputStream JavaDoc in, ArchiveRecordHeader header)
109             throws IOException JavaDoc {
110         this(in, header, 0, true, false);
111     }
112
113     /**
114      * Constructor.
115      *
116      * @param in Stream cue'd up to be at the start of the record this instance
117      * is to represent.
118      * @param header Header data.
119      * @param bodyOffset Offset into the body. Usually 0.
120      * @param digest True if we're to calculate digest for this record. Not
121      * digesting saves about ~15% of cpu during an ARC parse.
122      * @param strict Be strict parsing (Parsing stops if ARC inproperly
123      * formatted).
124      * @throws IOException
125      */

126     public ArchiveRecord(InputStream JavaDoc in, ArchiveRecordHeader header,
127         int bodyOffset, boolean digest, boolean strict)
128     throws IOException JavaDoc {
129         this.in = in;
130         this.header = header;
131         this.position = bodyOffset;
132         if (digest) {
133             try {
134                 this.digest = MessageDigest.getInstance("SHA1");
135             } catch (NoSuchAlgorithmException JavaDoc e) {
136                 // Convert to IOE because thats more amenable to callers
137
// -- they are dealing with it anyways.
138
throw new IOException JavaDoc(e.getMessage());
139             }
140         }
141         this.strict = strict;
142     }
143
144     public boolean markSupported() {
145         return false;
146     }
147
148     /**
149      * @return Header data for this record.
150      */

151     public ArchiveRecordHeader getHeader() {
152         return this.header;
153     }
154     
155     protected void setHeader(ArchiveRecordHeader header) {
156         this.header = header;
157     }
158
159     /**
160      * Calling close on a record skips us past this record to the next record
161      * in the stream.
162      *
163      * It does not actually close the stream. The underlying steam is probably
164      * being used by the next arc record.
165      *
166      * @throws IOException
167      */

168     public void close() throws IOException JavaDoc {
169         if (this.in != null) {
170             skip();
171             this.in = null;
172             if (this.digest != null) {
173                 this.digestStr = Base32.encode(this.digest.digest());
174             }
175         }
176     }
177
178     /**
179      * @return Next character in this Record content else -1 if at EOR.
180      * @throws IOException
181      */

182     public int read() throws IOException JavaDoc {
183         int c = -1;
184         if (available() > 0) {
185             c = this.in.read();
186             if (c == -1) {
187                 throw new IOException JavaDoc("Premature EOF before end-of-record.");
188             }
189             if (this.digest != null) {
190                 this.digest.update((byte) c);
191             }
192         }
193         incrementPosition();
194         return c;
195     }
196
197     public int read(byte[] b, int offset, int length) throws IOException JavaDoc {
198         int read = Math.min(length, available());
199         if (read == -1 || read == 0) {
200             read = -1;
201         } else {
202             read = this.in.read(b, offset, read);
203             if (read == -1) {
204                 String JavaDoc msg = "Premature EOF before end-of-record: "
205                     + getHeader().getHeaderFields();
206                 if (isStrict()) {
207                     throw new IOException JavaDoc(msg);
208                 }
209                 setEor(true);
210                 System.err.println(Level.WARNING.toString() + " " + msg);
211             }
212             if (this.digest != null && read >= 0) {
213                 this.digest.update(b, offset, read);
214             }
215         }
216         incrementPosition(read);
217         return read;
218     }
219
220     /**
221      * This available is not the stream's available. Its an available based on
222      * what the stated Archive record length is minus what we've read to date.
223      *
224      * @return True if bytes remaining in record content.
225      */

226     public int available() {
227         return (int)(getHeader().getLength() - getPosition());
228     }
229
230     /**
231      * Skip over this records content.
232      *
233      * @throws IOException
234      */

235     void skip() throws IOException JavaDoc {
236         if (this.eor) {
237             return;
238         }
239         
240         // Read to the end of the body of the record. Exhaust the stream.
241
// Can't skip direct to end because underlying stream may be compressed
242
// and we're calculating the digest for the record.
243
if (available() > 0) {
244             skip(available());
245         }
246     }
247     
248     public long skip(long n) throws IOException JavaDoc {
249         final int SKIP_BUFFERSIZE = 1024 * 4;
250         byte[] b = new byte[SKIP_BUFFERSIZE];
251         long total = 0;
252         for (int read = 0; (total < n) && (read != -1);) {
253             read = Math.min(SKIP_BUFFERSIZE, (int) (n - total));
254             // TODO: Interesting is that reading from compressed stream, we only
255
// read about 500 characters at a time though we ask for 4k.
256
// Look at this sometime.
257
read = read(b, 0, read);
258             if (read <= 0) {
259                 read = -1;
260             } else {
261                 total += read;
262             }
263         }
264         return total;
265     }
266
267     /**
268      * @return Returns the strict.
269      */

270     public boolean isStrict() {
271         return this.strict;
272     }
273
274     /**
275      * @param strict The strict to set.
276      */

277     public void setStrict(boolean strict) {
278         this.strict = strict;
279     }
280
281     protected InputStream JavaDoc getIn() {
282         return this.in;
283     }
284
285     public String JavaDoc getDigestStr() {
286         return this.digestStr;
287     }
288     
289     protected void incrementPosition() {
290         this.position++;
291     }
292     
293     protected void incrementPosition(final long incr) {
294         this.position += incr;
295     }
296     
297     protected long getPosition() {
298         return this.position;
299     }
300
301     protected boolean isEor() {
302         return eor;
303     }
304
305     protected void setEor(boolean eor) {
306         this.eor = eor;
307     }
308     
309     protected String JavaDoc getStatusCode4Cdx(final ArchiveRecordHeader h) {
310         return "-";
311     }
312     
313     protected String JavaDoc getIp4Cdx(final ArchiveRecordHeader h) {
314         return "-";
315     }
316     
317     protected String JavaDoc getDigest4Cdx(final ArchiveRecordHeader h) {
318         return getDigestStr() == null? "-": getDigestStr();
319     }
320     
321     protected String JavaDoc getMimetype4Cdx(final ArchiveRecordHeader h) {
322         return h.getMimetype();
323     }
324
325     protected String JavaDoc outputCdx(final String JavaDoc strippedFileName)
326     throws IOException JavaDoc {
327         // Read the whole record so we get out a hash. Should be safe calling
328
// close on already closed Record.
329
close();
330         ArchiveRecordHeader h = getHeader();
331         StringBuilder JavaDoc buffer =
332             new StringBuilder JavaDoc(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE);
333         buffer.append(h.getDate());
334         buffer.append(ArchiveFileConstants.SINGLE_SPACE);
335         buffer.append(getIp4Cdx(h));
336         buffer.append(ArchiveFileConstants.SINGLE_SPACE);
337         buffer.append(h.getUrl());
338         buffer.append(ArchiveFileConstants.SINGLE_SPACE);
339         buffer.append(getMimetype4Cdx(h));
340         buffer.append(ArchiveFileConstants.SINGLE_SPACE);
341         buffer.append(getStatusCode4Cdx(h));
342         buffer.append(ArchiveFileConstants.SINGLE_SPACE);
343         buffer.append(getDigest4Cdx(h));
344         buffer.append(ArchiveFileConstants.SINGLE_SPACE);
345         buffer.append(h.getOffset());
346         buffer.append(ArchiveFileConstants.SINGLE_SPACE);
347         buffer.append(h.getLength());
348         buffer.append(ArchiveFileConstants.SINGLE_SPACE);
349         buffer.append(strippedFileName != null? strippedFileName: '-');
350         return buffer.toString();
351     }
352     
353     /**
354      * Writes output on STDOUT.
355      * @throws IOException
356      */

357     public void dump()
358     throws IOException JavaDoc {
359         dump(System.out);
360     }
361     
362     /**
363      * Writes output on passed <code>os</code>.
364      * @throws IOException
365      */

366     public void dump(final OutputStream JavaDoc os)
367     throws IOException JavaDoc {
368         final byte [] outputBuffer = new byte [16*1024];
369         int read = outputBuffer.length;
370         while ((read = read(outputBuffer, 0, outputBuffer.length)) != -1) {
371             os.write(outputBuffer, 0, read);
372         }
373         os.flush();
374     }
375 }
376
Popular Tags