KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > arc > ARCRecord


1 /* ARCRecord
2  *
3  * $Id: ARCRecord.java,v 1.34.2.1 2007/01/13 01:31:36 stack-sf Exp $
4  *
5  * Created on Jan 7, 2004
6  *
7  * Copyright (C) 2004 Internet Archive.
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  */

25 package org.archive.io.arc;
26
27 import java.io.ByteArrayInputStream JavaDoc;
28 import java.io.ByteArrayOutputStream JavaDoc;
29 import java.io.IOException JavaDoc;
30 import java.io.InputStream JavaDoc;
31
32 import org.apache.commons.httpclient.Header;
33 import org.apache.commons.httpclient.HttpParser;
34 import org.apache.commons.httpclient.StatusLine;
35 import org.apache.commons.httpclient.util.EncodingUtil;
36 import org.archive.io.ArchiveRecord;
37 import org.archive.io.ArchiveRecordHeader;
38
39
40 /**
41  * An ARC file record.
42  * Does not compass the ARCRecord metadata line, just the record content.
43  * @author stack
44  */

45 public class ARCRecord extends ArchiveRecord implements ARCConstants {
46     /**
47      * Http status line object.
48      *
49      * May be null if record is not http.
50      */

51     private StatusLine httpStatus = null;
52
53     /**
54      * Http header bytes.
55      *
56      * If non-null and bytes available, give out its contents before we
57      * go back to the underlying stream.
58      */

59     private InputStream JavaDoc httpHeaderStream = null;
60     
61     /**
62      * Http headers.
63      *
64      * Only populated after reading of headers.
65      */

66     private Header [] httpHeaders = null;
67
68     
69     /**
70      * Minimal http header length.
71      *
72      * I've seen in arcs content length of 1 with no
73      * header.
74      */

75     private static final long MIN_HTTP_HEADER_LENGTH =
76         "HTTP/1.1 200 OK\r\n".length();
77     
78     /**
79      * Constructor.
80      *
81      * @param in Stream cue'd up to be at the start of the record this instance
82      * is to represent.
83      * @param metaData Meta data.
84      * @throws IOException
85      */

86     public ARCRecord(InputStream JavaDoc in, ArchiveRecordHeader metaData)
87             throws IOException JavaDoc {
88         this(in, metaData, 0, true, false, true);
89     }
90
91     /**
92      * Constructor.
93      *
94      * @param in Stream cue'd up to be at the start of the record this instance
95      * is to represent.
96      * @param metaData Meta data.
97      * @param bodyOffset Offset into the body. Usually 0.
98      * @param digest True if we're to calculate digest for this record. Not
99      * digesting saves about ~15% of cpu during an ARC parse.
100      * @param strict Be strict parsing (Parsing stops if ARC inproperly
101      * formatted).
102      * @param parseHttpHeaders True if we are to parse HTTP headers. Costs
103      * about ~20% of CPU during an ARC parse.
104      * @throws IOException
105      */

106     public ARCRecord(InputStream JavaDoc in, ArchiveRecordHeader metaData,
107         int bodyOffset, boolean digest, boolean strict,
108         final boolean parseHttpHeaders)
109     throws IOException JavaDoc {
110         super(in, metaData, bodyOffset, digest, strict);
111         if (parseHttpHeaders) {
112             this.httpHeaderStream = readHttpHeader();
113         }
114     }
115     
116     /**
117      * Skip over the the http header if one present.
118      *
119      * Subsequent reads will get the body.
120      *
121      * <p>Calling this method in the midst of reading the header
122      * will make for strange results. Otherwise, safe to call
123      * at any time though before reading any of the arc record
124      * content is only time that it makes sense.
125      *
126      * <p>After calling this method, you can call
127      * {@link #getHttpHeaders()} to get the read http header.
128      *
129      * @throws IOException
130      */

131     public void skipHttpHeader() throws IOException JavaDoc {
132         if (this.httpHeaderStream != null) {
133             // Empty the httpHeaderStream
134
for (int available = this.httpHeaderStream.available();
135                     this.httpHeaderStream != null &&
136                         (available = this.httpHeaderStream.available()) > 0;) {
137                 // We should be in this loop once only we should only do this
138
// buffer allocation once.
139
byte [] buffer = new byte[available];
140                 // The read nulls out httpHeaderStream when done with it so
141
// need check for null in the loop control line.
142
read(buffer, 0, available);
143             }
144         }
145     }
146     
147     public void dumpHttpHeader() throws IOException JavaDoc {
148         if (this.httpHeaderStream == null) {
149             return;
150         }
151         // Dump the httpHeaderStream to STDOUT
152
for (int available = this.httpHeaderStream.available();
153             this.httpHeaderStream != null
154                 && (available = this.httpHeaderStream.available()) > 0;) {
155             // We should be in this loop only once and should do this
156
// buffer allocation once.
157
byte[] buffer = new byte[available];
158             // The read nulls out httpHeaderStream when done with it so
159
// need check for null in the loop control line.
160
int read = read(buffer, 0, available);
161             System.out.write(buffer, 0, read);
162         }
163     }
164     
165     /**
166      * Read http header if present. Technique borrowed from HttpClient HttpParse
167      * class.
168      *
169      * @return ByteArrayInputStream with the http header in it or null if no
170      * http header.
171      * @throws IOException
172      */

173     private InputStream JavaDoc readHttpHeader() throws IOException JavaDoc {
174         // If judged a record that doesn't have an http header, return
175
// immediately.
176
if(!getHeader().getUrl().startsWith("http") ||
177             getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
178             return null;
179         }
180         byte [] statusBytes = HttpParser.readRawLine(getIn());
181         int eolCharCount = getEolCharsCount(statusBytes);
182         if (eolCharCount <= 0) {
183             throw new IOException JavaDoc("Failed to read http status where one " +
184                 " was expected: " + new String JavaDoc(statusBytes));
185         }
186         String JavaDoc statusLine = EncodingUtil.getString(statusBytes, 0,
187             statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
188         if ((statusLine == null) ||
189                 !StatusLine.startsWithHTTP(statusLine)) {
190             throw new IOException JavaDoc("Failed parse of http status line.");
191         }
192         this.httpStatus = new StatusLine(statusLine);
193         
194         // Save off all bytes read. Keep them as bytes rather than
195
// convert to strings so we don't have to worry about encodings
196
// though this should never be a problem doing http headers since
197
// its all supposed to be ascii.
198
ByteArrayOutputStream JavaDoc baos =
199             new ByteArrayOutputStream JavaDoc(statusBytes.length + 4 * 1024);
200         baos.write(statusBytes);
201         
202         // Now read rest of the header lines looking for the separation
203
// between header and body.
204
for (byte [] lineBytes = null; true;) {
205             lineBytes = HttpParser.readRawLine(getIn());
206             eolCharCount = getEolCharsCount(lineBytes);
207             if (eolCharCount <= 0) {
208                 throw new IOException JavaDoc("Failed reading http headers: " +
209                     ((lineBytes != null)? new String JavaDoc(lineBytes): null));
210             }
211             // Save the bytes read.
212
baos.write(lineBytes);
213             if ((lineBytes.length - eolCharCount) <= 0) {
214                 // We've finished reading the http header.
215
break;
216             }
217         }
218         
219         byte [] headerBytes = baos.toByteArray();
220         // Save off where body starts.
221
this.getMetaData().setContentBegin(headerBytes.length);
222         ByteArrayInputStream JavaDoc bais =
223             new ByteArrayInputStream JavaDoc(headerBytes);
224         if (!bais.markSupported()) {
225             throw new IOException JavaDoc("ByteArrayInputStream does not support mark");
226         }
227         bais.mark(headerBytes.length);
228         // Read the status line. Don't let it into the parseHeaders function.
229
// It doesn't know what to do with it.
230
bais.read(statusBytes, 0, statusBytes.length);
231         this.httpHeaders = HttpParser.parseHeaders(bais,
232             ARCConstants.DEFAULT_ENCODING);
233         this.getMetaData().setStatusCode(Integer.toString(getStatusCode()));
234         bais.reset();
235         return bais;
236     }
237     
238     /**
239      * Return status code for this record.
240      *
241      * This method will return -1 until the http header has been read.
242      * @return Status code.
243      */

244     public int getStatusCode() {
245         return (this.httpStatus == null)? -1: this.httpStatus.getStatusCode();
246     }
247     
248     /**
249      * @param bytes Array of bytes to examine for an EOL.
250      * @return Count of end-of-line characters or zero if none.
251      */

252     private int getEolCharsCount(byte [] bytes) {
253         int count = 0;
254         if (bytes != null && bytes.length >=1 &&
255                 bytes[bytes.length - 1] == '\n') {
256             count++;
257             if (bytes.length >=2 && bytes[bytes.length -2] == '\r') {
258                 count++;
259             }
260         }
261         return count;
262     }
263
264     /**
265      * @return Meta data for this record.
266      */

267     public ARCRecordMetaData getMetaData() {
268         return (ARCRecordMetaData)getHeader();
269     }
270     
271     /**
272      * @return http headers (Only available after header has been read).
273      */

274     public Header [] getHttpHeaders() {
275         return this.httpHeaders;
276     }
277
278     /**
279      * @return Next character in this ARCRecord's content else -1 if at end of
280      * this record.
281      * @throws IOException
282      */

283     public int read() throws IOException JavaDoc {
284         int c = -1;
285         if (this.httpHeaderStream != null &&
286                 (this.httpHeaderStream.available() > 0)) {
287             // If http header, return bytes from it before we go to underlying
288
// stream.
289
c = this.httpHeaderStream.read();
290             // If done with the header stream, null it out.
291
if (this.httpHeaderStream.available() <= 0) {
292                 this.httpHeaderStream = null;
293             }
294             incrementPosition();
295         } else {
296             c = super.read();
297         }
298         return c;
299     }
300
301     public int read(byte [] b, int offset, int length) throws IOException JavaDoc {
302         int read = -1;
303         if (this.httpHeaderStream != null &&
304                 (this.httpHeaderStream.available() > 0)) {
305             // If http header, return bytes from it before we go to underlying
306
// stream.
307
read = Math.min(length, this.httpHeaderStream.available());
308             if (read == 0) {
309                 read = -1;
310             } else {
311                 read = this.httpHeaderStream.read(b, offset, read);
312             }
313             // If done with the header stream, null it out.
314
if (this.httpHeaderStream.available() <= 0) {
315                 this.httpHeaderStream = null;
316             }
317             incrementPosition(read);
318         } else {
319             read = super.read(b, offset, length);
320         }
321         return read;
322     }
323
324     /**
325      * @return Offset at which the body begins (Only known after
326      * header has been read) or -1 if none or if we haven't read
327      * headers yet. Usually length of HTTP headers (does not include ARC
328      * metadata line length).
329      */

330     public int getBodyOffset() {
331         return this.getMetaData().getContentBegin();
332     }
333     
334     @Override JavaDoc
335     protected String JavaDoc getIp4Cdx(ArchiveRecordHeader h) {
336         String JavaDoc result = null;
337         if (h instanceof ARCRecordMetaData) {
338             result = ((ARCRecordMetaData)h).getIp();
339         }
340         return (result != null)? result: super.getIp4Cdx(h);
341     }
342     
343     @Override JavaDoc
344     protected String JavaDoc getStatusCode4Cdx(ArchiveRecordHeader h) {
345         String JavaDoc result = null;
346         if (h instanceof ARCRecordMetaData) {
347             result = ((ARCRecordMetaData) h).getStatusCode();
348         }
349         return (result != null) ? result: super.getStatusCode4Cdx(h);
350     }
351     
352     @Override JavaDoc
353     protected String JavaDoc getDigest4Cdx(ArchiveRecordHeader h) {
354         String JavaDoc result = null;
355         if (h instanceof ARCRecordMetaData) {
356             result = ((ARCRecordMetaData) h).getDigest();
357         }
358         return (result != null) ? result: super.getDigest4Cdx(h);
359     }
360 }
Popular Tags