ARCRecord


1   /* ARCRecord
2    *
3    * $Id: ARCRecord.java,v 1.34.2.1 2007/01/13 01:31:36 stack-sf Exp $
4    *
5    * Created on Jan 7, 2004
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.io.arc;
26  
27  import java.io.ByteArrayInputStream  ;
28  import java.io.ByteArrayOutputStream  ;
29  import java.io.IOException  ;
30  import java.io.InputStream  ;
31  
32  import org.apache.commons.httpclient.Header;
33  import org.apache.commons.httpclient.HttpParser;
34  import org.apache.commons.httpclient.StatusLine;
35  import org.apache.commons.httpclient.util.EncodingUtil;
36  import org.archive.io.ArchiveRecord;
37  import org.archive.io.ArchiveRecordHeader;
38  
39  
40  /**
41   * An ARC file record.
42   * Does not compass the ARCRecord metadata line, just the record content.
43   * @author stack
44   */
45  public class ARCRecord extends ArchiveRecord implements ARCConstants {
46      /**
47       * Http status line object.
48       * 
49       * May be null if record is not http.
50       */
51      private StatusLine httpStatus = null;
52  
53      /**
54       * Http header bytes.
55       * 
56       * If non-null and bytes available, give out its contents before we
57       * go back to the underlying stream.
58       */
59      private InputStream   httpHeaderStream = null;
60      
61      /**
62       * Http headers.
63       * 
64       * Only populated after reading of headers.
65       */
66      private Header [] httpHeaders = null;
67  
68      
69      /**
70       * Minimal http header length.
71       * 
72       * I've seen in arcs content length of 1 with no 
73       * header.
74       */
75      private static final long MIN_HTTP_HEADER_LENGTH =
76          "HTTP/1.1 200 OK\r\n".length();
77      
78      /**
79       * Constructor.
80       *
81       * @param in Stream cue'd up to be at the start of the record this instance
82       * is to represent.
83       * @param metaData Meta data.
84       * @throws IOException
85       */
86      public ARCRecord(InputStream   in, ArchiveRecordHeader metaData)
87              throws IOException   {
88          this(in, metaData, 0, true, false, true);
89      }
90  
91      /**
92       * Constructor.
93       *
94       * @param in Stream cue'd up to be at the start of the record this instance
95       * is to represent.
96       * @param metaData Meta data.
97       * @param bodyOffset Offset into the body.  Usually 0.
98       * @param digest True if we're to calculate digest for this record.  Not
99       * digesting saves about ~15% of cpu during an ARC parse.
100      * @param strict Be strict parsing (Parsing stops if ARC inproperly
101      * formatted).
102      * @param parseHttpHeaders True if we are to parse HTTP headers.  Costs
103      * about ~20% of CPU during an ARC parse.
104      * @throws IOException
105      */
106     public ARCRecord(InputStream   in, ArchiveRecordHeader metaData,
107         int bodyOffset, boolean digest, boolean strict,
108         final boolean parseHttpHeaders) 
109     throws IOException   {
110         super(in, metaData, bodyOffset, digest, strict);
111         if (parseHttpHeaders) {
112             this.httpHeaderStream = readHttpHeader();
113         }
114     }
115     
116     /**
117      * Skip over the the http header if one present.
118      * 
119      * Subsequent reads will get the body.
120      * 
121      * <p>Calling this method in the midst of reading the header
122      * will make for strange results.  Otherwise, safe to call
123      * at any time though before reading any of the arc record
124      * content is only time that it makes sense.
125      * 
126      * <p>After calling this method, you can call
127      * {@link #getHttpHeaders()} to get the read http header.
128      * 
129      * @throws IOException
130      */
131     public void skipHttpHeader() throws IOException   {
132         if (this.httpHeaderStream != null) {
133             // Empty the httpHeaderStream
134             for (int available = this.httpHeaderStream.available();
135                     this.httpHeaderStream != null &&
136                         (available = this.httpHeaderStream.available()) > 0;) {
137                 // We should be in this loop once only we should only do this
138                 // buffer allocation once.
139                 byte [] buffer = new byte[available];
140                 // The read nulls out httpHeaderStream when done with it so
141                 // need check for null in the loop control line.
142                 read(buffer, 0, available);
143             }
144         }
145     }
146     
147     public void dumpHttpHeader() throws IOException   {
148         if (this.httpHeaderStream == null) {
149             return;
150         }
151         // Dump the httpHeaderStream to STDOUT
152         for (int available = this.httpHeaderStream.available();
153             this.httpHeaderStream != null
154                 && (available = this.httpHeaderStream.available()) > 0;) {
155             // We should be in this loop only once and should do this
156             // buffer allocation once.
157             byte[] buffer = new byte[available];
158             // The read nulls out httpHeaderStream when done with it so
159             // need check for null in the loop control line.
160             int read = read(buffer, 0, available);
161             System.out.write(buffer, 0, read);
162         }
163     }
164     
165     /**
166      * Read http header if present. Technique borrowed from HttpClient HttpParse
167      * class.
168      * 
169      * @return ByteArrayInputStream with the http header in it or null if no
170      *         http header.
171      * @throws IOException
172      */
173     private InputStream   readHttpHeader() throws IOException   {
174         // If judged a record that doesn't have an http header, return
175         // immediately.
176         if(!getHeader().getUrl().startsWith("http") ||
177             getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
178             return null;
179         }
180         byte [] statusBytes = HttpParser.readRawLine(getIn());
181         int eolCharCount = getEolCharsCount(statusBytes);
182         if (eolCharCount <= 0) {
183             throw new IOException  ("Failed to read http status where one " +
184                 " was expected: " + new String  (statusBytes));
185         }
186         String   statusLine = EncodingUtil.getString(statusBytes, 0,
187             statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
188         if ((statusLine == null) ||
189                 !StatusLine.startsWithHTTP(statusLine)) {
190             throw new IOException  ("Failed parse of http status line.");
191         }
192         this.httpStatus = new StatusLine(statusLine);
193         
194         // Save off all bytes read.  Keep them as bytes rather than
195         // convert to strings so we don't have to worry about encodings
196         // though this should never be a problem doing http headers since
197         // its all supposed to be ascii.
198         ByteArrayOutputStream   baos =
199             new ByteArrayOutputStream  (statusBytes.length + 4 * 1024);
200         baos.write(statusBytes);
201         
202         // Now read rest of the header lines looking for the separation
203         // between header and body.
204         for (byte [] lineBytes = null; true;) {
205             lineBytes = HttpParser.readRawLine(getIn());
206             eolCharCount = getEolCharsCount(lineBytes);
207             if (eolCharCount <= 0) {
208                 throw new IOException  ("Failed reading http headers: " +
209                     ((lineBytes != null)? new String  (lineBytes): null));
210             }
211             // Save the bytes read.
212             baos.write(lineBytes);
213             if ((lineBytes.length - eolCharCount) <= 0) {
214                 // We've finished reading the http header.
215                 break;
216             }
217         }
218         
219         byte [] headerBytes = baos.toByteArray();
220         // Save off where body starts.
221         this.getMetaData().setContentBegin(headerBytes.length);
222         ByteArrayInputStream   bais =
223             new ByteArrayInputStream  (headerBytes);
224         if (!bais.markSupported()) {
225             throw new IOException  ("ByteArrayInputStream does not support mark");
226         }
227         bais.mark(headerBytes.length);
228         // Read the status line.  Don't let it into the parseHeaders function.
229         // It doesn't know what to do with it.
230         bais.read(statusBytes, 0, statusBytes.length);
231         this.httpHeaders = HttpParser.parseHeaders(bais,
232             ARCConstants.DEFAULT_ENCODING);
233         this.getMetaData().setStatusCode(Integer.toString(getStatusCode()));
234         bais.reset();
235         return bais;
236     }
237     
238     /**
239      * Return status code for this record.
240      * 
241      * This method will return -1 until the http header has been read.
242      * @return Status code.
243      */
244     public int getStatusCode() {
245         return (this.httpStatus == null)? -1: this.httpStatus.getStatusCode();
246     }
247     
248     /**
249      * @param bytes Array of bytes to examine for an EOL.
250      * @return Count of end-of-line characters or zero if none.
251      */
252     private int getEolCharsCount(byte [] bytes) {
253         int count = 0;
254         if (bytes != null && bytes.length >=1 &&
255                 bytes[bytes.length - 1] == '\n') {
256             count++;
257             if (bytes.length >=2 && bytes[bytes.length -2] == '\r') {
258                 count++;
259             }
260         }
261         return count;
262     }
263 
264     /**
265      * @return Meta data for this record.
266      */
267     public ARCRecordMetaData getMetaData() {
268         return (ARCRecordMetaData)getHeader();
269     }
270     
271     /**
272      * @return http headers (Only available after header has been read).
273      */
274     public Header [] getHttpHeaders() {
275         return this.httpHeaders;
276     }
277 
278     /**
279      * @return Next character in this ARCRecord's content else -1 if at end of
280      * this record.
281      * @throws IOException
282      */
283     public int read() throws IOException   {
284         int c = -1;
285         if (this.httpHeaderStream != null &&
286                 (this.httpHeaderStream.available() > 0)) {
287             // If http header, return bytes from it before we go to underlying
288             // stream.
289             c = this.httpHeaderStream.read();
290             // If done with the header stream, null it out.
291             if (this.httpHeaderStream.available() <= 0) {
292                 this.httpHeaderStream = null;
293             }
294             incrementPosition();
295         } else {
296             c = super.read();
297         }
298         return c;
299     }
300 
301     public int read(byte [] b, int offset, int length) throws IOException   {
302         int read = -1;
303         if (this.httpHeaderStream != null &&
304                 (this.httpHeaderStream.available() > 0)) {
305             // If http header, return bytes from it before we go to underlying
306             // stream.
307             read = Math.min(length, this.httpHeaderStream.available());
308             if (read == 0) {
309                 read = -1;
310             } else {
311                 read = this.httpHeaderStream.read(b, offset, read);
312             }
313             // If done with the header stream, null it out.
314             if (this.httpHeaderStream.available() <= 0) {
315                 this.httpHeaderStream = null;
316             }
317             incrementPosition(read);
318         } else {
319             read = super.read(b, offset, length);
320         }
321         return read;
322     }
323 
324     /**
325      * @return Offset at which the body begins (Only known after
326      * header has been read) or -1 if none or if we haven't read
327      * headers yet.  Usually length of HTTP headers (does not include ARC
328      * metadata line length).
329      */
330     public int getBodyOffset() {
331         return this.getMetaData().getContentBegin();
332     }
333     
334     @Override  
335     protected String   getIp4Cdx(ArchiveRecordHeader h) {
336         String   result = null;
337         if (h instanceof ARCRecordMetaData) {
338             result = ((ARCRecordMetaData)h).getIp();
339         }
340         return (result != null)? result: super.getIp4Cdx(h);
341     }
342     
343     @Override  
344     protected String   getStatusCode4Cdx(ArchiveRecordHeader h) {
345         String   result = null;
346         if (h instanceof ARCRecordMetaData) {
347             result = ((ARCRecordMetaData) h).getStatusCode();
348         }
349         return (result != null) ? result: super.getStatusCode4Cdx(h);
350     }
351     
352     @Override  
353     protected String   getDigest4Cdx(ArchiveRecordHeader h) {
354         String   result = null;
355         if (h instanceof ARCRecordMetaData) {
356             result = ((ARCRecordMetaData) h).getDigest();
357         }
358         return (result != null) ? result: super.getDigest4Cdx(h);
359     }
360 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags