KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > warc > WARCConstants


/*
 * WARCConstants
 *
 * $Id: WARCConstants.java,v 1.16 2006/09/02 02:44:09 stack-sf Exp $
 *
 * Created on July 27th, 2006
 *
 * Copyright (C) 2006 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.io.warc;

import java.util.Arrays;
import java.util.List;

import org.archive.io.ArchiveFileConstants;
32
33 /**
34  * WARC Constants used by readers and writers.
35  * @author stack
36  * @version $Revision: 1.16 $ $Date: 2006/09/02 02:44:09 $
37  */

38 public interface WARCConstants extends ArchiveFileConstants {
39     /**
40      * Default maximum WARC file size.
41      * 1Gig.
42      */

43     public static final int DEFAULT_MAX_WARC_FILE_SIZE = 1024 * 1024 * 1024;
44     
45     /**
46      * WARC MAGIC
47      * WARC files and records begin with this sequence.
48      */

49     public static final String JavaDoc WARC_MAGIC = "WARC/";
50     
51     /**
52      * Hard-coded version for WARC files made with this code.
53      * Setting to 0.10 because differs from 0.9 spec. See accompanying
54      * package documentation.
55      */

56     public static final String JavaDoc WARC_VERSION = "0.10";
57     
58     /**
59      * Assumed maximum size of a Header Line.
60      *
61      * This 100k which seems massive but its the same as the LINE_LENGTH from
62      * <code>alexa/include/a_arcio.h</code>:
63      * <pre>
64      * #define LINE_LENGTH (100*1024)
65      * </pre>
66      */

67     public static final int MAX_WARC_HEADER_LINE_LENGTH = 1024 * 100;
68     public static final int MAX_LINE_LENGTH = MAX_WARC_HEADER_LINE_LENGTH;
69     
70     /**
71      * WARC file extention.
72      */

73     public static final String JavaDoc WARC_FILE_EXTENSION = "warc";
74     
75     /**
76      * Dot WARC file extension.
77      */

78     public static final String JavaDoc DOT_WARC_FILE_EXTENSION =
79         "." + WARC_FILE_EXTENSION;
80     
81     public static final String JavaDoc DOT_COMPRESSED_FILE_EXTENSION =
82         ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION;
83
84     /**
85      * Compressed WARC file extension.
86      */

87     public static final String JavaDoc COMPRESSED_WARC_FILE_EXTENSION =
88         WARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION;
89     
90     /**
91      * Compressed dot WARC file extension.
92      */

93     public static final String JavaDoc DOT_COMPRESSED_WARC_FILE_EXTENSION =
94         DOT_WARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION;
95     
96     /**
97      * Encoding to use getting bytes from strings.
98      *
99      * Specify an encoding rather than leave it to chance: i.e whatever the
100      * JVMs encoding. Use an encoding that gets the stream as bytes, not chars.
101      *
102      * <p>TODO: ARC uses ISO-8859-1. In general, we should use UTF-8 but we
103      * probably need a single byte encoding if we're out for preserving the
104      * binary data as received over the net (We probably don't want to transform
105      * the supra-ASCII characters to UTF-8 before storing in ARC). For now,
106      * till we figure it, DEFAULT_ENCODING is single-byte charset -- same as
107      * ARCs.
108      */

109     public static final String JavaDoc DEFAULT_ENCODING = "ISO-8859-1";
110     public static final String JavaDoc HEADER_LINE_ENCODING = DEFAULT_ENCODING;
111     
112     public static final String JavaDoc [] HEADER_FIELD_KEYS = {
113         VERSION_FIELD_KEY,
114         LENGTH_FIELD_KEY,
115         TYPE_FIELD_KEY,
116         URL_FIELD_KEY,
117         DATE_FIELD_KEY,
118         RECORD_IDENTIFIER_FIELD_KEY,
119         MIMETYPE_FIELD_KEY
120     };
121     
122     /**
123      * WARC Record Types.
124      */

125     public static final String JavaDoc WARCINFO = "warcinfo";
126     public static final String JavaDoc RESPONSE = "response";
127     public static final String JavaDoc RESOURCE = "resource";
128     public static final String JavaDoc REQUEST = "request";
129     public static final String JavaDoc METADATA = "metadata";
130     public static final String JavaDoc REVISIT = "revist";
131     public static final String JavaDoc CONVERSION = "conversion";
132     public static final String JavaDoc CONTINUATION = "continuation";
133     
134     public static final String JavaDoc TYPE = "type";
135     
136     // List of all WARC Record TYPES
137
public static final String JavaDoc [] TYPES = {WARCINFO, RESPONSE, RESOURCE,
138         REQUEST, METADATA, REVISIT, CONVERSION, CONTINUATION};
139     
140     // Indices into TYPES array.
141
public static final int WARCINFO_INDEX = 0;
142     public static final int RESPONSE_INDEX = 1;
143     public static final int RESOURCE_INDEX = 2;
144     public static final int REQUEST_INDEX = 3;
145     public static final int METADATA_INDEX = 4;
146     public static final int REVISIT_INDEX = 5;
147     public static final int CONVERSION_INDEX = 6;
148     public static final int CONTINUATION_INDEX = 7;
149     
150     // TYPES as List.
151
public static final List JavaDoc TYPES_LIST = Arrays.asList(TYPES);
152     
153     /**
154      * WARC-ID
155      */

156     public static final String JavaDoc WARC_ID = WARC_MAGIC + WARC_VERSION;
157         
158     /**
159      * Header field seperator character.
160      */

161     public static final char HEADER_FIELD_SEPARATOR = ' ';
162     
163     /**
164      * WSP
165      * One of a space or horizontal tab character.
166      * TODO: WSP undefined. Fix.
167      */

168     public static final Character JavaDoc [] WSP = {HEADER_FIELD_SEPARATOR, '\t'};
169
170     /**
171      * Placeholder for length in Header line.
172      * Placeholder is same size as the fixed field size allocated for length,
173      * 12 characters. 12 characters allows records of size almost 1TB.
174      */

175     public static final String JavaDoc PLACEHOLDER_RECORD_LENGTH_STRING =
176         "000000000000";
177     
178     public static final String JavaDoc NAMED_FIELD_IP_LABEL = "IP-Address";
179     public static final String JavaDoc NAMED_FIELD_CHECKSUM_LABEL = "Checksum";
180     public static final String JavaDoc NAMED_FIELD_RELATED_LABEL = "Related-Record-ID";
181     public static final String JavaDoc NAMED_FIELD_WARCFILENAME = "Filename";
182     public static final String JavaDoc NAMED_FIELD_DESCRIPTION = "Description";
183     public static final String JavaDoc NAMED_FIELD_FILEDESC = "ARC-FileDesc";
184     public static final String JavaDoc NAMED_FIELD_TRUNCATED = "Truncated";
185     public static final String JavaDoc NAMED_FIELD_TRUNCATED_VALUE_TIME = "time";
186     public static final String JavaDoc NAMED_FIELD_TRUNCATED_VALUE_LEN = "length";
187     public static final String JavaDoc NAMED_FIELD_TRUNCATED_VALUE_HEAD =
188         "long-headers";
189     public static final String JavaDoc NAMED_FIELD_TRUNCATED_VALUE_UNSPECIFIED = null;
190     
191     /**
192      * To be safe, lets use application type rather than message. Regards
193      * 'message/http', RFC says "...provided that it obeys the MIME restrictions
194      * for all 'message' types regarding line length and encodings." This
195      * usually means lines of 1000 octets max (unless a
196      * 'Content-Transfer-Encoding: binary' mime header is present).
197      * @see <a HREF="http://www.w3.org/Protocols/rfc2616/rfc2616-sec19.html#sec19.1">rfc2616 section 19.1</a>
198      */

199     public static final String JavaDoc HTTP_REQUEST_MIMETYPE =
200         "application/http;msgtype=request";
201     public static final String JavaDoc HTTP_RESPONSE_MIMETYPE =
202         "application/http;msgtype=response";
203 }
204
Popular Tags