KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > warc > WARCReader


1 /* $Id: WARCReader.java,v 1.5.2.1 2007/01/13 01:31:37 stack-sf Exp $
2  *
3  * Created Aug 23, 2006
4  *
5  * Copyright (C) 2006 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.io.warc;
24
25 import java.io.File JavaDoc;
26 import java.io.IOException JavaDoc;
27 import java.io.InputStream JavaDoc;
28 import java.util.Iterator JavaDoc;
29 import java.util.List JavaDoc;
30
31 import org.apache.commons.cli.CommandLine;
32 import org.apache.commons.cli.HelpFormatter;
33 import org.apache.commons.cli.Option;
34 import org.apache.commons.cli.Options;
35 import org.apache.commons.cli.ParseException;
36 import org.apache.commons.cli.PosixParser;
37 import org.apache.commons.lang.NotImplementedException;
38 import org.archive.io.ArchiveReader;
39 import org.archive.io.ArchiveRecord;
40
41 /**
42  * WARCReader.
43  * Go via {@link WARCReaderFactory} to get instance.
44  * @author stack
45  * @version $Date: 2007/01/13 01:31:37 $ $Version$
46  */

47 public class WARCReader extends ArchiveReader implements WARCConstants {
48     WARCReader() {
49         super();
50     }
51     
52     @Override JavaDoc
53     protected void initialize(String JavaDoc i) {
54         super.initialize(i);
55         setVersion(WARC_VERSION);
56     }
57     
58     /**
59      * Skip over any trailing new lines at end of the record so we're lined up
60      * ready to read the next.
61      * @param record
62      * @throws IOException
63      */

64     protected void gotoEOR(ArchiveRecord record) throws IOException JavaDoc {
65         if (record.available() != 0) {
66             throw new IOException JavaDoc("Record should be exhausted before coming " +
67                 "in here");
68         }
69
70         // Records end in 2*CRLF. Such it up.
71
readExpectedChar(getIn(), CRLF.charAt(0));
72         readExpectedChar(getIn(), CRLF.charAt(1));
73         readExpectedChar(getIn(), CRLF.charAt(0));
74         readExpectedChar(getIn(), CRLF.charAt(1));
75     }
76     
77     protected void readExpectedChar(final InputStream JavaDoc is, final int expected)
78     throws IOException JavaDoc {
79         int c = is.read();
80         if (c != expected) {
81             throw new IOException JavaDoc("Unexpected character " +
82                 Integer.toHexString(c) + "(Expecting " +
83                 Integer.toHexString(expected) + ")");
84         }
85     }
86     
87     /**
88      * Create new WARC record.
89      * Encapsulate housekeeping that has to do w/ creating new Record.
90      * @param is InputStream to use.
91      * @param offset Absolute offset into WARC file.
92      * @return A WARCRecord.
93      * @throws IOException
94      */

95     protected WARCRecord createArchiveRecord(InputStream JavaDoc is, long offset)
96     throws IOException JavaDoc {
97         return (WARCRecord)currentRecord(new WARCRecord(is,
98             getReaderIdentifier(), offset, isDigest(), isStrict()));
99     }
100     
101     @Override JavaDoc
102     public void dump(boolean compress)
103     throws IOException JavaDoc, java.text.ParseException JavaDoc {
104         for (final Iterator JavaDoc<ArchiveRecord> i = iterator(); i.hasNext();) {
105             ArchiveRecord r = i.next();
106             System.out.println(r.getHeader().toString());
107             r.dump();
108             System.out.println();
109         }
110     }
111     
112
113     @Override JavaDoc
114     public ArchiveReader getDeleteFileOnCloseReader(final File JavaDoc f) {
115         throw new NotImplementedException("TODO");
116     }
117
118     @Override JavaDoc
119     public String JavaDoc getDotFileExtension() {
120         return DOT_WARC_FILE_EXTENSION;
121     }
122
123     @Override JavaDoc
124     public String JavaDoc getFileExtension() {
125         return WARC_FILE_EXTENSION;
126     }
127     
128     // Static methods follow. Mostly for command-line processing.
129

130     /**
131      *
132      * @param formatter Help formatter instance.
133      * @param options Usage options.
134      * @param exitCode Exit code.
135      */

136     private static void usage(HelpFormatter formatter, Options options,
137             int exitCode) {
138         formatter.printHelp("java org.archive.io.arc.WARCReader" +
139             " [--digest=true|false] \\\n" +
140             " [--format=cdx|cdxfile|dump|gzipdump]" +
141             " [--offset=#] \\\n[--strict] [--parse] WARC_FILE|WARC_URL",
142                 options);
143         System.exit(exitCode);
144     }
145
146     /**
147      * Write out the arcfile.
148      *
149      * @param reader
150      * @param format Format to use outputting.
151      * @throws IOException
152      * @throws java.text.ParseException
153      */

154     protected static void output(WARCReader reader, String JavaDoc format)
155     throws IOException JavaDoc, java.text.ParseException JavaDoc {
156         if (!reader.output(format)) {
157             throw new IOException JavaDoc("Unsupported format: " + format);
158         }
159     }
160     
161     /**
162      * Output passed record using passed format specifier.
163      * @param r ARCReader instance to output.
164      * @param format What format to use outputting.
165      * @throws IOException
166      */

167     protected static void outputRecord(final WARCReader r,
168         final String JavaDoc format)
169     throws IOException JavaDoc {
170         if (!r.outputRecord(format)) {
171             throw new IOException JavaDoc("Unsupported format" +
172                 " (or unsupported on a single record): " + format);
173         }
174     }
175
176     /**
177      * Generate a CDX index file for an ARC file.
178      *
179      * @param urlOrPath The ARC file to generate a CDX index for
180      * @throws IOException
181      * @throws java.text.ParseException
182      */

183     public static void createCDXIndexFile(String JavaDoc urlOrPath)
184     throws IOException JavaDoc, java.text.ParseException JavaDoc {
185         WARCReader r = WARCReaderFactory.get(urlOrPath);
186         r.setStrict(false);
187         r.setDigest(true);
188         output(r, CDX_FILE);
189     }
190
191     /**
192      * Command-line interface to WARCReader.
193      *
194      * Here is the command-line interface:
195      * <pre>
196      * usage: java org.archive.io.arc.WARCReader [--offset=#] ARCFILE
197      * -h,--help Prints this message and exits.
198      * -o,--offset Outputs record at this offset into arc file.</pre>
199      *
200      * <p>Outputs using a pseudo-CDX format as described here:
201      * <a HREF="http://www.archive.org/web/researcher/cdx_legend.php">CDX
202      * Legent</a> and here
203      * <a HREF="http://www.archive.org/web/researcher/example_cdx.php">Example</a>.
204      * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
205      * Hash is hard-coded straight SHA-1 hash of content.
206      *
207      * @param args Command-line arguments.
208      * @throws ParseException Failed parse of the command line.
209      * @throws IOException
210      * @throws java.text.ParseException
211      */

212     public static void main(String JavaDoc [] args)
213     throws ParseException, IOException JavaDoc, java.text.ParseException JavaDoc {
214         Options options = new Options();
215         options.addOption(new Option("h","help", false,
216             "Prints this message and exits."));
217         options.addOption(new Option("o","offset", true,
218             "Outputs record at this offset into arc file."));
219         options.addOption(new Option("d","digest", true,
220             "Pass true|false. Expensive. Default: true (SHA-1)."));
221         options.addOption(new Option("s","strict", false,
222             "Strict mode. Fails parse if incorrectly formatted WARC."));
223         options.addOption(new Option("f","format", true,
224             "Output options: 'cdx', cdxfile', 'dump', 'gzipdump'," +
225             "'or 'nohead'. Default: 'cdx'."));
226         PosixParser parser = new PosixParser();
227         CommandLine cmdline = parser.parse(options, args, false);
228         List JavaDoc cmdlineArgs = cmdline.getArgList();
229         Option [] cmdlineOptions = cmdline.getOptions();
230         HelpFormatter formatter = new HelpFormatter();
231
232         // If no args, print help.
233
if (cmdlineArgs.size() <= 0) {
234             usage(formatter, options, 0);
235         }
236
237         // Now look at options passed.
238
long offset = -1;
239         boolean digest = false;
240         boolean strict = false;
241         String JavaDoc format = CDX;
242         for (int i = 0; i < cmdlineOptions.length; i++) {
243             switch(cmdlineOptions[i].getId()) {
244                 case 'h':
245                     usage(formatter, options, 0);
246                     break;
247
248                 case 'o':
249                     offset =
250                         Long.parseLong(cmdlineOptions[i].getValue());
251                     break;
252                     
253                 case 's':
254                     strict = true;
255                     break;
256                     
257                 case 'd':
258                     digest = getTrueOrFalse(cmdlineOptions[i].getValue());
259                     break;
260                     
261                 case 'f':
262                     format = cmdlineOptions[i].getValue().toLowerCase();
263                     boolean match = false;
264                     // List of supported formats.
265
final String JavaDoc [] supportedFormats =
266                         {CDX, DUMP, GZIP_DUMP, CDX_FILE};
267                     for (int ii = 0; ii < supportedFormats.length; ii++) {
268                         if (supportedFormats[ii].equals(format)) {
269                             match = true;
270                             break;
271                         }
272                     }
273                     if (!match) {
274                         usage(formatter, options, 1);
275                     }
276                     break;
277
278                 default:
279                     throw new RuntimeException JavaDoc("Unexpected option: " +
280                         + cmdlineOptions[i].getId());
281             }
282         }
283         
284         if (offset >= 0) {
285             if (cmdlineArgs.size() != 1) {
286                 System.out.println("Error: Pass one arcfile only.");
287                 usage(formatter, options, 1);
288             }
289             WARCReader r = WARCReaderFactory.get(
290                 new File JavaDoc((String JavaDoc)cmdlineArgs.get(0)), offset);
291             r.setStrict(strict);
292             outputRecord(r, format);
293         } else {
294             for (Iterator JavaDoc i = cmdlineArgs.iterator(); i.hasNext();) {
295                 String JavaDoc urlOrPath = (String JavaDoc)i.next();
296                 try {
297                     WARCReader r = WARCReaderFactory.get(urlOrPath);
298                     r.setStrict(strict);
299                     r.setDigest(digest);
300                     output(r, format);
301                 } catch (RuntimeException JavaDoc e) {
302                     // Write out name of file we failed on to help with
303
// debugging. Then print stack trace and try to keep
304
// going. We do this for case where we're being fed
305
// a bunch of ARCs; just note the bad one and move
306
// on to the next.
307
System.err.println("Exception processing " + urlOrPath +
308                         ": " + e.getMessage());
309                     e.printStackTrace(System.err);
310                     System.exit(1);
311                 }
312             }
313         }
314     }
315 }
Popular Tags