KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > arc > ARCReader


1 /* $Id: ARCReader.java,v 1.72.2.1 2007/01/13 01:31:35 stack-sf Exp $
2  *
3  * Created on May 1, 2004
4  *
5  * Copyright (C) 2004 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.io.arc;
24
25 import java.io.ByteArrayOutputStream JavaDoc;
26 import java.io.File JavaDoc;
27 import java.io.IOException JavaDoc;
28 import java.io.InputStream JavaDoc;
29 import java.util.ArrayList JavaDoc;
30 import java.util.Arrays JavaDoc;
31 import java.util.HashMap JavaDoc;
32 import java.util.Iterator JavaDoc;
33 import java.util.List JavaDoc;
34 import java.util.Map JavaDoc;
35 import java.util.concurrent.atomic.AtomicInteger JavaDoc;
36 import java.util.logging.Level JavaDoc;
37 import java.util.logging.Logger JavaDoc;
38 import java.util.regex.Matcher JavaDoc;
39
40 import org.apache.commons.cli.CommandLine;
41 import org.apache.commons.cli.HelpFormatter;
42 import org.apache.commons.cli.Option;
43 import org.apache.commons.cli.Options;
44 import org.apache.commons.cli.ParseException;
45 import org.apache.commons.cli.PosixParser;
46 import org.archive.io.ArchiveReader;
47 import org.archive.io.ArchiveRecord;
48 import org.archive.io.ArchiveRecordHeader;
49 import org.archive.io.RecoverableIOException;
50 import org.archive.io.WriterPoolMember;
51 import org.archive.util.ArchiveUtils;
52 import org.archive.util.InetAddressUtil;
53 import org.archive.util.TextUtils;
54
55
56 /**
57  * Get an iterator on an ARC file or get a record by absolute position.
58  *
59  * ARC files are described here:
60  * <a HREF="http://www.archive.org/web/researcher/ArcFileFormat.php">Arc
61  * File Format</a>.
62  *
63  * <p>This class knows how to parse an ARC file. Pass it a file path
64  * or an URL to an ARC. It can parse ARC Version 1 and 2.
65  *
66  * <p>Iterator returns <code>ARCRecord</code>
67  * though {@link Iterator#next()} is returning
68  * java.lang.Object. Cast the return.
69  *
70  * <p>Profiling java.io vs. memory-mapped ByteBufferInputStream shows the
71  * latter slightly slower -- but not by much. TODO: Test more. Just
72  * change {@link #getInputStream(File, long)}.
73  *
74  * @author stack
75  * @version $Date: 2007/01/13 01:31:35 $ $Revision: 1.72.2.1 $
76  */

77 public abstract class ARCReader extends ArchiveReader
78 implements ARCConstants {
79     Logger JavaDoc logger = Logger.getLogger(ARCReader.class.getName());
80     
81     /**
82      * Set to true if we are aligned on first record of Archive file.
83      * We used depend on offset. If offset was zero, then we were
84      * aligned on first record. This is no longer necessarily the case when
85      * Reader is created at an offset into an Archive file: The offset is zero
86      * but its relative to where we started reading.
87      */

88     private boolean alignedOnFirstRecord = true;
89     
90     /**
91      * Assumed maximum size of a record meta header line.
92      *
93      * This 100k which seems massive but its the same as the LINE_LENGTH from
94      * <code>alexa/include/a_arcio.h</code>:
95      * <pre>
96      * #define LINE_LENGTH (100*1024)
97      * </pre>
98      */

99     private static final int MAX_HEADER_LINE_LENGTH = 1024 * 100;
100
101     /**
102      * Array of field names.
103      *
104      * Used to initialize <code>headerFieldNameKeys</code>.
105      */

106     private final String JavaDoc [] headerFieldNameKeysArray = {
107         URL_FIELD_KEY,
108         IP_HEADER_FIELD_KEY,
109         DATE_FIELD_KEY,
110         MIMETYPE_FIELD_KEY,
111         LENGTH_FIELD_KEY
112     };
113     
114     /**
115      * An array of the header field names found in the ARC file header on
116      * the 3rd line.
117      *
118      * We used to read these in from the arc file first record 3rd line but
119      * now we hardcode them for sake of improved performance.
120      */

121     private final List JavaDoc<String JavaDoc> headerFieldNameKeys =
122         Arrays.asList(this.headerFieldNameKeysArray);
123     
124     private boolean parseHttpHeaders = true;
125     
126     ARCReader() {
127         super();
128     }
129     
130     /**
131      * Skip over any trailing new lines at end of the record so we're lined up
132      * ready to read the next.
133      * @param record
134      * @throws IOException
135      */

136     protected void gotoEOR(ArchiveRecord record) throws IOException JavaDoc {
137         if (getIn().available() <= 0) {
138             return;
139         }
140         
141         // Remove any trailing LINE_SEPARATOR
142
int c = -1;
143         while (getIn().available() > 0) {
144             if (getIn().markSupported()) {
145                 getIn().mark(1);
146             }
147             c = getIn().read();
148             if (c != -1) {
149                 if (c == LINE_SEPARATOR) {
150                     continue;
151                 }
152                 if (getIn().markSupported()) {
153                     // We've overread. We're probably in next record. There is
154
// no way of telling for sure. It may be dross at end of
155
// current record. Backup.
156
getIn().reset();
157                     break;
158                 }
159                 ArchiveRecordHeader h = (getCurrentRecord() != null)?
160                     record.getHeader(): null;
161                 throw new IOException JavaDoc("Read " + (char)c +
162                     " when only " + LINE_SEPARATOR + " expected. " +
163                     getReaderIdentifier() + ((h != null)?
164                         h.getHeaderFields().toString(): ""));
165             }
166         }
167     }
168     
169     /**
170      * Create new arc record.
171      *
172      * Encapsulate housekeeping that has to do w/ creating a new record.
173      *
174      * <p>Call this method at end of constructor to read in the
175      * arcfile header. Will be problems reading subsequent arc records
176      * if you don't since arcfile header has the list of metadata fields for
177      * all records that follow.
178      *
179      * <p>When parsing through ARCs writing out CDX info, we spend about
180      * 38% of CPU in here -- about 30% of which is in getTokenizedHeaderLine
181      * -- of which 16% is reading.
182      *
183      * @param is InputStream to use.
184      * @param offset Absolute offset into arc file.
185      * @return An arc record.
186      * @throws IOException
187      */

188     protected ARCRecord createArchiveRecord(InputStream JavaDoc is, long offset)
189     throws IOException JavaDoc {
190         ArrayList JavaDoc<String JavaDoc> firstLineValues = new ArrayList JavaDoc<String JavaDoc>(20);
191         getTokenizedHeaderLine(is, firstLineValues);
192         int bodyOffset = 0;
193         if (offset == 0 && isAlignedOnFirstRecord()) {
194             // If offset is zero and we were aligned at first record on
195
// creation (See #alignedOnFirstRecord for more on this), then no
196
// records have been read yet and we're reading our first one, the
197
// record of ARC file meta info. Its special. In ARC versions
198
// 1.x, first record has three lines of meta info. We've just read
199
// the first line. There are two more. The second line has misc.
200
// info. We're only interested in the first field, the version
201
// number. The third line is the list of field names. Here's what
202
// ARC file version 1.x meta content looks like:
203
//
204
// filedesc://testIsBoundary-JunitIAH200401070157520.arc 0.0.0.0 \\
205
// 20040107015752 text/plain 77
206
// 1 0 InternetArchive
207
// URL IP-address Archive-date Content-type Archive-length
208
//
209
ArrayList JavaDoc<String JavaDoc> secondLineValues = new ArrayList JavaDoc<String JavaDoc>(20);
210             bodyOffset += getTokenizedHeaderLine(is, secondLineValues);
211             setVersion((String JavaDoc)secondLineValues.get(0) +
212                 "." + (String JavaDoc)secondLineValues.get(1));
213             // Just read over the 3rd line. We used to parse it and use
214
// values found here but now we just hardcode them to avoid
215
// having to read this 3rd line even for random arc file accesses.
216
bodyOffset += getTokenizedHeaderLine(is, null);
217         }
218
219         try {
220             currentRecord(new ARCRecord(is,
221                 (ArchiveRecordHeader)computeMetaData(this.headerFieldNameKeys,
222                     firstLineValues,
223                     getVersion(), offset), bodyOffset, isDigest(),
224                     isStrict(), isParseHttpHeaders()));
225         } catch (IOException JavaDoc e) {
226             IOException JavaDoc newE = new IOException JavaDoc(e.getMessage() + " (Offset " +
227                     offset + ").");
228             newE.setStackTrace(e.getStackTrace());
229             throw newE;
230         }
231         return (ARCRecord)getCurrentRecord();
232     }
233     
234     /**
235      * Returns version of this ARC file. Usually read from first record of ARC.
236      * If we're reading without having first read the first record -- e.g.
237      * random access into middle of an ARC -- then version will not have been
238      * set. For now, we return a default, version 1.1. Later, if more than
239      * just one version of ARC, we could look at such as the meta line to see
240      * what version of ARC this is.
241      * @return Version of this ARC file.
242      */

243     public String JavaDoc getVersion() {
244         return (super.getVersion() == null)? "1.1": super.getVersion();
245     }
246
247     /**
248      * Get a record header line as list of tokens.
249      *
250      * We keep reading till we find a LINE_SEPARATOR or we reach the end
251      * of file w/o finding a LINE_SEPARATOR or the line length is crazy.
252      *
253      * @param stream InputStream to read from.
254      * @param list Empty list that gets filled w/ string tokens.
255      * @return Count of characters read.
256      * @exception IOException If problem reading stream or no line separator
257      * found or EOF before EOL or we didn't get minimum header fields.
258      */

259     private int getTokenizedHeaderLine(final InputStream JavaDoc stream,
260             List JavaDoc<String JavaDoc> list) throws IOException JavaDoc {
261         // Preallocate usual line size.
262
StringBuilder JavaDoc buffer = new StringBuilder JavaDoc(2048 + 20);
263         int read = 0;
264         int previous = -1;
265         for (int c = -1; true;) {
266             previous = c;
267             c = stream.read();
268             if (c == -1) {
269                 throw new RecoverableIOException("Hit EOF before header EOL.");
270             }
271             c &= 0xff;
272             read++;
273             if (read > MAX_HEADER_LINE_LENGTH) {
274                 throw new IOException JavaDoc("Header line longer than max allowed " +
275                     " -- " + String.valueOf(MAX_HEADER_LINE_LENGTH) +
276                     " -- or passed buffer doesn't contain a line (Read: " +
277                     buffer.length() + "). Here's" +
278                     " some of what was read: " +
279                     buffer.substring(0, Math.min(buffer.length(), 256)));
280             }
281
282             if (c == LINE_SEPARATOR) {
283                 if (buffer.length() == 0) {
284                     // Empty line at start of buffer. Skip it and try again.
285
continue;
286                 }
287
288                 if (list != null) {
289                     list.add(buffer.toString());
290                 }
291                 // LOOP TERMINATION.
292
break;
293             } else if (c == HEADER_FIELD_SEPARATOR) {
294                 if (!isStrict() && previous == HEADER_FIELD_SEPARATOR) {
295                     // Early ARCs sometimes had multiple spaces between fields.
296
continue;
297                 }
298                 if (list != null) {
299                     list.add(buffer.toString());
300                 }
301                 // reset to empty
302
buffer.setLength(0);
303             } else {
304                 buffer.append((char)c);
305             }
306         }
307
308         // List must have at least 3 elements in it and no more than 10. If
309
// it has other than this, then bogus parse.
310
if (list != null && (list.size() < 3 || list.size() > 100)) {
311             throw new IOException JavaDoc("Unparseable header line: " + list);
312         }
313
314         return read;
315     }
316
317     /**
318      * Compute metadata fields.
319      *
320      * Here we check the meta field has right number of items in it.
321      *
322      * @param keys Keys to use composing headerFields map.
323      * @param values Values to set into the headerFields map.
324      * @param v The version of this ARC file.
325      * @param offset Offset into arc file.
326      *
327      * @return Metadata structure for this record.
328      *
329      * @exception IOException If no. of keys doesn't match no. of values.
330      */

331     private ARCRecordMetaData computeMetaData(List JavaDoc<String JavaDoc> keys,
332             List JavaDoc<String JavaDoc> values, String JavaDoc v, long offset)
333     throws IOException JavaDoc {
334         if (keys.size() != values.size()) {
335             List JavaDoc<String JavaDoc> originalValues = values;
336             if (!isStrict()) {
337                 values = fixSpaceInMetadataLine(values, keys.size());
338                 // If values still doesn't match key size, try and do
339
// further repair.
340
if (keys.size() != values.size()) {
341                     // Early ARCs had a space in mimetype.
342
if (values.size() == (keys.size() + 1) &&
343                             values.get(4).toLowerCase().startsWith("charset=")) {
344                         List JavaDoc<String JavaDoc> nuvalues =
345                             new ArrayList JavaDoc<String JavaDoc>(keys.size());
346                         nuvalues.add(0, values.get(0));
347                         nuvalues.add(1, values.get(1));
348                         nuvalues.add(2, values.get(2));
349                         nuvalues.add(3, values.get(3) + values.get(4));
350                         nuvalues.add(4, values.get(5));
351                         values = nuvalues;
352                     }
353                 }
354             }
355             if (keys.size() != values.size()) {
356                 throw new IOException JavaDoc("Size of field name keys does" +
357                     " not match count of field values: " + values);
358             }
359             // Note that field was fixed on stderr.
360
logStdErr(Level.WARNING, "Fixed spaces in metadata line at " +
361                 "offset " + offset +
362                 " Original: " + originalValues + ", New: " + values);
363         }
364         
365         Map JavaDoc<Object JavaDoc, Object JavaDoc> headerFields =
366             new HashMap JavaDoc<Object JavaDoc, Object JavaDoc>(keys.size() + 2);
367         for (int i = 0; i < keys.size(); i++) {
368             headerFields.put(keys.get(i), values.get(i));
369         }
370         
371         // Add a check for tabs in URLs. If any, replace with '%09'.
372
// See https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966,
373
// [ 1010966 ] crawl.log has URIs with spaces in them.
374
String JavaDoc url = (String JavaDoc)headerFields.get(URL_FIELD_KEY);
375         if (url != null && url.indexOf('\t') >= 0) {
376             headerFields.put(URL_FIELD_KEY,
377                 TextUtils.replaceAll("\t", url, "%09"));
378         }
379
380         headerFields.put(VERSION_FIELD_KEY, v);
381         headerFields.put(ABSOLUTE_OFFSET_KEY, new Long JavaDoc(offset));
382
383         return new ARCRecordMetaData(getReaderIdentifier(), headerFields);
384     }
385     
386     /**
387      * Fix space in URLs.
388      * The ARCWriter used to write into the ARC URLs with spaces in them.
389      * See <a
390      * HREF="https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966">[ 1010966 ]
391      * crawl.log has URIs with spaces in them</a>.
392      * This method does fix up on such headers converting all spaces found
393      * to '%20'.
394      * @param values List of metadata values.
395      * @param requiredSize Expected size of resultant values list.
396      * @return New list if we successfully fixed up values or original if
397      * fixup failed.
398      */

399     protected List JavaDoc<String JavaDoc> fixSpaceInMetadataLine(List JavaDoc<String JavaDoc> values,
400             int requiredSize) {
401         // Do validity check. 3rd from last is a date of 14 numeric
402
// characters. The 4th from last is IP, all before the IP
403
// should be concatenated together with a '%20' joiner.
404
// In the below, '4' is 4th field from end which has the IP.
405
if (!(values.size() > requiredSize) || values.size() < 4) {
406             return values;
407         }
408         // Test 3rd field is valid date.
409
String JavaDoc date = (String JavaDoc)values.get(values.size() - 3);
410         if (date.length() != 14) {
411             return values;
412         }
413         for (int i = 0; i < date.length(); i++) {
414             if (!Character.isDigit(date.charAt(i))) {
415                 return values;
416             }
417         }
418         // Test 4th field is valid IP.
419
String JavaDoc ip = (String JavaDoc)values.get(values.size() - 4);
420         Matcher JavaDoc m = InetAddressUtil.IPV4_QUADS.matcher(ip);
421         if (ip == "-" || m.matches()) {
422             List JavaDoc<String JavaDoc> newValues = new ArrayList JavaDoc<String JavaDoc>(requiredSize);
423             StringBuffer JavaDoc url = new StringBuffer JavaDoc();
424             for (int i = 0; i < (values.size() - 4); i++) {
425                 if (i > 0) {
426                     url.append("%20");
427                 }
428                 url.append(values.get(i));
429             }
430             newValues.add(url.toString());
431             for (int i = values.size() - 4; i < values.size(); i++) {
432                 newValues.add(values.get(i));
433             }
434             values = newValues;
435         }
436         return values;
437     }
438     
439     protected boolean isAlignedOnFirstRecord() {
440         return alignedOnFirstRecord;
441     }
442
443     protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) {
444         this.alignedOnFirstRecord = alignedOnFirstRecord;
445     }
446     
447     /**
448      * @return Returns the parseHttpHeaders.
449      */

450     public boolean isParseHttpHeaders() {
451         return this.parseHttpHeaders;
452     }
453     
454     /**
455      * @param parse The parseHttpHeaders to set.
456      */

457     public void setParseHttpHeaders(boolean parse) {
458         this.parseHttpHeaders = parse;
459     }
460     
461     public String JavaDoc getFileExtension() {
462         return ARC_FILE_EXTENSION;
463     }
464     
465     public String JavaDoc getDotFileExtension() {
466         return DOT_ARC_FILE_EXTENSION;
467     }
468     
469     protected boolean output(final String JavaDoc format)
470     throws IOException JavaDoc, java.text.ParseException JavaDoc {
471         boolean result = super.output(format);
472         if(!result && (format.equals(NOHEAD) || format.equals(HEADER))) {
473             throw new IOException JavaDoc(format +
474                 " format only supported for single Records");
475         }
476         return result;
477     }
478     
479     protected boolean outputRecord(final String JavaDoc format) throws IOException JavaDoc {
480         boolean result = super.outputRecord(format);
481         if (result) {
482             return result;
483         }
484         if (format.equals(NOHEAD)) {
485             // No point digesting if dumping content.
486
setDigest(false);
487             ARCRecord r = (ARCRecord) get();
488             r.skipHttpHeader();
489             r.dump();
490             result = true;
491         } else if (format.equals(HEADER)) {
492             // No point digesting if dumping content.
493
setDigest(false);
494             ARCRecord r = (ARCRecord) get();
495             r.dumpHttpHeader();
496             result = true;
497         }
498
499         return result;
500     }
501
502     public void dump(final boolean compress)
503     throws IOException JavaDoc, java.text.ParseException JavaDoc {
504         // No point digesting if we're doing a dump.
505
setDigest(false);
506         boolean firstRecord = true;
507         ARCWriter writer = null;
508         for (Iterator JavaDoc<ArchiveRecord> ii = iterator(); ii.hasNext();) {
509             ARCRecord r = (ARCRecord)ii.next();
510             // We're to dump the arc on stdout.
511
// Get the first record's data if any.
512
ARCRecordMetaData meta = r.getMetaData();
513             if (firstRecord) {
514                 firstRecord = false;
515                 // Get an ARCWriter.
516
ByteArrayOutputStream JavaDoc baos =
517                     new ByteArrayOutputStream JavaDoc(r.available());
518                 // This is slow but done only once at top of ARC.
519
while (r.available() > 0) {
520                     baos.write(r.read());
521                 }
522                 List JavaDoc<String JavaDoc> listOfMetadata = new ArrayList JavaDoc<String JavaDoc>();
523                 listOfMetadata.add(baos.toString(WriterPoolMember.UTF8));
524                 // Assume getArc returns full path to file. ARCWriter
525
// or new File will complain if it is otherwise.
526
writer = new ARCWriter(new AtomicInteger JavaDoc(), System.out,
527                     new File JavaDoc(meta.getArc()),
528                     compress, meta.getDate(), listOfMetadata);
529                 continue;
530             }
531             
532             writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(),
533                 ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(),
534                 (int)meta.getLength(), r);
535         }
536         // System.out.println(System.currentTimeMillis() - start);
537
}
538     
539     /**
540      * @return an ArchiveReader that will delete a local file on close. Used
541      * when we bring Archive files local and need to clean up afterward.
542      */

543     public ARCReader getDeleteFileOnCloseReader(final File JavaDoc f) {
544         final ARCReader d = this;
545         return new ARCReader() {
546             private final ARCReader delegate = d;
547             private File JavaDoc archiveFile = f;
548             
549             public void close() throws IOException JavaDoc {
550                 this.delegate.close();
551                 if (this.archiveFile != null) {
552                     if (archiveFile.exists()) {
553                         archiveFile.delete();
554                     }
555                     this.archiveFile = null;
556                 }
557             }
558             
559             public ArchiveRecord get(long o) throws IOException JavaDoc {
560                 return this.delegate.get(o);
561             }
562             
563             public boolean isDigest() {
564                 return this.delegate.isDigest();
565             }
566             
567             public boolean isStrict() {
568                 return this.delegate.isStrict();
569             }
570             
571             public Iterator JavaDoc<ArchiveRecord> iterator() {
572                 return this.delegate.iterator();
573             }
574             
575             public void setDigest(boolean d) {
576                 this.delegate.setDigest(d);
577             }
578             
579             public void setStrict(boolean s) {
580                 this.delegate.setStrict(s);
581             }
582             
583             public List JavaDoc validate() throws IOException JavaDoc {
584                 return this.delegate.validate();
585             }
586
587             @Override JavaDoc
588             public ArchiveRecord get() throws IOException JavaDoc {
589                 return this.delegate.get();
590             }
591
592             @Override JavaDoc
593             public String JavaDoc getVersion() {
594                 return this.delegate.getVersion();
595             }
596
597             @Override JavaDoc
598             public List JavaDoc validate(int noRecords) throws IOException JavaDoc {
599                 return this.delegate.validate(noRecords);
600             }
601
602             @Override JavaDoc
603             protected ARCRecord createArchiveRecord(InputStream JavaDoc is,
604                     long offset)
605             throws IOException JavaDoc {
606                 return this.delegate.createArchiveRecord(is, offset);
607             }
608
609             @Override JavaDoc
610             protected void gotoEOR(ArchiveRecord record) throws IOException JavaDoc {
611                 this.delegate.gotoEOR(record);
612             }
613
614             @Override JavaDoc
615             public void dump(boolean compress)
616             throws IOException JavaDoc, java.text.ParseException JavaDoc {
617                 this.delegate.dump(compress);
618             }
619
620             @Override JavaDoc
621             public String JavaDoc getDotFileExtension() {
622                 return this.delegate.getDotFileExtension();
623             }
624
625             @Override JavaDoc
626             public String JavaDoc getFileExtension() {
627                 return this.delegate.getFileExtension();
628             }
629         };
630     }
631     
632     // Static methods follow.
633

634     /**
635      *
636      * @param formatter Help formatter instance.
637      * @param options Usage options.
638      * @param exitCode Exit code.
639      */

640     private static void usage(HelpFormatter formatter, Options options,
641             int exitCode) {
642         formatter.printHelp("java org.archive.io.arc.ARCReader" +
643             " [--digest=true|false] \\\n" +
644             " [--format=cdx|cdxfile|dump|gzipdump|header|nohead]" +
645             " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL",
646                 options);
647         System.exit(exitCode);
648     }
649
650     /**
651      * Write out the arcfile.
652      *
653      * @param reader
654      * @param format Format to use outputting.
655      * @throws IOException
656      * @throws java.text.ParseException
657      */

658     protected static void output(ARCReader reader, String JavaDoc format)
659     throws IOException JavaDoc, java.text.ParseException JavaDoc {
660         if (!reader.output(format)) {
661             throw new IOException JavaDoc("Unsupported format: " + format);
662         }
663     }
664     
665     
666     /**
667      * Output passed record using passed format specifier.
668      * @param r ARCReader instance to output.
669      * @param format What format to use outputting.
670      * @throws IOException
671      */

672     protected static void outputRecord(final ARCReader r, final String JavaDoc format)
673     throws IOException JavaDoc {
674         if (!r.outputRecord(format)) {
675             throw new IOException JavaDoc("Unsupported format" +
676                 " (or unsupported on a single record): " + format);
677         }
678     }
679
680     /**
681      * Generate a CDX index file for an ARC file.
682      *
683      * @param urlOrPath The ARC file to generate a CDX index for
684      * @throws IOException
685      * @throws java.text.ParseException
686      */

687     public static void createCDXIndexFile(String JavaDoc urlOrPath)
688     throws IOException JavaDoc, java.text.ParseException JavaDoc {
689         ARCReader r = ARCReaderFactory.get(urlOrPath);
690         r.setStrict(false);
691         r.setParseHttpHeaders(true);
692         r.setDigest(true);
693         output(r, CDX_FILE);
694     }
695
696     /**
697      * Command-line interface to ARCReader.
698      *
699      * Here is the command-line interface:
700      * <pre>
701      * usage: java org.archive.io.arc.ARCReader [--offset=#] ARCFILE
702      * -h,--help Prints this message and exits.
703      * -o,--offset Outputs record at this offset into arc file.</pre>
704      *
705      * <p>See in <code>$HERITRIX_HOME/bin/arcreader</code> for a script that'll
706      * take care of classpaths and the calling of ARCReader.
707      *
708      * <p>Outputs using a pseudo-CDX format as described here:
709      * <a HREF="http://www.archive.org/web/researcher/cdx_legend.php">CDX
710      * Legent</a> and here
711      * <a HREF="http://www.archive.org/web/researcher/example_cdx.php">Example</a>.
712      * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
713      * Hash is hard-coded straight SHA-1 hash of content.
714      *
715      * @param args Command-line arguments.
716      * @throws ParseException Failed parse of the command line.
717      * @throws IOException
718      * @throws java.text.ParseException
719      */

720     public static void main(String JavaDoc [] args)
721     throws ParseException, IOException JavaDoc, java.text.ParseException JavaDoc {
722         Options options = new Options();
723         options.addOption(new Option("h","help", false,
724             "Prints this message and exits."));
725         options.addOption(new Option("o","offset", true,
726             "Outputs record at this offset into arc file."));
727         options.addOption(new Option("d","digest", true,
728             "Pass true|false. Expensive. Default: true (SHA-1)."));
729         options.addOption(new Option("s","strict", false,
730             "Strict mode. Fails parse if incorrectly formatted ARC."));
731         options.addOption(new Option("p","parse", true,
732             "Pass true|false to parse HTTP Headers. Default: false."));
733         options.addOption(new Option("f","format", true,
734             "Output options: 'cdx', 'cdxfile', 'dump', 'gzipdump', " +
735             "'header', or 'nohead'. Default: 'cdx'."));
736         PosixParser parser = new PosixParser();
737         CommandLine cmdline = parser.parse(options, args, false);
738         List JavaDoc cmdlineArgs = cmdline.getArgList();
739         Option [] cmdlineOptions = cmdline.getOptions();
740         HelpFormatter formatter = new HelpFormatter();
741
742         // If no args, print help.
743
if (cmdlineArgs.size() <= 0) {
744             usage(formatter, options, 0);
745         }
746
747         // Now look at options passed.
748
long offset = -1;
749         boolean digest = false;
750         boolean strict = false;
751         boolean parse = false;
752         String JavaDoc format = CDX;
753         for (int i = 0; i < cmdlineOptions.length; i++) {
754             switch(cmdlineOptions[i].getId()) {
755                 case 'h':
756                     usage(formatter, options, 0);
757                     break;
758
759                 case 'o':
760                     offset =
761                         Long.parseLong(cmdlineOptions[i].getValue());
762                     break;
763                     
764                 case 's':
765                     strict = true;
766                     break;
767                     
768                 case 'p':
769                     parse = getTrueOrFalse(cmdlineOptions[i].getValue());
770                     break;
771                     
772                 case 'd':
773                     digest = getTrueOrFalse(cmdlineOptions[i].getValue());
774                     break;
775                     
776                 case 'f':
777                     format = cmdlineOptions[i].getValue().toLowerCase();
778                     boolean match = false;
779                     // List of supported formats.
780
final String JavaDoc [] supportedFormats =
781                         {CDX, DUMP, GZIP_DUMP, HEADER, NOHEAD, CDX_FILE};
782                     for (int ii = 0; ii < supportedFormats.length; ii++) {
783                         if (supportedFormats[ii].equals(format)) {
784                             match = true;
785                             break;
786                         }
787                     }
788                     if (!match) {
789                         usage(formatter, options, 1);
790                     }
791                     break;
792
793                 default:
794                     throw new RuntimeException JavaDoc("Unexpected option: " +
795                         + cmdlineOptions[i].getId());
796             }
797         }
798         
799         if (offset >= 0) {
800             if (cmdlineArgs.size() != 1) {
801                 System.out.println("Error: Pass one arcfile only.");
802                 usage(formatter, options, 1);
803             }
804             ARCReader arc = ARCReaderFactory.get((String JavaDoc)cmdlineArgs.get(0),
805                 offset);
806             arc.setStrict(strict);
807             // We must parse headers if we need to skip them.
808
if (format.equals(NOHEAD) || format.equals(HEADER)) {
809                 parse = true;
810             }
811             arc.setParseHttpHeaders(parse);
812             outputRecord(arc, format);
813         } else {
814             for (Iterator JavaDoc i = cmdlineArgs.iterator(); i.hasNext();) {
815                 String JavaDoc urlOrPath = (String JavaDoc)i.next();
816                 try {
817                     ARCReader r = ARCReaderFactory.get(urlOrPath);
818                     r.setStrict(strict);
819                     r.setParseHttpHeaders(parse);
820                     r.setDigest(digest);
821                     output(r, format);
822                 } catch (RuntimeException JavaDoc e) {
823                     // Write out name of file we failed on to help with
824
// debugging. Then print stack trace and try to keep
825
// going. We do this for case where we're being fed
826
// a bunch of ARCs; just note the bad one and move
827
// on to the next.
828
System.err.println("Exception processing " + urlOrPath +
829                         ": " + e.getMessage());
830                     e.printStackTrace(System.err);
831                     System.exit(1);
832                 }
833             }
834         }
835     }
836 }
837
Popular Tags