ARCReader


1   /* $Id: ARCReader.java,v 1.72.2.1 2007/01/13 01:31:35 stack-sf Exp $
2    *
3    * Created on May 1, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io.arc;
24  
25  import java.io.ByteArrayOutputStream  ;
26  import java.io.File  ;
27  import java.io.IOException  ;
28  import java.io.InputStream  ;
29  import java.util.ArrayList  ;
30  import java.util.Arrays  ;
31  import java.util.HashMap  ;
32  import java.util.Iterator  ;
33  import java.util.List  ;
34  import java.util.Map  ;
35  import java.util.concurrent.atomic.AtomicInteger  ;
36  import java.util.logging.Level  ;
37  import java.util.logging.Logger  ;
38  import java.util.regex.Matcher  ;
39  
40  import org.apache.commons.cli.CommandLine;
41  import org.apache.commons.cli.HelpFormatter;
42  import org.apache.commons.cli.Option;
43  import org.apache.commons.cli.Options;
44  import org.apache.commons.cli.ParseException;
45  import org.apache.commons.cli.PosixParser;
46  import org.archive.io.ArchiveReader;
47  import org.archive.io.ArchiveRecord;
48  import org.archive.io.ArchiveRecordHeader;
49  import org.archive.io.RecoverableIOException;
50  import org.archive.io.WriterPoolMember;
51  import org.archive.util.ArchiveUtils;
52  import org.archive.util.InetAddressUtil;
53  import org.archive.util.TextUtils;
54  
55  
56  /**
57   * Get an iterator on an ARC file or get a record by absolute position.
58   *
59   * ARC files are described here:
60   * <a HREF="http://www.archive.org/web/researcher/ArcFileFormat.php">Arc
61   * File Format</a>.
62   *
63   * <p>This class knows how to parse an ARC file.  Pass it a file path
64   * or an URL to an ARC. It can parse ARC Version 1 and 2.
65   *
66   * <p>Iterator returns <code>ARCRecord</code>
67   * though {@link Iterator#next()} is returning
68   * java.lang.Object.  Cast the return.
69   *
70   * <p>Profiling java.io vs. memory-mapped ByteBufferInputStream shows the
71   * latter slightly slower -- but not by much.  TODO: Test more.  Just
72   * change {@link #getInputStream(File, long)}.
73   *
74   * @author stack
75   * @version $Date: 2007/01/13 01:31:35 $ $Revision: 1.72.2.1 $
76   */
77  public abstract class ARCReader extends ArchiveReader
78  implements ARCConstants {
79      Logger   logger = Logger.getLogger(ARCReader.class.getName());
80      
81      /**
82       * Set to true if we are aligned on first record of Archive file.
83       * We used depend on offset. If offset was zero, then we were
84       * aligned on first record.  This is no longer necessarily the case when
85       * Reader is created at an offset into an Archive file: The offset is zero
86       * but its relative to where we started reading.
87       */
88      private boolean alignedOnFirstRecord = true;
89      
90      /**
91       * Assumed maximum size of a record meta header line.
92       *
93       * This 100k which seems massive but its the same as the LINE_LENGTH from
94       * <code>alexa/include/a_arcio.h</code>:
95       * <pre>
96       * #define LINE_LENGTH     (100*1024)
97       * </pre>
98       */
99      private static final int MAX_HEADER_LINE_LENGTH = 1024 * 100;
100 
101     /**
102      * Array of field names.
103      * 
104      * Used to initialize <code>headerFieldNameKeys</code>.
105      */
106     private final String   [] headerFieldNameKeysArray = {
107         URL_FIELD_KEY,
108         IP_HEADER_FIELD_KEY,
109         DATE_FIELD_KEY,
110         MIMETYPE_FIELD_KEY,
111         LENGTH_FIELD_KEY
112     };
113     
114     /**
115      * An array of the header field names found in the ARC file header on
116      * the 3rd line.
117      * 
118      * We used to read these in from the arc file first record 3rd line but
119      * now we hardcode them for sake of improved performance.
120      */
121     private final List  <String  > headerFieldNameKeys =
122         Arrays.asList(this.headerFieldNameKeysArray);
123     
124     private boolean parseHttpHeaders = true;
125     
126     ARCReader() {
127         super();
128     }
129     
130     /**
131      * Skip over any trailing new lines at end of the record so we're lined up
132      * ready to read the next.
133      * @param record
134      * @throws IOException
135      */
136     protected void gotoEOR(ArchiveRecord record) throws IOException   {
137         if (getIn().available() <= 0) {
138             return;
139         }
140         
141         // Remove any trailing LINE_SEPARATOR
142         int c = -1;
143         while (getIn().available() > 0) {
144             if (getIn().markSupported()) {
145                 getIn().mark(1);
146             }
147             c = getIn().read();
148             if (c != -1) {
149                 if (c == LINE_SEPARATOR) {
150                     continue;
151                 }
152                 if (getIn().markSupported()) {
153                     // We've overread.  We're probably in next record.  There is
154                     // no way of telling for sure. It may be dross at end of
155                     // current record. Backup.
156                     getIn().reset();
157                     break;
158                 }
159                 ArchiveRecordHeader h = (getCurrentRecord() != null)?
160                     record.getHeader(): null;
161                 throw new IOException  ("Read " + (char)c +
162                     " when only " + LINE_SEPARATOR + " expected. " + 
163                     getReaderIdentifier() + ((h != null)?
164                         h.getHeaderFields().toString(): ""));
165             }
166         }
167     }
168     
169     /**
170      * Create new arc record.
171      *
172      * Encapsulate housekeeping that has to do w/ creating a new record.
173      *
174      * <p>Call this method at end of constructor to read in the
175      * arcfile header.  Will be problems reading subsequent arc records
176      * if you don't since arcfile header has the list of metadata fields for
177      * all records that follow.
178      * 
179      * <p>When parsing through ARCs writing out CDX info, we spend about
180      * 38% of CPU in here -- about 30% of which is in getTokenizedHeaderLine
181      * -- of which 16% is reading.
182      *
183      * @param is InputStream to use.
184      * @param offset Absolute offset into arc file.
185      * @return An arc record.
186      * @throws IOException
187      */
188     protected ARCRecord createArchiveRecord(InputStream   is, long offset)
189     throws IOException   {
190         ArrayList  <String  > firstLineValues = new ArrayList  <String  >(20);
191         getTokenizedHeaderLine(is, firstLineValues);
192         int bodyOffset = 0;
193         if (offset == 0 && isAlignedOnFirstRecord()) {
194             // If offset is zero and we were aligned at first record on
195             // creation (See #alignedOnFirstRecord for more on this), then no
196             // records have been read yet and we're reading our first one, the
197             // record of ARC file meta info.  Its special.  In ARC versions
198             // 1.x, first record has three lines of meta info. We've just read
199             // the first line. There are two more.  The second line has misc.
200             // info.  We're only interested in the first field, the version
201             // number.  The third line is the list of field names. Here's what
202             // ARC file version 1.x meta content looks like:
203             //
204             // filedesc://testIsBoundary-JunitIAH200401070157520.arc 0.0.0.0 \\
205             //      20040107015752 text/plain 77
206             // 1 0 InternetArchive
207             // URL IP-address Archive-date Content-type Archive-length
208             //
209             ArrayList  <String  > secondLineValues = new ArrayList  <String  >(20);
210             bodyOffset += getTokenizedHeaderLine(is, secondLineValues);
211             setVersion((String  )secondLineValues.get(0) +
212                 "." + (String  )secondLineValues.get(1));
213             // Just read over the 3rd line.  We used to parse it and use
214             // values found here but now we just hardcode them to avoid
215             // having to read this 3rd line even for random arc file accesses.
216             bodyOffset += getTokenizedHeaderLine(is, null);
217         }
218 
219         try {
220             currentRecord(new ARCRecord(is,
221                 (ArchiveRecordHeader)computeMetaData(this.headerFieldNameKeys,
222                     firstLineValues,
223                     getVersion(), offset), bodyOffset, isDigest(),
224                     isStrict(), isParseHttpHeaders()));
225         } catch (IOException   e) {
226             IOException   newE = new IOException  (e.getMessage() + " (Offset " +
227                     offset + ").");
228             newE.setStackTrace(e.getStackTrace());
229             throw newE;
230         }
231         return (ARCRecord)getCurrentRecord();
232     }
233     
234     /**
235      * Returns version of this ARC file.  Usually read from first record of ARC.
236      * If we're reading without having first read the first record -- e.g.
237      * random access into middle of an ARC -- then version will not have been
238      * set.  For now, we return a default, version 1.1.  Later, if more than
239      * just one version of ARC, we could look at such as the meta line to see
240      * what version of ARC this is.
241      * @return Version of this ARC file.
242      */
243     public String   getVersion() {
244         return (super.getVersion() == null)? "1.1": super.getVersion();
245     }
246 
247     /**
248      * Get a record header line as list of tokens.
249      *
250      * We keep reading till we find a LINE_SEPARATOR or we reach the end
251      * of file w/o finding a LINE_SEPARATOR or the line length is crazy.
252      *
253      * @param stream InputStream to read from.
254      * @param list Empty list that gets filled w/ string tokens.
255      * @return Count of characters read.
256      * @exception IOException If problem reading stream or no line separator
257      * found or EOF before EOL or we didn't get minimum header fields.
258      */
259     private int getTokenizedHeaderLine(final InputStream   stream,
260             List  <String  > list) throws IOException   {
261         // Preallocate usual line size.
262         StringBuilder   buffer = new StringBuilder  (2048 + 20);
263         int read = 0;
264         int previous = -1;
265         for (int c = -1; true;) {
266             previous = c;
267             c = stream.read();
268             if (c == -1) {
269                 throw new RecoverableIOException("Hit EOF before header EOL.");
270             }
271             c &= 0xff; 
272             read++;
273             if (read > MAX_HEADER_LINE_LENGTH) {
274                 throw new IOException  ("Header line longer than max allowed " +
275                     " -- " + String.valueOf(MAX_HEADER_LINE_LENGTH) +
276                     " -- or passed buffer doesn't contain a line (Read: " +
277                     buffer.length() + ").  Here's" +
278                     " some of what was read: " +
279                     buffer.substring(0, Math.min(buffer.length(), 256)));
280             }
281 
282             if (c == LINE_SEPARATOR) {
283                 if (buffer.length() == 0) {
284                     // Empty line at start of buffer.  Skip it and try again.
285                     continue;
286                 }
287 
288                 if (list != null) {
289                     list.add(buffer.toString());
290                 }
291                 // LOOP TERMINATION.
292                 break;
293             } else if (c == HEADER_FIELD_SEPARATOR) {
294                 if (!isStrict() && previous == HEADER_FIELD_SEPARATOR) {
295                     // Early ARCs sometimes had multiple spaces between fields.
296                     continue;
297                 }
298                 if (list != null) {
299                     list.add(buffer.toString());
300                 }
301                 // reset to empty
302                 buffer.setLength(0);
303             } else {
304                 buffer.append((char)c);
305             }
306         }
307 
308         // List must have at least 3 elements in it and no more than 10.  If
309         // it has other than this, then bogus parse.
310         if (list != null && (list.size() < 3 || list.size() > 100)) {
311             throw new IOException  ("Unparseable header line: " + list);
312         }
313 
314         return read;
315     }
316 
317     /**
318      * Compute metadata fields.
319      *
320      * Here we check the meta field has right number of items in it.
321      *
322      * @param keys Keys to use composing headerFields map.
323      * @param values Values to set into the headerFields map.
324      * @param v The version of this ARC file.
325      * @param offset Offset into arc file.
326      *
327      * @return Metadata structure for this record.
328      *
329      * @exception IOException  If no. of keys doesn't match no. of values.
330      */
331     private ARCRecordMetaData computeMetaData(List  <String  > keys,
332             List  <String  > values, String   v, long offset)
333     throws IOException   {
334         if (keys.size() != values.size()) {
335             List  <String  > originalValues = values;
336             if (!isStrict()) {
337                 values = fixSpaceInMetadataLine(values, keys.size());
338                 // If values still doesn't match key size, try and do
339                 // further repair.
340                 if (keys.size() != values.size()) {
341                     // Early ARCs had a space in mimetype.
342                     if (values.size() == (keys.size() + 1) &&
343                             values.get(4).toLowerCase().startsWith("charset=")) {
344                         List  <String  > nuvalues =
345                             new ArrayList  <String  >(keys.size());
346                         nuvalues.add(0, values.get(0));
347                         nuvalues.add(1, values.get(1));
348                         nuvalues.add(2, values.get(2));
349                         nuvalues.add(3, values.get(3) + values.get(4));
350                         nuvalues.add(4, values.get(5));
351                         values = nuvalues;
352                     }
353                 }
354             }
355             if (keys.size() != values.size()) {
356                 throw new IOException  ("Size of field name keys does" +
357                     " not match count of field values: " + values);
358             }
359             // Note that field was fixed on stderr.
360             logStdErr(Level.WARNING, "Fixed spaces in metadata line at " +
361                 "offset " + offset +
362                 " Original: " + originalValues + ", New: " + values);
363         }
364         
365         Map  <Object  , Object  > headerFields =
366             new HashMap  <Object  , Object  >(keys.size() + 2);
367         for (int i = 0; i < keys.size(); i++) {
368             headerFields.put(keys.get(i), values.get(i));
369         }
370         
371         // Add a check for tabs in URLs.  If any, replace with '%09'.
372         // See https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966,
373         // [ 1010966 ] crawl.log has URIs with spaces in them.
374         String   url = (String  )headerFields.get(URL_FIELD_KEY);
375         if (url != null && url.indexOf('\t') >= 0) {
376             headerFields.put(URL_FIELD_KEY,
377                 TextUtils.replaceAll("\t", url, "%09"));
378         }
379 
380         headerFields.put(VERSION_FIELD_KEY, v);
381         headerFields.put(ABSOLUTE_OFFSET_KEY, new  Long  (offset));
382 
383         return new ARCRecordMetaData(getReaderIdentifier(), headerFields);
384     }
385     
386     /**
387      * Fix space in URLs.
388      * The ARCWriter used to write into the ARC URLs with spaces in them.
389      * See <a
390      * HREF="https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966">[ 1010966 ]
391      * crawl.log has URIs with spaces in them</a>.
392      * This method does fix up on such headers converting all spaces found
393      * to '%20'.
394      * @param values List of metadata values.
395      * @param requiredSize Expected size of resultant values list.
396      * @return New list if we successfully fixed up values or original if
397      * fixup failed.
398      */
399     protected List  <String  > fixSpaceInMetadataLine(List  <String  > values,
400             int requiredSize) {
401         // Do validity check. 3rd from last is a date of 14 numeric
402         // characters.  The 4th from last is IP, all before the IP
403         // should be concatenated together with a '%20' joiner.
404         // In the below, '4' is 4th field from end which has the IP.
405         if (!(values.size() > requiredSize) || values.size() < 4) {
406             return values;
407         }
408         // Test 3rd field is valid date.
409         String   date = (String  )values.get(values.size() - 3);
410         if (date.length() != 14) {
411             return values;
412         }
413         for (int i = 0; i < date.length(); i++) {
414             if (!Character.isDigit(date.charAt(i))) {
415                 return values;
416             }
417         }
418         // Test 4th field is valid IP.
419         String   ip = (String  )values.get(values.size() - 4);
420         Matcher   m = InetAddressUtil.IPV4_QUADS.matcher(ip);
421         if (ip == "-" || m.matches()) {
422             List  <String  > newValues = new ArrayList  <String  >(requiredSize);
423             StringBuffer   url = new StringBuffer  ();
424             for (int i = 0; i < (values.size() - 4); i++) {
425                 if (i > 0) {
426                     url.append("%20");
427                 }
428                 url.append(values.get(i));
429             } 
430             newValues.add(url.toString());
431             for (int i = values.size() - 4; i < values.size(); i++) {
432                 newValues.add(values.get(i));
433             }
434             values =  newValues;
435         }
436         return values;
437     }
438     
439     protected boolean isAlignedOnFirstRecord() {
440         return alignedOnFirstRecord;
441     }
442 
443     protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) {
444         this.alignedOnFirstRecord = alignedOnFirstRecord;
445     }
446     
447     /**
448      * @return Returns the parseHttpHeaders.
449      */
450     public boolean isParseHttpHeaders() {
451         return this.parseHttpHeaders;
452     }
453     
454     /**
455      * @param parse The parseHttpHeaders to set.
456      */
457     public void setParseHttpHeaders(boolean parse) {
458         this.parseHttpHeaders = parse;
459     }
460     
461     public String   getFileExtension() {
462         return ARC_FILE_EXTENSION;
463     }
464     
465     public String   getDotFileExtension() {
466         return DOT_ARC_FILE_EXTENSION;
467     }
468     
469     protected boolean output(final String   format) 
470     throws IOException  , java.text.ParseException   {
471         boolean result = super.output(format);
472         if(!result && (format.equals(NOHEAD) || format.equals(HEADER))) {
473             throw new IOException  (format +
474                 " format only supported for single Records");
475         }
476         return result;
477     }
478     
479     protected boolean outputRecord(final String   format) throws IOException   {
480         boolean result = super.outputRecord(format);
481         if (result) {
482             return result;
483         }
484         if (format.equals(NOHEAD)) {
485             // No point digesting if dumping content.
486             setDigest(false);
487             ARCRecord r = (ARCRecord) get();
488             r.skipHttpHeader();
489             r.dump();
490             result = true;
491         } else if (format.equals(HEADER)) {
492             // No point digesting if dumping content.
493             setDigest(false);
494             ARCRecord r = (ARCRecord) get();
495             r.dumpHttpHeader();
496             result = true;
497         }
498 
499         return result;
500     }
501 
502     public void dump(final boolean compress)
503     throws IOException  , java.text.ParseException   {
504         // No point digesting if we're doing a dump.
505         setDigest(false);
506         boolean firstRecord = true;
507         ARCWriter writer = null;
508         for (Iterator  <ArchiveRecord> ii = iterator(); ii.hasNext();) {
509             ARCRecord r = (ARCRecord)ii.next();
510             // We're to dump the arc on stdout.
511             // Get the first record's data if any.
512             ARCRecordMetaData meta = r.getMetaData();
513             if (firstRecord) {
514                 firstRecord = false;
515                 // Get an ARCWriter.
516                 ByteArrayOutputStream   baos =
517                     new ByteArrayOutputStream  (r.available());
518                 // This is slow but done only once at top of ARC.
519                 while (r.available() > 0) {
520                     baos.write(r.read());
521                 }
522                 List  <String  > listOfMetadata = new ArrayList  <String  >();
523                 listOfMetadata.add(baos.toString(WriterPoolMember.UTF8));
524                 // Assume getArc returns full path to file.  ARCWriter
525                 // or new File will complain if it is otherwise.
526                 writer = new ARCWriter(new AtomicInteger  (), System.out,
527                     new File  (meta.getArc()),
528                     compress, meta.getDate(), listOfMetadata);
529                 continue;
530             }
531             
532             writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(),
533                 ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(),
534                 (int)meta.getLength(), r);
535         }
536         // System.out.println(System.currentTimeMillis() - start);
537     }
538     
539     /**
540      * @return an ArchiveReader that will delete a local file on close.  Used
541      * when we bring Archive files local and need to clean up afterward.
542      */
543     public ARCReader getDeleteFileOnCloseReader(final File   f) {
544         final ARCReader d = this;
545         return new ARCReader() {
546             private final ARCReader delegate = d;
547             private File   archiveFile = f;
548             
549             public void close() throws IOException   {
550                 this.delegate.close();
551                 if (this.archiveFile != null) {
552                     if (archiveFile.exists()) {
553                         archiveFile.delete();
554                     }
555                     this.archiveFile = null;
556                 }
557             }
558             
559             public ArchiveRecord get(long o) throws IOException   {
560                 return this.delegate.get(o);
561             }
562             
563             public boolean isDigest() {
564                 return this.delegate.isDigest();
565             }
566             
567             public boolean isStrict() {
568                 return this.delegate.isStrict();
569             }
570             
571             public Iterator  <ArchiveRecord> iterator() {
572                 return this.delegate.iterator();
573             }
574             
575             public void setDigest(boolean d) {
576                 this.delegate.setDigest(d);
577             }
578             
579             public void setStrict(boolean s) {
580                 this.delegate.setStrict(s);
581             }
582             
583             public List   validate() throws IOException   {
584                 return this.delegate.validate();
585             }
586 
587             @Override  
588             public ArchiveRecord get() throws IOException   {
589                 return this.delegate.get();
590             }
591 
592             @Override  
593             public String   getVersion() {
594                 return this.delegate.getVersion();
595             }
596 
597             @Override  
598             public List   validate(int noRecords) throws IOException   {
599                 return this.delegate.validate(noRecords);
600             }
601 
602             @Override  
603             protected ARCRecord createArchiveRecord(InputStream   is,
604                     long offset)
605             throws IOException   {
606                 return this.delegate.createArchiveRecord(is, offset);
607             }
608 
609             @Override  
610             protected void gotoEOR(ArchiveRecord record) throws IOException   {
611                 this.delegate.gotoEOR(record);
612             }
613 
614             @Override  
615             public void dump(boolean compress)
616             throws IOException  , java.text.ParseException   {
617                 this.delegate.dump(compress);
618             }
619 
620             @Override  
621             public String   getDotFileExtension() {
622                 return this.delegate.getDotFileExtension();
623             }
624 
625             @Override  
626             public String   getFileExtension() {
627                 return this.delegate.getFileExtension();
628             }
629         };
630     }
631     
632     // Static methods follow.
633 
634     /**
635      *
636      * @param formatter Help formatter instance.
637      * @param options Usage options.
638      * @param exitCode Exit code.
639      */
640     private static void usage(HelpFormatter formatter, Options options,
641             int exitCode) {
642         formatter.printHelp("java org.archive.io.arc.ARCReader" +
643             " [--digest=true|false] \\\n" +
644             " [--format=cdx|cdxfile|dump|gzipdump|header|nohead]" +
645             " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL",
646                 options);
647         System.exit(exitCode);
648     }
649 
650     /**
651      * Write out the arcfile.
652      * 
653      * @param reader
654      * @param format Format to use outputting.
655      * @throws IOException
656      * @throws java.text.ParseException
657      */
658     protected static void output(ARCReader reader, String   format)
659     throws IOException  , java.text.ParseException   {
660         if (!reader.output(format)) {
661             throw new IOException  ("Unsupported format: " + format);
662         }
663     }
664     
665     
666     /**
667      * Output passed record using passed format specifier.
668      * @param r ARCReader instance to output.
669      * @param format What format to use outputting.
670      * @throws IOException
671      */
672     protected static void outputRecord(final ARCReader r, final String   format)
673     throws IOException   {
674         if (!r.outputRecord(format)) {
675             throw new IOException  ("Unsupported format" +
676                 " (or unsupported on a single record): " + format);
677         }
678     }
679 
680     /**
681      * Generate a CDX index file for an ARC file.
682      *
683      * @param urlOrPath The ARC file to generate a CDX index for
684      * @throws IOException
685      * @throws java.text.ParseException
686      */
687     public static void createCDXIndexFile(String   urlOrPath)
688     throws IOException  , java.text.ParseException   {
689         ARCReader r = ARCReaderFactory.get(urlOrPath);
690         r.setStrict(false);
691         r.setParseHttpHeaders(true);
692         r.setDigest(true);
693         output(r, CDX_FILE);
694     }
695 
696     /**
697      * Command-line interface to ARCReader.
698      *
699      * Here is the command-line interface:
700      * <pre>
701      * usage: java org.archive.io.arc.ARCReader [--offset=#] ARCFILE
702      *  -h,--help      Prints this message and exits.
703      *  -o,--offset    Outputs record at this offset into arc file.</pre>
704      *
705      * <p>See in <code>$HERITRIX_HOME/bin/arcreader</code> for a script that'll
706      * take care of classpaths and the calling of ARCReader.
707      *
708      * <p>Outputs using a pseudo-CDX format as described here:
709      * <a HREF="http://www.archive.org/web/researcher/cdx_legend.php">CDX
710      * Legent</a> and here
711      * <a HREF="http://www.archive.org/web/researcher/example_cdx.php">Example</a>.
712      * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
713      * Hash is hard-coded straight SHA-1 hash of content.
714      *
715      * @param args Command-line arguments.
716      * @throws ParseException Failed parse of the command line.
717      * @throws IOException
718      * @throws java.text.ParseException
719      */
720     public static void main(String   [] args)
721     throws ParseException, IOException  , java.text.ParseException   {
722         Options options = new Options();
723         options.addOption(new Option("h","help", false,
724             "Prints this message and exits."));
725         options.addOption(new Option("o","offset", true,
726             "Outputs record at this offset into arc file."));
727         options.addOption(new Option("d","digest", true,
728             "Pass true|false. Expensive. Default: true (SHA-1)."));
729         options.addOption(new Option("s","strict", false,
730             "Strict mode. Fails parse if incorrectly formatted ARC."));
731         options.addOption(new Option("p","parse", true,
732             "Pass true|false to parse HTTP Headers. Default: false."));
733         options.addOption(new Option("f","format", true,
734             "Output options: 'cdx', 'cdxfile', 'dump', 'gzipdump', " +
735             "'header', or 'nohead'. Default: 'cdx'."));
736         PosixParser parser = new PosixParser();
737         CommandLine cmdline = parser.parse(options, args, false);
738         List   cmdlineArgs = cmdline.getArgList();
739         Option [] cmdlineOptions = cmdline.getOptions();
740         HelpFormatter formatter = new HelpFormatter();
741 
742         // If no args, print help.
743         if (cmdlineArgs.size() <= 0) {
744             usage(formatter, options, 0);
745         }
746 
747         // Now look at options passed.
748         long offset = -1;
749         boolean digest = false;
750         boolean strict = false;
751         boolean parse = false;
752         String   format = CDX;
753         for (int i = 0; i < cmdlineOptions.length; i++) {
754             switch(cmdlineOptions[i].getId()) {
755                 case 'h':
756                     usage(formatter, options, 0);
757                     break;
758 
759                 case 'o':
760                     offset =
761                         Long.parseLong(cmdlineOptions[i].getValue());
762                     break;
763                     
764                 case 's':
765                     strict = true;
766                     break;
767                     
768                 case 'p':
769                     parse = getTrueOrFalse(cmdlineOptions[i].getValue());
770                     break;
771                     
772                 case 'd':
773                     digest = getTrueOrFalse(cmdlineOptions[i].getValue());
774                     break;
775                     
776                 case 'f':
777                     format = cmdlineOptions[i].getValue().toLowerCase();
778                     boolean match = false;
779                     // List of supported formats.
780                     final String   [] supportedFormats =
781                         {CDX, DUMP, GZIP_DUMP, HEADER, NOHEAD, CDX_FILE};
782                     for (int ii = 0; ii < supportedFormats.length; ii++) {
783                         if (supportedFormats[ii].equals(format)) {
784                             match = true;
785                             break;
786                         }
787                     }
788                     if (!match) {
789                         usage(formatter, options, 1);
790                     }
791                     break;
792 
793                 default:
794                     throw new RuntimeException  ("Unexpected option: " +
795                         + cmdlineOptions[i].getId());
796             }
797         }
798         
799         if (offset >= 0) {
800             if (cmdlineArgs.size() != 1) {
801                 System.out.println("Error: Pass one arcfile only.");
802                 usage(formatter, options, 1);
803             }
804             ARCReader arc = ARCReaderFactory.get((String  )cmdlineArgs.get(0),
805                 offset);
806             arc.setStrict(strict);
807             // We must parse headers if we need to skip them.
808             if (format.equals(NOHEAD) || format.equals(HEADER)) {
809                 parse = true;
810             }
811             arc.setParseHttpHeaders(parse);
812             outputRecord(arc, format);
813         } else {
814             for (Iterator   i = cmdlineArgs.iterator(); i.hasNext();) {
815                 String   urlOrPath = (String  )i.next();
816                 try {
817                     ARCReader r = ARCReaderFactory.get(urlOrPath);
818                     r.setStrict(strict);
819                     r.setParseHttpHeaders(parse);
820                     r.setDigest(digest);
821                     output(r, format);
822                 } catch (RuntimeException   e) {
823                     // Write out name of file we failed on to help with
824                     // debugging.  Then print stack trace and try to keep
825                     // going.  We do this for case where we're being fed
826                     // a bunch of ARCs; just note the bad one and move
827                     // on to the next.
828                     System.err.println("Exception processing " + urlOrPath +
829                         ": " + e.getMessage());
830                     e.printStackTrace(System.err);
831                     System.exit(1);
832                 }
833             }
834         }
835     }
836 }
837
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags