KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > ArchiveReader


1 /* $Id: ArchiveReader.java,v 1.5.2.1 2007/01/13 01:31:31 stack-sf Exp $
2  *
3  * Created on August 21st, 2006
4  *
5  * Copyright (C) 2006 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.io;
24
25 import it.unimi.dsi.fastutil.io.RepositionableStream;
26
27 import java.io.BufferedInputStream JavaDoc;
28 import java.io.BufferedWriter JavaDoc;
29 import java.io.File JavaDoc;
30 import java.io.FileWriter JavaDoc;
31 import java.io.IOException JavaDoc;
32 import java.io.InputStream JavaDoc;
33 import java.util.ArrayList JavaDoc;
34 import java.util.Iterator JavaDoc;
35 import java.util.List JavaDoc;
36 import java.util.logging.Level JavaDoc;
37 import java.util.logging.Logger JavaDoc;
38
39 import org.archive.util.MimetypeUtils;
40
41
42 /**
43  * Reader for an Archive file of Archive {@link ArchiveRecord}s.
44  * @author stack
45  * @version $Date: 2007/01/13 01:31:31 $ $Version$
46  */

47 public abstract class ArchiveReader implements ArchiveFileConstants {
48     /**
49      * Is this Archive file compressed?
50      */

51     private boolean compressed = false;
52     
53     /**
54      * Should we digest as we read?
55      */

56     private boolean digest = true;
57     
58     /**
59      * Should the parse be strict?
60      */

61     private boolean strict = false;
62     
63     /**
64      * Archive file input stream.
65      *
66      * Keep it around so we can close it when done.
67      *
68      * <p>Set in constructor. Must support {@link RepositionableStream}
69      * interface. Make it protected so subclasses have access.
70      */

71     private InputStream JavaDoc in = null;
72     
73     /**
74      * Maximum amount of recoverable exceptions in a row.
75      * If more than this amount in a row, we'll let out the exception rather
76      * than go back in for yet another retry.
77      */

78     public static final int MAX_ALLOWED_RECOVERABLES = 10;
79     
80
81     /**
82      * The Record currently being read.
83      *
84      * Keep this ongoing reference so we'll close the record even if the caller
85      * doesn't.
86      */

87     private ArchiveRecord currentRecord = null;
88     
89     /**
90      * Descriptive string for the Archive file we're going against:
91      * full path, url, etc. -- depends on context in which file was made.
92      */

93     private String JavaDoc identifier = null;
94     
95     /**
96      * Archive file version.
97      */

98     private String JavaDoc version = null;
99     
100     
101     protected ArchiveReader() {
102         super();
103     }
104     
105     /**
106      * Convenience method used by subclass constructors.
107      * @param i Identifier for Archive file this reader goes against.
108      */

109     protected void initialize(final String JavaDoc i) {
110         setReaderIdentifier(i);
111     }
112     
113     /**
114      * Convenience method for constructors.
115      *
116      * @param f File to read.
117      * @param offset Offset at which to start reading.
118      * @return InputStream to read from.
119      * @throws IOException If failed open or fail to get a memory
120      * mapped byte buffer on file.
121      */

122     protected InputStream JavaDoc getInputStream(final File JavaDoc f, final long offset)
123     throws IOException JavaDoc {
124         return new RandomAccessBufferedInputStream(
125             new RandomAccessInputStream(f, offset));
126     }
127
128     public boolean isCompressed() {
129         return this.compressed;
130     }
131
132     /**
133      * Get record at passed <code>offset</code>.
134      *
135      * @param offset Byte index into file at which a record starts.
136      * @return An Archive Record reference.
137      * @throws IOException
138      */

139     public ArchiveRecord get(long offset) throws IOException JavaDoc {
140         cleanupCurrentRecord();
141         RepositionableStream ps = (RepositionableStream)this.in;
142         long currentOffset = ps.position();
143         if (currentOffset != offset) {
144             currentOffset = offset;
145             ps.position(offset);
146         }
147         return createArchiveRecord(this.in, currentOffset);
148     }
149     
150     /**
151      * @return Return Archive Record created against current offset.
152      * @throws IOException
153      */

154     public ArchiveRecord get() throws IOException JavaDoc {
155         return createArchiveRecord(this.in,
156             ((RepositionableStream)this.in).position());
157     }
158
159     public void close() throws IOException JavaDoc {
160         if (this.in != null) {
161             this.in.close();
162             this.in = null;
163         }
164     }
165     
166     /**
167      * Rewinds stream to start of the Archive file.
168      * @throws IOException if stream is not resettable.
169      */

170     protected void rewind() throws IOException JavaDoc {
171         cleanupCurrentRecord();
172         if (this.in instanceof RepositionableStream) {
173             try {
174                 ((RepositionableStream)this.in).position(0);
175             } catch (IOException JavaDoc e) {
176                 throw new RuntimeException JavaDoc(e);
177             }
178        } else {
179            throw new IOException JavaDoc("Stream is not resettable.");
180        }
181     }
182     
183     /**
184      * Cleanout the current record if there is one.
185      * @throws IOException
186      */

187     protected void cleanupCurrentRecord() throws IOException JavaDoc {
188         if (this.currentRecord != null) {
189             this.currentRecord.close();
190             gotoEOR(this.currentRecord);
191             this.currentRecord = null;
192         }
193     }
194     
195     /**
196      * Return an Archive Record homed on <code>offset</code> into
197      * <code>is</code>.
198      * @param is Stream to read Record from.
199      * @param offset Offset to find Record at.
200      * @return ArchiveRecord instance.
201      * @throws IOException
202      */

203     protected abstract ArchiveRecord createArchiveRecord(InputStream JavaDoc is,
204         long offset)
205     throws IOException JavaDoc;
206     
207     /**
208      * Skip over any trailing new lines at end of the record so we're lined up
209      * ready to read the next.
210      * @param record
211      * @throws IOException
212      */

213     protected abstract void gotoEOR(ArchiveRecord record) throws IOException JavaDoc;
214     
215     public abstract String JavaDoc getFileExtension();
216     public abstract String JavaDoc getDotFileExtension();
217
218     /**
219      * @return Version of this Archive file.
220      */

221     public String JavaDoc getVersion() {
222         return this.version;
223     }
224
225     /**
226      * Validate the Archive file.
227      *
228      * This method iterates over the file throwing exception if it fails
229      * to successfully parse any record.
230      *
231      * <p>Assumes the stream is at the start of the file.
232      * @return List of all read Archive Headers.
233      *
234      * @throws IOException
235      */

236     public List JavaDoc validate() throws IOException JavaDoc {
237         return validate(-1);
238     }
239
240     /**
241      * Validate the Archive file.
242      *
243      * This method iterates over the file throwing exception if it fails
244      * to successfully parse.
245      *
246      * <p>We start validation from whereever we are in the stream.
247      *
248      * @param noRecords Number of records expected. Pass -1 if number is
249      * unknown.
250      *
251      * @return List of all read metadatas. As we validate records, we add
252      * a reference to the read metadata.
253      *
254      * @throws IOException
255      */

256     public List JavaDoc validate(int noRecords) throws IOException JavaDoc {
257         List JavaDoc<ArchiveRecordHeader> hs = new ArrayList JavaDoc<ArchiveRecordHeader>();
258         int count = 0;
259         setStrict(true);
260         for (Iterator JavaDoc<ArchiveRecord> i = iterator(); i.hasNext();) {
261             count++;
262             ArchiveRecord r = i.next();
263             if (r.getHeader().getLength() <= 0
264                 && r.getHeader().getMimetype().
265                     equals(MimetypeUtils.NO_TYPE_MIMETYPE)) {
266                 throw new IOException JavaDoc("ARCRecord content is empty.");
267             }
268             r.close();
269             // Add reference to metadata into a list of metadatas.
270
hs.add(r.getHeader());
271         }
272
273         if (noRecords != -1) {
274             if (count != noRecords) {
275                 throw new IOException JavaDoc("Count of records, " +
276                     Integer.toString(count) + " is less than expected " +
277                     Integer.toString(noRecords));
278             }
279         }
280
281         return hs;
282     }
283
284     /**
285      * Test Archive file is valid.
286      * Assumes the stream is at the start of the file. Be aware that this
287      * method makes a pass over the whole file.
288      * @return True if file can be successfully parsed.
289      */

290     public boolean isValid() {
291         boolean valid = false;
292         try {
293             validate();
294             valid = true;
295         } catch(Exception JavaDoc e) {
296             // File is not valid if exception thrown parsing.
297
valid = false;
298         }
299     
300         return valid;
301     }
302
303     /**
304      * @return Returns the strict.
305      */

306     public boolean isStrict() {
307         return this.strict;
308     }
309
310     /**
311      * @param s The strict to set.
312      */

313     public void setStrict(boolean s) {
314         this.strict = s;
315     }
316
317     /**
318      * @param d True if we're to digest.
319      */

320     public void setDigest(boolean d) {
321         this.digest = d;
322     }
323
324     /**
325      * @return True if we're digesting as we read.
326      */

327     public boolean isDigest() {
328         return this.digest;
329     }
330  
331     protected Logger JavaDoc getLogger() {
332         return Logger.getLogger(this.getClass().getName());
333     }
334     
335     protected InputStream JavaDoc getInputStream() {
336         return this.in;
337     }
338     
339     /**
340      * @return An iterator over ARC records.
341      */

342     public Iterator JavaDoc<ArchiveRecord> iterator() {
343         // Eat up any record outstanding.
344
try {
345             cleanupCurrentRecord();
346         } catch (IOException JavaDoc e) {
347             throw new RuntimeException JavaDoc(e);
348         }
349         
350         // Now reset stream to the start of the arc file.
351
try {
352             rewind();
353         } catch (IOException JavaDoc e) {
354             throw new RuntimeException JavaDoc(e);
355         }
356         return new ArchiveRecordIterator();
357     }
358
359     protected void setCompressed(boolean compressed) {
360         this.compressed = compressed;
361     }
362
363     /**
364      * @return The current ARC record or null if none.
365      * After construction has the arcfile header record.
366      * @see #get()
367      */

368     protected ArchiveRecord getCurrentRecord() {
369         return this.currentRecord;
370     }
371
372     protected ArchiveRecord currentRecord(final ArchiveRecord currentRecord) {
373         this.currentRecord = currentRecord;
374         return currentRecord;
375     }
376
377     protected InputStream JavaDoc getIn() {
378         return in;
379     }
380
381     protected void setIn(InputStream JavaDoc in) {
382         this.in = in;
383     }
384
385     protected void setVersion(String JavaDoc version) {
386         this.version = version;
387     }
388
389     public String JavaDoc getReaderIdentifier() {
390         return this.identifier;
391     }
392
393     protected void setReaderIdentifier(final String JavaDoc i) {
394         this.identifier = i;
395     }
396     
397     /**
398      * Log on stderr.
399      * Logging should go via the logging system. This method
400      * bypasses the logging system going direct to stderr.
401      * Should not generally be used. Its used for rare messages
402      * that come of cmdline usage of ARCReader ERRORs and WARNINGs.
403      * Override if using ARCReader in a context where no stderr or
404      * where you'd like to redirect stderr to other than System.err.
405      * @param level Level to log message at.
406      * @param message Message to log.
407      */

408     public void logStdErr(Level JavaDoc level, String JavaDoc message) {
409         System.err.println(level.toString() + " " + message);
410     }
411     
412     /**
413      * Add buffering to RandomAccessInputStream.
414      */

415     protected class RandomAccessBufferedInputStream
416     extends BufferedInputStream JavaDoc implements RepositionableStream {
417
418         public RandomAccessBufferedInputStream(RandomAccessInputStream is)
419                 throws IOException JavaDoc {
420             super(is);
421         }
422
423         public RandomAccessBufferedInputStream(RandomAccessInputStream is, int size)
424                 throws IOException JavaDoc {
425             super(is, size);
426         }
427
428         public long position() throws IOException JavaDoc {
429             // Current position is the underlying files position
430
// minus the amount thats in the buffer yet to be read.
431
return ((RandomAccessInputStream)this.in).position() -
432                 (this.count - this.pos);
433         }
434
435         public void position(long position) throws IOException JavaDoc {
436             // Force refill of buffer whenever there's been a seek.
437
this.pos = 0;
438             this.count = 0;
439             ((RandomAccessInputStream)this.in).position(position);
440         }
441     }
442     
443     /**
444      * Inner ArchiveRecord Iterator class.
445      * Throws RuntimeExceptions in {@link #hasNext()} and {@link #next()} if
446      * trouble pulling record from underlying stream.
447      * @author stack
448      */

449     protected class ArchiveRecordIterator implements Iterator JavaDoc<ArchiveRecord> {
450         /**
451          * @return True if we have more records to read.
452          * @exception RuntimeException Can throw an IOException wrapped in a
453          * RuntimeException if a problem reading underlying stream (Corrupted
454          * gzip, etc.).
455          */

456         public boolean hasNext() {
457             // Call close on any extant record. This will scoot us past
458
// any content not yet read.
459
try {
460                 cleanupCurrentRecord();
461             } catch (IOException JavaDoc e) {
462                 throw new RuntimeException JavaDoc(e);
463             }
464             return innerHasNext();
465         }
466         
467         protected boolean innerHasNext() {
468             long offset = -1;
469             try {
470                 offset = ((RepositionableStream)getInputStream()).position();
471                 return getInputStream().available() > 0;
472             } catch (IOException JavaDoc e) {
473                 throw new RuntimeException JavaDoc("Offset " + offset, e);
474             }
475         }
476
477         /**
478          * Tries to move to next record if we get
479          * {@link RecoverableIOException}. If not <code>strict</code>
480          * tries to move to next record if we get an
481          * {@link IOException}.
482          * @return Next object.
483          * @exception RuntimeException Throws a runtime exception,
484          * usually a wrapping of an IOException, if trouble getting
485          * a record (Throws exception rather than return null).
486          */

487         public ArchiveRecord next() {
488             long offset = -1;
489             try {
490                 offset = ((RepositionableStream)getInputStream()).position();
491                 return exceptionNext();
492             } catch (IOException JavaDoc e) {
493                 if (!isStrict()) {
494                     // Retry once.
495
try {
496                         if (hasNext()) {
497                             getLogger().warning("Retrying (Current offset " +
498                                 offset + "): " + e.getMessage());
499                             return exceptionNext();
500                         }
501                         // There is no next and we don't have a record
502
// to return. Throw the recoverable.
503
throw new RuntimeException JavaDoc("Retried but " +
504                             "no next record (Offset " + offset + ")",
505                             e);
506                     } catch (IOException JavaDoc e1) {
507                         throw new RuntimeException JavaDoc("After retry (Offset " +
508                                 offset + ")", e1);
509                     }
510                 }
511                 throw new RuntimeException JavaDoc("(Offset " + offset + ")", e);
512             }
513         }
514         
515         /**
516          * A next that throws exceptions and has handling of
517          * recoverable exceptions moving us to next record. Can call
518          * hasNext which itself may throw exceptions.
519          * @return Next record.
520          * @throws IOException
521          * @throws RuntimeException Thrown when we've reached maximum
522          * retries.
523          */

524         protected ArchiveRecord exceptionNext()
525         throws IOException JavaDoc, RuntimeException JavaDoc {
526             ArchiveRecord result = null;
527             IOException JavaDoc ioe = null;
528             for (int i = MAX_ALLOWED_RECOVERABLES; i > 0 &&
529                     result == null; i--) {
530                 ioe = null;
531                 try {
532                     result = innerNext();
533                 } catch (RecoverableIOException e) {
534                     ioe = e;
535                     getLogger().warning(e.getMessage());
536                     if (hasNext()) {
537                         continue;
538                     }
539                     // No records left. Throw exception rather than
540
// return null. The caller is expecting to get
541
// back a record since they've just called
542
// hasNext.
543
break;
544                 }
545             }
546             if (ioe != null) {
547                 // Then we did MAX_ALLOWED_RECOVERABLES retries. Throw
548
// the recoverable ioe wrapped in a RuntimeException so
549
// it goes out pass checks for IOE.
550
throw new RuntimeException JavaDoc("Retried " +
551                     MAX_ALLOWED_RECOVERABLES + " times in a row", ioe);
552             }
553             return result;
554         }
555         
556         protected ArchiveRecord innerNext() throws IOException JavaDoc {
557             return get(((RepositionableStream)getInputStream()).position());
558         }
559         
560         public void remove() {
561             throw new UnsupportedOperationException JavaDoc();
562         }
563     }
564     
565     protected static String JavaDoc stripExtension(final String JavaDoc name,
566             final String JavaDoc ext) {
567         return (!name.endsWith(ext))? name:
568             name.substring(0, name.length() - ext.length());
569     }
570     
571     /**
572      * @return short name of Archive file.
573      */

574     public String JavaDoc getFileName() {
575         return (new File JavaDoc(getReaderIdentifier())).getName();
576     }
577
578     /**
579      * @return short name of Archive file.
580      */

581     public String JavaDoc getStrippedFileName() {
582         return getStrippedFileName(getFileName(),
583             getDotFileExtension());
584     }
585     
586     /**
587      * @param name Name of ARCFile.
588      * @param dotFileExtension '.arc' or '.warc', etc.
589      * @return short name of Archive file.
590      */

591     public static String JavaDoc getStrippedFileName(String JavaDoc name,
592             final String JavaDoc dotFileExtension) {
593         name = stripExtension(name,
594             ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION);
595         return stripExtension(name, dotFileExtension);
596     }
597     
598     /**
599      * @param value Value to test.
600      * @return True if value is 'true', else false.
601      */

602     protected static boolean getTrueOrFalse(final String JavaDoc value) {
603         if (value == null || value.length() <= 0) {
604             return false;
605         }
606         return Boolean.TRUE.toString().equals(value.toLowerCase());
607     }
608     
609     /**
610      * @param format Format to use outputting.
611      * @throws IOException
612      * @throws java.text.ParseException
613      * @return True if handled.
614      */

615     protected boolean output(final String JavaDoc format)
616     throws IOException JavaDoc, java.text.ParseException JavaDoc {
617         boolean result = true;
618         // long start = System.currentTimeMillis();
619

620         // Write output as pseudo-CDX file. See
621
// http://www.archive.org/web/researcher/cdx_legend.php
622
// and http://www.archive.org/web/researcher/example_cdx.php.
623
// Hash is hard-coded straight SHA-1 hash of content.
624
if (format.equals(DUMP)) {
625             // No point digesting dumping.
626
setDigest(false);
627             dump(false);
628         } else if (format.equals(GZIP_DUMP)) {
629             // No point digesting dumping.
630
setDigest(false);
631             dump(true);
632         } else if (format.equals(CDX)) {
633             cdxOutput(false);
634         } else if (format.equals(CDX_FILE)) {
635             cdxOutput(true);
636         } else {
637             result = false;
638         }
639         return result;
640     }
641     
642     protected void cdxOutput(boolean toFile)
643     throws IOException JavaDoc {
644         BufferedWriter JavaDoc cdxWriter = null;
645         if (toFile) {
646             String JavaDoc cdxFilename = stripExtension(getReaderIdentifier(),
647                 DOT_COMPRESSED_FILE_EXTENSION);
648             cdxFilename = stripExtension(cdxFilename, getDotFileExtension());
649             cdxFilename += ('.' + CDX);
650             cdxWriter = new BufferedWriter JavaDoc(new FileWriter JavaDoc(cdxFilename));
651         }
652         
653         String JavaDoc header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v")
654             + " n g";
655         if (toFile) {
656             cdxWriter.write(header);
657             cdxWriter.newLine();
658         } else {
659             System.out.println(header);
660         }
661         
662         String JavaDoc strippedFileName = getStrippedFileName();
663         try {
664             for (Iterator JavaDoc<ArchiveRecord> ii = iterator(); ii.hasNext();) {
665                 ArchiveRecord r = ii.next();
666                 if (toFile) {
667                     cdxWriter.write(r.outputCdx(strippedFileName));
668                     cdxWriter.newLine();
669                 } else {
670                     System.out.println(r.outputCdx(strippedFileName));
671                 }
672             }
673         } finally {
674             if (toFile) {
675                 cdxWriter.close();
676             }
677         }
678     }
679     
680     /**
681      * Output passed record using passed format specifier.
682      * @param format What format to use outputting.
683      * @throws IOException
684      * @return True if handled.
685      */

686     protected boolean outputRecord(final String JavaDoc format)
687     throws IOException JavaDoc {
688         boolean result = true;
689         if (format.equals(CDX)) {
690             System.out.println(get().outputCdx(getStrippedFileName()));
691         } else if(format.equals(ArchiveFileConstants.DUMP)) {
692             // No point digesting if dumping content.
693
setDigest(false);
694             get().dump();
695         } else {
696             result = false;
697         }
698         return result;
699     }
700
701     /**
702      * Dump this file on STDOUT
703      * @throws compress True if dumped output is compressed.
704      * @throws IOException
705      * @throws java.text.ParseException
706      */

707     public abstract void dump(final boolean compress)
708     throws IOException JavaDoc, java.text.ParseException JavaDoc;
709     
710     /**
711      * @return an ArchiveReader that will delete a local file on close. Used
712      * when we bring Archive files local and need to clean up afterward.
713      */

714     public abstract ArchiveReader getDeleteFileOnCloseReader(final File JavaDoc f);
715 }
716
Popular Tags