KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > arc > ARCReaderFactory


/* ARCReaderFactory
 *
 * $Id: ARCReaderFactory.java,v 1.40.2.1 2007/01/13 01:31:36 stack-sf Exp $
 *
 * Created on May 1, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.io.arc;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.Locale;
import java.util.logging.Level;

import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.GzipHeader;
import org.archive.io.GzippedInputStream;
import org.archive.io.NoGzipMagicException;
import org.archive.util.FileUtils;


46 /**
47  * Factory that returns an ARCReader.
48  *
49  * Can handle compressed and uncompressed ARCs.
50  *
51  * @author stack
52  */

53 public class ARCReaderFactory extends ArchiveReaderFactory
54 implements ARCConstants {
55     /**
56      * This factory instance.
57      */

58     private static final ARCReaderFactory factory = new ARCReaderFactory();
59
60     /**
61      * Shutdown any access to default constructor.
62      */

63     protected ARCReaderFactory() {
64         super();
65     }
66     
67     public static ARCReader get(String JavaDoc arcFileOrUrl)
68     throws MalformedURLException JavaDoc, IOException JavaDoc {
69         return (ARCReader)ARCReaderFactory.factory.
70             getArchiveReader(arcFileOrUrl);
71     }
72     
73     public static ARCReader get(String JavaDoc arcFileOrUrl, final long offset)
74     throws MalformedURLException JavaDoc, IOException JavaDoc {
75         return (ARCReader)ARCReaderFactory.factory.
76             getArchiveReader(arcFileOrUrl, offset);
77     }
78     
79     public static ARCReader get(final File JavaDoc f) throws IOException JavaDoc {
80         return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f);
81     }
82     
83     public static ARCReader get(final File JavaDoc f, final long offset)
84     throws IOException JavaDoc {
85         return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f, offset);
86     }
87     
88     protected ArchiveReader getArchiveReader(final File JavaDoc f, final long offset)
89     throws IOException JavaDoc {
90         return getArchiveReader(f, true, offset);
91     }
92     
93     /**
94      * @param f An arcfile to read.
95      * @param skipSuffixTest Set to true if want to test that ARC has proper
96      * suffix. Use this method and pass <code>false</code> to open ARCs
97      * with the <code>.open</code> or otherwise suffix.
98      * @param offset Have returned ARCReader set to start reading at passed
99      * offset.
100      * @return An ARCReader.
101      * @throws IOException
102      */

103     public static ARCReader get(final File JavaDoc f,
104             final boolean skipSuffixTest, final long offset)
105     throws IOException JavaDoc {
106         return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f,
107             skipSuffixTest, offset);
108     }
109     
110     protected ArchiveReader getArchiveReader(final File JavaDoc arcFile,
111             final boolean skipSuffixTest, final long offset)
112     throws IOException JavaDoc {
113         boolean compressed = testCompressedARCFile(arcFile, skipSuffixTest);
114         if (!compressed) {
115             if (!FileUtils.isReadableWithExtensionAndMagic(arcFile,
116                     ARC_FILE_EXTENSION, ARC_MAGIC_NUMBER)) {
117                 throw new IOException JavaDoc(arcFile.getAbsolutePath() +
118                     " is not an Internet Archive ARC file.");
119             }
120         }
121         return compressed?
122             (ARCReader)ARCReaderFactory.factory.
123                 new CompressedARCReader(arcFile, offset):
124             (ARCReader)ARCReaderFactory.factory.
125                 new UncompressedARCReader(arcFile, offset);
126     }
127     
128     public static ArchiveReader get(final String JavaDoc s, final InputStream JavaDoc is,
129             final boolean atFirstRecord)
130     throws IOException JavaDoc {
131         return ARCReaderFactory.factory.getArchiveReader(s, is,
132             atFirstRecord);
133     }
134     
135     protected ArchiveReader getArchiveReader(final String JavaDoc arc,
136             final InputStream JavaDoc is, final boolean atFirstRecord)
137             throws IOException JavaDoc {
138         // For now, assume stream is compressed. Later add test of input
139
// stream or handle exception thrown when figure not compressed stream.
140
return new CompressedARCReader(arc, is, atFirstRecord);
141     }
142     
143     /**
144      * Get an ARCReader aligned at <code>offset</code>. This version of get
145      * will not bring the ARC local but will try to stream across the net making
146      * an HTTP 1.1 Range request on remote http server (RFC1435 Section 14.35).
147      *
148      * @param arcUrl HTTP URL for an ARC (All ARCs considered remote).
149      * @param offset Offset into ARC at which to start fetching.
150      * @return An ARCReader aligned at offset.
151      * @throws IOException
152      */

153     public static ARCReader get(final URL JavaDoc arcUrl, final long offset)
154     throws IOException JavaDoc {
155         return (ARCReader)ARCReaderFactory.factory.getArchiveReader(arcUrl,
156             offset);
157     }
158     
159     /**
160      * Get an ARCReader.
161      * Pulls the ARC local into whereever the System Property
162      * <code>java.io.tmpdir</code> points. It then hands back an ARCReader that
163      * points at this local copy. A close on this ARCReader instance will
164      * remove the local copy.
165      * @param arcUrl An URL that points at an ARC.
166      * @return An ARCReader.
167      * @throws IOException
168      */

169     public static ARCReader get(final URL JavaDoc arcUrl)
170     throws IOException JavaDoc {
171         return (ARCReader)ARCReaderFactory.factory.getArchiveReader(arcUrl);
172     }
173     
174     /**
175      * @param arcFile File to test.
176      * @return True if <code>arcFile</code> is compressed ARC.
177      * @throws IOException
178      */

179     public boolean isCompressed(File JavaDoc arcFile) throws IOException JavaDoc {
180         return testCompressedARCFile(arcFile);
181     }
182     
183     /**
184      * Check file is compressed and in ARC GZIP format.
185      *
186      * @param arcFile File to test if its Internet Archive ARC file
187      * GZIP compressed.
188      *
189      * @return True if this is an Internet Archive GZIP'd ARC file (It begins
190      * w/ the Internet Archive GZIP header and has the
191      * COMPRESSED_ARC_FILE_EXTENSION suffix).
192      *
193      * @exception IOException If file does not exist or is not unreadable.
194      */

195     public static boolean testCompressedARCFile(File JavaDoc arcFile)
196     throws IOException JavaDoc {
197         return testCompressedARCFile(arcFile, false);
198     }
199
200     /**
201      * Check file is compressed and in ARC GZIP format.
202      *
203      * @param arcFile File to test if its Internet Archive ARC file
204      * GZIP compressed.
205      * @param skipSuffixCheck Set to true if we're not to test on the
206      * '.arc.gz' suffix.
207      *
208      * @return True if this is an Internet Archive GZIP'd ARC file (It begins
209      * w/ the Internet Archive GZIP header).
210      *
211      * @exception IOException If file does not exist or is not unreadable.
212      */

213     public static boolean testCompressedARCFile(File JavaDoc arcFile,
214             boolean skipSuffixCheck)
215     throws IOException JavaDoc {
216         boolean compressedARCFile = false;
217         FileUtils.isReadable(arcFile);
218         if(!skipSuffixCheck && !arcFile.getName().toLowerCase()
219                 .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) {
220             return compressedARCFile;
221         }
222         
223         final InputStream JavaDoc is = new FileInputStream JavaDoc(arcFile);
224         try {
225             compressedARCFile = testCompressedARCStream(is);
226         } finally {
227             is.close();
228         }
229         return compressedARCFile;
230     }
231     
232     public static boolean isARCSuffix(final String JavaDoc arcName) {
233         return (arcName == null)?
234             false:
235             (arcName.toLowerCase().endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION))?
236                 true:
237                 (arcName.toLowerCase().endsWith(DOT_ARC_FILE_EXTENSION))?
238                 true: false;
239     }
240     
241     /**
242      * Tests passed stream is gzip stream by reading in the HEAD.
243      * Does not reposition the stream. That is left up to the caller.
244      * @param is An InputStream.
245      * @return True if compressed stream.
246      * @throws IOException
247      */

248     public static boolean testCompressedARCStream(final InputStream JavaDoc is)
249             throws IOException JavaDoc {
250         boolean compressedARCFile = false;
251         GzipHeader gh = null;
252         try {
253             gh = new GzipHeader(is);
254         } catch (NoGzipMagicException e ) {
255             return compressedARCFile;
256         }
257         
258         byte[] fextra = gh.getFextra();
259         // Now make sure following bytes are IA GZIP comment.
260
// First check length. ARC_GZIP_EXTRA_FIELD includes length
261
// so subtract two and start compare to ARC_GZIP_EXTRA_FIELD
262
// at +2.
263
if (fextra != null &&
264                 ARC_GZIP_EXTRA_FIELD.length - 2 == fextra.length) {
265             compressedARCFile = true;
266             for (int i = 0; i < fextra.length; i++) {
267                 if (fextra[i] != ARC_GZIP_EXTRA_FIELD[i + 2]) {
268                     compressedARCFile = false;
269                     break;
270                 }
271             }
272         }
273         return compressedARCFile;
274     }
275
276     /**
277      * Uncompressed arc file reader.
278      * @author stack
279      */

280     private class UncompressedARCReader extends ARCReader {
281         /**
282          * Constructor.
283          * @param f Uncompressed arcfile to read.
284          * @throws IOException
285          */

286         public UncompressedARCReader(final File JavaDoc f)
287         throws IOException JavaDoc {
288             this(f, 0);
289         }
290
291         /**
292          * Constructor.
293          *
294          * @param f Uncompressed arcfile to read.
295          * @param offset Offset at which to position ARCReader.
296          * @throws IOException
297          */

298         public UncompressedARCReader(final File JavaDoc f, final long offset)
299         throws IOException JavaDoc {
300             // Arc file has been tested for existence by time it has come
301
// to here.
302
setIn(getInputStream(f, offset));
303             initialize(f.getAbsolutePath());
304         }
305         
306         /**
307          * Constructor.
308          *
309          * @param f Uncompressed arc to read.
310          * @param is InputStream.
311          */

312         public UncompressedARCReader(final String JavaDoc f, final InputStream JavaDoc is) {
313             // Arc file has been tested for existence by time it has come
314
// to here.
315
setIn(is);
316             initialize(f);
317         }
318     }
319     
320     /**
321      * Compressed arc file reader.
322      *
323      * @author stack
324      */

325     private class CompressedARCReader extends ARCReader {
326
327         /**
328          * Constructor.
329          *
330          * @param f
331          * Compressed arcfile to read.
332          * @throws IOException
333          */

334         public CompressedARCReader(final File JavaDoc f) throws IOException JavaDoc {
335             this(f, 0);
336         }
337
338         /**
339          * Constructor.
340          *
341          * @param f Compressed arcfile to read.
342          * @param offset Position at where to start reading file.
343          * @throws IOException
344          */

345         public CompressedARCReader(final File JavaDoc f, final long offset)
346                 throws IOException JavaDoc {
347             // Arc file has been tested for existence by time it has come
348
// to here.
349
setIn(new GzippedInputStream(getInputStream(f, offset)));
350             setCompressed((offset == 0));
351             initialize(f.getAbsolutePath());
352         }
353         
354         /**
355          * Constructor.
356          *
357          * @param f Compressed arcfile.
358          * @param is InputStream to use.
359          * @throws IOException
360          */

361         public CompressedARCReader(final String JavaDoc f, final InputStream JavaDoc is,
362             final boolean atFirstRecord)
363         throws IOException JavaDoc {
364             // Arc file has been tested for existence by time it has come
365
// to here.
366
setIn(new GzippedInputStream(is));
367             setCompressed(true);
368             setAlignedOnFirstRecord(atFirstRecord);
369             initialize(f);
370         }
371         
372         /**
373          * Get record at passed <code>offset</code>.
374          *
375          * @param offset
376          * Byte index into arcfile at which a record starts.
377          * @return An ARCRecord reference.
378          * @throws IOException
379          */

380         public ARCRecord get(long offset) throws IOException JavaDoc {
381             cleanupCurrentRecord();
382             ((GzippedInputStream)getIn()).gzipMemberSeek(offset);
383             return createArchiveRecord(getIn(), offset);
384         }
385         
386         public Iterator JavaDoc<ArchiveRecord> iterator() {
387             /**
388              * Override ARCRecordIterator so can base returned iterator on
389              * GzippedInputStream iterator.
390              */

391             return new ArchiveRecordIterator() {
392                 private GzippedInputStream gis =
393                     (GzippedInputStream)getInputStream();
394
395                 private Iterator JavaDoc gzipIterator = this.gis.iterator();
396
397                 protected boolean innerHasNext() {
398                     return this.gzipIterator.hasNext();
399                 }
400
401                 protected ArchiveRecord innerNext() throws IOException JavaDoc {
402                     // Get the position before gzipIterator.next moves
403
// it on past the gzip header.
404
long p = this.gis.position();
405                     InputStream JavaDoc is = (InputStream JavaDoc) this.gzipIterator.next();
406                     return createArchiveRecord(is, p);
407                 }
408             };
409         }
410         
411         protected void gotoEOR(ArchiveRecord rec) throws IOException JavaDoc {
412             long skipped = ((GzippedInputStream)getIn()).
413                 gotoEOR(LINE_SEPARATOR);
414             if (skipped <= 0) {
415                 return;
416             }
417             // Report on system error the number of unexpected characters
418
// at the end of this record.
419
ArchiveRecordHeader meta = (getCurrentRecord() != null)?
420                 rec.getHeader(): null;
421             String JavaDoc message = "Record ENDING at " +
422                 ((GzippedInputStream)getIn()).position() +
423                 " has " + skipped + " trailing byte(s): " +
424                 ((meta != null)? meta.toString(): "");
425             if (isStrict()) {
426                 throw new IOException JavaDoc(message);
427             }
428             logStdErr(Level.WARNING, message);
429         }
430     }
431 }
Popular Tags