KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > warc > WARCReaderFactory


/* $Id: WARCReaderFactory.java,v 1.2 2006/08/24 00:59:04 stack-sf Exp $
 *
 * Created Aug 22, 2006
 *
 * Copyright (C) 2006 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.io.warc;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.Locale;

import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.archive.io.GzippedInputStream;
import org.archive.util.FileUtils;

39 /**
40  * Factory for WARC Readers.
41  * Figures whether to give out a compressed file Reader or an uncompressed
42  * Reader.
43  * @author stack
44  * @version $Date: 2006/08/24 00:59:04 $ $Version$
45  */

46 public class WARCReaderFactory extends ArchiveReaderFactory
47 implements WARCConstants {
48     private static final WARCReaderFactory factory = new WARCReaderFactory();
49
50     /**
51      * Shutdown any access to default constructor.
52      * This factory is Singleton.
53      */

54     private WARCReaderFactory() {
55         super();
56     }
57     
58     public static WARCReader get(String JavaDoc arcFileOrUrl)
59     throws MalformedURLException JavaDoc, IOException JavaDoc {
60         return (WARCReader)WARCReaderFactory.factory.
61             getArchiveReader(arcFileOrUrl);
62     }
63     
64     public static WARCReader get(final File JavaDoc f) throws IOException JavaDoc {
65         return (WARCReader)WARCReaderFactory.factory.getArchiveReader(f);
66     }
67     
68     /**
69      * @param f An arcfile to read.
70      * @param offset Have returned Reader set to start reading at this offset.
71      * @return A WARCReader.
72      * @throws IOException
73      */

74     public static WARCReader get(final File JavaDoc f, final long offset)
75     throws IOException JavaDoc {
76         return (WARCReader)WARCReaderFactory.factory.
77             getArchiveReader(f, offset);
78     }
79     
80     protected ArchiveReader getArchiveReader(final File JavaDoc f, final long offset)
81     throws IOException JavaDoc {
82         boolean compressed = testCompressedWARCFile(f);
83         if (!compressed) {
84             if (!FileUtils.isReadableWithExtensionAndMagic(f,
85                     DOT_WARC_FILE_EXTENSION, WARC_MAGIC)) {
86                 throw new IOException JavaDoc(f.getAbsolutePath()
87                         + " is not a WARC file.");
88             }
89         }
90         return (WARCReader)(compressed?
91             WARCReaderFactory.factory.new CompressedWARCReader(f, offset):
92             WARCReaderFactory.factory.new UncompressedWARCReader(f, offset));
93     }
94     
95     public static ArchiveReader get(final String JavaDoc s, final InputStream JavaDoc is,
96             final boolean atFirstRecord)
97     throws IOException JavaDoc {
98         return WARCReaderFactory.factory.getArchiveReader(s, is,
99             atFirstRecord);
100     }
101     
102     protected ArchiveReader getArchiveReader(final String JavaDoc f,
103             final InputStream JavaDoc is, final boolean atFirstRecord)
104             throws IOException JavaDoc {
105         // For now, assume stream is compressed. Later add test of input
106
// stream or handle exception thrown when figure not compressed stream.
107
return new CompressedWARCReader(f, is, atFirstRecord);
108     }
109     
110     public static WARCReader get(final URL JavaDoc arcUrl, final long offset)
111     throws IOException JavaDoc {
112         return (WARCReader)WARCReaderFactory.factory.getArchiveReader(arcUrl,
113             offset);
114     }
115     
116     /**
117      * Get an ARCReader.
118      * Pulls the ARC local into whereever the System Property
119      * <code>java.io.tmpdir</code> points. It then hands back an ARCReader that
120      * points at this local copy. A close on this ARCReader instance will
121      * remove the local copy.
122      * @param arcUrl An URL that points at an ARC.
123      * @return An ARCReader.
124      * @throws IOException
125      */

126     public static WARCReader get(final URL JavaDoc arcUrl)
127     throws IOException JavaDoc {
128         return (WARCReader)WARCReaderFactory.factory.getArchiveReader(arcUrl);
129     }
130     
131     /**
132      * Check file is compressed WARC.
133      *
134      * @param f File to test.
135      *
136      * @return True if this is compressed WARC (TODO: Just tests if file is
137      * GZIP'd file (It begins w/ GZIP MAGIC)).
138      *
139      * @exception IOException If file does not exist or is not unreadable.
140      */

141     public static boolean testCompressedWARCFile(final File JavaDoc f)
142     throws IOException JavaDoc {
143         FileUtils.isReadable(f);
144         boolean compressed = false;
145         final InputStream JavaDoc is = new FileInputStream JavaDoc(f);
146         try {
147             compressed = GzippedInputStream.isCompressedStream(is);
148         } finally {
149             is.close();
150         }
151         return compressed;
152     }
153
154     /**
155      * Uncompressed WARC file reader.
156      * @author stack
157      */

158     private class UncompressedWARCReader extends WARCReader {
159         /**
160          * Constructor.
161          * @param f Uncompressed arcfile to read.
162          * @throws IOException
163          */

164         public UncompressedWARCReader(final File JavaDoc f)
165         throws IOException JavaDoc {
166             this(f, 0);
167         }
168
169         /**
170          * Constructor.
171          *
172          * @param f Uncompressed file to read.
173          * @param offset Offset at which to position Reader.
174          * @throws IOException
175          */

176         public UncompressedWARCReader(final File JavaDoc f, final long offset)
177         throws IOException JavaDoc {
178             // File has been tested for existence by time it has come to here.
179
setIn(getInputStream(f, offset));
180             initialize(f.getAbsolutePath());
181         }
182         
183         /**
184          * Constructor.
185          *
186          * @param f Uncompressed file to read.
187          * @param is InputStream.
188          */

189         public UncompressedWARCReader(final String JavaDoc f, final InputStream JavaDoc is) {
190             // Arc file has been tested for existence by time it has come
191
// to here.
192
setIn(is);
193             initialize(f);
194         }
195     }
196     
197     /**
198      * Compressed WARC file reader.
199      *
200      * @author stack
201      */

202     private class CompressedWARCReader extends WARCReader {
203         /**
204          * Constructor.
205          *
206          * @param f Compressed file to read.
207          * @throws IOException
208          */

209         public CompressedWARCReader(final File JavaDoc f) throws IOException JavaDoc {
210             this(f, 0);
211         }
212
213         /**
214          * Constructor.
215          *
216          * @param f Compressed arcfile to read.
217          * @param offset Position at where to start reading file.
218          * @throws IOException
219          */

220         public CompressedWARCReader(final File JavaDoc f, final long offset)
221                 throws IOException JavaDoc {
222             // File has been tested for existence by time it has come to here.
223
setIn(new GzippedInputStream(getInputStream(f, offset)));
224             setCompressed((offset == 0));
225             initialize(f.getAbsolutePath());
226         }
227         
228         /**
229          * Constructor.
230          *
231          * @param f Compressed arcfile.
232          * @param is InputStream to use.
233          * @param atFirstRecord
234          * @throws IOException
235          */

236         public CompressedWARCReader(final String JavaDoc f, final InputStream JavaDoc is,
237             final boolean atFirstRecord)
238         throws IOException JavaDoc {
239             // Arc file has been tested for existence by time it has come
240
// to here.
241
setIn(new GzippedInputStream(is));
242             setCompressed(true);
243             initialize(f);
244             // TODO: Ignore atFirstRecord. Probably doesn't apply in WARC world.
245
}
246         
247         /**
248          * Get record at passed <code>offset</code>.
249          *
250          * @param offset Byte index into file at which a record starts.
251          * @return A WARCRecord reference.
252          * @throws IOException
253          */

254         public WARCRecord get(long offset) throws IOException JavaDoc {
255             cleanupCurrentRecord();
256             ((GzippedInputStream)getIn()).gzipMemberSeek(offset);
257             return (WARCRecord) createArchiveRecord(getIn(), offset);
258         }
259         
260         public Iterator JavaDoc<ArchiveRecord> iterator() {
261             /**
262              * Override ArchiveRecordIterator so can base returned iterator on
263              * GzippedInputStream iterator.
264              */

265             return new ArchiveRecordIterator() {
266                 private GzippedInputStream gis =
267                     (GzippedInputStream)getInputStream();
268
269                 private Iterator JavaDoc gzipIterator = this.gis.iterator();
270
271                 protected boolean innerHasNext() {
272                     return this.gzipIterator.hasNext();
273                 }
274
275                 protected ArchiveRecord innerNext() throws IOException JavaDoc {
276                     // Get the positoin before gzipIterator.next moves
277
// it on past the gzip header.
278
long p = this.gis.position();
279                     InputStream JavaDoc is = (InputStream JavaDoc) this.gzipIterator.next();
280                     return createArchiveRecord(is, p);
281                 }
282             };
283         }
284         
285         protected void gotoEOR(ArchiveRecord rec) throws IOException JavaDoc {
286             // TODO
287
}
288     }
289     
290     public static boolean isWARCSuffix(final String JavaDoc f) {
291         return (f == null)?
292             false:
293             (f.toLowerCase().endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION))?
294                 true:
295                 (f.toLowerCase().endsWith(DOT_WARC_FILE_EXTENSION))?
296                 true: false;
297     }
298 }
Popular Tags