KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > ArchiveReaderFactory


/* $Id: ArchiveReaderFactory.java,v 1.2.2.1 2007/01/13 01:31:31 stack-sf Exp $
 *
 * Created on August 18th, 2006
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

23 package org.archive.io;
24
import it.unimi.dsi.fastutil.io.RepositionableStream;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Locale;

import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.warc.WARCReaderFactory;
import org.archive.net.UURI;
import org.archive.net.md5.Md5URLConnection;
import org.archive.net.rsync.RsyncURLConnection;
import org.archive.util.FileUtils;
import org.archive.util.IoUtils;
42
43
44 /**
45  * Factory that returns an Archive file Reader.
46  * Returns Readers for ARCs or WARCs.
47  * @author stack
48  * @version $Date: 2007/01/13 01:31:31 $ $Revision: 1.2.2.1 $
49  */

50 public class ArchiveReaderFactory implements ArchiveFileConstants {
51     /**
52      * Offset value for when we want to stream all.
53      */

54     private final static int STREAM_ALL = -1;
55     
56     private static final ArchiveReaderFactory factory =
57         new ArchiveReaderFactory();
58     
59     /**
60      * Shutdown any public access to default constructor.
61      */

62     protected ArchiveReaderFactory() {
63         super();
64     }
65     
66     /**
67      * Get an Archive file Reader on passed path or url.
68      * Does primitive heuristic figuring if path or URL.
69      * @param arcFileOrUrl File path or URL pointing at an Archive file.
70      * @return An Archive file Reader.
71      * @throws IOException
72      * @throws MalformedURLException
73      * @throws IOException
74      */

75     public static ArchiveReader get(final String JavaDoc arcFileOrUrl)
76     throws MalformedURLException JavaDoc, IOException JavaDoc {
77         return ArchiveReaderFactory.factory.getArchiveReader(arcFileOrUrl);
78     }
79     
80     protected ArchiveReader getArchiveReader(final String JavaDoc arcFileOrUrl)
81     throws MalformedURLException JavaDoc, IOException JavaDoc {
82         return getArchiveReader(arcFileOrUrl, STREAM_ALL);
83     }
84     
85     protected ArchiveReader getArchiveReader(final String JavaDoc arcFileOrUrl,
86         final long offset)
87     throws MalformedURLException JavaDoc, IOException JavaDoc {
88         return UURI.hasScheme(arcFileOrUrl)?
89             get(new URL JavaDoc(arcFileOrUrl), offset):
90                 get(new File JavaDoc(arcFileOrUrl), offset);
91     }
92     
93     /**
94      * @param f An Archive file to read.
95      * @return An ArchiveReader
96      * @throws IOException
97      */

98     public static ArchiveReader get(final File JavaDoc f) throws IOException JavaDoc {
99         return ArchiveReaderFactory.factory.getArchiveReader(f);
100     }
101     
102     protected ArchiveReader getArchiveReader(final File JavaDoc f)
103     throws IOException JavaDoc {
104         return getArchiveReader(f, 0);
105     }
106     
107     /**
108      * @param f An Archive file to read.
109      * @param offset Have returned Reader set to start reading at this offset.
110      * @return An ArchiveReader
111      * @throws IOException
112      */

113     public static ArchiveReader get(final File JavaDoc f, final long offset)
114     throws IOException JavaDoc {
115         return ArchiveReaderFactory.factory.getArchiveReader(f, offset);
116     }
117     
118     protected ArchiveReader getArchiveReader(final File JavaDoc f,
119         final long offset)
120     throws IOException JavaDoc {
121         if (ARCReaderFactory.isARCSuffix(f.getName())) {
122             return ARCReaderFactory.get(f, true, offset);
123         } else if (WARCReaderFactory.isWARCSuffix(f.getName())) {
124             return WARCReaderFactory.get(f, offset);
125         }
126         throw new IOException JavaDoc("Unknown file extension (Not ARC nor WARC): "
127             + f.getName());
128     }
129     
130     /**
131      * Wrap a Reader around passed Stream.
132      * @param s Identifying String for this Stream used in error messages.
133      * Must be a string that ends with the name of the file we're to put
134      * an ArchiveReader on. This code looks at file endings to figure
135      * whether to return an ARC or WARC reader.
136      * @param is Stream. Stream will be wrapped with implementation of
137      * RepositionableStream unless already supported.
138      * @param atFirstRecord Are we at first Record?
139      * @return ArchiveReader.
140      * @throws IOException
141      */

142     public static ArchiveReader get(final String JavaDoc s, final InputStream JavaDoc is,
143         final boolean atFirstRecord)
144     throws IOException JavaDoc {
145         return ArchiveReaderFactory.factory.getArchiveReader(s, is,
146             atFirstRecord);
147     }
148     
149     protected ArchiveReader getArchiveReader(final String JavaDoc id,
150             final InputStream JavaDoc is, final boolean atFirstRecord)
151     throws IOException JavaDoc {
152         InputStream JavaDoc stream = is;
153         if (!(stream instanceof RepositionableStream)) {
154             // RepositionableInputStream calls mark on each read so can
155
// back up at least the read amount. Needed for gzip inflater
156
// overinflations reading into the next gzip member.
157
stream = new RepositionableInputStream(stream, 16 * 1024);
158         }
159         if (ARCReaderFactory.isARCSuffix(id)) {
160             return ARCReaderFactory.get(id, stream, atFirstRecord);
161         } else if (WARCReaderFactory.isWARCSuffix(id)) {
162             return WARCReaderFactory.get(id, stream, atFirstRecord);
163         }
164         throw new IOException JavaDoc("Unknown extension (Not ARC nor WARC): " + id);
165     }
166     
167     /**
168      * Get an Archive Reader aligned at <code>offset</code>.
169      * This version of get will not bring the file local but will try to
170      * stream across the net making an HTTP 1.1 Range request on remote
171      * http server (RFC1435 Section 14.35).
172      * @param u HTTP URL for an Archive file.
173      * @param offset Offset into file at which to start fetching.
174      * @return An ArchiveReader aligned at offset.
175      * @throws IOException
176      */

177     public static ArchiveReader get(final URL JavaDoc u, final long offset)
178     throws IOException JavaDoc {
179         return ArchiveReaderFactory.factory.getArchiveReader(u, offset);
180     }
181     
182     protected ArchiveReader getArchiveReader(final URL JavaDoc f, final long offset)
183     throws IOException JavaDoc {
184         // Get URL connection.
185
URLConnection JavaDoc connection = f.openConnection();
186         if (!(connection instanceof HttpURLConnection JavaDoc)) {
187             throw new IOException JavaDoc("This method only handles HTTP connections.");
188         }
189         addUserAgent((HttpURLConnection JavaDoc)connection);
190         if (offset != STREAM_ALL) {
191             // Use a Range request (Assumes HTTP 1.1 on other end). If
192
// length >= 0, add open-ended range header to the request. Else,
193
// because end-byte is inclusive, subtract 1.
194
connection.addRequestProperty("Range", "bytes=" + offset + "-");
195         }
196         
197         return getArchiveReader(f.toString(), connection.getInputStream(),
198             (offset == 0));
199     }
200     
201     /**
202      * Get an ARCReader.
203      * Pulls the ARC local into whereever the System Property
204      * <code>java.io.tmpdir</code> points. It then hands back an ARCReader that
205      * points at this local copy. A close on this ARCReader instance will
206      * remove the local copy.
207      * @param u An URL that points at an ARC.
208      * @return An ARCReader.
209      * @throws IOException
210      */

211     public static ArchiveReader get(final URL JavaDoc u)
212     throws IOException JavaDoc {
213         return ArchiveReaderFactory.factory.getArchiveReader(u);
214     }
215     
216     protected ArchiveReader getArchiveReader(final URL JavaDoc u)
217     throws IOException JavaDoc {
218         // If url represents a local file then return file it points to.
219
if (u.getPath() != null) {
220             // TODO: Add scheme check and host check.
221
File JavaDoc f = new File JavaDoc(u.getPath());
222             if (f.exists()) {
223                 return get(f, 0);
224             }
225         }
226        
227         String JavaDoc scheme = u.getProtocol();
228         if (scheme.startsWith("http") || scheme.equals("s3")) {
229             // Try streaming if http or s3 URLs rather than copying local
230
// and then reading (Passing an offset will get us an Reader
231
// that wraps a Stream).
232
return get(u, STREAM_ALL);
233         }
234         
235         return makeARCLocal(u.openConnection());
236     }
237     
238     protected ArchiveReader makeARCLocal(final URLConnection JavaDoc connection)
239     throws IOException JavaDoc {
240         File JavaDoc localFile = null;
241         if (connection instanceof HttpURLConnection JavaDoc) {
242             // If http url connection, bring down the resource local.
243
String JavaDoc p = connection.getURL().getPath();
244             int index = p.lastIndexOf('/');
245             if (index >= 0) {
246                 // Name file for the file we're making local.
247
localFile = new File JavaDoc(FileUtils.TMPDIR, p.substring(index + 1));
248                 if (localFile.exists()) {
249                     // If file of same name already exists in TMPDIR, then
250
// clean it up (Assuming only reason a file of same name in
251
// TMPDIR is because we failed a previous download).
252
localFile.delete();
253                 }
254             } else {
255                 localFile = File.createTempFile(ArchiveReader.class.getName(),
256                     ".tmp", FileUtils.TMPDIR);
257             }
258             addUserAgent((HttpURLConnection JavaDoc)connection);
259             connection.connect();
260             try {
261                 IoUtils.readFullyToFile(connection.getInputStream(), localFile,
262                     new byte[16 * 1024]);
263             } catch (IOException JavaDoc ioe) {
264                 localFile.delete();
265                 throw ioe;
266             }
267         } else if (connection instanceof RsyncURLConnection) {
268             // Then, connect and this will create a local file.
269
// See implementation of the rsync handler.
270
connection.connect();
271             localFile = ((RsyncURLConnection)connection).getFile();
272         } else if (connection instanceof Md5URLConnection) {
273             // Then, connect and this will create a local file.
274
// See implementation of the md5 handler.
275
connection.connect();
276             localFile = ((Md5URLConnection)connection).getFile();
277         } else {
278             throw new UnsupportedOperationException JavaDoc("No support for " +
279                 connection);
280         }
281         
282         ArchiveReader reader = null;
283         try {
284             reader = get(localFile, 0);
285         } catch (IOException JavaDoc e) {
286             localFile.delete();
287             throw e;
288         }
289         
290         // Return a delegate that does cleanup of downloaded file on close.
291
return reader.getDeleteFileOnCloseReader(localFile);
292     }
293     
294     protected void addUserAgent(final HttpURLConnection JavaDoc connection) {
295         connection.addRequestProperty("User-Agent", this.getClass().getName());
296     }
297     
298     /**
299      * @param f File to test.
300      * @return True if <code>f</code> is compressed.
301      * @throws IOException
302      */

303     protected boolean isCompressed(final File JavaDoc f) throws IOException JavaDoc {
304         return f.getName().toLowerCase().
305             endsWith(DOT_COMPRESSED_FILE_EXTENSION);
306     }
307 }
Popular Tags