1 23 package org.archive.io; 24 25 import it.unimi.dsi.fastutil.io.RepositionableStream; 26 27 import java.io.File ; 28 import java.io.IOException ; 29 import java.io.InputStream ; 30 import java.net.HttpURLConnection ; 31 import java.net.MalformedURLException ; 32 import java.net.URL ; 33 import java.net.URLConnection ; 34 35 import org.archive.io.arc.ARCReaderFactory; 36 import org.archive.io.warc.WARCReaderFactory; 37 import org.archive.net.UURI; 38 import org.archive.net.md5.Md5URLConnection; 39 import org.archive.net.rsync.RsyncURLConnection; 40 import org.archive.util.FileUtils; 41 import org.archive.util.IoUtils; 42 43 44 50 public class ArchiveReaderFactory implements ArchiveFileConstants { 51 54 private final static int STREAM_ALL = -1; 55 56 private static final ArchiveReaderFactory factory = 57 new ArchiveReaderFactory(); 58 59 62 protected ArchiveReaderFactory() { 63 super(); 64 } 65 66 75 public static ArchiveReader get(final String arcFileOrUrl) 76 throws MalformedURLException , IOException { 77 return ArchiveReaderFactory.factory.getArchiveReader(arcFileOrUrl); 78 } 79 80 protected ArchiveReader getArchiveReader(final String arcFileOrUrl) 81 throws MalformedURLException , IOException { 82 return getArchiveReader(arcFileOrUrl, STREAM_ALL); 83 } 84 85 protected ArchiveReader getArchiveReader(final String arcFileOrUrl, 86 final long offset) 87 throws MalformedURLException , IOException { 88 return UURI.hasScheme(arcFileOrUrl)? 89 get(new URL (arcFileOrUrl), offset): 90 get(new File (arcFileOrUrl), offset); 91 } 92 93 98 public static ArchiveReader get(final File f) throws IOException { 99 return ArchiveReaderFactory.factory.getArchiveReader(f); 100 } 101 102 protected ArchiveReader getArchiveReader(final File f) 103 throws IOException { 104 return getArchiveReader(f, 0); 105 } 106 107 113 public static ArchiveReader get(final File f, final long offset) 114 throws IOException { 115 return ArchiveReaderFactory.factory.getArchiveReader(f, offset); 116 } 117 118 protected ArchiveReader getArchiveReader(final File f, 119 final long offset) 120 throws IOException { 121 if (ARCReaderFactory.isARCSuffix(f.getName())) { 122 return ARCReaderFactory.get(f, true, offset); 123 } else if (WARCReaderFactory.isWARCSuffix(f.getName())) { 124 return WARCReaderFactory.get(f, offset); 125 } 126 throw new IOException ("Unknown file extension (Not ARC nor WARC): " 127 + f.getName()); 128 } 129 130 142 public static ArchiveReader get(final String s, final InputStream is, 143 final boolean atFirstRecord) 144 throws IOException { 145 return ArchiveReaderFactory.factory.getArchiveReader(s, is, 146 atFirstRecord); 147 } 148 149 protected ArchiveReader getArchiveReader(final String id, 150 final InputStream is, final boolean atFirstRecord) 151 throws IOException { 152 InputStream stream = is; 153 if (!(stream instanceof RepositionableStream)) { 154 stream = new RepositionableInputStream(stream, 16 * 1024); 158 } 159 if (ARCReaderFactory.isARCSuffix(id)) { 160 return ARCReaderFactory.get(id, stream, atFirstRecord); 161 } else if (WARCReaderFactory.isWARCSuffix(id)) { 162 return WARCReaderFactory.get(id, stream, atFirstRecord); 163 } 164 throw new IOException ("Unknown extension (Not ARC nor WARC): " + id); 165 } 166 167 177 public static ArchiveReader get(final URL u, final long offset) 178 throws IOException { 179 return ArchiveReaderFactory.factory.getArchiveReader(u, offset); 180 } 181 182 protected ArchiveReader getArchiveReader(final URL f, final long offset) 183 throws IOException { 184 URLConnection connection = f.openConnection(); 186 if (!(connection instanceof HttpURLConnection )) { 187 throw new IOException ("This method only handles HTTP connections."); 188 } 189 addUserAgent((HttpURLConnection )connection); 190 if (offset != STREAM_ALL) { 191 connection.addRequestProperty("Range", "bytes=" + offset + "-"); 195 } 196 197 return getArchiveReader(f.toString(), connection.getInputStream(), 198 (offset == 0)); 199 } 200 201 211 public static ArchiveReader get(final URL u) 212 throws IOException { 213 return ArchiveReaderFactory.factory.getArchiveReader(u); 214 } 215 216 protected ArchiveReader getArchiveReader(final URL u) 217 throws IOException { 218 if (u.getPath() != null) { 220 File f = new File (u.getPath()); 222 if (f.exists()) { 223 return get(f, 0); 224 } 225 } 226 227 String scheme = u.getProtocol(); 228 if (scheme.startsWith("http") || scheme.equals("s3")) { 229 return get(u, STREAM_ALL); 233 } 234 235 return makeARCLocal(u.openConnection()); 236 } 237 238 protected ArchiveReader makeARCLocal(final URLConnection connection) 239 throws IOException { 240 File localFile = null; 241 if (connection instanceof HttpURLConnection ) { 242 String p = connection.getURL().getPath(); 244 int index = p.lastIndexOf('/'); 245 if (index >= 0) { 246 localFile = new File (FileUtils.TMPDIR, p.substring(index + 1)); 248 if (localFile.exists()) { 249 localFile.delete(); 253 } 254 } else { 255 localFile = File.createTempFile(ArchiveReader.class.getName(), 256 ".tmp", FileUtils.TMPDIR); 257 } 258 addUserAgent((HttpURLConnection )connection); 259 connection.connect(); 260 try { 261 IoUtils.readFullyToFile(connection.getInputStream(), localFile, 262 new byte[16 * 1024]); 263 } catch (IOException ioe) { 264 localFile.delete(); 265 throw ioe; 266 } 267 } else if (connection instanceof RsyncURLConnection) { 268 connection.connect(); 271 localFile = ((RsyncURLConnection)connection).getFile(); 272 } else if (connection instanceof Md5URLConnection) { 273 connection.connect(); 276 localFile = ((Md5URLConnection)connection).getFile(); 277 } else { 278 throw new UnsupportedOperationException ("No support for " + 279 connection); 280 } 281 282 ArchiveReader reader = null; 283 try { 284 reader = get(localFile, 0); 285 } catch (IOException e) { 286 localFile.delete(); 287 throw e; 288 } 289 290 return reader.getDeleteFileOnCloseReader(localFile); 292 } 293 294 protected void addUserAgent(final HttpURLConnection connection) { 295 connection.addRequestProperty("User-Agent", this.getClass().getName()); 296 } 297 298 303 protected boolean isCompressed(final File f) throws IOException { 304 return f.getName().toLowerCase(). 305 endsWith(DOT_COMPRESSED_FILE_EXTENSION); 306 } 307 } | Popular Tags |