package org.archive.io.warc;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;

import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.archive.io.GzippedInputStream;
import org.archive.util.FileUtils;

/**
 * Factory that hands out {@link WARCReader} instances for files, URLs and
 * already-open streams.
 *
 * <p>All public entry points are static and delegate to a single private
 * factory instance. The concrete readers are private inner classes: one for
 * gzip-compressed input and one for uncompressed input.
 */
public class WARCReaderFactory extends ArchiveReaderFactory
        implements WARCConstants {
    /** The one factory instance every static accessor delegates to. */
    private static final WARCReaderFactory factory = new WARCReaderFactory();

    /**
     * Private so the only instance is {@link #factory}; use the static
     * {@code get} methods instead.
     */
    private WARCReaderFactory() {
        super();
    }

    /**
     * Gets a reader for the passed file path or URL string.
     *
     * @param arcFileOrUrl File path or URL, as a string.
     * @return The reader produced by the factory, cast to {@link WARCReader}.
     * @throws MalformedURLException Propagated from the underlying factory.
     * @throws IOException Propagated from the underlying factory.
     */
    public static WARCReader get(String arcFileOrUrl)
            throws MalformedURLException, IOException {
        return (WARCReader) WARCReaderFactory.factory
                .getArchiveReader(arcFileOrUrl);
    }

    /**
     * Gets a reader for the passed file.
     *
     * @param f File to read.
     * @return The reader produced by the factory, cast to {@link WARCReader}.
     * @throws IOException Propagated from the underlying factory.
     */
    public static WARCReader get(final File f) throws IOException {
        return (WARCReader) WARCReaderFactory.factory.getArchiveReader(f);
    }

    /**
     * Gets a reader for the passed file, opened at the given offset.
     *
     * @param f File to read.
     * @param offset Offset handed through to the reader's input stream.
     * @return The reader produced by the factory, cast to {@link WARCReader}.
     * @throws IOException If the file cannot be opened or is not a WARC.
     */
    public static WARCReader get(final File f, final long offset)
            throws IOException {
        return (WARCReader) WARCReaderFactory.factory
                .getArchiveReader(f, offset);
    }

    /**
     * Builds the reader matching the file's compression.
     *
     * <p>A file that tests as gzip-compressed gets a compressed reader
     * without further checks. Any other file must pass the extension and
     * magic check before an uncompressed reader is created.
     *
     * @param f File to read.
     * @param offset Offset handed through to the reader.
     * @return A compressed or uncompressed WARC reader.
     * @throws IOException If the file cannot be read, or is uncompressed and
     *         fails the WARC extension/magic check.
     */
    protected ArchiveReader getArchiveReader(final File f, final long offset)
            throws IOException {
        if (testCompressedWARCFile(f)) {
            return new CompressedWARCReader(f, offset);
        }
        if (!FileUtils.isReadableWithExtensionAndMagic(f,
                DOT_WARC_FILE_EXTENSION, WARC_MAGIC)) {
            throw new IOException(f.getAbsolutePath()
                + " is not a WARC file.");
        }
        return new UncompressedWARCReader(f, offset);
    }

    /**
     * Gets a reader over an already-open stream.
     *
     * @param s Identifier for the stream, passed to the reader's
     *        initialization.
     * @param is Stream to read from.
     * @param atFirstRecord Passed through to the reader constructor.
     * @return The reader produced by the factory.
     * @throws IOException If the reader cannot be set up on the stream.
     */
    public static ArchiveReader get(final String s, final InputStream is,
            final boolean atFirstRecord)
            throws IOException {
        return WARCReaderFactory.factory.getArchiveReader(s, is,
            atFirstRecord);
    }

    /**
     * Builds a reader over an already-open stream.
     *
     * @param f Identifier for the stream.
     * @param is Stream to read from.
     * @param atFirstRecord Passed through to the reader constructor.
     * @return A compressed WARC reader.
     * @throws IOException If the stream cannot be wrapped for gzip reading.
     */
    protected ArchiveReader getArchiveReader(final String f,
            final InputStream is, final boolean atFirstRecord)
            throws IOException {
        // NOTE(review): the stream is always treated as gzip-compressed; no
        // compression test is made here, unlike the File-based variant.
        return new CompressedWARCReader(f, is, atFirstRecord);
    }

    /**
     * Gets a reader for the passed URL, opened at the given offset.
     *
     * @param arcUrl URL of the WARC.
     * @param offset Offset handed through to the underlying factory.
     * @return The reader produced by the factory, cast to {@link WARCReader}.
     * @throws IOException Propagated from the underlying factory.
     */
    public static WARCReader get(final URL arcUrl, final long offset)
            throws IOException {
        return (WARCReader) WARCReaderFactory.factory.getArchiveReader(arcUrl,
            offset);
    }

    /**
     * Gets a reader for the passed URL.
     *
     * @param arcUrl URL of the WARC.
     * @return The reader produced by the factory, cast to {@link WARCReader}.
     * @throws IOException Propagated from the underlying factory.
     */
    public static WARCReader get(final URL arcUrl)
            throws IOException {
        return (WARCReader) WARCReaderFactory.factory.getArchiveReader(arcUrl);
    }

    /**
     * Tests whether the passed file starts as a gzip-compressed stream.
     *
     * @param f File to test.
     * @return True if {@link GzippedInputStream#isCompressedStream} reports
     *         the file's content as compressed.
     * @throws IOException If the file cannot be opened or read.
     */
    public static boolean testCompressedWARCFile(final File f)
            throws IOException {
        // Result is discarded; presumably this throws when the file is not
        // readable -- confirm against FileUtils.
        FileUtils.isReadable(f);
        final InputStream is = new FileInputStream(f);
        try {
            return GzippedInputStream.isCompressedStream(is);
        } finally {
            is.close();
        }
    }

    /**
     * Reader for WARC input that is not gzip-compressed.
     */
    private class UncompressedWARCReader extends WARCReader {
        /**
         * Opens the file from offset zero.
         *
         * @param f File to read.
         * @throws IOException If the file cannot be opened.
         */
        public UncompressedWARCReader(final File f)
                throws IOException {
            this(f, 0);
        }

        /**
         * Opens the file at the given offset.
         *
         * @param f File to read.
         * @param offset Offset passed to the input stream factory method.
         * @throws IOException If the file cannot be opened.
         */
        public UncompressedWARCReader(final File f, final long offset)
                throws IOException {
            setIn(getInputStream(f, offset));
            initialize(f.getAbsolutePath());
        }

        /**
         * Reads from an already-open stream.
         *
         * @param f Identifier for the stream.
         * @param is Stream to read from.
         */
        public UncompressedWARCReader(final String f, final InputStream is) {
            setIn(is);
            initialize(f);
        }
    }

    /**
     * Reader for gzip-compressed WARC input. The input stream it installs is
     * always a {@link GzippedInputStream}.
     */
    private class CompressedWARCReader extends WARCReader {
        /**
         * Opens the file from offset zero.
         *
         * @param f File to read.
         * @throws IOException If the file cannot be opened.
         */
        public CompressedWARCReader(final File f) throws IOException {
            this(f, 0);
        }

        /**
         * Opens the file at the given offset.
         *
         * @param f File to read.
         * @param offset Offset passed to the input stream factory method.
         * @throws IOException If the file cannot be opened or wrapped for
         *         gzip reading.
         */
        public CompressedWARCReader(final File f, final long offset)
                throws IOException {
            final InputStream raw = getInputStream(f, offset);
            boolean initialized = false;
            try {
                setIn(new GzippedInputStream(raw));
                // NOTE(review): the flag is only set when reading from the
                // start of the file; kept as-is, intent not visible here.
                setCompressed((offset == 0));
                initialize(f.getAbsolutePath());
                initialized = true;
            } finally {
                if (!initialized) {
                    // Construction failed, so nobody else holds the stream
                    // we opened: release it rather than leak the handle.
                    try {
                        raw.close();
                    } catch (IOException ignored) {
                        // Let the original failure propagate.
                    }
                }
            }
        }

        /**
         * Reads from an already-open stream. The caller's stream is wrapped,
         * not closed, if setup fails.
         *
         * @param f Identifier for the stream.
         * @param is Stream to read from.
         * @param atFirstRecord Not used by this constructor.
         * @throws IOException If the stream cannot be wrapped for gzip
         *         reading.
         */
        public CompressedWARCReader(final String f, final InputStream is,
                final boolean atFirstRecord)
                throws IOException {
            setIn(new GzippedInputStream(is));
            setCompressed(true);
            initialize(f);
        }

        /**
         * Gets the record at the given offset.
         *
         * <p>Cleans up the current record, seeks the gzip stream to the
         * member at {@code offset}, then creates a record from there.
         *
         * @param offset Offset of the gzip member to read.
         * @return The record created at that offset.
         * @throws IOException If cleanup, the seek or record creation fails.
         */
        public WARCRecord get(long offset) throws IOException {
            cleanupCurrentRecord();
            ((GzippedInputStream) getIn()).gzipMemberSeek(offset);
            return (WARCRecord) createArchiveRecord(getIn(), offset);
        }

        /**
         * Returns an iterator that yields one record per element of the
         * gzip stream's own iterator.
         *
         * @return Iterator over the records of this reader.
         */
        public Iterator<ArchiveRecord> iterator() {
            return new ArchiveRecordIterator() {
                private GzippedInputStream gis =
                    (GzippedInputStream) getInputStream();

                private Iterator<?> gzipIterator = this.gis.iterator();

                protected boolean innerHasNext() {
                    return this.gzipIterator.hasNext();
                }

                protected ArchiveRecord innerNext() throws IOException {
                    // Capture the position before advancing: it is the
                    // offset recorded for the record about to be created.
                    long p = this.gis.position();
                    InputStream is = (InputStream) this.gzipIterator.next();
                    return createArchiveRecord(is, p);
                }
            };
        }

        /**
         * Deliberately does nothing for this reader.
         *
         * @param rec Record that was just read; ignored.
         * @throws IOException Never thrown by this implementation.
         */
        protected void gotoEOR(ArchiveRecord rec) throws IOException {
            // Intentionally empty.
        }
    }

    /**
     * Tests whether the passed name ends with a WARC file extension,
     * compressed or uncompressed, ignoring case.
     *
     * @param f Name to test; may be null.
     * @return True if the lowercased name ends with
     *         {@code DOT_COMPRESSED_WARC_FILE_EXTENSION} or
     *         {@code DOT_WARC_FILE_EXTENSION}; false otherwise, including
     *         for null.
     */
    public static boolean isWARCSuffix(final String f) {
        if (f == null) {
            return false;
        }
        final String lowered = f.toLowerCase();
        return lowered.endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION)
            || lowered.endsWith(DOT_WARC_FILE_EXTENSION);
    }
}