KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > GzippedInputStream


/* GzippedInputStream
 *
 * $Id: GzippedInputStream.java,v 1.19.2.1 2007/01/13 01:31:32 stack-sf Exp $
 *
 * Created on July 5, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

25 package org.archive.io;
26
27 import it.unimi.dsi.fastutil.io.RepositionableStream;
28
29 import java.io.ByteArrayOutputStream JavaDoc;
30 import java.io.IOException JavaDoc;
31 import java.io.InputStream JavaDoc;
32 import java.util.Iterator JavaDoc;
33 import java.util.zip.Deflater JavaDoc;
34 import java.util.zip.GZIPInputStream JavaDoc;
35 import java.util.zip.GZIPOutputStream JavaDoc;
36 import java.util.zip.Inflater JavaDoc;
37
38
39 /**
40  * Subclass of GZIPInputStream that can handle a stream made of multiple
41  * concatenated GZIP members/records.
42  *
43  * This class is needed because GZIPInputStream only finds the first GZIP
44  * member in the file even if the file is made up of multiple GZIP members.
45  *
46  * <p>Takes an InputStream stream that implements
47  * {@link RepositionableStream} interface so it can backup over-reads done
48  * by the zlib Inflater class.
49  *
50  * <p>Use the {@link #iterator()} method to get a gzip member iterator.
51  * Calls to {@link Iterator#next()} returns the next gzip member in the
52  * stream. Cast return from {@link Iterator#next()} to InputStream.
53  *
54  * <p>Use {@link #gzipMemberSeek(long)} to position stream before reading
55  * a gzip member if doing random accessing of gzip members. Pass it offset
56  * at which gzip member starts.
57  *
58  * <p>If you need to know position at which a gzip member starts, call
59  * {@link #position()} just after a call to {@link Iterator#hasNext()}
60  * and before you call {@link Iterator#next()}.
61  *
62  * @author stack
63  */

64 public class GzippedInputStream
65 extends GZIPInputStream JavaDoc
66 implements RepositionableStream {
67     /**
68      * Tail on gzip members (The CRC).
69      */

70     private static final int GZIP_TRAILER_LENGTH = 8;
71     
72     /**
73      * Utility class used probing for gzip members in stream.
74      * We need this instance to get at the readByte method.
75      */

76     private final GzipHeader gzipHeader = new GzipHeader();
77     
78     /**
79      * Buffer size used skipping over gzip members.
80      */

81     private static final int LINUX_PAGE_SIZE = 4 * 1024;
82     
83     private final long initialOffset;
84     
85     public GzippedInputStream(InputStream JavaDoc is) throws IOException JavaDoc {
86         // Have buffer match linux page size.
87
this(is, LINUX_PAGE_SIZE);
88     }
89     
90     /**
91      * @param is An InputStream that implements RespositionableStream and
92      * returns <code>true</code> when we call
93      * {@link InputStream#markSupported()} (Latter is needed so can setup
94      * an {@link Iterator} against the Gzip stream).
95      * @param size Size of blocks to use reading.
96      * @throws IOException
97      */

98     public GzippedInputStream(final InputStream JavaDoc is, final int size)
99     throws IOException JavaDoc {
100         super(checkStream(is), size);
101         if (!is.markSupported()) {
102             throw new IllegalArgumentException JavaDoc("GzippedInputStream requires " +
103                 "a markable stream");
104         }
105         if (!(is instanceof RepositionableStream)) {
106             throw new IllegalArgumentException JavaDoc("GzippedInputStream requires " +
107             "a stream that implements RepositionableStream");
108         }
109         // We need to calculate the absolute offset of the current
110
// GZIP Member. Its almost always going to be zero but not
111
// always (We may have been passed a stream that is already part
112
// ways through a stream of GZIP Members). So, getting
113
// absolute offset is not exactly straight-forward. The super
114
// class, GZIPInputStream on construction reads in the GZIP Header
115
// which is a pain because I then do not know the absolute offset
116
// at which the GZIP record began. So, the call above to checkStream()
117
// marked the stream before passing it to the super calls. Then
118
// below we get current postion at just past the GZIP Header, call
119
// reset so we go back to the absolute start of the GZIP Member in
120
// the file, record the offset for later should we need to start
121
// over again in this file -- i.e. we're asked to get an iterator
122
// from Record zero on -- then we move the file position to just
123
// after the GZIP Header again so we're again aligned for inflation
124
// of the current record.
125
long afterGZIPHeader = ((RepositionableStream)is).position();
126         is.reset();
127         this.initialOffset = ((RepositionableStream)is).position();
128         ((RepositionableStream)is).position(afterGZIPHeader);
129     }
130     
131     protected static InputStream JavaDoc checkStream(final InputStream JavaDoc is)
132     throws IOException JavaDoc {
133         if (is instanceof RepositionableStream) {
134             // See note above in constructor on why the mark here.
135
// Also minimal gzip header is 10. IA GZIP Headers are 20 bytes.
136
// Multiply by 4 in case extra info in the header.
137
is.mark(GzipHeader.MINIMAL_GZIP_HEADER_LENGTH * 4);
138             return is;
139         }
140         throw new IOException JavaDoc("Passed stream does not" +
141             " implement PositionableStream");
142     }
143     
144     /**
145      * Exhaust current GZIP member content.
146      * Call this method when you think you're on the end of the
147      * GZIP member. It will clean out any dross.
148      * @param ignore Character to ignore counting characters (Usually
149      * trailing new lines).
150      * @return Count of characters skipped over.
151      * @throws IOException
152      */

153     public long gotoEOR(int ignore) throws IOException JavaDoc {
154         long bytesSkipped = 0;
155         if (this.inf.getTotalIn() <= 0) {
156             return bytesSkipped;
157         }
158         if (!this.inf.finished()) {
159             int read = 0;
160             while ((read = read()) != -1) {
161                 if ((byte)read == (byte)ignore) {
162                     continue;
163                 }
164                 bytesSkipped = gotoEOR() + 1;
165                 break;
166             }
167         }
168         return bytesSkipped;
169     }
170     
171     /**
172      * Exhaust current GZIP member content.
173      * Call this method when you think you're on the end of the
174      * GZIP member. It will clean out any dross.
175      * @return Count of characters skipped over.
176      * @throws IOException
177      */

178     public long gotoEOR() throws IOException JavaDoc {
179         long bytesSkipped = 0;
180         if (this.inf.getTotalIn() <= 0) {
181             return bytesSkipped;
182         }
183         while(!this.inf.finished()) {
184             bytesSkipped += skip(Long.MAX_VALUE);
185         }
186         return bytesSkipped;
187     }
188     
189     /**
190      * Returns a GZIP Member Iterator.
191      * Has limitations. Can only get one Iterator per instance of this class;
192      * you must get new instance if you want to get Iterator again.
193      * @return Iterator over GZIP Members.
194      */

195     public Iterator JavaDoc iterator() {
196         try {
197             // We know its a RepositionableStream else we'd have failed
198
// construction. On iterator construction, set file back to
199
// initial position so we're ready to read GZIP Members
200
// (May not always work dependent on how the
201
// RepositionableStream was implemented).
202
((RepositionableStream)this.in).position(this.initialOffset);
203         } catch (IOException JavaDoc e) {
204             throw new RuntimeException JavaDoc(e);
205         }
206         return new Iterator JavaDoc() {
207             private GzippedInputStream compressedStream =
208                 GzippedInputStream.this;
209             
210             public boolean hasNext() {
211                 try {
212                     gotoEOR();
213                 } catch (IOException JavaDoc e) {
214                     throw new RuntimeException JavaDoc(e);
215                 }
216                 return moveToNextGzipMember();
217             }
218             
219             /**
220              * @return An InputStream onto a GZIP Member.
221              */

222             public Object JavaDoc next() {
223                 try {
224                     gzipMemberSeek();
225                 } catch (IOException JavaDoc e) {
226                     throw new RuntimeException JavaDoc("Failed move to EOR or " +
227                         "failed header read: " + e.getMessage());
228                 }
229                 return this.compressedStream;
230             }
231             
232             public void remove() {
233                 throw new UnsupportedOperationException JavaDoc();
234             }
235         };
236     }
237     
238     /**
239      * @return True if we found another record in the stream.
240      */

241     protected boolean moveToNextGzipMember() {
242         boolean result = false;
243         // Move to the next gzip member, if there is one, positioning
244
// ourselves by backing up the stream so we reread any inflater
245
// remaining bytes. Then add 8 bytes to get us past the GZIP
246
// CRC trailer block that ends all gzip members.
247
try {
248             RepositionableStream ps = (RepositionableStream)getInputStream();
249             // 8 is sizeof gzip CRC block thats on tail of gzipped
250
// record. If remaining is < 8 then experience indicates
251
// we're seeking past the gzip header -- don't backup the
252
// stream.
253
if (getInflater().getRemaining() > GZIP_TRAILER_LENGTH) {
254                 ps.position(position() - getInflater().getRemaining() +
255                     GZIP_TRAILER_LENGTH);
256             }
257             for (int read = -1, headerRead = 0; true; headerRead = 0) {
258                 // Give a hint to underlying stream that we're going to want to
259
// do some backing up.
260
getInputStream().mark(3);
261                 if ((read = getInputStream().read()) == -1) {
262                     break;
263                 }
264                 if(compareBytes(read, GZIPInputStream.GZIP_MAGIC)) {
265                     headerRead++;
266                     if ((read = getInputStream().read()) == -1) {
267                         break;
268                     }
269                     if(compareBytes(read, GZIPInputStream.GZIP_MAGIC >> 8)) {
270                         headerRead++;
271                         if ((read = getInputStream().read()) == -1) {
272                             break;
273                         }
274                         if (compareBytes(read, Deflater.DEFLATED)) {
275                             headerRead++;
276                             // Found gzip header. Backup the stream the
277
// bytes we just found and set result true.
278
getInputStream().reset();
279                             result = true;
280                             break;
281                         }
282                     }
283                     // Didn't find gzip header. Reset stream but one byte
284
// futher on then redo header tests.
285
ps.position(ps.position() - headerRead);
286                 }
287             }
288         } catch (IOException JavaDoc e) {
289             throw new RuntimeException JavaDoc("Failed i/o: " + e.getMessage());
290         }
291         return result;
292     }
293     
294     protected boolean compareBytes(final int a, final int b) {
295         return ((byte)(a & 0xff)) == ((byte)(b & 0xff));
296     }
297   
298     protected Inflater JavaDoc getInflater() {
299         return this.inf;
300     }
301     
302     protected InputStream JavaDoc getInputStream() {
303         return this.in;
304     }
305     
306     protected GzipHeader getGzipHeader() {
307         return this.gzipHeader;
308     }
309     
310     /**
311      * Move to next gzip member in the file.
312      */

313     protected void resetInflater() {
314         this.eos = false;
315         this.inf.reset();
316     }
317     
318     /**
319      * Read in the gzip header.
320      * @throws IOException
321      */

322     protected void readHeader() throws IOException JavaDoc {
323         new GzipHeader(this.in);
324         // Reset the crc for subsequent reads.
325
this.crc.reset();
326     }
327
328     /**
329      * Seek to passed offset.
330      *
331      * After positioning the stream, it resets the inflater.
332      * Assumption is that public use of this method is only
333      * to position stream at start of a gzip member.
334      *
335      * @param position Absolute position of a gzip member start.
336      * @throws IOException
337      */

338     public void position(long position) throws IOException JavaDoc {
339         ((RepositionableStream)this.in).position(position);
340         resetInflater();
341     }
342
343     public long position() throws IOException JavaDoc {
344        return ((RepositionableStream)this.in).position();
345     }
346     
347     /**
348      * Seek to a gzip member.
349      *
350      * Moves stream to new position, resets inflater and reads in the gzip
351      * header ready for subsequent calls to read.
352      *
353      * @param position Absolute position of a gzip member start.
354      * @throws IOException
355      */

356     public void gzipMemberSeek(long position) throws IOException JavaDoc {
357         position(position);
358         readHeader();
359     }
360     
361     public void gzipMemberSeek() throws IOException JavaDoc {
362         gzipMemberSeek(position());
363     }
364     
365     /**
366      * Gzip passed bytes.
367      * Use only when bytes is small.
368      * @param bytes What to gzip.
369      * @return A gzip member of bytes.
370      * @throws IOException
371      */

372     public static byte [] gzip(byte [] bytes) throws IOException JavaDoc {
373         ByteArrayOutputStream JavaDoc baos = new ByteArrayOutputStream JavaDoc();
374         GZIPOutputStream JavaDoc gzipOS = new GZIPOutputStream JavaDoc(baos);
375         gzipOS.write(bytes, 0, bytes.length);
376         gzipOS.close();
377         return baos.toByteArray();
378     }
379     
380     /**
381      * Tests passed stream is GZIP stream by reading in the HEAD.
382      * Does reposition of stream when done.
383      * @param rs An InputStream that is Repositionable.
384      * @return True if compressed stream.
385      * @throws IOException
386      */

387     public static boolean isCompressedRepositionableStream(
388             final RepositionableStream rs)
389     throws IOException JavaDoc {
390         boolean result = false;
391         long p = rs.position();
392         try {
393             result = isCompressedStream((InputStream JavaDoc)rs);
394         } finally {
395             rs.position(p);
396         }
397         return result;
398     }
399     
400     /**
401      * Tests passed stream is gzip stream by reading in the HEAD.
402      * Does not reposition stream when done.
403      * @param is An InputStream.
404      * @return True if compressed stream.
405      * @throws IOException
406      */

407     public static boolean isCompressedStream(final InputStream JavaDoc is)
408     throws IOException JavaDoc {
409         try {
410             new GzipHeader(is);
411         } catch (NoGzipMagicException e) {
412             return false;
413         }
414         return true;
415     }
416 }
417
Popular Tags