KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > arc > ARCUtils


/* ARCUtils
 *
 * Created on Aug 10, 2005
 *
 * Copyright (C) 2005 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

23 package org.archive.io.arc;
24
25 import it.unimi.dsi.fastutil.io.RepositionableStream;
26
27 import java.io.File JavaDoc;
28 import java.io.FileInputStream JavaDoc;
29 import java.io.FileNotFoundException JavaDoc;
30 import java.io.IOException JavaDoc;
31 import java.io.InputStream JavaDoc;
32 import java.net.URI JavaDoc;
33 import java.net.URISyntaxException JavaDoc;
34
35 import org.archive.io.GzipHeader;
36 import org.archive.io.NoGzipMagicException;
37 import org.archive.net.UURI;
38
39 public class ARCUtils implements ARCConstants {
40     /**
41      * @param pathOrUri Path or URI to extract arc filename from.
42      * @return Extracted arc file name.
43      * @throws URISyntaxException
44      */

45     public static String JavaDoc parseArcFilename(final String JavaDoc pathOrUri)
46     throws URISyntaxException JavaDoc {
47         String JavaDoc path = pathOrUri;
48         if (UURI.hasScheme(pathOrUri)) {
49             URI JavaDoc url = new URI JavaDoc(pathOrUri);
50             path = url.getPath();
51         }
52         return (new File JavaDoc(path)).getName();
53     }
54     
55     /**
56      * @param arcFile File to test.
57      * @return True if <code>arcFile</code> is compressed ARC.
58      * @throws IOException
59      */

60     public static boolean isCompressed(File JavaDoc arcFile) throws IOException JavaDoc {
61         return testCompressedARCFile(arcFile);
62     }
63     
64     /**
65      * Check file is compressed and in ARC GZIP format.
66      *
67      * @param arcFile File to test if its Internet Archive ARC file
68      * GZIP compressed.
69      *
70      * @return True if this is an Internet Archive GZIP'd ARC file (It begins
71      * w/ the Internet Archive GZIP header and has the
72      * COMPRESSED_ARC_FILE_EXTENSION suffix).
73      *
74      * @exception IOException If file does not exist or is not unreadable.
75      */

76     public static boolean testCompressedARCFile(File JavaDoc arcFile)
77     throws IOException JavaDoc {
78         return testCompressedARCFile(arcFile, false);
79     }
80
81     /**
82      * Check file is compressed and in ARC GZIP format.
83      *
84      * @param arcFile File to test if its Internet Archive ARC file
85      * GZIP compressed.
86      * @param skipSuffixCheck Set to true if we're not to test on the
87      * '.arc.gz' suffix.
88      *
89      * @return True if this is an Internet Archive GZIP'd ARC file (It begins
90      * w/ the Internet Archive GZIP header).
91      *
92      * @exception IOException If file does not exist or is not unreadable.
93      */

94     public static boolean testCompressedARCFile(File JavaDoc arcFile,
95             boolean skipSuffixCheck)
96     throws IOException JavaDoc {
97         boolean compressedARCFile = false;
98         isReadable(arcFile);
99         if(!skipSuffixCheck && !arcFile.getName().toLowerCase()
100                 .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) {
101             return compressedARCFile;
102         }
103         
104         final InputStream JavaDoc is = new FileInputStream JavaDoc(arcFile);
105         try {
106             compressedARCFile = testCompressedARCStream(is);
107         } finally {
108             is.close();
109         }
110         return compressedARCFile;
111     }
112     
113     /**
114      * Tests passed stream is gzip stream by reading in the HEAD.
115      * Does not reposition the stream. That is left up to the caller.
116      * @param is An InputStream.
117      * @return True if compressed stream.
118      * @throws IOException
119      */

120     public static boolean testCompressedARCStream(final InputStream JavaDoc is)
121             throws IOException JavaDoc {
122         boolean compressedARCFile = false;
123         GzipHeader gh = null;
124         try {
125             gh = new GzipHeader(is);
126         } catch (NoGzipMagicException e ) {
127             return compressedARCFile;
128         }
129         
130         byte[] fextra = gh.getFextra();
131         // Now make sure following bytes are IA GZIP comment.
132
// First check length. ARC_GZIP_EXTRA_FIELD includes length
133
// so subtract two and start compare to ARC_GZIP_EXTRA_FIELD
134
// at +2.
135
if (fextra != null &&
136                 ARC_GZIP_EXTRA_FIELD.length - 2 == fextra.length) {
137             compressedARCFile = true;
138             for (int i = 0; i < fextra.length; i++) {
139                 if (fextra[i] != ARC_GZIP_EXTRA_FIELD[i + 2]) {
140                     compressedARCFile = false;
141                     break;
142                 }
143             }
144         }
145         return compressedARCFile;
146     }
147     
148     /**
149      * Tests passed stream is gzip stream by reading in the HEAD.
150      * Does reposition of stream when done.
151      * @param rs An InputStream that is Repositionable.
152      * @return True if compressed stream.
153      * @throws IOException
154      */

155     public static boolean testCompressedRepositionalStream(
156             final RepositionableStream rs)
157     throws IOException JavaDoc {
158         boolean compressedARCFile = false;
159         long p = rs.position();
160         try {
161             compressedARCFile = testCompressedStream((InputStream JavaDoc)rs);
162         } finally {
163             rs.position(p);
164         }
165         return compressedARCFile;
166     }
167     
168     /**
169      * Tests passed stream is gzip stream by reading in the HEAD.
170      * Does reposition of stream when done.
171      * @param is An InputStream.
172      * @return True if compressed stream.
173      * @throws IOException
174      */

175     public static boolean testCompressedStream(final InputStream JavaDoc is)
176     throws IOException JavaDoc {
177         boolean compressedARCFile = false;
178         try {
179             new GzipHeader(is);
180             compressedARCFile = true;
181         } catch (NoGzipMagicException e) {
182             return compressedARCFile;
183         }
184         return compressedARCFile;
185     }
186     
187     /**
188      * Check file is uncompressed ARC file.
189      *
190      * @param arcFile
191      * File to test if its Internet Archive ARC file uncompressed.
192      *
193      * @return True if this is an Internet Archive ARC file.
194      *
195      * @exception IOException
196      * If file does not exist or is not unreadable.
197      */

198     public static boolean testUncompressedARCFile(File JavaDoc arcFile)
199     throws IOException JavaDoc {
200         boolean uncompressedARCFile = false;
201         isReadable(arcFile);
202         if(arcFile.getName().toLowerCase().endsWith(ARC_FILE_EXTENSION)) {
203             FileInputStream JavaDoc fis = new FileInputStream JavaDoc(arcFile);
204             try {
205                 byte [] b = new byte[ARC_MAGIC_NUMBER.length()];
206                 int read = fis.read(b, 0, ARC_MAGIC_NUMBER.length());
207                 fis.close();
208                 if (read == ARC_MAGIC_NUMBER.length()) {
209                     StringBuffer JavaDoc beginStr
210                         = new StringBuffer JavaDoc(ARC_MAGIC_NUMBER.length());
211                     for (int i = 0; i < ARC_MAGIC_NUMBER.length(); i++) {
212                         beginStr.append((char)b[i]);
213                     }
214                     
215                     if (beginStr.toString().
216                             equalsIgnoreCase(ARC_MAGIC_NUMBER)) {
217                         uncompressedARCFile = true;
218                     }
219                 }
220             } finally {
221                 fis.close();
222             }
223         }
224
225         return uncompressedARCFile;
226     }
227     
228
229     /**
230      * @param arcFile File to test.
231      * @exception IOException If file does not exist or is not unreadable.
232      */

233     private static void isReadable(File JavaDoc arcFile) throws IOException JavaDoc {
234         if (!arcFile.exists()) {
235             throw new FileNotFoundException JavaDoc(arcFile.getAbsolutePath() +
236                 " does not exist.");
237         }
238
239         if (!arcFile.canRead()) {
240             throw new FileNotFoundException JavaDoc(arcFile.getAbsolutePath() +
241                 " is not readable.");
242         }
243     }
244 }
245
Popular Tags