KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > Warc2Arc


/* $Id: Warc2Arc.java,v 1.3.2.1 2007/01/13 01:31:35 stack-sf Exp $
 *
 * Created Aug 29, 2006
 *
 * Copyright (C) 2006 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

23 package org.archive.io;
24
25
26 import java.io.File JavaDoc;
27 import java.io.IOException JavaDoc;
28 import java.util.ArrayList JavaDoc;
29 import java.util.Arrays JavaDoc;
30 import java.util.Iterator JavaDoc;
31 import java.util.List JavaDoc;
32 import java.util.concurrent.atomic.AtomicInteger JavaDoc;
33 import java.util.logging.Level JavaDoc;
34 import java.util.logging.Logger JavaDoc;
35
36 import org.apache.commons.cli.CommandLine;
37 import org.apache.commons.cli.HelpFormatter;
38 import org.apache.commons.cli.Option;
39 import org.apache.commons.cli.Options;
40 import org.apache.commons.cli.ParseException;
41 import org.apache.commons.cli.PosixParser;
42 import org.archive.io.arc.ARCWriter;
43 import org.archive.io.warc.WARCConstants;
44 import org.archive.io.warc.WARCReader;
45 import org.archive.io.warc.WARCReaderFactory;
46 import org.archive.io.warc.WARCRecord;
47 import org.archive.util.ArchiveUtils;
48 import org.archive.util.FileUtils;
49
50
51 /**
52  * Convert WARCs to (sortof) ARCs.
53  * WARCs can be 1Gig in size, that is, 10x default ARC size. Script takes
54  * directory as output and will write multiple ARCs for a single large WARC.
55  * Only writes resource records of type <code>text/dns</code> or
56  * <code>application/http; msgtype=response</code>. All others -- metadata,
57  * request -- are skipped.
58  * @author stack
59  * @version $Date: 2007/01/13 01:31:35 $ $Revision: 1.3.2.1 $
60  */

61 public class Warc2Arc {
62    private static void usage(HelpFormatter formatter, Options options,
63            int exitCode) {
64        formatter.printHelp("java org.archive.io.arc.Warc2Arc " +
65             "[--force] [--prefix=PREFIX] [--suffix=SUFFIX] WARC_INPUT " +
66                 "OUTPUT_DIR",
67             options);
68        System.exit(exitCode);
69    }
70    
71    static String JavaDoc parseRevision(final String JavaDoc version) {
72        final String JavaDoc ID = "$Revision: ";
73        int index = version.indexOf(ID);
74        return (index < 0)? version:
75            version.substring(index + ID.length(), version.length() - 1).trim();
76    }
77    
78    private static String JavaDoc getRevision() {
79        return parseRevision("$Revision: 1.3.2.1 $");
80    }
81    
82    public void transform(final File JavaDoc warc, final File JavaDoc dir, final String JavaDoc prefix,
83            final String JavaDoc suffix, final boolean force)
84    throws IOException JavaDoc, java.text.ParseException JavaDoc {
85        FileUtils.isReadable(warc);
86        FileUtils.isReadable(dir);
87        WARCReader reader = WARCReaderFactory.get(warc);
88        List JavaDoc<String JavaDoc> metadata = new ArrayList JavaDoc<String JavaDoc>();
89        metadata.add("Made from " + reader.getReaderIdentifier() + " by " +
90            this.getClass().getName() + "/" + getRevision());
91        ARCWriter writer = new ARCWriter(new AtomicInteger JavaDoc(),
92             Arrays.asList(new File JavaDoc [] {dir}), prefix, suffix,
93             reader.isCompressed(), -1, metadata);
94        transform(reader, writer);
95    }
96
97    protected void transform(final WARCReader reader, final ARCWriter writer)
98    throws IOException JavaDoc, java.text.ParseException JavaDoc {
99        // No point digesting. Digest is available after reading of ARC which
100
// is too late for inclusion in WARC.
101
reader.setDigest(false);
102        // I don't want the close being logged -- least, not w/o log of
103
// an opening (and that'd be a little silly for simple script
104
// like this). Currently, it logs at level INFO so that close
105
// of files gets written to log files. Up the log level just
106
// for the close.
107
Logger JavaDoc l = Logger.getLogger(writer.getClass().getName());
108        Level JavaDoc oldLevel = l.getLevel();
109        try {
110            l.setLevel(Level.WARNING);
111            for (final Iterator JavaDoc i = reader.iterator(); i.hasNext();) {
112                WARCRecord r = (WARCRecord)i.next();
113                if (!isARCType(r.getHeader().getMimetype())) {
114                    continue;
115                }
116                if (r.getHeader().getContentBegin() <= 0) {
117                    // Otherwise, because length include Header-Line and
118
// Named Fields, these will end up in the ARC unless there
119
// is a non-zero content begin.
120
continue;
121                }
122                String JavaDoc ip = (String JavaDoc)r.getHeader().
123                    getHeaderValue((WARCConstants.NAMED_FIELD_IP_LABEL));
124                long length = r.getHeader().getLength();
125                int offset = r.getHeader().getContentBegin();
126                // This mimetype is not exactly what you'd expect to find in
127
// an ARC though technically its 'correct'. To get right one,
128
// need to parse the HTTP Headers. Thats messy. Not doing for
129
// now.
130
String JavaDoc mimetype = r.getHeader().getMimetype();
131                long time = ArchiveUtils.getSecondsSinceEpoch(r.getHeader().
132                    getDate()).getTime();
133                writer.write(r.getHeader().getUrl(), mimetype, ip, time,
134                    (int)(length - offset), r);
135            }
136        } finally {
137            if (reader != null) {
138                reader.close();
139            }
140            if (writer != null) {
141                try {
142                    writer.close();
143                } finally {
144                    l.setLevel(oldLevel);
145                }
146            }
147        }
148    }
149    
150    protected boolean isARCType(final String JavaDoc mimetype) {
151        // Comparing mimetypes, especially WARC types can be problematic since
152
// they have whitespace. For now, ignore.
153
if (mimetype == null || mimetype.length() <= 0) {
154            return false;
155        }
156        String JavaDoc cleaned = mimetype.toLowerCase().trim();
157        if (cleaned.equals(WARCConstants.HTTP_RESPONSE_MIMETYPE) ||
158                cleaned.equals("text/dns")) {
159            return true;
160        }
161        return false;
162    }
163
164    /**
165     * Command-line interface to Arc2Warc.
166     *
167     * @param args Command-line arguments.
168     * @throws ParseException Failed parse of the command line.
169     * @throws IOException
170     * @throws java.text.ParseException
171     */

172    public static void main(String JavaDoc [] args)
173    throws ParseException, IOException JavaDoc, java.text.ParseException JavaDoc {
174        Options options = new Options();
175        options.addOption(new Option("h","help", false,
176            "Prints this message and exits."));
177        options.addOption(new Option("f","force", false,
178            "Force overwrite of target file."));
179        options.addOption(new Option("p","prefix", true,
180            "Prefix to use on created ARC files, else uses default."));
181        options.addOption(new Option("s","suffix", true,
182            "Suffix to use on created ARC files, else uses default."));
183        PosixParser parser = new PosixParser();
184        CommandLine cmdline = parser.parse(options, args, false);
185        List JavaDoc cmdlineArgs = cmdline.getArgList();
186        Option [] cmdlineOptions = cmdline.getOptions();
187        HelpFormatter formatter = new HelpFormatter();
188        
189        // If no args, print help.
190
if (cmdlineOptions.length < 0) {
191            usage(formatter, options, 0);
192        }
193
194        // Now look at options passed.
195
boolean force = false;
196        String JavaDoc prefix = "WARC2ARC";
197        String JavaDoc suffix = null;
198        for (int i = 0; i < cmdlineOptions.length; i++) {
199            switch(cmdlineOptions[i].getId()) {
200                case 'h':
201                    usage(formatter, options, 0);
202                    break;
203                    
204                case 'f':
205                    force = true;
206                    break;
207                    
208                case 'p':
209                    prefix = cmdlineOptions[i].getValue();
210                    break;
211                    
212                case 's':
213                    suffix = cmdlineOptions[i].getValue();
214                    break;
215                    
216                default:
217                    throw new RuntimeException JavaDoc("Unexpected option: " +
218                        + cmdlineOptions[i].getId());
219            }
220        }
221        
222        // If no args, print help.
223
if (cmdlineArgs.size() != 2) {
224            usage(formatter, options, 0);
225        }
226        (new Warc2Arc()).transform(new File JavaDoc(cmdlineArgs.get(0).toString()),
227            new File JavaDoc(cmdlineArgs.get(1).toString()), prefix, suffix, force);
228    }
229 }
Popular Tags