KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > Arc2Warc


1 /* $Id: Arc2Warc.java,v 1.5 2006/08/31 00:48:04 stack-sf Exp $
2  *
3  * Created Aug 29, 2006
4  *
5  * Copyright (C) 2006 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.io;
24
25 import java.io.BufferedOutputStream JavaDoc;
26 import java.io.ByteArrayOutputStream JavaDoc;
27 import java.io.File JavaDoc;
28 import java.io.FileOutputStream JavaDoc;
29 import java.io.IOException JavaDoc;
30 import java.util.ArrayList JavaDoc;
31 import java.util.Iterator JavaDoc;
32 import java.util.List JavaDoc;
33 import java.util.logging.Level JavaDoc;
34 import java.util.logging.Logger JavaDoc;
35
36 import org.apache.commons.cli.CommandLine;
37 import org.apache.commons.cli.HelpFormatter;
38 import org.apache.commons.cli.Option;
39 import org.apache.commons.cli.Options;
40 import org.apache.commons.cli.ParseException;
41 import org.apache.commons.cli.PosixParser;
42 import org.archive.io.arc.ARCConstants;
43 import org.archive.io.arc.ARCReader;
44 import org.archive.io.arc.ARCReaderFactory;
45 import org.archive.io.arc.ARCRecord;
46 import org.archive.io.warc.ExperimentalWARCWriter;
47 import org.archive.io.warc.WARCConstants;
48 import org.archive.util.FileUtils;
49 import org.archive.util.anvl.ANVLRecord;
50
51
52 /**
53  * Convert ARCs to (sortof) WARCs.
54  * @author stack
55  * @version $Date: 2006/08/31 00:48:04 $ $Revision: 1.5 $
56  */

57 public class Arc2Warc {
58    private static void usage(HelpFormatter formatter, Options options,
59            int exitCode) {
60        formatter.printHelp("java org.archive.io.arc.Arc2Warc " +
61             "[--force] ARC_INPUT WARC_OUTPUT", options);
62        System.exit(exitCode);
63    }
64    
65    private static String JavaDoc getRevision() {
66        return Warc2Arc.parseRevision("$Revision: 1.5 $");
67    }
68    
69    public void transform(final File JavaDoc arc, final File JavaDoc warc, final boolean force)
70    throws IOException JavaDoc {
71        FileUtils.isReadable(arc);
72        if (warc.exists() && !force) {
73            throw new IOException JavaDoc("Target WARC already exists. " +
74                "Will not overwrite.");
75        }
76
77        ARCReader reader = ARCReaderFactory.get(arc, false, 0);
78        transform(reader, warc);
79    }
80    
81    protected void transform(final ARCReader reader, final File JavaDoc warc)
82    throws IOException JavaDoc {
83        ExperimentalWARCWriter writer = null;
84        // No point digesting. Digest is available after reading of ARC which
85
// is too late for inclusion in WARC.
86
reader.setDigest(false);
87        try {
88            BufferedOutputStream JavaDoc bos =
89                new BufferedOutputStream JavaDoc(new FileOutputStream JavaDoc(warc));
90            // Get the body of the first ARC record as a String so can dump it
91
// into first record of WARC.
92
final Iterator JavaDoc<ArchiveRecord> i = reader.iterator();
93            ARCRecord firstRecord = (ARCRecord)i.next();
94            ByteArrayOutputStream JavaDoc baos =
95                new ByteArrayOutputStream JavaDoc((int)firstRecord.getHeader().
96                    getLength());
97            firstRecord.dump(baos);
98            // Add ARC first record content as an ANVLRecord.
99
ANVLRecord ar = new ANVLRecord(1);
100            ar.addLabelValue("Filedesc", baos.toString());
101            List JavaDoc<String JavaDoc> metadata = new ArrayList JavaDoc<String JavaDoc>(1);
102            metadata.add(ar.toString());
103            // Now create the writer. If reader was compressed, lets write
104
// a compressed WARC.
105
writer = new ExperimentalWARCWriter(null, bos, warc,
106                reader.isCompressed(), null, metadata);
107            // Write a warcinfo record with description about how this WARC
108
// was made.
109
writer.writeWarcinfoRecord(warc.getName(),
110                "Made from " + reader.getReaderIdentifier() + " by " +
111                    this.getClass().getName() + "/" + getRevision());
112            for (; i.hasNext();) {
113                write(writer, (ARCRecord)i.next());
114            }
115        } finally {
116            if (reader != null) {
117                reader.close();
118            }
119            if (writer != null) {
120                // I don't want the close being logged -- least, not w/o log of
121
// an opening (and that'd be a little silly for simple script
122
// like this). Currently, it logs at level INFO so that close
123
// of files gets written to log files. Up the log level just
124
// for the close.
125
Logger JavaDoc l = Logger.getLogger(writer.getClass().getName());
126                Level JavaDoc oldLevel = l.getLevel();
127                l.setLevel(Level.WARNING);
128                try {
129                    writer.close();
130                } finally {
131                    l.setLevel(oldLevel);
132                }
133            }
134        }
135    }
136    
137    protected void write(final ExperimentalWARCWriter writer,
138            final ARCRecord r)
139    throws IOException JavaDoc {
140        ANVLRecord ar = new ANVLRecord();
141        String JavaDoc ip = (String JavaDoc)r.getHeader().
142            getHeaderValue((ARCConstants.IP_HEADER_FIELD_KEY));
143        if (ip != null && ip.length() > 0) {
144            ar.addLabelValue(WARCConstants.NAMED_FIELD_IP_LABEL, ip);
145        }
146        // If contentBody > 0, assume http headers. Make the mimetype
147
// be application/http. Otherwise, give it ARC mimetype.
148
writer.writeResourceRecord(r.getHeader().getUrl(),
149            r.getHeader().getDate(),
150            (r.getHeader().getContentBegin() > 0)?
151                WARCConstants.HTTP_RESPONSE_MIMETYPE:
152                r.getHeader().getMimetype(),
153                ar, r, r.getHeader().getLength());
154    }
155
156    /**
157     * Command-line interface to Arc2Warc.
158     *
159     * @param args Command-line arguments.
160     * @throws ParseException Failed parse of the command line.
161     * @throws IOException
162     * @throws java.text.ParseException
163     */

164    public static void main(String JavaDoc [] args)
165    throws ParseException, IOException JavaDoc, java.text.ParseException JavaDoc {
166        Options options = new Options();
167        options.addOption(new Option("h","help", false,
168            "Prints this message and exits."));
169        options.addOption(new Option("f","force", false,
170            "Force overwrite of target file."));
171        PosixParser parser = new PosixParser();
172        CommandLine cmdline = parser.parse(options, args, false);
173        List JavaDoc cmdlineArgs = cmdline.getArgList();
174        Option [] cmdlineOptions = cmdline.getOptions();
175        HelpFormatter formatter = new HelpFormatter();
176        
177        // If no args, print help.
178
if (cmdlineOptions.length <= 0) {
179            usage(formatter, options, 0);
180        }
181
182        // Now look at options passed.
183
boolean force = false;
184        for (int i = 0; i < cmdlineOptions.length; i++) {
185            switch(cmdlineOptions[i].getId()) {
186                case 'h':
187                    usage(formatter, options, 0);
188                    break;
189                    
190                case 'f':
191                    force = true;
192                    break;
193                    
194                default:
195                    throw new RuntimeException JavaDoc("Unexpected option: " +
196                        + cmdlineOptions[i].getId());
197            }
198        }
199        
200        // If no args, print help.
201
if (cmdlineArgs.size() != 2) {
202            usage(formatter, options, 0);
203        }
204        (new Arc2Warc()).transform(new File JavaDoc(cmdlineArgs.get(0).toString()),
205            new File JavaDoc(cmdlineArgs.get(1).toString()), force);
206    }
207 }
208
Popular Tags