1 23 package org.archive.io; 24 25 import java.io.BufferedOutputStream ; 26 import java.io.ByteArrayOutputStream ; 27 import java.io.File ; 28 import java.io.FileOutputStream ; 29 import java.io.IOException ; 30 import java.util.ArrayList ; 31 import java.util.Iterator ; 32 import java.util.List ; 33 import java.util.logging.Level ; 34 import java.util.logging.Logger ; 35 36 import org.apache.commons.cli.CommandLine; 37 import org.apache.commons.cli.HelpFormatter; 38 import org.apache.commons.cli.Option; 39 import org.apache.commons.cli.Options; 40 import org.apache.commons.cli.ParseException; 41 import org.apache.commons.cli.PosixParser; 42 import org.archive.io.arc.ARCConstants; 43 import org.archive.io.arc.ARCReader; 44 import org.archive.io.arc.ARCReaderFactory; 45 import org.archive.io.arc.ARCRecord; 46 import org.archive.io.warc.ExperimentalWARCWriter; 47 import org.archive.io.warc.WARCConstants; 48 import org.archive.util.FileUtils; 49 import org.archive.util.anvl.ANVLRecord; 50 51 52 57 public class Arc2Warc { 58 private static void usage(HelpFormatter formatter, Options options, 59 int exitCode) { 60 formatter.printHelp("java org.archive.io.arc.Arc2Warc " + 61 "[--force] ARC_INPUT WARC_OUTPUT", options); 62 System.exit(exitCode); 63 } 64 65 private static String getRevision() { 66 return Warc2Arc.parseRevision("$Revision: 1.5 $"); 67 } 68 69 public void transform(final File arc, final File warc, final boolean force) 70 throws IOException { 71 FileUtils.isReadable(arc); 72 if (warc.exists() && !force) { 73 throw new IOException ("Target WARC already exists. " + 74 "Will not overwrite."); 75 } 76 77 ARCReader reader = ARCReaderFactory.get(arc, false, 0); 78 transform(reader, warc); 79 } 80 81 protected void transform(final ARCReader reader, final File warc) 82 throws IOException { 83 ExperimentalWARCWriter writer = null; 84 reader.setDigest(false); 87 try { 88 BufferedOutputStream bos = 89 new BufferedOutputStream (new FileOutputStream (warc)); 90 final Iterator <ArchiveRecord> i = reader.iterator(); 93 ARCRecord firstRecord = (ARCRecord)i.next(); 94 ByteArrayOutputStream baos = 95 new ByteArrayOutputStream ((int)firstRecord.getHeader(). 96 getLength()); 97 firstRecord.dump(baos); 98 ANVLRecord ar = new ANVLRecord(1); 100 ar.addLabelValue("Filedesc", baos.toString()); 101 List <String > metadata = new ArrayList <String >(1); 102 metadata.add(ar.toString()); 103 writer = new ExperimentalWARCWriter(null, bos, warc, 106 reader.isCompressed(), null, metadata); 107 writer.writeWarcinfoRecord(warc.getName(), 110 "Made from " + reader.getReaderIdentifier() + " by " + 111 this.getClass().getName() + "/" + getRevision()); 112 for (; i.hasNext();) { 113 write(writer, (ARCRecord)i.next()); 114 } 115 } finally { 116 if (reader != null) { 117 reader.close(); 118 } 119 if (writer != null) { 120 Logger l = Logger.getLogger(writer.getClass().getName()); 126 Level oldLevel = l.getLevel(); 127 l.setLevel(Level.WARNING); 128 try { 129 writer.close(); 130 } finally { 131 l.setLevel(oldLevel); 132 } 133 } 134 } 135 } 136 137 protected void write(final ExperimentalWARCWriter writer, 138 final ARCRecord r) 139 throws IOException { 140 ANVLRecord ar = new ANVLRecord(); 141 String ip = (String )r.getHeader(). 142 getHeaderValue((ARCConstants.IP_HEADER_FIELD_KEY)); 143 if (ip != null && ip.length() > 0) { 144 ar.addLabelValue(WARCConstants.NAMED_FIELD_IP_LABEL, ip); 145 } 146 writer.writeResourceRecord(r.getHeader().getUrl(), 149 r.getHeader().getDate(), 150 (r.getHeader().getContentBegin() > 0)? 151 WARCConstants.HTTP_RESPONSE_MIMETYPE: 152 r.getHeader().getMimetype(), 153 ar, r, r.getHeader().getLength()); 154 } 155 156 164 public static void main(String [] args) 165 throws ParseException, IOException , java.text.ParseException { 166 Options options = new Options(); 167 options.addOption(new Option("h","help", false, 168 "Prints this message and exits.")); 169 options.addOption(new Option("f","force", false, 170 "Force overwrite of target file.")); 171 PosixParser parser = new PosixParser(); 172 CommandLine cmdline = parser.parse(options, args, false); 173 List cmdlineArgs = cmdline.getArgList(); 174 Option [] cmdlineOptions = cmdline.getOptions(); 175 HelpFormatter formatter = new HelpFormatter(); 176 177 if (cmdlineOptions.length <= 0) { 179 usage(formatter, options, 0); 180 } 181 182 boolean force = false; 184 for (int i = 0; i < cmdlineOptions.length; i++) { 185 switch(cmdlineOptions[i].getId()) { 186 case 'h': 187 usage(formatter, options, 0); 188 break; 189 190 case 'f': 191 force = true; 192 break; 193 194 default: 195 throw new RuntimeException ("Unexpected option: " + 196 + cmdlineOptions[i].getId()); 197 } 198 } 199 200 if (cmdlineArgs.size() != 2) { 202 usage(formatter, options, 0); 203 } 204 (new Arc2Warc()).transform(new File (cmdlineArgs.get(0).toString()), 205 new File (cmdlineArgs.get(1).toString()), force); 206 } 207 } 208 | Popular Tags |