1 23 package org.archive.io; 24 25 26 import java.io.File ; 27 import java.io.IOException ; 28 import java.util.ArrayList ; 29 import java.util.Arrays ; 30 import java.util.Iterator ; 31 import java.util.List ; 32 import java.util.concurrent.atomic.AtomicInteger ; 33 import java.util.logging.Level ; 34 import java.util.logging.Logger ; 35 36 import org.apache.commons.cli.CommandLine; 37 import org.apache.commons.cli.HelpFormatter; 38 import org.apache.commons.cli.Option; 39 import org.apache.commons.cli.Options; 40 import org.apache.commons.cli.ParseException; 41 import org.apache.commons.cli.PosixParser; 42 import org.archive.io.arc.ARCWriter; 43 import org.archive.io.warc.WARCConstants; 44 import org.archive.io.warc.WARCReader; 45 import org.archive.io.warc.WARCReaderFactory; 46 import org.archive.io.warc.WARCRecord; 47 import org.archive.util.ArchiveUtils; 48 import org.archive.util.FileUtils; 49 50 51 61 public class Warc2Arc { 62 private static void usage(HelpFormatter formatter, Options options, 63 int exitCode) { 64 formatter.printHelp("java org.archive.io.arc.Warc2Arc " + 65 "[--force] [--prefix=PREFIX] [--suffix=SUFFIX] WARC_INPUT " + 66 "OUTPUT_DIR", 67 options); 68 System.exit(exitCode); 69 } 70 71 static String parseRevision(final String version) { 72 final String ID = "$Revision: "; 73 int index = version.indexOf(ID); 74 return (index < 0)? version: 75 version.substring(index + ID.length(), version.length() - 1).trim(); 76 } 77 78 private static String getRevision() { 79 return parseRevision("$Revision: 1.3.2.1 $"); 80 } 81 82 public void transform(final File warc, final File dir, final String prefix, 83 final String suffix, final boolean force) 84 throws IOException , java.text.ParseException { 85 FileUtils.isReadable(warc); 86 FileUtils.isReadable(dir); 87 WARCReader reader = WARCReaderFactory.get(warc); 88 List <String > metadata = new ArrayList <String >(); 89 metadata.add("Made from " + reader.getReaderIdentifier() + " by " + 90 this.getClass().getName() + "/" + getRevision()); 91 ARCWriter writer = new ARCWriter(new AtomicInteger (), 92 Arrays.asList(new File [] {dir}), prefix, suffix, 93 reader.isCompressed(), -1, metadata); 94 transform(reader, writer); 95 } 96 97 protected void transform(final WARCReader reader, final ARCWriter writer) 98 throws IOException , java.text.ParseException { 99 reader.setDigest(false); 102 Logger l = Logger.getLogger(writer.getClass().getName()); 108 Level oldLevel = l.getLevel(); 109 try { 110 l.setLevel(Level.WARNING); 111 for (final Iterator i = reader.iterator(); i.hasNext();) { 112 WARCRecord r = (WARCRecord)i.next(); 113 if (!isARCType(r.getHeader().getMimetype())) { 114 continue; 115 } 116 if (r.getHeader().getContentBegin() <= 0) { 117 continue; 121 } 122 String ip = (String )r.getHeader(). 123 getHeaderValue((WARCConstants.NAMED_FIELD_IP_LABEL)); 124 long length = r.getHeader().getLength(); 125 int offset = r.getHeader().getContentBegin(); 126 String mimetype = r.getHeader().getMimetype(); 131 long time = ArchiveUtils.getSecondsSinceEpoch(r.getHeader(). 132 getDate()).getTime(); 133 writer.write(r.getHeader().getUrl(), mimetype, ip, time, 134 (int)(length - offset), r); 135 } 136 } finally { 137 if (reader != null) { 138 reader.close(); 139 } 140 if (writer != null) { 141 try { 142 writer.close(); 143 } finally { 144 l.setLevel(oldLevel); 145 } 146 } 147 } 148 } 149 150 protected boolean isARCType(final String mimetype) { 151 if (mimetype == null || mimetype.length() <= 0) { 154 return false; 155 } 156 String cleaned = mimetype.toLowerCase().trim(); 157 if (cleaned.equals(WARCConstants.HTTP_RESPONSE_MIMETYPE) || 158 cleaned.equals("text/dns")) { 159 return true; 160 } 161 return false; 162 } 163 164 172 public static void main(String [] args) 173 throws ParseException, IOException , java.text.ParseException { 174 Options options = new Options(); 175 options.addOption(new Option("h","help", false, 176 "Prints this message and exits.")); 177 options.addOption(new Option("f","force", false, 178 "Force overwrite of target file.")); 179 options.addOption(new Option("p","prefix", true, 180 "Prefix to use on created ARC files, else uses default.")); 181 options.addOption(new Option("s","suffix", true, 182 "Suffix to use on created ARC files, else uses default.")); 183 PosixParser parser = new PosixParser(); 184 CommandLine cmdline = parser.parse(options, args, false); 185 List cmdlineArgs = cmdline.getArgList(); 186 Option [] cmdlineOptions = cmdline.getOptions(); 187 HelpFormatter formatter = new HelpFormatter(); 188 189 if (cmdlineOptions.length < 0) { 191 usage(formatter, options, 0); 192 } 193 194 boolean force = false; 196 String prefix = "WARC2ARC"; 197 String suffix = null; 198 for (int i = 0; i < cmdlineOptions.length; i++) { 199 switch(cmdlineOptions[i].getId()) { 200 case 'h': 201 usage(formatter, options, 0); 202 break; 203 204 case 'f': 205 force = true; 206 break; 207 208 case 'p': 209 prefix = cmdlineOptions[i].getValue(); 210 break; 211 212 case 's': 213 suffix = cmdlineOptions[i].getValue(); 214 break; 215 216 default: 217 throw new RuntimeException ("Unexpected option: " + 218 + cmdlineOptions[i].getId()); 219 } 220 } 221 222 if (cmdlineArgs.size() != 2) { 224 usage(formatter, options, 0); 225 } 226 (new Warc2Arc()).transform(new File (cmdlineArgs.get(0).toString()), 227 new File (cmdlineArgs.get(1).toString()), prefix, suffix, force); 228 } 229 } | Popular Tags |