1 23 package org.archive.io.warc; 24 25 import java.io.File ; 26 import java.io.IOException ; 27 import java.io.InputStream ; 28 import java.util.Iterator ; 29 import java.util.List ; 30 31 import org.apache.commons.cli.CommandLine; 32 import org.apache.commons.cli.HelpFormatter; 33 import org.apache.commons.cli.Option; 34 import org.apache.commons.cli.Options; 35 import org.apache.commons.cli.ParseException; 36 import org.apache.commons.cli.PosixParser; 37 import org.apache.commons.lang.NotImplementedException; 38 import org.archive.io.ArchiveReader; 39 import org.archive.io.ArchiveRecord; 40 41 47 public class WARCReader extends ArchiveReader implements WARCConstants { 48 WARCReader() { 49 super(); 50 } 51 52 @Override 53 protected void initialize(String i) { 54 super.initialize(i); 55 setVersion(WARC_VERSION); 56 } 57 58 64 protected void gotoEOR(ArchiveRecord record) throws IOException { 65 if (record.available() != 0) { 66 throw new IOException ("Record should be exhausted before coming " + 67 "in here"); 68 } 69 70 readExpectedChar(getIn(), CRLF.charAt(0)); 72 readExpectedChar(getIn(), CRLF.charAt(1)); 73 readExpectedChar(getIn(), CRLF.charAt(0)); 74 readExpectedChar(getIn(), CRLF.charAt(1)); 75 } 76 77 protected void readExpectedChar(final InputStream is, final int expected) 78 throws IOException { 79 int c = is.read(); 80 if (c != expected) { 81 throw new IOException ("Unexpected character " + 82 Integer.toHexString(c) + "(Expecting " + 83 Integer.toHexString(expected) + ")"); 84 } 85 } 86 87 95 protected WARCRecord createArchiveRecord(InputStream is, long offset) 96 throws IOException { 97 return (WARCRecord)currentRecord(new WARCRecord(is, 98 getReaderIdentifier(), offset, isDigest(), isStrict())); 99 } 100 101 @Override 102 public void dump(boolean compress) 103 throws IOException , java.text.ParseException { 104 for (final Iterator <ArchiveRecord> i = iterator(); i.hasNext();) { 105 ArchiveRecord r = i.next(); 106 System.out.println(r.getHeader().toString()); 107 r.dump(); 108 System.out.println(); 109 } 110 } 111 112 113 @Override 114 public ArchiveReader getDeleteFileOnCloseReader(final File f) { 115 throw new NotImplementedException("TODO"); 116 } 117 118 @Override 119 public String getDotFileExtension() { 120 return DOT_WARC_FILE_EXTENSION; 121 } 122 123 @Override 124 public String getFileExtension() { 125 return WARC_FILE_EXTENSION; 126 } 127 128 130 136 private static void usage(HelpFormatter formatter, Options options, 137 int exitCode) { 138 formatter.printHelp("java org.archive.io.arc.WARCReader" + 139 " [--digest=true|false] \\\n" + 140 " [--format=cdx|cdxfile|dump|gzipdump]" + 141 " [--offset=#] \\\n[--strict] [--parse] WARC_FILE|WARC_URL", 142 options); 143 System.exit(exitCode); 144 } 145 146 154 protected static void output(WARCReader reader, String format) 155 throws IOException , java.text.ParseException { 156 if (!reader.output(format)) { 157 throw new IOException ("Unsupported format: " + format); 158 } 159 } 160 161 167 protected static void outputRecord(final WARCReader r, 168 final String format) 169 throws IOException { 170 if (!r.outputRecord(format)) { 171 throw new IOException ("Unsupported format" + 172 " (or unsupported on a single record): " + format); 173 } 174 } 175 176 183 public static void createCDXIndexFile(String urlOrPath) 184 throws IOException , java.text.ParseException { 185 WARCReader r = WARCReaderFactory.get(urlOrPath); 186 r.setStrict(false); 187 r.setDigest(true); 188 output(r, CDX_FILE); 189 } 190 191 212 public static void main(String [] args) 213 throws ParseException, IOException , java.text.ParseException { 214 Options options = new Options(); 215 options.addOption(new Option("h","help", false, 216 "Prints this message and exits.")); 217 options.addOption(new Option("o","offset", true, 218 "Outputs record at this offset into arc file.")); 219 options.addOption(new Option("d","digest", true, 220 "Pass true|false. Expensive. Default: true (SHA-1).")); 221 options.addOption(new Option("s","strict", false, 222 "Strict mode. Fails parse if incorrectly formatted WARC.")); 223 options.addOption(new Option("f","format", true, 224 "Output options: 'cdx', cdxfile', 'dump', 'gzipdump'," + 225 "'or 'nohead'. Default: 'cdx'.")); 226 PosixParser parser = new PosixParser(); 227 CommandLine cmdline = parser.parse(options, args, false); 228 List cmdlineArgs = cmdline.getArgList(); 229 Option [] cmdlineOptions = cmdline.getOptions(); 230 HelpFormatter formatter = new HelpFormatter(); 231 232 if (cmdlineArgs.size() <= 0) { 234 usage(formatter, options, 0); 235 } 236 237 long offset = -1; 239 boolean digest = false; 240 boolean strict = false; 241 String format = CDX; 242 for (int i = 0; i < cmdlineOptions.length; i++) { 243 switch(cmdlineOptions[i].getId()) { 244 case 'h': 245 usage(formatter, options, 0); 246 break; 247 248 case 'o': 249 offset = 250 Long.parseLong(cmdlineOptions[i].getValue()); 251 break; 252 253 case 's': 254 strict = true; 255 break; 256 257 case 'd': 258 digest = getTrueOrFalse(cmdlineOptions[i].getValue()); 259 break; 260 261 case 'f': 262 format = cmdlineOptions[i].getValue().toLowerCase(); 263 boolean match = false; 264 final String [] supportedFormats = 266 {CDX, DUMP, GZIP_DUMP, CDX_FILE}; 267 for (int ii = 0; ii < supportedFormats.length; ii++) { 268 if (supportedFormats[ii].equals(format)) { 269 match = true; 270 break; 271 } 272 } 273 if (!match) { 274 usage(formatter, options, 1); 275 } 276 break; 277 278 default: 279 throw new RuntimeException ("Unexpected option: " + 280 + cmdlineOptions[i].getId()); 281 } 282 } 283 284 if (offset >= 0) { 285 if (cmdlineArgs.size() != 1) { 286 System.out.println("Error: Pass one arcfile only."); 287 usage(formatter, options, 1); 288 } 289 WARCReader r = WARCReaderFactory.get( 290 new File ((String )cmdlineArgs.get(0)), offset); 291 r.setStrict(strict); 292 outputRecord(r, format); 293 } else { 294 for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) { 295 String urlOrPath = (String )i.next(); 296 try { 297 WARCReader r = WARCReaderFactory.get(urlOrPath); 298 r.setStrict(strict); 299 r.setDigest(digest); 300 output(r, format); 301 } catch (RuntimeException e) { 302 System.err.println("Exception processing " + urlOrPath + 308 ": " + e.getMessage()); 309 e.printStackTrace(System.err); 310 System.exit(1); 311 } 312 } 313 } 314 } 315 } | Popular Tags |