package org.archive.io.arc;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.RecoverableIOException;
import org.archive.io.WriterPoolMember;
import org.archive.util.ArchiveUtils;
import org.archive.util.InetAddressUtil;
import org.archive.util.TextUtils;

/**
 * Reader of ARC-format archive files.
 *
 * <p>Parses ARC record metadata lines and hands back {@link ARCRecord}
 * instances positioned at the start of each record body.  Also hosts a
 * command-line entry point ({@link #main(String[])}) that can dump records
 * or emit CDX index lines.
 */
public abstract class ARCReader extends ArchiveReader
implements ARCConstants {
    Logger logger = Logger.getLogger(ARCReader.class.getName());

    /**
     * True if the stream we were handed is positioned at the very first
     * record of the file (so the ARC file header / version block is still
     * ahead of us and must be consumed).
     */
    private boolean alignedOnFirstRecord = true;

    /**
     * Hard cap on a single metadata header line; guards against reading an
     * unbounded amount of a corrupt (or non-ARC) stream into memory.
     */
    private static final int MAX_HEADER_LINE_LENGTH = 1024 * 100;

    /**
     * Field-name keys for the five fields of a v1 ARC metadata line, in the
     * order they appear on that line.
     */
    private final String[] headerFieldNameKeysArray = {
        URL_FIELD_KEY,
        IP_HEADER_FIELD_KEY,
        DATE_FIELD_KEY,
        MIMETYPE_FIELD_KEY,
        LENGTH_FIELD_KEY
    };

    /**
     * List view over {@link #headerFieldNameKeysArray}; what we actually
     * pass around when pairing keys with parsed values.
     */
    private final List<String> headerFieldNameKeys =
        Arrays.asList(this.headerFieldNameKeysArray);

    // Whether ARCRecord should parse the HTTP response headers that lead
    // each http(s) record body.
    private boolean parseHttpHeaders = true;

    ARCReader() {
        super();
    }

    /**
     * Skip over any trailing newline(s) so the stream is positioned at the
     * start of the next record's metadata line.
     *
     * @param record the record just read (used only for error reporting).
     * @throws IOException if a non-newline character is encountered and the
     * underlying stream does not support mark/reset (so we cannot push it
     * back).
     */
    protected void gotoEOR(ArchiveRecord record) throws IOException {
        if (getIn().available() <= 0) {
            return;
        }

        // Consume newline(s) that pad the end of the record.
        int c = -1;
        while (getIn().available() > 0) {
            if (getIn().markSupported()) {
                getIn().mark(1);
            }
            c = getIn().read();
            if (c != -1) {
                if (c == LINE_SEPARATOR) {
                    continue;
                }
                if (getIn().markSupported()) {
                    // Start of the next record: push the byte back and stop.
                    getIn().reset();
                    break;
                }
                ArchiveRecordHeader h = (getCurrentRecord() != null)?
                    record.getHeader(): null;
                throw new IOException("Read " + (char)c +
                    " when only " + LINE_SEPARATOR + " expected. " +
                    getReaderIdentifier() + ((h != null)?
                        h.getHeaderFields().toString(): ""));
            }
        }
    }

    /**
     * Create an ARCRecord positioned at {@code offset} in {@code is}.
     *
     * <p>If we are at offset zero of a well-aligned file, the ARC file
     * header (filedesc record's version block) is consumed here as well and
     * folded into the first record's body offset.
     *
     * @param is stream positioned at the start of a record's metadata line.
     * @param offset absolute byte offset of the record in the ARC.
     * @return the new current record.
     * @throws IOException on parse failure (message is augmented with the
     * offending offset).
     */
    protected ARCRecord createArchiveRecord(InputStream is, long offset)
    throws IOException {
        ArrayList<String> firstLineValues = new ArrayList<String>(20);
        getTokenizedHeaderLine(is, firstLineValues);
        int bodyOffset = 0;
        if (offset == 0 && isAlignedOnFirstRecord()) {
            // First record of the file: the next two lines are the ARC
            // version block ("1 0 InternetArchive" etc.); consume them and
            // remember the version.
            ArrayList<String> secondLineValues = new ArrayList<String>(20);
            bodyOffset += getTokenizedHeaderLine(is, secondLineValues);
            setVersion(secondLineValues.get(0) +
                "." + secondLineValues.get(1));
            // Third line (URL-record definition) is skipped.
            bodyOffset += getTokenizedHeaderLine(is, null);
        }

        try {
            currentRecord(new ARCRecord(is,
                (ArchiveRecordHeader)computeMetaData(this.headerFieldNameKeys,
                    firstLineValues,
                    getVersion(), offset), bodyOffset, isDigest(),
                isStrict(), isParseHttpHeaders()));
        } catch (IOException e) {
            // Rethrow with the offset appended so the failure is locatable,
            // preserving the original stack trace.
            IOException newE = new IOException(e.getMessage() + " (Offset " +
                offset + ").");
            newE.setStackTrace(e.getStackTrace());
            throw newE;
        }
        return (ARCRecord)getCurrentRecord();
    }

    /**
     * @return the ARC version, defaulting to "1.1" when none has been read
     * from the file (e.g. when reading from a mid-file offset).
     */
    public String getVersion() {
        return (super.getVersion() == null)? "1.1": super.getVersion();
    }

    /**
     * Read and tokenize one space-separated metadata header line.
     *
     * @param stream stream positioned at the start of the line.
     * @param list receives the tokens; may be null to discard them.
     * @return total bytes consumed, including the terminating newline.
     * @throws RecoverableIOException on EOF before end-of-line.
     * @throws IOException if the line exceeds {@link #MAX_HEADER_LINE_LENGTH}
     * or tokenizes to an implausible field count.
     */
    private int getTokenizedHeaderLine(final InputStream stream,
            List<String> list) throws IOException {
        // Sized for a long URL plus the fixed-width fields.
        StringBuilder buffer = new StringBuilder(2048 + 20);
        int read = 0;
        int previous = -1;
        for (int c = -1; true;) {
            previous = c;
            c = stream.read();
            if (c == -1) {
                throw new RecoverableIOException("Hit EOF before header EOL.");
            }
            c &= 0xff;
            read++;
            if (read > MAX_HEADER_LINE_LENGTH) {
                throw new IOException("Header line longer than max allowed " +
                    " -- " + String.valueOf(MAX_HEADER_LINE_LENGTH) +
                    " -- or passed buffer doesn't contain a line (Read: " +
                    buffer.length() + "). Here's" +
                    " some of what was read: " +
                    buffer.substring(0, Math.min(buffer.length(), 256)));
            }

            if (c == LINE_SEPARATOR) {
                if (buffer.length() == 0) {
                    // Empty line; skip and keep looking for the real one.
                    continue;
                }

                if (list != null) {
                    list.add(buffer.toString());
                }
                // Found the line's end.
                break;
            } else if (c == HEADER_FIELD_SEPARATOR) {
                if (!isStrict() && previous == HEADER_FIELD_SEPARATOR) {
                    // Lax mode: collapse runs of separators.
                    continue;
                }
                if (list != null) {
                    list.add(buffer.toString());
                }
                buffer.setLength(0);
            } else {
                buffer.append((char)c);
            }
        }

        // Sanity-check the field count (metadata lines have 3-5 fields;
        // allow slack for odd-but-recoverable lines).
        if (list != null && (list.size() < 3 || list.size() > 100)) {
            throw new IOException("Unparseable header line: " + list);
        }

        return read;
    }

    /**
     * Pair metadata keys with the parsed values, repairing common
     * malformations (spaces in the URL field, a stray "charset=" token
     * split off the mimetype) when not in strict mode.
     *
     * @param keys field-name keys, in line order.
     * @param values parsed field values.
     * @param v ARC version string.
     * @param offset absolute offset of the record.
     * @return metadata for the record at {@code offset}.
     * @throws IOException if key and value counts cannot be reconciled.
     */
    private ARCRecordMetaData computeMetaData(List<String> keys,
            List<String> values, String v, long offset)
    throws IOException {
        if (keys.size() != values.size()) {
            List<String> originalValues = values;
            if (!isStrict()) {
                values = fixSpaceInMetadataLine(values, keys.size());
                if (keys.size() != values.size()) {
                    // One extra value whose 5th token starts "charset=":
                    // a mimetype like "text/html; charset=x" was split on
                    // the space; glue it back onto the mimetype field.
                    if (values.size() == (keys.size() + 1) &&
                            values.get(4).toLowerCase().startsWith("charset=")) {
                        List<String> nuvalues =
                            new ArrayList<String>(keys.size());
                        nuvalues.add(0, values.get(0));
                        nuvalues.add(1, values.get(1));
                        nuvalues.add(2, values.get(2));
                        nuvalues.add(3, values.get(3) + values.get(4));
                        nuvalues.add(4, values.get(5));
                        values = nuvalues;
                    }
                }
            }
            if (keys.size() != values.size()) {
                throw new IOException("Size of field name keys does" +
                    " not match count of field values: " + values);
            }
            // Note: the repair happened; flag it.
            logStdErr(Level.WARNING, "Fixed spaces in metadata line at " +
                "offset " + offset +
                " Original: " + originalValues + ", New: " + values);
        }

        Map<Object, Object> headerFields =
            new HashMap<Object, Object>(keys.size() + 2);
        for (int i = 0; i < keys.size(); i++) {
            headerFields.put(keys.get(i), values.get(i));
        }

        // Escape any embedded TABs in the URL so downstream tab-delimited
        // formats (CDX) are not corrupted.
        String url = (String)headerFields.get(URL_FIELD_KEY);
        if (url != null && url.indexOf('\t') >= 0) {
            headerFields.put(URL_FIELD_KEY,
                TextUtils.replaceAll("\t", url, "%09"));
        }

        headerFields.put(VERSION_FIELD_KEY, v);
        headerFields.put(ABSOLUTE_OFFSET_KEY, Long.valueOf(offset));

        return new ARCRecordMetaData(getReaderIdentifier(), headerFields);
    }

    /**
     * Repair a metadata line whose URL field contained literal spaces, which
     * made it tokenize into too many values.  The trailing four fields
     * (ip, date, mimetype, length) are validated positionally; everything
     * before them is rejoined into the URL with "%20".
     *
     * @param values over-long value list.
     * @param requiredSize expected number of fields.
     * @return repaired list, or {@code values} unchanged if the line does
     * not match the expected trailing-field shapes.
     */
    protected List<String> fixSpaceInMetadataLine(List<String> values,
            int requiredSize) {
        if (!(values.size() > requiredSize) || values.size() < 4) {
            return values;
        }
        // Third-from-last field must look like a 14-digit ARC date.
        String date = (String)values.get(values.size() - 3);
        if (date.length() != 14) {
            return values;
        }
        for (int i = 0; i < date.length(); i++) {
            if (!Character.isDigit(date.charAt(i))) {
                return values;
            }
        }
        // Fourth-from-last field must be an IPv4 quad or the "-" filler.
        String ip = (String)values.get(values.size() - 4);
        Matcher m = InetAddressUtil.IPV4_QUADS.matcher(ip);
        // BUGFIX: was `ip == "-"` (reference comparison); parsed strings are
        // never identical to the literal, so the "-" case never matched.
        if ("-".equals(ip) || m.matches()) {
            List<String> newValues = new ArrayList<String>(requiredSize);
            StringBuffer url = new StringBuffer();
            for (int i = 0; i < (values.size() - 4); i++) {
                if (i > 0) {
                    url.append("%20");
                }
                url.append(values.get(i));
            }
            newValues.add(url.toString());
            for (int i = values.size() - 4; i < values.size(); i++) {
                newValues.add(values.get(i));
            }
            values = newValues;
        }
        return values;
    }

    protected boolean isAlignedOnFirstRecord() {
        return alignedOnFirstRecord;
    }

    protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) {
        this.alignedOnFirstRecord = alignedOnFirstRecord;
    }

    /**
     * @return true if records parse the leading HTTP response headers.
     */
    public boolean isParseHttpHeaders() {
        return this.parseHttpHeaders;
    }

    /**
     * @param parse whether records should parse HTTP response headers.
     */
    public void setParseHttpHeaders(boolean parse) {
        this.parseHttpHeaders = parse;
    }

    public String getFileExtension() {
        return ARC_FILE_EXTENSION;
    }

    public String getDotFileExtension() {
        return DOT_ARC_FILE_EXTENSION;
    }

    protected boolean output(final String format)
    throws IOException, java.text.ParseException {
        boolean result = super.output(format);
        if (!result && (format.equals(NOHEAD) || format.equals(HEADER))) {
            // These two formats only make sense for a single record.
            throw new IOException(format +
                " format only supported for single Records");
        }
        return result;
    }

    protected boolean outputRecord(final String format) throws IOException {
        boolean result = super.outputRecord(format);
        if (result) {
            return result;
        }
        if (format.equals(NOHEAD)) {
            // Body only: digesting would consume the stream we dump.
            setDigest(false);
            ARCRecord r = (ARCRecord) get();
            r.skipHttpHeader();
            r.dump();
            result = true;
        } else if (format.equals(HEADER)) {
            // HTTP headers only.
            setDigest(false);
            ARCRecord r = (ARCRecord) get();
            r.dumpHttpHeader();
            result = true;
        }

        return result;
    }

    /**
     * Copy every record to stdout as a (possibly recompressed) ARC.
     * The first record (filedesc) is buffered whole and replayed as the new
     * file's metadata.
     *
     * @param compress whether to gzip the output.
     */
    public void dump(final boolean compress)
    throws IOException, java.text.ParseException {
        // Digesting would interfere with streaming the bodies out.
        setDigest(false);
        boolean firstRecord = true;
        ARCWriter writer = null;
        for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
            ARCRecord r = (ARCRecord)ii.next();
            ARCRecordMetaData meta = r.getMetaData();
            if (firstRecord) {
                firstRecord = false;
                // Buffer the filedesc record body; it becomes the new
                // writer's file metadata.
                ByteArrayOutputStream baos =
                    new ByteArrayOutputStream(r.available());
                while (r.available() > 0) {
                    baos.write(r.read());
                }
                List<String> listOfMetadata = new ArrayList<String>();
                listOfMetadata.add(baos.toString(WriterPoolMember.UTF8));
                // NOTE(review): writer wraps System.out and is deliberately
                // never closed here -- closing it would close stdout.
                writer = new ARCWriter(new AtomicInteger(), System.out,
                    new File(meta.getArc()),
                    compress, meta.getDate(), listOfMetadata);
                continue;
            }

            writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(),
                ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(),
                (int)meta.getLength(), r);
        }
    }

    /**
     * @param f file to delete when this reader is closed.
     * @return a delegating reader that deletes {@code f} on close.
     */
    public ARCReader getDeleteFileOnCloseReader(final File f) {
        final ARCReader d = this;
        return new ARCReader() {
            private final ARCReader delegate = d;
            private File archiveFile = f;

            public void close() throws IOException {
                this.delegate.close();
                if (this.archiveFile != null) {
                    if (archiveFile.exists()) {
                        archiveFile.delete();
                    }
                    // Null out so a second close() is a no-op.
                    this.archiveFile = null;
                }
            }

            public ArchiveRecord get(long o) throws IOException {
                return this.delegate.get(o);
            }

            public boolean isDigest() {
                return this.delegate.isDigest();
            }

            public boolean isStrict() {
                return this.delegate.isStrict();
            }

            public Iterator<ArchiveRecord> iterator() {
                return this.delegate.iterator();
            }

            public void setDigest(boolean d) {
                this.delegate.setDigest(d);
            }

            public void setStrict(boolean s) {
                this.delegate.setStrict(s);
            }

            public List validate() throws IOException {
                return this.delegate.validate();
            }

            @Override
            public ArchiveRecord get() throws IOException {
                return this.delegate.get();
            }

            @Override
            public String getVersion() {
                return this.delegate.getVersion();
            }

            @Override
            public List validate(int noRecords) throws IOException {
                return this.delegate.validate(noRecords);
            }

            @Override
            protected ARCRecord createArchiveRecord(InputStream is,
                    long offset)
            throws IOException {
                return this.delegate.createArchiveRecord(is, offset);
            }

            @Override
            protected void gotoEOR(ArchiveRecord record) throws IOException {
                this.delegate.gotoEOR(record);
            }

            @Override
            public void dump(boolean compress)
            throws IOException, java.text.ParseException {
                this.delegate.dump(compress);
            }

            @Override
            public String getDotFileExtension() {
                return this.delegate.getDotFileExtension();
            }

            @Override
            public String getFileExtension() {
                return this.delegate.getFileExtension();
            }
        };
    }

    // Command-line support below.

    /**
     * Print usage and exit.
     *
     * @param formatter help formatter to print with.
     * @param options the configured CLI options.
     * @param exitCode process exit code.
     */
    private static void usage(HelpFormatter formatter, Options options,
            int exitCode) {
        formatter.printHelp("java org.archive.io.arc.ARCReader" +
            " [--digest=true|false] \\\n" +
            " [--format=cdx|cdxfile|dump|gzipdump|header|nohead]" +
            " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL",
            options);
        System.exit(exitCode);
    }

    /**
     * Output all of {@code reader} in {@code format}, failing loudly on an
     * unsupported format.
     */
    protected static void output(ARCReader reader, String format)
    throws IOException, java.text.ParseException {
        if (!reader.output(format)) {
            throw new IOException("Unsupported format: " + format);
        }
    }

    /**
     * Output the current record of {@code r} in {@code format}, failing
     * loudly on an unsupported format.
     */
    protected static void outputRecord(final ARCReader r, final String format)
    throws IOException {
        if (!r.outputRecord(format)) {
            throw new IOException("Unsupported format" +
                " (or unsupported on a single record): " + format);
        }
    }

    /**
     * Write a CDX index file beside the ARC named by {@code urlOrPath}.
     *
     * @param urlOrPath location of the ARC to index.
     */
    public static void createCDXIndexFile(String urlOrPath)
    throws IOException, java.text.ParseException {
        ARCReader r = ARCReaderFactory.get(urlOrPath);
        r.setStrict(false);
        r.setParseHttpHeaders(true);
        r.setDigest(true);
        output(r, CDX_FILE);
    }

    /**
     * Command-line entry point: dump or index one or more ARC files.
     * See {@link #usage} output for the options.
     */
    public static void main(String[] args)
    throws ParseException, IOException, java.text.ParseException {
        Options options = new Options();
        options.addOption(new Option("h","help", false,
            "Prints this message and exits."));
        options.addOption(new Option("o","offset", true,
            "Outputs record at this offset into arc file."));
        options.addOption(new Option("d","digest", true,
            "Pass true|false. Expensive. Default: true (SHA-1)."));
        options.addOption(new Option("s","strict", false,
            "Strict mode. Fails parse if incorrectly formatted ARC."));
        options.addOption(new Option("p","parse", true,
            "Pass true|false to parse HTTP Headers. Default: false."));
        options.addOption(new Option("f","format", true,
            "Output options: 'cdx', 'cdxfile', 'dump', 'gzipdump', " +
            "'header', or 'nohead'. Default: 'cdx'."));
        PosixParser parser = new PosixParser();
        CommandLine cmdline = parser.parse(options, args, false);
        List cmdlineArgs = cmdline.getArgList();
        Option[] cmdlineOptions = cmdline.getOptions();
        HelpFormatter formatter = new HelpFormatter();

        // No files to process: print help and exit.
        if (cmdlineArgs.size() <= 0) {
            usage(formatter, options, 0);
        }

        long offset = -1;
        boolean digest = false;
        boolean strict = false;
        boolean parse = false;
        String format = CDX;
        for (int i = 0; i < cmdlineOptions.length; i++) {
            switch (cmdlineOptions[i].getId()) {
                case 'h':
                    usage(formatter, options, 0);
                    break;

                case 'o':
                    offset =
                        Long.parseLong(cmdlineOptions[i].getValue());
                    break;

                case 's':
                    strict = true;
                    break;

                case 'p':
                    parse = getTrueOrFalse(cmdlineOptions[i].getValue());
                    break;

                case 'd':
                    digest = getTrueOrFalse(cmdlineOptions[i].getValue());
                    break;

                case 'f':
                    format = cmdlineOptions[i].getValue().toLowerCase();
                    boolean match = false;
                    final String[] supportedFormats =
                        {CDX, DUMP, GZIP_DUMP, HEADER, NOHEAD, CDX_FILE};
                    for (int ii = 0; ii < supportedFormats.length; ii++) {
                        if (supportedFormats[ii].equals(format)) {
                            match = true;
                            break;
                        }
                    }
                    if (!match) {
                        usage(formatter, options, 1);
                    }
                    break;

                default:
                    // Fixed stray doubled '+' that applied a unary plus to
                    // the option id.
                    throw new RuntimeException("Unexpected option: " +
                        cmdlineOptions[i].getId());
            }
        }

        if (offset >= 0) {
            // Offset mode: exactly one file, output exactly one record.
            if (cmdlineArgs.size() != 1) {
                System.out.println("Error: Pass one arcfile only.");
                usage(formatter, options, 1);
            }
            ARCReader arc = ARCReaderFactory.get((String)cmdlineArgs.get(0),
                offset);
            arc.setStrict(strict);
            // Header/body-only formats require parsed HTTP headers.
            if (format.equals(NOHEAD) || format.equals(HEADER)) {
                parse = true;
            }
            arc.setParseHttpHeaders(parse);
            outputRecord(arc, format);
        } else {
            // Whole-file mode over each argument.
            for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) {
                String urlOrPath = (String)i.next();
                try {
                    ARCReader r = ARCReaderFactory.get(urlOrPath);
                    r.setStrict(strict);
                    r.setParseHttpHeaders(parse);
                    r.setDigest(digest);
                    output(r, format);
                } catch (RuntimeException e) {
                    // Report which input failed, then exit non-zero.
                    System.err.println("Exception processing " + urlOrPath +
                        ": " + e.getMessage());
                    e.printStackTrace(System.err);
                    System.exit(1);
                }
            }
        }
    }
}