package org.archive.crawler.writer;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.ListType;
import org.archive.crawler.settings.RegularExpressionConstraint;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.StringList;
import org.archive.crawler.settings.Type;
import org.archive.io.RecordingInputStream;
import org.archive.io.ReplayInputStream;
import org.archive.net.UURI;
import org.archive.util.IoUtils;

/**
 * Processor that writes the content of each successfully fetched HTTP(S) URI
 * to a file whose path is derived from the URI (host, optional port, path
 * segments, query and suffix).
 *
 * <p>The derived path is adjusted so that it is legal on the target file
 * system: characters can be remapped, segments and the whole path are
 * truncated to configurable maximum lengths, and name clashes (including
 * case-insensitive clashes) are resolved by appending a number.</p>
 */
public class MirrorWriterProcessor
extends Processor implements CoreAttributeConstants {

    private static final long serialVersionUID = 301407556928389168L;

    /** Key: true if the file system is case-sensitive. */
    public static final String ATTR_CASE_SENSITIVE = "case-sensitive";

    /** Key: list of (single character, replacement string) pairs. */
    public static final String ATTR_CHAR_MAP = "character-map";

    /** Key: list of (content-type prefix, suffix) pairs. */
    public static final String ATTR_CONTENT_TYPE_MAP = "content-type-map";

    /** Key: replacement for a '.' at the start of a segment. */
    public static final String ATTR_DOT_BEGIN = "dot-begin";

    /** Key: replacement for a '.' at the end of a directory name. */
    public static final String ATTR_DOT_END = "dot-end";

    /** Key: file name implicitly appended to a URI ending with '/'. */
    public static final String ATTR_DIRECTORY_FILE = "directory-file";

    /** Key: true to create a subdirectory named for the host. */
    public static final String ATTR_HOST_DIRECTORY = "host-directory";

    /** Key: list of (host name, replacement host name) pairs. */
    public static final String ATTR_HOST_MAP = "host-map";

    /** Key: maximum file system path length. */
    public static final String ATTR_MAX_PATH_LEN = "max-path-length";

    /** Key: maximum file system path segment length. */
    public static final String ATTR_MAX_SEG_LEN = "max-segment-length";

    /** Key: top-level directory for mirror files. */
    public static final String ATTR_PATH = "path";

    /** Key: true to create a subdirectory named for the port. */
    public static final String ATTR_PORT_DIRECTORY = "port-directory";

    /** Key: true to place the suffix after the query, false before it. */
    public static final String ATTR_SUFFIX_AT_END = "suffix-at-end";

    /** Key: directory that replaces a directory path that is too long. */
    public static final String ATTR_TOO_LONG_DIRECTORY = "too-long-directory";

    /** Key: directory names that get an underscore placed before them. */
    public static final String ATTR_UNDERSCORE_SET = "underscore-set";

    /** Default value for {@link #ATTR_DOT_BEGIN}. */
    private static final String DEFAULT_DOT_BEGIN = "%2E";

    /** Default (and fallback) value for {@link #ATTR_MAX_PATH_LEN}. */
    private static final int DEFAULT_MAX_PATH_LEN = 1023;

    /** Default (and fallback) value for {@link #ATTR_MAX_SEG_LEN}. */
    private static final int DEFAULT_MAX_SEG_LEN = 255;

    /** Default (and fallback) value for {@link #ATTR_TOO_LONG_DIRECTORY}. */
    private static final String DEFAULT_TOO_LONG_DIRECTORY = "LONG";

    /** Shared immutable empty character map. */
    private static final Map<String, String> EMPTY_MAP
        = Collections.unmodifiableMap(new TreeMap<String, String>());

    /** Regular expression: one or more characters, none a file separator. */
    private static final String PATH_SEGMENT_RE =
        "[^\\" + File.separator + "]+";

    /**
     * Regular expression: a non-empty string that does not start with the
     * file separator.
     */
    private static final String TOO_LONG_DIRECTORY_RE =
        "[^\\" + File.separator + "].*";

    /** Logger for this class. */
    private static final Logger logger =
        Logger.getLogger(MirrorWriterProcessor.class.getName());

    /**
     * Creates the processor and registers all of its settings, with their
     * descriptions, defaults and constraints.
     *
     * @param name name of this processor
     */
    public MirrorWriterProcessor(String name) {
        super(name, "MirrorWriter processor. " +
            "A writer that writes each URL to a file on disk named for " +
            "a derivative of the URL.");
        Type e;
        addElementToDefinition(new SimpleType(ATTR_CASE_SENSITIVE,
            "True if the file system is case-sensitive, like UNIX. "
            + "False if the file system is case-insensitive, "
            + "like Macintosh HFS+ and Windows.",
            Boolean.TRUE));
        // NOTE(review): the double quote below arrived unescaped in the
        // extracted text; it is escaped here so the literal is well-formed.
        // Confirm against the upstream source whether an HTML entity was
        // intended instead.
        addElementToDefinition(new StringList(ATTR_CHAR_MAP,
            "This list is grouped in pairs. "
            + "The first string in each pair must have a length of one. "
            + "If it occurs in a URI path, "
            + "it is replaced by the second string in the pair. "
            + "For UNIX, no character mapping is normally needed. "
            + "For Macintosh, the recommended value is [: %%3A]. "
            + "For Windows, the recommended value is "
            + "[' ' %%20 \" %%22 * %%2A : %%3A < %%3C "
            + "\\> %%3E ? %%3F \\\\ %%5C ^ %%5E | %%7C]."));
        addElementToDefinition(new StringList(ATTR_CONTENT_TYPE_MAP,
            "This list is grouped in pairs. "
            + "If the content type of a resource begins (case-insensitive) "
            + "with the first string in a pair, the suffix is set to "
            + "the second string in the pair, replacing any suffix that may "
            + "have been in the URI. For example, to force all HTML files "
            + "to have the same suffix, use [text/html html]."));
        e = addElementToDefinition(new SimpleType(ATTR_DIRECTORY_FILE,
            "Implicitly append this to a URI ending with '/'.",
            "index.html"));
        e.addConstraint(new RegularExpressionConstraint(PATH_SEGMENT_RE,
            Level.SEVERE, "This must be a simple file name."));
        e = addElementToDefinition(new SimpleType(ATTR_DOT_BEGIN,
            "If a segment starts with '.', the '.' is replaced by this.",
            DEFAULT_DOT_BEGIN));
        e.addConstraint(new RegularExpressionConstraint(PATH_SEGMENT_RE,
            Level.SEVERE,
            "This must not be empty, and must not contain " + File.separator));
        addElementToDefinition(new SimpleType(ATTR_DOT_END,
            "If a directory name ends with '.' it is replaced by this. "
            + "For all file systems except Windows, '.' is recommended. "
            + "For Windows, %%2E is recommended.",
            "."));
        addElementToDefinition(new StringList(ATTR_HOST_MAP,
            "This list is grouped in pairs. "
            + "If a host name matches (case-insensitive) the first string "
            + "in a pair, it is replaced by the second string in the pair. "
            + "This can be used for consistency when several names are used "
            + "for one host, for example "
            + "[12.34.56.78 www42.foo.com]."));
        addElementToDefinition(new SimpleType(ATTR_HOST_DIRECTORY,
            "Create a subdirectory named for the host in the URI.",
            Boolean.TRUE));
        addElementToDefinition(new SimpleType(ATTR_PATH,
            "Top-level directory for mirror files.", "mirror"));

        addElementToDefinition(new SimpleType(ATTR_MAX_PATH_LEN,
            "Maximum file system path length.",
            Integer.valueOf(DEFAULT_MAX_PATH_LEN)));
        addElementToDefinition(new SimpleType(ATTR_MAX_SEG_LEN,
            "Maximum file system path segment length.",
            Integer.valueOf(DEFAULT_MAX_SEG_LEN)));
        addElementToDefinition(new SimpleType(ATTR_PORT_DIRECTORY,
            "Create a subdirectory named for the port in the URI.",
            Boolean.FALSE));
        addElementToDefinition(new SimpleType(ATTR_SUFFIX_AT_END,
            "If true, the suffix is placed at the end of the path, "
            + "after the query (if any). If false, the suffix is placed "
            + "before the query.",
            Boolean.TRUE));
        e = addElementToDefinition(new SimpleType(ATTR_TOO_LONG_DIRECTORY,
            "If all the directories in the URI would exceed, "
            + "or come close to exceeding, the file system maximum "
            + "path length, then they are all replaced by this.",
            DEFAULT_TOO_LONG_DIRECTORY));
        e.addConstraint(new RegularExpressionConstraint(TOO_LONG_DIRECTORY_RE,
            Level.SEVERE, "This must be relative and not empty."));
        addElementToDefinition(new StringList(ATTR_UNDERSCORE_SET,
            "If a directory name appears (case-insensitive) in this list "
            + "then an underscore is placed before it. "
            + "For all file systems except Windows, this is not needed. "
            + "For Windows, the following is recommended: "
            + "[com1 com2 com3 com4 com5 com6 com7 com8 com9 "
            + "lpt1 lpt2 lpt3 lpt4 lpt5 lpt6 lpt7 lpt8 lpt9 "
            + "con nul prn]."));
    }

    /**
     * Writes the recorded response content of a successful http/https URI to
     * its mirror file.
     *
     * <p>If the URI already carries a mirror path (key
     * {@code A_MIRROR_PATH}) that path is reused; otherwise a new path is
     * derived from the URI and stored under that key after a successful
     * write. I/O failures are recorded on the URI as a localized error.</p>
     *
     * @param curi the URI being processed
     */
    protected void innerProcess(CrawlURI curi) {
        if (!curi.isSuccess()) {
            return;
        }
        UURI uuri = curi.getUURI();
        String scheme = uuri.getScheme();
        if (!"http".equalsIgnoreCase(scheme)
                && !"https".equalsIgnoreCase(scheme)) {
            return;
        }
        RecordingInputStream recis = curi.getHttpRecorder().getRecordedInput();
        if (0L == recis.getResponseContentLength()) {
            return;
        }

        String baseSeg;
        try {
            baseSeg = (String) getAttribute(ATTR_PATH, curi);
        } catch (AttributeNotFoundException e) {
            logger.warning(e.getLocalizedMessage());
            return;
        }
        String baseDir = resolveBaseDir(baseSeg);

        // A URI that already has a mirror path is being crawled again.
        boolean reCrawl = curi.containsKey(A_MIRROR_PATH);

        String mps;
        File destFile;
        try {
            if (reCrawl) {
                mps = curi.getString(A_MIRROR_PATH);
                destFile = new File(baseDir + File.separator + mps);
                File parent = destFile.getParentFile();
                if (null != parent) {
                    IoUtils.ensureWriteableDirectory(parent);
                }
            } else {
                URIToFileReturn r;
                try {
                    r = uriToFile(baseDir, curi);
                } catch (AttributeNotFoundException e) {
                    logger.warning(e.getLocalizedMessage());
                    return;
                }
                destFile = r.getFile();
                mps = r.getRelativePath();
            }
            logger.info(uuri.toString() + " -> " + destFile.getPath());
            writeToPath(recis, destFile);
            if (!reCrawl) {
                curi.putString(A_MIRROR_PATH, mps);
            }
        } catch (IOException e) {
            curi.addLocalizedError(this.getName(), e, "Mirror");
        }
    }

    /**
     * Turns the configured {@link #ATTR_PATH} value into the base directory.
     *
     * @param configuredPath the configured path
     * @return the controller's disk path if {@code configuredPath} is empty,
     * {@code configuredPath} itself (minus trailing separators) if it is
     * absolute, otherwise {@code configuredPath} resolved below the
     * controller's disk path
     */
    private String resolveBaseDir(String configuredPath) {
        String baseSeg = configuredPath;
        // Strip trailing separators but keep a lone separator intact.
        while ((baseSeg.length() > 1) && baseSeg.endsWith(File.separator)) {
            baseSeg = baseSeg.substring(0, baseSeg.length() - 1);
        }
        if (0 == baseSeg.length()) {
            return getController().getDisk().getPath();
        }
        if ((new File(baseSeg)).isAbsolute()) {
            return baseSeg;
        }
        return getController().getDisk().getPath() + File.separator + baseSeg;
    }

    /**
     * Creates the directories for all segments except the last one.
     *
     * @param baseDir base directory
     * @param host host name, or null for no host directory
     * @param port port number, or a non-positive value for no port directory
     * @param segs all path segments; the last one is not processed here
     * @param maxLen maximum length of the resulting directory path
     * @return the directory path, or null if it became longer than
     * {@code maxLen}
     * @throws IOException if a directory can not be created or used
     */
    private URIToFileReturn dirPath(String baseDir, String host, int port,
                                    PathSegment[] segs, int maxLen)
            throws IOException {

        URIToFileReturn r = new URIToFileReturn(baseDir, host, port);
        r.mkdirs();
        for (int i = 0; (segs.length - 1) != i; ++i) {
            segs[i].addToPath(r);
            if (r.longerThan(maxLen)) {
                return null;
            }
        }
        return r;
    }

    /**
     * Makes the list length even by removing the last element if the length
     * is odd. Note that this modifies the list passed in.
     *
     * @param list the list to adjust
     */
    private void ensurePairs(ListType list) {
        if (1 == (list.size() % 2)) {
            list.remove(list.size() - 1);
        }
    }

    /**
     * Reads all relevant settings for a URI and derives its mirror file.
     *
     * @param baseDir base directory for mirror files
     * @param curi the URI to map
     * @return the file path for the URI
     * @throws AttributeNotFoundException if a setting can not be read
     * @throws IOException if a directory can not be created or used, or if
     * no path fits within the maximum path length
     */
    private URIToFileReturn uriToFile(String baseDir, CrawlURI curi)
            throws AttributeNotFoundException, IOException {
        UURI uuri = curi.getUURI();
        String host = null;
        Boolean hd = (Boolean) getAttribute(ATTR_HOST_DIRECTORY, curi);
        if (hd.booleanValue()) {
            host = uuri.getHost();
            StringList hostMap = (StringList) getAttribute(ATTR_HOST_MAP, curi);
            if ((null != hostMap) && (hostMap.size() > 1)) {
                ensurePairs(hostMap);
                Iterator<String> i = hostMap.typesafe().iterator();
                for (boolean more = true; more && i.hasNext();) {
                    String h1 = i.next();
                    String h2 = i.next();
                    // Called on h1 so that a null host can not cause an NPE.
                    if ((null != h1) && h1.equalsIgnoreCase(host)) {
                        more = false;
                        if ((null != h2) && (0 != h2.length())) {
                            host = h2;
                        }
                    }
                }
            }
        }

        int port =
            ((Boolean) getAttribute(ATTR_PORT_DIRECTORY, curi)).booleanValue()
            ? uuri.getPort()
            : -1;

        String suffix = null;
        StringList ctm = (StringList) getAttribute(ATTR_CONTENT_TYPE_MAP, curi);
        if ((null != ctm) && (ctm.size() > 1)) {
            ensurePairs(ctm);
            // Guard against a URI without a recorded content type.
            String rawType = curi.getContentType();
            String contentType = (null == rawType) ? "" : rawType.toLowerCase();
            Iterator<String> i = ctm.typesafe().iterator();
            for (boolean more = true; more && i.hasNext();) {
                String ct = i.next();
                String suf = i.next();
                if ((null != ct) && contentType.startsWith(ct.toLowerCase())) {
                    more = false;
                    if ((null != suf) && (0 != suf.length())) {
                        suffix = suf;
                    }
                }
            }
        }

        int maxSegLen =
            ((Integer) getAttribute(ATTR_MAX_SEG_LEN, curi)).intValue();
        // The end segment is built with maxSegLen - 1, which must be at
        // least 2, so anything below 3 falls back to the default.
        if (maxSegLen < 3) {
            maxSegLen = DEFAULT_MAX_SEG_LEN;
        }

        int maxPathLen =
            ((Integer) getAttribute(ATTR_MAX_PATH_LEN, curi)).intValue();
        if (maxPathLen < 2) {
            maxPathLen = DEFAULT_MAX_PATH_LEN;
        }

        Map<String, String> characterMap = EMPTY_MAP;
        StringList cm = (StringList) getAttribute(ATTR_CHAR_MAP, curi);
        if ((null != cm) && (cm.size() > 1)) {
            ensurePairs(cm);
            characterMap = new HashMap<String, String>(cm.size());
            for (Iterator<String> i = cm.typesafe().iterator(); i.hasNext();) {
                String s1 = i.next();
                String s2 = i.next();
                // Only single-character keys with non-empty values are used.
                if ((null != s1) && (1 == s1.length()) && (null != s2)
                        && (0 != s2.length())) {
                    characterMap.put(s1, s2);
                }
            }
        }

        // A replacement of "." means "leave the dot alone".
        String dotBegin = (String) getAttribute(ATTR_DOT_BEGIN, curi);
        if (".".equals(dotBegin)) {
            dotBegin = null;
        }

        String dotEnd = (String) getAttribute(ATTR_DOT_END, curi);
        if (".".equals(dotEnd)) {
            dotEnd = null;
        }

        String tld = (String) getAttribute(ATTR_TOO_LONG_DIRECTORY, curi);
        if ((null == tld) || (0 == tld.length())
                || (-1 != tld.indexOf(File.separatorChar))) {
            tld = DEFAULT_TOO_LONG_DIRECTORY;
        }

        Set<String> underscoreSet = null;
        StringList us = (StringList) getAttribute(ATTR_UNDERSCORE_SET, curi);
        if ((null != us) && (0 != us.size())) {
            underscoreSet = new HashSet<String>(us.size(), 0.5F);
            for (String s : us.typesafe()) {
                if ((null != s) && (0 != s.length())) {
                    underscoreSet.add(s.toLowerCase());
                }
            }
        }

        return uriToFile(curi, host, port, uuri.getPath(), uuri.getQuery(),
            suffix, baseDir, maxSegLen, maxPathLen,
            ((Boolean) getAttribute(ATTR_CASE_SENSITIVE, curi)).booleanValue(),
            (String) getAttribute(ATTR_DIRECTORY_FILE, curi),
            characterMap, dotBegin, dotEnd, tld,
            ((Boolean) getAttribute(ATTR_SUFFIX_AT_END, curi)).booleanValue(),
            underscoreSet);
    }

    /**
     * Derives the mirror file for a URI from already-resolved settings.
     *
     * <p>Every '/'-delimited part of {@code uriPath} except the last becomes
     * a directory segment; the last becomes the file name, or
     * {@code dirFile} if the path ends with '/'. If the directory path gets
     * longer than {@code maxPathLen - maxSegLen}, all directories are
     * replaced by {@code tooLongDir}.</p>
     *
     * @param curi the URI being mapped
     * @param host host directory name, or null for none
     * @param port port directory number, or non-positive for none
     * @param uriPath URI path, starting with '/'
     * @param query URI query, or null
     * @param suffix forced suffix, or null to take it from the path
     * @param baseDir base directory
     * @param maxSegLen maximum segment length, greater than 2
     * @param maxPathLen maximum path length
     * @param caseSensitive true if the file system is case-sensitive
     * @param dirFile file name used when the path ends with '/'
     * @param characterMap character replacements, not null
     * @param dotBegin replacement for a leading '.', or null
     * @param dotEnd replacement for a trailing '.' of a directory, or null
     * @param tooLongDir directory replacing a too-long directory path
     * @param suffixAtEnd true to place the suffix after the query
     * @param underscoreSet lower-case directory names to prefix with '_',
     * or null
     * @return the file path for the URI
     * @throws IOException if a directory can not be created or used, or if
     * no path fits within the maximum path length
     */
    private URIToFileReturn uriToFile(CrawlURI curi, String host, int port,
            String uriPath, String query, String suffix, String baseDir,
            int maxSegLen, int maxPathLen, boolean caseSensitive,
            String dirFile, Map<String, String> characterMap,
            String dotBegin, String dotEnd,
            String tooLongDir, boolean suffixAtEnd, Set<String> underscoreSet)
            throws IOException {
        assert (null == host) || (0 != host.length());
        assert 0 != uriPath.length();
        assert '/' == uriPath.charAt(0) : "uriPath: " + uriPath;
        assert -1 == uriPath.indexOf("//") : "uriPath: " + uriPath;
        assert -1 == uriPath.indexOf("/./") : "uriPath: " + uriPath;
        assert !uriPath.endsWith("/.") : "uriPath: " + uriPath;
        assert (null == query) || (-1 == query.indexOf('/'))
            : "query: " + query;
        assert (null == suffix)
            || ((0 != suffix.length()) && (-1 == suffix.indexOf('/')))
            : "suffix: " + suffix;
        assert 0 != baseDir.length();
        assert maxSegLen > 2 : "maxSegLen: " + maxSegLen;
        assert maxPathLen > 1;
        assert maxPathLen >= maxSegLen
            : "maxSegLen: " + maxSegLen + " maxPathLen: " + maxPathLen;
        assert 0 != dirFile.length();
        assert -1 == dirFile.indexOf("/") : "dirFile: " + dirFile;
        assert null != characterMap;
        assert (null == dotBegin) || (0 != dotBegin.length());
        assert (null == dotEnd) || !dotEnd.endsWith(".") : "dotEnd: " + dotEnd;
        assert 0 != tooLongDir.length();
        assert '/' != tooLongDir.charAt(0) : "tooLongDir: " + tooLongDir;

        // One segment per '/' in the path.
        int nSegs = 0;
        for (int i = 0; uriPath.length() != i; ++i) {
            if ('/' == uriPath.charAt(i)) {
                ++nSegs;
            }
        }
        assert nSegs > 0 : "uriPath: " + uriPath;
        PathSegment[] segs = new PathSegment[nSegs];
        int slashIndex = 0;
        for (int i = 0; (segs.length - 1) != i; ++i) {
            int nsi = uriPath.indexOf('/', slashIndex + 1);
            assert nsi > slashIndex : "uriPath: " + uriPath;
            segs[i] = new DirSegment(uriPath, slashIndex + 1, nsi,
                                     maxSegLen, caseSensitive, curi,
                                     characterMap, dotBegin, dotEnd,
                                     underscoreSet);
            slashIndex = nsi;
        }
        if (slashIndex < (uriPath.length() - 1)) {
            // The path ends with a file name.
            segs[segs.length - 1] = new EndSegment(uriPath, slashIndex + 1,
                uriPath.length(), maxSegLen, caseSensitive, curi,
                characterMap, dotBegin, query, suffix, maxPathLen,
                suffixAtEnd);
        } else {
            // The path ends with '/': use the directory file name.
            segs[segs.length - 1] = new EndSegment(dirFile, 0, dirFile.length(),
                maxSegLen, caseSensitive, curi, characterMap, null,
                query, suffix, maxPathLen, suffixAtEnd);
        }
        URIToFileReturn r = dirPath(baseDir, host, port, segs,
                                    maxPathLen - maxSegLen);
        if (null == r) {
            // Directory path too long: replace all directories by tooLongDir.
            PathSegment endSegment = segs[segs.length - 1];
            segs = new PathSegment[2];
            segs[0] = new DirSegment(tooLongDir, 0, tooLongDir.length(),
                                     maxSegLen, caseSensitive, curi, EMPTY_MAP,
                                     null, null, null);
            segs[1] = endSegment;
            r = dirPath(baseDir, host, port, segs, maxPathLen - maxSegLen);
            if (null == r) {
                // Even the replacement directory does not fit; report it as
                // an I/O problem instead of failing on a null path below.
                throw new IOException("Path too long for " + curi.toString()
                    + " below " + baseDir + " (maximum path length "
                    + maxPathLen + ")");
            }
        }
        segs[segs.length - 1].addToPath(r);
        return r;
    }

    /**
     * Copies the recorded response content to a file.
     *
     * <p>The content is first written to a temporary file named like the
     * destination with "N" appended, which is then renamed to the
     * destination. The temporary file is removed if writing or renaming
     * fails.</p>
     *
     * @param recis the recorded response
     * @param dest the destination file
     * @throws IOException if writing or renaming fails
     */
    private void writeToPath(RecordingInputStream recis, File dest)
            throws IOException {
        File tf = new File(dest.getPath() + "N");
        boolean written = false;
        ReplayInputStream replayis = recis.getContentReplayInputStream();
        try {
            FileOutputStream fos = new FileOutputStream(tf);
            try {
                replayis.readFullyTo(fos);
            } finally {
                fos.close();
            }
            written = true;
        } finally {
            try {
                replayis.close();
            } finally {
                if (!written) {
                    tf.delete();
                }
            }
        }
        if (!tf.renameTo(dest)) {
            // Renaming onto an existing file is refused on some platforms;
            // remove the old file and try once more.
            boolean renamed = dest.isFile() && dest.delete()
                && tf.renameTo(dest);
            if (!renamed) {
                tf.delete();
                throw new IOException("Can not rename " + tf.getAbsolutePath()
                                      + " to " + dest.getAbsolutePath());
            }
        }
    }

    /**
     * Base class for one segment (directory or file name) of a mirror path.
     */
    abstract class PathSegment {
        /** Result code: no file with the checked name exists. */
        protected static final int EXISTS_NOT = 1;

        /** Result code: a file with exactly the checked name exists. */
        protected static final int EXISTS_EXACT_MATCH = 2;

        /**
         * Result code: a file exists whose name matches the checked name
         * only when case is ignored.
         */
        protected static final int EXISTS_CASE_INSENSITIVE_MATCH = 3;

        /** The URI being processed. */
        protected CrawlURI curi;

        /** The main part of this segment; set by subclasses. */
        protected LumpyString mainPart = null;

        /** Maximum length of this segment. */
        protected int maxSegLen;

        /** True if the file system is case-sensitive. */
        private boolean caseSensitive;

        /**
         * Creates a path segment.
         *
         * @param maxSegLen maximum segment length, at least 2
         * @param caseSensitive true if the file system is case-sensitive
         * @param curi the URI being processed
         * @throws IllegalArgumentException if {@code maxSegLen} is less
         * than 2
         */
        PathSegment(int maxSegLen, boolean caseSensitive, CrawlURI curi) {
            if (maxSegLen < 2) {
                throw new IllegalArgumentException("maxSegLen: " + maxSegLen);
            }
            this.maxSegLen = maxSegLen;
            this.caseSensitive = caseSensitive;
            this.curi = curi;
        }

        /**
         * Appends this segment to a path, choosing a name that does not
         * clash with an unsuitable existing file.
         *
         * @param currentPath the path to extend
         * @throws IOException if a needed directory can not be created or
         * used
         */
        abstract void addToPath(URIToFileReturn currentPath) throws IOException;

        /**
         * Checks whether a file exists, distinguishing exact from
         * case-insensitive name matches on a case-insensitive file system.
         *
         * @param fsf the directory containing the file
         * @param segStr the file name within {@code fsf}
         * @param check the file to check ({@code segStr} within {@code fsf})
         * @return {@link #EXISTS_NOT}, {@link #EXISTS_EXACT_MATCH} or
         * {@link #EXISTS_CASE_INSENSITIVE_MATCH}
         */
        protected int existsMaybeCaseSensitive(File fsf, String segStr,
                                               File check) {
            if (caseSensitive) {
                return check.exists() ? EXISTS_EXACT_MATCH : EXISTS_NOT;
            }
            if (!check.exists()) {
                return EXISTS_NOT;
            }

            // The file exists, possibly under a name that differs in case:
            // look at the names actually stored in the directory.
            String[] fna = fsf.list(new CaseInsensitiveFilenameFilter(segStr));
            if (null != fna) {
                // list() returns null if the directory can not be read;
                // in that case no exact match can be confirmed.
                for (int i = 0; fna.length != i; ++i) {
                    if (segStr.equals(fna[i])) {
                        return EXISTS_EXACT_MATCH;
                    }
                }
            }
            return EXISTS_CASE_INSENSITIVE_MATCH;
        }

        /**
         * Filter that accepts names equal to a target name, ignoring case.
         */
        class CaseInsensitiveFilenameFilter implements FilenameFilter {
            /** The name to match. */
            private String target;

            /**
             * Creates the filter.
             *
             * @param target the name to match, not null and not empty
             * @throws IllegalArgumentException if {@code target} is null or
             * empty
             */
            CaseInsensitiveFilenameFilter(String target) {
                if (null == target) {
                    throw new IllegalArgumentException("target null");
                }
                if (0 == target.length()) {
                    throw new IllegalArgumentException("target empty");
                }
                this.target = target;
            }

            public boolean accept(File dir, String name) {
                return target.equalsIgnoreCase(name);
            }
        }
    }

    /**
     * A directory segment of a mirror path.
     */
    class DirSegment extends PathSegment {
        /** Lower-case names that get a '_' prefix, or null. */
        private Set<String> underscoreSet;

        /**
         * Creates a directory segment from part of a URI path.
         *
         * <p>If {@code dotEnd} is not null, a trailing '.' is removed and
         * {@code dotEnd} is appended in its place when it fits within the
         * maximum segment length.</p>
         *
         * @param uriPath the string containing the segment
         * @param beginIndex index of the first character of the segment
         * @param endIndex index after the last character of the segment
         * @param maxSegLen maximum segment length
         * @param caseSensitive true if the file system is case-sensitive
         * @param curi the URI being processed
         * @param characterMap character replacements, not null
         * @param dotBegin replacement for a leading '.', or null
         * @param dotEnd replacement for a trailing '.', or null
         * @param underscoreSet lower-case names to prefix with '_', or null
         */
        DirSegment(String uriPath, int beginIndex, int endIndex, int maxSegLen,
                   boolean caseSensitive, CrawlURI curi,
                   Map<String, String> characterMap,
                   String dotBegin, String dotEnd, Set<String> underscoreSet) {
            super(maxSegLen, caseSensitive, curi);
            mainPart = new LumpyString(uriPath, beginIndex, endIndex,
                                       (null == dotEnd) ? 0 : dotEnd.length(),
                                       this.maxSegLen, characterMap, dotBegin);
            if (null != dotEnd) {
                int dl = dotEnd.length();
                while (mainPart.endsWith('.')) {
                    // Drop the trailing lump, then add the replacement if
                    // there is room for it.
                    mainPart.trimToMax(mainPart.length() - 1);
                    if ((mainPart.length() + dl) <= this.maxSegLen) {
                        mainPart.append(dotEnd);
                    }
                }
            }
            this.underscoreSet = underscoreSet;
        }

        /**
         * Appends this directory to the path, creating it if needed.
         *
         * <p>If the name is taken by something other than a writable
         * directory with exactly this name, a number (1, 2, ...) is
         * appended to the name until a usable one is found.</p>
         */
        @Override
        void addToPath(URIToFileReturn currentPath) throws IOException {
            NumberFormat nf = null;
            int startLen = mainPart.length();
            for (int i = 0; ; ++i) {
                if (0 != i) {
                    // Previous name was unusable: try with number i appended.
                    if (null == nf) {
                        nf = NumberFormat.getIntegerInstance();
                    }
                    String ending = nf.format(i);
                    mainPart.trimToMax(Math.min(startLen,
                                                maxSegLen - ending.length()));
                    mainPart.append(ending);
                }
                String segStr = mainPart.toString();
                if ((null != underscoreSet)
                        && underscoreSet.contains(segStr.toLowerCase())) {
                    mainPart.prepend('_');
                    ++startLen;
                    mainPart.trimToMax(maxSegLen);
                    segStr = mainPart.toString();
                }
                File fsf = currentPath.getFile();
                File f = new File(fsf, segStr);
                int er = existsMaybeCaseSensitive(fsf, segStr, f);
                switch (er) {
                case EXISTS_NOT:
                    // mkdir() also fails if the directory appeared after the
                    // existence check; that is fine as long as it is there.
                    if (!f.mkdir() && !f.isDirectory()) {
                        throw new IOException("Can not mkdir "
                                              + f.getAbsolutePath());
                    }
                    currentPath.append(f, segStr);
                    return;

                case EXISTS_EXACT_MATCH:
                    if (f.isDirectory()) {
                        if (!f.canWrite()) {
                            throw new IOException("Directory "
                                                  + f.getAbsolutePath()
                                                  + " not writeable.");
                        }
                        currentPath.append(f, segStr);
                        return;
                    }
                    // Name is taken by a non-directory: try the next number.
                    break;

                case EXISTS_CASE_INSENSITIVE_MATCH:
                    // Name differs only in case from an existing entry:
                    // try the next number.
                    break;

                default:
                    throw new IllegalStateException("Code: " + er);
                }
            }
        }
    }

    /**
     * The last segment of a mirror path: the file name, made of a main
     * part, an optional uniqueness number, an optional query and an
     * optional suffix.
     */
    class EndSegment extends PathSegment {
        /** Length of the directory path plus one; set by addToPath. */
        private int dirPathLen;

        /** Maximum path length minus one. */
        private int maxPathLen;

        /** The query, or null if there is none. */
        private LumpyString query = null;

        /** The suffix (without the dot), or null if there is none. */
        private String suffix = null;

        /** True to place the suffix after the query. */
        private boolean suffixAtEnd;

        /** Number appended to make the name unique, or null. */
        private String uniquePart = null;

        /**
         * Creates the end segment from part of a URI path.
         *
         * <p>The segment is limited to {@code maxSegLen - 1} characters and
         * the path to {@code maxPathLen - 1}. If {@code suffix} is null, the
         * text after the last '.' of {@code uriPath} (if that dot lies
         * strictly inside the segment) is used as the suffix.</p>
         *
         * @param uriPath the string containing the segment
         * @param beginIndex index of the first character of the segment
         * @param endIndex index after the last character of the segment
         * @param maxSegLen maximum segment length
         * @param caseSensitive true if the file system is case-sensitive
         * @param curi the URI being processed
         * @param characterMap character replacements, not null
         * @param dotBegin replacement for a leading '.', or null
         * @param query the URI query, or null
         * @param suffix forced suffix, or null
         * @param maxPathLen maximum path length
         * @param suffixAtEnd true to place the suffix after the query
         */
        EndSegment(String uriPath, int beginIndex, int endIndex, int maxSegLen,
                   boolean caseSensitive, CrawlURI curi,
                   Map<String, String> characterMap,
                   String dotBegin, String query, String suffix,
                   int maxPathLen, boolean suffixAtEnd) {
            super(maxSegLen - 1, caseSensitive, curi);
            int mpe = endIndex; // end of the main part
            int ldi = uriPath.lastIndexOf('.'); // last dot index
            if ((ldi > 0) && (ldi < (endIndex - 1)) && (ldi > beginIndex)) {
                mpe = ldi;
            }
            this.suffix = suffix;
            if ((null == this.suffix) && (mpe < (endIndex - 1))) {
                // No forced suffix: take it from the path.
                LumpyString ls = new LumpyString(uriPath, mpe + 1, endIndex, 0,
                                                 this.maxSegLen, characterMap,
                                                 null);
                this.suffix = ls.toString();
            }
            int pad = ((null == this.suffix) ? 0 : (1 + this.suffix.length()))
                + ((null == query) ? 0 : query.length());
            mainPart = new LumpyString(uriPath, beginIndex, mpe, pad,
                                       this.maxSegLen, characterMap, dotBegin);
            this.maxPathLen = maxPathLen - 1;
            if (null != query) {
                this.query = new LumpyString(query, 0, query.length(), 0,
                                             this.maxSegLen, characterMap,
                                             null);
            }
            this.suffixAtEnd = suffixAtEnd;
        }

        /**
         * Appends the file name to the path. No file is created here.
         *
         * <p>If the name is taken by something other than a regular file
         * with exactly this name, a number (1, 2, ...) is inserted after
         * the main part until a usable name is found.</p>
         */
        @Override
        void addToPath(URIToFileReturn currentPath) {
            File fsf = currentPath.getFile();
            NumberFormat nf = null;
            dirPathLen = 1 + fsf.getPath().length();
            for (int i = 0; ; ++i) {
                if (0 != i) {
                    if (null == nf) {
                        nf = NumberFormat.getIntegerInstance();
                    }
                    uniquePart = nf.format(i);
                }
                trimWithPadding((null == uniquePart) ? 0 : uniquePart.length());
                String segStr = joinParts();
                File f = new File(fsf, segStr);

                int er = existsMaybeCaseSensitive(fsf, segStr, f);
                switch (er) {
                case EXISTS_NOT:
                    currentPath.append(f, segStr);
                    return;

                case EXISTS_EXACT_MATCH:
                    if (f.isFile()) {
                        currentPath.append(f, segStr);
                        return;
                    }
                    // Name is taken by a non-file: try the next number.
                    break;

                case EXISTS_CASE_INSENSITIVE_MATCH:
                    // Name differs only in case from an existing entry:
                    // try the next number.
                    break;

                default:
                    throw new IllegalStateException("Code: " + er);
                }
            }
        }

        /**
         * Joins main part, unique part, query and suffix into a file name.
         *
         * @return the file name
         */
        private String joinParts() {
            StringBuffer sb = new StringBuffer(length());
            sb.append(mainPart.asStringBuffer());
            if (null != uniquePart) {
                sb.append(uniquePart);
            }
            if (suffixAtEnd) {
                if (null != query) {
                    sb.append(query);
                }
                if (null != suffix) {
                    sb.append('.');
                    sb.append(suffix);
                }
            } else {
                if (null != suffix) {
                    sb.append('.');
                    sb.append(suffix);
                }
                if (null != query) {
                    sb.append(query);
                }
            }
            return sb.toString();
        }

        /**
         * Computes how many more characters fit, considering both the
         * segment limit and the path limit.
         *
         * @return the number of characters available; negative if the
         * current parts are already too long
         */
        private int lenAvail() {
            int len = length();
            return Math.min(maxSegLen - len, maxPathLen - dirPathLen - len);
        }

        /**
         * Computes the length of the file name made of the current parts.
         *
         * @return the length, including the dot before the suffix
         */
        private int length() {
            int r = mainPart.length();
            if (null != uniquePart) {
                r += uniquePart.length();
            }
            if (null != query) {
                r += query.length();
            }
            if (null != suffix) {
                r += 1 + suffix.length(); // 1 for the dot
            }
            return r;
        }

        /**
         * Shortens the parts until at least {@code padding} characters are
         * available. The query is shortened first, then the main part (to
         * no less than one character), then the suffix (to no less than one
         * character).
         *
         * @param padding the number of characters that must be available
         * @throws IllegalStateException if the parts can not be shortened
         * enough
         */
        private void trimWithPadding(int padding) {
            assert padding >= 0 : "padding: " + padding;
            int la = lenAvail();
            if (la >= padding) {
                return;
            }

            if (null != query) {
                query.trimToMax(Math.max(0, query.length() - (padding - la)));
                if (0 == query.length()) {
                    query = null;
                }
                la = lenAvail();
                if (la >= padding) {
                    return;
                }
            }
            mainPart.trimToMax(Math.max(1, mainPart.length() - (padding - la)));
            la = lenAvail();
            if (la >= padding) {
                return;
            }
            if (null != suffix) {
                suffix = suffix.substring(0, Math.max(1, suffix.length()
                                                      - (padding - la)));
                la = lenAvail();
                if (la >= padding) {
                    return;
                }
            }
            throw new IllegalStateException("Can not trim " + curi.toString());
        }
    }

    /**
     * A string made of "lumps": groups of one or more characters that are
     * never split when the string is shortened with
     * {@link #trimToMax(int)}. A parallel byte array records, for each
     * character, whether it begins, ends or lies inside a lump.
     */
    class LumpyString {
        /** Flag: the character is the first of a lump. */
        private static final byte LUMP_BEGIN = 0x1;

        /** Flag: the character is the last of a lump. */
        private static final byte LUMP_END = 0x2;

        /** Flag: the character is neither first nor last of a lump. */
        private static final byte LUMP_MID = 0x4;

        /** Lump flags, one entry per character of {@link #string}. */
        private byte[] aux;

        /** The characters. */
        private StringBuffer string;

        /**
         * Creates a lumpy string from part of a string.
         *
         * <p>Each source character becomes one lump: {@code dotBegin} for a
         * '.' at {@code beginIndex} (if {@code dotBegin} is not null), else
         * its replacement from {@code characterMap} if there is one, else
         * the three characters of a %XX escape (percent sign and two hex
         * digits), else the character itself. Conversion stops before the
         * first lump that would make the result longer than
         * {@code maxLen}.</p>
         *
         * @param str the source string
         * @param beginIndex index of the first character to use
         * @param endIndex index after the last character to use
         * @param padding extra capacity to reserve
         * @param maxLen maximum length of the result, at least 1
         * @param characterMap character replacements, not null
         * @param dotBegin replacement for a leading '.', or null
         * @throws IllegalArgumentException if an argument is out of range
         */
        LumpyString(String str, int beginIndex, int endIndex, int padding,
                    int maxLen, Map<String, String> characterMap,
                    String dotBegin) {
            if (beginIndex < 0) {
                throw new IllegalArgumentException("beginIndex < 0: "
                                                   + beginIndex);
            }
            if (endIndex < beginIndex) {
                throw new IllegalArgumentException("endIndex < beginIndex "
                    + "beginIndex: " + beginIndex + "endIndex: " + endIndex);
            }
            if (padding < 0) {
                throw new IllegalArgumentException("padding < 0: " + padding);
            }
            if (maxLen < 1) {
                throw new IllegalArgumentException("maxLen < 1: " + maxLen);
            }
            if (null == characterMap) {
                throw new IllegalArgumentException("characterMap null");
            }
            if ((null != dotBegin) && (0 == dotBegin.length())) {
                throw new IllegalArgumentException("dotBegin empty");
            }

            // Initial capacity guess; ensureCapacity grows it on demand.
            int cap = Math.min(2 * (endIndex - beginIndex) + padding + 1,
                               maxLen);
            string = new StringBuffer(cap);
            aux = new byte[cap];
            for (int i = beginIndex; i != endIndex; ++i) {
                String s = str.substring(i, i + 1);
                String lump;
                if (".".equals(s) && (i == beginIndex) && (null != dotBegin)) {
                    lump = dotBegin;
                } else {
                    lump = characterMap.get(s);
                }
                if (null == lump) {
                    if ("%".equals(s) && ((endIndex - i) > 2)
                            && (-1 != Character.digit(str.charAt(i + 1), 16))
                            && (-1 != Character.digit(str.charAt(i + 2), 16))) {
                        // Keep a %XX escape together as one lump.
                        lump = str.substring(i, i + 3);
                        i += 2;
                    } else {
                        lump = s;
                    }
                }
                if ((string.length() + lump.length()) > maxLen) {
                    assert checkInvariants();
                    return;
                }
                append(lump);
            }
            assert checkInvariants();
        }

        /**
         * Converts this lumpy string to a string.
         *
         * @return the characters as a string
         */
        @Override
        public String toString() {
            assert checkInvariants();
            return string.toString();
        }

        /**
         * Appends a lump.
         *
         * @param lump the lump to append, not null and not empty
         * @throws IllegalArgumentException if {@code lump} is null or empty
         */
        void append(String lump) {
            if (null == lump) {
                throw new IllegalArgumentException("lump null");
            }
            int lumpLen = lump.length();
            if (0 == lumpLen) {
                throw new IllegalArgumentException("lump empty");
            }
            int pos = string.length();
            ensureCapacity(pos + lumpLen);
            if (1 == lumpLen) {
                aux[pos] = LUMP_BEGIN | LUMP_END;
            } else {
                assert lumpLen > 1;
                aux[pos] = LUMP_BEGIN;
                ++pos;
                for (int i = lumpLen - 2; 0 != i; --i) {
                    aux[pos] = LUMP_MID;
                    ++pos;
                }
                aux[pos] = LUMP_END;
            }
            string.append(lump);
            assert checkInvariants();
        }

        /**
         * Returns the internal character buffer (not a copy).
         *
         * @return the characters
         */
        StringBuffer asStringBuffer() {
            return string;
        }

        /**
         * Tests whether the last character is the given one.
         *
         * @param ch the character to test for
         * @return true if this string is not empty and ends with {@code ch}
         */
        boolean endsWith(char ch) {
            assert checkInvariants();
            int len = string.length();
            return (0 != len) && (string.charAt(len - 1) == ch);
        }

        /**
         * Inserts a character at the start, as a lump of its own.
         *
         * @param ch the character to insert
         */
        void prepend(char ch) {
            assert checkInvariants();
            int oldLen = string.length();
            ensureCapacity(1 + oldLen);
            string.insert(0, ch);
            System.arraycopy(aux, 0, aux, 1, oldLen);
            aux[0] = LUMP_BEGIN | LUMP_END;
            assert checkInvariants();
        }

        /**
         * Returns the number of characters.
         *
         * @return the length
         */
        int length() {
            assert checkInvariants();
            return string.length();
        }

        /**
         * Shortens this string to at most {@code maxLen} characters without
         * splitting a lump; the result may therefore be shorter than
         * {@code maxLen}.
         *
         * @param maxLen the maximum length, not negative
         * @throws IllegalArgumentException if {@code maxLen} is negative
         */
        void trimToMax(int maxLen) {
            if (maxLen < 0) {
                throw new IllegalArgumentException("maxLen < 0: " + maxLen);
            }
            assert checkInvariants();
            int cl = string.length(); // current length
            if (cl > maxLen) {
                int nl = maxLen; // new length
                // Back up to the nearest position that ends a lump.
                while ((0 != nl) && (LUMP_END != (aux[nl - 1] & LUMP_END))) {
                    --nl;
                }
                for (int i = nl; i != cl; ++i) {
                    aux[i] = 0;
                }
                string.setLength(nl);
            }
            assert checkInvariants();
        }

        /**
         * Asserts that the flag array covers the string, and that a
         * non-empty string starts with a lump beginning and ends with a
         * lump end. Meant to be called from an assert statement.
         *
         * @return true
         */
        private boolean checkInvariants() {
            assert aux.length >= string.length()
                : "aux.length: " + aux.length
                + " string.length(): " + string.length();

            assert (0 == string.length())
                || (LUMP_BEGIN == (aux[0] & LUMP_BEGIN))
                : "aux[0]: " + aux[0];

            assert (0 == string.length())
                || (LUMP_END == (aux[string.length() - 1] & LUMP_END))
                : "aux[end]: " + aux[string.length() - 1];
            return true;
        }

        /**
         * Makes sure both the flag array and the character buffer can hold
         * at least {@code minCapacity} characters.
         *
         * @param minCapacity the required capacity
         */
        private void ensureCapacity(int minCapacity) {
            assert checkInvariants();
            if (minCapacity > aux.length) {
                int nc = 2 * aux.length; // new capacity
                while (nc < minCapacity) {
                    nc *= 2;
                }
                byte[] oldAux = aux;
                aux = new byte[nc];
                System.arraycopy(oldAux, 0, aux, 0, string.length());
            }
            string.ensureCapacity(minCapacity);
            assert checkInvariants();
        }
    }

    /**
     * A mirror path under construction: the full file system path together
     * with the path relative to the base directory.
     */
    class URIToFileReturn {
        /** The full path built so far. */
        private File filePath;

        /** The path relative to the base directory built so far. */
        private StringBuffer relativePath = new StringBuffer(255);

        /**
         * Creates the starting path: base directory, optional host
         * directory and optional port directory.
         *
         * @param baseDir the base directory
         * @param host host directory name, or null for none
         * @param port port directory number; no port directory is added
         * unless this is positive
         */
        URIToFileReturn(String baseDir, String host, int port) {
            // 32 leaves room for the host and port.
            StringBuffer startPath = new StringBuffer(baseDir.length() + 32);
            startPath.append(baseDir);
            if (baseDir.endsWith(File.separator)) {
                assert 1 != baseDir.length();
                startPath.deleteCharAt(startPath.length() - 1);
            }
            if (null != host) {
                startPath.append(File.separatorChar);
                startPath.append(host);
                relativePath.append(host);
            }
            if (port > 0) {
                startPath.append(File.separatorChar);
                startPath.append(port);
                relativePath.append(File.separatorChar);
                relativePath.append(port);
            }
            filePath = new File(startPath.toString());
        }

        /**
         * Appends a segment.
         *
         * @param f the new full path, including the segment
         * @param nextSegment the segment name, added to the relative path
         */
        void append(File f, String nextSegment) {
            filePath = f;
            if (0 != relativePath.length()) {
                relativePath.append(File.separatorChar);
            }
            relativePath.append(nextSegment);
        }

        /**
         * Returns the full path built so far.
         *
         * @return the full path
         */
        File getFile() {
            return filePath;
        }

        /**
         * Returns the relative path built so far.
         *
         * @return the path relative to the base directory
         */
        String getRelativePath() {
            return relativePath.toString();
        }

        /**
         * Tests whether the full path is longer than a limit.
         *
         * @param maxLen the limit
         * @return true if the full path has more than {@code maxLen}
         * characters
         */
        boolean longerThan(int maxLen) {
            return filePath.getPath().length() > maxLen;
        }

        /**
         * Makes sure the full path is an existing, writable directory,
         * creating it and any missing parents if necessary.
         *
         * @throws IOException if the directory can not be created, the path
         * is not a directory, or the directory is not writable
         */
        void mkdirs() throws IOException {
            if (!filePath.exists()) {
                // mkdirs() also fails if the directory appeared after the
                // existence check; that is fine as long as it is there.
                if (!filePath.mkdirs() && !filePath.isDirectory()) {
                    throw new IOException("Can not mkdir "
                                          + filePath.getAbsolutePath());
                }
            } else if (!filePath.isDirectory()) {
                throw new IOException("File " + filePath.getAbsolutePath()
                                      + " is not a directory.");
            } else if (!filePath.canWrite()) {
                throw new IOException("Directory " + filePath.getAbsolutePath()
                                      + " not writeable.");
            }
        }
    }
}