1 25 package org.archive.crawler.fetcher; 26 27 import it.unimi.dsi.mg4j.util.MutableString; 28 29 import java.io.File ; 30 import java.io.FileNotFoundException ; 31 import java.io.FileOutputStream ; 32 import java.io.IOException ; 33 import java.io.ObjectInputStream ; 34 import java.io.ObjectOutputStream ; 35 import java.io.RandomAccessFile ; 36 import java.security.KeyManagementException ; 37 import java.security.KeyStoreException ; 38 import java.security.NoSuchAlgorithmException ; 39 import java.util.Collection ; 40 import java.util.HashSet ; 41 import java.util.Iterator ; 42 import java.util.List ; 43 import java.util.ListIterator ; 44 import java.util.Map ; 45 import java.util.Set ; 46 import java.util.logging.Level ; 47 import java.util.logging.Logger ; 48 import java.net.InetAddress ; 49 import java.net.UnknownHostException ; 50 51 import javax.management.AttributeNotFoundException ; 52 import javax.management.MBeanException ; 53 import javax.management.ReflectionException ; 54 import javax.net.ssl.SSLContext; 55 import javax.net.ssl.SSLSocketFactory; 56 import javax.net.ssl.TrustManager; 57 58 import org.apache.commons.httpclient.Cookie; 59 import org.apache.commons.httpclient.Header; 60 import org.apache.commons.httpclient.HostConfiguration; 61 import org.apache.commons.httpclient.HttpClient; 62 import org.apache.commons.httpclient.HttpConnection; 63 import org.apache.commons.httpclient.HttpConnectionManager; 64 import org.apache.commons.httpclient.HttpException; 65 import org.apache.commons.httpclient.HttpMethod; 66 import org.apache.commons.httpclient.HttpMethodBase; 67 import org.apache.commons.httpclient.HttpState; 68 import org.apache.commons.httpclient.HttpStatus; 69 import org.apache.commons.httpclient.HttpVersion; 70 import org.apache.commons.httpclient.auth.AuthChallengeParser; 71 import org.apache.commons.httpclient.auth.AuthScheme; 72 import org.apache.commons.httpclient.auth.BasicScheme; 73 import 
org.apache.commons.httpclient.auth.DigestScheme; 74 import org.apache.commons.httpclient.auth.MalformedChallengeException; 75 import org.apache.commons.httpclient.cookie.CookiePolicy; 76 import org.apache.commons.httpclient.params.HttpClientParams; 77 import org.apache.commons.httpclient.params.HttpConnectionManagerParams; 78 import org.apache.commons.httpclient.params.HttpMethodParams; 79 import org.apache.commons.httpclient.protocol.Protocol; 80 import org.apache.commons.httpclient.protocol.ProtocolSocketFactory; 81 import org.archive.crawler.Heritrix; 82 import org.archive.crawler.datamodel.CoreAttributeConstants; 83 import org.archive.crawler.datamodel.CrawlHost; 84 import org.archive.crawler.datamodel.CrawlOrder; 85 import org.archive.crawler.datamodel.CrawlServer; 86 import org.archive.crawler.datamodel.CrawlURI; 87 import org.archive.crawler.datamodel.CredentialStore; 88 import org.archive.crawler.datamodel.FetchStatusCodes; 89 import org.archive.crawler.datamodel.ServerCache; 90 import org.archive.crawler.datamodel.credential.Credential; 91 import org.archive.crawler.datamodel.credential.CredentialAvatar; 92 import org.archive.crawler.datamodel.credential.Rfc2617Credential; 93 import org.archive.crawler.event.CrawlStatusListener; 94 import org.archive.crawler.framework.Filter; 95 import org.archive.crawler.framework.Processor; 96 import org.archive.crawler.settings.MapType; 97 import org.archive.crawler.settings.SettingsHandler; 98 import org.archive.crawler.settings.SimpleType; 99 import org.archive.crawler.settings.StringList; 100 import org.archive.crawler.settings.Type; 101 import org.archive.httpclient.ConfigurableX509TrustManager; 102 import org.archive.httpclient.HttpRecorderGetMethod; 103 import org.archive.httpclient.HttpRecorderMethod; 104 import org.archive.httpclient.HttpRecorderPostMethod; 105 import org.archive.httpclient.SingleHttpConnectionManager; 106 import org.archive.io.ObjectPlusFilesInputStream; 107 import 
org.archive.io.RecorderLengthExceededException; 108 import org.archive.io.RecorderTimeoutException; 109 import org.archive.io.RecorderTooMuchHeaderException; 110 import org.archive.util.ArchiveUtils; 111 import org.archive.util.HttpRecorder; 112 113 import com.sleepycat.bind.serial.SerialBinding; 114 import com.sleepycat.bind.serial.StoredClassCatalog; 115 import com.sleepycat.bind.tuple.StringBinding; 116 import com.sleepycat.collections.StoredSortedMap; 117 import com.sleepycat.je.Database; 118 import com.sleepycat.je.DatabaseConfig; 119 import com.sleepycat.je.DatabaseException; 120 import com.sleepycat.je.Environment; 121 122 132 public class FetchHTTP extends Processor 133 implements CoreAttributeConstants, FetchStatusCodes, CrawlStatusListener { 134 private static final long serialVersionUID = 136 ArchiveUtils.classnameBasedUID(FetchHTTP.class,1); 137 138 private static Logger logger = Logger.getLogger(FetchHTTP.class.getName()); 139 140 public static final String ATTR_HTTP_PROXY_HOST = A_HTTP_PROXY_HOST; 141 public static final String ATTR_HTTP_PROXY_PORT = A_HTTP_PROXY_PORT; 142 public static final String ATTR_TIMEOUT_SECONDS = "timeout-seconds"; 143 public static final String ATTR_SOTIMEOUT_MS = "sotimeout-ms"; 144 public static final String ATTR_MAX_LENGTH_BYTES = "max-length-bytes"; 145 public static final String ATTR_LOAD_COOKIES = "load-cookies-from-file"; 146 public static final String ATTR_SAVE_COOKIES = "save-cookies-to-file"; 147 public static final String ATTR_ACCEPT_HEADERS = "accept-headers"; 148 public static final String ATTR_DEFAULT_ENCODING = "default-encoding"; 149 public static final String ATTR_SHA1_CONTENT = "sha1-content"; 150 public static final String ATTR_FETCH_BANDWIDTH_MAX = "fetch-bandwidth"; 151 152 155 public static final String ATTR_TRUST = "trust-level"; 156 157 private static Integer DEFAULT_TIMEOUT_SECONDS = new Integer (1200); 158 private static Integer DEFAULT_SOTIMEOUT_MS = new Integer (20000); 159 private static Long 
DEFAULT_MAX_LENGTH_BYTES = new Long (0); 160 private static Integer DEFAULT_FETCH_BANDWIDTH_MAX = 0; 161 162 166 private static long OLD_DEFAULT_MAX_LENGTH_BYTES = 9223372036854775807L; 167 168 171 private static String DEFAULT_CONTENT_CHARSET = Heritrix.DEFAULT_ENCODING; 172 173 176 static Boolean DEFAULT_SHA1_CONTENT = new Boolean (true); 177 public static final String SHA1 = "sha1"; 178 179 private transient HttpClient http = null; 180 181 186 private int recoveryRetries = 0; 187 188 192 private int curisHandled = 0; 193 194 198 public final static String MIDFETCH_ATTR_FILTERS = "midfetch-filters"; 199 200 203 private MapType midfetchfilters = null; 204 205 208 private static final String MIDFETCH_ABORT_LOG = "midFetchAbort"; 209 210 public static final String ATTR_SEND_CONNECTION_CLOSE = 211 "send-connection-close"; 212 private static final Header HEADER_SEND_CONNECTION_CLOSE = 213 new Header("Connection", "close"); 214 public static final String ATTR_SEND_REFERER = "send-referer"; 215 public static final String ATTR_SEND_RANGE = "send-range"; 216 public static final String REFERER = "Referer"; 217 public static final String RANGE = "Range"; 218 public static final String RANGE_PREFIX = "bytes=0-"; 219 public static final String HTTP_SCHEME = "http"; 220 public static final String HTTPS_SCHEME = "https"; 221 222 public static final String ATTR_IGNORE_COOKIES = "ignore-cookies"; 223 private static Boolean DEFAULT_IGNORE_COOKIES = new Boolean (false); 224 225 public static final String ATTR_BDB_COOKIES = "use-bdb-for-cookies"; 226 private static Boolean DEFAULT_BDB_COOKIES = new Boolean (true); 227 228 public static final String ATTR_LOCAL_ADDRESS = "bind-address"; 229 230 233 protected Database cookieDb; 234 237 public static final String COOKIEDB_NAME = "http_cookies"; 238 239 static { 240 Protocol.registerProtocol("http", new Protocol("http", 241 new HeritrixProtocolSocketFactory(), 80)); 242 try { 243 Protocol.registerProtocol("https", 244 new 
Protocol("https", ((ProtocolSocketFactory) 245 new HeritrixSSLProtocolSocketFactory()), 443)); 246 } catch (KeyManagementException e) { 247 e.printStackTrace(); 248 } catch (KeyStoreException e) { 249 e.printStackTrace(); 250 } catch (NoSuchAlgorithmException e) { 251 e.printStackTrace(); 252 } 253 } 254 static final String SERVER_CACHE_KEY = "heritrix.server.cache"; 255 static final String SSL_FACTORY_KEY = "heritrix.ssl.factory"; 256 257 260 private SSLSocketFactory sslfactory = null; 261 262 263 268 public FetchHTTP(String name) { 269 super(name, "HTTP Fetcher"); 270 this.midfetchfilters = (MapType) addElementToDefinition( 271 new MapType(MIDFETCH_ATTR_FILTERS, "Filters applied after" + 272 " receipt of HTTP response headers but before we start to" + 273 " download the body. If any filter returns" + 274 " FALSE, the fetch is aborted. Prerequisites such as" + 275 " robots.txt by-pass filtering (i.e. they cannot be" + 276 " midfetch aborted.", Filter.class)); 277 281 addElementToDefinition(new SimpleType(ATTR_TIMEOUT_SECONDS, 282 "If the fetch is not completed in this number of seconds," 283 + " give up (and retry later). For optimal configuration, " + 284 " ensure this value is > " + ATTR_TIMEOUT_SECONDS + ".", 285 DEFAULT_TIMEOUT_SECONDS)); 286 Type e = addElementToDefinition(new SimpleType(ATTR_SOTIMEOUT_MS, 287 "If the socket is unresponsive for this number of milliseconds, " + 288 " give up. Set to zero for no timeout (Not." + 289 " recommended. Could hang a thread on an unresponsive server)." + 290 " This timeout is used timing out socket opens " + 291 " and for timing out each socket read. Make sure this " + 292 " value is < " + ATTR_TIMEOUT_SECONDS + " for optimal " + 293 " configuration: ensures at least one retry read.", 294 DEFAULT_SOTIMEOUT_MS)); 295 e.setExpertSetting(true); 296 e = addElementToDefinition(new SimpleType(ATTR_FETCH_BANDWIDTH_MAX, 297 "The maximum KB/sec to use when fetching data from a server. " + 298 "0 means no maximum. 
Default: "+ DEFAULT_FETCH_BANDWIDTH_MAX 299 + ".", DEFAULT_FETCH_BANDWIDTH_MAX)); 300 e.setExpertSetting(true); 301 e.setOverrideable(true); 302 addElementToDefinition(new SimpleType(ATTR_MAX_LENGTH_BYTES, 303 "Maximum length in bytes to fetch.\n" + 304 "Fetch is truncated at this length. A value of 0 means no limit.", 305 DEFAULT_MAX_LENGTH_BYTES)); 306 e = addElementToDefinition(new SimpleType(ATTR_IGNORE_COOKIES, 307 "Disable cookie-handling.", DEFAULT_IGNORE_COOKIES)); 308 e.setOverrideable(true); 309 e.setExpertSetting(true); 310 e = addElementToDefinition(new SimpleType(ATTR_BDB_COOKIES, 311 "Store cookies in BDB-backed map.", DEFAULT_BDB_COOKIES)); 312 e.setExpertSetting(true); 313 314 e = addElementToDefinition(new SimpleType(ATTR_LOAD_COOKIES, 315 "File to preload cookies from", "")); 316 e.setExpertSetting(true); 317 e = addElementToDefinition(new SimpleType(ATTR_SAVE_COOKIES, 318 "When crawl finishes save cookies to this file", "")); 319 e.setExpertSetting(true); 320 e = addElementToDefinition(new SimpleType(ATTR_TRUST, 321 "SSL certificate trust level. Range is from the default 'open'" 322 + " (trust all certs including expired, selfsigned, and those for" 323 + " which we do not have a CA) through 'loose' (trust all valid" 324 + " certificates including selfsigned), 'normal' (all valid" 325 + " certificates not including selfsigned) to 'strict' (Cert is" 326 + " valid and DN must match servername)", 327 ConfigurableX509TrustManager.DEFAULT, 328 ConfigurableX509TrustManager.LEVELS_AS_ARRAY)); 329 e.setOverrideable(false); 330 e.setExpertSetting(true); 331 e = addElementToDefinition(new StringList(ATTR_ACCEPT_HEADERS, 332 "Accept Headers to include in each request. 
Each must be the" 333 + " complete header, e.g., 'Accept-Language: en'")); 334 e.setExpertSetting(true); 335 e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_HOST, 336 "Proxy host IP (set only if needed).", "")); 337 e.setExpertSetting(true); 338 e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_PORT, 339 "Proxy port (set only if needed)", "")); 340 e.setExpertSetting(true); 341 e = addElementToDefinition(new SimpleType(ATTR_DEFAULT_ENCODING, 342 "The character encoding to use for files that do not have one" + 343 " specified in the HTTP response headers. Default: " + 344 DEFAULT_CONTENT_CHARSET + ".", 345 DEFAULT_CONTENT_CHARSET)); 346 e.setExpertSetting(true); 347 e = addElementToDefinition(new SimpleType(ATTR_SHA1_CONTENT, 348 "Whether or not to perform an on-the-fly SHA1 hash of" + 349 "retrieved content-bodies.", 350 DEFAULT_SHA1_CONTENT)); 351 e.setExpertSetting(true); 352 e = addElementToDefinition(new SimpleType(ATTR_SEND_CONNECTION_CLOSE, 353 "Send 'Connection: close' header with every request.", 354 new Boolean (true))); 355 e.setOverrideable(true); 356 e.setExpertSetting(true); 357 e = addElementToDefinition(new SimpleType(ATTR_SEND_REFERER, 358 "Send 'Referer' header with every request.\n" + 359 "The 'Referer' header contans the location the crawler came " + 360 " from, " + 361 "the page the current URI was discovered in. The 'Referer' " + 362 "usually is " + 363 "logged on the remote server and can be of assistance to " + 364 "webmasters trying to figure how a crawler got to a " + 365 "particular area on a site.", 366 new Boolean (true))); 367 e.setOverrideable(true); 368 e.setExpertSetting(true); 369 e = addElementToDefinition(new SimpleType(ATTR_SEND_RANGE, 370 "Send 'Range' header when a limit (" + ATTR_MAX_LENGTH_BYTES + 371 ") on document size.\n" + 372 "Be polite to the HTTP servers and send the 'Range' header," + 373 "stating that you are only interested in the first n bytes. 
" + 374 "Only pertinent if " + ATTR_MAX_LENGTH_BYTES + " > 0. " + 375 "Sending the 'Range' header results in a " + 376 "'206 Partial Content' status response, which is better than " + 377 "just cutting the response mid-download. On rare occasion, " + 378 " sending 'Range' will " + 379 "generate '416 Request Range Not Satisfiable' response.", 380 new Boolean (false))); 381 e.setOverrideable(true); 382 e.setExpertSetting(true); 383 e = addElementToDefinition(new SimpleType(ATTR_LOCAL_ADDRESS, 384 "Local IP address or hostname to use when making connections " + 385 "(binding sockets). When not specified, uses default local" + 386 "address(es).", "")); 387 e.setExpertSetting(true); 388 } 389 390 protected void innerProcess(final CrawlURI curi) 391 throws InterruptedException { 392 if (!canFetch(curi)) { 393 return; 395 } 396 397 this.curisHandled++; 398 399 curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis()); 401 402 HttpRecorder rec = HttpRecorder.getHttpRecorder(); 404 405 boolean sha1Content = ((Boolean )getUncheckedAttribute(curi, 407 ATTR_SHA1_CONTENT)).booleanValue(); 408 if(sha1Content) { 409 rec.getRecordedInput().setSha1Digest(); 410 } else { 411 rec.getRecordedInput().setDigest(null); 413 } 414 415 String curiString = curi.getUURI().toString(); 418 HttpMethodBase method = null; 419 if (curi.isPost()) { 420 method = new HttpRecorderPostMethod(curiString, rec) { 421 protected void readResponseBody(HttpState state, 422 HttpConnection conn) 423 throws IOException , HttpException { 424 addResponseContent(this, curi); 425 if (checkMidfetchAbort(curi, this.httpRecorderMethod, conn)) { 426 doAbort(curi, this, MIDFETCH_ABORT_LOG); 427 } else { 428 super.readResponseBody(state, conn); 429 } 430 } 431 }; 432 } else { 433 method = new HttpRecorderGetMethod(curiString, rec) { 434 protected void readResponseBody(HttpState state, 435 HttpConnection conn) 436 throws IOException , HttpException { 437 addResponseContent(this, curi); 438 if (checkMidfetchAbort(curi, 
this.httpRecorderMethod, 439 conn)) { 440 doAbort(curi, this, MIDFETCH_ABORT_LOG); 441 } else { 442 super.readResponseBody(state, conn); 443 } 444 } 445 }; 446 } 447 448 HostConfiguration customConfigOrNull = configureMethod(curi, method); 449 450 curi.setHttpRecorder(rec); 453 454 boolean addedCredentials = populateCredentials(curi, method); 456 method.setDoAuthentication(addedCredentials); 457 458 try { 459 this.http.executeMethod(customConfigOrNull, method); 460 } catch (RecorderTooMuchHeaderException ex) { 461 doAbort(curi, method, HEADER_TRUNC); 463 } catch (IOException e) { 464 failedExecuteCleanup(method, curi, e); 465 return; 466 } catch (ArrayIndexOutOfBoundsException e) { 467 failedExecuteCleanup(method, curi, e); 472 return; 473 } 474 475 long softMax = method.getResponseContentLength(); 477 478 long hardMax = getMaxLength(curi); 480 481 int maxFetchRate = getMaxFetchRate(curi); 484 485 try { 486 if (!method.isAborted()) { 487 rec.getRecordedInput().readFullyOrUntil(softMax, 490 hardMax, 1000 * getTimeout(curi), maxFetchRate); 491 } 492 } catch (RecorderTimeoutException ex) { 493 doAbort(curi, method, TIMER_TRUNC); 494 } catch (RecorderLengthExceededException ex) { 495 doAbort(curi, method, LENGTH_TRUNC); 496 } catch (IOException e) { 497 cleanup(curi, e, "readFully", S_CONNECT_LOST); 498 return; 499 } catch (ArrayIndexOutOfBoundsException e) { 500 cleanup(curi, e, "readFully", S_CONNECT_LOST); 504 return; 505 } finally { 506 rec.closeRecorders(); 508 if (!method.isAborted()) { 509 method.releaseConnection(); 510 } 511 curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis()); 513 setCharacterEncoding(rec, method); 515 curi.setContentSize(rec.getRecordedInput().getSize()); 516 } 517 518 curi.setContentDigest(SHA1, rec.getRecordedInput().getDigestValue()); 519 if (logger.isLoggable(Level.INFO)) { 520 logger.info((curi.isPost()? 
"POST": "GET") + " " + 521 curi.getUURI().toString() + " " + method.getStatusCode() + 522 " " + rec.getRecordedInput().getSize() + " " + 523 curi.getContentType()); 524 } 525 526 if (curi.isSuccess() && addedCredentials) { 527 promoteCredentials(curi); 531 if (logger.isLoggable(Level.FINE)) { 532 Header setCookie = method.getResponseHeader("set-cookie"); 534 if (setCookie != null) { 535 logger.fine(setCookie.toString().trim()); 536 } 537 } 538 } else if (method.getStatusCode() == HttpStatus.SC_UNAUTHORIZED) { 539 handle401(method, curi); 541 } 542 543 if (rec.getRecordedInput().isOpen()) { 544 logger.severe(curi.toString() + " RIS still open. Should have" + 545 " been closed by method release: " + 546 Thread.currentThread().getName()); 547 try { 548 rec.getRecordedInput().close(); 549 } catch (IOException e) { 550 logger.log(Level.SEVERE,"second-chance RIS close failed",e); 551 } 552 } 553 } 554 555 protected void doAbort(CrawlURI curi, HttpMethod method, 556 String annotation) { 557 curi.addAnnotation(annotation); 558 curi.getHttpRecorder().close(); 559 method.abort(); 560 } 561 562 protected boolean checkMidfetchAbort(CrawlURI curi, 563 HttpRecorderMethod method, HttpConnection conn) { 564 if (curi.isPrerequisite() || filtersAccept(midfetchfilters, curi)) { 565 return false; 566 } 567 method.markContentBegin(conn); 568 return true; 569 } 570 571 577 protected void addResponseContent (HttpMethod method, CrawlURI curi) { 578 curi.setFetchStatus(method.getStatusCode()); 579 Header ct = method.getResponseHeader("content-type"); 580 curi.setContentType((ct == null)? 
null: ct.getValue()); 581 curi.putObject(A_HTTP_TRANSACTION, method); 584 } 585 586 600 private void setCharacterEncoding(final HttpRecorder rec, 601 final HttpMethod method) { 602 String encoding = null; 603 604 try { 605 encoding = ((HttpMethodBase) method).getResponseCharSet(); 606 if (encoding == null || 607 encoding.equals(DEFAULT_CONTENT_CHARSET)) { 608 encoding = (String ) getAttribute(ATTR_DEFAULT_ENCODING); 609 } 610 } catch (Exception e) { 611 logger.warning("Failed get default encoding: " + 612 e.getLocalizedMessage()); 613 } 614 rec.setCharacterEncoding(encoding); 615 } 616 617 623 private void failedExecuteCleanup(final HttpMethod method, 624 final CrawlURI curi, final Exception exception) { 625 cleanup(curi, exception, "executeMethod", S_CONNECT_FAILED); 626 method.releaseConnection(); 627 } 628 629 636 private void cleanup(final CrawlURI curi, final Exception exception, 637 final String message, final int status) { 638 curi.addLocalizedError(this.getName(), exception, message); 639 curi.setFetchStatus(status); 640 curi.getHttpRecorder().close(); 641 } 642 643 651 private boolean canFetch(CrawlURI curi) { 652 if(curi.getFetchStatus()<0) { 653 curi.skipToProcessorChain(getController().getPostprocessorChain()); 656 return false; 657 } 658 String scheme = curi.getUURI().getScheme(); 659 if (!(scheme.equals("http") || scheme.equals("https"))) { 660 return false; 662 } 663 CrawlHost host = getController().getServerCache().getHostFor(curi); 664 if (host.getIP() == null && host.hasBeenLookedUp()) { 666 curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE); 667 return false; 668 } 669 return true; 670 } 671 672 678 protected HostConfiguration configureMethod(CrawlURI curi, HttpMethod method) { 679 method.setFollowRedirects(false); 681 682 687 method.getParams().setCookiePolicy( 689 (((Boolean )getUncheckedAttribute(curi, ATTR_IGNORE_COOKIES)). 690 booleanValue())? 
691 CookiePolicy.IGNORE_COOKIES: 692 CookiePolicy.BROWSER_COMPATIBILITY); 693 694 method.getParams().setVersion(HttpVersion.HTTP_1_0); 696 697 CrawlOrder order = getSettingsHandler().getOrder(); 698 String userAgent = curi.getUserAgent(); 699 if (userAgent == null) { 700 userAgent = order.getUserAgent(curi); 701 } 702 method.setRequestHeader("User-Agent", userAgent); 703 method.setRequestHeader("From", order.getFrom(curi)); 704 705 method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, 707 new HeritrixHttpMethodRetryHandler()); 708 709 final long maxLength = getMaxLength(curi); 710 if(maxLength > 0 && 711 ((Boolean )getUncheckedAttribute(curi, ATTR_SEND_RANGE)). 712 booleanValue()) { 713 method.addRequestHeader(RANGE, 714 RANGE_PREFIX.concat(Long.toString(maxLength - 1))); 715 } 716 717 if (((Boolean )getUncheckedAttribute(curi, 718 ATTR_SEND_CONNECTION_CLOSE)).booleanValue()) { 719 method.addRequestHeader(HEADER_SEND_CONNECTION_CLOSE); 720 } 721 722 if (((Boolean )getUncheckedAttribute(curi, 723 ATTR_SEND_REFERER)).booleanValue()) { 724 String via = curi.flattenVia(); 727 if (via != null && via.length() > 0 && 728 !(via.startsWith(HTTPS_SCHEME) && 729 curi.getUURI().getScheme().equals(HTTP_SCHEME))) { 730 method.setRequestHeader(REFERER, via); 731 } 732 } 733 734 setAcceptHeaders(curi, method); 737 738 return configureProxy(curi); 739 } 740 741 747 private HostConfiguration configureProxy(CrawlURI curi) { 748 String proxy = (String ) getAttributeEither(curi, ATTR_HTTP_PROXY_HOST); 749 int port = -1; 750 if(proxy.length()==0) { 751 proxy = null; 752 } else { 753 String portString = (String )getAttributeEither(curi, ATTR_HTTP_PROXY_PORT); 754 port = portString.length()>0 ? 
Integer.parseInt(portString) : -1; 755 } 756 HostConfiguration config = this.http.getHostConfiguration(); 757 if(config.getProxyHost() == proxy && config.getProxyPort() == port) { 758 return null; 760 } 761 if (proxy != null && proxy.equals(config.getProxyHost()) 762 && config.getProxyPort() == port) { 763 return null; 765 } 766 config = new HostConfiguration(config); config.setProxy(proxy,port); 768 return config; 769 } 770 771 779 protected Object getAttributeEither(CrawlURI curi, String key) { 780 Object obj = curi!=null ? curi.getObject(key) : null; 781 if(obj==null) { 782 obj = getUncheckedAttribute(curi, key); 783 } 784 return obj; 785 } 786 787 805 private boolean populateCredentials(CrawlURI curi, HttpMethod method) { 806 CrawlServer server = 810 getController().getServerCache().getServerFor(curi); 811 if (server.hasCredentialAvatars()) { 812 Set avatars = server.getCredentialAvatars(); 813 for (Iterator i = avatars.iterator(); i.hasNext();) { 814 CredentialAvatar ca = (CredentialAvatar)i.next(); 815 Credential c = ca.getCredential(getSettingsHandler(), curi); 816 if (c.isEveryTime()) { 817 c.populate(curi, this.http, method, ca.getPayload()); 818 } 819 } 820 } 821 822 boolean result = false; 823 824 if (curi.hasCredentialAvatars()) { 828 Set avatars = curi.getCredentialAvatars(); 829 for (Iterator i = avatars.iterator(); i.hasNext();) { 830 CredentialAvatar ca = (CredentialAvatar)i.next(); 831 Credential c = ca.getCredential(getSettingsHandler(), curi); 832 if (c.populate(curi, this.http, method, ca.getPayload())) { 833 result = true; 834 } 835 } 836 } 837 838 return result; 839 } 840 841 846 private void promoteCredentials(final CrawlURI curi) { 847 if (!curi.hasCredentialAvatars()) { 848 logger.severe("No credentials to promote when there should be " + 849 curi); 850 } else { 851 Set avatars = curi.getCredentialAvatars(); 852 for (Iterator i = avatars.iterator(); i.hasNext();) { 853 CredentialAvatar ca = (CredentialAvatar)i.next(); 854 
curi.removeCredentialAvatar(ca); 855 Credential c = ca.getCredential(getSettingsHandler(), curi); 861 String cd = null; 862 try { 863 cd = c.getCredentialDomain(curi); 864 } 865 catch (AttributeNotFoundException e) { 866 logger.severe("Failed to get cred domain for " + curi + 867 " for " + ca + ": " + e.getMessage()); 868 } 869 if (cd != null) { 870 CrawlServer cs 871 = getController().getServerCache().getServerFor(cd); 872 if (cs != null) { 873 cs.addCredentialAvatar(ca); 874 } 875 } 876 } 877 } 878 } 879 880 890 protected void handle401(final HttpMethod method, final CrawlURI curi) { 891 AuthScheme authscheme = getAuthScheme(method, curi); 892 if (authscheme == null) { 893 return; 894 } 895 String realm = authscheme.getRealm(); 896 897 Set curiRfc2617Credentials = getCredentials(getSettingsHandler(), 901 curi, Rfc2617Credential.class); 902 Rfc2617Credential extant = Rfc2617Credential. 903 getByRealm(curiRfc2617Credentials, realm, curi); 904 if (extant != null) { 905 extant.detachAll(curi); 910 logger.warning("Auth failed (401) though supplied realm " + 911 realm + " to " + curi.toString()); 912 } else { 913 CredentialStore cs = 920 CredentialStore.getCredentialStore(getSettingsHandler()); 921 if (cs == null) { 922 logger.severe("No credential store for " + curi); 923 } else { 924 CrawlServer server = getController().getServerCache(). 925 getServerFor(curi); 926 Set storeRfc2617Credentials = cs.subset(curi, 927 Rfc2617Credential.class, server.getName()); 928 if (storeRfc2617Credentials == null || 929 storeRfc2617Credentials.size() <= 0) { 930 logger.info("No rfc2617 credentials for " + curi); 931 } else { 932 Rfc2617Credential found = Rfc2617Credential. 
933 getByRealm(storeRfc2617Credentials, realm, curi); 934 if (found == null) { 935 logger.info("No rfc2617 credentials for realm " + 936 realm + " in " + curi); 937 } else { 938 found.attach(curi, authscheme.getRealm()); 939 logger.info("Found credential for realm " + realm + 940 " in store for " + curi.toString()); 941 } 942 } 943 } 944 } 945 } 946 947 952 protected AuthScheme getAuthScheme(final HttpMethod method, 953 final CrawlURI curi) { 954 Header [] headers = method.getResponseHeaders("WWW-Authenticate"); 955 if (headers == null || headers.length <= 0) { 956 logger.info("We got a 401 but no WWW-Authenticate challenge: " + 957 curi.toString()); 958 return null; 959 } 960 961 Map authschemes = null; 962 try { 963 authschemes = AuthChallengeParser.parseChallenges(headers); 964 } catch(MalformedChallengeException e) { 965 logger.info("Failed challenge parse: " + e.getMessage()); 966 } 967 if (authschemes == null || authschemes.size() <= 0) { 968 logger.info("We got a 401 and WWW-Authenticate challenge" + 969 " but failed parse of the header " + curi.toString()); 970 return null; 971 } 972 973 AuthScheme result = null; 974 for (Iterator i = authschemes.keySet().iterator(); 976 result == null && i.hasNext();) { 977 String key = (String )i.next(); 978 String challenge = (String )authschemes.get(key); 979 if (key == null || key.length() <= 0 || challenge == null || 980 challenge.length() <= 0) { 981 logger.warning("Empty scheme: " + curi.toString() + 982 ": " + headers); 983 } 984 AuthScheme authscheme = null; 985 if (key.equals("basic")) { 986 authscheme = new BasicScheme(); 987 } else if (key.equals("digest")) { 988 authscheme = new DigestScheme(); 989 } else { 990 logger.info("Unsupported scheme: " + key); 991 continue; 992 } 993 994 try { 995 authscheme.processChallenge(challenge); 996 } catch (MalformedChallengeException e) { 997 logger.info(e.getMessage() + " " + curi + " " + headers); 998 continue; 999 } 1000 if (authscheme.isConnectionBased()) { 1001 
logger.info("Connection based " + authscheme); 1002 continue; 1003 } 1004 1005 if (authscheme.getRealm() == null || 1006 authscheme.getRealm().length() <= 0) { 1007 logger.info("Empty realm " + authscheme + " for " + curi); 1008 continue; 1009 } 1010 result = authscheme; 1011 } 1012 1013 return result; 1014 } 1015 1016 1022 private Set <Credential> getCredentials(SettingsHandler handler, 1023 CrawlURI curi, Class type) { 1024 Set <Credential> result = null; 1025 1026 if (curi.hasCredentialAvatars()) { 1027 for (Iterator i = curi.getCredentialAvatars().iterator(); 1028 i.hasNext();) { 1029 CredentialAvatar ca = (CredentialAvatar)i.next(); 1030 if (ca.match(type)) { 1031 if (result == null) { 1032 result = new HashSet <Credential>(); 1033 } 1034 result.add(ca.getCredential(handler, curi)); 1035 } 1036 } 1037 } 1038 return result; 1039 } 1040 1041 public void initialTasks() { 1042 super.initialTasks(); 1043 this.getController().addCrawlStatusListener(this); 1044 configureHttp(); 1045 1046 loadCookies(); 1048 1049 try { 1053 SSLContext context = SSLContext.getInstance("SSL"); 1054 context.init(null, new TrustManager[] { 1055 new ConfigurableX509TrustManager((String ) 1056 getAttribute(ATTR_TRUST))}, null); 1057 this.sslfactory = context.getSocketFactory(); 1058 } catch (Exception e) { 1059 logger.log(Level.WARNING, "Failed configure of ssl context " 1060 + e.getMessage(), e); 1061 } 1062 } 1063 1064 public void finalTasks() { 1065 saveCookies(); 1067 cleanupHttp(); 1068 super.finalTasks(); 1069 } 1070 1071 1074 protected void cleanupHttp() { 1075 if(cookieDb!=null) { 1076 try { 1077 cookieDb.close(); 1078 } catch (DatabaseException e) { 1079 e.printStackTrace(); 1081 } 1082 } 1083 } 1084 1085 protected void configureHttp() throws RuntimeException { 1086 int timeout = (getSoTimeout(null) > 0)? 
getSoTimeout(null): 0; 1088 1089 HttpConnectionManager cm = new SingleHttpConnectionManager(); 1091 1092 HttpConnectionManagerParams hcmp = cm.getParams(); 1095 hcmp.setConnectionTimeout(timeout); 1096 hcmp.setStaleCheckingEnabled(true); 1097 hcmp.setTcpNoDelay(false); 1101 1102 this.http = new HttpClient(cm); 1103 HttpClientParams hcp = this.http.getParams(); 1104 hcp.setSoTimeout(timeout); 1106 hcp.setVersion(HttpVersion.HTTP_1_0); 1108 1109 String addressStr = null; 1110 try { 1111 addressStr = (String ) getAttribute(ATTR_LOCAL_ADDRESS); 1112 } catch (Exception e1) { 1113 } 1115 if (addressStr != null && addressStr.length() > 0) { 1116 try { 1117 InetAddress localAddress = InetAddress.getByName(addressStr); 1118 this.http.getHostConfiguration().setLocalAddress(localAddress); 1119 } catch (UnknownHostException e) { 1120 throw new RuntimeException ("Unknown host " + addressStr 1123 + " in " + ATTR_LOCAL_ADDRESS); 1124 } 1125 } 1126 1127 configureHttpCookies(); 1128 1129 this.http.getParams().setParameter( 1131 HttpMethodParams.SINGLE_COOKIE_HEADER, new Boolean (true)); 1132 this.http.getParams().setParameter( 1133 HttpMethodParams.UNAMBIGUOUS_STATUS_LINE , new Boolean (false)); 1134 this.http.getParams().setParameter( 1135 HttpMethodParams.STRICT_TRANSFER_ENCODING, new Boolean (false)); 1136 this.http.getParams().setIntParameter( 1137 HttpMethodParams.STATUS_LINE_GARBAGE_LIMIT, 10); 1138 1139 HostConfiguration configOrNull = configureProxy(null); 1140 if(configOrNull!=null) { 1141 this.http.setHostConfiguration(configOrNull); 1143 } 1144 1145 final ServerCache cache = getController().getServerCache(); 1148 hcmp.setParameter(SERVER_CACHE_KEY, cache); 1149 hcmp.setParameter(SSL_FACTORY_KEY, this.sslfactory); 1150 } 1151 1152 1156 private void configureHttpCookies() { 1157 if(((Boolean )getUncheckedAttribute(null, ATTR_BDB_COOKIES)). 
1159 booleanValue()) { 1160 try { 1161 Environment env = getController().getBdbEnvironment(); 1162 StoredClassCatalog classCatalog = getController().getClassCatalog(); 1163 DatabaseConfig dbConfig = new DatabaseConfig(); 1164 dbConfig.setTransactional(false); 1165 dbConfig.setAllowCreate(true); 1166 cookieDb = env.openDatabase(null, COOKIEDB_NAME, dbConfig); 1167 StoredSortedMap cookiesMap = new StoredSortedMap(cookieDb, 1168 new StringBinding(), new SerialBinding(classCatalog, 1169 Cookie.class), true); 1170 this.http.getState().setCookiesMap(cookiesMap); 1171 } catch (DatabaseException e) { 1172 logger.severe(e.getMessage()); 1174 e.printStackTrace(); 1175 } 1176 } 1177 } 1178 1179 1183 private int getSoTimeout(CrawlURI curi) { 1184 Integer res = null; 1185 try { 1186 res = (Integer ) getAttribute(ATTR_SOTIMEOUT_MS, curi); 1187 } catch (Exception e) { 1188 res = DEFAULT_SOTIMEOUT_MS; 1189 } 1190 return res.intValue(); 1191 } 1192 1193 1197 private int getTimeout(CrawlURI curi) { 1198 Integer res; 1199 try { 1200 res = (Integer ) getAttribute(ATTR_TIMEOUT_SECONDS, curi); 1201 } catch (Exception e) { 1202 res = DEFAULT_TIMEOUT_SECONDS; 1203 } 1204 return res.intValue(); 1205 } 1206 1207 private int getMaxFetchRate(CrawlURI curi) { 1208 Integer res; 1209 try { 1210 res = (Integer )getAttribute(ATTR_FETCH_BANDWIDTH_MAX, curi); 1211 } 1212 catch (Exception e) { 1213 res = DEFAULT_FETCH_BANDWIDTH_MAX; 1214 } 1215 return res.intValue(); 1216 } 1217 1218 private long getMaxLength(CrawlURI curi) { 1219 Long res; 1220 try { 1221 res = (Long ) getAttribute(ATTR_MAX_LENGTH_BYTES, curi); 1222 if (res.longValue() == OLD_DEFAULT_MAX_LENGTH_BYTES) { 1223 res = DEFAULT_MAX_LENGTH_BYTES; 1224 } 1225 } catch (Exception e) { 1226 res = DEFAULT_MAX_LENGTH_BYTES; 1227 } 1228 return res.longValue(); 1229 } 1230 1231 1254 public void loadCookies(String cookiesFile) { 1255 if (cookiesFile == null || cookiesFile.length() <= 0) { 1257 return; 1258 } 1259 RandomAccessFile raf = null; 1260 
try { 1261 raf = new RandomAccessFile (cookiesFile, "r"); 1262 String [] cookieParts; 1263 String line; 1264 Cookie cookie = null; 1265 while ((line = raf.readLine()) != null) { 1266 if (!line.startsWith("#")) { 1268 cookieParts = line.split("\\t"); 1269 if (cookieParts.length == 7) { 1270 cookie = 1273 new Cookie(cookieParts[0], cookieParts[5], 1274 cookieParts[6], cookieParts[2], -1, 1275 Boolean.valueOf(cookieParts[3]).booleanValue()); 1276 1277 if (cookieParts[1].toLowerCase().equals("true")) { 1278 cookie.setDomainAttributeSpecified(true); 1279 } else { 1280 cookie.setDomainAttributeSpecified(false); 1281 } 1282 this.http.getState().addCookie(cookie); 1283 logger.fine( 1284 "Adding cookie: " + cookie.toExternalForm()); 1285 } 1286 } 1287 } 1288 } catch (FileNotFoundException e) { 1289 System.out.println("Could not find file: " + cookiesFile 1291 + " (Element: " + ATTR_LOAD_COOKIES + ")"); 1292 1293 } catch (IOException e) { 1294 e.printStackTrace(); 1296 } finally { 1297 try { 1298 if (raf != null) { 1299 raf.close(); 1300 } 1301 } catch (IOException e) { 1302 e.printStackTrace(); 1303 } 1304 } 1305 } 1306 1307 1310 public String report() { 1311 StringBuffer ret = new StringBuffer (); 1312 ret.append("Processor: org.archive.crawler.fetcher.FetchHTTP\n"); 1313 ret.append(" Function: Fetch HTTP URIs\n"); 1314 ret.append(" CrawlURIs handled: " + this.curisHandled + "\n"); 1315 ret.append(" Recovery retries: " + this.recoveryRetries + "\n\n"); 1316 1317 return ret.toString(); 1318 } 1319 1320 1321 1343 public void loadCookies() { 1344 try { 1345 loadCookies((String ) getAttribute(ATTR_LOAD_COOKIES)); 1346 } catch (MBeanException e) { 1347 logger.warning(e.getLocalizedMessage()); 1348 } catch (ReflectionException e) { 1349 logger.warning(e.getLocalizedMessage()); 1350 } catch (AttributeNotFoundException e) { 1351 logger.warning(e.getLocalizedMessage()); 1352 } 1353 } 1354 1360 public void saveCookies() { 1361 try { 1362 saveCookies((String ) 
getAttribute(ATTR_SAVE_COOKIES)); 1363 } catch (MBeanException e) { 1364 logger.warning(e.getLocalizedMessage()); 1365 } catch (ReflectionException e) { 1366 logger.warning(e.getLocalizedMessage()); 1367 } catch (AttributeNotFoundException e) { 1368 logger.warning(e.getLocalizedMessage()); 1369 } 1370 } 1371 1378 public void saveCookies(String saveCookiesFile) { 1379 if (saveCookiesFile == null || saveCookiesFile.length() <= 0) { 1381 return; 1382 } 1383 1384 FileOutputStream out = null; 1385 try { 1386 out = new FileOutputStream (new File (saveCookiesFile)); 1387 @SuppressWarnings ("unchecked") 1388 Map <String ,Cookie> cookies = http.getState().getCookiesMap(); 1389 String tab ="\t"; 1390 out.write("# Heritrix Cookie File\n".getBytes()); 1391 out.write( 1392 "# This file is the Netscape cookies.txt format\n\n".getBytes()); 1393 for (Cookie cookie: cookies.values()) { 1394 MutableString line = 1395 new MutableString(1024 * 2 ); 1396 line.append(cookie.getDomain()); 1397 line.append(tab); 1398 line.append( 1399 cookie.isDomainAttributeSpecified() == true 1400 ? "TRUE" 1401 : "FALSE"); 1402 line.append(tab); 1403 line.append(cookie.getPath()); 1404 line.append(tab); 1405 line.append( 1406 cookie.getSecure() == true ? 
"TRUE" : "FALSE"); 1407 line.append(tab); 1408 line.append(cookie.getName()); 1409 line.append(tab); 1410 line.append((null==cookie.getValue())?"":cookie.getValue()); 1411 line.append("\n"); 1412 out.write(line.toString().getBytes()); 1413 } 1414 } catch (FileNotFoundException e) { 1415 System.out.println("Could not find file: " + saveCookiesFile 1417 + " (Element: " + ATTR_SAVE_COOKIES + ")"); 1418 } catch (IOException e) { 1419 e.printStackTrace(); 1420 } finally { 1421 try { 1422 if (out != null) { 1423 out.close(); 1424 } 1425 } catch (IOException e) { 1426 e.printStackTrace(); 1427 } 1428 } 1429 } 1430 1431 1434 protected void listUsedFiles(List <String > list) { 1435 try { 1438 String tmp = (String )getAttribute(ATTR_LOAD_COOKIES); 1439 if(tmp != null && tmp.length() > 0){ 1440 File file = getSettingsHandler(). 1441 getPathRelativeToWorkingDirectory(tmp); 1442 list.add(file.getAbsolutePath()); 1443 } 1444 tmp = (String )getAttribute(ATTR_SAVE_COOKIES); 1445 if(tmp != null && tmp.length() > 0){ 1446 File file = getSettingsHandler(). 
1447 getPathRelativeToWorkingDirectory(tmp); 1448 list.add(file.getAbsolutePath()); 1449 } 1450 } catch (AttributeNotFoundException e) { 1451 e.printStackTrace(); 1453 } catch (MBeanException e) { 1454 e.printStackTrace(); 1456 } catch (ReflectionException e) { 1457 e.printStackTrace(); 1459 } 1460 } 1461 1462 private void setAcceptHeaders(CrawlURI curi, HttpMethod get) { 1463 try { 1464 StringList accept_headers = (StringList) getAttribute(ATTR_ACCEPT_HEADERS, curi); 1465 if (!accept_headers.isEmpty()) { 1466 for (ListIterator i = accept_headers.listIterator(); i.hasNext();) { 1467 String hdr = (String ) i.next(); 1468 String [] nvp = hdr.split(": +"); 1469 if (nvp.length == 2) { 1470 get.setRequestHeader(nvp[0], nvp[1]); 1471 } 1472 else { 1473 logger.warning("Invalid accept header: " + hdr); 1474 } 1475 } 1476 } 1477 } 1478 catch (AttributeNotFoundException e) { 1479 logger.severe(e.getMessage()); 1480 } 1481 } 1482 1483 private void writeObject(ObjectOutputStream stream) throws IOException { 1485 stream.defaultWriteObject(); 1486 @SuppressWarnings ("unchecked") 1488 Collection <Cookie> c = http.getState().getCookiesMap().values(); 1489 Cookie[] cookies = c.toArray(new Cookie[c.size()]); 1490 stream.writeObject(cookies); 1491 } 1492 1493 private void readObject(ObjectInputStream stream) throws IOException , ClassNotFoundException { 1494 stream.defaultReadObject(); 1495 Cookie cookies[] = (Cookie[]) stream.readObject(); 1496 ObjectPlusFilesInputStream coistream = (ObjectPlusFilesInputStream)stream; 1497 coistream.registerFinishTask( new PostRestore(cookies) ); 1498 } 1499 1500 1503 protected HttpClient getHttp() { 1504 return this.http; 1505 } 1506 1507 class PostRestore implements Runnable { 1508 Cookie cookies[]; 1509 public PostRestore(Cookie cookies[]) { 1510 this.cookies = cookies; 1511 } 1512 public void run() { 1513 configureHttp(); 1514 for(int i = 0; i < cookies.length; i++) { 1515 getHttp().getState().addCookie(cookies[i]); 1516 } 1517 } 1518 } 1519 
// Crawl lifecycle callbacks. NOTE(review): presumably the CrawlStatusListener
// interface imported at the top of the file; the class header is not visible here.

    /**
     * Called when the crawl starts. No action is taken.
     *
     * @param message status message (unused).
     */
    public void crawlStarted(String message) {
    }

    /**
     * Called when the crawl is checkpointed. No action is taken.
     *
     * @param checkpointDir checkpoint directory (unused).
     */
    public void crawlCheckpoint(File checkpointDir) {
    }

    /**
     * Called when the crawl is about to end. No action is taken.
     *
     * @param sExitMessage exit message (unused).
     */
    public void crawlEnding(String sExitMessage) {
    }

    /**
     * Called when the crawl has ended. Drops the references to the HTTP
     * client and the mid-fetch filters.
     *
     * @param sExitMessage exit message (unused).
     */
    public void crawlEnded(String sExitMessage) {
        this.http = null;
        this.midfetchfilters = null;
    }

    /**
     * Called when the crawl is about to pause. No action is taken.
     *
     * @param statusMessage status message (unused).
     */
    public void crawlPausing(String statusMessage) {
    }

    /**
     * Called when the crawl has paused. No action is taken.
     *
     * @param statusMessage status message (unused).
     */
    public void crawlPaused(String statusMessage) {
    }

    /**
     * Called when the crawl resumes. No action is taken.
     *
     * @param statusMessage status message (unused).
     */
    public void crawlResuming(String statusMessage) {
    }
}