KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > fetcher > FetchHTTP


1 /* FetchHTTP.java
2  *
3  * $Id: FetchHTTP.java,v 1.113.2.1 2007/01/13 01:31:17 stack-sf Exp $
4  *
5  * Created on Jun 5, 2003
6  *
7  * Copyright (C) 2003 Internet Archive.
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  */

25 package org.archive.crawler.fetcher;
26
27 import it.unimi.dsi.mg4j.util.MutableString;
28
29 import java.io.File JavaDoc;
30 import java.io.FileNotFoundException JavaDoc;
31 import java.io.FileOutputStream JavaDoc;
32 import java.io.IOException JavaDoc;
33 import java.io.ObjectInputStream JavaDoc;
34 import java.io.ObjectOutputStream JavaDoc;
35 import java.io.RandomAccessFile JavaDoc;
36 import java.security.KeyManagementException JavaDoc;
37 import java.security.KeyStoreException JavaDoc;
38 import java.security.NoSuchAlgorithmException JavaDoc;
39 import java.util.Collection JavaDoc;
40 import java.util.HashSet JavaDoc;
41 import java.util.Iterator JavaDoc;
42 import java.util.List JavaDoc;
43 import java.util.ListIterator JavaDoc;
44 import java.util.Map JavaDoc;
45 import java.util.Set JavaDoc;
46 import java.util.logging.Level JavaDoc;
47 import java.util.logging.Logger JavaDoc;
48 import java.net.InetAddress JavaDoc;
49 import java.net.UnknownHostException JavaDoc;
50
51 import javax.management.AttributeNotFoundException JavaDoc;
52 import javax.management.MBeanException JavaDoc;
53 import javax.management.ReflectionException JavaDoc;
54 import javax.net.ssl.SSLContext;
55 import javax.net.ssl.SSLSocketFactory;
56 import javax.net.ssl.TrustManager;
57
58 import org.apache.commons.httpclient.Cookie;
59 import org.apache.commons.httpclient.Header;
60 import org.apache.commons.httpclient.HostConfiguration;
61 import org.apache.commons.httpclient.HttpClient;
62 import org.apache.commons.httpclient.HttpConnection;
63 import org.apache.commons.httpclient.HttpConnectionManager;
64 import org.apache.commons.httpclient.HttpException;
65 import org.apache.commons.httpclient.HttpMethod;
66 import org.apache.commons.httpclient.HttpMethodBase;
67 import org.apache.commons.httpclient.HttpState;
68 import org.apache.commons.httpclient.HttpStatus;
69 import org.apache.commons.httpclient.HttpVersion;
70 import org.apache.commons.httpclient.auth.AuthChallengeParser;
71 import org.apache.commons.httpclient.auth.AuthScheme;
72 import org.apache.commons.httpclient.auth.BasicScheme;
73 import org.apache.commons.httpclient.auth.DigestScheme;
74 import org.apache.commons.httpclient.auth.MalformedChallengeException;
75 import org.apache.commons.httpclient.cookie.CookiePolicy;
76 import org.apache.commons.httpclient.params.HttpClientParams;
77 import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
78 import org.apache.commons.httpclient.params.HttpMethodParams;
79 import org.apache.commons.httpclient.protocol.Protocol;
80 import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
81 import org.archive.crawler.Heritrix;
82 import org.archive.crawler.datamodel.CoreAttributeConstants;
83 import org.archive.crawler.datamodel.CrawlHost;
84 import org.archive.crawler.datamodel.CrawlOrder;
85 import org.archive.crawler.datamodel.CrawlServer;
86 import org.archive.crawler.datamodel.CrawlURI;
87 import org.archive.crawler.datamodel.CredentialStore;
88 import org.archive.crawler.datamodel.FetchStatusCodes;
89 import org.archive.crawler.datamodel.ServerCache;
90 import org.archive.crawler.datamodel.credential.Credential;
91 import org.archive.crawler.datamodel.credential.CredentialAvatar;
92 import org.archive.crawler.datamodel.credential.Rfc2617Credential;
93 import org.archive.crawler.event.CrawlStatusListener;
94 import org.archive.crawler.framework.Filter;
95 import org.archive.crawler.framework.Processor;
96 import org.archive.crawler.settings.MapType;
97 import org.archive.crawler.settings.SettingsHandler;
98 import org.archive.crawler.settings.SimpleType;
99 import org.archive.crawler.settings.StringList;
100 import org.archive.crawler.settings.Type;
101 import org.archive.httpclient.ConfigurableX509TrustManager;
102 import org.archive.httpclient.HttpRecorderGetMethod;
103 import org.archive.httpclient.HttpRecorderMethod;
104 import org.archive.httpclient.HttpRecorderPostMethod;
105 import org.archive.httpclient.SingleHttpConnectionManager;
106 import org.archive.io.ObjectPlusFilesInputStream;
107 import org.archive.io.RecorderLengthExceededException;
108 import org.archive.io.RecorderTimeoutException;
109 import org.archive.io.RecorderTooMuchHeaderException;
110 import org.archive.util.ArchiveUtils;
111 import org.archive.util.HttpRecorder;
112
113 import com.sleepycat.bind.serial.SerialBinding;
114 import com.sleepycat.bind.serial.StoredClassCatalog;
115 import com.sleepycat.bind.tuple.StringBinding;
116 import com.sleepycat.collections.StoredSortedMap;
117 import com.sleepycat.je.Database;
118 import com.sleepycat.je.DatabaseConfig;
119 import com.sleepycat.je.DatabaseException;
120 import com.sleepycat.je.Environment;
121
122 /**
123  * HTTP fetcher that uses <a
124  * HREF="http://jakarta.apache.org/commons/httpclient/">Apache Jakarta Commons
125  * HttpClient</a> library.
126  *
127  * @author Gordon Mohr
128  * @author Igor Ranitovic
129  * @author others
130  * @version $Id: FetchHTTP.java,v 1.113.2.1 2007/01/13 01:31:17 stack-sf Exp $
131  */

132 public class FetchHTTP extends Processor
133 implements CoreAttributeConstants, FetchStatusCodes, CrawlStatusListener {
134     // be robust against trivial implementation changes
135
private static final long serialVersionUID =
136         ArchiveUtils.classnameBasedUID(FetchHTTP.class,1);
137     
138     private static Logger JavaDoc logger = Logger.getLogger(FetchHTTP.class.getName());
139
140     public static final String JavaDoc ATTR_HTTP_PROXY_HOST = A_HTTP_PROXY_HOST;
141     public static final String JavaDoc ATTR_HTTP_PROXY_PORT = A_HTTP_PROXY_PORT;
142     public static final String JavaDoc ATTR_TIMEOUT_SECONDS = "timeout-seconds";
143     public static final String JavaDoc ATTR_SOTIMEOUT_MS = "sotimeout-ms";
144     public static final String JavaDoc ATTR_MAX_LENGTH_BYTES = "max-length-bytes";
145     public static final String JavaDoc ATTR_LOAD_COOKIES = "load-cookies-from-file";
146     public static final String JavaDoc ATTR_SAVE_COOKIES = "save-cookies-to-file";
147     public static final String JavaDoc ATTR_ACCEPT_HEADERS = "accept-headers";
148     public static final String JavaDoc ATTR_DEFAULT_ENCODING = "default-encoding";
149     public static final String JavaDoc ATTR_SHA1_CONTENT = "sha1-content";
150     public static final String JavaDoc ATTR_FETCH_BANDWIDTH_MAX = "fetch-bandwidth";
151    
152     /**
153      * SSL trust level setting attribute name.
154      */

155     public static final String JavaDoc ATTR_TRUST = "trust-level";
156     
157     private static Integer JavaDoc DEFAULT_TIMEOUT_SECONDS = new Integer JavaDoc(1200);
158     private static Integer JavaDoc DEFAULT_SOTIMEOUT_MS = new Integer JavaDoc(20000);
159     private static Long JavaDoc DEFAULT_MAX_LENGTH_BYTES = new Long JavaDoc(0);
160     private static Integer JavaDoc DEFAULT_FETCH_BANDWIDTH_MAX = 0;
161
162     /**
163      * This is the default value pre-1.4. Needs special handling else
164      * treated as negative number doing math later in processing.
165      */

166     private static long OLD_DEFAULT_MAX_LENGTH_BYTES = 9223372036854775807L;
167
168     /**
169      * Default character encoding to use for pages that do not specify.
170      */

171     private static String JavaDoc DEFAULT_CONTENT_CHARSET = Heritrix.DEFAULT_ENCODING;
172
173     /**
174      * Default whether to perform on-the-fly SHA1 hashing of content-bodies.
175      */

176     static Boolean JavaDoc DEFAULT_SHA1_CONTENT = new Boolean JavaDoc(true);
177     public static final String JavaDoc SHA1 = "sha1";
178
179     private transient HttpClient http = null;
180
181     /**
182      * How many 'instant retries' of HttpRecoverableExceptions have occurred
183      *
184      * Would like it to be 'long', but longs aren't atomic
185      */

186     private int recoveryRetries = 0;
187
188     /**
189      * Count of crawl uris handled.
190      * Would like to be 'long', but longs aren't atomic
191      */

192     private int curisHandled = 0;
193         
194     /**
195      * Filters to apply mid-fetch, just after receipt of the response
196      * headers before we start to download body.
197      */

198     public final static String JavaDoc MIDFETCH_ATTR_FILTERS = "midfetch-filters";
199
200     /**
201      * Instance of midfetchfilters.
202      */

203     private MapType midfetchfilters = null;
204     
205     /**
206      * What to log if midfetch abort.
207      */

208     private static final String JavaDoc MIDFETCH_ABORT_LOG = "midFetchAbort";
209     
210     public static final String JavaDoc ATTR_SEND_CONNECTION_CLOSE =
211         "send-connection-close";
212     private static final Header HEADER_SEND_CONNECTION_CLOSE =
213         new Header("Connection", "close");
214     public static final String JavaDoc ATTR_SEND_REFERER = "send-referer";
215     public static final String JavaDoc ATTR_SEND_RANGE = "send-range";
216     public static final String JavaDoc REFERER = "Referer";
217     public static final String JavaDoc RANGE = "Range";
218     public static final String JavaDoc RANGE_PREFIX = "bytes=0-";
219     public static final String JavaDoc HTTP_SCHEME = "http";
220     public static final String JavaDoc HTTPS_SCHEME = "https";
221     
222     public static final String JavaDoc ATTR_IGNORE_COOKIES = "ignore-cookies";
223     private static Boolean JavaDoc DEFAULT_IGNORE_COOKIES = new Boolean JavaDoc(false);
224
225     public static final String JavaDoc ATTR_BDB_COOKIES = "use-bdb-for-cookies";
226     private static Boolean JavaDoc DEFAULT_BDB_COOKIES = new Boolean JavaDoc(true);
227     
228     public static final String JavaDoc ATTR_LOCAL_ADDRESS = "bind-address";
229     
230     /**
231      * Database backing cookie map, if using BDB
232      */

233     protected Database cookieDb;
234     /**
235      * Name of cookie BDB Database
236      */

237     public static final String JavaDoc COOKIEDB_NAME = "http_cookies";
238     
239     static {
240         Protocol.registerProtocol("http", new Protocol("http",
241             new HeritrixProtocolSocketFactory(), 80));
242         try {
243             Protocol.registerProtocol("https",
244                 new Protocol("https", ((ProtocolSocketFactory)
245                     new HeritrixSSLProtocolSocketFactory()), 443));
246         } catch (KeyManagementException JavaDoc e) {
247             e.printStackTrace();
248         } catch (KeyStoreException JavaDoc e) {
249             e.printStackTrace();
250         } catch (NoSuchAlgorithmException JavaDoc e) {
251             e.printStackTrace();
252         }
253     }
254     static final String JavaDoc SERVER_CACHE_KEY = "heritrix.server.cache";
255     static final String JavaDoc SSL_FACTORY_KEY = "heritrix.ssl.factory";
256     
257     /***
258      * Socket factory that has the configurable trust manager installed.
259      */

260     private SSLSocketFactory sslfactory = null;
261     
262
263     /**
264      * Constructor.
265      *
266      * @param name Name of this processor.
267      */

268     public FetchHTTP(String JavaDoc name) {
269         super(name, "HTTP Fetcher");
270         this.midfetchfilters = (MapType) addElementToDefinition(
271             new MapType(MIDFETCH_ATTR_FILTERS, "Filters applied after" +
272                 " receipt of HTTP response headers but before we start to" +
273                 " download the body. If any filter returns" +
274                 " FALSE, the fetch is aborted. Prerequisites such as" +
275                 " robots.txt by-pass filtering (i.e. they cannot be" +
276                 " midfetch aborted.", Filter.class));
277 // see [ 1379040 ] regex for midfetch filter not being stored in crawl order
278
// http://sourceforge.net/support/tracker.php?aid=1379040
279
// this.midfetchfilters.setExpertSetting(true);
280

281         addElementToDefinition(new SimpleType(ATTR_TIMEOUT_SECONDS,
282             "If the fetch is not completed in this number of seconds,"
283             + " give up (and retry later). For optimal configuration, " +
284             " ensure this value is > " + ATTR_TIMEOUT_SECONDS + ".",
285             DEFAULT_TIMEOUT_SECONDS));
286         Type e = addElementToDefinition(new SimpleType(ATTR_SOTIMEOUT_MS,
287             "If the socket is unresponsive for this number of milliseconds, " +
288             " give up. Set to zero for no timeout (Not." +
289             " recommended. Could hang a thread on an unresponsive server)." +
290             " This timeout is used timing out socket opens " +
291             " and for timing out each socket read. Make sure this " +
292             " value is < " + ATTR_TIMEOUT_SECONDS + " for optimal " +
293             " configuration: ensures at least one retry read.",
294                 DEFAULT_SOTIMEOUT_MS));
295         e.setExpertSetting(true);
296         e = addElementToDefinition(new SimpleType(ATTR_FETCH_BANDWIDTH_MAX,
297             "The maximum KB/sec to use when fetching data from a server. " +
298             "0 means no maximum. Default: "+ DEFAULT_FETCH_BANDWIDTH_MAX
299              + ".", DEFAULT_FETCH_BANDWIDTH_MAX));
300         e.setExpertSetting(true);
301         e.setOverrideable(true);
302         addElementToDefinition(new SimpleType(ATTR_MAX_LENGTH_BYTES,
303             "Maximum length in bytes to fetch.\n" +
304             "Fetch is truncated at this length. A value of 0 means no limit.",
305             DEFAULT_MAX_LENGTH_BYTES));
306         e = addElementToDefinition(new SimpleType(ATTR_IGNORE_COOKIES,
307             "Disable cookie-handling.", DEFAULT_IGNORE_COOKIES));
308         e.setOverrideable(true);
309         e.setExpertSetting(true);
310         e = addElementToDefinition(new SimpleType(ATTR_BDB_COOKIES,
311                 "Store cookies in BDB-backed map.", DEFAULT_BDB_COOKIES));
312         e.setExpertSetting(true);
313
314         e = addElementToDefinition(new SimpleType(ATTR_LOAD_COOKIES,
315             "File to preload cookies from", ""));
316         e.setExpertSetting(true);
317         e = addElementToDefinition(new SimpleType(ATTR_SAVE_COOKIES,
318             "When crawl finishes save cookies to this file", ""));
319         e.setExpertSetting(true);
320         e = addElementToDefinition(new SimpleType(ATTR_TRUST,
321             "SSL certificate trust level. Range is from the default 'open'"
322             + " (trust all certs including expired, selfsigned, and those for"
323             + " which we do not have a CA) through 'loose' (trust all valid"
324             + " certificates including selfsigned), 'normal' (all valid"
325             + " certificates not including selfsigned) to 'strict' (Cert is"
326             + " valid and DN must match servername)",
327             ConfigurableX509TrustManager.DEFAULT,
328             ConfigurableX509TrustManager.LEVELS_AS_ARRAY));
329         e.setOverrideable(false);
330         e.setExpertSetting(true);
331         e = addElementToDefinition(new StringList(ATTR_ACCEPT_HEADERS,
332             "Accept Headers to include in each request. Each must be the"
333             + " complete header, e.g., 'Accept-Language: en'"));
334         e.setExpertSetting(true);
335         e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_HOST,
336             "Proxy host IP (set only if needed).", ""));
337         e.setExpertSetting(true);
338         e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_PORT,
339             "Proxy port (set only if needed)", ""));
340         e.setExpertSetting(true);
341         e = addElementToDefinition(new SimpleType(ATTR_DEFAULT_ENCODING,
342             "The character encoding to use for files that do not have one" +
343             " specified in the HTTP response headers. Default: " +
344             DEFAULT_CONTENT_CHARSET + ".",
345             DEFAULT_CONTENT_CHARSET));
346         e.setExpertSetting(true);
347         e = addElementToDefinition(new SimpleType(ATTR_SHA1_CONTENT,
348                 "Whether or not to perform an on-the-fly SHA1 hash of" +
349                 "retrieved content-bodies.",
350                 DEFAULT_SHA1_CONTENT));
351         e.setExpertSetting(true);
352         e = addElementToDefinition(new SimpleType(ATTR_SEND_CONNECTION_CLOSE,
353             "Send 'Connection: close' header with every request.",
354              new Boolean JavaDoc(true)));
355         e.setOverrideable(true);
356         e.setExpertSetting(true);
357         e = addElementToDefinition(new SimpleType(ATTR_SEND_REFERER,
358              "Send 'Referer' header with every request.\n" +
359              "The 'Referer' header contans the location the crawler came " +
360              " from, " +
361              "the page the current URI was discovered in. The 'Referer' " +
362              "usually is " +
363              "logged on the remote server and can be of assistance to " +
364              "webmasters trying to figure how a crawler got to a " +
365              "particular area on a site.",
366              new Boolean JavaDoc(true)));
367         e.setOverrideable(true);
368         e.setExpertSetting(true);
369         e = addElementToDefinition(new SimpleType(ATTR_SEND_RANGE,
370               "Send 'Range' header when a limit (" + ATTR_MAX_LENGTH_BYTES +
371               ") on document size.\n" +
372               "Be polite to the HTTP servers and send the 'Range' header," +
373               "stating that you are only interested in the first n bytes. " +
374               "Only pertinent if " + ATTR_MAX_LENGTH_BYTES + " > 0. " +
375               "Sending the 'Range' header results in a " +
376               "'206 Partial Content' status response, which is better than " +
377               "just cutting the response mid-download. On rare occasion, " +
378               " sending 'Range' will " +
379               "generate '416 Request Range Not Satisfiable' response.",
380               new Boolean JavaDoc(false)));
381            e.setOverrideable(true);
382            e.setExpertSetting(true);
383            e = addElementToDefinition(new SimpleType(ATTR_LOCAL_ADDRESS,
384                "Local IP address or hostname to use when making connections " +
385                "(binding sockets). When not specified, uses default local" +
386                "address(es).", ""));
387            e.setExpertSetting(true);
388     }
389
390     protected void innerProcess(final CrawlURI curi)
391     throws InterruptedException JavaDoc {
392         if (!canFetch(curi)) {
393             // Cannot fetch this, due to protocol, retries, or other problems
394
return;
395         }
396
397         this.curisHandled++;
398
399         // Note begin time
400
curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());
401
402         // Get a reference to the HttpRecorder that is set into this ToeThread.
403
HttpRecorder rec = HttpRecorder.getHttpRecorder();
404         
405         // Shall we get a digest on the content downloaded?
406
boolean sha1Content = ((Boolean JavaDoc)getUncheckedAttribute(curi,
407             ATTR_SHA1_CONTENT)).booleanValue();
408         if(sha1Content) {
409             rec.getRecordedInput().setSha1Digest();
410         } else {
411             // clear
412
rec.getRecordedInput().setDigest(null);
413         }
414         
415         // Below we do two inner classes that add check of midfetch
416
// filters just as we're about to receive the response body.
417
String JavaDoc curiString = curi.getUURI().toString();
418         HttpMethodBase method = null;
419         if (curi.isPost()) {
420             method = new HttpRecorderPostMethod(curiString, rec) {
421                 protected void readResponseBody(HttpState state,
422                         HttpConnection conn)
423                 throws IOException JavaDoc, HttpException {
424                     addResponseContent(this, curi);
425                     if (checkMidfetchAbort(curi, this.httpRecorderMethod, conn)) {
426                         doAbort(curi, this, MIDFETCH_ABORT_LOG);
427                     } else {
428                         super.readResponseBody(state, conn);
429                     }
430                 }
431             };
432         } else {
433             method = new HttpRecorderGetMethod(curiString, rec) {
434                 protected void readResponseBody(HttpState state,
435                         HttpConnection conn)
436                 throws IOException JavaDoc, HttpException {
437                     addResponseContent(this, curi);
438                     if (checkMidfetchAbort(curi, this.httpRecorderMethod,
439                             conn)) {
440                         doAbort(curi, this, MIDFETCH_ABORT_LOG);
441                     } else {
442                         super.readResponseBody(state, conn);
443                     }
444                 }
445             };
446         }
447
448         HostConfiguration customConfigOrNull = configureMethod(curi, method);
449         
450         // Set httpRecorder into curi. Subsequent code both here and later
451
// in extractors expects to find the HttpRecorder in the CrawlURI.
452
curi.setHttpRecorder(rec);
453         
454         // Populate credentials. Set config so auth. is not automatic.
455
boolean addedCredentials = populateCredentials(curi, method);
456         method.setDoAuthentication(addedCredentials);
457         
458         try {
459             this.http.executeMethod(customConfigOrNull, method);
460         } catch (RecorderTooMuchHeaderException ex) {
461             // when too much header material, abort like other truncations
462
doAbort(curi, method, HEADER_TRUNC);
463         } catch (IOException JavaDoc e) {
464             failedExecuteCleanup(method, curi, e);
465             return;
466         } catch (ArrayIndexOutOfBoundsException JavaDoc e) {
467             // For weird windows-only ArrayIndex exceptions in native
468
// code... see
469
// http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
470
// treating as if it were an IOException
471
failedExecuteCleanup(method, curi, e);
472             return;
473         }
474         
475         // set softMax on bytes to get (if implied by content-length)
476
long softMax = method.getResponseContentLength();
477         
478         // set hardMax on bytes (if set by operator)
479
long hardMax = getMaxLength(curi);
480
481     // Get max fetch rate (bytes/ms). It comes in in KB/sec, which
482
// requires nothing to normalize.
483
int maxFetchRate = getMaxFetchRate(curi);
484
485         try {
486             if (!method.isAborted()) {
487                 // Force read-to-end, so that any socket hangs occur here,
488
// not in later modules.
489
rec.getRecordedInput().readFullyOrUntil(softMax,
490                         hardMax, 1000 * getTimeout(curi), maxFetchRate);
491             }
492         } catch (RecorderTimeoutException ex) {
493             doAbort(curi, method, TIMER_TRUNC);
494         } catch (RecorderLengthExceededException ex) {
495             doAbort(curi, method, LENGTH_TRUNC);
496         } catch (IOException JavaDoc e) {
497             cleanup(curi, e, "readFully", S_CONNECT_LOST);
498             return;
499         } catch (ArrayIndexOutOfBoundsException JavaDoc e) {
500             // For weird windows-only ArrayIndex exceptions from native code
501
// see http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
502
// treating as if it were an IOException
503
cleanup(curi, e, "readFully", S_CONNECT_LOST);
504             return;
505         } finally {
506             // ensure recording has stopped
507
rec.closeRecorders();
508             if (!method.isAborted()) {
509                 method.releaseConnection();
510             }
511             // Note completion time
512
curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
513             // Set the response charset into the HttpRecord if available.
514
setCharacterEncoding(rec, method);
515             curi.setContentSize(rec.getRecordedInput().getSize());
516         }
517         
518         curi.setContentDigest(SHA1, rec.getRecordedInput().getDigestValue());
519         if (logger.isLoggable(Level.INFO)) {
520             logger.info((curi.isPost()? "POST": "GET") + " " +
521                 curi.getUURI().toString() + " " + method.getStatusCode() +
522                 " " + rec.getRecordedInput().getSize() + " " +
523                 curi.getContentType());
524         }
525
526         if (curi.isSuccess() && addedCredentials) {
527             // Promote the credentials from the CrawlURI to the CrawlServer
528
// so they are available for all subsequent CrawlURIs on this
529
// server.
530
promoteCredentials(curi);
531             if (logger.isLoggable(Level.FINE)) {
532                 // Print out the cookie. Might help with the debugging.
533
Header setCookie = method.getResponseHeader("set-cookie");
534                 if (setCookie != null) {
535                     logger.fine(setCookie.toString().trim());
536                 }
537             }
538         } else if (method.getStatusCode() == HttpStatus.SC_UNAUTHORIZED) {
539             // 401 is not 'success'.
540
handle401(method, curi);
541         }
542         
543         if (rec.getRecordedInput().isOpen()) {
544             logger.severe(curi.toString() + " RIS still open. Should have" +
545                 " been closed by method release: " +
546                 Thread.currentThread().getName());
547             try {
548                 rec.getRecordedInput().close();
549             } catch (IOException JavaDoc e) {
550                 logger.log(Level.SEVERE,"second-chance RIS close failed",e);
551             }
552         }
553     }
554     
555     protected void doAbort(CrawlURI curi, HttpMethod method,
556             String JavaDoc annotation) {
557         curi.addAnnotation(annotation);
558         curi.getHttpRecorder().close();
559         method.abort();
560     }
561     
562     protected boolean checkMidfetchAbort(CrawlURI curi,
563             HttpRecorderMethod method, HttpConnection conn) {
564         if (curi.isPrerequisite() || filtersAccept(midfetchfilters, curi)) {
565             return false;
566         }
567         method.markContentBegin(conn);
568         return true;
569     }
570     
571     /**
572      * This method populates <code>curi</code> with response status and
573      * content type.
574      * @param curi CrawlURI to populate.
575      * @param method Method to get response status and headers from.
576      */

577     protected void addResponseContent (HttpMethod method, CrawlURI curi) {
578         curi.setFetchStatus(method.getStatusCode());
579         Header ct = method.getResponseHeader("content-type");
580         curi.setContentType((ct == null)? null: ct.getValue());
581         // Save method into curi too. Midfetch filters may want to leverage
582
// info in here.
583
curi.putObject(A_HTTP_TRANSACTION, method);
584     }
585
586     /**
587      * Set the character encoding based on the result headers or default.
588      *
589      * The HttpClient returns its own default encoding ("ISO-8859-1") if one
590      * isn't specified in the Content-Type response header. We give the user
591      * the option of overriding this, so we need to detect the case where the
592      * default is returned.
593      *
594      * Now, it may well be the case that the default returned by HttpClient
595      * and the default defined by the user are the same.
596      *
597      * @param rec Recorder for this request.
598      * @param method Method used for the request.
599      */

600     private void setCharacterEncoding(final HttpRecorder rec,
601         final HttpMethod method) {
602         String JavaDoc encoding = null;
603
604         try {
605             encoding = ((HttpMethodBase) method).getResponseCharSet();
606             if (encoding == null ||
607                     encoding.equals(DEFAULT_CONTENT_CHARSET)) {
608                 encoding = (String JavaDoc) getAttribute(ATTR_DEFAULT_ENCODING);
609             }
610         } catch (Exception JavaDoc e) {
611             logger.warning("Failed get default encoding: " +
612                 e.getLocalizedMessage());
613         }
614         rec.setCharacterEncoding(encoding);
615     }
616
617     /**
618      * Cleanup after a failed method execute.
619      * @param curi CrawlURI we failed on.
620      * @param method Method we failed on.
621      * @param exception Exception we failed with.
622      */

623     private void failedExecuteCleanup(final HttpMethod method,
624             final CrawlURI curi, final Exception JavaDoc exception) {
625         cleanup(curi, exception, "executeMethod", S_CONNECT_FAILED);
626         method.releaseConnection();
627     }
628     
629     /**
630      * Cleanup after a failed method execute.
631      * @param curi CrawlURI we failed on.
632      * @param exception Exception we failed with.
633      * @param message Message to log with failure.
634      * @param status Status to set on the fetch.
635      */

636     private void cleanup(final CrawlURI curi, final Exception JavaDoc exception,
637             final String JavaDoc message, final int status) {
638         curi.addLocalizedError(this.getName(), exception, message);
639         curi.setFetchStatus(status);
640         curi.getHttpRecorder().close();
641     }
642
643     /**
644      * Can this processor fetch the given CrawlURI. May set a fetch
645      * status if this processor would usually handle the CrawlURI,
646      * but cannot in this instance.
647      *
648      * @param curi
649      * @return True if processor can fetch.
650      */

651     private boolean canFetch(CrawlURI curi) {
652         if(curi.getFetchStatus()<0) {
653             // already marked as errored, this pass through
654
// skip to end
655
curi.skipToProcessorChain(getController().getPostprocessorChain());
656             return false;
657         }
658         String JavaDoc scheme = curi.getUURI().getScheme();
659          if (!(scheme.equals("http") || scheme.equals("https"))) {
660              // handles only plain http and https
661
return false;
662          }
663          CrawlHost host = getController().getServerCache().getHostFor(curi);
664          // make sure the dns lookup succeeded
665
if (host.getIP() == null && host.hasBeenLookedUp()) {
666              curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
667              return false;
668          }
669         return true;
670     }
671
672     /**
673      * Configure the HttpMethod setting options and headers.
674      *
675      * @param curi CrawlURI from which we pull configuration.
676      * @param method The Method to configure.
677      */

678     protected HostConfiguration configureMethod(CrawlURI curi, HttpMethod method) {
679         // Don't auto-follow redirects
680
method.setFollowRedirects(false);
681         
682 // // set soTimeout
683
// method.getParams().setSoTimeout(
684
// ((Integer) getUncheckedAttribute(curi, ATTR_SOTIMEOUT_MS))
685
// .intValue());
686

687         // Set cookie policy.
688
method.getParams().setCookiePolicy(
689             (((Boolean JavaDoc)getUncheckedAttribute(curi, ATTR_IGNORE_COOKIES)).
690                 booleanValue())?
691                     CookiePolicy.IGNORE_COOKIES:
692                 CookiePolicy.BROWSER_COMPATIBILITY);
693
694         // Use only HTTP/1.0 (to avoid receiving chunked responses)
695
method.getParams().setVersion(HttpVersion.HTTP_1_0);
696
697         CrawlOrder order = getSettingsHandler().getOrder();
698         String JavaDoc userAgent = curi.getUserAgent();
699         if (userAgent == null) {
700             userAgent = order.getUserAgent(curi);
701         }
702         method.setRequestHeader("User-Agent", userAgent);
703         method.setRequestHeader("From", order.getFrom(curi));
704         
705         // Set retry handler.
706
method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
707             new HeritrixHttpMethodRetryHandler());
708         
709         final long maxLength = getMaxLength(curi);
710         if(maxLength > 0 &&
711                 ((Boolean JavaDoc)getUncheckedAttribute(curi, ATTR_SEND_RANGE)).
712                     booleanValue()) {
713             method.addRequestHeader(RANGE,
714                 RANGE_PREFIX.concat(Long.toString(maxLength - 1)));
715         }
716         
717         if (((Boolean JavaDoc)getUncheckedAttribute(curi,
718                 ATTR_SEND_CONNECTION_CLOSE)).booleanValue()) {
719             method.addRequestHeader(HEADER_SEND_CONNECTION_CLOSE);
720         }
721         
722         if (((Boolean JavaDoc)getUncheckedAttribute(curi,
723                 ATTR_SEND_REFERER)).booleanValue()) {
724             // RFC2616 says no referer header if referer is https and the url
725
// is not
726
String JavaDoc via = curi.flattenVia();
727             if (via != null && via.length() > 0 &&
728                 !(via.startsWith(HTTPS_SCHEME) &&
729                     curi.getUURI().getScheme().equals(HTTP_SCHEME))) {
730                 method.setRequestHeader(REFERER, via);
731             }
732         }
733         
734         // TODO: What happens if below method adds a header already
735
// added above: e.g. Connection, Range, or Referer?
736
setAcceptHeaders(curi, method);
737         
738         return configureProxy(curi);
739     }
740
741     /**
742      * Setup proxy, based on attributes in CrawlURI and settings,
743      * for this CrawlURI only.
744      * @return HostConfiguration customized as necessary, or null if no
745      * customization required
746      */

747     private HostConfiguration configureProxy(CrawlURI curi) {
748         String JavaDoc proxy = (String JavaDoc) getAttributeEither(curi, ATTR_HTTP_PROXY_HOST);
749         int port = -1;
750         if(proxy.length()==0) {
751             proxy = null;
752         } else {
753             String JavaDoc portString = (String JavaDoc)getAttributeEither(curi, ATTR_HTTP_PROXY_PORT);
754             port = portString.length()>0 ? Integer.parseInt(portString) : -1;
755         }
756         HostConfiguration config = this.http.getHostConfiguration();
757         if(config.getProxyHost() == proxy && config.getProxyPort() == port) {
758             // no change
759
return null;
760         }
761         if (proxy != null && proxy.equals(config.getProxyHost())
762                 && config.getProxyPort() == port) {
763             // no change
764
return null;
765         }
766         config = new HostConfiguration(config); // copy of config
767
config.setProxy(proxy,port);
768         return config;
769     }
770
771     /**
772      * Get a value either from inside the CrawlURI instance, or from
773      * settings (module attributes).
774      *
775      * @param curi CrawlURI to consult
776      * @param key key to lookup
777      * @return value from either CrawlURI (preferred) or settings
778      */

779     protected Object JavaDoc getAttributeEither(CrawlURI curi, String JavaDoc key) {
780         Object JavaDoc obj = curi!=null ? curi.getObject(key) : null;
781         if(obj==null) {
782             obj = getUncheckedAttribute(curi, key);
783         }
784         return obj;
785     }
786
787     /**
788      * Add credentials if any to passed <code>method</code>.
789      *
790      * Do credential handling. Credentials are in two places. 1. Credentials
791      * that succeeded are added to the CrawlServer (Or rather, avatars for
792      * credentials are whats added because its not safe to keep around
793      * references to credentials). 2. Credentials to be tried are in the curi.
794      * Returns true if found credentials to be tried.
795      *
796      * @param curi Current CrawlURI.
797      * @param method The method to add to.
798      * @return True if prepopulated <code>method</code> with credentials AND the
799      * credentials came from the <code>curi</code>, not from the CrawlServer.
800      * The former is special in that if the <code>curi</curi> credentials
801      * succeed, then the caller needs to promote them from the CrawlURI to the
802      * CrawlServer so they are available for all subsequent CrawlURIs on this
803      * server.
804      */

805     private boolean populateCredentials(CrawlURI curi, HttpMethod method) {
806         // First look at the server avatars. Add any that are to be volunteered
807
// on every request (e.g. RFC2617 credentials). Every time creds will
808
// return true when we call 'isEveryTime().
809
CrawlServer server =
810             getController().getServerCache().getServerFor(curi);
811         if (server.hasCredentialAvatars()) {
812             Set JavaDoc avatars = server.getCredentialAvatars();
813             for (Iterator JavaDoc i = avatars.iterator(); i.hasNext();) {
814                 CredentialAvatar ca = (CredentialAvatar)i.next();
815                 Credential c = ca.getCredential(getSettingsHandler(), curi);
816                 if (c.isEveryTime()) {
817                     c.populate(curi, this.http, method, ca.getPayload());
818                 }
819             }
820         }
821
822         boolean result = false;
823
824         // Now look in the curi. The Curi will have credentials loaded either
825
// by the handle401 method if its a rfc2617 or it'll have been set into
826
// the curi by the preconditionenforcer as this login uri came through.
827
if (curi.hasCredentialAvatars()) {
828             Set JavaDoc avatars = curi.getCredentialAvatars();
829             for (Iterator JavaDoc i = avatars.iterator(); i.hasNext();) {
830                 CredentialAvatar ca = (CredentialAvatar)i.next();
831                 Credential c = ca.getCredential(getSettingsHandler(), curi);
832                 if (c.populate(curi, this.http, method, ca.getPayload())) {
833                     result = true;
834                 }
835             }
836         }
837
838         return result;
839     }
840
841     /**
842      * Promote successful credential to the server.
843      *
844      * @param curi CrawlURI whose credentials we are to promote.
845      */

846     private void promoteCredentials(final CrawlURI curi) {
847         if (!curi.hasCredentialAvatars()) {
848             logger.severe("No credentials to promote when there should be " +
849                 curi);
850         } else {
851             Set JavaDoc avatars = curi.getCredentialAvatars();
852             for (Iterator JavaDoc i = avatars.iterator(); i.hasNext();) {
853                 CredentialAvatar ca = (CredentialAvatar)i.next();
854                 curi.removeCredentialAvatar(ca);
855                 // The server to attach too may not be the server that hosts
856
// this passed curi. It might be of another subdomain.
857
// The avatar needs to be added to the server that is dependent
858
// on this precondition. Find it by name. Get the name from
859
// the credential this avatar represents.
860
Credential c = ca.getCredential(getSettingsHandler(), curi);
861                 String JavaDoc cd = null;
862                 try {
863                     cd = c.getCredentialDomain(curi);
864                 }
865                 catch (AttributeNotFoundException JavaDoc e) {
866                     logger.severe("Failed to get cred domain for " + curi +
867                         " for " + ca + ": " + e.getMessage());
868                 }
869                 if (cd != null) {
870                     CrawlServer cs
871                         = getController().getServerCache().getServerFor(cd);
872                     if (cs != null) {
873                         cs.addCredentialAvatar(ca);
874                     }
875                 }
876             }
877         }
878     }
879
880     /**
881      * Server is looking for basic/digest auth credentials (RFC2617). If we have
882      * any, put them into the CrawlURI and have it come around again. Presence
883      * of the credential serves as flag to frontier to requeue promptly. If we
884      * already tried this domain and still got a 401, then our credentials are
885      * bad. Remove them and let this curi die.
886      *
887      * @param method Method that got a 401.
888      * @param curi CrawlURI that got a 401.
889      */

890     protected void handle401(final HttpMethod method, final CrawlURI curi) {
891         AuthScheme authscheme = getAuthScheme(method, curi);
892         if (authscheme == null) {
893             return;
894         }
895         String JavaDoc realm = authscheme.getRealm();
896         
897         // Look to see if this curi had rfc2617 avatars loaded. If so, are
898
// any of them for this realm? If so, then the credential failed
899
// if we got a 401 and it should be let die a natural 401 death.
900
Set JavaDoc curiRfc2617Credentials = getCredentials(getSettingsHandler(),
901                 curi, Rfc2617Credential.class);
902         Rfc2617Credential extant = Rfc2617Credential.
903             getByRealm(curiRfc2617Credentials, realm, curi);
904         if (extant != null) {
905             // Then, already tried this credential. Remove ANY rfc2617
906
// credential since presence of a rfc2617 credential serves
907
// as flag to frontier to requeue this curi and let the curi
908
// die a natural death.
909
extant.detachAll(curi);
910             logger.warning("Auth failed (401) though supplied realm " +
911                     realm + " to " + curi.toString());
912         } else {
913             // Look see if we have a credential that corresponds to this
914
// realm in credential store. Filter by type and credential
915
// domain. If not, let this curi die. Else, add it to the
916
// curi and let it come around again. Add in the AuthScheme
917
// we got too. Its needed when we go to run the Auth on
918
// second time around.
919
CredentialStore cs =
920                 CredentialStore.getCredentialStore(getSettingsHandler());
921             if (cs == null) {
922                 logger.severe("No credential store for " + curi);
923             } else {
924                 CrawlServer server = getController().getServerCache().
925                     getServerFor(curi);
926                 Set JavaDoc storeRfc2617Credentials = cs.subset(curi,
927                     Rfc2617Credential.class, server.getName());
928                 if (storeRfc2617Credentials == null ||
929                         storeRfc2617Credentials.size() <= 0) {
930                     logger.info("No rfc2617 credentials for " + curi);
931                 } else {
932                     Rfc2617Credential found = Rfc2617Credential.
933                         getByRealm(storeRfc2617Credentials, realm, curi);
934                     if (found == null) {
935                         logger.info("No rfc2617 credentials for realm " +
936                                 realm + " in " + curi);
937                     } else {
938                         found.attach(curi, authscheme.getRealm());
939                         logger.info("Found credential for realm " + realm +
940                             " in store for " + curi.toString());
941                     }
942                 }
943             }
944         }
945     }
946     
947     /**
948      * @param method Method that got a 401.
949      * @param curi CrawlURI that got a 401.
950      * @return Returns first wholesome authscheme found else null.
951      */

952     protected AuthScheme getAuthScheme(final HttpMethod method,
953             final CrawlURI curi) {
954         Header [] headers = method.getResponseHeaders("WWW-Authenticate");
955         if (headers == null || headers.length <= 0) {
956             logger.info("We got a 401 but no WWW-Authenticate challenge: " +
957                 curi.toString());
958             return null;
959         }
960
961         Map JavaDoc authschemes = null;
962         try {
963             authschemes = AuthChallengeParser.parseChallenges(headers);
964         } catch(MalformedChallengeException e) {
965             logger.info("Failed challenge parse: " + e.getMessage());
966         }
967         if (authschemes == null || authschemes.size() <= 0) {
968             logger.info("We got a 401 and WWW-Authenticate challenge" +
969                 " but failed parse of the header " + curi.toString());
970             return null;
971         }
972          
973         AuthScheme result = null;
974         // Use the first auth found.
975
for (Iterator JavaDoc i = authschemes.keySet().iterator();
976                 result == null && i.hasNext();) {
977             String JavaDoc key = (String JavaDoc)i.next();
978             String JavaDoc challenge = (String JavaDoc)authschemes.get(key);
979             if (key == null || key.length() <= 0 || challenge == null ||
980                   challenge.length() <= 0) {
981                 logger.warning("Empty scheme: " + curi.toString() +
982                   ": " + headers);
983             }
984             AuthScheme authscheme = null;
985             if (key.equals("basic")) {
986                 authscheme = new BasicScheme();
987             } else if (key.equals("digest")) {
988                 authscheme = new DigestScheme();
989             } else {
990                 logger.info("Unsupported scheme: " + key);
991                 continue;
992             }
993             
994             try {
995                 authscheme.processChallenge(challenge);
996             } catch (MalformedChallengeException e) {
997                 logger.info(e.getMessage() + " " + curi + " " + headers);
998                 continue;
999             }
1000            if (authscheme.isConnectionBased()) {
1001                logger.info("Connection based " + authscheme);
1002                continue;
1003            }
1004            
1005            if (authscheme.getRealm() == null ||
1006                    authscheme.getRealm().length() <= 0) {
1007                logger.info("Empty realm " + authscheme + " for " + curi);
1008                continue;
1009            }
1010            result = authscheme;
1011        }
1012        
1013        return result;
1014    }
1015        
1016    /**
1017     * @param handler Settings Handler.
1018     * @param curi CrawlURI that got a 401.
1019     * @param type Class of credential to get from curi.
1020     * @return Set of credentials attached to this curi.
1021     */

1022    private Set JavaDoc<Credential> getCredentials(SettingsHandler handler,
1023            CrawlURI curi, Class JavaDoc type) {
1024        Set JavaDoc<Credential> result = null;
1025
1026        if (curi.hasCredentialAvatars()) {
1027            for (Iterator JavaDoc i = curi.getCredentialAvatars().iterator();
1028                    i.hasNext();) {
1029                CredentialAvatar ca = (CredentialAvatar)i.next();
1030                if (ca.match(type)) {
1031                    if (result == null) {
1032                        result = new HashSet JavaDoc<Credential>();
1033                    }
1034                    result.add(ca.getCredential(handler, curi));
1035                }
1036            }
1037        }
1038        return result;
1039    }
1040
1041    public void initialTasks() {
1042        super.initialTasks();
1043        this.getController().addCrawlStatusListener(this);
1044        configureHttp();
1045
1046        // load cookies from a file if specified in the order file.
1047
loadCookies();
1048
1049        // I tried to get the default KeyManagers but doesn't work unless you
1050
// point at a physical keystore. Passing null seems to do the right
1051
// thing so we'll go w/ that.
1052
try {
1053            SSLContext context = SSLContext.getInstance("SSL");
1054            context.init(null, new TrustManager[] {
1055                new ConfigurableX509TrustManager((String JavaDoc)
1056                    getAttribute(ATTR_TRUST))}, null);
1057            this.sslfactory = context.getSocketFactory();
1058        } catch (Exception JavaDoc e) {
1059            logger.log(Level.WARNING, "Failed configure of ssl context "
1060                + e.getMessage(), e);
1061        }
1062    }
1063    
1064    public void finalTasks() {
1065        // At the end save cookies to the file specified in the order file.
1066
saveCookies();
1067        cleanupHttp();
1068        super.finalTasks();
1069    }
1070
1071    /**
1072     * Perform any final cleanup related to the HttpClient instance.
1073     */

1074    protected void cleanupHttp() {
1075        if(cookieDb!=null) {
1076            try {
1077                cookieDb.close();
1078            } catch (DatabaseException e) {
1079                // TODO Auto-generated catch block
1080
e.printStackTrace();
1081            }
1082        }
1083    }
1084
1085    protected void configureHttp() throws RuntimeException JavaDoc {
1086        // Get timeout. Use it for socket and for connection timeout.
1087
int timeout = (getSoTimeout(null) > 0)? getSoTimeout(null): 0;
1088        
1089        // HttpConnectionManager cm = new ThreadLocalHttpConnectionManager();
1090
HttpConnectionManager cm = new SingleHttpConnectionManager();
1091        
1092        // TODO: The following settings should be made in the corresponding
1093
// HttpConnectionManager, not here.
1094
HttpConnectionManagerParams hcmp = cm.getParams();
1095        hcmp.setConnectionTimeout(timeout);
1096        hcmp.setStaleCheckingEnabled(true);
1097        // Minimizes bandwidth usage. Setting to true disables Nagle's
1098
// algorithm. IBM JVMs < 142 give an NPE setting this boolean
1099
// on ssl sockets.
1100
hcmp.setTcpNoDelay(false);
1101        
1102        this.http = new HttpClient(cm);
1103        HttpClientParams hcp = this.http.getParams();
1104        // Set default socket timeout.
1105
hcp.setSoTimeout(timeout);
1106        // Set client to be version 1.0.
1107
hcp.setVersion(HttpVersion.HTTP_1_0);
1108
1109        String JavaDoc addressStr = null;
1110        try {
1111            addressStr = (String JavaDoc) getAttribute(ATTR_LOCAL_ADDRESS);
1112        } catch (Exception JavaDoc e1) {
1113            // If exception, just use default.
1114
}
1115        if (addressStr != null && addressStr.length() > 0) {
1116            try {
1117                InetAddress JavaDoc localAddress = InetAddress.getByName(addressStr);
1118                this.http.getHostConfiguration().setLocalAddress(localAddress);
1119            } catch (UnknownHostException JavaDoc e) {
1120                // Convert all to RuntimeException so get an exception out
1121
// if initialization fails.
1122
throw new RuntimeException JavaDoc("Unknown host " + addressStr
1123                    + " in " + ATTR_LOCAL_ADDRESS);
1124            }
1125        }
1126
1127        configureHttpCookies();
1128        
1129        // Configure how we want the method to act.
1130
this.http.getParams().setParameter(
1131            HttpMethodParams.SINGLE_COOKIE_HEADER, new Boolean JavaDoc(true));
1132        this.http.getParams().setParameter(
1133            HttpMethodParams.UNAMBIGUOUS_STATUS_LINE , new Boolean JavaDoc(false));
1134        this.http.getParams().setParameter(
1135            HttpMethodParams.STRICT_TRANSFER_ENCODING, new Boolean JavaDoc(false));
1136        this.http.getParams().setIntParameter(
1137            HttpMethodParams.STATUS_LINE_GARBAGE_LIMIT, 10);
1138        
1139        HostConfiguration configOrNull = configureProxy(null);
1140        if(configOrNull!=null) {
1141            // global proxy settings are in effect
1142
this.http.setHostConfiguration(configOrNull);
1143        }
1144        
1145        // Use our own protocol factory, one that gets IP to use from
1146
// heritrix cache (They're cached in CrawlHost instances).
1147
final ServerCache cache = getController().getServerCache();
1148        hcmp.setParameter(SERVER_CACHE_KEY, cache);
1149        hcmp.setParameter(SSL_FACTORY_KEY, this.sslfactory);
1150    }
1151
1152    /**
1153     * Set the HttpClient HttpState instance to use a BDB-backed
1154     * StoredSortedMap for cookie storage, if that option is chosen.
1155     */

1156    private void configureHttpCookies() {
1157        // If Bdb-backed cookies chosen, replace map in HttpState
1158
if(((Boolean JavaDoc)getUncheckedAttribute(null, ATTR_BDB_COOKIES)).
1159                booleanValue()) {
1160            try {
1161                Environment env = getController().getBdbEnvironment();
1162                StoredClassCatalog classCatalog = getController().getClassCatalog();
1163                DatabaseConfig dbConfig = new DatabaseConfig();
1164                dbConfig.setTransactional(false);
1165                dbConfig.setAllowCreate(true);
1166                cookieDb = env.openDatabase(null, COOKIEDB_NAME, dbConfig);
1167                StoredSortedMap cookiesMap = new StoredSortedMap(cookieDb,
1168                        new StringBinding(), new SerialBinding(classCatalog,
1169                                Cookie.class), true);
1170                this.http.getState().setCookiesMap(cookiesMap);
1171            } catch (DatabaseException e) {
1172                // TODO Auto-generated catch block
1173
logger.severe(e.getMessage());
1174                e.printStackTrace();
1175            }
1176        }
1177    }
1178
1179    /**
1180     * @param curi Current CrawlURI. Used to get context.
1181     * @return Socket timeout value.
1182     */

1183    private int getSoTimeout(CrawlURI curi) {
1184        Integer JavaDoc res = null;
1185        try {
1186            res = (Integer JavaDoc) getAttribute(ATTR_SOTIMEOUT_MS, curi);
1187        } catch (Exception JavaDoc e) {
1188            res = DEFAULT_SOTIMEOUT_MS;
1189        }
1190        return res.intValue();
1191    }
1192
1193    /**
1194     * @param curi Current CrawlURI. Used to get context.
1195     * @return Timeout value for total request.
1196     */

1197    private int getTimeout(CrawlURI curi) {
1198        Integer JavaDoc res;
1199        try {
1200            res = (Integer JavaDoc) getAttribute(ATTR_TIMEOUT_SECONDS, curi);
1201        } catch (Exception JavaDoc e) {
1202            res = DEFAULT_TIMEOUT_SECONDS;
1203        }
1204        return res.intValue();
1205    }
1206
1207    private int getMaxFetchRate(CrawlURI curi) {
1208        Integer JavaDoc res;
1209        try {
1210            res = (Integer JavaDoc)getAttribute(ATTR_FETCH_BANDWIDTH_MAX, curi);
1211        }
1212        catch (Exception JavaDoc e) {
1213            res = DEFAULT_FETCH_BANDWIDTH_MAX;
1214        }
1215        return res.intValue();
1216    }
1217
1218    private long getMaxLength(CrawlURI curi) {
1219        Long JavaDoc res;
1220        try {
1221            res = (Long JavaDoc) getAttribute(ATTR_MAX_LENGTH_BYTES, curi);
1222            if (res.longValue() == OLD_DEFAULT_MAX_LENGTH_BYTES) {
1223                res = DEFAULT_MAX_LENGTH_BYTES;
1224            }
1225        } catch (Exception JavaDoc e) {
1226            res = DEFAULT_MAX_LENGTH_BYTES;
1227        }
1228        return res.longValue();
1229    }
1230
1231    /**
1232     * Load cookies from a file before the first fetch.
1233     * <p>
1234     * The file is a text file in the Netscape's 'cookies.txt' file format.<br>
1235     * Example entry of cookies.txt file:<br>
1236     * <br>
1237     * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br>
1238     * <br>
1239     * Each line has 7 tab-separated fields:<br>
1240     * <li>1. DOMAIN: The domain that created and have access to the cookie
1241     * value.
1242     * <li>2. FLAG: A TRUE or FALSE value indicating if hosts within the given
1243     * domain can access the cookie value.
1244     * <li>3. PATH: The path within the domain that the cookie value is valid
1245     * for.
1246     * <li>4. SECURE: A TRUE or FALSE value indicating if to use a secure
1247     * connection to access the cookie value.
1248     * <li>5. EXPIRATION: The expiration time of the cookie value (unix style.)
1249     * <li>6. NAME: The name of the cookie value
1250     * <li>7. VALUE: The cookie value
1251     *
1252     * @param cookiesFile file in the Netscape's 'cookies.txt' format.
1253     */

1254    public void loadCookies(String JavaDoc cookiesFile) {
1255        // Do nothing if cookiesFile is not specified.
1256
if (cookiesFile == null || cookiesFile.length() <= 0) {
1257            return;
1258        }
1259        RandomAccessFile JavaDoc raf = null;
1260        try {
1261            raf = new RandomAccessFile JavaDoc(cookiesFile, "r");
1262            String JavaDoc[] cookieParts;
1263            String JavaDoc line;
1264            Cookie cookie = null;
1265            while ((line = raf.readLine()) != null) {
1266                // Line that starts with # is commented line, therefore skip it.
1267
if (!line.startsWith("#")) {
1268                    cookieParts = line.split("\\t");
1269                    if (cookieParts.length == 7) {
1270                        // Create cookie with not expiration date (-1 value).
1271
// TODO: add this as an option.
1272
cookie =
1273                            new Cookie(cookieParts[0], cookieParts[5],
1274                                cookieParts[6], cookieParts[2], -1,
1275                                Boolean.valueOf(cookieParts[3]).booleanValue());
1276
1277                        if (cookieParts[1].toLowerCase().equals("true")) {
1278                            cookie.setDomainAttributeSpecified(true);
1279                        } else {
1280                            cookie.setDomainAttributeSpecified(false);
1281                        }
1282                        this.http.getState().addCookie(cookie);
1283                        logger.fine(
1284                            "Adding cookie: " + cookie.toExternalForm());
1285                    }
1286                }
1287            }
1288        } catch (FileNotFoundException JavaDoc e) {
1289            // We should probably throw FatalConfigurationException.
1290
System.out.println("Could not find file: " + cookiesFile
1291                    + " (Element: " + ATTR_LOAD_COOKIES + ")");
1292
1293        } catch (IOException JavaDoc e) {
1294            // We should probably throw FatalConfigurationException.
1295
e.printStackTrace();
1296        } finally {
1297            try {
1298                if (raf != null) {
1299                    raf.close();
1300                }
1301            } catch (IOException JavaDoc e) {
1302                e.printStackTrace();
1303            }
1304        }
1305    }
1306
1307    /* (non-Javadoc)
1308     * @see org.archive.crawler.framework.Processor#report()
1309     */

1310    public String JavaDoc report() {
1311        StringBuffer JavaDoc ret = new StringBuffer JavaDoc();
1312        ret.append("Processor: org.archive.crawler.fetcher.FetchHTTP\n");
1313        ret.append(" Function: Fetch HTTP URIs\n");
1314        ret.append(" CrawlURIs handled: " + this.curisHandled + "\n");
1315        ret.append(" Recovery retries: " + this.recoveryRetries + "\n\n");
1316
1317        return ret.toString();
1318    }
1319
1320
1321    /**
1322     * Load cookies from the file specified in the order file.
1323     *
1324     * <p>
1325     * The file is a text file in the Netscape's 'cookies.txt' file format.<br>
1326     * Example entry of cookies.txt file:<br>
1327     * <br>
1328     * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br>
1329     * <br>
1330     * Each line has 7 tab-separated fields:<br>
1331     * <li>1. DOMAIN: The domain that created and have access to the cookie
1332     * value.
1333     * <li>2. FLAG: A TRUE or FALSE value indicating if hosts within the given
1334     * domain can access the cookie value.
1335     * <li>3. PATH: The path within the domain that the cookie value is valid
1336     * for.
1337     * <li>4. SECURE: A TRUE or FALSE value indicating if to use a secure
1338     * connection to access the cookie value.
1339     * <li>5. EXPIRATION: The expiration time of the cookie value (unix style.)
1340     * <li>6. NAME: The name of the cookie value
1341     * <li>7. VALUE: The cookie value
1342     */

1343    public void loadCookies() {
1344        try {
1345            loadCookies((String JavaDoc) getAttribute(ATTR_LOAD_COOKIES));
1346        } catch (MBeanException JavaDoc e) {
1347            logger.warning(e.getLocalizedMessage());
1348        } catch (ReflectionException JavaDoc e) {
1349            logger.warning(e.getLocalizedMessage());
1350        } catch (AttributeNotFoundException JavaDoc e) {
1351            logger.warning(e.getLocalizedMessage());
1352        }
1353    }
1354    /**
1355     * Saves cookies to the file specified in the order file.
1356     *
1357     * Output file is in the Netscape 'cookies.txt' format.
1358     *
1359     */

1360    public void saveCookies() {
1361        try {
1362            saveCookies((String JavaDoc) getAttribute(ATTR_SAVE_COOKIES));
1363        } catch (MBeanException JavaDoc e) {
1364            logger.warning(e.getLocalizedMessage());
1365        } catch (ReflectionException JavaDoc e) {
1366            logger.warning(e.getLocalizedMessage());
1367        } catch (AttributeNotFoundException JavaDoc e) {
1368            logger.warning(e.getLocalizedMessage());
1369        }
1370    }
1371    /**
1372     * Saves cookies to a file.
1373     *
1374     * Output file is in the Netscape 'cookies.txt' format.
1375     *
1376     * @param saveCookiesFile output file.
1377     */

1378    public void saveCookies(String JavaDoc saveCookiesFile) {
1379        // Do nothing if cookiesFile is not specified.
1380
if (saveCookiesFile == null || saveCookiesFile.length() <= 0) {
1381            return;
1382        }
1383
1384        FileOutputStream JavaDoc out = null;
1385        try {
1386            out = new FileOutputStream JavaDoc(new File JavaDoc(saveCookiesFile));
1387            @SuppressWarnings JavaDoc("unchecked")
1388            Map JavaDoc<String JavaDoc,Cookie> cookies = http.getState().getCookiesMap();
1389            String JavaDoc tab ="\t";
1390            out.write("# Heritrix Cookie File\n".getBytes());
1391            out.write(
1392                "# This file is the Netscape cookies.txt format\n\n".getBytes());
1393            for (Cookie cookie: cookies.values()) {
1394                MutableString line =
1395                    new MutableString(1024 * 2 /*Guess an initial size*/);
1396                line.append(cookie.getDomain());
1397                line.append(tab);
1398                line.append(
1399                    cookie.isDomainAttributeSpecified() == true
1400                        ? "TRUE"
1401                        : "FALSE");
1402                line.append(tab);
1403                line.append(cookie.getPath());
1404                line.append(tab);
1405                line.append(
1406                    cookie.getSecure() == true ? "TRUE" : "FALSE");
1407                line.append(tab);
1408                line.append(cookie.getName());
1409                line.append(tab);
1410                line.append((null==cookie.getValue())?"":cookie.getValue());
1411                line.append("\n");
1412                out.write(line.toString().getBytes());
1413            }
1414        } catch (FileNotFoundException JavaDoc e) {
1415            // We should probably throw FatalConfigurationException.
1416
System.out.println("Could not find file: " + saveCookiesFile
1417                    + " (Element: " + ATTR_SAVE_COOKIES + ")");
1418        } catch (IOException JavaDoc e) {
1419            e.printStackTrace();
1420        } finally {
1421            try {
1422                if (out != null) {
1423                    out.close();
1424                }
1425            } catch (IOException JavaDoc e) {
1426                e.printStackTrace();
1427            }
1428        }
1429    }
1430
1431    /* (non-Javadoc)
1432     * @see org.archive.crawler.settings.ModuleType#listUsedFiles(java.util.List)
1433     */

1434    protected void listUsedFiles(List JavaDoc<String JavaDoc> list) {
1435        // List the cookies files
1436
// Add seed file
1437
try {
1438            String JavaDoc tmp = (String JavaDoc)getAttribute(ATTR_LOAD_COOKIES);
1439            if(tmp != null && tmp.length() > 0){
1440                File JavaDoc file = getSettingsHandler().
1441                        getPathRelativeToWorkingDirectory(tmp);
1442                list.add(file.getAbsolutePath());
1443            }
1444            tmp = (String JavaDoc)getAttribute(ATTR_SAVE_COOKIES);
1445            if(tmp != null && tmp.length() > 0){
1446                File JavaDoc file = getSettingsHandler().
1447                        getPathRelativeToWorkingDirectory(tmp);
1448                list.add(file.getAbsolutePath());
1449            }
1450        } catch (AttributeNotFoundException JavaDoc e) {
1451            // TODO Auto-generated catch block
1452
e.printStackTrace();
1453        } catch (MBeanException JavaDoc e) {
1454            // TODO Auto-generated catch block
1455
e.printStackTrace();
1456        } catch (ReflectionException JavaDoc e) {
1457            // TODO Auto-generated catch block
1458
e.printStackTrace();
1459        }
1460    }
1461    
1462    private void setAcceptHeaders(CrawlURI curi, HttpMethod get) {
1463        try {
1464            StringList accept_headers = (StringList) getAttribute(ATTR_ACCEPT_HEADERS, curi);
1465            if (!accept_headers.isEmpty()) {
1466                for (ListIterator JavaDoc i = accept_headers.listIterator(); i.hasNext();) {
1467                    String JavaDoc hdr = (String JavaDoc) i.next();
1468                    String JavaDoc[] nvp = hdr.split(": +");
1469                    if (nvp.length == 2) {
1470                        get.setRequestHeader(nvp[0], nvp[1]);
1471                    }
1472                    else {
1473                        logger.warning("Invalid accept header: " + hdr);
1474                    }
1475                }
1476            }
1477        }
1478        catch (AttributeNotFoundException JavaDoc e) {
1479            logger.severe(e.getMessage());
1480        }
1481    }
1482
1483    // custom serialization
1484
private void writeObject(ObjectOutputStream JavaDoc stream) throws IOException JavaDoc {
1485        stream.defaultWriteObject();
1486        // save cookies
1487
@SuppressWarnings JavaDoc("unchecked")
1488        Collection JavaDoc<Cookie> c = http.getState().getCookiesMap().values();
1489        Cookie[] cookies = c.toArray(new Cookie[c.size()]);
1490        stream.writeObject(cookies);
1491    }
1492    
1493    private void readObject(ObjectInputStream JavaDoc stream) throws IOException JavaDoc, ClassNotFoundException JavaDoc {
1494        stream.defaultReadObject();
1495        Cookie cookies[] = (Cookie[]) stream.readObject();
1496        ObjectPlusFilesInputStream coistream = (ObjectPlusFilesInputStream)stream;
1497        coistream.registerFinishTask( new PostRestore(cookies) );
1498    }
1499    
1500    /**
1501     * @return Returns the http instance.
1502     */

1503    protected HttpClient getHttp() {
1504        return this.http;
1505    }
1506    
1507    class PostRestore implements Runnable JavaDoc {
1508        Cookie cookies[];
1509        public PostRestore(Cookie cookies[]) {
1510            this.cookies = cookies;
1511        }
1512        public void run() {
1513            configureHttp();
1514            for(int i = 0; i < cookies.length; i++) {
1515                getHttp().getState().addCookie(cookies[i]);
1516            }
1517        }
1518    }
1519
1520    /* (non-Javadoc)
1521     * @see org.archive.crawler.event.CrawlStatusListener#crawlStarted(java.lang.String)
1522     */

1523    public void crawlStarted(String JavaDoc message) {
1524        // TODO Auto-generated method stub
1525
}
1526    
1527    /* (non-Javadoc)
1528     * @see org.archive.crawler.event.CrawlStatusListener#crawlStarted(java.lang.String)
1529     */

1530    public void crawlCheckpoint(File JavaDoc checkpointDir) {
1531        // TODO Auto-generated method stub
1532
}
1533
1534    /* (non-Javadoc)
1535     * @see org.archive.crawler.event.CrawlStatusListener#crawlEnding(java.lang.String)
1536     */

1537    public void crawlEnding(String JavaDoc sExitMessage) {
1538        // TODO Auto-generated method stub
1539
}
1540
1541    /* (non-Javadoc)
1542     * @see org.archive.crawler.event.CrawlStatusListener#crawlEnded(java.lang.String)
1543     */

1544    public void crawlEnded(String JavaDoc sExitMessage) {
1545        this.http = null;
1546        this.midfetchfilters = null;
1547    }
1548
1549    /* (non-Javadoc)
1550     * @see org.archive.crawler.event.CrawlStatusListener#crawlPausing(java.lang.String)
1551     */

1552    public void crawlPausing(String JavaDoc statusMessage) {
1553        // TODO Auto-generated method stub
1554
}
1555
1556    /* (non-Javadoc)
1557     * @see org.archive.crawler.event.CrawlStatusListener#crawlPaused(java.lang.String)
1558     */

1559    public void crawlPaused(String JavaDoc statusMessage) {
1560        // TODO Auto-generated method stub
1561
}
1562
1563    /* (non-Javadoc)
1564     * @see org.archive.crawler.event.CrawlStatusListener#crawlResuming(java.lang.String)
1565     */

1566    public void crawlResuming(String JavaDoc statusMessage) {
1567        // TODO Auto-generated method stub
1568
}
1569}
1570
Popular Tags