KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > datamodel > CrawlURI


1 /* Copyright (C) 2003 Internet Archive.
2  *
3  * This file is part of the Heritrix web crawler (crawler.archive.org).
4  *
5  * Heritrix is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser Public License as published by
7  * the Free Software Foundation; either version 2.1 of the License, or
8  * any later version.
9  *
10  * Heritrix is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser Public License
16  * along with Heritrix; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  *
19  * CrawlURI.java
20  * Created on Apr 16, 2003
21  *
22  * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/datamodel/CrawlURI.java,v 1.101.4.1 2007/01/13 01:31:09 stack-sf Exp $
23  */

24 package org.archive.crawler.datamodel;
25
26 import java.io.IOException JavaDoc;
27 import java.io.ObjectInputStream JavaDoc;
28 import java.io.ObjectOutputStream JavaDoc;
29 import java.util.ArrayList JavaDoc;
30 import java.util.Collection JavaDoc;
31 import java.util.HashSet JavaDoc;
32 import java.util.Iterator JavaDoc;
33 import java.util.List JavaDoc;
34 import java.util.Set JavaDoc;
35 import java.util.concurrent.CopyOnWriteArrayList JavaDoc;
36
37 import org.apache.commons.httpclient.HttpStatus;
38 import org.apache.commons.httpclient.URIException;
39 import org.archive.crawler.datamodel.credential.CredentialAvatar;
40 import org.archive.crawler.datamodel.credential.Rfc2617Credential;
41 import org.archive.crawler.extractor.Link;
42 import org.archive.crawler.framework.Processor;
43 import org.archive.crawler.framework.ProcessorChain;
44 import org.archive.crawler.util.Transform;
45 import org.archive.net.UURI;
46 import org.archive.net.UURIFactory;
47 import org.archive.util.Base32;
48 import org.archive.util.HttpRecorder;
49
50 import st.ata.util.AList;
51 import st.ata.util.HashtableAList;
52
53
54 /**
55  * Represents a candidate URI and the associated state it
56  * collects as it is crawled.
57  *
58  * <p>Core state is in instance variables but a flexible
59  * attribute list is also available. Use this 'bucket' to carry
60  * custom processing extracted data and state across CrawlURI
61  * processing. See the {@link #putString(String, String)},
62  * {@link #getString(String)}, etc.
63  *
64  * @author Gordon Mohr
65  */

66 public class CrawlURI extends CandidateURI
67 implements FetchStatusCodes {
68
// Fixed serialization version identifier for this class.
69     private static final long serialVersionUID = 7874096757350100472L;
70
// Sentinel meaning "not computed yet"; it is the initial value of the
// hop-count and content size/length fields declared in this class.
71     public static final int UNCALCULATED = -1;
72     
73     // INHERITED FROM CANDIDATEURI
74
// uuri: core identity: the "usable URI" to be crawled
75
// isSeed
76
// inScopeVersion
77
// pathFromSeed
78
// via
79

80     // Processing progress
81
// Next processor to run for this URI; transient, so not serialized.
transient private Processor nextProcessor;
// Chain to continue with once the current chain is done; also transient.
82     transient private ProcessorChain nextProcessorChain;
83     private int fetchStatus = 0; // default to unattempted
84
private int deferrals = 0; // count of postponements for prerequisites
85
private int fetchAttempts = 0; // the number of fetch attempts that have been made
86
// Number of the ToeThread handling this URI (see setThreadNumber); transient.
transient private int threadNumber;
87
88     // dynamic context
89
private int linkHopCount = UNCALCULATED; // from seeds
90
private int embedHopCount = UNCALCULATED; // from a sure link; reset upon any link traversal
91

92     // User agent to masquerade as when crawling this URI. If null, globals should be used
93
private String JavaDoc userAgent = null;
94
95     // Once a link extractor has finished processing this curi this will be
96
// set as true
97
// Cleared again by processingCleanup(); transient, so not serialized.
transient private boolean linkExtractorFinished = false;
98
99     /**
100      * Protection against outlink overflow.
101      * Change value by setting alternate maximum in heritrix.properties.
102      */

// Read once at class initialisation from the system property
// "<CrawlURI class name>.maxOutLinks"; falls back to 6000 when unset.
103     public static final int MAX_OUTLINKS = Integer.
104         parseInt(System.getProperty(CrawlURI.class.getName() + ".maxOutLinks",
105             "6000"));
106     
// Count reported as a "dol:" annotation by linkExtractorFinished() when > 0.
107     transient private int discardedOutlinks = 0;
108     
109 ////////////////////////////////////////////////////////////////////
110
// Both start as UNCALCULATED and are reset to it by processingCleanup().
private long contentSize = UNCALCULATED;
111     private long contentLength = UNCALCULATED;
112
113     /**
114      * Current http recorder.
115      *
116      * Gets set upon successful request. Reset at start of processing chain.
117      */

// Transient, so not serialized; processingCleanup() sets it back to null.
118     private transient HttpRecorder httpRecorder = null;
119
120     /**
121      * Content type of a successfully fetched URI.
122      *
123      * May be null even on successfully fetched URI.
124      */

// Written by setContentType(String) and read by getContentType().
125     private String JavaDoc contentType = null;
126
127     /**
128      * True if this CrawlURI has been deemed a prerequisite by the
129      * {@link org.archive.crawler.prefetch.PreconditionEnforcer}.
130      *
131      * This flag is used at least inside in the precondition enforcer so that
132      * subsequent prerequisite tests know to let this CrawlURI through because
133      * its a prerequisite needed by an earlier prerequisite tests (e.g. If
134      * this is a robots.txt, then the subsequent login credentials prereq
135      * test must not throw it out because its not a login curi).
136      */

// Reset to false at the end of every pass by processingCleanup().
137     private boolean prerequisite = false;
138
139     /**
140      * Set to true if this <code>curi</code> is to be POST'd rather than GET-d.
141      */

// NOTE(review): no accessor for this flag is visible in this part of the
// file; the isPost() mentioned in other Javadoc is presumably defined later.
142     private boolean post = false;
143
144     /**
145      * Monotonically increasing number within a crawl;
146      * useful for tending towards breadth-first ordering.
147      * Will sometimes be truncated to 48 bits, so behavior
148      * over 281 trillion instantiated CrawlURIs may be
149      * buggy
150      */

// Assigned by the CrawlURI(CandidateURI, long) constructor; the
// CrawlURI(UURI) constructor leaves it at the default of 0.
151     protected long ordinal;
152
153     /**
154      * Cache of this candidate uuri as a string.
155      *
156      * Profiling shows us spending about 1-2% of total elapsed time in
157      * toString.
158      */

// Filled in lazily by getCrawlURIString() on first use.
159     private String JavaDoc cachedCrawlURIString = null;
160     
161     /**
162      * Array to hold keys of alist members that persist across URI processings.
163      * Any key mentioned in this list will not be cleared out at the end
164      * of a pass down the processing chain.
165      */

// Static, so shared by every CrawlURI; getPersistentAList() copies these
// keys into the fresh attribute list. Seeded with the credential-avatars key.
166     private static final List JavaDoc<Object JavaDoc> alistPersistentMember
167      = new CopyOnWriteArrayList JavaDoc<Object JavaDoc>(
168             new String JavaDoc [] {A_CREDENTIAL_AVATARS_KEY});
169
170     /**
171      * A digest (hash, usually SHA1) of retrieved content-body.
172      *
173      */

174     private byte[] contentDigest = null;
// NOTE(review): presumably names the algorithm that produced contentDigest;
// inferred from the field name only - confirm against its setter.
175     private String JavaDoc contentDigestScheme = null;
176
177
178     /**
179      * Create a new instance of CrawlURI from a {@link UURI}.
180      *
181      * @param uuri the UURI to base this CrawlURI on.
182      */

public CrawlURI(UURI uuri) {
    // Delegates entirely to the CandidateURI constructor; every field
    // declared in this class keeps its declared default.
    super(uuri);
}
186
187     /**
188      * Create a new instance of CrawlURI from a {@link CandidateURI}
189      *
190      * @param caUri the CandidateURI to base this CrawlURI on.
191      * @param o Monotonically increasing number within a crawl.
192      */

@SuppressWarnings("deprecation")
public CrawlURI(CandidateURI caUri, long o) {
    super(
        caUri.getUURI(),
        caUri.getPathFromSeed(),
        caUri.getVia(),
        caUri.getViaContext());
    this.ordinal = o;
    // Carry over the candidate state that the super constructor does not take.
    this.setIsSeed(caUri.isSeed());
    this.setSchedulingDirective(caUri.getSchedulingDirective());
    this.setAList(caUri.getAList());
}
202
203     /**
204      * Takes a status code and converts it into a human readable string.
205      *
206      * @param code the status code
207      * @return a human readable string declaring what the status code is.
208      */

209     public static String JavaDoc fetchStatusCodesToString(int code){
210         switch(code){
211             // DNS
212
case S_DNS_SUCCESS : return "DNS-1-OK";
213             // HTTP Informational 1xx
214
case 100 : return "HTTP-100-Info-Continue";
215             case 101 : return "HTTP-101-Info-Switching Protocols";
216             // HTTP Successful 2xx
217
case 200 : return "HTTP-200-Success-OK";
218             case 201 : return "HTTP-201-Success-Created";
219             case 202 : return "HTTP-202-Success-Accepted";
220             case 203 : return "HTTP-203-Success-Non-Authoritative";
221             case 204 : return "HTTP-204-Success-No Content ";
222             case 205 : return "HTTP-205-Success-Reset Content";
223             case 206 : return "HTTP-206-Success-Partial Content";
224             // HTTP Redirection 3xx
225
case 300 : return "HTTP-300-Redirect-Multiple Choices";
226             case 301 : return "HTTP-301-Redirect-Moved Permanently";
227             case 302 : return "HTTP-302-Redirect-Found";
228             case 303 : return "HTTP-303-Redirect-See Other";
229             case 304 : return "HTTP-304-Redirect-Not Modified";
230             case 305 : return "HTTP-305-Redirect-Use Proxy";
231             case 307 : return "HTTP-307-Redirect-Temporary Redirect";
232             // HTTP Client Error 4xx
233
case 400 : return "HTTP-400-ClientErr-Bad Request";
234             case 401 : return "HTTP-401-ClientErr-Unauthorized";
235             case 402 : return "HTTP-402-ClientErr-Payment Required";
236             case 403 : return "HTTP-403-ClientErr-Forbidden";
237             case 404 : return "HTTP-404-ClientErr-Not Found";
238             case 405 : return "HTTP-405-ClientErr-Method Not Allowed";
239             case 407 : return "HTTP-406-ClientErr-Not Acceptable";
240             case 408 : return "HTTP-407-ClientErr-Proxy Authentication Required";
241             case 409 : return "HTTP-408-ClientErr-Request Timeout";
242             case 410 : return "HTTP-409-ClientErr-Conflict";
243             case 406 : return "HTTP-410-ClientErr-Gone";
244             case 411 : return "HTTP-411-ClientErr-Length Required";
245             case 412 : return "HTTP-412-ClientErr-Precondition Failed";
246             case 413 : return "HTTP-413-ClientErr-Request Entity Too Large";
247             case 414 : return "HTTP-414-ClientErr-Request-URI Too Long";
248             case 415 : return "HTTP-415-ClientErr-Unsupported Media Type";
249             case 416 : return "HTTP-416-ClientErr-Requested Range Not Satisfiable";
250             case 417 : return "HTTP-417-ClientErr-Expectation Failed";
251             // HTTP Server Error 5xx
252
case 500 : return "HTTP-500-ServerErr-Internal Server Error";
253             case 501 : return "HTTP-501-ServerErr-Not Implemented";
254             case 502 : return "HTTP-502-ServerErr-Bad Gateway";
255             case 503 : return "HTTP-503-ServerErr-Service Unavailable";
256             case 504 : return "HTTP-504-ServerErr-Gateway Timeout";
257             case 505 : return "HTTP-505-ServerErr-HTTP Version Not Supported";
258             // Heritrix internal codes (all negative numbers
259
case S_BLOCKED_BY_USER:
260                 return "Heritrix(" + S_BLOCKED_BY_USER + ")-Blocked by user";
261             case S_BLOCKED_BY_CUSTOM_PROCESSOR:
262                 return "Heritrix(" + S_BLOCKED_BY_CUSTOM_PROCESSOR +
263                 ")-Blocked by custom prefetch processor";
264             case S_DELETED_BY_USER:
265                 return "Heritrix(" + S_DELETED_BY_USER + ")-Deleted by user";
266             case S_CONNECT_FAILED:
267                 return "Heritrix(" + S_CONNECT_FAILED + ")-Connection failed";
268             case S_CONNECT_LOST:
269                 return "Heritrix(" + S_CONNECT_LOST + ")-Connection lost";
270             case S_DEEMED_CHAFF:
271                 return "Heritrix(" + S_DEEMED_CHAFF + ")-Deemed chaff";
272             case S_DEFERRED:
273                 return "Heritrix(" + S_DEFERRED + ")-Deferred";
274             case S_DOMAIN_UNRESOLVABLE:
275                 return "Heritrix(" + S_DOMAIN_UNRESOLVABLE
276                         + ")-Domain unresolvable";
277             case S_OUT_OF_SCOPE:
278                 return "Heritrix(" + S_OUT_OF_SCOPE + ")-Out of scope";
279             case S_DOMAIN_PREREQUISITE_FAILURE:
280                 return "Heritrix(" + S_DOMAIN_PREREQUISITE_FAILURE
281                         + ")-Domain prerequisite failure";
282             case S_ROBOTS_PREREQUISITE_FAILURE:
283                 return "Heritrix(" + S_ROBOTS_PREREQUISITE_FAILURE
284                         + ")-Robots prerequisite failure";
285             case S_OTHER_PREREQUISITE_FAILURE:
286                 return "Heritrix(" + S_OTHER_PREREQUISITE_FAILURE
287                         + ")-Other prerequisite failure";
288             case S_PREREQUISITE_UNSCHEDULABLE_FAILURE:
289                 return "Heritrix(" + S_PREREQUISITE_UNSCHEDULABLE_FAILURE
290                         + ")-Prerequisite unschedulable failure";
291             case S_ROBOTS_PRECLUDED:
292                 return "Heritrix(" + S_ROBOTS_PRECLUDED + ")-Robots precluded";
293             case S_RUNTIME_EXCEPTION:
294                 return "Heritrix(" + S_RUNTIME_EXCEPTION
295                         + ")-Runtime exception";
296             case S_SERIOUS_ERROR:
297                 return "Heritrix(" + S_SERIOUS_ERROR + ")-Serious error";
298             case S_TIMEOUT:
299                 return "Heritrix(" + S_TIMEOUT + ")-Timeout";
300             case S_TOO_MANY_EMBED_HOPS:
301                 return "Heritrix(" + S_TOO_MANY_EMBED_HOPS
302                         + ")-Too many embed hops";
303             case S_TOO_MANY_LINK_HOPS:
304                 return "Heritrix(" + S_TOO_MANY_LINK_HOPS
305                         + ")-Too many link hops";
306             case S_TOO_MANY_RETRIES:
307                 return "Heritrix(" + S_TOO_MANY_RETRIES + ")-Too many retries";
308             case S_UNATTEMPTED:
309                 return "Heritrix(" + S_UNATTEMPTED + ")-Unattempted";
310             case S_UNFETCHABLE_URI:
311                 return "Heritrix(" + S_UNFETCHABLE_URI + ")-Unfetchable URI";
312             case S_PROCESSING_THREAD_KILLED:
313                 return "Heritrix(" + S_PROCESSING_THREAD_KILLED + ")-" +
314                     "Processing thread killed";
315             // Unknown return code
316
default : return Integer.toString(code);
317         }
318     }
319
320
321     /**
322      * Return the overall/fetch status of this CrawlURI for its
323      * current trip through the processing loop.
324      *
325      * @return a value from FetchStatusCodes
326      */

327     public int getFetchStatus(){
328         return fetchStatus;
329     }
330
331     /**
332      * Set the overall/fetch status of this CrawlURI for
333      * its current trip through the processing loop.
334      *
335      * @param newstatus a value from FetchStatusCodes
336      */

337     public void setFetchStatus(int newstatus){
338         fetchStatus = newstatus;
339     }
340
341     /**
342      * Get the number of attempts at getting the document referenced by this
343      * URI.
344      *
345      * @return the number of attempts at getting the document referenced by this
346      * URI.
347      */

348     public int getFetchAttempts() {
349         return fetchAttempts;
350     }
351
352     /**
353      * Increment the number of attempts at getting the document referenced by
354      * this URI.
355      *
356      * @return the number of attempts recorded before this increment (the
357      * counter is post-incremented, so the first call returns 0).
358      */

359     public int incrementFetchAttempts() {
360         // TODO: rename, this is actually processing-loop-attempts
361
return fetchAttempts++;
362     }
363
364     /**
365      * Reset fetchAttempts counter.
366      */

367     public void resetFetchAttempts() {
368         this.fetchAttempts = 0;
369     }
370
371     /**
372      * Reset deferrals counter.
373      */

374     public void resetDeferrals() {
375         this.deferrals = 0;
376     }
377
378     /**
379      * Get the next processor to process this URI.
380      *
381      * @return the processor that should process this URI next.
382      */

383     public Processor nextProcessor() {
384         return nextProcessor;
385     }
386
387     /**
388      * Get the processor chain that should be processing this URI after the
389      * current chain is finished with it.
390      *
391      * @return the next processor chain to process this URI.
392      */

393     public ProcessorChain nextProcessorChain() {
394         return nextProcessorChain;
395     }
396
397     /**
398      * Set the next processor to process this URI.
399      *
400      * @param processor the next processor to process this URI.
401      */

402     public void setNextProcessor(Processor processor) {
403         nextProcessor = processor;
404     }
405
406     /**
407      * Set the next processor chain to process this URI.
408      *
409      * @param nextProcessorChain the next processor chain to process this URI.
410      */

411     public void setNextProcessorChain(ProcessorChain nextProcessorChain) {
412         this.nextProcessorChain = nextProcessorChain;
413     }
414
415     /**
416      * Do all actions associated with setting a <code>CrawlURI</code> as
417      * requiring a prerequisite.
418      *
419      * @param preq String from which the prerequisite {@link Link} is built.
420      * @param lastProcessorChain Last processor chain reference. This chain is
421      * where this <code>CrawlURI</code> goes next.
422      * @throws URIException if building the link from <code>preq</code> fails.
423      */

424     public void markPrerequisite(String JavaDoc preq,
425             ProcessorChain lastProcessorChain) throws URIException {
426         Link link = createLink(preq,Link.PREREQ_MISC,Link.PREREQ_HOP);
427         setPrerequisiteUri(link);
428         incrementDeferrals();
429         setFetchStatus(S_DEFERRED);
430         skipToProcessorChain(lastProcessorChain);
431     }
432
433     /**
434      * Set a prerequisite for this URI.
435      * <p>
436      * A prerequisite is a URI that must be crawled before this URI can be
437      * crawled.
438      *
439      * @param link Link to set as prereq.
440      */

441     public void setPrerequisiteUri(Object JavaDoc link) {
442         putObject(A_PREREQUISITE_URI, link);
443     }
444
445     /**
446      * Get the prerequisite for this URI.
447      * <p>
448      * A prerequisite is a URI that must be crawled before this URI can be
449      * crawled.
450      *
451      * @return the prerequisite for this URI or null if no prerequisite.
452      */

453     public Object JavaDoc getPrerequisiteUri() {
454         return getObject(A_PREREQUISITE_URI);
455     }
456     
457     /**
458      * @return True if this CrawlURI has a prerequisite.
459      */

460     public boolean hasPrerequisiteUri() {
461         return containsKey(A_PREREQUISITE_URI);
462     }
463
464     /**
465      * Returns true if this CrawlURI is a prerequisite.
466      *
467      * @return true if this CrawlURI is a prerequisite.
468      */

469     public boolean isPrerequisite() {
470         return this.prerequisite;
471     }
472
473     /**
474      * Set if this CrawlURI is itself a prerequisite URI.
475      *
476      * @param prerequisite True if this CrawlURI is itself a prerequisite URI.
477      */

478     public void setPrerequisite(boolean prerequisite) {
479         this.prerequisite = prerequisite;
480     }
481
482     /**
483      * @return This crawl URI as a string wrapped with 'CrawlURI(' +
484      * ')'.
485      */

486     public String JavaDoc getCrawlURIString() {
487         if (this.cachedCrawlURIString == null) {
488             synchronized (this) {
489                 if (this.cachedCrawlURIString == null) {
490                     this.cachedCrawlURIString =
491                         "CrawlURI(" + toString() + ")";
492                 }
493             }
494         }
495         return this.cachedCrawlURIString;
496     }
497
498     /**
499      * Get the content type of this URI.
500      *
501      * @return Fetched URIs content type. May be null.
502      */

503     public String JavaDoc getContentType() {
504         return this.contentType;
505     }
506
507     /**
508      * Set a fetched uri's content type.
509      *
510      * @param ct Contenttype. May be null.
511      */

512     public void setContentType(String JavaDoc ct) {
513         this.contentType = ct;
514     }
515
516     /**
517      * Set the number of the ToeThread responsible for processing this uri.
518      *
519      * @param i the ToeThread number.
520      */

521     public void setThreadNumber(int i) {
522         threadNumber = i;
523     }
524
525     /**
526      * Get the number of the ToeThread responsible for processing this uri.
527      *
528      * @return the ToeThread number.
529      */

530     public int getThreadNumber() {
531         return threadNumber;
532     }
533
534     /**
535      * Increment the deferral count.
536      *
537      */

538     public void incrementDeferrals() {
539         deferrals++;
540     }
541
542     /**
543      * Get the deferral count.
544      *
545      * @return the deferral count.
546      */

547     public int getDeferrals() {
548         return deferrals;
549     }
550
551     /**
552      * Remove all attributes set on this uri.
553      * <p>
554      * This methods removes the attribute list.
555      */

556     public void stripToMinimal() {
557         clearAList();
558     }
559
560     /** Get the size in bytes of this URI's content.
561      * This may be set at any time by any class and therefore should not be
562      * trusted. Primarily it exists to ease the calculation of statistics.
563      * @return contentSize
564      */

565     public long getContentSize(){
566         return contentSize;
567     }
568
569     /**
570      * Make note of a non-fatal error, local to a particular Processor,
571      * which should be logged somewhere, but allows processing to continue.
572      *
573      * This is how you add to the local-error log (the 'localized' in
574      * the below is making an error local rather than global, not
575      * making a swiss-french version of the error.).
576      *
577      * @param processorName Name of processor the exception was thrown
578      * in.
579      * @param ex Throwable to log.
580      * @param message Extra message to log beyond exception message.
581      */

582     public void addLocalizedError(final String JavaDoc processorName,
583             final Throwable JavaDoc ex, final String JavaDoc message) {
584         List JavaDoc<LocalizedError> localizedErrors;
585         if (containsKey(A_LOCALIZED_ERRORS)) {
586             @SuppressWarnings JavaDoc("unchecked")
587             List JavaDoc<LocalizedError> temp // to prevent warning on cast
588
= (List JavaDoc<LocalizedError>) getObject(A_LOCALIZED_ERRORS);
589             localizedErrors = temp;
590         } else {
591             localizedErrors = new ArrayList JavaDoc<LocalizedError>();
592             putObject(A_LOCALIZED_ERRORS, localizedErrors);
593         }
594
595         localizedErrors.add(new LocalizedError(processorName, ex, message));
596         addAnnotation("le:" + getClassSimpleName(ex.getClass()) + "@" +
597             processorName);
598     }
599     
600     // TODO: Move to utils.
601
protected String JavaDoc getClassSimpleName(final Class JavaDoc c) {
602         String JavaDoc classname = c.getName();
603         int index = classname.lastIndexOf('.');
604         return ((index > 0 && (index + 1) < classname.length())?
605             classname.substring(index + 1): classname);
606     }
607
608     /**
609      * Add an annotation: an abbreviated indication of something special
610      * about this URI that need not be present in every crawl.log line,
611      * but should be noted for future reference.
612      *
613      * @param annotation the annotation to add; should not contain
614      * whitespace or a comma
615      */

616     public void addAnnotation(String JavaDoc annotation) {
617         String JavaDoc annotations;
618         if(containsKey(A_ANNOTATIONS)) {
619             annotations = getString(A_ANNOTATIONS);
620             annotations += ","+annotation;
621         } else {
622             annotations = annotation;
623         }
624
625         putString(A_ANNOTATIONS,annotations);
626     }
627     
628     /**
629      * TODO: Implement truncation using booleans rather than as this
630      * ugly String parse.
631      * @return True if fetch was truncated.
632      */

633     public boolean isTruncatedFetch() {
634         return annotationContains(TRUNC_SUFFIX);
635     }
636     
637     public boolean isLengthTruncatedFetch() {
638         return annotationContains(LENGTH_TRUNC);
639     }
640     
641     public boolean isTimeTruncatedFetch() {
642         return annotationContains(TIMER_TRUNC);
643     }
644     
645     public boolean isHeaderTruncatedFetch() {
646         return annotationContains(HEADER_TRUNC);
647     }
648     
649     protected boolean annotationContains(final String JavaDoc str2Find) {
650         boolean result = false;
651         if (!containsKey(A_ANNOTATIONS)) {
652             return result;
653         }
654         String JavaDoc annotations = getString(A_ANNOTATIONS);
655         if (annotations != null && annotations.length() > 0) {
656             result = annotations.indexOf(str2Find) >= 0;
657         }
658         return result;
659     }
660
661     /**
662      * Get the annotations set for this uri.
663      *
664      * @return the annotations set for this uri.
665      */

666     public String JavaDoc getAnnotations() {
667         return (containsKey(A_ANNOTATIONS))?
668             getString(A_ANNOTATIONS): null;
669     }
670
671     /**
672      * Get the embedded hop count.
673      *
674      * @return the embedded hop count.
675      */

676     public int getEmbedHopCount() {
677         return embedHopCount;
678     }
679
680     /**
681      * Get the link hop count.
682      *
683      * @return the link hop count.
684      */

685     public int getLinkHopCount() {
686         return linkHopCount;
687     }
688
689     /**
690      * Mark this uri as being a seed.
691      *
692      */

693     public void markAsSeed() {
694         linkHopCount = 0;
695         embedHopCount = 0;
696     }
697
698     /**
699      * Get the user agent to use for crawling this URI.
700      *
701      * If null the global setting should be used.
702      *
703      * @return user agent or null
704      */

705     public String JavaDoc getUserAgent() {
706         return userAgent;
707     }
708
709     /**
710      * Set the user agent to use when crawling this URI.
711      *
712      * If not set the global settings should be used.
713      *
714      * @param string user agent to use
715      */

716     public void setUserAgent(String JavaDoc string) {
717         userAgent = string;
718     }
719
720     /**
721      * Set which processor should be the next processor to process this uri
722      * instead of using the default next processor.
723      *
724      * @param processorChain the processor chain to skip to.
725      * @param processor the processor in the processor chain to skip to.
726      */

727     public void skipToProcessor(ProcessorChain processorChain,
728             Processor processor) {
729         setNextProcessorChain(processorChain);
730         setNextProcessor(processor);
731     }
732
733     /**
734      * Set which processor chain should be processing this uri next.
735      *
736      * @param processorChain the processor chain to skip to.
737      */

738     public void skipToProcessorChain(ProcessorChain processorChain) {
739         setNextProcessorChain(processorChain);
740         setNextProcessor(null);
741     }
742
743     /**
744      * For completed HTTP transactions, the length of the content-body.
745      *
746      * @return For completed HTTP transactions, the length of the content-body.
747      */

748     public long getContentLength() {
749         if (this.contentLength < 0) {
750             this.contentLength = (getHttpRecorder() != null)?
751                 getHttpRecorder().getResponseContentLength(): 0;
752         }
753         return this.contentLength;
754     }
755
756     /**
757      * @param l Content size.
758      */

759     public void setContentSize(long l) {
760         contentSize = l;
761     }
762
763     /**
764      * If true then a link extractor has already claimed this CrawlURI and
765      * performed link extraction on the document content. This does not
766      * preclude other link extractors that may have an interest in this
767      * CrawlURI from also doing link extraction but default behavior should
768      * be to not run if link extraction has already been done.
769      *
770      * <p>There is an onus on link extractors to set this flag if they have
771      * run.
772      *
773      * <p>The only extractor of the default Heritrix set that does not
774      * respect this flag is
775      * {@link org.archive.crawler.extractor.ExtractorHTTP}.
776      * It runs against HTTP headers, not the document content.
777      *
778      * @return True if a processor has performed link extraction on this
779      * CrawlURI
780      *
781      * @see #linkExtractorFinished()
782      */

783     public boolean hasBeenLinkExtracted(){
784         return linkExtractorFinished;
785     }
786
787     /**
788      * Note that link extraction has been performed on this CrawlURI. A processor
789      * doing link extraction should invoke this method once it has finished its
790      * work. It should invoke it even if no links are extracted. It should only
791      * invoke this method if the link extraction was performed on the document
792      * body (not the HTTP headers etc.).
793      *
794      * @see #hasBeenLinkExtracted()
795      */

796     public void linkExtractorFinished() {
797         linkExtractorFinished = true;
798         if(discardedOutlinks>0) {
799             addAnnotation("dol:"+discardedOutlinks);
800         }
801     }
802
803     /**
804      * Notify CrawlURI it is about to be logged; opportunity
805      * for self-annotation
806      */

807     public void aboutToLog() {
808         if (fetchAttempts>1) {
809             addAnnotation(fetchAttempts+"t");
810         }
811     }
812
813     /**
814      * Get the http recorder associated with this uri.
815      *
816      * @return Returns the httpRecorder. May be null but its set early in
817      * FetchHttp so there is an issue if its null.
818      */

819     public HttpRecorder getHttpRecorder() {
820         return httpRecorder;
821     }
822
823     /**
824      * Set the http recorder to be associated with this uri.
825      *
826      * @param httpRecorder The httpRecorder to set.
827      */

828     public void setHttpRecorder(HttpRecorder httpRecorder) {
829         this.httpRecorder = httpRecorder;
830     }
831
832     /**
833      * Return true if this is a http transaction.
834      *
835      * TODO: Compound this and {@link #isPost()} method so that there is one
836      * place to go to find out if get http, post http, ftp, dns.
837      *
838      * @return True if this is a http transaction.
839      */

840     public boolean isHttpTransaction() {
841         return containsKey(A_HTTP_TRANSACTION);
842     }
843
844     /**
845      * Clean up after a run through the processing chain.
846      *
847      * Called on the end of processing chain by Frontier#finish. Null out any
848      * state gathered during processing.
849      */

850     public void processingCleanup() {
851         this.httpRecorder = null;
852         this.fetchStatus = S_UNATTEMPTED;
853         this.setPrerequisite(false);
854         this.contentSize = UNCALCULATED;
855         this.contentLength = UNCALCULATED;
856         // Clear 'links extracted' flag.
857
this.linkExtractorFinished = false;
858         // Clean the alist of all but registered permanent members.
859
setAList(getPersistentAList());
860     }
861     
862     @SuppressWarnings JavaDoc("deprecation")
863     protected AList getPersistentAList() {
864         AList newAList = new HashtableAList();
865         // copy declared persistent keys
866
if(alistPersistentMember!=null && alistPersistentMember.size() > 0) {
867             newAList.copyKeysFrom(alistPersistentMember.iterator(), getAList());
868         }
869         // also copy declared 'heritable' keys
870
List JavaDoc heritableKeys = (List JavaDoc) getObject(A_HERITABLE_KEYS);
871         if(heritableKeys!=null) {
872             newAList.copyKeysFrom(heritableKeys.iterator(), getAList());
873         }
874         return newAList;
875     }
876
877     /**
878      * Make a <code>CrawlURI</code> from the passed <code>CandidateURI</code>.
879      *
880      * Its safe to pass a CrawlURI instance. In this case we just return it
881      * as a result. Otherwise, we create new CrawlURI instance.
882      *
883      * @param caUri Candidate URI.
884      * @param ordinal
885      * @return A crawlURI made from the passed CandidateURI.
886      */

887     public static CrawlURI from(CandidateURI caUri, long ordinal) {
888         return (caUri instanceof CrawlURI)?
889             (CrawlURI)caUri: new CrawlURI(caUri, ordinal);
890     }
891
892     /**
893      * @param avatars Credential avatars to save off.
894      */

895     private void setCredentialAvatars(Set JavaDoc avatars) {
896         putObject(A_CREDENTIAL_AVATARS_KEY, avatars);
897     }
898
899     /**
900      * @return Credential avatars. Null if none set.
901      */

902     @SuppressWarnings JavaDoc("unchecked")
903     public Set JavaDoc<CredentialAvatar> getCredentialAvatars() {
904         return (Set JavaDoc)getObject(A_CREDENTIAL_AVATARS_KEY);
905     }
906
907     /**
908      * @return True if there are avatars attached to this instance.
909      */

910     public boolean hasCredentialAvatars() {
911         return getCredentialAvatars() != null &&
912             getCredentialAvatars().size() > 0;
913     }
914
915     /**
916      * Add an avatar.
917      *
918      * We do lazy instantiation.
919      *
920      * @param ca Credential avatar to add to set of avatars.
921      */

922     public void addCredentialAvatar(CredentialAvatar ca) {
923         Set JavaDoc<CredentialAvatar> avatars = getCredentialAvatars();
924         if (avatars == null) {
925             avatars = new HashSet JavaDoc<CredentialAvatar>();
926             setCredentialAvatars(avatars);
927         }
928         avatars.add(ca);
929     }
930
931     /**
932      * Remove all credential avatars from this crawl uri.
933      */

934     public void removeCredentialAvatars() {
935         if (hasCredentialAvatars()) {
936             remove(A_CREDENTIAL_AVATARS_KEY);
937         }
938     }
939
940     /**
941      * Remove all credential avatars from this crawl uri.
942      * @param ca Avatar to remove.
943      * @return True if we removed passed parameter. False if no operation
944      * performed.
945      */

946     public boolean removeCredentialAvatar(CredentialAvatar ca) {
947         boolean result = false;
948         Set JavaDoc avatars = getCredentialAvatars();
949         if (avatars != null && avatars.size() > 0) {
950             result = avatars.remove(ca);
951         }
952         return result;
953     }
954
955     /**
956      * Ask this URI if it was a success or not.
957      *
958      * Only makes sense to call this method after execution of
959      * HttpMethod#execute. Regard any status larger then 0 as success
960      * except for below caveat regarding 401s. Use {@link #is2XXSuccess()} if
961      * looking for a status code in the 200 range.
962      *
963      * <p>401s caveat: If any rfc2617 credential data present and we got a 401
964      * assume it got loaded in FetchHTTP on expectation that we're to go around
965      * the processing chain again. Report this condition as a failure so we
966      * get another crack at the processing chain only this time we'll be making
967      * use of the loaded credential data.
968      *
969      * @return True if ths URI has been successfully processed.
970      * @see #is2XXSuccess()
971      */

972     public boolean isSuccess() {
973         boolean result = false;
974         int statusCode = this.fetchStatus;
975         if (statusCode == HttpStatus.SC_UNAUTHORIZED &&
976             hasRfc2617CredentialAvatar()) {
977             result = false;
978         } else {
979             result = (statusCode > 0);
980         }
981         return result;
982     }
983     
984     /**
985      * @return True if status code is in the 2xx range.
986      * @see #isSuccess()
987      */

988     public boolean is2XXSuccess() {
989         return this.fetchStatus >= 200 && this.fetchStatus < 300;
990     }
991
992     /**
993      * @return True if we have an rfc2617 payload.
994      */

995     public boolean hasRfc2617CredentialAvatar() {
996         boolean result = false;
997         Set JavaDoc avatars = getCredentialAvatars();
998         if (avatars != null && avatars.size() > 0) {
999             for (Iterator JavaDoc i = avatars.iterator(); i.hasNext();) {
1000                if (((CredentialAvatar)i.next()).
1001                    match(Rfc2617Credential.class)) {
1002                    result = true;
1003                    break;
1004                }
1005            }
1006        }
1007        return result;
1008    }
1009
1010    /**
1011     * Set whether this URI should be fetched by sending a HTTP POST request.
1012     * Else a HTTP GET request will be used.
1013     *
1014     * @param b Set whether this curi is to be POST'd. Else its to be GET'd.
1015     */

1016    public void setPost(boolean b) {
1017        this.post = b;
1018    }
1019
1020    /**
1021     * Returns true if this URI should be fetched by sending a HTTP POST request.
1022     *
1023     *
1024     * TODO: Compound this and {@link #isHttpTransaction()} method so that there
1025     * is one place to go to find out if get http, post http, ftp, dns.
1026     *
1027     * @return Returns is this CrawlURI instance is to be posted.
1028     */

1029    public boolean isPost() {
1030        return this.post;
1031    }
1032
1033    /**
1034     * Set the retained content-digest value (usu. SHA1).
1035     *
1036     * @param digestValue
1037     * @deprecated Use {@link #setContentDigest(String scheme, byte[])}
1038     */

1039    public void setContentDigest(byte[] digestValue) {
1040        setContentDigest("SHA1", digestValue);
1041    }
1042    
1043    public void setContentDigest(final String JavaDoc scheme,
1044            final byte [] digestValue) {
1045        this.contentDigest = digestValue;
1046        this.contentDigestScheme = scheme;
1047    }
1048    
1049    public String JavaDoc getContentDigestSchemeString() {
1050        return this.contentDigestScheme + ":" + getContentDigestString();
1051    }
1052
1053    /**
1054     * Return the retained content-digest value, if any.
1055     *
1056     * @return Digest value.
1057     */

1058    public Object JavaDoc getContentDigest() {
1059        return contentDigest;
1060    }
1061    
1062    public String JavaDoc getContentDigestString() {
1063        return Base32.encode(this.contentDigest);
1064    }
1065
/**
 * 'Holder' assigned by some enclosing/queueing facility; see
 * setHolder/getHolder. Declared transient.
 */
1066    transient Object JavaDoc holder;
/**
 * 'Holder key' assigned by some enclosing/queueing facility; see
 * setHolderKey/getHolderKey. Declared transient.
 */
1067    transient Object JavaDoc holderKey;
1068
1069    /**
1070     * Remember a 'holder' to which some enclosing/queueing
1071     * facility has assigned this CrawlURI
1072     * .
1073     * @param obj
1074     */

1075    public void setHolder(Object JavaDoc obj) {
1076        holder=obj;
1077    }
1078
1079    /**
1080     * Return the 'holder' for the convenience of
1081     * an external facility.
1082     *
1083     * @return holder
1084     */

1085    public Object JavaDoc getHolder() {
1086        return holder;
1087    }
1088
1089    /**
1090     * Remember a 'holderKey' which some enclosing/queueing
1091     * facility has assigned this CrawlURI
1092     * .
1093     * @param obj
1094     */

1095    public void setHolderKey(Object JavaDoc obj) {
1096        holderKey=obj;
1097    }
1098    /**
1099     * Return the 'holderKey' for convenience of
1100     * an external facility (Frontier).
1101     *
1102     * @return holderKey
1103     */

1104    public Object JavaDoc getHolderKey() {
1105        return holderKey;
1106    }
1107
1108    /**
1109     * Get the ordinal (serial number) assigned at creation.
1110     *
1111     * @return ordinal
1112     */

1113    public long getOrdinal() {
1114        return ordinal;
1115    }
1116
1117    /** Spot for an integer cost to be placed by an external facility (frontier).
 * The cost is truncated to 8 bits at times, so it should not exceed 255.
 * Initialized to UNCALCULATED; read and written through
 * getHolderCost/setHolderCost. */

1119    int holderCost = UNCALCULATED;
1120    /**
1121     * Return the 'holderCost' for convenience of external facility (frontier)
1122     * @return value of holderCost
1123     */

1124    public int getHolderCost() {
1125        return holderCost;
1126    }
1127
1128    /**
1129     * Remember a 'holderCost' which some enclosing/queueing
1130     * facility has assigned this CrawlURI
1131     * @param cost value to remember
1132     */

1133    public void setHolderCost(int cost) {
1134        holderCost = cost;
1135    }
1136
1137    /**
 * All discovered outbound Links (navlinks, embeds, etc.).
 * Can contain Link instances, CandidateURI instances, or both.
 * The LinksScoper processor converts Link instances in this collection
 * to CandidateURI instances.
 *
 * Declared transient: the custom writeObject/readObject methods of this
 * class write it explicitly, storing an empty collection as null and
 * recreating an empty HashSet on read.
 */

1143    transient Collection JavaDoc<Object JavaDoc> outLinks = new HashSet JavaDoc<Object JavaDoc>();
1144    
1145    /**
1146     * Returns discovered links. The returned collection might be empty if
1147     * no links were discovered, or if something like LinksScoper promoted
1148     * the links to CandidateURIs.
1149     *
1150     * Elements can be removed from the returned collection, but not added.
1151     * To add a discovered link, use one of the createAndAdd methods or
1152     * {@link #getOutObjects()}.
1153     *
1154     * @return Collection of all discovered outbound Links
1155     */

1156    public Collection JavaDoc<Link> getOutLinks() {
1157        return Transform.subclasses(outLinks, Link.class);
1158    }
1159    
1160    /**
1161     * Returns discovered candidate URIs. The returned collection will be
1162     * emtpy until something like LinksScoper promotes discovered Links
1163     * into CandidateURIs.
1164     *
1165     * Elements can be removed from the returned collection, but not added.
1166     * To add a candidate URI, use {@link #replaceOutlinks(Collection)} or
1167     * {@link #getOutObjects}.
1168     *
1169     * @return Collection of candidate URIs
1170     */

1171    public Collection JavaDoc<CandidateURI> getOutCandidates() {
1172        return Transform.subclasses(outLinks, CandidateURI.class);
1173    }
1174    
1175    
1176    /**
1177     * Returns all of the outbound objects. The returned Collection will
1178     * contain Link instances, or CandidateURI instances, or both.
1179     *
1180     * @return the collection of Links and/or CandidateURIs
1181     */

1182    public Collection JavaDoc<Object JavaDoc> getOutObjects() {
1183        return outLinks;
1184    }
1185    
1186    /**
1187     * Add a discovered Link, unless it would exceed the max number
1188     * to accept. (If so, increment discarded link counter.)
1189     *
1190     * @param link the Link to add
1191     */

1192    public void addOutLink(Link link) {
1193        if (outLinks.size() < MAX_OUTLINKS) {
1194            outLinks.add(link);
1195        } else {
1196            // note & discard
1197
discardedOutlinks++;
1198        }
1199    }
1200    
1201    public void clearOutlinks() {
1202        this.outLinks.clear();
1203    }
1204    
1205    /**
1206     * Replace current collection of links w/ passed list.
1207     * Used by Scopers adjusting the list of links (removing those
1208     * not in scope and promoting Links to CandidateURIs).
1209     *
1210     * @param a collection of CandidateURIs replacing any previously
1211     * existing outLinks or outCandidates
1212     */

1213    public void replaceOutlinks(Collection JavaDoc<CandidateURI> links) {
1214        clearOutlinks();
1215        this.outLinks.addAll(links);
1216    }
1217    
1218    
1219    /**
1220     * @return Count of outlinks.
1221     */

1222    public int outlinksSize() {
1223        return this.outLinks.size();
1224    }
1225
1226    /**
1227     * Convenience method for creating a Link discovered at this URI
1228     * with the given string and context
1229     *
1230     * @param url
1231     * String to use to create Link
1232     * @param context
1233     * CharSequence context to use
1234     * @param hopType
1235     * @return Link.
1236     * @throws URIException
1237     * if Link UURI cannot be constructed
1238     */

1239    public Link createLink(String JavaDoc url, CharSequence JavaDoc context,
1240            char hopType) throws URIException {
1241        return new Link(getUURI(), UURIFactory.getInstance(getUURI(),
1242                url), context, hopType);
1243    }
1244    
1245    /**
1246     * Convenience method for creating a Link with the given string and
1247     * context
1248     *
1249     * @param url
1250     * String to use to create Link
1251     * @param context
1252     * CharSequence context to use
1253     * @param hopType
1254     * @throws URIException
1255     * if Link UURI cannot be constructed
1256     */

1257    public void createAndAddLink(String JavaDoc url, CharSequence JavaDoc context,
1258            char hopType) throws URIException {
1259        addOutLink(createLink(url, context, hopType));
1260    }
1261
1262    /**
1263     * Convenience method for creating a Link with the given string and
1264     * context, relative to a previously set base HREF if available (or
1265     * relative to the current CrawlURI if no other base has been set)
1266     *
1267     * @param url String URL to add as destination of link
1268     * @param context String context where link was discovered
1269     * @param hopType char hop-type indicator
1270     * @throws URIException
1271     */

1272    public void createAndAddLinkRelativeToBase(String JavaDoc url,
1273            CharSequence JavaDoc context, char hopType) throws URIException {
1274        addOutLink(new Link(getUURI(), UURIFactory.getInstance(
1275                getBaseURI(), url), context, hopType));
1276    }
1277    
1278    /**
1279     * Convenience method for creating a Link with the given string and
1280     * context, relative to this CrawlURI's via UURI if available. (If
1281     * a via is not available, falls back to using
1282     * #createAndAddLinkRelativeToBase.)
1283     *
1284     * @param url String URL to add as destination of link
1285     * @param context String context where link was discovered
1286     * @param hopType char hop-type indicator
1287     * @throws URIException
1288     */

1289    public void createAndAddLinkRelativeToVia(String JavaDoc url,
1290            CharSequence JavaDoc context, char hopType) throws URIException {
1291        if(getVia()!=null) {
1292            addOutLink(new Link(getUURI(), UURIFactory.getInstance(
1293                getVia(), url), context, hopType));
1294        } else {
1295            // if no 'via', fall back to base/self
1296
createAndAddLinkRelativeToBase(url,context,hopType);
1297        }
1298    }
1299    
1300    /**
1301     * Set the (HTML) Base URI used for derelativizing internal URIs.
1302     *
1303     * @param baseHref String base href to use
1304     * @throws URIException if supplied string cannot be interpreted as URI
1305     */

1306    public void setBaseURI(String JavaDoc baseHref) throws URIException {
1307        putObject(A_HTML_BASE, UURIFactory.getInstance(baseHref));
1308    }
1309      
1310    /**
1311     * Get the (HTML) Base URI used for derelativizing internal URIs.
1312     *
1313     * @return UURI base URI previously set
1314     */

1315    public UURI getBaseURI() {
1316        if (!containsKey(A_HTML_BASE)) {
1317            return getUURI();
1318        }
1319        return (UURI)getObject(A_HTML_BASE);
1320    }
1321    
1322    /**
1323     * Add the key of alist items you want to persist across
1324     * processings.
1325     * @param key Key to add.
1326     */

1327    public static void addAlistPersistentMember(Object JavaDoc key) {
1328        alistPersistentMember.add(key);
1329    }
1330    
1331    /**
1332     * @param key Key to remove.
1333     * @return True if list contained the element.
1334     */

1335    public static boolean removeAlistPersistentMember(Object JavaDoc key) {
1336        return alistPersistentMember.remove(key);
1337    }
1338
1339    /**
1340     * Custom serialization writing an empty 'outLinks' as null. Estimated
1341     * to save ~20 bytes in serialized form.
1342     *
1343     * @param stream
1344     * @throws IOException
1345     */

1346    private void writeObject(ObjectOutputStream JavaDoc stream) throws IOException JavaDoc {
1347        stream.defaultWriteObject();
1348        stream.writeObject((outLinks.isEmpty()) ? null : outLinks);
1349    }
1350
1351    /**
1352     * Custom deserialization recreating empty HashSet from null in 'outLinks'
1353     * slot.
1354     *
1355     * @param stream
1356     * @throws IOException
1357     * @throws ClassNotFoundException
1358     */

1359    private void readObject(ObjectInputStream JavaDoc stream) throws IOException JavaDoc,
1360            ClassNotFoundException JavaDoc {
1361        stream.defaultReadObject();
1362        @SuppressWarnings JavaDoc("unchecked")
1363        HashSet JavaDoc<Object JavaDoc> ol = (HashSet JavaDoc<Object JavaDoc>) stream.readObject();
1364        outLinks = (ol == null) ? new HashSet JavaDoc<Object JavaDoc>() : ol;
1365    }
1366
1367
1368}
1369
Popular Tags