CrawlURI


1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * CrawlURI.java
20   * Created on Apr 16, 2003
21   *
22   * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/datamodel/CrawlURI.java,v 1.101.4.1 2007/01/13 01:31:09 stack-sf Exp $
23   */
24  package org.archive.crawler.datamodel;
25  
26  import java.io.IOException  ;
27  import java.io.ObjectInputStream  ;
28  import java.io.ObjectOutputStream  ;
29  import java.util.ArrayList  ;
30  import java.util.Collection  ;
31  import java.util.HashSet  ;
32  import java.util.Iterator  ;
33  import java.util.List  ;
34  import java.util.Set  ;
35  import java.util.concurrent.CopyOnWriteArrayList  ;
36  
37  import org.apache.commons.httpclient.HttpStatus;
38  import org.apache.commons.httpclient.URIException;
39  import org.archive.crawler.datamodel.credential.CredentialAvatar;
40  import org.archive.crawler.datamodel.credential.Rfc2617Credential;
41  import org.archive.crawler.extractor.Link;
42  import org.archive.crawler.framework.Processor;
43  import org.archive.crawler.framework.ProcessorChain;
44  import org.archive.crawler.util.Transform;
45  import org.archive.net.UURI;
46  import org.archive.net.UURIFactory;
47  import org.archive.util.Base32;
48  import org.archive.util.HttpRecorder;
49  
50  import st.ata.util.AList;
51  import st.ata.util.HashtableAList;
52  
53  
54  /**
55   * Represents a candidate URI and the associated state it
56   * collects as it is crawled.
57   *
58   * <p>Core state is in instance variables but a flexible
59   * attribute list is also available. Use this 'bucket' to carry
60   * custom processing extracted data and state across CrawlURI
61   * processing.  See the {@link #putString(String, String)},
62   * {@link #getString(String)}, etc. 
63   *
64   * @author Gordon Mohr
65   */
66  public class CrawlURI extends CandidateURI
67  implements FetchStatusCodes {
68  
69      private static final long serialVersionUID = 7874096757350100472L;
70  
71      public static final int UNCALCULATED = -1;
72      
73      // INHERITED FROM CANDIDATEURI
74      // uuri: core identity: the "usable URI" to be crawled
75      // isSeed
76      // inScopeVersion
77      // pathFromSeed
78      // via
79  
80      // Processing progress
81      transient private Processor nextProcessor;
82      transient private ProcessorChain nextProcessorChain;
83      private int fetchStatus = 0;    // default to unattempted
84      private int deferrals = 0;     // count of postponements for prerequisites
85      private int fetchAttempts = 0; // the number of fetch attempts that have been made
86      transient private int threadNumber;
87  
88      // dynamic context
89      private int linkHopCount = UNCALCULATED; // from seeds
90      private int embedHopCount = UNCALCULATED; // from a sure link; reset upon any link traversal
91  
92      // User agent to masquerade as when crawling this URI. If null, globals should be used
93      private String   userAgent = null;
94  
95      // Once a link extractor has finished processing this curi this will be
96      // set as true
97      transient private boolean linkExtractorFinished = false;
98  
99      /**
100      * Protection against outlink overflow.
101      * Change value by setting alternate maximum in heritrix.properties.
102      */
103     public static final int MAX_OUTLINKS = Integer.
104         parseInt(System.getProperty(CrawlURI.class.getName() + ".maxOutLinks",
105             "6000"));
106     
107     transient private int discardedOutlinks = 0; 
108     
109 ////////////////////////////////////////////////////////////////////
110     private long contentSize = UNCALCULATED;
111     private long contentLength = UNCALCULATED;
112 
113     /**
114      * Current http recorder.
115      *
116      * Gets set upon successful request.  Reset at start of processing chain.
117      */
118     private transient HttpRecorder httpRecorder = null;
119 
120     /**
121      * Content type of a successfully fetched URI.
122      *
123      * May be null even on successfully fetched URI.
124      */
125     private String   contentType = null;
126 
    /**
     * True if this CrawlURI has been deemed a prerequisite by the
     * {@link org.archive.crawler.prefetch.PreconditionEnforcer}.
     *
     * This flag is used at least inside the precondition enforcer so that
     * subsequent prerequisite tests know to let this CrawlURI through because
     * it is a prerequisite needed by an earlier prerequisite test (e.g. if
     * this is a robots.txt, then the subsequent login credentials prereq
     * test must not throw it out because it is not a login curi).
     */
137     private boolean prerequisite = false;
138 
139     /**
140      * Set to true if this <code>curi</code> is to be POST'd rather than GET-d.
141      */
142     private boolean post = false;
143 
144     /** 
145      * Monotonically increasing number within a crawl;
146      * useful for tending towards breadth-first ordering.
147      * Will sometimes be truncated to 48 bits, so behavior
148      * over 281 trillion instantiated CrawlURIs may be 
149      * buggy
150      */
151     protected long ordinal;
152 
153     /**
154      * Cache of this candidate uuri as a string.
155      *
156      * Profiling shows us spending about 1-2% of total elapsed time in
157      * toString.
158      */
159     private String   cachedCrawlURIString = null;
160     
    /**
     * List holding the keys of alist members that persist across URI
     * processings. Any key mentioned in this list will not be cleared out
     * at the end of a pass down the processing chain.
     */
166     private static final List  <Object  > alistPersistentMember
167      = new CopyOnWriteArrayList  <Object  >(
168             new String   [] {A_CREDENTIAL_AVATARS_KEY});
169 
170     /**
171      * A digest (hash, usually SHA1) of retrieved content-body. 
172      * 
173      */
174     private byte[] contentDigest = null;
175     private String   contentDigestScheme = null;
176 
177 
178     /**
179      * Create a new instance of CrawlURI from a {@link UURI}.
180      *
181      * @param uuri the UURI to base this CrawlURI on.
182      */
183     public CrawlURI(UURI uuri) {
184         super(uuri);
185     }
186 
187     /**
188      * Create a new instance of CrawlURI from a {@link CandidateURI}
189      *
190      * @param caUri the CandidateURI to base this CrawlURI on.
191      * @param o Monotonically increasing number within a crawl.
192      */
193     @SuppressWarnings  ("deprecation")
194     public CrawlURI(CandidateURI caUri, long o) {
195         super(caUri.getUURI(), caUri.getPathFromSeed(), caUri.getVia(),
196             caUri.getViaContext());
197         ordinal = o;
198         setIsSeed(caUri.isSeed());
199         setSchedulingDirective(caUri.getSchedulingDirective());
200         setAList(caUri.getAList());
201     }
202 
203     /**
204      * Takes a status code and converts it into a human readable string.
205      *
206      * @param code the status code
207      * @return a human readable string declaring what the status code is.
208      */
209     public static String   fetchStatusCodesToString(int code){
210         switch(code){
211             // DNS
212             case S_DNS_SUCCESS : return "DNS-1-OK";
213             // HTTP Informational 1xx
214             case 100  : return "HTTP-100-Info-Continue";
215             case 101  : return "HTTP-101-Info-Switching Protocols";
216             // HTTP Successful 2xx
217             case 200  : return "HTTP-200-Success-OK";
218             case 201  : return "HTTP-201-Success-Created";
219             case 202  : return "HTTP-202-Success-Accepted";
220             case 203  : return "HTTP-203-Success-Non-Authoritative";
221             case 204  : return "HTTP-204-Success-No Content ";
222             case 205  : return "HTTP-205-Success-Reset Content";
223             case 206  : return "HTTP-206-Success-Partial Content";
224             // HTTP Redirection 3xx
225             case 300  : return "HTTP-300-Redirect-Multiple Choices";
226             case 301  : return "HTTP-301-Redirect-Moved Permanently";
227             case 302  : return "HTTP-302-Redirect-Found";
228             case 303  : return "HTTP-303-Redirect-See Other";
229             case 304  : return "HTTP-304-Redirect-Not Modified";
230             case 305  : return "HTTP-305-Redirect-Use Proxy";
231             case 307  : return "HTTP-307-Redirect-Temporary Redirect";
232             // HTTP Client Error 4xx
233             case 400  : return "HTTP-400-ClientErr-Bad Request";
234             case 401  : return "HTTP-401-ClientErr-Unauthorized";
235             case 402  : return "HTTP-402-ClientErr-Payment Required";
236             case 403  : return "HTTP-403-ClientErr-Forbidden";
237             case 404  : return "HTTP-404-ClientErr-Not Found";
238             case 405  : return "HTTP-405-ClientErr-Method Not Allowed";
239             case 407  : return "HTTP-406-ClientErr-Not Acceptable";
240             case 408  : return "HTTP-407-ClientErr-Proxy Authentication Required";
241             case 409  : return "HTTP-408-ClientErr-Request Timeout";
242             case 410  : return "HTTP-409-ClientErr-Conflict";
243             case 406  : return "HTTP-410-ClientErr-Gone";
244             case 411  : return "HTTP-411-ClientErr-Length Required";
245             case 412  : return "HTTP-412-ClientErr-Precondition Failed";
246             case 413  : return "HTTP-413-ClientErr-Request Entity Too Large";
247             case 414  : return "HTTP-414-ClientErr-Request-URI Too Long";
248             case 415  : return "HTTP-415-ClientErr-Unsupported Media Type";
249             case 416  : return "HTTP-416-ClientErr-Requested Range Not Satisfiable";
250             case 417  : return "HTTP-417-ClientErr-Expectation Failed";
251             // HTTP Server Error 5xx
252             case 500  : return "HTTP-500-ServerErr-Internal Server Error";
253             case 501  : return "HTTP-501-ServerErr-Not Implemented";
254             case 502  : return "HTTP-502-ServerErr-Bad Gateway";
255             case 503  : return "HTTP-503-ServerErr-Service Unavailable";
256             case 504  : return "HTTP-504-ServerErr-Gateway Timeout";
257             case 505  : return "HTTP-505-ServerErr-HTTP Version Not Supported";
258             // Heritrix internal codes (all negative numbers
259             case S_BLOCKED_BY_USER:
260                 return "Heritrix(" + S_BLOCKED_BY_USER + ")-Blocked by user";
261             case S_BLOCKED_BY_CUSTOM_PROCESSOR:
262                 return "Heritrix(" + S_BLOCKED_BY_CUSTOM_PROCESSOR +
263                 ")-Blocked by custom prefetch processor";
264             case S_DELETED_BY_USER:
265                 return "Heritrix(" + S_DELETED_BY_USER + ")-Deleted by user";
266             case S_CONNECT_FAILED:
267                 return "Heritrix(" + S_CONNECT_FAILED + ")-Connection failed";
268             case S_CONNECT_LOST:
269                 return "Heritrix(" + S_CONNECT_LOST + ")-Connection lost";
270             case S_DEEMED_CHAFF:
271                 return "Heritrix(" + S_DEEMED_CHAFF + ")-Deemed chaff";
272             case S_DEFERRED:
273                 return "Heritrix(" + S_DEFERRED + ")-Deferred";
274             case S_DOMAIN_UNRESOLVABLE:
275                 return "Heritrix(" + S_DOMAIN_UNRESOLVABLE
276                         + ")-Domain unresolvable";
277             case S_OUT_OF_SCOPE:
278                 return "Heritrix(" + S_OUT_OF_SCOPE + ")-Out of scope";
279             case S_DOMAIN_PREREQUISITE_FAILURE:
280                 return "Heritrix(" + S_DOMAIN_PREREQUISITE_FAILURE
281                         + ")-Domain prerequisite failure";
282             case S_ROBOTS_PREREQUISITE_FAILURE:
283                 return "Heritrix(" + S_ROBOTS_PREREQUISITE_FAILURE
284                         + ")-Robots prerequisite failure";
285             case S_OTHER_PREREQUISITE_FAILURE:
286                 return "Heritrix(" + S_OTHER_PREREQUISITE_FAILURE
287                         + ")-Other prerequisite failure";
288             case S_PREREQUISITE_UNSCHEDULABLE_FAILURE:
289                 return "Heritrix(" + S_PREREQUISITE_UNSCHEDULABLE_FAILURE
290                         + ")-Prerequisite unschedulable failure";
291             case S_ROBOTS_PRECLUDED:
292                 return "Heritrix(" + S_ROBOTS_PRECLUDED + ")-Robots precluded";
293             case S_RUNTIME_EXCEPTION:
294                 return "Heritrix(" + S_RUNTIME_EXCEPTION
295                         + ")-Runtime exception";
296             case S_SERIOUS_ERROR:
297                 return "Heritrix(" + S_SERIOUS_ERROR + ")-Serious error";
298             case S_TIMEOUT:
299                 return "Heritrix(" + S_TIMEOUT + ")-Timeout";
300             case S_TOO_MANY_EMBED_HOPS:
301                 return "Heritrix(" + S_TOO_MANY_EMBED_HOPS
302                         + ")-Too many embed hops";
303             case S_TOO_MANY_LINK_HOPS:
304                 return "Heritrix(" + S_TOO_MANY_LINK_HOPS
305                         + ")-Too many link hops";
306             case S_TOO_MANY_RETRIES:
307                 return "Heritrix(" + S_TOO_MANY_RETRIES + ")-Too many retries";
308             case S_UNATTEMPTED:
309                 return "Heritrix(" + S_UNATTEMPTED + ")-Unattempted";
310             case S_UNFETCHABLE_URI:
311                 return "Heritrix(" + S_UNFETCHABLE_URI + ")-Unfetchable URI";
312             case S_PROCESSING_THREAD_KILLED:
313                 return "Heritrix(" + S_PROCESSING_THREAD_KILLED + ")-" +
314                     "Processing thread killed";
315             // Unknown return code
316             default : return Integer.toString(code);
317         }
318     }
319 
320 
321     /**
322      * Return the overall/fetch status of this CrawlURI for its
323      * current trip through the processing loop.
324      *
325      * @return a value from FetchStatusCodes
326      */
327     public int getFetchStatus(){
328         return fetchStatus;
329     }
330 
331     /**
332      * Set the overall/fetch status of this CrawlURI for
333      * its current trip through the processing loop.
334      *
335      * @param newstatus a value from FetchStatusCodes
336      */
337     public void setFetchStatus(int newstatus){
338         fetchStatus = newstatus;
339     }
340 
341     /**
342      * Get the number of attempts at getting the document referenced by this
343      * URI.
344      *
345      * @return the number of attempts at getting the document referenced by this
346      *         URI.
347      */
348     public int getFetchAttempts() {
349         return fetchAttempts;
350     }
351 
352     /**
353      * Increment the number of attempts at getting the document referenced by
354      * this URI.
355      *
356      * @return the number of attempts at getting the document referenced by this
357      *         URI.
358      */
359     public int incrementFetchAttempts() {
360         // TODO: rename, this is actually processing-loop-attempts
361         return fetchAttempts++;
362     }
363 
364     /**
365      * Reset fetchAttempts counter.
366      */
367     public void resetFetchAttempts() {
368         this.fetchAttempts = 0;
369     }
370 
371     /**
372      * Reset deferrals counter.
373      */
374     public void resetDeferrals() {
375         this.deferrals = 0;
376     }
377 
378     /**
379      * Get the next processor to process this URI.
380      *
381      * @return the processor that should process this URI next.
382      */
383     public Processor nextProcessor() {
384         return nextProcessor;
385     }
386 
387     /**
388      * Get the processor chain that should be processing this URI after the
389      * current chain is finished with it.
390      *
391      * @return the next processor chain to process this URI.
392      */
393     public ProcessorChain nextProcessorChain() {
394         return nextProcessorChain;
395     }
396 
397     /**
398      * Set the next processor to process this URI.
399      *
400      * @param processor the next processor to process this URI.
401      */
402     public void setNextProcessor(Processor processor) {
403         nextProcessor = processor;
404     }
405 
406     /**
407      * Set the next processor chain to process this URI.
408      *
409      * @param nextProcessorChain the next processor chain to process this URI.
410      */
411     public void setNextProcessorChain(ProcessorChain nextProcessorChain) {
412         this.nextProcessorChain = nextProcessorChain;
413     }
414 
415     /**
416      * Do all actions associated with setting a <code>CrawlURI</code> as
417      * requiring a prerequisite.
418      *
419      * @param lastProcessorChain Last processor chain reference.  This chain is
420      * where this <code>CrawlURI</code> goes next.
421      * @param preq Object to set a prerequisite.
422      * @throws URIException
423      */
424     public void markPrerequisite(String   preq,
425             ProcessorChain lastProcessorChain) throws URIException {
426         Link link = createLink(preq,Link.PREREQ_MISC,Link.PREREQ_HOP);
427         setPrerequisiteUri(link);
428         incrementDeferrals();
429         setFetchStatus(S_DEFERRED);
430         skipToProcessorChain(lastProcessorChain);
431     }
432 
433     /**
434      * Set a prerequisite for this URI.
435      * <p>
436      * A prerequisite is a URI that must be crawled before this URI can be
437      * crawled.
438      *
439      * @param link Link to set as prereq.
440      */
441     public void setPrerequisiteUri(Object   link) {
442         putObject(A_PREREQUISITE_URI, link);
443     }
444 
445     /**
446      * Get the prerequisite for this URI.
447      * <p>
448      * A prerequisite is a URI that must be crawled before this URI can be
449      * crawled.
450      *
451      * @return the prerequisite for this URI or null if no prerequisite.
452      */
453     public Object   getPrerequisiteUri() {
454         return getObject(A_PREREQUISITE_URI);
455     }
456     
457     /**
458      * @return True if this CrawlURI has a prerequisite.
459      */
460     public boolean hasPrerequisiteUri() {
461         return containsKey(A_PREREQUISITE_URI);
462     }
463 
464     /**
465      * Returns true if this CrawlURI is a prerequisite.
466      *
467      * @return true if this CrawlURI is a prerequisite.
468      */
469     public boolean isPrerequisite() {
470         return this.prerequisite;
471     }
472 
473     /**
474      * Set if this CrawlURI is itself a prerequisite URI.
475      *
476      * @param prerequisite True if this CrawlURI is itself a prerequiste uri.
477      */
478     public void setPrerequisite(boolean prerequisite) {
479         this.prerequisite = prerequisite;
480     }
481 
482     /**
483      * @return This crawl URI as a string wrapped with 'CrawlURI(' +
484      * ')'.
485      */
486     public String   getCrawlURIString() {
487         if (this.cachedCrawlURIString == null) {
488             synchronized (this) {
489                 if (this.cachedCrawlURIString == null) {
490                     this.cachedCrawlURIString =
491                         "CrawlURI(" + toString() + ")";
492                 }
493             }
494         }
495         return this.cachedCrawlURIString;
496     }
497 
498     /**
499      * Get the content type of this URI.
500      *
501      * @return Fetched URIs content type.  May be null.
502      */
503     public String   getContentType() {
504         return this.contentType;
505     }
506 
507     /**
508      * Set a fetched uri's content type.
509      *
510      * @param ct Contenttype.  May be null.
511      */
512     public void setContentType(String   ct) {
513         this.contentType = ct;
514     }
515 
516     /**
517      * Set the number of the ToeThread responsible for processing this uri.
518      *
519      * @param i the ToeThread number.
520      */
521     public void setThreadNumber(int i) {
522         threadNumber = i;
523     }
524 
525     /**
526      * Get the number of the ToeThread responsible for processing this uri.
527      *
528      * @return the ToeThread number.
529      */
530     public int getThreadNumber() {
531         return threadNumber;
532     }
533 
534     /**
535      * Increment the deferral count.
536      *
537      */
538     public void incrementDeferrals() {
539         deferrals++;
540     }
541 
542     /**
543      * Get the deferral count.
544      *
545      * @return the deferral count.
546      */
547     public int getDeferrals() {
548         return deferrals;
549     }
550 
551     /**
552      * Remove all attributes set on this uri.
553      * <p>
554      * This methods removes the attribute list.
555      */
556     public void stripToMinimal() {
557         clearAList();
558     }
559 
560     /** Get the size in bytes of this URI's content.
561      * This may be set at any time by any class and therefore should not be
562      * trusted. Primarily it exists to ease the calculation of statistics.
563      * @return contentSize
564      */
565     public long getContentSize(){
566         return contentSize;
567     }
568 
569     /**
570      * Make note of a non-fatal error, local to a particular Processor,
571      * which should be logged somewhere, but allows processing to continue.
572      *
573      * This is how you add to the local-error log (the 'localized' in
574      * the below is making an error local rather than global, not
575      * making a swiss-french version of the error.).
576      * 
577      * @param processorName Name of processor the exception was thrown
578      * in.
579      * @param ex Throwable to log.
580      * @param message Extra message to log beyond exception message.
581      */
582     public void addLocalizedError(final String   processorName,
583             final Throwable   ex, final String   message) {
584         List  <LocalizedError> localizedErrors;
585         if (containsKey(A_LOCALIZED_ERRORS)) {
586             @SuppressWarnings  ("unchecked")
587             List  <LocalizedError> temp // to prevent warning on cast
588              = (List  <LocalizedError>) getObject(A_LOCALIZED_ERRORS);
589             localizedErrors = temp;
590         } else {
591             localizedErrors = new ArrayList  <LocalizedError>();
592             putObject(A_LOCALIZED_ERRORS, localizedErrors);
593         }
594 
595         localizedErrors.add(new LocalizedError(processorName, ex, message));
596         addAnnotation("le:" + getClassSimpleName(ex.getClass()) + "@" +
597             processorName);
598     }
599     
600     // TODO: Move to utils.
601     protected String   getClassSimpleName(final Class   c) {
602         String   classname = c.getName();
603         int index = classname.lastIndexOf('.');
604         return ((index > 0 && (index + 1) < classname.length())?
605             classname.substring(index + 1): classname);
606     }
607 
608     /**
609      * Add an annotation: an abbrieviated indication of something special
610      * about this URI that need not be present in every crawl.log line,
611      * but should be noted for future reference. 
612      *
613      * @param annotation the annotation to add; should not contain 
614      * whitespace or a comma
615      */
616     public void addAnnotation(String   annotation) {
617         String   annotations;
618         if(containsKey(A_ANNOTATIONS)) {
619             annotations = getString(A_ANNOTATIONS);
620             annotations += ","+annotation;
621         } else {
622             annotations = annotation;
623         }
624 
625         putString(A_ANNOTATIONS,annotations);
626     }
627     
628     /**
629      * TODO: Implement truncation using booleans rather than as this
630      * ugly String parse.
631      * @return True if fetch was truncated.
632      */
633     public boolean isTruncatedFetch() {
634         return annotationContains(TRUNC_SUFFIX);
635     }
636     
637     public boolean isLengthTruncatedFetch() {
638         return annotationContains(LENGTH_TRUNC);
639     }
640     
641     public boolean isTimeTruncatedFetch() {
642         return annotationContains(TIMER_TRUNC);
643     }
644     
645     public boolean isHeaderTruncatedFetch() {
646         return annotationContains(HEADER_TRUNC);
647     }
648     
649     protected boolean annotationContains(final String   str2Find) {
650         boolean result = false;
651         if (!containsKey(A_ANNOTATIONS)) {
652             return result;
653         }
654         String   annotations = getString(A_ANNOTATIONS);
655         if (annotations != null && annotations.length() > 0) {
656             result = annotations.indexOf(str2Find) >= 0;
657         }
658         return result;
659     }
660 
661     /**
662      * Get the annotations set for this uri.
663      *
664      * @return the annotations set for this uri.
665      */
666     public String   getAnnotations() {
667         return (containsKey(A_ANNOTATIONS))?
668             getString(A_ANNOTATIONS): null;
669     }
670 
671     /**
672      * Get the embeded hop count.
673      *
674      * @return the embeded hop count.
675      */
676     public int getEmbedHopCount() {
677         return embedHopCount;
678     }
679 
680     /**
681      * Get the link hop count.
682      *
683      * @return the link hop count.
684      */
685     public int getLinkHopCount() {
686         return linkHopCount;
687     }
688 
689     /**
690      * Mark this uri as being a seed.
691      *
692      */
693     public void markAsSeed() {
694         linkHopCount = 0;
695         embedHopCount = 0;
696     }
697 
698     /**
699      * Get the user agent to use for crawling this URI.
700      *
701      * If null the global setting should be used.
702      *
703      * @return user agent or null
704      */
705     public String   getUserAgent() {
706         return userAgent;
707     }
708 
709     /**
710      * Set the user agent to use when crawling this URI.
711      *
712      * If not set the global settings should be used.
713      *
714      * @param string user agent to use
715      */
716     public void setUserAgent(String   string) {
717         userAgent = string;
718     }
719 
720     /**
721      * Set which processor should be the next processor to process this uri
722      * instead of using the default next processor.
723      *
724      * @param processorChain the processor chain to skip to.
725      * @param processor the processor in the processor chain to skip to.
726      */
727     public void skipToProcessor(ProcessorChain processorChain,
728             Processor processor) {
729         setNextProcessorChain(processorChain);
730         setNextProcessor(processor);
731     }
732 
733     /**
734      * Set which processor chain should be processing this uri next.
735      *
736      * @param processorChain the processor chain to skip to.
737      */
738     public void skipToProcessorChain(ProcessorChain processorChain) {
739         setNextProcessorChain(processorChain);
740         setNextProcessor(null);
741     }
742 
743     /**
744      * For completed HTTP transactions, the length of the content-body.
745      *
746      * @return For completed HTTP transactions, the length of the content-body.
747      */
748     public long getContentLength() {
749         if (this.contentLength < 0) {
750             this.contentLength = (getHttpRecorder() != null)?
751                 getHttpRecorder().getResponseContentLength(): 0;
752         }
753         return this.contentLength;
754     }
755 
756     /**
757      * @param l Content size.
758      */
759     public void setContentSize(long l) {
760         contentSize = l;
761     }
762 
763     /**
764      * If true then a link extractor has already claimed this CrawlURI and
765      * performed link extraction on the document content. This does not
766      * preclude other link extractors that may have an interest in this
767      * CrawlURI from also doing link extraction but default behavior should
768      * be to not run if link extraction has already been done.
769      * 
770      * <p>There is an onus on link extractors to set this flag if they have
771      * run.
772      * 
773      * <p>The only extractor of the default Heritrix set that does not
774      * respect this flag is
775      * {@link org.archive.crawler.extractor.ExtractorHTTP}.
776      * It runs against HTTP headers, not the document content.
777      * 
778      * @return True if a processor has performed link extraction on this
779      * CrawlURI
780      *
781      * @see #linkExtractorFinished()
782      */
783     public boolean hasBeenLinkExtracted(){
784         return linkExtractorFinished;
785     }
786 
787     /**
788      * Note that link extraction has been performed on this CrawlURI. A processor
789      * doing link extraction should invoke this method once it has finished it's
790      * work. It should invoke it even if no links are extracted. It should only
791      * invoke this method if the link extraction was performed on the document
792      * body (not the HTTP headers etc.).
793      *
794      * @see #hasBeenLinkExtracted()
795      */
796     public void linkExtractorFinished() {
797         linkExtractorFinished = true;
798         if(discardedOutlinks>0) {
799             addAnnotation("dol:"+discardedOutlinks);
800         }
801     }
802 
803     /**
804      * Notify CrawlURI it is about to be logged; opportunity
805      * for self-annotation
806      */
807     public void aboutToLog() {
808         if (fetchAttempts>1) {
809             addAnnotation(fetchAttempts+"t");
810         }
811     }
812 
813     /**
814      * Get the http recorder associated with this uri.
815      *
816      * @return Returns the httpRecorder.  May be null but its set early in
817      * FetchHttp so there is an issue if its null.
818      */
819     public HttpRecorder getHttpRecorder() {
820         return httpRecorder;
821     }
822 
823     /**
824      * Set the http recorder to be associated with this uri.
825      *
826      * @param httpRecorder The httpRecorder to set.
827      */
828     public void setHttpRecorder(HttpRecorder httpRecorder) {
829         this.httpRecorder = httpRecorder;
830     }
831 
832     /**
833      * Return true if this is a http transaction.
834      *
835      * TODO: Compound this and {@link #isPost()} method so that there is one
836      * place to go to find out if get http, post http, ftp, dns.
837      *
838      * @return True if this is a http transaction.
839      */
840     public boolean isHttpTransaction() {
841         return containsKey(A_HTTP_TRANSACTION);
842     }
843 
844     /**
845      * Clean up after a run through the processing chain.
846      *
847      * Called on the end of processing chain by Frontier#finish.  Null out any
848      * state gathered during processing.
849      */
850     public void processingCleanup() {
851         this.httpRecorder = null;
852         this.fetchStatus = S_UNATTEMPTED;
853         this.setPrerequisite(false);
854         this.contentSize = UNCALCULATED;
855         this.contentLength = UNCALCULATED;
856         // Clear 'links extracted' flag.
857         this.linkExtractorFinished = false;
858         // Clean the alist of all but registered permanent members.
859         setAList(getPersistentAList());
860     }
861     
862     @SuppressWarnings  ("deprecation")
863     protected AList getPersistentAList() {
864         AList newAList = new HashtableAList();
865         // copy declared persistent keys
866         if(alistPersistentMember!=null && alistPersistentMember.size() > 0) {
867             newAList.copyKeysFrom(alistPersistentMember.iterator(), getAList());
868         } 
869         // also copy declared 'heritable' keys
870         List   heritableKeys = (List  ) getObject(A_HERITABLE_KEYS);
871         if(heritableKeys!=null) {
872             newAList.copyKeysFrom(heritableKeys.iterator(), getAList());
873         }
874         return newAList;
875     }
876 
877     /**
878      * Make a <code>CrawlURI</code> from the passed <code>CandidateURI</code>.
879      *
880      * Its safe to pass a CrawlURI instance.  In this case we just return it
881      * as a result. Otherwise, we create new CrawlURI instance.
882      *
883      * @param caUri Candidate URI.
884      * @param ordinal
885      * @return A crawlURI made from the passed CandidateURI.
886      */
887     public static CrawlURI from(CandidateURI caUri, long ordinal) {
888         return (caUri instanceof CrawlURI)?
889             (CrawlURI)caUri: new CrawlURI(caUri, ordinal);
890     }
891 
892     /**
893      * @param avatars Credential avatars to save off.
894      */
895     private void setCredentialAvatars(Set   avatars) {
896         putObject(A_CREDENTIAL_AVATARS_KEY, avatars);
897     }
898 
899     /**
900      * @return Credential avatars.  Null if none set.
901      */
902     @SuppressWarnings  ("unchecked")
903     public Set  <CredentialAvatar> getCredentialAvatars() {
904         return (Set  )getObject(A_CREDENTIAL_AVATARS_KEY);
905     }
906 
907     /**
908      * @return True if there are avatars attached to this instance.
909      */
910     public boolean hasCredentialAvatars() {
911         return getCredentialAvatars() != null &&
912             getCredentialAvatars().size() > 0;
913     }
914 
915     /**
916      * Add an avatar.
917      *
918      * We do lazy instantiation.
919      *
920      * @param ca Credential avatar to add to set of avatars.
921      */
922     public void addCredentialAvatar(CredentialAvatar ca) {
923         Set  <CredentialAvatar> avatars = getCredentialAvatars();
924         if (avatars == null) {
925             avatars = new HashSet  <CredentialAvatar>();
926             setCredentialAvatars(avatars);
927         }
928         avatars.add(ca);
929     }
930 
931     /**
932      * Remove all credential avatars from this crawl uri.
933      */
934     public void removeCredentialAvatars() {
935         if (hasCredentialAvatars()) {
936             remove(A_CREDENTIAL_AVATARS_KEY);
937         }
938     }
939 
940     /**
941      * Remove all credential avatars from this crawl uri.
942      * @param ca Avatar to remove.
943      * @return True if we removed passed parameter.  False if no operation
944      * performed.
945      */
946     public boolean removeCredentialAvatar(CredentialAvatar ca) {
947         boolean result = false;
948         Set   avatars = getCredentialAvatars();
949         if (avatars != null && avatars.size() > 0) {
950             result = avatars.remove(ca);
951         }
952         return result;
953     }
954 
955     /**
956      * Ask this URI if it was a success or not.
957      *
958      * Only makes sense to call this method after execution of
959      * HttpMethod#execute. Regard any status larger then 0 as success
960      * except for below caveat regarding 401s.  Use {@link #is2XXSuccess()} if
961      * looking for a status code in the 200 range.
962      *
963      * <p>401s caveat: If any rfc2617 credential data present and we got a 401
964      * assume it got loaded in FetchHTTP on expectation that we're to go around
965      * the processing chain again. Report this condition as a failure so we
966      * get another crack at the processing chain only this time we'll be making
967      * use of the loaded credential data.
968      *
969      * @return True if ths URI has been successfully processed.
970      * @see #is2XXSuccess()
971      */
972     public boolean isSuccess() {
973         boolean result = false;
974         int statusCode = this.fetchStatus;
975         if (statusCode == HttpStatus.SC_UNAUTHORIZED &&
976             hasRfc2617CredentialAvatar()) {
977             result = false;
978         } else {
979             result = (statusCode > 0);
980         }
981         return result;
982     }
983     
984     /**
985      * @return True if status code is in the 2xx range.
986      * @see #isSuccess()
987      */
988     public boolean is2XXSuccess() {
989         return this.fetchStatus >= 200 && this.fetchStatus < 300;
990     }
991 
992     /**
993      * @return True if we have an rfc2617 payload.
994      */
995     public boolean hasRfc2617CredentialAvatar() {
996         boolean result = false;
997         Set   avatars = getCredentialAvatars();
998         if (avatars != null && avatars.size() > 0) {
999             for (Iterator   i = avatars.iterator(); i.hasNext();) {
1000                if (((CredentialAvatar)i.next()).
1001                    match(Rfc2617Credential.class)) {
1002                    result = true;
1003                    break;
1004                }
1005            }
1006        }
1007        return result;
1008    }
1009
1010    /**
1011     * Set whether this URI should be fetched by sending a HTTP POST request.
1012     * Else a HTTP GET request will be used.
1013     *
1014     * @param b Set whether this curi is to be POST'd.  Else its to be GET'd.
1015     */
1016    public void setPost(boolean b) {
1017        this.post = b;
1018    }
1019
1020    /**
1021     * Returns true if this URI should be fetched by sending a HTTP POST request.
1022     *
1023     *
1024     * TODO: Compound this and {@link #isHttpTransaction()} method so that there
1025     * is one place to go to find out if get http, post http, ftp, dns.
1026     *
1027     * @return Returns is this CrawlURI instance is to be posted.
1028     */
1029    public boolean isPost() {
1030        return this.post;
1031    }
1032
1033    /**
1034     * Set the retained content-digest value (usu. SHA1). 
1035     * 
1036     * @param digestValue
1037     * @deprecated Use {@link #setContentDigest(String scheme, byte[])}
1038     */
1039    public void setContentDigest(byte[] digestValue) {
1040        setContentDigest("SHA1", digestValue);
1041    }
1042    
1043    public void setContentDigest(final String   scheme,
1044            final byte [] digestValue) {
1045        this.contentDigest = digestValue;
1046        this.contentDigestScheme = scheme;
1047    }
1048    
1049    public String   getContentDigestSchemeString() {
1050        return this.contentDigestScheme + ":" + getContentDigestString();
1051    }
1052
1053    /**
1054     * Return the retained content-digest value, if any.
1055     * 
1056     * @return Digest value.
1057     */
1058    public Object   getContentDigest() {
1059        return contentDigest;
1060    }
1061    
1062    public String   getContentDigestString() {
1063        return Base32.encode(this.contentDigest);
1064    }
1065
1066    transient Object   holder;
1067    transient Object   holderKey;
1068
1069    /**
1070     * Remember a 'holder' to which some enclosing/queueing
1071     * facility has assigned this CrawlURI
1072     * .
1073     * @param obj
1074     */
1075    public void setHolder(Object   obj) {
1076        holder=obj;
1077    }
1078
1079    /**
1080     * Return the 'holder' for the convenience of 
1081     * an external facility.
1082     *
1083     * @return holder
1084     */
1085    public Object   getHolder() {
1086        return holder;
1087    }
1088
1089    /**
1090     * Remember a 'holderKey' which some enclosing/queueing
1091     * facility has assigned this CrawlURI
1092     * .
1093     * @param obj
1094     */
1095    public void setHolderKey(Object   obj) {
1096        holderKey=obj;
1097    }
1098    /**
1099     * Return the 'holderKey' for convenience of 
1100     * an external facility (Frontier).
1101     * 
1102     * @return holderKey 
1103     */
1104    public Object   getHolderKey() {
1105        return holderKey;
1106    }
1107
1108    /**
1109     * Get the ordinal (serial number) assigned at creation.
1110     * 
1111     * @return ordinal
1112     */
1113    public long getOrdinal() {
1114        return ordinal;
1115    }
1116
1117    /** spot for an integer cost to be placed by external facility (frontier).
1118     *  cost is truncated to 8 bits at times, so should not exceed 255 */
1119    int holderCost = UNCALCULATED;
1120    /**
1121     * Return the 'holderCost' for convenience of external facility (frontier)
1122     * @return value of holderCost
1123     */
1124    public int getHolderCost() {
1125        return holderCost;
1126    }
1127
1128    /**
1129     * Remember a 'holderCost' which some enclosing/queueing
1130     * facility has assigned this CrawlURI
1131     * @param cost value to remember
1132     */
1133    public void setHolderCost(int cost) {
1134        holderCost = cost;
1135    }
1136
1137    /** 
1138     * All discovered outbound Links (navlinks, embeds, etc.) 
1139     * Can either contain Link instances or CandidateURI instances, or both.
1140     * The LinksScoper processor converts Link instances in this collection
1141     * to CandidateURI instances. 
1142     */
1143    transient Collection  <Object  > outLinks = new HashSet  <Object  >();
1144    
1145    /**
1146     * Returns discovered links.  The returned collection might be empty if
1147     * no links were discovered, or if something like LinksScoper promoted
1148     * the links to CandidateURIs.
1149     * 
1150     * Elements can be removed from the returned collection, but not added.
1151     * To add a discovered link, use one of the createAndAdd methods or
1152     * {@link #getOutObjects()}.
1153     * 
1154     * @return Collection of all discovered outbound Links
1155     */
1156    public Collection  <Link> getOutLinks() {
1157        return Transform.subclasses(outLinks, Link.class);
1158    }
1159    
1160    /**
1161     * Returns discovered candidate URIs.  The returned collection will be
1162     * emtpy until something like LinksScoper promotes discovered Links
1163     * into CandidateURIs.
1164     * 
1165     * Elements can be removed from the returned collection, but not added.
1166     * To add a candidate URI, use {@link #replaceOutlinks(Collection)} or
1167     * {@link #getOutObjects}.
1168     * 
1169     * @return  Collection of candidate URIs
1170     */
1171    public Collection  <CandidateURI> getOutCandidates() {
1172        return Transform.subclasses(outLinks, CandidateURI.class);
1173    }
1174    
1175    
1176    /**
1177     * Returns all of the outbound objects.  The returned Collection will
1178     * contain Link instances, or CandidateURI instances, or both.  
1179     * 
1180     * @return  the collection of Links and/or CandidateURIs
1181     */
1182    public Collection  <Object  > getOutObjects() {
1183        return outLinks;
1184    }
1185    
1186    /**
1187     * Add a discovered Link, unless it would exceed the max number
1188     * to accept. (If so, increment discarded link counter.) 
1189     * 
1190     * @param link the Link to add
1191     */
1192    public void addOutLink(Link link) {
1193        if (outLinks.size() < MAX_OUTLINKS) {
1194            outLinks.add(link);
1195        } else {
1196            // note & discard
1197            discardedOutlinks++;
1198        }
1199    }
1200    
1201    public void clearOutlinks() {
1202        this.outLinks.clear();
1203    }
1204    
1205    /**
1206     * Replace current collection of links w/ passed list.
1207     * Used by Scopers adjusting the list of links (removing those
1208     * not in scope and promoting Links to CandidateURIs).
1209     * 
1210     * @param a collection of CandidateURIs replacing any previously
1211     *   existing outLinks or outCandidates
1212     */
1213    public void replaceOutlinks(Collection  <CandidateURI> links) {
1214        clearOutlinks();
1215        this.outLinks.addAll(links);
1216    }
1217    
1218    
1219    /**
1220     * @return Count of outlinks.
1221     */
1222    public int outlinksSize() {
1223        return this.outLinks.size();
1224    }
1225
1226    /**
1227     * Convenience method for creating a Link discovered at this URI
1228     * with the given string and context
1229     * 
1230     * @param url
1231     *            String to use to create Link
1232     * @param context
1233     *            CharSequence context to use
1234     * @param hopType
1235     * @return Link.
1236     * @throws URIException
1237     *             if Link UURI cannot be constructed
1238     */
1239    public Link createLink(String   url, CharSequence   context,
1240            char hopType) throws URIException {
1241        return new Link(getUURI(), UURIFactory.getInstance(getUURI(),
1242                url), context, hopType);
1243    }
1244    
1245    /**
1246     * Convenience method for creating a Link with the given string and
1247     * context
1248     * 
1249     * @param url
1250     *            String to use to create Link
1251     * @param context
1252     *            CharSequence context to use
1253     * @param hopType
1254     * @throws URIException
1255     *             if Link UURI cannot be constructed
1256     */
1257    public void createAndAddLink(String   url, CharSequence   context,
1258            char hopType) throws URIException {
1259        addOutLink(createLink(url, context, hopType));
1260    }
1261
1262    /**
1263     * Convenience method for creating a Link with the given string and
1264     * context, relative to a previously set base HREF if available (or
1265     * relative to the current CrawlURI if no other base has been set)
1266     * 
1267     * @param url String URL to add as destination of link
1268     * @param context String context where link was discovered
1269     * @param hopType char hop-type indicator
1270     * @throws URIException
1271     */
1272    public void createAndAddLinkRelativeToBase(String   url,
1273            CharSequence   context, char hopType) throws URIException {
1274        addOutLink(new Link(getUURI(), UURIFactory.getInstance(
1275                getBaseURI(), url), context, hopType));
1276    }
1277    
1278    /**
1279     * Convenience method for creating a Link with the given string and
1280     * context, relative to this CrawlURI's via UURI if available. (If
1281     * a via is not available, falls back to using 
1282     * #createAndAddLinkRelativeToBase.)
1283     * 
1284     * @param url String URL to add as destination of link
1285     * @param context String context where link was discovered
1286     * @param hopType char hop-type indicator
1287     * @throws URIException
1288     */
1289    public void createAndAddLinkRelativeToVia(String   url,
1290            CharSequence   context, char hopType) throws URIException {
1291        if(getVia()!=null) {
1292            addOutLink(new Link(getUURI(), UURIFactory.getInstance(
1293                getVia(), url), context, hopType));
1294        } else {
1295            // if no 'via', fall back to base/self
1296            createAndAddLinkRelativeToBase(url,context,hopType);
1297        }
1298    }
1299    
1300    /**
1301     * Set the (HTML) Base URI used for derelativizing internal URIs. 
1302     * 
1303     * @param baseHref String base href to use
1304     * @throws URIException if supplied string cannot be interpreted as URI
1305     */
1306    public void setBaseURI(String   baseHref) throws URIException {
1307        putObject(A_HTML_BASE, UURIFactory.getInstance(baseHref));
1308    }
1309      
1310    /**
1311     * Get the (HTML) Base URI used for derelativizing internal URIs. 
1312     *
1313     * @return UURI base URI previously set 
1314     */  
1315    public UURI getBaseURI() {
1316        if (!containsKey(A_HTML_BASE)) {
1317            return getUURI();
1318        }
1319        return (UURI)getObject(A_HTML_BASE);
1320    }
1321    
1322    /**
1323     * Add the key of alist items you want to persist across
1324     * processings.
1325     * @param key Key to add.
1326     */
1327    public static void addAlistPersistentMember(Object   key) {
1328        alistPersistentMember.add(key);
1329    }
1330    
1331    /**
1332     * @param key Key to remove.
1333     * @return True if list contained the element.
1334     */
1335    public static boolean removeAlistPersistentMember(Object   key) {
1336        return alistPersistentMember.remove(key);
1337    }
1338
1339    /**
1340     * Custom serialization writing an empty 'outLinks' as null. Estimated
1341     * to save ~20 bytes in serialized form. 
1342     * 
1343     * @param stream
1344     * @throws IOException
1345     */
1346    private void writeObject(ObjectOutputStream   stream) throws IOException   {
1347        stream.defaultWriteObject();
1348        stream.writeObject((outLinks.isEmpty()) ? null : outLinks);
1349    }
1350
1351    /**
1352     * Custom deserialization recreating empty HashSet from null in 'outLinks'
1353     * slot. 
1354     * 
1355     * @param stream
1356     * @throws IOException
1357     * @throws ClassNotFoundException
1358     */
1359    private void readObject(ObjectInputStream   stream) throws IOException  ,
1360            ClassNotFoundException   {
1361        stream.defaultReadObject();
1362        @SuppressWarnings  ("unchecked")
1363        HashSet  <Object  > ol = (HashSet  <Object  >) stream.readObject();
1364        outLinks = (ol == null) ? new HashSet  <Object  >() : ol;
1365    }
1366
1367
1368}
1369
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags