KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > datamodel > CandidateURI


/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * CandidateURI.java
 * Created on Sep 30, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/datamodel/CandidateURI.java,v 1.49.4.1 2007/01/13 01:31:08 stack-sf Exp $
 */

package org.archive.crawler.datamodel;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.extractor.Link;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.Reporter;

import st.ata.util.AList;
import st.ata.util.HashtableAList;

45 /**
46  * A URI, discovered or passed-in, that may be scheduled.
47  * When scheduled, a CandidateURI becomes a {@link CrawlURI}
48  * made with the data contained herein. A CandidateURI
49  * contains just the fields necessary to perform quick in-scope analysis.
50  *
51  * <p>Has a flexible attribute list that will be promoted into
52  * any {@link CrawlURI} created from this CandidateURI. Use it
53  * to add custom data or state needed later doing custom processing.
54  * See accessors/setters {@link #putString(String, String)},
55  * {@link #getString(String)}, etc.
56  *
57  * @author Gordon Mohr
58  */

59 public class CandidateURI
60 implements Serializable JavaDoc, Reporter, CoreAttributeConstants {
61     private static final long serialVersionUID = -7152937921526560388L;
62
63     /** Highest scheduling priority.
64      * Before any others of its class.
65      */

66     public static final int HIGHEST = 0;
67     
68     /** High scheduling priority.
69      * After any {@link #HIGHEST}.
70      */

71     public static final int HIGH = 1;
72     
73     /** Medium priority.
74      * After any {@link #HIGH}.
75      */

76     public static final int MEDIUM = 2;
77     
78     /** Normal/low priority.
79      * Whenever/end of queue.
80      */

81     public static final int NORMAL = 3;
82     
83     private int schedulingDirective = NORMAL;
84     
85     /**
86      * Usuable URI under consideration. Transient to allow
87      * more efficient custom serialization
88      */

89     private transient UURI uuri;
90     
91     /** Seed status */
92     private boolean isSeed = false;
93
94     private boolean forceRevisit = false; // even if already visited
95

96     /** String of letters indicating how this URI was reached from a seed.
97      * <pre>
98      * P precondition
99      * R redirection
100      * E embedded (as frame, src, link, codebase, etc.)
101      * X speculative embed (as from javascript, some alternate-format extractors
102      * L link</pre>
103      * For example LLLE (an embedded image on a page 3 links from seed).
104      */

105     private String JavaDoc pathFromSeed;
106     
107     /**
108      * Where this URI was (presently) discovered. . Transient to allow
109      * more efficient custom serialization
110      */

111     private transient UURI via;
112
113     /**
114      * Context of URI's discovery, as per the 'context' in Link
115      */

116     private CharSequence JavaDoc viaContext;
117     
118     /**
119      * Flexible dynamic attributes list.
120      * <p>
121      * The attribute list is a flexible map of key/value pairs for storing
122      * status of this URI for use by other processors. By convention the
123      * attribute list is keyed by constants found in the
124      * {@link CoreAttributeConstants} interface. Use this list to carry
125      * data or state produced by custom processors rather change the
126      * classes {@link CrawlURI} or this class, CandidateURI.
127      *
128      * Transient to allow more efficient custom serialization.
129      */

130     private transient AList alist;
131     
132     /**
133      * Cache of this candidate uuri as a string.
134      *
135      * Profiling shows us spending about 1-2% of total elapsed time in
136      * toString.
137      */

138     private String JavaDoc cachedCandidateURIString = null;
139     
140
141     /**
142      * Frontier/Scheduler lifecycle info.
143      * This is an identifier set by the Frontier for its
144      * purposes. Usually its the name of the Frontier queue
145      * this URI gets queued to. Values can be host + port
146      * or IP, etc.
147      */

148     private String JavaDoc classKey;
149
150     /**
151      * Constructor.
152      * Protected access to block access to default constructor.
153      */

154     protected CandidateURI () {
155         super();
156     }
157     
158     /**
159      * @param u uuri instance this CandidateURI wraps.
160      */

161     public CandidateURI(UURI u) {
162         this.uuri = u;
163     }
164     
165     /**
166      * @param u uuri instance this CandidateURI wraps.
167      * @param pathFromSeed
168      * @param via
169      * @param viaContext
170      */

171     public CandidateURI(UURI u, String JavaDoc pathFromSeed, UURI via,
172             CharSequence JavaDoc viaContext) {
173         this.uuri = u;
174         this.pathFromSeed = pathFromSeed;
175         this.via = via;
176         this.viaContext = viaContext;
177     }
178
179     /**
180      * Set the <tt>isSeed</tt> attribute of this URI.
181      * @param b Is this URI a seed, true or false.
182      */

183     public void setIsSeed(boolean b) {
184         this.isSeed = b;
185         if (this.isSeed) {
186             if(pathFromSeed==null) {
187                 this.pathFromSeed = "";
188             }
189 // seeds created on redirect must have a via to be recognized; don't clear
190
// setVia(null);
191
}
192     }
193
194     /**
195      * @return UURI
196      */

197     public UURI getUURI() {
198         return this.uuri;
199     }
200
201     /**
202      * @return Whether seeded.
203      */

204     public boolean isSeed() {
205         return this.isSeed;
206     }
207
208     /**
209      * @return path (hop-types) from seed
210      */

211     public String JavaDoc getPathFromSeed() {
212         return this.pathFromSeed;
213     }
214
215     /**
216      * @return URI via which this one was discovered
217      */

218     public UURI getVia() {
219         return this.via;
220     }
221
222     /**
223      * @return CharSequence context in which this one was discovered
224      */

225     public CharSequence JavaDoc getViaContext() {
226         return this.viaContext;
227     }
228     
229     /**
230      * @param string
231      */

232     protected void setPathFromSeed(String JavaDoc string) {
233         pathFromSeed = string;
234     }
235     
236     /**
237      * Called when making a copy of another CandidateURI.
238      * @param alist AList to use.
239      */

240     protected void setAList(AList alist) {
241         this.alist = alist;
242     }
243
244     public void setVia(UURI via) {
245         this.via = via;
246     }
247
248     /**
249      * @return This candidate URI as a string wrapped with 'CandidateURI(' +
250      * ')'.
251      */

252     public synchronized String JavaDoc getCandidateURIString() {
253         if (this.cachedCandidateURIString == null) {
254             this.cachedCandidateURIString =
255                 "CandidateURI(" + toString() + ")";
256         }
257         return this.cachedCandidateURIString;
258     }
259
260     /**
261      * Method returns string version of this URI's referral URI.
262      * @return String version of referral URI
263      */

264     public String JavaDoc flattenVia() {
265         return (via == null)? "": via.toString();
266     }
267     
268     /**
269      * @return The UURI this CandidateURI wraps as a string
270      * (We used return what {@link #getCandidateURIString()}
271      * returns on a toString -- use that method if you still need
272      * this functionality).
273      * @see #getCandidateURIString()
274      */

275     public String JavaDoc toString() {
276         return getURIString();
277     }
278
279     /**
280      * @return URI String
281      * @deprecated Use {@link #toString()}.
282      */

283     public String JavaDoc getURIString() {
284         return getUURI().toString();
285     }
286
287     /**
288      * Compares the domain of this CandidateURI with that of another
289      * CandidateURI
290      *
291      * @param other The other CandidateURI
292      *
293      * @return True if both are in the same domain, false otherwise.
294      * @throws URIException
295      */

296     public boolean sameDomainAs(CandidateURI other) throws URIException {
297         String JavaDoc domain = getUURI().getHost();
298         if (domain == null) {
299             return false;
300         }
301         while(domain.lastIndexOf('.') > domain.indexOf('.')) {
302             // While has more than one dot, lop off first segment
303
domain = domain.substring(domain.indexOf('.') + 1);
304         }
305         if(other.getUURI().getHost() == null) {
306             return false;
307         }
308         return other.getUURI().getHost().endsWith(domain);
309     }
310
311     /**
312      * If this method returns true, this URI should be fetched even though
313      * it already has been crawled. This also implies
314      * that this URI will be scheduled for crawl before any other waiting
315      * URIs for the same host.
316      *
317      * This value is used to refetch any expired robots.txt or dns-lookups.
318      *
319      * @return true if crawling of this URI should be forced
320      */

321     public boolean forceFetch() {
322         return forceRevisit;
323     }
324
325    /**
326      * Method to signal that this URI should be fetched even though
327      * it already has been crawled. Setting this to true also implies
328      * that this URI will be scheduled for crawl before any other waiting
329      * URIs for the same host.
330      *
331      * This value is used to refetch any expired robots.txt or dns-lookups.
332      *
333      * @param b set to true to enforce the crawling of this URI
334      */

335     public void setForceFetch(boolean b) {
336         forceRevisit = b;
337     }
338
339     /**
340      * @return Returns the schedulingDirective.
341      */

342     public int getSchedulingDirective() {
343         return schedulingDirective;
344     }
345     /**
346      * @param schedulingDirective The schedulingDirective to set.
347      */

348     public void setSchedulingDirective(int schedulingDirective) {
349         this.schedulingDirective = schedulingDirective;
350     }
351
352
353     /**
354      * @return True if needs immediate scheduling.
355      */

356     public boolean needsImmediateScheduling() {
357         return schedulingDirective == HIGH;
358     }
359
360     /**
361      * @return True if needs soon but not top scheduling.
362      */

363     public boolean needsSoonScheduling() {
364         return schedulingDirective == MEDIUM;
365     }
366
367     /**
368      * Tally up the number of transitive (non-simple-link) hops at
369      * the end of this CandidateURI's pathFromSeed.
370      *
371      * In some cases, URIs with greater than zero but less than some
372      * threshold such hops are treated specially.
373      *
374      * <p>TODO: consider moving link-count in here as well, caching
375      * calculation, and refactoring CrawlScope.exceedsMaxHops() to use this.
376      *
377      * @return Transhop count.
378      */

379     public int getTransHops() {
380         String JavaDoc path = getPathFromSeed();
381         int transCount = 0;
382         for(int i=path.length()-1;i>=0;i--) {
383             if(path.charAt(i)==Link.NAVLINK_HOP) {
384                 break;
385             }
386             transCount++;
387         }
388         return transCount;
389     }
390
391     /**
392      * Given a string containing a URI, then optional whitespace
393      * delimited hops-path and via info, create a CandidateURI
394      * instance.
395      *
396      * @param uriHopsViaString String with a URI.
397      * @return A CandidateURI made from passed <code>uriHopsViaString</code>.
398      * @throws URIException
399      */

400     public static CandidateURI fromString(String JavaDoc uriHopsViaString)
401             throws URIException {
402         String JavaDoc args[] = uriHopsViaString.split("\\s+");
403         String JavaDoc pathFromSeeds = (args.length > 1 && !args[1].equals("-")) ?
404                 args[1]: "";
405         UURI via = (args.length > 2 && !args[2].equals("-")) ?
406                 UURIFactory.getInstance(args[2]) : null;
407         CharSequence JavaDoc viaContext = (args.length > 3 && !args[3].equals("-")) ?
408                 args[2]: null;
409         return new CandidateURI(UURIFactory.getInstance(args[0]),
410                 pathFromSeeds, via, viaContext);
411     }
412     
413     public static CandidateURI createSeedCandidateURI(UURI uuri) {
414         CandidateURI c = new CandidateURI(uuri);
415         c.setIsSeed(true);
416         return c;
417     }
418     
419     /**
420      * Utility method for creation of CandidateURIs found extracting
421      * links from this CrawlURI.
422      * @param baseUURI BaseUURI for <code>link</code>.
423      * @param link Link to wrap CandidateURI in.
424      * @return New candidateURI wrapper around <code>link</code>.
425      * @throws URIException
426      */

427     public CandidateURI createCandidateURI(UURI baseUURI, Link link)
428     throws URIException {
429         UURI u = (link.getDestination() instanceof UURI)?
430             (UURI)link.getDestination():
431             UURIFactory.getInstance(baseUURI,
432                 link.getDestination().toString());
433         CandidateURI newCaURI = new CandidateURI(u, getPathFromSeed() + link.getHopType(),
434                 getUURI(), link.getContext());
435         newCaURI.inheritFrom(this);
436         return newCaURI;
437     }
438
439     /**
440      * Utility method for creation of CandidateURIs found extracting
441      * links from this CrawlURI.
442      * @param baseUURI BaseUURI for <code>link</code>.
443      * @param link Link to wrap CandidateURI in.
444      * @param scheduling How new CandidateURI should be scheduled.
445      * @param seed True if this CandidateURI is a seed.
446      * @return New candidateURI wrapper around <code>link</code>.
447      * @throws URIException
448      */

449     public CandidateURI createCandidateURI(UURI baseUURI, Link link,
450         int scheduling, boolean seed)
451     throws URIException {
452         final CandidateURI caURI = createCandidateURI(baseUURI, link);
453         caURI.setSchedulingDirective(scheduling);
454         caURI.setIsSeed(seed);
455         return caURI;
456     }
457     
458     /**
459      * Inherit (copy) the relevant keys-values from the ancestor.
460      *
461      * @param ancestor
462      */

463     protected void inheritFrom(CandidateURI ancestor) {
464         List JavaDoc heritableKeys = (List JavaDoc) ancestor.getObject(A_HERITABLE_KEYS);
465         if(heritableKeys!=null) {
466             getAList().copyKeysFrom(heritableKeys.iterator(),ancestor.getAList());
467         }
468     }
469     
470     /**
471      * Get the token (usually the hostname + port) which indicates
472      * what "class" this CrawlURI should be grouped with,
473      * for the purposes of ensuring only one item of the
474      * class is processed at once, all items of the class
475      * are held for a politeness period, etc.
476      *
477      * @return Token (usually the hostname) which indicates
478      * what "class" this CrawlURI should be grouped with.
479      */

480     public String JavaDoc getClassKey() {
481         return classKey;
482     }
483
484     public void setClassKey(String JavaDoc key) {
485         classKey = key;
486     }
487     
488     /**
489      * Assumption is that only one thread at a time will ever be accessing
490      * a particular CandidateURI.
491      *
492      * @deprecated Public access will be deprecated. This methods access
493      * will change in next release. Use specialized accessors instead such
494      * as {@link #getString(String)}.
495      *
496      * @return the attribute list.
497      */

498     public AList getAList() {
499         if (this.alist == null) {
500             this.alist = new HashtableAList();
501         }
502         return this.alist;
503     }
504     
505     protected void clearAList() {
506         this.alist = null;
507     }
508     
509     public void putObject(String JavaDoc key, Object JavaDoc value) {
510         getAList().putObject(key, value);
511     }
512     
513     public Object JavaDoc getObject(String JavaDoc key) {
514         return getAList().getObject(key);
515     }
516     
517     public String JavaDoc getString(String JavaDoc key) {
518         return getAList().getString(key);
519     }
520     
521     public void putString(String JavaDoc key, String JavaDoc value) {
522         getAList().putString(key, value);
523     }
524     
525     public long getLong(String JavaDoc key) {
526         return getAList().getLong(key);
527     }
528     
529     public void putLong(String JavaDoc key, long value) {
530         getAList().putLong(key, value);
531     }
532     
533     public int getInt(String JavaDoc key) {
534         return getAList().getInt(key);
535     }
536     
537     public void putInt(String JavaDoc key, int value) {
538         getAList().putInt(key, value);
539     }
540     
541     public boolean containsKey(String JavaDoc key) {
542         return getAList().containsKey(key);
543     }
544     
545     public void remove(String JavaDoc key) {
546         getAList().remove(key);
547     }
548     
549     public Iterator JavaDoc keys() {
550         return getAList().getKeys();
551     }
552     
553     /**
554      * @return True if this CandidateURI was result of a redirect:
555      * i.e. Its parent URI redirected to here, this URI was what was in
556      * the 'Location:' or 'Content-Location:' HTTP Header.
557      */

558     public boolean isLocation() {
559         return this.pathFromSeed != null && this.pathFromSeed.length() > 0 &&
560             this.pathFromSeed.charAt(this.pathFromSeed.length() - 1) ==
561                 Link.REFER_HOP;
562     }
563
564     /**
565      * Custom serialization writing 'uuri' and 'via' as Strings, rather
566      * than the bloated full serialization of their object classes, and
567      * an empty alist as 'null'. Shrinks serialized form by 50% or more
568      * in short tests.
569      *
570      * @param stream
571      * @throws IOException
572      */

573     private void writeObject(ObjectOutputStream JavaDoc stream)
574         throws IOException JavaDoc {
575         stream.defaultWriteObject();
576         stream.writeUTF(uuri.toString());
577         stream.writeObject((via == null) ? null : via.getURI());
578         stream.writeObject((alist==null) ? null : alist);
579     }
580
581     /**
582      * Custom deserialization to reconstruct UURI instances from more
583      * compact Strings.
584      *
585      * @param stream
586      * @throws IOException
587      * @throws ClassNotFoundException
588      */

589     private void readObject(ObjectInputStream JavaDoc stream)
590         throws IOException JavaDoc, ClassNotFoundException JavaDoc {
591         stream.defaultReadObject();
592         uuri = readUuri(stream.readUTF());
593         via = readUuri((String JavaDoc)stream.readObject());
594         alist = (AList) stream.readObject();
595     }
596
597     /**
598      * Read a UURI from a String, handling a null or URIException
599      *
600      * @param u String or null from which to create UURI
601      * @return the best UURI instance creatable
602      */

603     protected UURI readUuri(String JavaDoc u) {
604         if (u == null) {
605             return null;
606         }
607         try {
608             return UURIFactory.getInstance(u);
609         } catch (URIException ux) {
610             // simply continue to next try
611
}
612         try {
613             // try adding an junk scheme
614
return UURIFactory.getInstance("invalid:" + u);
615         } catch (URIException ux) {
616             ux.printStackTrace();
617             // ignored; method continues
618
}
619         try {
620             // return total junk
621
return UURIFactory.getInstance("invalid:");
622         } catch (URIException e) {
623             e.printStackTrace();
624             return null;
625         }
626     }
627     
628     //
629
// Reporter implementation
630
//
631

632     public String JavaDoc singleLineReport() {
633         return ArchiveUtils.singleLineReport(this);
634     }
635     
636     public void singleLineReportTo(PrintWriter JavaDoc w) {
637         String JavaDoc className = this.getClass().getName();
638         className = className.substring(className.lastIndexOf(".")+1);
639         w.print(className);
640         w.print(" ");
641         w.print(getUURI().toString());
642         w.print(" ");
643         w.print(pathFromSeed);
644         w.print(" ");
645         w.print(flattenVia());
646     }
647
648     /* (non-Javadoc)
649      * @see org.archive.util.Reporter#singleLineLegend()
650      */

651     public String JavaDoc singleLineLegend() {
652         return "className uri hopsPath viaUri";
653     }
654     
655     /* (non-Javadoc)
656      * @see org.archive.util.Reporter#getReports()
657      */

658     public String JavaDoc[] getReports() {
659         // none but default: empty options
660
return new String JavaDoc[] {};
661     }
662
663     /* (non-Javadoc)
664      * @see org.archive.util.Reporter#reportTo(java.lang.String, java.io.Writer)
665      */

666     public void reportTo(String JavaDoc name, PrintWriter JavaDoc writer) {
667         singleLineReportTo(writer);
668         writer.print("\n");
669     }
670
671     /* (non-Javadoc)
672      * @see org.archive.util.Reporter#reportTo(java.io.Writer)
673      */

674     public void reportTo(PrintWriter JavaDoc writer) throws IOException JavaDoc {
675         reportTo(null,writer);
676     }
677
678     /** Make the given key 'heritable', meaning its value will be
679      * added to descendant CandidateURIs. Only keys with immutable
680      * values should be made heritable -- the value instance may
681      * be shared until the AList is serialized/deserialized.
682      *
683      * @param key to make heritable
684      */

685     public void makeHeritable(String JavaDoc key) {
686         @SuppressWarnings JavaDoc("unchecked")
687         List JavaDoc<String JavaDoc> heritableKeys = (List JavaDoc<String JavaDoc>) getObject(A_HERITABLE_KEYS);
688         if(heritableKeys==null) {
689             heritableKeys = new ArrayList JavaDoc<String JavaDoc>();
690             heritableKeys.add(A_HERITABLE_KEYS);
691             putObject(A_HERITABLE_KEYS,heritableKeys);
692         }
693         heritableKeys.add(key);
694     }
695     
696     /** Make the given key non-'heritable', meaning its value will
697      * not be added to descendant CandidateURIs. Only meaningful if
698      * key was previously made heritable.
699      *
700      * @param key to make non-heritable
701      */

702     public void makeNonHeritable(String JavaDoc key) {
703         List JavaDoc heritableKeys = (List JavaDoc) getObject(A_HERITABLE_KEYS);
704         if(heritableKeys==null) {
705             return;
706         }
707         heritableKeys.remove(key);
708         if(heritableKeys.size()==1) {
709             // only remaining heritable key is itself; disable completely
710
remove(A_HERITABLE_KEYS);
711         }
712     }
713 }
714
Popular Tags