KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > scope > HostScope


1 /* Copyright (C) 2003 Internet Archive.
2  *
3  * This file is part of the Heritrix web crawler (crawler.archive.org).
4  *
5  * Heritrix is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser Public License as published by
7  * the Free Software Foundation; either version 2.1 of the License, or
8  * any later version.
9  *
10  * Heritrix is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser Public License
16  * along with Heritrix; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  *
19  * HostScope.java
20  * Created on Oct 1, 2003
21  *
22  * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/scope/HostScope.java,v 1.20.2.1 2007/01/13 01:31:25 stack-sf Exp $
23  */

24 package org.archive.crawler.scope;
25
26 import java.util.Iterator JavaDoc;
27
28 import org.archive.crawler.deciderules.DecidingScope;
29 import org.archive.crawler.filter.FilePatternFilter;
30 import org.archive.crawler.filter.TransclusionFilter;
31 import org.archive.crawler.framework.Filter;
32 import org.archive.net.UURI;
33
34 /**
35  * A core CrawlScope suitable for the most common
36  * crawl needs.
37  *
38  * Roughly, its logic is that a URI is included if:
39  *
40  * (( isSeed(uri) || focusFilter.accepts(uri) )
41  * || transitiveFilter.accepts(uri) )
42  * && ! excludeFilter.accepts(uri)
43  *
44  * The focusFilter may be specified by either:
45  * - adding a 'mode' attribute to the
46  * <code>scope</code> element. mode="broad" is equivalent
47  * to no focus; modes "path", "host", and "domain"
48  * imply a SeedExtensionFilter will be used, with
49  * the <code>scope</code> element providing its configuration
50  * - adding a <code>focus</code> subelement
51  * If unspecified, the focusFilter will default to
52  * an accepts-all filter.
53  *
54  * The transitiveFilter may be specified by supplying
55  * a <code>transitive</code> subelement. If unspecified, a
56  * TransclusionFilter will be used, with the <code>scope</code>
57  * element providing its configuration.
58  *
59  * The excludeFilter may be specified by supplying
60  * an <code>exclude</code> subelement. If unspecified, an
61  * accepts-none filter will be used -- meaning that
62  * no URIs will pass the filter and thus be excluded.
63  *
64  * @author gojomo
65  * @deprecated As of release 1.10.0. Replaced by {@link DecidingScope}.
66  */

67 public class HostScope extends SeedCachingScope {
68
69     private static final long serialVersionUID = -6257664892667267266L;
70
71     public static final String JavaDoc ATTR_TRANSITIVE_FILTER = "transitiveFilter";
72     public static final String JavaDoc ATTR_ADDITIONAL_FOCUS_FILTER =
73         "additionalScopeFocus";
74
75     Filter additionalFocusFilter;
76     Filter transitiveFilter;
77
78     public HostScope(String JavaDoc name) {
79         super(name);
80         setDescription(
81             "HostScope: A scope for host crawls *Deprecated* Use " +
82             "DecidingScope instead. Crawls made with this scope" +
83             " will be limited to the hosts its seeds. Thus if one of" +
84             " the seeds is 'archive.org' the subdomain" +
85             " 'crawler.archive.org' will not be crawled." +
86             " 'www.host' is considered to be the same as host.");
87        additionalFocusFilter = (Filter) addElementToDefinition(
88                 new FilePatternFilter(ATTR_ADDITIONAL_FOCUS_FILTER));
89         this.transitiveFilter = (Filter) addElementToDefinition(
90                 new TransclusionFilter(ATTR_TRANSITIVE_FILTER));
91     }
92
93     /**
94      * @param o
95      * @return True if transitive filter accepts passed object.
96      */

97     protected boolean transitiveAccepts(Object JavaDoc o) {
98         if (this.transitiveFilter == null) {
99             return true;
100         }
101         return this.transitiveFilter.accepts(o);
102     }
103
104     /**
105      * @param o
106      * @return True if focus filter accepts passed object.
107      */

108     protected boolean focusAccepts(Object JavaDoc o) {
109         UURI u = UURI.from(o);
110         if (u == null) {
111             return false;
112         }
113         // Get the seeds to refresh
114
Iterator JavaDoc iter = seedsIterator();
115         while(iter.hasNext()) {
116             if (isSameHost((UURI)iter.next(), u)) {
117                 checkClose(iter);
118                 return true;
119             }
120         }
121         // if none found, fail
122
checkClose(iter);
123         return false;
124     }
125
126    
127     // Javadoc inherited.
128
@Override JavaDoc
129     protected boolean additionalFocusAccepts(Object JavaDoc o) {
130         return additionalFocusFilter.accepts(o);
131     }
132
133 }
134
Popular Tags