// KickJava: Java API By Example, From Geeks To Geeks.
//
// Java > Open Source Codes > org > archive > crawler > scope > ClassicScope


/* Copyright (C) 2005 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * ClassicScope.java
 * Created on Apr 1, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/scope/ClassicScope.java,v 1.5.14.1 2007/01/13 01:31:25 stack-sf Exp $
 */

package org.archive.crawler.scope;

import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.extractor.Link;
//import org.archive.crawler.filter.OrFilter;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.settings.SimpleType;

34 /**
35  * ClassicScope: superclass with shared Scope behavior for
36  * most common scopes.
37  *
38  * Roughly, its logic is captured in innerAccept(). A URI is
39  * included if:
40  * <pre>
41  * forceAccepts(uri)
42  * || (((isSeed(uri)
43  * || focusAccepts(uri))
44  * || additionalFocusAccepts(uri)
45  * || transitiveAccepts(uri))
46  * && !excludeAccepts(uri));</pre>
47  *
48  * Subclasses should override focusAccepts, additionalFocusAccepts,
49  * and transitiveAccepts.
50  *
51  * The excludeFilter may be specified by supplying
52  * a <code>exclude</code> subelement. If unspecified, a
53  * accepts-none filter will be used -- meaning that
54  * no URIs will pass the filter and thus be excluded.
55  *
56  * @author gojomo
57  */

58 public class ClassicScope extends CrawlScope {
59
60     private static final long serialVersionUID = 4494905304855590002L;
61
62     //private static final Logger logger = Logger.getLogger(ClassicScope.class
63
// .getName());
64

65     public static final String JavaDoc ATTR_EXCLUDE_FILTER = "exclude-filter";
66     public static final String JavaDoc ATTR_FORCE_ACCEPT_FILTER = "force-accept-filter";
67
68     public static final String JavaDoc ATTR_MAX_LINK_HOPS = "max-link-hops";
69
70     public static final String JavaDoc ATTR_MAX_TRANS_HOPS = "max-trans-hops";
71
72     // FIXME: Replace deprecated OrFilter with non-deprecated something
73

74     @SuppressWarnings JavaDoc("deprecation")
75     private org.archive.crawler.filter.OrFilter excludeFilter;
76     @SuppressWarnings JavaDoc("deprecation")
77     private org.archive.crawler.filter.OrFilter forceAcceptFilter;
78
79     /**
80      * @param name
81      * ignored by superclass
82      */

83     @SuppressWarnings JavaDoc("deprecation")
84     public ClassicScope(String JavaDoc name) {
85         super(name);
86         addElementToDefinition(new SimpleType(ATTR_MAX_LINK_HOPS,
87             "Max link hops to include. URIs more than this number "
88             + "of links from a seed will not be ruled in-scope. (Such "
89             + "determination does not preclude later inclusion if a "
90             + "shorter path is later discovered.)", new Integer JavaDoc(25)));
91         addElementToDefinition(new SimpleType(ATTR_MAX_TRANS_HOPS,
92             "Max transitive hops (embeds, referrals, preconditions) to " +
93             "include. URIs reached by more than this number of transitive " +
94             "hops will not be ruled in-scope, even if otherwise on an " +
95             "in-focus site. (Such determination does not preclude later " +
96             " inclusion if a shorter path is later discovered.)",
97             new Integer JavaDoc(5)));
98         this.excludeFilter = (org.archive.crawler.filter.OrFilter)
99             addElementToDefinition(new org.archive.crawler.filter.OrFilter(
100                 ATTR_EXCLUDE_FILTER));
101         this.forceAcceptFilter = (org.archive.crawler.filter.OrFilter)
102             addElementToDefinition(
103                 new org.archive.crawler.filter.OrFilter(
104                         ATTR_FORCE_ACCEPT_FILTER));
105         this.forceAcceptFilter.setExpertSetting(true);
106
107         // Try to preserve the values of these attributes when we exchange
108
// scopes.
109
setPreservedFields(new String JavaDoc[] { ATTR_SEEDS, ATTR_MAX_LINK_HOPS,
110             ATTR_MAX_TRANS_HOPS, ATTR_EXCLUDE_FILTER,
111             ATTR_FORCE_ACCEPT_FILTER });
112     }
113
114     /**
115      * Default constructor.
116      */

117     public ClassicScope() {
118         this(CrawlScope.ATTR_NAME);
119     }
120
121     /**
122      * Returns whether the given object (typically a CandidateURI) falls within
123      * this scope.
124      *
125      * @param o
126      * Object to test.
127      * @return Whether the given object (typically a CandidateURI) falls within
128      * this scope.
129      */

130     protected final boolean innerAccepts(Object JavaDoc o) {
131         return forceAccepts(o) || (((isSeed(o) || focusAccepts(o)) ||
132             additionalFocusAccepts(o) || transitiveAccepts(o)) &&
133             !excludeAccepts(o));
134     }
135
136     /**
137      * Check if URI is accepted by the additional focus of this scope.
138      *
139      * This method should be overridden in subclasses.
140      *
141      * @param o
142      * the URI to check.
143      * @return True if additional focus filter accepts passed object.
144      */

145     protected boolean additionalFocusAccepts(Object JavaDoc o) {
146         return false;
147     }
148
149     /**
150      * @param o
151      * the URI to check.
152      * @return True if transitive filter accepts passed object.
153      */

154     protected boolean transitiveAccepts(Object JavaDoc o) {
155         return false;
156     }
157
158     /**
159      * @param o the URI to check.
160      * @return True if force-accepts filter accepts passed object.
161      */

162     protected boolean forceAccepts(Object JavaDoc o) {
163         return false;
164     }
165     
166     /**
167      * Check if URI is accepted by the focus of this scope.
168      *
169      * This method should be overridden in subclasses.
170      *
171      * @param o
172      * the URI to check.
173      * @return True if focus filter accepts passed object.
174      */

175     protected boolean focusAccepts(Object JavaDoc o) {
176         // The CrawlScope doesn't accept any URIs
177
return false;
178     }
179
180     /**
181      * Check if URI is excluded by any filters.
182      *
183      * @param o
184      * the URI to check.
185      * @return True if exclude filter accepts passed object.
186      */

187     @SuppressWarnings JavaDoc("deprecation")
188     protected boolean excludeAccepts(Object JavaDoc o) {
189         return (this.excludeFilter.isEmpty(o)) ? exceedsMaxHops(o)
190                 : this.excludeFilter.accepts(o) || exceedsMaxHops(o);
191     }
192
193     /**
194      * Check if there are too many hops
195      *
196      * @param o
197      * URI to check.
198      * @return true if too many hops.
199      */

200     protected boolean exceedsMaxHops(Object JavaDoc o) {
201         if (!(o instanceof CandidateURI)) {
202             return false;
203         }
204
205         int maxLinkHops = 0;
206 // int maxTransHops = 0;
207

208         try {
209             maxLinkHops = ((Integer JavaDoc) getAttribute(o, ATTR_MAX_LINK_HOPS))
210                     .intValue();
211 // maxTransHops = ((Integer) getAttribute(o, ATTR_MAX_TRANS_HOPS))
212
// .intValue();
213
} catch (AttributeNotFoundException JavaDoc e) {
214             // TODO Auto-generated catch block
215
e.printStackTrace();
216         }
217
218         CandidateURI cand = (CandidateURI) o;
219
220         String JavaDoc path = cand.getPathFromSeed();
221         int linkCount = 0;
222         int transCount = 0;
223         for (int i = path.length() - 1; i >= 0; i--) {
224             if (path.charAt(i) == Link.NAVLINK_HOP) {
225                 linkCount++;
226             } else if (linkCount == 0) {
227                 transCount++;
228             }
229         }
230 // return (linkCount > maxLinkHops) || (transCount > maxTransHops);
231
// base only on links, don't treat trans count as hard max
232
return (linkCount > maxLinkHops);
233     }
234
235     /**
236      * Take note of a situation (such as settings edit) where involved
237      * reconfiguration (such as reading from external files) may be necessary.
238      */

239     @SuppressWarnings JavaDoc("deprecation")
240     public void kickUpdate() {
241         super.kickUpdate();
242         excludeFilter.kickUpdate();
243     }
244 }

// Popular Tags