KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > scope > PathScope


1 /* Copyright (C) 2003 Internet Archive.
2  *
3  * This file is part of the Heritrix web crawler (crawler.archive.org).
4  *
5  * Heritrix is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser Public License as published by
7  * the Free Software Foundation; either version 2.1 of the License, or
8  * any later version.
9  *
10  * Heritrix is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser Public License
16  * along with Heritrix; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  *
19  * PathScope.java
20  * Created on Oct 1, 2003
21  *
22  * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/scope/PathScope.java,v 1.22.2.1 2007/01/13 01:31:25 stack-sf Exp $
23  */

24 package org.archive.crawler.scope;
25
26 import java.util.Iterator JavaDoc;
27 import java.util.logging.Logger JavaDoc;
28
29 import org.apache.commons.httpclient.URIException;
30 import org.archive.crawler.deciderules.DecidingScope;
31 import org.archive.crawler.filter.FilePatternFilter;
32 import org.archive.crawler.filter.TransclusionFilter;
33 import org.archive.crawler.framework.Filter;
34 import org.archive.net.UURI;
35
36 /**
37  * A core CrawlScope suitable for the most common
38  * crawl needs.
39  *
40  * Roughly, its logic is that a URI is included if:
41  *
42  * (( isSeed(uri) || focusFilter.accepts(uri) )
43  * || transitiveFilter.accepts(uri) )
44  * && ! excludeFilter.accepts(uri)
45  *
46  * The focusFilter may be specified by either:
47  * - adding a 'mode' attribute to the
48  * <code>scope</code> element. mode="broad" is equivalent
49  * to no focus; modes "path", "host", and "domain"
50  * imply a SeedExtensionFilter will be used, with
51  * the <code>scope</code> element providing its configuration
52  * - adding a <code>focus</code> subelement
53  * If unspecified, the focusFilter will default to
54  * an accepts-all filter.
55  *
56  * The transitiveFilter may be specified by supplying
57  * a <code>transitive</code> subelement. If unspecified, a
58  * TransclusionFilter will be used, with the <code>scope</code>
59  * element providing its configuration.
60  *
61  * The excludeFilter may be specified by supplying
62  * an <code>exclude</code> subelement. If unspecified, an
63  * accepts-none filter will be used -- meaning that
64  * no URIs will pass the filter, and thus none will be excluded.
65  *
66  * @author gojomo
67  * @deprecated As of release 1.10.0. Replaced by {@link DecidingScope}.
68  */

69 public class PathScope extends SeedCachingScope {
70
71     private static final long serialVersionUID = -2217024073240277527L;
72
73     private static Logger JavaDoc logger =
74         Logger.getLogger("org.archive.crawler.basic.PathScope");
75
76     public static final String JavaDoc ATTR_TRANSITIVE_FILTER = "transitiveFilter";
77     public static final String JavaDoc ATTR_ADDITIONAL_FOCUS_FILTER =
78         "additionalScopeFocus";
79
80     Filter additionalFocusFilter;
81     Filter transitiveFilter;
82
83     public PathScope(String JavaDoc name) {
84         super(name);
85         setDescription(
86             "PathScope: A scope for path crawls *Deprecated* Use " +
87             "DecidingScope instead. Crawls made with this scope" +
88             " will be limited to a specific portion of the hosts its seeds" +
89             " provide. More specifically the paths those seeds provide." +
90             " For example if one of the seeds is 'archive.org/example/'" +
91             " all URIs under the path 'examples' will be crawled (like" +
92             " 'archive.org/examples/hello.html') but not URIs in other" +
93             " paths or root (i.e. 'archive.org/index.html).");
94         this.additionalFocusFilter = (Filter) addElementToDefinition(
95                 new FilePatternFilter(ATTR_ADDITIONAL_FOCUS_FILTER));
96         this.transitiveFilter = (Filter) addElementToDefinition(
97                 new TransclusionFilter(ATTR_TRANSITIVE_FILTER));
98     }
99
100     /**
101      * @param o
102      * @return True if transitive filter accepts passed object.
103      */

104     protected boolean transitiveAccepts(Object JavaDoc o) {
105         if (this.transitiveFilter == null) {
106             return true;
107         }
108         return this.transitiveFilter.accepts(o);
109     }
110
111     /**
112      * @param o
113      * @return True if focus filter accepts passed object.
114      */

115     protected boolean focusAccepts(Object JavaDoc o) {
116         UURI u = UURI.from(o);
117         if (u == null) {
118             return false;
119         }
120         // Get the seeds to refresh
121
Iterator JavaDoc iter = seedsIterator();
122         while(iter.hasNext()) {
123             UURI s = (UURI) iter.next();
124             if (isSameHost(s, u)) {
125                 try {
126                     // Protect against non-parseable URIs. See
127
// "[ 910120 ] java.net.URI#getHost fails when
128
// leading digit"
129
if (s.getPath() == null || u.getPath() == null) {
130                         continue;
131                     }
132                 }
133                 catch (URIException e) {
134                     logger.severe("Failed get path on " + u + " or " + s +
135                         ": " + e.getMessage());
136                 }
137                 try {
138                     if (s.getPath().regionMatches(0, u.getPath(), 0,
139                         s.getPath().lastIndexOf('/'))) {
140                         // matches up to last '/'
141
checkClose(iter);
142                         return true;
143                     } else {
144                         // no match; try next seed
145
continue;
146                     }
147                 }
148                 catch (URIException e) {
149                     logger.severe("Failed get path on " + u + " or " + s +
150                         ": " + e.getMessage());
151                 }
152             }
153         }
154         // if none found, fail
155
checkClose(iter);
156         return false;
157     }
158
159     // Javadoc inherited
160
@Override JavaDoc
161     protected boolean additionalFocusAccepts(Object JavaDoc o) {
162         return this.additionalFocusFilter.accepts(o);
163     }
164
165 }
166
Popular Tags