KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > prefetch > Preselector


1 /* Copyright (C) 2003 Internet Archive.
2  *
3  * This file is part of the Heritrix web crawler (crawler.archive.org).
4  *
5  * Heritrix is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser Public License as published by
7  * the Free Software Foundation; either version 2.1 of the License, or
8  * any later version.
9  *
10  * Heritrix is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser Public License
16  * along with Heritrix; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  *
19  * Preselector.java
20  * Created on Sep 22, 2003
21  *
22  * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/prefetch/Preselector.java,v 1.7.8.1 2007/01/13 01:31:24 stack-sf Exp $
23  */

24 package org.archive.crawler.prefetch;
25
26 import javax.management.AttributeNotFoundException JavaDoc;
27
28 import org.archive.crawler.datamodel.CrawlURI;
29 import org.archive.crawler.datamodel.FetchStatusCodes;
30 import org.archive.crawler.framework.Scoper;
31 import org.archive.crawler.settings.SimpleType;
32 import org.archive.crawler.settings.Type;
33 import org.archive.util.TextUtils;
34
35 /**
36  * If set to recheck the crawl's scope, gives a yes/no on whether
37  * a CrawlURI should be processed at all. If not, its status
38  * will be marked OUT_OF_SCOPE and the URI will skip directly
39  * to the first "postprocessor".
40  *
41  *
42  * @author gojomo
43  *
44  */

45 public class Preselector extends Scoper
46 implements FetchStatusCodes {
47
48     private static final long serialVersionUID = 3738560264369561017L;
49
50     /** whether to reapply crawl scope at this step */
51     public static final String JavaDoc ATTR_RECHECK_SCOPE = "recheck-scope";
52     /** indicator allowing all URIs (of a given host, typically) to
53      * be blocked at this step*/

54     public static final String JavaDoc ATTR_BLOCK_ALL = "block-all";
55     /** indicator allowing all matching URIs to be blocked at this step */
56     public static final String JavaDoc ATTR_BLOCK_BY_REGEXP = "block-by-regexp";
57     /** indicator allowing all matching URIs */
58     public static final String JavaDoc ATTR_ALLOW_BY_REGEXP = "allow-by-regexp";
59
60     /**
61      * Constructor.
62      * @param name Name of this processor.
63      */

64     public Preselector(String JavaDoc name) {
65         super(name, "Preselector. Does one last bit of checking to make " +
66             "sure that the current URI should be fetched.");
67         Type e;
68         e = addElementToDefinition(new SimpleType(ATTR_RECHECK_SCOPE,
69                 "Recheck if uri is in scope. This is meaningful if the scope" +
70                 " is altered during a crawl. URIs are checked against the" +
71                 " scope when they are added to queues. Setting this value to" +
72                 " true forces the URI to be checked against the scope when it" +
73                 " is comming out of the queue, possibly after the scope is" +
74                 " altered.", new Boolean JavaDoc(false)));
75         e.setExpertSetting(true);
76
77         e = addElementToDefinition(new SimpleType(ATTR_BLOCK_ALL,
78                 "Block all URIs from being processed. This is most likely to" +
79                 " be used in overrides to easily reject certain hosts from" +
80                 " being processed.", new Boolean JavaDoc(false)));
81         e.setExpertSetting(true);
82
83         e = addElementToDefinition(new SimpleType(ATTR_BLOCK_BY_REGEXP,
84                 "Block all URIs matching the regular expression from being" +
85                 " processed.", ""));
86         e.setExpertSetting(true);
87
88         e = addElementToDefinition(new SimpleType(ATTR_ALLOW_BY_REGEXP,
89                 "Allow only URIs matching the regular expression to be" +
90                 " processed.", ""));
91         e.setExpertSetting(true);
92     }
93
94     protected void innerProcess(CrawlURI curi) {
95         // Check if uris should be blocked
96
try {
97             if (((Boolean JavaDoc) getAttribute(ATTR_BLOCK_ALL, curi)).booleanValue()) {
98                 curi.setFetchStatus(S_BLOCKED_BY_USER);
99                 curi.skipToProcessorChain(getController().
100                     getPostprocessorChain());
101             }
102         } catch (AttributeNotFoundException JavaDoc e) {
103             // Act as attribute was false, that is: do nothing.
104
}
105
106         // Check if allowed by regular expression
107
try {
108             String JavaDoc regexp = (String JavaDoc) getAttribute(ATTR_ALLOW_BY_REGEXP, curi);
109             if (regexp != null && !regexp.equals("")) {
110                 if (!TextUtils.matches(regexp, curi.toString())) {
111                     curi.setFetchStatus(S_BLOCKED_BY_USER);
112                     curi.skipToProcessorChain(getController().
113                         getPostprocessorChain());
114                 }
115             }
116         } catch (AttributeNotFoundException JavaDoc e) {
117             // Act as regexp was null, that is: do nothing.
118
}
119
120
121         // Check if blocked by regular expression
122
try {
123             String JavaDoc regexp = (String JavaDoc) getAttribute(ATTR_BLOCK_BY_REGEXP, curi);
124             if (regexp != null && !regexp.equals("")) {
125                 if (TextUtils.matches(regexp, curi.toString())) {
126                     curi.setFetchStatus(S_BLOCKED_BY_USER);
127                     curi.skipToProcessorChain(getController().
128                         getPostprocessorChain());
129                 }
130             }
131         } catch (AttributeNotFoundException JavaDoc e) {
132             // Act as regexp was null, that is: do nothing.
133
}
134
135         // Possibly recheck scope
136
try {
137             if (((Boolean JavaDoc) getAttribute(ATTR_RECHECK_SCOPE, curi)).
138                     booleanValue()) {
139                 if (!isInScope(curi)) {
140                     // Scope rejected
141
curi.setFetchStatus(S_OUT_OF_SCOPE);
142                     curi.skipToProcessorChain(getController().
143                         getPostprocessorChain());
144                 }
145             }
146         } catch (AttributeNotFoundException JavaDoc e) {
147             // Act as attribute was false, that is: do nothing.
148
}
149     }
150 }
151
Popular Tags