KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > datamodel > RobotsHonoringPolicy


1 /* Copyright (C) 2003 Internet Archive.
2  *
3  * This file is part of the Heritrix web crawler (crawler.archive.org).
4  *
5  * Heritrix is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser Public License as published by
7  * the Free Software Foundation; either version 2.1 of the License, or
8  * any later version.
9  *
10  * Heritrix is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser Public License
16  * along with Heritrix; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  *
19  * RobotsHonoringPolicy.java
20  * Created on Oct 30, 2003
21  *
22  * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/datamodel/RobotsHonoringPolicy.java,v 1.16.4.1 2007/01/13 01:31:12 stack-sf Exp $
23  */

24 package org.archive.crawler.datamodel;
25
26 import java.util.logging.Logger JavaDoc;
27
28 import javax.management.AttributeNotFoundException JavaDoc;
29
30 import org.archive.crawler.settings.CrawlerSettings;
31 import org.archive.crawler.settings.ModuleType;
32 import org.archive.crawler.settings.SimpleType;
33 import org.archive.crawler.settings.StringList;
34 import org.archive.crawler.settings.TextField;
35
36 /**
37  * RobotsHonoringPolicy represent the strategy used by the crawler
38  * for determining how robots.txt files will be honored.
39  *
40  * Five kinds of policies exist:
41  * <dl>
42  * <dt>classic:</dt>
43  * <dd>obey the first set of robots.txt directives that apply to your
44  * current user-agent</dd>
45  * <dt>ignore:</dt>
46  * <dd>ignore robots.txt directives entirely</dd>
47  * <dt>custom:</dt>
48  * <dd>obey a specific operator-entered set of robots.txt directives
49  * for a given host</dd>
50  * <dt>most-favored:</dt>
51  * <dd>obey the most liberal restrictions offered (if *any* crawler is
52  * allowed to get a page, get it)</dd>
53  * <dt>most-favored-set:</dt>
54  * <dd>given some set of user-agent patterns, obey the most liberal
55  * restriction offered to any</dd>
56  * </dl>
57  *
58  * The two last ones has the opportunity of adopting a different user-agent
59  * to reflect the restrictions we've opted to use.
60  *
61  * @author John Erik Halse
62  *
63  */

64 public class RobotsHonoringPolicy extends ModuleType {
65
66     private static final long serialVersionUID = 8850011643923116605L;
67
68     private static Logger JavaDoc logger =
69         Logger.getLogger("org.archive.crawler.datamodel.RobotsHonoringPolicy");
70
71     public final static int CLASSIC = 0;
72     public final static int IGNORE = 1;
73     public final static int CUSTOM = 2;
74     public final static int MOST_FAVORED = 3;
75     public final static int MOST_FAVORED_SET = 4;
76
77     public final static String JavaDoc ATTR_NAME = "robots-honoring-policy";
78     public final static String JavaDoc ATTR_TYPE = "type";
79     public final static String JavaDoc ATTR_MASQUERADE = "masquerade";
80     public final static String JavaDoc ATTR_CUSTOM_ROBOTS = "custom-robots";
81     public final static String JavaDoc ATTR_USER_AGENTS = "user-agents";
82
83
84     /**
85      * Creates a new instance of RobotsHonoringPolicy.
86      *
87      * @param name the name of the RobotsHonoringPolicy attirubte.
88      */

89     public RobotsHonoringPolicy(String JavaDoc name) {
90         super(name, "Robots honoring policy");
91
92         String JavaDoc[] allowedTypes = new String JavaDoc[] {
93                 "classic", "ignore", "custom",
94                 "most-favored", "most-favored-set"};
95
96         addElementToDefinition(new SimpleType(ATTR_TYPE,
97                 "Policy type. The 'classic' policy simply obeys all " +
98                 "robots.txt rules for the configured user-agent. The " +
99                 "'ignore' policy ignores all robots rules. The 'custom' " +
100                 "policy allows you to specify a policy, in robots.txt " +
101                 "format, as a setting. The 'most-favored' policy will " +
102                 "crawl an URL if the robots.txt allows any user-agent to " +
103                 "crawl it. The 'most-favored-set' policy requires you " +
104                 "to supply an list of alternate user-agents, and for" +
105                 "every page, if any agent of the set is allowed, the" +
106                 "page will be crawled.", "classic", allowedTypes));
107         addElementToDefinition(new SimpleType(ATTR_MASQUERADE,
108                 "Should we masquerade as another user agent when obeying " +
109                 "the rules declared for it. Only relevant if the " +
110                 "policy type is 'most-favored' or 'most-favored-set'.",
111                 new Boolean JavaDoc(false)));
112         addElementToDefinition(new SimpleType(ATTR_CUSTOM_ROBOTS,
113                 "Custom robots to use if policy type is 'custom'. " +
114                 "Compose as if an actual robots.txt file.",
115                 new TextField("")));
116         addElementToDefinition(new StringList(ATTR_USER_AGENTS,
117                 "Alternate user-agent values to consider using for " +
118                 "the 'most-favored-set' policy."));
119     }
120
121     public RobotsHonoringPolicy() {
122         this(ATTR_NAME);
123     }
124
125     /**
126      * If policy-type is most favored crawler of set, then this method
127      * gets a list of all useragents in that set.
128      *
129      * @return List of Strings with user agents
130      */

131     public StringList getUserAgents(CrawlerSettings settings) {
132         if (isType(settings, RobotsHonoringPolicy.MOST_FAVORED_SET)) {
133             try {
134                 return (StringList) getAttribute(settings, ATTR_USER_AGENTS);
135             } catch (AttributeNotFoundException JavaDoc e) {
136                 logger.severe(e.getMessage());
137             }
138         }
139         return null;
140     }
141
142     /**
143      * This method returns true if the crawler should masquerade as the user agent
144      * which restrictions it opted to use.
145      *
146      * (Only relevant for policy-types: most-favored and most-favored-set).
147      *
148      * @return true if we should masquerade
149      */

150     public boolean shouldMasquerade(CrawlURI curi) {
151         try {
152             return ((Boolean JavaDoc) getAttribute(curi, ATTR_MASQUERADE)).booleanValue();
153         } catch (AttributeNotFoundException JavaDoc e) {
154             logger.severe(e.getMessage());
155             return false;
156         }
157     }
158
159     /**
160      * Get the supplied custom robots.txt
161      *
162      * @return String with content of alternate robots.txt
163      */

164     public String JavaDoc getCustomRobots(CrawlerSettings settings) {
165         if(isType(settings, RobotsHonoringPolicy.CUSTOM)) {
166             try {
167                 return getAttribute(settings, ATTR_CUSTOM_ROBOTS).toString();
168             } catch (AttributeNotFoundException JavaDoc e) {
169                 logger.severe(e.getMessage());
170             }
171         }
172         return null;
173     }
174
175     /**
176      * Get the policy-type.
177      *
178      * @see #CLASSIC
179      * @see #IGNORE
180      * @see #CUSTOM
181      * @see #MOST_FAVORED
182      * @see #MOST_FAVORED_SET
183      *
184      * @return policy type
185      */

186     public int getType(Object JavaDoc context) {
187         int type = CLASSIC;
188         try {
189             String JavaDoc typeName = (String JavaDoc) getAttribute(context, "type");
190             if(typeName.equals("classic")) {
191                 type = RobotsHonoringPolicy.CLASSIC;
192             } else if(typeName.equals("ignore")) {
193                 type = RobotsHonoringPolicy.IGNORE;
194             } else if(typeName.equals("custom")) {
195                 type = RobotsHonoringPolicy.CUSTOM;
196             } else if(typeName.equals("most-favored")) {
197                 type = RobotsHonoringPolicy.MOST_FAVORED;
198             } else if(typeName.equals("most-favored-set")) {
199                 type = RobotsHonoringPolicy.MOST_FAVORED_SET;
200             } else {
201                 throw new IllegalArgumentException JavaDoc();
202             }
203         } catch (AttributeNotFoundException JavaDoc e) {
204             logger.severe(e.getMessage());
205         }
206         return type;
207     }
208
209     /**
210      * Check if policy is of a certain type.
211      *
212      * @param o An object that can be resolved into a settings object.
213      * @param type the type to check against.
214      * @return true if the policy is of the submitted type
215      */

216     public boolean isType(Object JavaDoc o, int type) {
217         return type == getType(o);
218     }
219
220 }
221
Popular Tags