package org.archive.crawler.datamodel;

import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.StringList;
import org.archive.crawler.settings.TextField;

/**
 * Settings module describing how robots.txt rules should be honored.
 *
 * <p>The module defines four attributes: the policy type
 * ({@link #ATTR_TYPE}), a masquerade flag ({@link #ATTR_MASQUERADE}), a
 * custom robots.txt text ({@link #ATTR_CUSTOM_ROBOTS}) and a list of
 * alternate user-agents ({@link #ATTR_USER_AGENTS}). The policy type is
 * stored as a string setting and exposed to code as one of the integer
 * constants {@link #CLASSIC}, {@link #IGNORE}, {@link #CUSTOM},
 * {@link #MOST_FAVORED} or {@link #MOST_FAVORED_SET}.
 */
public class RobotsHonoringPolicy extends ModuleType {

    private static final long serialVersionUID = 8850011643923116605L;

    private static final Logger logger =
        Logger.getLogger(RobotsHonoringPolicy.class.getName());

    /** Policy type for the setting value "classic". */
    public static final int CLASSIC = 0;
    /** Policy type for the setting value "ignore". */
    public static final int IGNORE = 1;
    /** Policy type for the setting value "custom". */
    public static final int CUSTOM = 2;
    /** Policy type for the setting value "most-favored". */
    public static final int MOST_FAVORED = 3;
    /** Policy type for the setting value "most-favored-set". */
    public static final int MOST_FAVORED_SET = 4;

    /** Default name of this module. */
    public static final String ATTR_NAME = "robots-honoring-policy";
    /** Name of the attribute holding the policy type. */
    public static final String ATTR_TYPE = "type";
    /** Name of the attribute holding the masquerade flag. */
    public static final String ATTR_MASQUERADE = "masquerade";
    /** Name of the attribute holding the custom robots.txt text. */
    public static final String ATTR_CUSTOM_ROBOTS = "custom-robots";
    /** Name of the attribute holding the alternate user-agent list. */
    public static final String ATTR_USER_AGENTS = "user-agents";

    /**
     * Creates a policy module with the given name and registers its four
     * attributes together with their defaults (type "classic", masquerade
     * {@code false}, empty custom robots text).
     *
     * @param name the name of the module
     */
    public RobotsHonoringPolicy(String name) {
        super(name, "Robots honoring policy");

        String[] allowedTypes = new String[] {
                "classic", "ignore", "custom",
                "most-favored", "most-favored-set"};

        // Each fragment ends with a space so the concatenated description
        // does not run words together.
        addElementToDefinition(new SimpleType(ATTR_TYPE,
                "Policy type. The 'classic' policy simply obeys all " +
                "robots.txt rules for the configured user-agent. The " +
                "'ignore' policy ignores all robots rules. The 'custom' " +
                "policy allows you to specify a policy, in robots.txt " +
                "format, as a setting. The 'most-favored' policy will " +
                "crawl an URL if the robots.txt allows any user-agent to " +
                "crawl it. The 'most-favored-set' policy requires you " +
                "to supply a list of alternate user-agents, and for " +
                "every page, if any agent of the set is allowed, the " +
                "page will be crawled.", "classic", allowedTypes));
        addElementToDefinition(new SimpleType(ATTR_MASQUERADE,
                "Should we masquerade as another user agent when obeying " +
                "the rules declared for it. Only relevant if the " +
                "policy type is 'most-favored' or 'most-favored-set'.",
                Boolean.FALSE));
        addElementToDefinition(new SimpleType(ATTR_CUSTOM_ROBOTS,
                "Custom robots to use if policy type is 'custom'. " +
                "Compose as if an actual robots.txt file.",
                new TextField("")));
        addElementToDefinition(new StringList(ATTR_USER_AGENTS,
                "Alternate user-agent values to consider using for " +
                "the 'most-favored-set' policy."));
    }

    /**
     * Creates a policy module using the default name {@link #ATTR_NAME}.
     */
    public RobotsHonoringPolicy() {
        this(ATTR_NAME);
    }

    /**
     * Returns the configured alternate user-agents.
     *
     * @param settings the settings object used as lookup context
     * @return the user-agent list when the policy type is
     *         {@link #MOST_FAVORED_SET}; {@code null} for any other policy
     *         type or when the attribute cannot be found
     */
    public StringList getUserAgents(CrawlerSettings settings) {
        if (isType(settings, RobotsHonoringPolicy.MOST_FAVORED_SET)) {
            try {
                return (StringList) getAttribute(settings, ATTR_USER_AGENTS);
            } catch (AttributeNotFoundException e) {
                // Pass the exception itself so the stack trace is kept.
                logger.log(Level.SEVERE, e.getMessage(), e);
            }
        }
        return null;
    }

    /**
     * Reads the masquerade flag for the given URI.
     *
     * @param curi the URI used as lookup context
     * @return the value of the {@link #ATTR_MASQUERADE} attribute, or
     *         {@code false} when the attribute cannot be found
     */
    public boolean shouldMasquerade(CrawlURI curi) {
        try {
            return ((Boolean) getAttribute(curi, ATTR_MASQUERADE))
                    .booleanValue();
        } catch (AttributeNotFoundException e) {
            logger.log(Level.SEVERE, e.getMessage(), e);
            return false;
        }
    }

    /**
     * Returns the custom robots.txt text.
     *
     * @param settings the settings object used as lookup context
     * @return the string form of the {@link #ATTR_CUSTOM_ROBOTS} attribute
     *         when the policy type is {@link #CUSTOM}; {@code null} for any
     *         other policy type or when the attribute cannot be found
     */
    public String getCustomRobots(CrawlerSettings settings) {
        if (isType(settings, RobotsHonoringPolicy.CUSTOM)) {
            try {
                return getAttribute(settings, ATTR_CUSTOM_ROBOTS).toString();
            } catch (AttributeNotFoundException e) {
                logger.log(Level.SEVERE, e.getMessage(), e);
            }
        }
        return null;
    }

    /**
     * Resolves the configured policy type to its integer constant.
     *
     * @param context the object handed to the attribute lookup as context
     *        (callers in this class pass a {@code CrawlerSettings})
     * @return one of {@link #CLASSIC}, {@link #IGNORE}, {@link #CUSTOM},
     *         {@link #MOST_FAVORED} or {@link #MOST_FAVORED_SET};
     *         {@link #CLASSIC} when the attribute cannot be found
     * @throws IllegalArgumentException if the stored value is {@code null}
     *         or is not one of the recognized type names
     */
    public int getType(Object context) {
        int type = CLASSIC;
        try {
            String typeName = (String) getAttribute(context, ATTR_TYPE);
            // Constant-first equals keeps a null value from raising a bare
            // NullPointerException; it falls through to the final branch.
            if ("classic".equals(typeName)) {
                type = RobotsHonoringPolicy.CLASSIC;
            } else if ("ignore".equals(typeName)) {
                type = RobotsHonoringPolicy.IGNORE;
            } else if ("custom".equals(typeName)) {
                type = RobotsHonoringPolicy.CUSTOM;
            } else if ("most-favored".equals(typeName)) {
                type = RobotsHonoringPolicy.MOST_FAVORED;
            } else if ("most-favored-set".equals(typeName)) {
                type = RobotsHonoringPolicy.MOST_FAVORED_SET;
            } else {
                throw new IllegalArgumentException(
                        "Unknown robots honoring policy type: " + typeName);
            }
        } catch (AttributeNotFoundException e) {
            logger.log(Level.SEVERE, e.getMessage(), e);
        }
        return type;
    }

    /**
     * Checks whether the configured policy type equals the given one.
     *
     * @param o the object handed to {@link #getType(Object)} as context
     * @param type the policy type constant to compare against
     * @return {@code true} if {@code type} equals the configured type
     */
    public boolean isType(Object o, int type) {
        return type == getType(o);
    }

}