package org.archive.crawler.datamodel;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.settings.CrawlerSettings;

/**
 * Decides whether a URI may be fetched according to a parsed robots.txt.
 *
 * <p>An instance holds the user-agent names found in a robots.txt file and,
 * for each of them, the list of disallowed path prefixes. Two shared
 * instances, {@link #ALLOWALL} and {@link #DENYALL}, short-circuit the
 * decision; they are recognized by identity and survive serialization as
 * the same objects (see {@link #readResolve()}).
 *
 * <p>Notes for maintainers:
 * <ul>
 * <li>{@link #disallows(CrawlURI, String)} updates {@code lastUsedUserAgent}
 *     and {@code userAgentsToTest} without synchronization.</li>
 * <li>{@code honoringPolicy} is transient, so it is {@code null} on a freshly
 *     deserialized instance until something assigns it again. The field is
 *     package-visible, presumably so an owner in this package can restore
 *     it - confirm against the caller.</li>
 * </ul>
 */
public class RobotsExclusionPolicy implements Serializable {

    private static final long serialVersionUID = 6323907991237383113L;

    private static final Logger logger =
        Logger.getLogger(RobotsExclusionPolicy.class.getName());

    private final static int NORMAL_TYPE = 0;
    private final static int ALLOWALL_TYPE = 1;
    private final static int DENYALL_TYPE = 2;

    /** Written and read by hand in writeObject/readObject, hence transient. */
    private transient int type = NORMAL_TYPE;

    /** Shared policy that disallows nothing. */
    public static RobotsExclusionPolicy ALLOWALL =
        new RobotsExclusionPolicy(ALLOWALL_TYPE);

    /** Shared policy that disallows everything. */
    public static RobotsExclusionPolicy DENYALL =
        new RobotsExclusionPolicy(DENYALL_TYPE);

    /** User-agent names from robots.txt, in the order the parser produced. */
    private LinkedList<String> userAgents = null;

    /** Disallowed path prefixes, keyed by robots.txt user-agent name. */
    private HashMap<String, List<String>> disallows = null;

    /** Not serialized; may be null (see class comment). */
    transient RobotsHonoringPolicy honoringPolicy = null;

    /** Agent string for which {@link #userAgentsToTest} was last computed. */
    private String lastUsedUserAgent = null;

    /** robots.txt sections to consult, in order; null when never computed. */
    private List<String> userAgentsToTest = null;

    /**
     * Parses a robots.txt stream and builds the matching policy.
     *
     * @param settings settings context handed to the honoring policy
     * @param reader source of the robots.txt content
     * @param honoringPolicy how robots.txt rules are to be honored
     * @return {@link #ALLOWALL} when parsing yields no disallow entries,
     *         otherwise a new policy for the parsed rules
     * @throws IOException if reading from {@code reader} fails
     */
    public static RobotsExclusionPolicy policyFor(CrawlerSettings settings,
            BufferedReader reader, RobotsHonoringPolicy honoringPolicy)
            throws IOException {
        LinkedList<String> userAgents = new LinkedList<String>();
        HashMap<String, List<String>> disallows =
            new HashMap<String, List<String>>();
        Robotstxt.parse(reader, userAgents, disallows);
        if (disallows.isEmpty()) {
            return ALLOWALL;
        }
        return new RobotsExclusionPolicy(settings, userAgents, disallows,
            honoringPolicy);
    }

    /**
     * Creates a policy from already-parsed robots.txt data.
     *
     * <p>For the MOST_FAVORED and MOST_FAVORED_SET honoring types the list
     * of sections to consult is fixed here. For CLASSIC and CUSTOM it is
     * computed per fetching agent in {@link #disallows(CrawlURI, String)}.
     *
     * @param settings settings context handed to the honoring policy
     * @param u user-agent names found in robots.txt
     * @param d disallowed path prefixes keyed by user-agent name
     * @param honoringPolicy honoring policy; when null nothing further is
     *        precomputed
     */
    public RobotsExclusionPolicy(CrawlerSettings settings, LinkedList<String> u,
            HashMap<String, List<String>> d,
            RobotsHonoringPolicy honoringPolicy) {
        userAgents = u;
        disallows = d;
        this.honoringPolicy = honoringPolicy;

        if (honoringPolicy == null) {
            return;
        }

        if (honoringPolicy.isType(settings, RobotsHonoringPolicy.MOST_FAVORED)) {
            // Every section is a candidate.
            userAgentsToTest = userAgents;
        } else if (honoringPolicy.isType(settings,
                RobotsHonoringPolicy.MOST_FAVORED_SET)) {
            // For each configured agent, keep the first section it matches.
            userAgentsToTest = new ArrayList<String>();
            Iterator<?> configuredAgents =
                honoringPolicy.getUserAgents(settings).iterator();
            while (configuredAgents.hasNext()) {
                // Lower-case before matching, exactly as disallows() does;
                // that method's matching implies the robots.txt agent names
                // are held in lower case (presumably by the parser - confirm).
                String configured =
                    ((String) configuredAgents.next()).toLowerCase();
                String section = firstMatchingSection(configured);
                if (section != null) {
                    userAgentsToTest.add(section);
                }
            }
        }
    }

    /**
     * Creates one of the special-purpose policies.
     *
     * @param type one of the private type constants of this class
     */
    public RobotsExclusionPolicy(int type) {
        this(null, null, null, null);
        this.type = type;
    }

    /**
     * Tells whether fetching the given URI is disallowed.
     *
     * <p>The candidate sections are consulted in order. The first section
     * that does not disallow the URI settles the answer as "allowed"; the
     * URI is disallowed only if every candidate section disallows it. With
     * no candidate sections at all the URI is allowed.
     *
     * <p>When the honoring policy asks for masquerading, the last section
     * name consulted (if non-empty) is set as the URI's user agent.
     *
     * @param curi the URI about to be fetched
     * @param userAgent the user-agent string the crawler identifies as
     * @return true if the URI must not be fetched
     */
    public boolean disallows(CrawlURI curi, String userAgent) {
        if (this == ALLOWALL) {
            return false;
        }
        if (this == DENYALL) {
            return true;
        }

        // With the CLASSIC or CUSTOM types the section depends on the
        // fetching agent; recompute only when that agent changes.
        if (honoringPolicy != null
                && (honoringPolicy.isType(curi, RobotsHonoringPolicy.CLASSIC)
                    || honoringPolicy.isType(curi, RobotsHonoringPolicy.CUSTOM))
                && (lastUsedUserAgent == null
                    || !lastUsedUserAgent.equals(userAgent))) {
            lastUsedUserAgent = userAgent;
            userAgentsToTest = new ArrayList<String>();
            String section = firstMatchingSection(userAgent.toLowerCase());
            if (section != null) {
                // Only the first matching section is considered.
                userAgentsToTest.add(section);
            }
        }

        boolean disallow = false;
        String ua = null;

        // A null candidate list (never computed) is treated like an empty one.
        if (userAgentsToTest != null && !userAgentsToTest.isEmpty()) {
            // The path does not change between sections: resolve it once.
            String pathQuery = pathQueryOf(curi);
            for (String candidate : userAgentsToTest) {
                ua = candidate;
                disallow = isDisallowedFor(candidate, pathQuery);
                if (!disallow) {
                    // This section lets us in; no need to look further.
                    break;
                }
            }
        }

        if (honoringPolicy != null
                && honoringPolicy.shouldMasquerade(curi)
                && ua != null && !ua.equals("")) {
            curi.setUserAgent(ua);
        }
        return disallow;
    }

    /**
     * Finds the first robots.txt user-agent name contained in the given
     * agent string.
     *
     * @param lowerCaseUserAgent agent string, already lower-cased
     * @return the first contained name, or null if there is none
     */
    private String firstMatchingSection(String lowerCaseUserAgent) {
        if (userAgents == null) {
            return null;
        }
        for (String section : userAgents) {
            if (lowerCaseUserAgent.contains(section)) {
                return section;
            }
        }
        return null;
    }

    /**
     * Checks one robots.txt section against a path.
     *
     * @param ua robots.txt user-agent name identifying the section
     * @param pathQuery path and query of the URI; null matches nothing
     * @return true if a disallowed prefix of the section matches the path
     */
    private boolean isDisallowedFor(String ua, String pathQuery) {
        List<String> paths = (disallows == null) ? null : disallows.get(ua);
        if (paths == null) {
            // No rules recorded for this section: nothing is disallowed.
            return false;
        }
        for (String disallowedPath : paths) {
            if (disallowedPath.length() == 0) {
                // An empty disallow entry ends the check with "allowed".
                return false;
            }
            if (pathQuery != null && pathQuery.startsWith(disallowedPath)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Reads the path-and-query part of a URI, logging instead of throwing.
     *
     * @param curi the URI to inspect
     * @return the path and query, or null if it could not be obtained
     */
    private static String pathQueryOf(CrawlURI curi) {
        try {
            return curi.getUURI().getPathQuery();
        } catch (URIException e) {
            logger.log(Level.SEVERE, "Failed getPathQuery from " + curi, e);
            return null;
        }
    }

    /**
     * Writes the type marker and, for normal policies only, the regular
     * serializable fields.
     *
     * @param stream the stream to write to
     * @throws IOException if writing fails
     */
    private void writeObject(ObjectOutputStream stream) throws IOException {
        stream.writeInt(type);
        if (type == NORMAL_TYPE) {
            stream.defaultWriteObject();
        }
    }

    /**
     * Reads back what {@link #writeObject(ObjectOutputStream)} wrote.
     *
     * @param stream the stream to read from
     * @throws IOException if reading fails
     * @throws ClassNotFoundException if a serialized class cannot be found
     */
    private void readObject(ObjectInputStream stream)
            throws IOException, ClassNotFoundException {
        type = stream.readInt();
        if (type == NORMAL_TYPE) {
            stream.defaultReadObject();
        }
    }

    /**
     * Replaces deserialized special-purpose policies by the shared
     * {@link #ALLOWALL} / {@link #DENYALL} instances, so that the identity
     * checks in {@link #disallows(CrawlURI, String)} keep working.
     *
     * @return this instance for a normal policy, the shared instance for a
     *         special one, or null for an unrecognized type marker
     */
    private Object readResolve() {
        if (type == NORMAL_TYPE) {
            return this;
        } else if (type == ALLOWALL_TYPE) {
            return ALLOWALL;
        } else if (type == DENYALL_TYPE) {
            return DENYALL;
        }
        return null;
    }

}