package net.javacoding.jspider.core.rule.impl;


import net.javacoding.jspider.api.model.Decision;
import net.javacoding.jspider.api.model.Site;
import net.javacoding.jspider.core.SpiderContext;
import net.javacoding.jspider.core.model.DecisionInternal;
import net.javacoding.jspider.core.model.SiteInternal;
import net.javacoding.jspider.core.util.URLUtil;
import net.javacoding.jspider.core.util.html.RobotsTXTLine;
import net.javacoding.jspider.core.util.html.RobotsTXTLineSet;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;


/**
 * Rule that forbids access to URLs matched by the robots.txt lines that were
 * selected for a given user agent.
 *
 * The lines are read once, at construction time, from the supplied stream.
 * They are only enforced for sites whose stored configuration says that
 * robots.txt must be obeyed.
 */
public class RobotsTXTRule extends BaseRuleImpl {

    /** The user agent string this rule was created for (as passed to the constructor). */
    protected String effectiveUserAgent;

    /**
     * The user agent reported by the line set that was selected from the
     * robots.txt data, or <code>null</code> when no line set was found.
     */
    protected String obeyedUserAgent;

    /** The robots.txt lines checked against each URL; empty when no line set was found. */
    protected RobotsTXTLine[] forbiddenPaths;


    /**
     * Creates the rule from robots.txt data.
     *
     * The stream is handed to <code>RobotsTXTLineSet.findLineSet</code>; this
     * constructor does not close it itself.
     *
     * @param userAgent the user agent to select a line set for
     * @param is stream from which the robots.txt content is read
     * @throws IOException propagated from reading the line set
     */
    public RobotsTXTRule(String userAgent, InputStream is) throws IOException {
        RobotsTXTLineSet lineSet = RobotsTXTLineSet.findLineSet(is, userAgent);
        this.effectiveUserAgent = userAgent;
        if (lineSet == null) {
            // No applicable line set: nothing is forbidden by this rule.
            this.obeyedUserAgent = null;
            this.forbiddenPaths = new RobotsTXTLine[0];
        } else {
            this.obeyedUserAgent = lineSet.getUserAgent();
            this.forbiddenPaths = lineSet.getLines();
        }
    }


    /**
     * Returns the user agent of the line set this rule obeys.
     *
     * @return the user agent taken from the selected line set, or
     *         <code>null</code> if no line set was found at construction time
     */
    public String getObeyedUserAgent() {
        return obeyedUserAgent;
    }


    /**
     * Decides whether the given URL may be accessed.
     *
     * The site is looked up in storage from the URL itself (via
     * <code>URLUtil.getSiteURL</code>); the <code>currentSite</code> argument
     * is not read by this implementation. The robots.txt lines are checked
     * only when that site is flagged to obey robots.txt.
     *
     * @param context spider context giving access to the site storage
     * @param currentSite not used by this rule
     * @param url the URL to evaluate
     * @return a <code>Decision.RULE_FORBIDDEN</code> decision when one of the
     *         robots.txt lines matches the URL; otherwise a
     *         default-constructed <code>DecisionInternal</code>
     */
    public Decision apply(SpiderContext context, Site currentSite, URL url) {
        String path = url.getPath();
        Decision decision = new DecisionInternal();

        // NOTE(review): the result of find(...) is dereferenced without a null
        // check - presumably the site is always registered before rules are
        // applied; confirm against the SiteDAO contract.
        if ((context.getStorage().getSiteDAO().find(URLUtil.getSiteURL(url))).getObeyRobotsTXT()) {

            for (int i = 0; i < forbiddenPaths.length; i++) {
                RobotsTXTLine forbiddenPath = forbiddenPaths[i];
                if (forbiddenPath.matches(url)) {
                    // First matching line wins; no need to inspect the rest.
                    decision = new DecisionInternal(Decision.RULE_FORBIDDEN, "access to '" + path + "' forbidden");
                    break;
                }
            }
        }
        return decision;
    }

}