


package net.javacoding.jspider.core.rule.impl;

import net.javacoding.jspider.api.model.Decision;
import net.javacoding.jspider.api.model.Site;
import net.javacoding.jspider.core.SpiderContext;
import net.javacoding.jspider.core.model.DecisionInternal;
import net.javacoding.jspider.core.model.SiteInternal;
import net.javacoding.jspider.core.util.URLUtil;
import net.javacoding.jspider.core.util.html.RobotsTXTLine;
import net.javacoding.jspider.core.util.html.RobotsTXTLineSet;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;

/**
 * Rule implementation that applies the rules expressed by a site's robots.txt
 * file to the resources we want to fetch on that site.
 * The robots.txt file allows webmasters to exclude certain resources and folders
 * from being spidered by web robots, to disallow inclusion in search engines, etc.
 *
 * $Id: RobotsTXTRule.java,v 1.13 2003/03/28 17:26:28 vanrogu Exp $
 *
 * @author Günther Van Roey
 */

public class RobotsTXTRule extends BaseRuleImpl {

    /** user agent under which we're operating. */
    protected String effectiveUserAgent;

    /** user agent in the robots.txt file we're obeying */
    protected String obeyedUserAgent;

    /** all lines in the robots.txt file that apply to us and forbid access to a part of the site. */
    protected RobotsTXTLine[] forbiddenPaths;

    /**
     * Public constructor.
     * @param userAgent the user agent under which we're operating
     * @param is the input stream to read the robots.txt file from
     * @throws IOException in case something goes wrong reading the robots.txt file
     */

    public RobotsTXTRule(String userAgent, InputStream is) throws IOException {
        RobotsTXTLineSet lineSet = RobotsTXTLineSet.findLineSet(is, userAgent);
        this.effectiveUserAgent = userAgent;
        if (lineSet == null) {
            // no record in the robots.txt file applies to our user agent - nothing is forbidden
            this.obeyedUserAgent = null;
            forbiddenPaths = new RobotsTXTLine[0];
        } else {
            this.obeyedUserAgent = lineSet.getUserAgent();
            forbiddenPaths = lineSet.getLines();
        }
    }

    /**
     * Returns the user agent from robots.txt we're obeying (can be '*').
     * This user agent identification is the first matching record we encountered
     * in the file; a record matches if our effective user agent contains its
     * user agent identification as a substring, compared case-insensitively.
     * @return the user agent selector we're obeying.
     */

    public String getObeyedUserAgent() {
        return obeyedUserAgent;
    }

    /**
     * Applies the rule to a given URL.
     * @param context the spider context we're working in
     * @param currentSite the site we're spidering
     * @param url the URL of the resource to be tested for spider permission
     * @return Decision object expressing this rule's decision on the resource
     */

    public Decision apply(SpiderContext context, Site currentSite, URL url) {
        String path = url.getPath();
        Decision decision = new DecisionInternal();

        // only enforce robots.txt if the site is configured to obey it
        if ((context.getStorage().getSiteDAO().find(URLUtil.getSiteURL(url))).getObeyRobotsTXT()) {

            // forbid the resource as soon as one applicable disallow line matches the URL
            for (int i = 0; i < forbiddenPaths.length; i++) {
                RobotsTXTLine forbiddenPath = forbiddenPaths[i];
                if (forbiddenPath.matches(url)) {
                    decision = new DecisionInternal(Decision.RULE_FORBIDDEN, "access to '" + path + "' forbidden");
                    break;
                }
            }
        }
        return decision;
    }

}
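
For context, here is a minimal sketch of how this rule might be exercised on its own: it feeds a hand-written robots.txt body to the constructor and prints the user agent record the rule ends up obeying. The robots.txt content, the "JSpider" user agent string, and the RobotsTXTRuleDemo class name are all assumptions made for illustration; inside JSpider the engine constructs the rule from the site's real robots.txt and invokes apply() as shown above.

import java.io.ByteArrayInputStream;
import java.io.InputStream;

import net.javacoding.jspider.core.rule.impl.RobotsTXTRule;

public class RobotsTXTRuleDemo {

    public static void main(String[] args) throws Exception {
        // Hand-written robots.txt content, invented purely for illustration.
        String robotsTxt =
                "User-agent: *\n" +
                "Disallow: /private/\n" +
                "Disallow: /cgi-bin/\n";
        InputStream is = new ByteArrayInputStream(robotsTxt.getBytes());

        // The user agent string is an assumption; JSpider supplies its own at runtime.
        RobotsTXTRule rule = new RobotsTXTRule("JSpider", is);

        // Per the Javadoc above, the obeyed user agent can be '*' when only a
        // wildcard record is present in the file.
        System.out.println("obeyed user agent: " + rule.getObeyedUserAgent());
    }
}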