KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > opensymphony > module > sitemesh > mapper > RobotDecoratorMapper


/*
 * Title: RobotDecoratorMapper
 * Description:
 *
 * This software is published under the terms of the OpenSymphony Software
 * License version 1.1, of which a copy has been included with this
 * distribution in the LICENSE.txt file.
 */

9
10 package com.opensymphony.module.sitemesh.mapper;
11
12 import com.opensymphony.module.sitemesh.Config;
13 import com.opensymphony.module.sitemesh.Decorator;
14 import com.opensymphony.module.sitemesh.DecoratorMapper;
15 import com.opensymphony.module.sitemesh.Page;
16 import com.opensymphony.module.sitemesh.RequestConstants;
17
18 import javax.servlet.http.HttpServletRequest JavaDoc;
19 import javax.servlet.http.HttpSession JavaDoc;
20 import java.util.Properties JavaDoc;
21
22 /**
23  * The RobotDecoratorMapper will use the specified decorator when the requester
24  * is identified as a robot (also known as spider, crawler, ferret) of a search engine.
25  *
26  * <p>The name of this decorator should be supplied in the <code>decorator</code>
27  * property.</p>
28  *
29  * @author <a HREF="mailto:pathos@pandora.be">Mathias Bogaert</a>
30  * @version $Revision: 1.2 $
31  *
32  * @see com.opensymphony.module.sitemesh.DecoratorMapper
33  */

34 public final class RobotDecoratorMapper extends AbstractDecoratorMapper {
35     private String JavaDoc decoratorName = null;
36
37     /** All known robot hosts (list can be found <a HREF="http://www.spiderhunter.com">here</a>). */
38     private static final String JavaDoc[] botHosts = {"alltheweb.com", "alta-vista.net", "altavista.com",
39                                               "atext.com", "euroseek.net", "excite.com",
40                                               "fast-search.net", "google.com", "googlebot.com",
41                                               "infoseek.co.jp", "infoseek.com", "inktomi.com",
42                                               "inktomisearch.com", "linuxtoday.com.au", "lycos.com",
43                                               "lycos.com", "northernlight.com", "pa-x.dec.com"};
44
45     /**
46      * All known robot user-agent headers (list can be found
47      * <a HREF="http://www.robotstxt.org/wc/active.html">here</a>).
48      *
49      * <p>NOTE: To avoid bad detection:</p>
50      *
51      * <ul>
52      * <li>Robots with ID of 2 letters only were removed</li>
53      * <li>Robot called "webs" were removed</li>
54      * <li>directhit was changed in direct_hit (its real id)</li>
55      * </ul>
56      */

57     private static final String JavaDoc[] botAgents = {
58         "acme.spider", "ahoythehomepagefinder", "alkaline", "appie", "arachnophilia",
59         "architext", "aretha", "ariadne", "aspider", "atn.txt", "atomz", "auresys",
60         "backrub", "bigbrother", "bjaaland", "blackwidow", "blindekuh", "bloodhound",
61         "brightnet", "bspider", "cactvschemistryspider", "calif", "cassandra",
62         "cgireader", "checkbot", "churl", "cmc", "collective", "combine", "conceptbot",
63         "core", "cshkust", "cusco", "cyberspyder", "deweb", "dienstspider", "diibot",
64         "direct_hit", "dnabot", "download_express", "dragonbot", "dwcp", "ebiness",
65         "eit", "emacs", "emcspider", "esther", "evliyacelebi", "fdse", "felix",
66         "ferret", "fetchrover", "fido", "finnish", "fireball", "fish", "fouineur",
67         "francoroute", "freecrawl", "funnelweb", "gazz", "gcreep", "getbot", "geturl",
68         "golem", "googlebot", "grapnel", "griffon", "gromit", "gulliver", "hambot",
69         "harvest", "havindex", "hometown", "wired-digital", "htdig", "htmlgobble",
70         "hyperdecontextualizer", "ibm", "iconoclast", "ilse", "imagelock", "incywincy",
71         "informant", "infoseek", "infoseeksidewinder", "infospider", "inspectorwww",
72         "intelliagent", "iron33", "israelisearch", "javabee", "jcrawler", "jeeves",
73         "jobot", "joebot", "jubii", "jumpstation", "katipo", "kdd", "kilroy",
74         "ko_yappo_robot", "labelgrabber.txt", "larbin", "legs", "linkscan",
75         "linkwalker", "lockon", "logo_gif", "lycos", "macworm", "magpie", "mediafox",
76         "merzscope", "meshexplorer", "mindcrawler", "moget", "momspider", "monster",
77         "motor", "muscatferret", "mwdsearch", "myweb", "netcarta", "netmechanic",
78         "netscoop", "newscan-online", "nhse", "nomad", "northstar", "nzexplorer",
79         "occam", "octopus", "orb_search", "packrat", "pageboy", "parasite", "patric",
80         "perignator", "perlcrawler", "phantom", "piltdownman", "pioneer", "pitkow",
81         "pjspider", "pka", "plumtreewebaccessor", "poppi", "portalb", "puu", "python",
82         "raven", "rbse", "resumerobot", "rhcs", "roadrunner", "robbie", "robi",
83         "roverbot", "safetynetrobot", "scooter", "search_au", "searchprocess",
84         "senrigan", "sgscout", "shaggy", "shaihulud", "sift", "simbot", "site-valet",
85         "sitegrabber", "sitetech", "slurp", "smartspider", "snooper", "solbot",
86         "spanner", "speedy", "spider_monkey", "spiderbot", "spiderman", "spry",
87         "ssearcher", "suke", "sven", "tach_bw", "tarantula", "tarspider", "tcl",
88         "techbot", "templeton", "titin", "titan", "tkwww", "tlspider", "ucsd",
89         "udmsearch", "urlck", "valkyrie", "victoria", "visionsearch", "voyager",
90         "vwbot", "w3index", "w3m2", "wanderer", "webbandit", "webcatcher", "webcopy",
91         "webfetcher", "webfoot", "weblayers", "weblinker", "webmirror", "webmoose",
92         "webquest", "webreader", "webreaper", "websnarf", "webspider", "webvac",
93         "webwalk", "webwalker", "webwatch", "wget", "whowhere", "wmir", "wolp",
94         "wombat", "worm", "wwwc", "wz101", "xget", "nederland.zoek"
95     };
96
97     public void init(Config config, Properties JavaDoc properties, DecoratorMapper parent) throws InstantiationException JavaDoc {
98         super.init(config, properties, parent);
99         decoratorName = properties.getProperty("decorator");
100     }
101
102     public Decorator getDecorator(HttpServletRequest JavaDoc request, Page page) {
103         Decorator result = null;
104
105         if (decoratorName != null && isBot(request)) {
106             result = getNamedDecorator(request, decoratorName);
107         }
108
109         return result == null ? super.getDecorator(request, page) : result;
110     }
111
112     /** Check if the current request came from a robot (also known as spider, crawler, ferret) */
113     private static boolean isBot(HttpServletRequest JavaDoc request) {
114         if (request == null) return false;
115
116         // force creation of a session
117
HttpSession JavaDoc session = request.getSession(true);
118
119         if (Boolean.FALSE.equals(session.getAttribute(RequestConstants.ROBOT))) {
120             return false;
121         }
122         else if (Boolean.TRUE.equals(session.getAttribute(RequestConstants.ROBOT))) {
123             // a key was found in the session indicating it is a robot
124
return true;
125         }
126         else {
127             if ("robots.txt".indexOf(request.getRequestURI()) != -1) {
128                 // there is a specific request for the robots.txt file, so we assume
129
// it must be a robot (only robots request robots.txt)
130

131                 // set a key in the session, so the next time we don't have to manually
132
// detect the robot again
133
session.setAttribute(RequestConstants.ROBOT, Boolean.TRUE);
134                 return true;
135             }
136             else {
137                 String JavaDoc userAgent = request.getHeader("User-Agent");
138
139                 if (userAgent != null && userAgent.trim().length() > 2) {
140                     // first check for common user-agent headers, so that we can speed
141
// this thing up, hopefully clever spiders will not send a fake header
142
if (userAgent.indexOf("MSIE") != -1 || userAgent.indexOf("Gecko") != -1 // MSIE and Mozilla
143
|| userAgent.indexOf("Opera") != -1 || userAgent.indexOf("iCab") != -1 // Opera and iCab (mac browser)
144
|| userAgent.indexOf("Konqueror") != -1 || userAgent.indexOf("KMeleon") != -1 // Konqueror and KMeleon
145
|| userAgent.indexOf("4.7") != -1 || userAgent.indexOf("Lynx") != -1) { // NS 4.78 and Lynx
146
// indicate this session is not a robot
147
session.setAttribute(RequestConstants.ROBOT, Boolean.FALSE);
148                         return false;
149                     }
150
151                     for (int i = 0; i < botAgents.length; i++) {
152                         if (userAgent.indexOf(botAgents[i]) != -1) {
153                             // set a key in the session, so the next time we don't have to manually
154
// detect the robot again
155
session.setAttribute(RequestConstants.ROBOT, Boolean.TRUE);
156                             return true;
157                         }
158                     }
159                 }
160
161                 // detect the robot from the host or user-agent
162
String JavaDoc remoteHost = request.getRemoteHost(); // requires one DNS lookup
163

164                 // if the DNS server didn't return a hostname, getRemoteHost returns the
165
// IP address, which is ignored here (the last char is checked, because some
166
// remote hosts begin with the IP)
167
if (remoteHost != null && remoteHost.length() > 0 && remoteHost.charAt(remoteHost.length() - 1) > 64) {
168                     for (int i = 0; i < botHosts.length; i++) {
169                         if (remoteHost.indexOf(botHosts[i]) != -1) {
170                             // set a key in the session, so the next time we don't have to manually
171
// detect the robot again
172
session.setAttribute(RequestConstants.ROBOT, Boolean.TRUE);
173                             return true;
174                         }
175                     }
176                 }
177
178                 // remote host and user agent are not in the predefined list,
179
// so it must be an unknown robot or not a robot
180

181                 // indicate this session is not a robot
182
session.setAttribute(RequestConstants.ROBOT, Boolean.FALSE);
183                 return false;
184             }
185         }
186     }
187 }
Popular Tags