KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > javacoding > jspider > core > impl > SpiderContextImpl


1 /**
2  * $Id: SpiderContextImpl.java,v 1.32 2003/04/10 16:19:05 vanrogu Exp $
3  */

4 package net.javacoding.jspider.core.impl;
5
6
7 import net.javacoding.jspider.Constants;
8 import net.javacoding.jspider.spi.Rule;
9 import net.javacoding.jspider.api.event.site.UserAgentObeyedEvent;
10 import net.javacoding.jspider.api.model.Cookie;
11 import net.javacoding.jspider.api.model.Site;
12 import net.javacoding.jspider.core.Agent;
13 import net.javacoding.jspider.core.SpiderContext;
14 import net.javacoding.jspider.core.dispatch.EventDispatcher;
15 import net.javacoding.jspider.core.logging.Log;
16 import net.javacoding.jspider.core.logging.LogFactory;
17 import net.javacoding.jspider.core.model.SiteInternal;
18 import net.javacoding.jspider.core.rule.*;
19 import net.javacoding.jspider.core.rule.impl.*;
20 import net.javacoding.jspider.core.storage.Storage;
21 import net.javacoding.jspider.core.throttle.Throttle;
22 import net.javacoding.jspider.core.throttle.ThrottleFactory;
23 import net.javacoding.jspider.core.util.Base64Encoder;
24 import net.javacoding.jspider.core.util.URLUtil;
25 import net.javacoding.jspider.core.util.config.*;
26 import net.javacoding.jspider.core.util.http.CookieUtil;
27
28 import java.io.IOException JavaDoc;
29 import java.io.InputStream JavaDoc;
30 import java.net.URL JavaDoc;
31 import java.net.URLConnection JavaDoc;
32 import java.util.HashMap JavaDoc;
33 import java.util.Map JavaDoc;
34
35
36 /**
37  *
38  * $Id: SpiderContextImpl.java,v 1.32 2003/04/10 16:19:05 vanrogu Exp $
39  *
40  * @author Günther Van Roey
41  */

42 public class SpiderContextImpl implements SpiderContext {
43
44     protected Agent agent;
45     protected URL JavaDoc baseURL;
46     protected EventDispatcher eventDispatcher;
47     protected ThrottleFactory throttleFactory;
48     protected Map JavaDoc throttles;
49     protected Map JavaDoc spiderRules;
50     protected Map JavaDoc parserRules;
51     protected Map JavaDoc robotsTXTRules;
52     protected Storage storage;
53     protected CookieUtil cookieUtil;
54     protected String JavaDoc authenticationString;
55     protected boolean useProxyAuthentication;
56     protected boolean useProxy;
57     protected Ruleset generalSpiderRules;
58     protected Ruleset generalParserRules;
59     protected String JavaDoc defaultUserAgent;
60     protected Log log;
61
62     public SpiderContextImpl(URL JavaDoc baseURL, EventDispatcher eventDispatcher, ThrottleFactory throttleFactory, Storage storage) {
63         this.baseURL = URLUtil.normalize(baseURL);
64         this.eventDispatcher = eventDispatcher;
65         this.throttleFactory = throttleFactory;
66         this.storage = storage;
67         this.cookieUtil = new CookieUtil();
68         this.throttles = new HashMap JavaDoc();
69         this.spiderRules = new HashMap JavaDoc ( );
70         this.parserRules = new HashMap JavaDoc ( );
71         this.robotsTXTRules = new HashMap JavaDoc ( );
72         this.generalSpiderRules = RuleFactory.createGeneralSpiderRules();
73         this.generalParserRules = RuleFactory.createGeneralParserRules();
74         this.log = LogFactory.getLog(SpiderContext.class);
75
76         PropertySet props = ConfigurationFactory.getConfiguration().getJSpiderConfiguration();
77
78         this.defaultUserAgent = props.getString(ConfigConstants.CONFIG_USERAGENT, Constants.USERAGENT );
79         log.info("default user Agent is '" + defaultUserAgent + "'");
80
81         PropertySet proxyProps = new MappedPropertySet ( ConfigConstants.CONFIG_PROXY, props );
82
83         useProxy = proxyProps.getBoolean(ConfigConstants.CONFIG_PROXY_USE, false);
84         if (useProxy) {
85
86             String JavaDoc proxyHost = proxyProps.getString(ConfigConstants.CONFIG_PROXY_HOST, "");
87             String JavaDoc proxyPort = proxyProps.getString(ConfigConstants.CONFIG_PROXY_PORT, "");
88
89             System.setProperty("http.proxyHost", proxyHost);
90             System.setProperty("http.proxyPort", proxyPort);
91
92             log.info ("Using proxy " + proxyHost + ":" + proxyPort );
93
94             useProxyAuthentication = proxyProps.getBoolean(ConfigConstants.CONFIG_PROXY_AUTHENTICATE, false);
95             if (useProxyAuthentication) {
96                 String JavaDoc proxyUser = proxyProps.getString(ConfigConstants.CONFIG_PROXY_USERNAME, "");
97                 String JavaDoc proxyPwd = proxyProps.getString(ConfigConstants.CONFIG_PROXY_PASSWORD, "");
98                 String JavaDoc plain = proxyUser + ":" + proxyPwd;
99                 authenticationString = "Basic " + Base64Encoder.base64Encode(plain);
100                 log.info("Authenticating against proxy, user:" + proxyUser);
101             }
102         }
103     }
104
105     public void setAgent(Agent agent) {
106         this.agent = agent;
107     }
108
109     public synchronized void setCookies(Site site, Cookie[] cookies) {
110         if (cookies != null && cookies.length > 0) {
111           storage.getCookieDAO().save(site, cookies);
112         }
113     }
114
115     public void preHandle(URLConnection JavaDoc connection, Site site) {
116         connection.setDefaultUseCaches(false);
117         connection.setUseCaches(false);
118         connection.setRequestProperty("Cache-Control","max-age=0,no-cache");
119         connection.setRequestProperty("Pragma","no-cache");
120
121         if (useProxyAuthentication) {
122             connection.setRequestProperty("Proxy-Authorization", authenticationString);
123         }
124
125         String JavaDoc cookieString = site.getCookieString();
126         boolean useCookies = site.getUseCookies();
127         if (useCookies && cookieString != null) {
128             connection.setRequestProperty("Cookie", cookieString);
129         }
130     }
131
132     public void postHandle(URLConnection JavaDoc connection, Site site) {
133         setCookies(site, cookieUtil.getCookies(connection));
134         storage.getSiteDAO().save(site);
135     }
136
137     public Agent getAgent() {
138         return agent;
139     }
140
141     public URL JavaDoc getBaseURL() {
142         return baseURL;
143     }
144
145     public EventDispatcher getEventDispatcher() {
146         return eventDispatcher;
147     }
148
149     public void throttle(Site site) {
150         Throttle throttle = null;
151
152         throttle = (Throttle) throttles.get(site.getHost());
153         if (throttle == null) {
154             throttle = throttleFactory.createThrottle(site);
155             throttles.put(site.getHost(), throttle);
156         }
157         throttle.throttle();
158     }
159
160     public Ruleset getGeneralSpiderRules ( ) {
161         return generalSpiderRules;
162     }
163
164     public Ruleset getGeneralParserRules ( ) {
165         return generalParserRules;
166     }
167
168     public Ruleset getSiteSpiderRules(Site site) {
169         Ruleset ruleSet =(Ruleset)spiderRules.get(site);
170         if ( ruleSet == null) {
171             return generalSpiderRules;
172         } else {
173             return ruleSet;
174         }
175     }
176
177     public Rule getSiteRobotsTXTRule(Site site) {
178         Rule rule = (Rule) robotsTXTRules.get(site);
179         if ( rule == null ) {
180             rule = new RobotsTXTSkippedRule();
181         }
182         return rule;
183     }
184
185     public Ruleset getSiteParserRules(Site site) {
186         Ruleset ruleSet =(Ruleset)parserRules.get(site);
187         if ( ruleSet == null) {
188             return generalParserRules;
189         } else {
190             return ruleSet;
191         }
192     }
193
194     public Storage getStorage() {
195         return storage;
196     }
197
198     public void registerRobotsTXT(Site site, InputStream JavaDoc inputStream) {
199         try {
200             RobotsTXTRule robotsTxtRule = new RobotsTXTRule(defaultUserAgent, inputStream);
201             ((Ruleset)spiderRules.get(site)).addRule(robotsTxtRule);
202             robotsTXTRules.put(site, robotsTxtRule);
203             eventDispatcher.dispatch(new UserAgentObeyedEvent(site, robotsTxtRule.getObeyedUserAgent()));
204         } catch (IOException JavaDoc e) {
205             log.error("i/o exception while reading robots.txt", e);
206         }
207     }
208
209     public void registerRobotsTXTError(Site site) {
210         RobotsTXTErrorRule robotsTxtRule = new RobotsTXTErrorRule();
211         ((Ruleset)spiderRules.get(site)).addRule(robotsTxtRule);
212         robotsTXTRules.put(site, robotsTxtRule);
213     }
214
215     public void registerRobotsTXTSkipped(Site site) {
216         RobotsTXTSkippedRule robotsTxtRule = new RobotsTXTSkippedRule();
217         ((Ruleset)spiderRules.get(site)).addRule(robotsTxtRule);
218         robotsTXTRules.put(site, robotsTxtRule);
219     }
220
221     public void registerNewSite(Site site) {
222         SiteInternal sitei = (SiteInternal) site;
223
224         sitei.setBaseSite(URLUtil.getSiteURL(baseURL).equals(site.getURL()));
225
226         PropertySet siteProps = ConfigurationFactory.getConfiguration().getSiteConfiguration(site);
227         sitei.setUseCookies(siteProps.getBoolean(ConfigConstants.SITE_COOKIES_USE, true));
228         sitei.setUseProxy (siteProps.getBoolean(ConfigConstants.SITE_PROXY_USE, true));
229         sitei.setObeyRobotsTXT (siteProps.getBoolean(ConfigConstants.SITE_ROBOTSTXT_OBEY, true));
230         sitei.setFetchRobotsTXT (siteProps.getBoolean(ConfigConstants.SITE_ROBOTSTXT_FETCH, true));
231         sitei.setUserAgent(siteProps.getString(ConfigConstants.SITE_USERAGENT, defaultUserAgent));
232         sitei.setHandle(siteProps.getBoolean(ConfigConstants.SITE_HANDLE, false));
233
234         if ( sitei.mustHandle () ) {
235
236         log.info("using userAgent '" + sitei.getUserAgent() + "' for site '" + site.getURL() + "'");
237
238         if ((!siteProps.getBoolean(ConfigConstants.SITE_PROXY_USE, true)) && getUseProxy()) {
239             log.info("Using no proxy for " + site.getURL());
240             String JavaDoc nonProxyHosts = System.getProperty("http.nonProxyHosts");
241             if (nonProxyHosts == null) {
242                 nonProxyHosts = site.getURL().getHost();
243             } else {
244                 nonProxyHosts += "|" + site.getURL().getHost();
245             }
246             System.setProperty("http.nonProxyHosts", nonProxyHosts);
247         } else {
248             if (getUseProxy()) {
249                 log.info("Using proxy for " + site.getURL());
250             }
251         }
252         } else {
253             log.info("site " + sitei.getURL() + " must not be handled.");
254         }
255
256         spiderRules.put(site, RuleFactory.createSiteSpiderRules(site));
257         parserRules.put(site, RuleFactory.createSiteParserRules(site));
258     }
259
260     public boolean getUseProxy() {
261         return useProxy;
262     }
263
264     public String JavaDoc getUserAgent() {
265         return defaultUserAgent;
266     }
267 }
268
Popular Tags