1 4 package net.javacoding.jspider.core.impl; 5 6 7 import net.javacoding.jspider.Constants; 8 import net.javacoding.jspider.spi.Rule; 9 import net.javacoding.jspider.api.event.site.UserAgentObeyedEvent; 10 import net.javacoding.jspider.api.model.Cookie; 11 import net.javacoding.jspider.api.model.Site; 12 import net.javacoding.jspider.core.Agent; 13 import net.javacoding.jspider.core.SpiderContext; 14 import net.javacoding.jspider.core.dispatch.EventDispatcher; 15 import net.javacoding.jspider.core.logging.Log; 16 import net.javacoding.jspider.core.logging.LogFactory; 17 import net.javacoding.jspider.core.model.SiteInternal; 18 import net.javacoding.jspider.core.rule.*; 19 import net.javacoding.jspider.core.rule.impl.*; 20 import net.javacoding.jspider.core.storage.Storage; 21 import net.javacoding.jspider.core.throttle.Throttle; 22 import net.javacoding.jspider.core.throttle.ThrottleFactory; 23 import net.javacoding.jspider.core.util.Base64Encoder; 24 import net.javacoding.jspider.core.util.URLUtil; 25 import net.javacoding.jspider.core.util.config.*; 26 import net.javacoding.jspider.core.util.http.CookieUtil; 27 28 import java.io.IOException ; 29 import java.io.InputStream ; 30 import java.net.URL ; 31 import java.net.URLConnection ; 32 import java.util.HashMap ; 33 import java.util.Map ; 34 35 36 42 public class SpiderContextImpl implements SpiderContext { 43 44 protected Agent agent; 45 protected URL baseURL; 46 protected EventDispatcher eventDispatcher; 47 protected ThrottleFactory throttleFactory; 48 protected Map throttles; 49 protected Map spiderRules; 50 protected Map parserRules; 51 protected Map robotsTXTRules; 52 protected Storage storage; 53 protected CookieUtil cookieUtil; 54 protected String authenticationString; 55 protected boolean useProxyAuthentication; 56 protected boolean useProxy; 57 protected Ruleset generalSpiderRules; 58 protected Ruleset generalParserRules; 59 protected String defaultUserAgent; 60 protected Log log; 61 62 public SpiderContextImpl(URL baseURL, EventDispatcher eventDispatcher, ThrottleFactory throttleFactory, Storage storage) { 63 this.baseURL = URLUtil.normalize(baseURL); 64 this.eventDispatcher = eventDispatcher; 65 this.throttleFactory = throttleFactory; 66 this.storage = storage; 67 this.cookieUtil = new CookieUtil(); 68 this.throttles = new HashMap (); 69 this.spiderRules = new HashMap ( ); 70 this.parserRules = new HashMap ( ); 71 this.robotsTXTRules = new HashMap ( ); 72 this.generalSpiderRules = RuleFactory.createGeneralSpiderRules(); 73 this.generalParserRules = RuleFactory.createGeneralParserRules(); 74 this.log = LogFactory.getLog(SpiderContext.class); 75 76 PropertySet props = ConfigurationFactory.getConfiguration().getJSpiderConfiguration(); 77 78 this.defaultUserAgent = props.getString(ConfigConstants.CONFIG_USERAGENT, Constants.USERAGENT ); 79 log.info("default user Agent is '" + defaultUserAgent + "'"); 80 81 PropertySet proxyProps = new MappedPropertySet ( ConfigConstants.CONFIG_PROXY, props ); 82 83 useProxy = proxyProps.getBoolean(ConfigConstants.CONFIG_PROXY_USE, false); 84 if (useProxy) { 85 86 String proxyHost = proxyProps.getString(ConfigConstants.CONFIG_PROXY_HOST, ""); 87 String proxyPort = proxyProps.getString(ConfigConstants.CONFIG_PROXY_PORT, ""); 88 89 System.setProperty("http.proxyHost", proxyHost); 90 System.setProperty("http.proxyPort", proxyPort); 91 92 log.info ("Using proxy " + proxyHost + ":" + proxyPort ); 93 94 useProxyAuthentication = proxyProps.getBoolean(ConfigConstants.CONFIG_PROXY_AUTHENTICATE, false); 95 if (useProxyAuthentication) { 96 String proxyUser = proxyProps.getString(ConfigConstants.CONFIG_PROXY_USERNAME, ""); 97 String proxyPwd = proxyProps.getString(ConfigConstants.CONFIG_PROXY_PASSWORD, ""); 98 String plain = proxyUser + ":" + proxyPwd; 99 authenticationString = "Basic " + Base64Encoder.base64Encode(plain); 100 log.info("Authenticating against proxy, user:" + proxyUser); 101 } 102 } 103 } 104 105 public void setAgent(Agent agent) { 106 this.agent = agent; 107 } 108 109 public synchronized void setCookies(Site site, Cookie[] cookies) { 110 if (cookies != null && cookies.length > 0) { 111 storage.getCookieDAO().save(site, cookies); 112 } 113 } 114 115 public void preHandle(URLConnection connection, Site site) { 116 connection.setDefaultUseCaches(false); 117 connection.setUseCaches(false); 118 connection.setRequestProperty("Cache-Control","max-age=0,no-cache"); 119 connection.setRequestProperty("Pragma","no-cache"); 120 121 if (useProxyAuthentication) { 122 connection.setRequestProperty("Proxy-Authorization", authenticationString); 123 } 124 125 String cookieString = site.getCookieString(); 126 boolean useCookies = site.getUseCookies(); 127 if (useCookies && cookieString != null) { 128 connection.setRequestProperty("Cookie", cookieString); 129 } 130 } 131 132 public void postHandle(URLConnection connection, Site site) { 133 setCookies(site, cookieUtil.getCookies(connection)); 134 storage.getSiteDAO().save(site); 135 } 136 137 public Agent getAgent() { 138 return agent; 139 } 140 141 public URL getBaseURL() { 142 return baseURL; 143 } 144 145 public EventDispatcher getEventDispatcher() { 146 return eventDispatcher; 147 } 148 149 public void throttle(Site site) { 150 Throttle throttle = null; 151 152 throttle = (Throttle) throttles.get(site.getHost()); 153 if (throttle == null) { 154 throttle = throttleFactory.createThrottle(site); 155 throttles.put(site.getHost(), throttle); 156 } 157 throttle.throttle(); 158 } 159 160 public Ruleset getGeneralSpiderRules ( ) { 161 return generalSpiderRules; 162 } 163 164 public Ruleset getGeneralParserRules ( ) { 165 return generalParserRules; 166 } 167 168 public Ruleset getSiteSpiderRules(Site site) { 169 Ruleset ruleSet =(Ruleset)spiderRules.get(site); 170 if ( ruleSet == null) { 171 return generalSpiderRules; 172 } else { 173 return ruleSet; 174 } 175 } 176 177 public Rule getSiteRobotsTXTRule(Site site) { 178 Rule rule = (Rule) robotsTXTRules.get(site); 179 if ( rule == null ) { 180 rule = new RobotsTXTSkippedRule(); 181 } 182 return rule; 183 } 184 185 public Ruleset getSiteParserRules(Site site) { 186 Ruleset ruleSet =(Ruleset)parserRules.get(site); 187 if ( ruleSet == null) { 188 return generalParserRules; 189 } else { 190 return ruleSet; 191 } 192 } 193 194 public Storage getStorage() { 195 return storage; 196 } 197 198 public void registerRobotsTXT(Site site, InputStream inputStream) { 199 try { 200 RobotsTXTRule robotsTxtRule = new RobotsTXTRule(defaultUserAgent, inputStream); 201 ((Ruleset)spiderRules.get(site)).addRule(robotsTxtRule); 202 robotsTXTRules.put(site, robotsTxtRule); 203 eventDispatcher.dispatch(new UserAgentObeyedEvent(site, robotsTxtRule.getObeyedUserAgent())); 204 } catch (IOException e) { 205 log.error("i/o exception while reading robots.txt", e); 206 } 207 } 208 209 public void registerRobotsTXTError(Site site) { 210 RobotsTXTErrorRule robotsTxtRule = new RobotsTXTErrorRule(); 211 ((Ruleset)spiderRules.get(site)).addRule(robotsTxtRule); 212 robotsTXTRules.put(site, robotsTxtRule); 213 } 214 215 public void registerRobotsTXTSkipped(Site site) { 216 RobotsTXTSkippedRule robotsTxtRule = new RobotsTXTSkippedRule(); 217 ((Ruleset)spiderRules.get(site)).addRule(robotsTxtRule); 218 robotsTXTRules.put(site, robotsTxtRule); 219 } 220 221 public void registerNewSite(Site site) { 222 SiteInternal sitei = (SiteInternal) site; 223 224 sitei.setBaseSite(URLUtil.getSiteURL(baseURL).equals(site.getURL())); 225 226 PropertySet siteProps = ConfigurationFactory.getConfiguration().getSiteConfiguration(site); 227 sitei.setUseCookies(siteProps.getBoolean(ConfigConstants.SITE_COOKIES_USE, true)); 228 sitei.setUseProxy (siteProps.getBoolean(ConfigConstants.SITE_PROXY_USE, true)); 229 sitei.setObeyRobotsTXT (siteProps.getBoolean(ConfigConstants.SITE_ROBOTSTXT_OBEY, true)); 230 sitei.setFetchRobotsTXT (siteProps.getBoolean(ConfigConstants.SITE_ROBOTSTXT_FETCH, true)); 231 sitei.setUserAgent(siteProps.getString(ConfigConstants.SITE_USERAGENT, defaultUserAgent)); 232 sitei.setHandle(siteProps.getBoolean(ConfigConstants.SITE_HANDLE, false)); 233 234 if ( sitei.mustHandle () ) { 235 236 log.info("using userAgent '" + sitei.getUserAgent() + "' for site '" + site.getURL() + "'"); 237 238 if ((!siteProps.getBoolean(ConfigConstants.SITE_PROXY_USE, true)) && getUseProxy()) { 239 log.info("Using no proxy for " + site.getURL()); 240 String nonProxyHosts = System.getProperty("http.nonProxyHosts"); 241 if (nonProxyHosts == null) { 242 nonProxyHosts = site.getURL().getHost(); 243 } else { 244 nonProxyHosts += "|" + site.getURL().getHost(); 245 } 246 System.setProperty("http.nonProxyHosts", nonProxyHosts); 247 } else { 248 if (getUseProxy()) { 249 log.info("Using proxy for " + site.getURL()); 250 } 251 } 252 } else { 253 log.info("site " + sitei.getURL() + " must not be handled."); 254 } 255 256 spiderRules.put(site, RuleFactory.createSiteSpiderRules(site)); 257 parserRules.put(site, RuleFactory.createSiteParserRules(site)); 258 } 259 260 public boolean getUseProxy() { 261 return useProxy; 262 } 263 264 public String getUserAgent() { 265 return defaultUserAgent; 266 } 267 } 268 | Popular Tags |