package net.javacoding.jspider.core.impl;


import net.javacoding.jspider.api.event.resource.*;
import net.javacoding.jspider.api.event.site.*;
import net.javacoding.jspider.api.model.*;
import net.javacoding.jspider.core.Agent;
import net.javacoding.jspider.core.SpiderContext;
import net.javacoding.jspider.core.dispatch.EventDispatcher;
import net.javacoding.jspider.core.event.CoreEvent;
import net.javacoding.jspider.core.event.CoreEventVisitor;
import net.javacoding.jspider.core.event.impl.*;
import net.javacoding.jspider.core.exception.SpideringDoneException;
import net.javacoding.jspider.core.exception.TaskAssignmentException;
import net.javacoding.jspider.core.logging.Log;
import net.javacoding.jspider.core.logging.LogFactory;
import net.javacoding.jspider.core.model.SiteInternal;
import net.javacoding.jspider.core.storage.Storage;
import net.javacoding.jspider.core.task.*;
import net.javacoding.jspider.core.task.work.*;
import net.javacoding.jspider.core.util.URLUtil;

import java.io.ByteArrayInputStream;
import java.net.URL;


/**
 * {@link Agent} implementation that also acts as the {@link CoreEventVisitor}
 * for core events.
 *
 * For each visited core event it updates the {@link Storage}, dispatches the
 * corresponding API events through the {@link EventDispatcher} and schedules
 * follow-up tasks on the {@link Scheduler}.
 *
 * The public entry points that hand out or accept work are
 * <code>synchronized</code> on this instance and use <code>wait()</code> /
 * <code>notifyAll()</code> to park and wake threads asking for tasks.
 */
public class AgentImpl implements Agent, CoreEventVisitor {

    protected Storage storage;
    protected SpiderContext context;
    protected EventDispatcher eventDispatcher;
    protected Scheduler scheduler;
    protected Log log;


    /**
     * Creates the agent, taking storage and event dispatcher from the given
     * context and creating a scheduler for it.
     *
     * @param context the spider context this agent works for
     */
    public AgentImpl(SpiderContext context) {
        this.context = context;
        this.storage = context.getStorage();
        this.eventDispatcher = context.getEventDispatcher();
        this.scheduler = new SchedulerFactory().createScheduler(context);

        log = LogFactory.getLog(Agent.class);
    }

    /**
     * Starts the agent by handling the context's base URL as a found URL
     * (without a referring URL) and waking up any waiting threads.
     */
    public synchronized void start() {
        URL baseURL = context.getBaseURL();
        visit(null, new URLFoundEvent(context, null, baseURL));
        notifyAll();
    }

    /**
     * Reports a finished task to the scheduler and wakes up waiting threads.
     *
     * @param task the task that has been completed
     */
    public synchronized void flagDone(WorkerTask task) {
        scheduler.flagDone(task);
        notifyAll();
    }

    /**
     * Returns the next thinker task, waiting on this agent's monitor while the
     * scheduler cannot assign one.
     *
     * @return the next thinker task handed out by the scheduler
     * @throws TaskAssignmentException a {@link SpideringDoneException} raised
     *         by the scheduler is passed on unchanged; the scheduler's pending
     *         assignment exception is rethrown when the waiting thread is
     *         interrupted (the interrupt status is restored first)
     */
    public synchronized WorkerTask getThinkerTask() throws TaskAssignmentException {
        while (true) {
            try {
                return scheduler.getThinkerTask();
            } catch (SpideringDoneException e) {
                throw e;
            } catch (TaskAssignmentException e) {
                try {
                    wait();
                } catch (InterruptedException e1) {
                    // Restore the flag and give up: looping back into wait()
                    // with the flag set would throw again immediately and spin
                    // without ever making progress.
                    Thread.currentThread().interrupt();
                    throw e;
                }
            }
        }
    }

    /**
     * Returns the next spider (fetch) task, waiting on this agent's monitor
     * while the scheduler cannot assign one.
     *
     * @return the next fetch task handed out by the scheduler
     * @throws TaskAssignmentException a {@link SpideringDoneException} raised
     *         by the scheduler is passed on unchanged; the scheduler's pending
     *         assignment exception is rethrown when the waiting thread is
     *         interrupted (the interrupt status is restored first)
     */
    public synchronized WorkerTask getSpiderTask() throws TaskAssignmentException {
        while (true) {
            try {
                return scheduler.getFethTask();
            } catch (SpideringDoneException e) {
                throw e;
            } catch (TaskAssignmentException e) {
                try {
                    wait();
                } catch (InterruptedException e1) {
                    // Restore the flag and give up: looping back into wait()
                    // with the flag set would throw again immediately and spin
                    // without ever making progress.
                    Thread.currentThread().interrupt();
                    throw e;
                }
            }
        }
    }

    /**
     * Schedules a spider task for the given URL, using the site looked up for
     * that URL in storage, and wakes up waiting threads.
     *
     * @param foundURL the URL to be spidered
     */
    public synchronized void scheduleForSpidering(URL foundURL) {
        URL siteURL = URLUtil.getSiteURL(foundURL);
        Site site = storage.getSiteDAO().find(siteURL);
        scheduler.schedule(new SpiderHttpURLTask(context, foundURL, site));
        notifyAll();
    }

    /**
     * Schedules an HTML interpretation task for the stored resource of the
     * given URL and wakes up waiting threads. The stored resource is cast to
     * {@link FetchedResource}.
     *
     * @param url the URL whose stored resource is to be parsed
     */
    public synchronized void scheduleForParsing(URL url) {
        scheduler.schedule(new InterpreteHTMLTask(context, (FetchedResource) storage.getResourceDAO().getResource(url)));
        notifyAll();
    }

    /**
     * Lets the event accept this agent as its visitor and then wakes up
     * waiting threads.
     *
     * @param url   the URL passed along to the event's accept call
     * @param event the core event to process
     */
    public synchronized void registerEvent(URL url, CoreEvent event) {
        event.accept(url, this);
        notifyAll();
    }


    /**
     * Fallback for core events without a dedicated visit method; only logs an
     * error.
     */
    public void visit(URL url, CoreEvent event) {
        log.error("ERROR -- UNHANDLED COREEVENT IN AGENT !!!");
    }

    /**
     * Marks the URL as spidered in storage, dispatches a
     * {@link ResourceFetchedEvent} and schedules the decision on parsing.
     */
    public void visit(URL url, URLSpideredOkEvent event) {
        storage.getResourceDAO().setSpidered(url, event);
        eventDispatcher.dispatch(new ResourceFetchedEvent(storage.getResourceDAO().getResource(url)));
        scheduler.schedule(new DecideOnParsingTask(context, url));
    }

    /**
     * Records the fetch error in storage and dispatches a
     * {@link ResourceFetchErrorEvent} carrying the event's HTTP status.
     */
    public void visit(URL url, URLSpideredErrorEvent event) {
        storage.getResourceDAO().setError(url, event);
        eventDispatcher.dispatch(new ResourceFetchErrorEvent(storage.getResourceDAO().getResource(url), event.getHttpStatus()));
    }

    /**
     * Marks the URL as parsed in storage and dispatches a
     * {@link ResourceParsedEvent}.
     */
    public void visit(URL url, ResourceParsedOkEvent event) {
        storage.getResourceDAO().setParsed(url, event);
        eventDispatcher.dispatch(new ResourceParsedEvent(storage.getResourceDAO().getResource(url)));
    }

    /**
     * Records the parse error in storage.
     */
    public void visit(URL url, ResourceParsedErrorEvent event) {
        storage.getResourceDAO().setError(url, event);
    }

    /**
     * Handles a newly found URL: creates and announces its site when the site
     * is not yet stored, schedules or blocks the decision-on-spidering task
     * depending on the site's robots.txt state, registers the URL when it is
     * new, and records the reference from the referring URL.
     *
     * NOTE: this method calls <code>notifyAll()</code> in some branches, which
     * requires the calling thread to hold this object's monitor. Within this
     * class it is reached from the synchronized {@link #start()} and,
     * presumably via <code>event.accept</code>, from the synchronized
     * {@link #registerEvent(URL, CoreEvent)}.
     *
     * @param url   the referring URL, or <code>null</code> when there is none
     *              (as for the base URL passed by {@link #start()})
     * @param event the event carrying the found URL
     */
    public void visit(URL url, URLFoundEvent event) {
        URL foundURL = event.getFoundURL();
        URL siteURL = URLUtil.getSiteURL(foundURL);
        Site site = storage.getSiteDAO().find(siteURL);

        // Determined before the URL is registered further below.
        boolean newResource = (storage.getResourceDAO().getResource(foundURL) == null);

        if (site == null) {
            // First URL seen for this site: create, register and announce it.
            site = storage.getSiteDAO().createSite(siteURL);
            context.registerNewSite(site);
            storage.getSiteDAO().save(site);

            eventDispatcher.dispatch(new SiteDiscoveredEvent(site));

            if (site.getFetchRobotsTXT()) {
                if (site.mustHandle()) {
                    // Fetch robots.txt first; the decision task for this URL
                    // is handed to scheduler.block for the site.
                    URL robotsTXTUrl = URLUtil.getRobotsTXTURL(siteURL);
                    scheduler.schedule(new FetchRobotsTXTTaskImpl(context, robotsTXTUrl, site));
                    if (newResource) {
                        scheduler.block(siteURL, new DecideOnSpideringTask(context, new URLFoundEvent(context, url, foundURL)));
                    }
                }

            } else {
                if (site.mustHandle()) {
                    // robots.txt is not fetched for this site: flag it as
                    // skipped and schedule the decision task right away.
                    ((SiteInternal) site).registerRobotsTXTSkipped();
                    context.registerRobotsTXTSkipped(site);
                    eventDispatcher.dispatch(new RobotsTXTSkippedEvent(site));
                    if (newResource) {
                        scheduler.schedule(new DecideOnSpideringTask(context, event));
                    }
                }
                notifyAll();
            }
        } else if (site.isRobotsTXTHandled()) {
            if (newResource) {
                scheduler.schedule(new DecideOnSpideringTask(context, event));
            }
            notifyAll();
        } else {
            // Known site whose robots.txt is not handled yet: block the
            // decision task for the site.
            if (site.mustHandle()) {
                if (newResource) {
                    scheduler.block(siteURL, new DecideOnSpideringTask(context, new URLFoundEvent(context, url, foundURL)));
                }
            }
        }

        if (newResource) {
            storage.getResourceDAO().registerURL(foundURL);
            if (!site.mustHandle()) {
                storage.getResourceDAO().setIgnoredForFetching(foundURL, event);
            }
            eventDispatcher.dispatch(new ResourceDiscoveredEvent(storage.getResourceDAO().getResource(foundURL)));
        }
        storage.getResourceDAO().registerURLReference(foundURL, url);
        if (url != null) {
            eventDispatcher.dispatch(new ResourceReferenceDiscoveredEvent(storage.getResourceDAO().getResource(url), storage.getResourceDAO().getResource(foundURL)));
        }

    }

    /**
     * Handles a successfully fetched robots.txt: schedules the tasks returned
     * by unblocking the site, stores the robots.txt resource as spidered and
     * ignored for parsing, dispatches the discovery/fetch events and registers
     * the robots.txt content with the site and the context.
     */
    public void visit(URL url, RobotsTXTSpideredOkEvent event) {
        URL robotsTxtURL = event.getRobotsTXTURL();
        URL siteURL = URLUtil.getSiteURL(robotsTxtURL);
        SiteInternal site = (SiteInternal) storage.getSiteDAO().find(siteURL);

        DecideOnSpideringTask[] tasks = scheduler.unblock(siteURL);
        for (int i = 0; i < tasks.length; i++) {
            scheduler.schedule(tasks[i]);
        }

        storage.getResourceDAO().registerURL(robotsTxtURL);
        storage.getResourceDAO().setSpidered(robotsTxtURL, event);
        storage.getResourceDAO().setIgnoredForParsing(robotsTxtURL);
        Resource resource = storage.getResourceDAO().getResource(robotsTxtURL);
        byte[] bytes = event.getBytes();
        site.registerRobotsTXT();
        eventDispatcher.dispatch(new ResourceDiscoveredEvent(resource));
        eventDispatcher.dispatch(new ResourceFetchedEvent(resource));
        // NOTE(review): new String(bytes) decodes with the platform default
        // charset -- confirm this is intended for robots.txt content.
        eventDispatcher.dispatch(new RobotsTXTFetchedEvent(site, new String(bytes)));
        context.registerRobotsTXT(site, new ByteArrayInputStream(bytes));
        storage.getSiteDAO().save(site);
    }

    /**
     * Handles a failed robots.txt fetch: flags the error on the site,
     * schedules the tasks returned by unblocking the site, records the error
     * for the robots.txt URL and dispatches a {@link RobotsTXTFetchErrorEvent}.
     */
    public void visit(URL url, RobotsTXTSpideredErrorEvent event) {
        URL robotsTxtURL = event.getRobotsTXTURL();
        URL siteURL = URLUtil.getSiteURL(robotsTxtURL);
        Site site = storage.getSiteDAO().find(siteURL);
        ((SiteInternal) site).registerRobotsTXTError();

        DecideOnSpideringTask[] tasks = scheduler.unblock(siteURL);
        for (int i = 0; i < tasks.length; i++) {
            scheduler.schedule(tasks[i]);
        }

        storage.getResourceDAO().registerURL(robotsTxtURL);
        storage.getResourceDAO().setError(robotsTxtURL, event);
        eventDispatcher.dispatch(new RobotsTXTFetchErrorEvent(site, event.getException()));
        context.registerRobotsTXTError(site);
        storage.getSiteDAO().save(site);
    }

    /**
     * Handles a site without robots.txt: flags this on the site, schedules the
     * tasks returned by unblocking the site, saves the site and dispatches a
     * {@link RobotsTXTMissingEvent}.
     */
    public void visit(URL url, RobotsTXTUnexistingEvent event) {
        URL robotsTxtURL = event.getRobotsTXTURL();
        URL siteURL = URLUtil.getSiteURL(robotsTxtURL);
        Site site = storage.getSiteDAO().find(siteURL);
        ((SiteInternal) site).registerNoRobotsTXTFound();

        DecideOnSpideringTask[] tasks = scheduler.unblock(siteURL);
        for (int i = 0; i < tasks.length; i++) {
            scheduler.schedule(tasks[i]);
        }
        storage.getSiteDAO().save(site);
        eventDispatcher.dispatch(new RobotsTXTMissingEvent(site));
    }


}