KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > javacoding > jspider > core > impl > AgentImpl


1 package net.javacoding.jspider.core.impl;
2
3
4 import net.javacoding.jspider.api.event.resource.*;
5 import net.javacoding.jspider.api.event.site.*;
6 import net.javacoding.jspider.api.model.*;
7 import net.javacoding.jspider.core.Agent;
8 import net.javacoding.jspider.core.SpiderContext;
9 import net.javacoding.jspider.core.dispatch.EventDispatcher;
10 import net.javacoding.jspider.core.event.CoreEvent;
11 import net.javacoding.jspider.core.event.CoreEventVisitor;
12 import net.javacoding.jspider.core.event.impl.*;
13 import net.javacoding.jspider.core.exception.SpideringDoneException;
14 import net.javacoding.jspider.core.exception.TaskAssignmentException;
15 import net.javacoding.jspider.core.logging.Log;
16 import net.javacoding.jspider.core.logging.LogFactory;
17 import net.javacoding.jspider.core.model.SiteInternal;
18 import net.javacoding.jspider.core.storage.Storage;
19 import net.javacoding.jspider.core.task.*;
20 import net.javacoding.jspider.core.task.work.*;
21 import net.javacoding.jspider.core.util.URLUtil;
22
23 import java.io.ByteArrayInputStream JavaDoc;
24 import java.net.URL JavaDoc;
25
26
27 /**
28  *
29  * $Id: AgentImpl.java,v 1.32 2003/04/29 17:53:47 vanrogu Exp $
30  *
31  * @author Günther Van Roey
32  */

33 public class AgentImpl implements Agent, CoreEventVisitor {
34
35     protected Storage storage;
36     protected SpiderContext context;
37     protected EventDispatcher eventDispatcher;
38     protected Scheduler scheduler;
39     protected Log log;
40
41
42     public AgentImpl(SpiderContext context) {
43         this.context = context;
44         this.storage = context.getStorage();
45         this.eventDispatcher = context.getEventDispatcher();
46         this.scheduler = new SchedulerFactory().createScheduler(context);
47
48         log = LogFactory.getLog(Agent.class);
49
50     }
51
52     public synchronized void start() {
53         URL JavaDoc baseURL = context.getBaseURL();
54         visit(null, new URLFoundEvent(context, null, baseURL));
55         notifyAll();
56     }
57
58     public synchronized void flagDone(WorkerTask task) {
59         scheduler.flagDone(task);
60         notifyAll();
61     }
62
63     public synchronized WorkerTask getThinkerTask() throws TaskAssignmentException {
64         while (true) {
65             try {
66                 return scheduler.getThinkerTask();
67             } catch (SpideringDoneException e) {
68                 throw e;
69             } catch (TaskAssignmentException e) {
70                 try {
71                     wait();
72                 } catch (InterruptedException JavaDoc e1) {
73                     Thread.currentThread().interrupt();
74                 }
75             }
76         }
77     }
78
79     public synchronized WorkerTask getSpiderTask() throws TaskAssignmentException {
80         while (true) {
81             try {
82                 return scheduler.getFethTask();
83             } catch (SpideringDoneException e) {
84                 throw e;
85             } catch (TaskAssignmentException e) {
86                 try {
87                     wait();
88                 } catch (InterruptedException JavaDoc e1) {
89                     Thread.currentThread().interrupt();
90                 }
91             }
92         }
93     }
94
95     /**
96      * @param foundURL
97      */

98     public synchronized void scheduleForSpidering(URL JavaDoc foundURL) {
99         URL JavaDoc siteURL = URLUtil.getSiteURL(foundURL);
100         Site site = storage.getSiteDAO().find(siteURL);
101         scheduler.schedule(new SpiderHttpURLTask(context, foundURL, site));
102         notifyAll();
103     }
104
105     public synchronized void scheduleForParsing(URL JavaDoc url) {
106         scheduler.schedule(new InterpreteHTMLTask(context, (FetchedResource) storage.getResourceDAO().getResource(url)));
107         notifyAll();
108     }
109
110     public synchronized void registerEvent(URL JavaDoc url, CoreEvent event) {
111         event.accept(url, this);
112         notifyAll();
113     }
114
115
116     public void visit(URL JavaDoc url, CoreEvent event) {
117         log.error("ERROR -- UNHANDLED COREEVENT IN AGENT !!!");
118     }
119
120     public void visit(URL JavaDoc url, URLSpideredOkEvent event) {
121         storage.getResourceDAO().setSpidered(url, event);
122         eventDispatcher.dispatch(new ResourceFetchedEvent(storage.getResourceDAO().getResource(url)));
123         scheduler.schedule(new DecideOnParsingTask(context, url));
124     }
125
126     public void visit(URL JavaDoc url, URLSpideredErrorEvent event) {
127         storage.getResourceDAO().setError(url, event);
128         eventDispatcher.dispatch(new ResourceFetchErrorEvent(storage.getResourceDAO().getResource(url), event.getHttpStatus()));
129     }
130
131     public void visit(URL JavaDoc url, ResourceParsedOkEvent event) {
132         storage.getResourceDAO().setParsed(url, event);
133         eventDispatcher.dispatch(new ResourceParsedEvent(storage.getResourceDAO().getResource(url)));
134     }
135
136     public void visit(URL JavaDoc url, ResourceParsedErrorEvent event) {
137         storage.getResourceDAO().setError(url, event);
138     }
139
140     public void visit(URL JavaDoc url, URLFoundEvent event) {
141         URL JavaDoc foundURL = event.getFoundURL();
142         URL JavaDoc siteURL = URLUtil.getSiteURL(foundURL);
143         Site site = storage.getSiteDAO().find(siteURL);
144
145         boolean newResource = (storage.getResourceDAO().getResource(foundURL) == null);
146
147         if (site == null) {
148             site = storage.getSiteDAO().createSite(siteURL);
149             context.registerNewSite(site);
150             storage.getSiteDAO().save(site);
151
152             eventDispatcher.dispatch(new SiteDiscoveredEvent(site));
153
154             if (site.getFetchRobotsTXT()) {
155                 if (site.mustHandle()) {
156                     URL JavaDoc robotsTXTUrl = URLUtil.getRobotsTXTURL(siteURL);
157                     scheduler.schedule(new FetchRobotsTXTTaskImpl(context, robotsTXTUrl, site));
158                     if (newResource) {
159                         scheduler.block(siteURL, new DecideOnSpideringTask(context, new URLFoundEvent(context, url, foundURL)));
160                     }
161                 }
162
163             } else {
164                 if (site.mustHandle()) {
165                     ((SiteInternal) site).registerRobotsTXTSkipped();
166                     context.registerRobotsTXTSkipped(site);
167                     eventDispatcher.dispatch(new RobotsTXTSkippedEvent(site));
168                     if (newResource) {
169                         scheduler.schedule(new DecideOnSpideringTask(context, event));
170                     }
171                 }
172                 notifyAll();
173             }
174         } else if (site.isRobotsTXTHandled()) {
175             if (newResource) {
176                 scheduler.schedule(new DecideOnSpideringTask(context, event));
177             }
178             notifyAll();
179         } else {
180             if (site.mustHandle()) {
181                 if (newResource) {
182                     scheduler.block(siteURL, new DecideOnSpideringTask(context, new URLFoundEvent(context, url, foundURL)));
183                 }
184             }
185         }
186
187         if (newResource) {
188             storage.getResourceDAO().registerURL(foundURL);
189             if ( !site.mustHandle()) {
190                 storage.getResourceDAO().setIgnoredForFetching(foundURL, event);
191             }
192             eventDispatcher.dispatch(new ResourceDiscoveredEvent(storage.getResourceDAO().getResource(foundURL)));
193         }
194         storage.getResourceDAO().registerURLReference(foundURL, url);
195         if (url != null) {
196             eventDispatcher.dispatch(new ResourceReferenceDiscoveredEvent(storage.getResourceDAO().getResource(url), storage.getResourceDAO().getResource(foundURL)));
197         }
198
199     }
200
201     public void visit(URL JavaDoc url, RobotsTXTSpideredOkEvent event) {
202         URL JavaDoc robotsTxtURL = event.getRobotsTXTURL();
203         URL JavaDoc siteURL = URLUtil.getSiteURL(robotsTxtURL);
204         SiteInternal site = (SiteInternal) storage.getSiteDAO().find(siteURL);
205
206         DecideOnSpideringTask[] tasks = scheduler.unblock(siteURL);
207         for (int i = 0; i < tasks.length; i++) {
208             scheduler.schedule(tasks[i]);
209         }
210
211         storage.getResourceDAO().registerURL(robotsTxtURL);
212         storage.getResourceDAO().setSpidered(robotsTxtURL, event);
213         storage.getResourceDAO().setIgnoredForParsing(robotsTxtURL);
214         Resource resource = storage.getResourceDAO().getResource(robotsTxtURL);
215         byte[] bytes = event.getBytes();
216         site.registerRobotsTXT();
217         eventDispatcher.dispatch(new ResourceDiscoveredEvent(resource));
218         eventDispatcher.dispatch(new ResourceFetchedEvent(resource));
219         eventDispatcher.dispatch(new RobotsTXTFetchedEvent(site, new String JavaDoc(bytes)));
220         context.registerRobotsTXT(site, new ByteArrayInputStream JavaDoc(bytes));
221         storage.getSiteDAO().save(site);
222     }
223
224     public void visit(URL JavaDoc url, RobotsTXTSpideredErrorEvent event) {
225         URL JavaDoc robotsTxtURL = event.getRobotsTXTURL();
226         URL JavaDoc siteURL = URLUtil.getSiteURL(robotsTxtURL);
227         Site site = storage.getSiteDAO().find(siteURL);
228         ((SiteInternal) site).registerRobotsTXTError();
229
230         DecideOnSpideringTask[] tasks = scheduler.unblock(siteURL);
231         for (int i = 0; i < tasks.length; i++) {
232             scheduler.schedule(tasks[i]);
233         }
234
235         storage.getResourceDAO().registerURL(robotsTxtURL);
236         storage.getResourceDAO().setError(robotsTxtURL, event);
237         eventDispatcher.dispatch(new RobotsTXTFetchErrorEvent(site, event.getException()));
238         context.registerRobotsTXTError(site);
239         storage.getSiteDAO().save(site);
240     }
241
242     public void visit(URL JavaDoc url, RobotsTXTUnexistingEvent event) {
243         URL JavaDoc robotsTxtURL = event.getRobotsTXTURL();
244         URL JavaDoc siteURL = URLUtil.getSiteURL(robotsTxtURL);
245         Site site = storage.getSiteDAO().find(siteURL);
246         ((SiteInternal) site).registerNoRobotsTXTFound();
247
248         DecideOnSpideringTask[] tasks = scheduler.unblock(siteURL);
249         for (int i = 0; i < tasks.length; i++) {
250             scheduler.schedule(tasks[i]);
251         }
252         storage.getSiteDAO().save(site);
253         eventDispatcher.dispatch(new RobotsTXTMissingEvent(site));
254     }
255
256
257 }
258
Popular Tags