// Source: KickJava ("Java API By Example") listing of
// net.matuschek.spider.WebRobot (JoBo web spider).
package net.matuschek.spider;

/**
 * This class implements a web robot that does a search through
 * the web starting from a given start document up to a given
 * search depth.
 *
 * @author Daniel Matuschek / Oliver Schmidt
 * @version $Revision: 1.35 $
 */

11
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.StringTokenizer;
import java.util.Vector;

import net.matuschek.html.FormFiller;
import net.matuschek.html.HtmlDocument;
import net.matuschek.http.DocManagerException;
import net.matuschek.http.DownloadRuleSet;
import net.matuschek.http.ExtendedURL;
import net.matuschek.http.HttpConstants;
import net.matuschek.http.HttpDoc;
import net.matuschek.http.HttpDocManager;
import net.matuschek.http.HttpException;
import net.matuschek.http.HttpHeader;
import net.matuschek.http.HttpTool;
import net.matuschek.http.HttpToolCallback;
import net.matuschek.http.NTLMAuthorization;
import net.matuschek.http.cookie.CookieManager;
import net.matuschek.spider.docfilter.FilterChain;
import net.matuschek.spider.docfilter.FilterException;

import org.apache.log4j.Category;
import org.w3c.dom.Element;
45 public class WebRobot implements Runnable JavaDoc, Cloneable JavaDoc {
46
47     /** the name of the robot */
48     private final static String JavaDoc ROBOT_NAME = "JoBo";
49
50     /** the default agent name */
51     private final static String JavaDoc AGENT_NAME =
52           ROBOT_NAME+"/1.4 (http://www.matuschek.net/jobo.html)";
53
54     /** the robot exception handler*/
55     protected RobotExceptionHandler exceptionHandler =
56           new DefaultRobotExceptionHandler();
57
58     /** default maximal search depth */
59     private final static int DEFAULT_DEPTH = 10;
60
61     /** the URL where the robot walk starts from */
62     protected URL JavaDoc startURL = null;
63
64     /** the host and directory where retrieval started from */
65     protected String JavaDoc startDir = "";
66
67     /** maximal search depth */
68     protected int maxDepth = DEFAULT_DEPTH;
69
70     /** is it allowed to walk to other hosts then the starting host ? */
71     protected boolean walkToOtherHosts = false;
72
73     /** DocManager will store or process retrieved documents */
74     protected HttpDocManager docManager;
75
76     /** HttpTool will be used to retrieve documents from a web server */
77     protected HttpTool httpTool = new HttpTool();
78
79     /** Log4J category for logging */
80     protected Category log;
81
82     /** Referer used to retrieve to first document */
83     protected String JavaDoc startReferer = "-";
84
85     /** test for robots.txt */
86     protected NoRobots robCheck;
87
88     /** current tasks */
89     protected TaskList todo = null;
90
91     /** a list of all URLs we got already */
92     protected TaskList visited = null;
93     
94     /** ignore settings in /robots.txt ? */
95     protected boolean ignoreRobotsTxt = false;
96
97     /** sleep that number of seconds after every retrieved document */
98     protected int sleepTime = 1;
99
100     /** fill out forms */
101     protected FormFiller formFiller = new FormFiller();
102
103     /** this URLs can be visited more then once */
104     protected Vector JavaDoc visitMany = new Vector JavaDoc();
105
106     /** for callback to the user interface **/
107     protected WebRobotCallback webRobotCallback = null;
108
109     /** should we stop robot operation ? **/
110     protected boolean stopIt = false;
111
112     /** to check if it is allowed to travel to a given URL **/
113     protected URLCheck urlCheck = null;
114
115     /** should the robot suspend the current walk() **/
116     protected boolean sleep;
117
118     /** list of allowed URLs (even if walkToOtherHosts is false) **/
119     protected Vector JavaDoc allowedURLs = new Vector JavaDoc();
120
121     /** allow travelling the whole host ? */
122     protected boolean allowWholeHost = true;
123
124     /**
125      * maximum document age in seconds, negative value means
126      * no limit
127      */

128     protected long maxDocumentAge = -1; // no limit
129

130     /**
131      * allow travelling to all subdomains of the start host ?
132      * @see #setAllowWholeDomain(boolean)
133      */

134     protected boolean allowWholeDomain = true;
135
136     /**
137      * do more flexible tests if the new URL is on the same host
138      * @see #basicURLCheck(URL)
139      */

140     protected boolean flexibleHostCheck = false;
141
142     /**
143      * FilterChain to filter the document before storing it
144      */

145     protected FilterChain filters = null;
146
147     /**
148      * don't retrieve pages again that are already stored in the DocManager
149      */

150     protected boolean allowCaching = true;
151     
152     /**
153      * Check for documents with the same content
154      */

155     protected boolean duplicateCheck = false;
156     
157     /**
158      * initializes the robot with the default implementation
159      * of the TaskList interface
160      *
161      * @param expected document count
162      */

163     public WebRobot(int expectedDocumentCount) {
164         log = Category.getInstance(getClass().getName());
165         content2UrlMap = new HashMap JavaDoc(expectedDocumentCount);
166         registerVisitedList(new HashedMemoryTaskList(false,
167                     expectedDocumentCount));
168         registerToDoList(new HashedMemoryTaskList(true,
169                     expectedDocumentCount));
170         this.expectedDocumentCount = expectedDocumentCount;
171         this.setAgentName(AGENT_NAME);
172     }
173
174     /**
175      * initializes the robot with the default implementation of the TaskList
176      * interface
177      */

178     public WebRobot() {
179         this(DEFAULT_EXPECTED_DOCUMENT_COUNT);
180     }
181     
182     /**
183      * Sets the implementation class for the backend task list storage.
184      * WebRobot uses the TaskList interface to store future tasks.
185      *
186      * If you want to use your own TaskList implementation, just call
187      * this method.
188      *
189      * @param todo TaskList to be used for the "to do" list
190      */

191     public void registerToDoList(TaskList todo) {
192         this.todo = todo;
193     }
194
195     /**
196      * Sets the implementation class for the backend task list storage.
197      * WebRobot uses the TaskList interface to store URLs that have
198      * been retrieved before.
199      *
200      * If you want to use your own TaskList implementation, just call
201      * this method.
202      *
203      * @param visited TaskList to be used for the list of visited URLs
204      */

205     public void registerVisitedList(TaskList visited) {
206         this.visited = visited;
207     }
208
209     /**
210      * @return the start URL for this robot
211      */

212     public URL JavaDoc getStartURL() {
213         return startURL;
214     }
215
216     /**
217      * Sets the start URL for this robot
218      * @param startURL the start URL
219      */

220     public void setStartURL(URL JavaDoc startURL) {
221         String JavaDoc path = startURL.getPath();
222         this.startURL = startURL;
223
224         // is it a directory ?
225
if (path.endsWith("/")) {
226             this.startDir = startURL.getHost() + path;
227         } else {
228             int pos = path.lastIndexOf("/");
229             if (pos < 0) {
230                 // this happens for URLs without a path
231
this.startDir = startURL.getHost() + "/";
232             } else {
233                 this.startDir = startURL.getHost() + path.substring(0, pos + 1);
234             }
235         }
236     }
237
238     /**
239      * @return the maximal allowed search depth
240      */

241     public int getMaxDepth() {
242         return maxDepth;
243     }
244
245     /**
246      * sets the maximal search depth
247      * @param maxDepth
248      */

249     public void setMaxDepth(int maxDepth) {
250         this.maxDepth = maxDepth;
251     }
252
253     /**
254      * Get the value of bandwith of the used HttpTool
255      * @return value of bandwith.
256      */

257     public int getBandwidth() {
258         return httpTool.getBandwidth();
259     }
260
261     /**
262      * Set the value of bandwith of the used HttpTool
263      * @param bandwidth Value to assign to bandwith.
264      */

265     public void setBandwidth(int bandwidth) {
266         httpTool.setBandwidth(bandwidth);
267     }
268
269     /**
270      * gets the WalkToOtherHost status
271      * @return true if the Robot is allowed to travel to other
272      * host then the start host, false otherwise
273      */

274     public boolean getWalkToOtherHosts() {
275         return walkToOtherHosts;
276     }
277
278     /**
279      * sets the WalkToOtherHosts status
280      * @param walkToOtherHosts true if the Robot is allowed to travel to other
281      * host then the start host, false otherwise
282      */

283     public void setWalkToOtherHosts(boolean walkToOtherHosts) {
284         this.walkToOtherHosts = walkToOtherHosts;
285     }
286
287     /**
288      * gets the AllowWholeHost value
289      * @return true if the Robot is allowed to travel to the whole
290      * host where it started from, false otherwise. If false, it is only
291      * allowed to travel to URLs below the start URL
292      */

293     public boolean getAllowWholeHost() {
294         return allowWholeHost;
295     }
296
297     /**
298      * sets the AllowWholeHost status
299      * @param allowWholeHost if true, the Robot is allowed to
300      * travel to the whole host where it started from. Otherwise it is only
301      * allowed to travel to URLs below the start URL.
302      */

303     public void setAllowWholeHost(boolean allowWholeHost) {
304         this.allowWholeHost = allowWholeHost;
305     }
306
307     /**
308      * Gets the AllowWholeDomain value.
309      * @return true if the Robot is allowed to travel to the whole
310      * domain of the start host, false otherwise.
311      * @see #setAllowWholeDomain(boolean)
312      */

313     public boolean getAllowWholeDomain() {
314         return allowWholeDomain;
315     }
316
317     /**
318      * Sets the AllowWholeDomain status
319      * @param allowWholeDomain if true, the Robot is allows to travel
320      * to all hosts in the same domain as the starting host. E.g. if you
321      * start at www.apache.org, it is also allowed to travel to
322      * jakarta.apache.org, xml.apache.org ...
323      */

324     public void setAllowWholeDomain(boolean allowWholeDomain) {
325         this.allowWholeDomain = allowWholeDomain;
326     }
327
328     /**
329      * Gets the state of flexible host checking (enabled or disabled).
330      *
331      * To find out if a new URL is on the same host, the robot usually
332      * compares the host part of both. Some web servers have an inconsistent
333      * addressing scheme and use the hostname www.domain.com and domain.com.
334      * With flexible host check enabled, the robot will consider both
335      * hosts as equal.
336      *
337      * @return true, if flexible host checking is enabled
338      */

339     public boolean getFlexibleHostCheck() {
340         return flexibleHostCheck;
341     }
342
343     /**
344      * Defines if the host test should be more flexible.
345      *
346      * To find out if a new URL is on the same host, the robot usually
347      * compares the host part of both. Some web servers have an inconsistent
348      * addressing scheme and use the hostname www.domain.com and domain.com.
349      * With flexible host check enabled, the robot will consider both
350      * hosts as equal.
351      *
352      * @param flexibleHostCheck set this true, to enable flexible host checking
353      * (disabled by default)
354      */

355     public void setFlexibleHostCheck(boolean flexibleHostCheck) {
356         this.flexibleHostCheck = flexibleHostCheck;
357     }
358
359     /**
360      * Gets the AllowCaching value.
361      * @return true if the Robot is allowed to cache documents in the
362      * docManager
363      * @see #setAllowCaching(boolean)
364      */

365     public boolean getAllowCaching() {
366         return allowCaching;
367     }
368
369     /**
370      * Sets the AllowCaching status
371      *
372      * @param allowCaching if true, the Robot is allows to use
373      * cached documents. That means it will first try to get teh document
374      * from the docManager cache and will only retrieve it if it is
375      * not found in the cache. If the cache returns a document, the robot
376      * will NEVER retrieve it again. Therefore, expiration mechanisms have
377      * to be included in the HttpDocManager method retrieveFromCache.
378      * @see net.matuschek.http.HttpDocManager#retrieveFromCache(java.net.URL)
379      */

380     public void setAllowCaching(boolean allowCaching) {
381         this.allowCaching = allowCaching;
382     }
383
384     /**
385      * @return the document manager of this robot
386      * @see HttpDocManager
387      */

388     public HttpDocManager getDocManager() {
389         return docManager;
390     }
391
392     /**
393      * Sets the document manager for this robot <br />
394      * Without a document manager, the robot will travel through the web but
395      * don't do anything with the retrieved documents (simply forget
396      * them).
397      * A document manager can store them, extract information or
398      * whatever you like.
399      * There can be only one document manager, but you are free to combine
400      * functionalities of available document managers in a new object (e.g.
401      * to store the document and extract meta informations).
402      * @param docManager
403      */

404     public void setDocManager(HttpDocManager docManager) {
405         this.docManager = docManager;
406     }
407
408     /**
409      * Sets the CookieManager used by the HttpTool
410      * By default a MemoryCookieManager will be used, but you can
411      * use this method to use your own CookieManager implementation.
412      *
413      * @param cm an object that implements the CookieManager interface
414      */

415     public void setCookieManager(CookieManager cm) {
416         httpTool.setCookieManager(cm);
417     }
418
419     /**
420      * Gets the CookieManager used by the HttpTool
421      *
422      * @return the CookieManager that will be used by the HttpTool
423      */

424     public CookieManager getCookieManager() {
425         return httpTool.getCookieManager();
426     }
427
428     /**
429      * Sets the DownloadRule
430      * @param rule the download rule set to use
431      */

432     public void setDownloadRuleSet(DownloadRuleSet rules) {
433         httpTool.setDownloadRuleSet(rules);
434     }
435
436     /**
437      * Sets the URLCheck for this robot
438      * @param check
439      */

440     public void setURLCheck(URLCheck check) {
441         this.urlCheck = check;
442     }
443
444     /**
445      * sets a proxy to use
446      * @param proxyDescr the Proxy definition in the format host:port
447      */

448     public void setProxy(String JavaDoc proxyDescr) throws HttpException {
449         httpTool.setProxy(proxyDescr);
450     }
451
452     /**
453      * @return the current proxy setting in the format host:port
454      */

455     public String JavaDoc getProxy() {
456         return httpTool.getProxy();
457     }
458
459     /**
460      * @return the Referer setting for the first HTTP reuest
461      */

462     public String JavaDoc getStartReferer() {
463         return startReferer;
464     }
465
466     /**
467      * sets the Referer setting for the first HTTP reuest
468      * @param startReferer an URL (e.g. http://www.matuschek.net)
469      */

470     public void setStartReferer(String JavaDoc startReferer) {
471         this.startReferer = startReferer;
472     }
473
474     /**
475      * should we ignore robots.txt Robot Exclusion protocol ?
476      * @param ignoreRobotsTxt if set to true, the robot will ignore
477      * the settings of the /robots.txt file on the webserver
478      * <b>Know what you are doing if you change this setting</b>
479      */

480     public void setIgnoreRobotsTxt(boolean ignoreRobotsTxt) {
481         robCheck.setIgnore(ignoreRobotsTxt);
482     }
483
484     /**
485      * @return the sleeptime setting
486      */

487     public int getSleepTime() {
488         return sleepTime;
489     }
490
491     /**
492      * set the sleeptime<br />
493      * after every retrieved document the robot will wait this time
494      * before getting the next document. this allows it to limit the
495      * load on the server
496      * @param sleeptime wait time in seconds
497      */

498     public void setSleepTime(int sleepTime) {
499         this.sleepTime = sleepTime;
500     }
501
502     /**
503      * sets the From: HTTP header<br />
504      * this should be a valid email address. it is not needed for the robot,
505      * but you should use it, because the administrator of the web server
506      * can contact you if the robot is doing things that he don't want
507      * @param fromAdress an RFC 822 email adress
508      */

509     public void setFromAddress(String JavaDoc fromAddress) {
510         httpTool.setFromAddress(fromAddress);
511     }
512
513     /**
514      * sets the list of form handlers
515      * @see net.matuschek.html.FormHandler for more
516      * information about form handlers
517      */

518     public void setFormHandlers(Vector JavaDoc handlers) {
519         formFiller.setFormHandlers(handlers);
520         if (handlers != null && handlers.size() > 0) {
521             hasFormHandlers = true;
522         }
523     }
524
525     /**
526      * @return the list of form handlers
527      * @see net.matuschek.html.FormHandler for more information
528      * about form handlers
529      */

530     public Vector JavaDoc getFormHandlers() {
531         return formFiller.getFormHandlers();
532     }
533
534     /**
535      * Gets the name of the "User-Agent" header that the robot will use
536      * @return the user agent name
537      */

538     public String JavaDoc getAgentName() {
539         if (httpTool != null) {
540             return httpTool.getAgentName();
541         } else {
542             return null;
543         }
544     }
545
546     /**
547      * sets the Agent-Name authentication for this robot
548      * @param name a name for this robot
549      * (e.g. "Mozilla 4.0 (compatible; Robot)")
550      */

551     public void setAgentName(String JavaDoc name) {
552         httpTool.setAgentName(name);
553         // robCheck = new NoRobots(ROBOT_NAME, httpTool);
554
robCheck = new NoRobots(name, httpTool);
555     }
556
557     /**
558      * Gets the timeout for getting data in seconds of the used HttpTool
559      * @return the value of sockerTimeout
560      * @see #setTimeout(int)
561      */

562     public int getTimeout() {
563         if (httpTool != null) {
564             return httpTool.getTimeout();
565         } else {
566             return -1;
567         }
568     }
569
570     /**
571      * Sets the timeout for getting data. If HttpTool can't read data from a
572      * remote web server after this number of seconds it will stop the download
573      * of the current file
574      * @param timeout Timeout in seconds
575      */

576     public void setTimeout(int timeout) {
577         httpTool.setTimeout(timeout);
578     }
579
580     /**
581      * Gets the ntlmAuthentication of the robot
582      * @return the ntlmAuthentication
583      */

584     public NTLMAuthorization getNtlmAuthorization() {
585         if (httpTool != null) {
586             return httpTool.getNtlmAuthorization();
587         } else {
588             return null;
589         }
590     }
591
592     /**
593      * sets a ntlmAuthentication for this robot
594      * @param ntlmAuthentication for this robot
595      */

596     public void setNtlmAuthorization(NTLMAuthorization ntlmAuthorization) {
597         httpTool.setNtlmAuthorization(ntlmAuthorization);
598     }
599
600     /**
601      * Gets the setting of the IgnoreRobotsTxt property
602      * @return true if robots.txt will be ignored, false otherwise
603      */

604     public boolean getIgnoreRobotsTxt() {
605         return ignoreRobotsTxt;
606     }
607
608     /**
609      * Gets a vector of URLs that can be visited more then once
610      * @return a vector containing URLs formated as Strings
611      */

612     public Vector JavaDoc getVisitMany() {
613         return visitMany;
614     }
615
616     public void setVisitMany(Vector JavaDoc visitMany) {
617         this.visitMany = visitMany;
618     }
619
620     public void setHttpToolCallback(HttpToolCallback callback) {
621         httpTool.setCallback(callback);
622     }
623
624     public WebRobotCallback getWebRobotCallback() {
625         return webRobotCallback;
626     }
627
628     public void setWebRobotCallback(WebRobotCallback webRobotCallback) {
629         this.webRobotCallback = webRobotCallback;
630     }
631
632     /**
633      * Sets the sleep status for this robot. If a WebRobot is set to sleep
634      * after starting run(), is will wait after retrieving the current document
635      * and wait for setSleep(false)
636      */

637     public void setSleep(boolean sleep) {
638         this.sleep = sleep;
639     }
640
641     /**
642      * Is the robot sleeping ?
643      */

644     public boolean isSleeping() {
645         return this.sleep;
646     }
647
648     /**
649      * Set the list of allowed URLs
650      * @param allowed a Vector containing Strings. URLs will be checked
651      * if they begin of a string in this vector
652      */

653     public void setAllowedURLs(Vector JavaDoc allowed) {
654         this.allowedURLs = allowed;
655     }
656
657     /**
658      * Gets the list of allowed URLs
659      * @return a Vector containing Strings
660      * @see #setAllowedURLs(Vector)
661      */

662     public Vector JavaDoc getAllowedURLs() {
663         return this.allowedURLs;
664     }
665     
666     /**
667      * Enable/disable cookies
668      * @param enable if true, HTTP cookies will be enabled, if false
669      * the robot will not use cookies
670      */

671     public void setEnableCookies(boolean enable) {
672         httpTool.setEnableCookies(enable);
673     }
674
675     /**
676      * Get the status of the cookie engine
677      * @return true, if HTTP cookies are enabled, false otherwise
678      */

679     public boolean getEnableCookies() {
680         return httpTool.getEnableCookies();
681     }
682
683     /**
684      * Set the maximum age of documents to retrieve to this number
685      * of seconds
686      * @param maxAge integer value of the maximum document age
687      * (in seconds), negative value means no limit.
688      */

689     public void setMaxDocumentAge(long maxAge) {
690         this.maxDocumentAge = maxAge;
691     }
692     
693
694
695     /**
696      * Gets the maximum age of documents to retrieve
697      * @return maximum document age (in seconds), negative value means
698      * no limit.
699      */

700     public long getMaxDocumentAge() {
701         return this.maxDocumentAge;
702     }
703
704     /**
705      * Sets a FilterChain. If teh WebRobot use a FilterChain it will
706      * process any retrieved document by this FilterChain before
707      * storing it
708      *
709      * @param filter a FilterChain to use for filtering HttpDocs
710      */

711     public void setFilters(FilterChain filters) {
712         this.filters = filters;
713     }
714
715     /**
716      * Delete all cookies
717      */

718     public void clearCookies() {
719         httpTool.clearCookies();
720     }
721
722     /**
723      * thread run() method, simply calls work()
724      * @see #work()
725      */

726     public void run() {
727         work();
728     }
729
730     /**
731      * do your job travel through the web using the configured
732      * parameters and retrieve documents
733      */

734     public void work() {
735         RobotTask task = createRobotTask(startURL, maxDepth, startReferer);
736         todo.add(task);
737         walkTree();
738         // ok, we did it, clean up dynamic data (the vistited vector)
739
cleanUp();
740         log.info("Documents retrieved by: Web=" + countWeb + " Cache=" + countCache + " Refresh=" + countRefresh+ " NoRefresh=" + countNoRefresh);
741     }
742
743     /**
744      * stop the current robot run
745      * note that this will not abourt the current download but stop after
746      * the current download has finished
747      */

748     public void stopRobot() {
749         stopIt = true;
750     }
751
752     /**
753      * Holds information about memory status.
754      * @see handleMemoryError(OutOfMemoryError)
755      */

756     private int memoryLevel = 0;
757     
758     /** Can new tasks be added? (may depend on memoryLevel) */
759     protected boolean activatedNewTasks = true;
760     
761     /** Are visited URLs collected? (may depend on memoryLevel) */
762     protected boolean activatedUrlHistory = true;
763     
764     /** Are visited contents collected? (may depend on memoryLevel) */
765     protected boolean activatedContentHistory = true;
766     
767     /** memory buffer of 200 KB to be freed in case of urgent memory needs */
768     private byte memoryBuffer[] = new byte[200 * 1024];
769
770     /**
771      * do your job !
772      */

773     
774     public void walkTree() {
775         while ((todo.size() > 0) && (!stopIt)) {
776             RobotTask task;
777             synchronized(visited) {
778                 task = todo.removeFirst();
779                 if (visited.contains(task) && (!visitMany.contains(task.getUrl().toString()))) {
780                     log.debug("already visited: " + task.getUrl());
781                     continue;
782                 }
783                 if (activatedUrlHistory) {
784                     visited.add(task);
785                 }
786             }
787             
788             boolean repeat = true;
789             while (repeat) {
790                 try {
791                     retrieveURL(task);
792                     repeat = false;
793                 } catch (OutOfMemoryError JavaDoc memoryError) {
794                     handleMemoryError(memoryError);
795                 }
796             }
797
798             // sleep, if sleep is set to true
799
while (sleep) {
800                 // callback
801
if (webRobotCallback != null) {
802                     webRobotCallback.webRobotSleeping(true);
803                 }
804
805                 try {
806                     Thread.sleep(1000);
807                 } catch (InterruptedException JavaDoc e) {
808                 };
809             }
810
811             // callback
812
if (webRobotCallback != null) {
813                 webRobotCallback.webRobotSleeping(false);
814             }
815
816             // callback
817
if (webRobotCallback != null) {
818                 webRobotCallback.webRobotUpdateQueueStatus(todo.size());
819             }
820             spawnThread();
821         }
822
823         // callback
824
if (webRobotCallback != null) {
825             finishThreads();
826         }
827     }
828
829     /**
830      * Implements OutOfMemory handling strategies.
831      * Action depends on memoryLevel
832      * @param memoryError
833      * @throws OutOfMemoryError
834      */

835     protected void handleMemoryError(OutOfMemoryError JavaDoc memoryError)
836         throws OutOfMemoryError JavaDoc {
837         memoryLevel++;
838         log.error("OutOfMemoryError level=" + memoryLevel + "! (visited=" + visited.size() + ", todo=" + todo.size() + ")");
839         switch (memoryLevel) {
840             case 1:
841                 // donīt remember visited URLs and contents any more
842
// and try it again
843
visited.clear(); activatedUrlHistory = false;
844                 content2UrlMap.clear(); activatedContentHistory = false;
845                 System.gc();
846                 break;
847             case 2:
848                 // stop adding new Tasks, just process todo-list.
849
// free memory buffer
850
// and try it again
851
activatedNewTasks = false;
852                 memoryBuffer = null;
853                 System.gc();
854                 break;
855             case 3:
856                 // there is nothing we can do any more.
857
// throw exception to stop robot
858
throw memoryError;
859             default :
860                 // Should never be reached.
861
if (memoryBuffer != null) {
862                     // avoid removal of memoryBuffer by compiler
863
System.err.println(memoryBuffer[0]);
864                 }
865                 throw memoryError;
866         }
867     }
868
869     /**
870      * calls webRobotDone and finishes docManager if
871      * executed in mainThread
872      */

873     protected void finishThreads() {
874         webRobotCallback.webRobotDone();
875         if (docManager != null) {
876           docManager.finish();
877         }
878     }
879     
880     /**
881      * Start subThreads for spidering.
882      * WARNING: Should only be implemented and used for local
883      * spidering purposes!
884      */

885     protected synchronized void spawnThread() {
886     }
887     
888     /** counter for calls of retrieveURL */
889     protected int iteration = 0;
890     
    /**
     * Retrieves the next URL, saves the document, extracts all included
     * links and adds those links to the task list.
     * <p>
     * Pipeline: sanity checks - cache lookup - web/file retrieval (with
     * retry handling) - duplicate detection - link and form extraction -
     * filtering and storage. Non-OK HTTP results are routed to the
     * exceptionHandler.
     *
     * @param task task to retrieve, function does nothing if this is null
     */
    public void retrieveURL(RobotTask task) {
        if (task == null) {
            log.debug("Empty task found, ignoring");
            return;
        }

        long now = System.currentTimeMillis();

        updateProgressInfo();

        URL u = task.getUrl();
        String urlString = u.toString();
        String referer = task.getReferer();
        int depth = task.getMaxDepth();

        // a negative depth means we walked past the configured search depth
        if (depth < 0) {
            log.info("Max search depth reached");
            return;
        }

        // we may need this additional check even if we
        // tested it during adding to the tasks list
        if (!isAllowed(u)) {
            log.info("Url '" + u + "' filtered out.");
            return;
        }

        // normalize host-only URLs ("http://host" -> "http://host/") so the
        // same document is not retrieved twice under two spellings
        if (u.getFile().equals("")) {
            try {
                urlString = urlString + "/";
                u = new URL(urlString);
                // fix for double retrieved files
                task.setUrl(u);
            } catch (MalformedURLException e) {
                log.error("URL not well formed: " + e.toString());
                // use exception handler to handle exception
                exceptionHandler.handleException(this, u, e);
                return;
            }
        }

        log.info("retrieving " + urlString);
        httpTool.setReferer(referer);

        HttpDoc doc = null;
        Vector links = null;
        boolean cached = false;

        // look in the cache first, but only for static pages
        // (plain GET without parameters)
        boolean reScan = true;
        if ((docManager != null && allowCaching)
            && (task.getMethod() == HttpConstants.GET)
            && (task.getParamString() == null)) {
            doc = docManager.retrieveFromCache(u);
/* if (doc != null) {
                try {
                    links = ((UrlCollector) docManager).retrieveLinks(doc);
                } catch (IOException e) {
                    log.info("Could not get links for " + u + ": " + e.getMessage());
                    links = null;
                }
            }*/

            if (doc != null) {
                countCache++;
                long lastRetrieved = doc.getDateAsMilliSeconds();
                // NOTE(review): integer division happens before the widening
                // to double, so fractional seconds are truncated - presumably
                // intended, confirm before changing
                double ageInSeconds = (now - lastRetrieved) / 1000;
                if (ageInSeconds < 0) {
                    log.warn("DocumentAge < 0!");
                }
                // rescan when a maximum age is configured and exceeded; then
                // ask the server only for content newer than the cached copy
                reScan = maxDocumentAge >= 0 && ageInSeconds > maxDocumentAge;
                if (reScan) {
                    long lastModified = doc.getLastModifiedAsMilliSeconds();
                    Date lastModifiedDate = new Date(lastModified);
                    httpTool.setIfModifiedSince(lastModifiedDate);
                }
            } else {
                httpTool.setIfModifiedSince(null);
            }
        }

        // if not found in cache (or expired), retrieve from the web page
        if (reScan) {
            HttpDoc newDoc;
            boolean error = false;
            try {
                if (u.getProtocol().equalsIgnoreCase("file")) {
                    // retrieve from the local file system
                    newDoc = retrieveFileURL(u, httpTool.getIfModifiedSince());
                } else {
                    // retrieve from the web; sleep afterwards (politeness delay)
                    newDoc = httpTool.retrieveDocument(u, task.getMethod(), task.getParamString());
                    if (newDoc != null) {
                        newDoc.setDate(now);
                    }
                    sleepNow();
                }

                if (newDoc != null && !newDoc.isNotModified()) {
                    if (!(newDoc.isOk() || newDoc.isRedirect())) {
                        // any other status counts as an error -> retry handling below
                        error = true;
                    }
                } else {
                    // (newDoc == null || newDoc.isNotModified()) && doc != null
                    // -> Not modified
                    // -> refresh the cached document's time stamp
                    if (doc != null) {
                        doc.setDate(now);
                        doc.setCached(false);
                        newDoc = null;
                    }
                }
            } catch (HttpException hex) {
                error = true; newDoc = null;
            }
            if (error) {
                int retry = task.retry();
                if (retry <= maxRetries) {
                    // re-queue the task for another attempt
                    synchronized(visited) {
                        todo.add(task);
                        visited.remove(task);
                    }
                    log.info("Adding " + u + " for retry no. " + retry);
                    return;
                } else {
                    // retries exhausted: fall back to the cached copy if it is
                    // still fresh enough, otherwise drop the document
                    doc = docManager.retrieveFromCache(u);
                    if (doc == null) {
                        log.warn("Unsuccessfull retries for " + u);
                        return;
                    } else {
                        long docDate = doc.getDateAsMilliSeconds();
                        long age = (now - docDate);
                        age /= 1000;
                        if (expirationAge < 0 || age < expirationAge) {
                            newDoc = doc;
                            cached = true;
                            log.info("Cached document not expired: " + u);
                        } else {
                            log.warn("Cached document expired: " + u);
                            docManager.removeDocument(u);
                            return;
                        }
                    }
                }
            }

            if (newDoc != null) {
                // fresh content (from web, file or the fallback above)
                countWeb++;
                doc = newDoc;
                links = null; // force recalculation of links
                countRefresh++;
            } else {
                // not modified -> keep the cached document
                cached = true;
                countNoRefresh++;
            }
        } else {
            cached = true;
            log.debug("Page " + u + " retrieved from cache");
        }

        // Add it to the visited vector
        // needs to be synchronized with todo-list
        // visited.add(task);

        // got a NULL document, that doc was not retrieved
        // usually, it was not downloaded because a rule didn't allow
        // to download it
        if (doc == null) {
            log.info("not downloaded " + u);
            return;
        }

        // Duplicate check: has identical content been seen under another URL?
        String duplicate = null;
        if (duplicateCheck) {
            duplicate = getContentVisitedURL(doc);
            if (duplicate != null) {
                log.info("URLs with same content found: " + urlString + " = " + duplicate);
            } else {
                try {
                    duplicate = docManager.findDuplicate(doc);
                    if (duplicate != null) {
                        log.info("URLs with same content found in cache: " + urlString + " = " + duplicate);
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }

            if (duplicate != null) {
                // compare URLs without parameters to decide whether this is
                // really a different address for the same content
                String pureDuplicate = removeParameters(duplicate);
                String pureUrl = removeParameters(urlString);
                if (!pureUrl.equals(pureDuplicate) && !cached) {
                    // different url not yet stored -> store it
                    try {
                        // retrieve links from the original document
                        HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
                        if (linksDoc != null) {
                            doc.setLinks(linksDoc.getLinks());
                        }
                        docManager.storeDocument(doc);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
                // spider the original URL instead of this duplicate
                RobotTask newTask;
                try {
                    newTask = createRobotTask(new URL(duplicate), depth, referer);
                    // check already here for visited tasks to save memory
                    if (!visited.contains(newTask)) {
                        addTask(newTask);
                    }
                } catch (MalformedURLException e) {
                    e.printStackTrace(); // can't happen
                }
                return;
            }
        }

        // was it an unauthorized document ?
        if (doc.isUnauthorized()) {
            log.info("got HTTP Unauthorized for URL " + u);
        }

        if (doc.isOk() || cached) {
            // notify the callback about the retrieved document
            if (webRobotCallback != null) {
                int contentLength = 0;
                if (doc.getContent() != null) { contentLength = doc.getContent().length; }
                webRobotCallback.webRobotRetrievedDoc(urlString, contentLength);
            }

            // extract links
            try {
                if (doc.isHTML() && (depth > 0)) {
                    // solving encoding problem: parse with the charset from the
                    // Content-type header if one is given
                    // HtmlDocument htmlDoc = new HtmlDocument(u, doc.getContent());
                    HtmlDocument htmlDoc = null;
                    HttpHeader contentTypeHeader = doc.getHeader("Content-type");
                    if (contentTypeHeader != null) {
                        String contentType = contentTypeHeader.getValue();
                        int index = contentType.toLowerCase().indexOf("charset=");
                        if (index > 0) {
                            // 8 = length of "charset="
                            htmlDoc = new HtmlDocument(u, doc.getContent(), contentType.substring(index+8));
                        } else {
                            htmlDoc = new HtmlDocument(u, doc.getContent());
                        }
                    } else {
                        htmlDoc = new HtmlDocument(u, doc.getContent());
                    }

                    // add links

                    // this depth-check is critical!
                    // otherwise far too many RobotTasks will be created
                    // this will cause a premature OutOfMemoryException!
                    if (depth > 0) {
                        if (duplicate != null) {
                            // NOTE(review): linksDoc is used here without a null
                            // check, unlike the duplicate-handling code above -
                            // confirm a cached original always exists at this point
                            HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
                            doc.setLinks(linksDoc.getLinks());
                        } else if (cached) {
                            // intentionally empty: links of cached documents are
                            // recalculated below when links == null
                        }
                        if (links == null) {
                            links = htmlDoc.getLinks();
                            doc.setLinks(links);
                        }
                        if (duplicate == null) {
                            HashSet checkedLinks = new HashSet();
                            for (int i = 0; i < links.size(); i++) {
                                URL link = (URL) links.elementAt(i);
                                log.info("Link: "+link);
                                // check already here for duplicate links to avoid
                                // expensive creation of RobotTasks
                                if (!checkedLinks.contains(link)) {
                                    checkedLinks.add(link);
                                    String myReferer = u.toString();
                                    if (u.getUserInfo() != null) {
                                        // remove the user:password@ part from the referer
                                        int endindex = myReferer.indexOf("@")+1;
                                        myReferer = "http://"+ myReferer.substring(endindex);
                                    }

                                    RobotTask newTask = createRobotTask((URL) links.elementAt(i), depth - 1, myReferer);
                                    // check already here for visited tasks to save memory
                                    if (!visited.contains(newTask)) {
                                        // bad workaround to retrieve images first
                                        if (newTask.urlString.endsWith(".jpg")) {
                                            addTaskAtStart(newTask);
                                        } else {
                                            addTask(newTask);
                                        }
                                    }
                                }
                            }
                        }
                    }

                    // turn each form into a GET/POST task via the formFiller
                    if (hasFormHandlers) {
                        // add forms
                        Vector forms = htmlDoc.getElements("form");
                        for (int i = 0; i < forms.size(); i++) {
                            ExtendedURL eurl = formFiller.fillForm(u, (Element) forms.elementAt(i));
                            if (eurl != null) {
                                RobotTask newTask = createRobotTask(eurl.getURL(), depth - 1, u.toString());
                                newTask.setParamString(eurl.getParams());
                                newTask.setMethod(eurl.getRequestMethod());
                                addTask(newTask);
                            }
                        }
                    }

                }
            // catch any occurring error to keep on processing
            } catch (OutOfMemoryError e) {
                throw e;
            } catch (Throwable e) {
                log.error("Unexpected error while extraction links from url '" + u + "':"+e);
                e.printStackTrace();
                // continue processing
            }

            // filter and store the document
            if ((docManager != null)) {
                try {
                    if (filters != null) {
                        doc = filters.process(doc);
                    } else {
                        log.debug("No filters defined");
                    }

                    if (isProcessingAllowed(doc)) {
                        docManager.processDocument(doc);
                    } else {
                        // keep the MD5 header for duplicate detection but
                        // replace the content so it is not indexed
                        String md5 = doc.getHeaderValue(HttpHeader.CONTENT_MD5);
                        doc.setContent("Not for indexing".getBytes());
                        doc.setHeaderValue(HttpHeader.CONTENT_MD5, md5);
                    }

                    try {
                        docManager.storeDocument(doc);
                    } catch (Exception e) {
                        log.warn("could not store (not for indexing) " + urlString + ": " + e.getMessage());
                    }
                    if (activatedContentHistory && duplicate==null) {
                        setContentVisitedURL(doc, urlString);
                    }
                } catch (DocManagerException e1) {
                    log.error("could not process document: " + e1.getMessage());
                    exceptionHandler.handleException(this, u, e1);
                } catch (FilterException e2) {
                    log.error(e2.getMessage());
                }
            }

        } else {
            // it was NOT a 200 return code !

            if (doc.isRedirect()) {
                String ref = doc.getLocation();
                log.info("Got redirect to " + ref);

                try {
                    URL u2 = new URL(u, ref);
                    // is it on another host ?

                    // On a redirect, browsers use the old Referer instead of the
                    // URL that got this redirect.
                    // Therefore we do not use u.toString as Referer but the old Referer
                    RobotTask newTask = createRobotTask(u2, depth - 1, referer);

                    // it will be inserted at the beginning of the vector !
                    addTaskAtStart(newTask);
                } catch (MalformedURLException e) {
                    // ignore a malformed redirect target
                }
                // handle other values
            } else if (doc.isNotFound()) {
                // the document was not found
                exceptionHandler.handleException(this, u, new HttpException("Document not found"));
            } else if (doc.isUnauthorized()) {
                // no authorization for the document
                exceptionHandler.handleException(
                    this,
                    u,
                    new HttpException("No authorization for the document."));
            } else {
                // an other error occurred
                exceptionHandler.handleException(this, u, new HttpException("Unknown document error (Http return code "+doc.getHttpCode()+")."));
            }
        }
    }
1287
    /**
     * Informs about spidering progress.
     * This base implementation is empty; subclasses may override it and
     * use iteration, startTime, countCache, countWeb, countRefresh and
     * countNoRefresh to report progress.
     */
    public void updateProgressInfo() {
    }
1295
1296    /**
1297     * sleep for sleepTime seconds.
1298     */

1299    public void sleepNow() {
1300        if (sleepTime > 0) {
1301            synchronized(this) {
1302                if (webRobotCallback != null) {
1303                    webRobotCallback.webRobotSleeping(true);
1304                }
1305                
1306                try {
1307                    Thread.sleep(sleepTime * 1000);
1308                } catch (InterruptedException JavaDoc e) {
1309                }
1310            
1311                if (webRobotCallback != null) {
1312                    webRobotCallback.webRobotSleeping(false);
1313                }
1314            }
1315        }
1316    }
1317
1318    /**
1319     * retrieves a file from the local file system.
1320     * @param url the url of the file to retrieve
1321     * @return HttpDoc containing the content and mime type
1322     */

1323    private HttpDoc retrieveFileURL(URL JavaDoc url, Date JavaDoc ifModifiedSince) throws HttpException {
1324        HttpDoc doc = new HttpDoc();
1325
1326        try {
1327            String JavaDoc host = url.getHost();
1328            String JavaDoc filename = url.getFile();
1329            if ((host == null) || (host.equals(""))) {
1330                // local file
1331
// remove leading / or \
1332
if ((filename.startsWith("\\")) || (filename.startsWith("/"))) {
1333                    filename = filename.substring(1);
1334                }
1335            } else {
1336                filename = "//" + host + filename;
1337            }
1338            // get the mimetype and put in the http header
1339
String JavaDoc mimetypestr = getMimeTypeForFilename(filename);
1340            if (mimetypestr != null) {
1341                HttpHeader header = new HttpHeader("content-type", mimetypestr);
1342                doc.addHeader(header);
1343            }
1344            
1345            // get the content from the file
1346
File JavaDoc file = new File JavaDoc(filename);
1347            if (!file.exists()) {
1348                doc.setHttpCode("httpcode " + HttpConstants.HTTP_NOTFOUND);
1349                return doc;
1350            }
1351            long fileLastModified = file.lastModified();
1352            long ifModifiedSinceTime = ifModifiedSince == null ? 0 : ifModifiedSince.getTime();
1353            if (fileLastModified > ifModifiedSinceTime) {
1354                byte[] content = readFileToByteArray(file);
1355                doc.setContent(content);
1356                doc.setHttpCode("httpcode " + HttpConstants.HTTP_OK);
1357            } else {
1358                doc.setHttpCode("httpcode " + HttpConstants.HTTP_NOTMODIFIED);
1359            }
1360            doc.setLastModified(fileLastModified);
1361            doc.setDate(System.currentTimeMillis());
1362            doc.setURL(url);
1363            
1364            return doc;
1365        } catch (Exception JavaDoc e) {
1366            throw new HttpException(e.getMessage());
1367        }
1368    }
1369
1370    /**
1371     * Get the Mime type for the given filename.
1372     * @param filename
1373     * @return Mime type
1374     */

1375    protected String JavaDoc getMimeTypeForFilename(String JavaDoc filename) {
1376        if (filename.endsWith(".html") || filename.endsWith(".htm")) {
1377            return "text/html";
1378        } else {
1379            return null;
1380        }
1381    }
1382    
    /**
     * Cleans up temporary data so the robot can be started again:
     * resets the stop flag and empties the visited and todo lists.
     */
    protected void cleanUp() {
        stopIt = false;
        visited.clear();
        todo.clear();
    }
1391
1392    /**
1393     * adds a new task to the task vector but does some checks to
1394     */

1395    protected void addTask(RobotTask task) {
1396        if (taskAddAllowed(task) && activatedNewTasks) {
1397            todo.add(task);
1398        }
1399    }
1400
1401    /**
1402     * adds a new tasks at the beginning of the tasks list
1403     * @see #addTask(RobotTask)
1404     */

1405    protected void addTaskAtStart(RobotTask task) {
1406        if (taskAddAllowed(task) && activatedNewTasks) {
1407            todo.addAtStart(task);
1408        }
1409    }
1410
1411    /**
1412     * Checks if a tasks should be added to the task list
1413     * @param robotTask
1414     * @return true if this tasks can be added to the task list,
1415     * false otherwise
1416     */

1417    protected boolean taskAddAllowed(RobotTask task) {
1418        if (task == null) {
1419            log.info("Null task not allowed");
1420            return false;
1421        }
1422
1423        if (!isAllowed(task.getUrl())) {
1424            return false;
1425        }
1426
1427        if (todo.contains(task)) {
1428            return false;
1429        }
1430
1431        return true;
1432    }
1433
1434    /**
1435     * Is it allowed to travel to this new URL ?
1436     * @param u the URL to test
1437     * @return true if traveling to this URL is allowed, false otherwise
1438     */

1439    protected boolean isAllowed(URL JavaDoc u) {
1440
1441        // do the basic checks
1442
if (basicURLCheck(u)) {
1443
1444            // if we have an URLCheck then test this URL against it
1445
if ((urlCheck != null) && (!urlCheck.checkURL(u))) {
1446                log.debug("not allowed by URLCheck:" + u);
1447                return false;
1448            }
1449
1450            if (robCheck.ok(u)) {
1451                return true;
1452            } else {
1453                log.debug("not allowed by robots.txt:" + u);
1454                return false;
1455            }
1456        }
1457        return false;
1458    }
1459    
1460    /**
1461     * Is it allowed to process this document ?
1462     * @param document
1463     * @return true if processing of this URL is allowed
1464     */

1465    protected boolean isProcessingAllowed(HttpDoc doc) {
1466        URL JavaDoc u = doc.getURL();
1467        if ((urlCheck != null) && (!urlCheck.checkURLForProcessing(u))) {
1468            log.debug("processing not allowed by URLCheck:" + u);
1469            return false;
1470        }
1471        
1472        DownloadRuleSet downloadRuleSet = httpTool.getDownloadRuleSet();
1473        if (downloadRuleSet != null && !downloadRuleSet.processAllowed(doc.getHttpHeaders())) {
1474            log.debug("processing not allowed by DownloadRuleSet:" + u);
1475            return false;
1476        }
1477
1478        return true;
1479    }
1480
1481    /**
1482     * Basic URL allow check
1483     * it is allowed to walk to a new URL if <ul>
1484     * <li>WalkToOtherHost is true. In this case there will be no additional
1485     * tests.</li>
1486     * <li>The new URL is located below the start URL, e.g. is the start URL
1487     * is http://localhost/test, the URL http://localhost/test/index.html
1488     * is allowed, but http://localhost/ is not allowed.</li>
1489     * <li>AllowWholeHost is true and the new URL is located on the same host
1490     * as the start URL.</li>
1491     * <li>FlexibleHostCheck is true and the host part of the current URL
1492     * is equal to the host part of the start URL modulo the prefix "www."
1493     * </li>
1494     * <li>The URL starts with a string in the "AllowedURLs" list.</li>
1495     * </ul>
1496     */

1497    protected boolean basicURLCheck(URL JavaDoc currURL) {
1498        String JavaDoc currURLStr = currURL.getHost() + currURL.getPath();
1499        String JavaDoc currHost = currURL.getHost().toLowerCase();
1500        String JavaDoc startHost = startURL.getHost().toLowerCase();
1501
1502        // no more checks, if walkToOtherHosts is true
1503
if (walkToOtherHosts) {
1504            return true;
1505        }
1506
1507        // new URL below start URL ?
1508
if (currURLStr.startsWith(startDir)) {
1509            return true;
1510        }
1511
1512        // on the same host ?
1513
if (allowWholeHost && (currURL.getHost().equalsIgnoreCase(startURL.getHost()))) {
1514            return true;
1515        }
1516
1517        // on the same host with flexible test (host name with and without "www."
1518
if (flexibleHostCheck) {
1519            if (cutWWW(currHost).equalsIgnoreCase(cutWWW(startHost))) {
1520                return true;
1521            }
1522        }
1523
1524        // allow whole domain ?
1525
if (allowWholeDomain) {
1526            if (currHost.endsWith(getDomain(startHost))) {
1527                return true;
1528            }
1529        }
1530
1531        // in the list of allowed URLs ?
1532
for (int i = 0; i < allowedURLs.size(); i++) {
1533            String JavaDoc s = (String JavaDoc) allowedURLs.elementAt(i);
1534            if (currURLStr.startsWith(s)) {
1535                return true;
1536            }
1537        }
1538        log.debug("URL " + currURLStr + " not allowed");
1539        return false;
1540    }
1541
1542    /**
1543     * remove a leading www. from a given hostname
1544     *
1545     * @param hostname some hostname
1546     * @return the hostname if it doesn't start with "www." otherwise
1547     * the hostname without the leading www.
1548     */

1549    private String JavaDoc cutWWW(String JavaDoc hostname) {
1550        if (hostname.toLowerCase().startsWith("www.")) {
1551            return hostname.substring(4);
1552        } else {
1553            return hostname;
1554        }
1555    }
1556
1557    /**
1558     * Gets the domain name of a given host (just delete everything
1559     * to the last "."
1560     *
1561     * @param hostname some hostname
1562     * @return the domain part of this hostname
1563     */

1564    private String JavaDoc getDomain(String JavaDoc hostname) {
1565        int pos = hostname.indexOf(".");
1566        if (pos < 0) {
1567            // this should not happen !
1568
return hostname;
1569        } else {
1570            return hostname.substring(pos + 1);
1571        }
1572    }
1573
    /**
     * Returns the exception handler of the robot.
     * @return RobotExceptionHandler the current exception handler
     */
    public RobotExceptionHandler getExceptionHandler() {
        return exceptionHandler;
    }
1581
1582    /**
1583     * Method setExceptionHandler.
1584     * sets the exceptionhandler of the robot
1585     * @param newExceptionHandler the new exception handler
1586     */

1587    public void setExceptionHandler(RobotExceptionHandler newExceptionHandler) {
1588        if (newExceptionHandler != null) {
1589            exceptionHandler = newExceptionHandler;
1590        }
1591    }
1592
    /**
     * Sets the start URL from a string.
     * A malformed URL is reported on stderr and otherwise ignored
     * (the previous start URL stays in effect).
     * @param startURL the start URL as String
     */
    public void setStart(String startURL) {
        try {
            setStartURL(new URL(startURL));
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }
1605
1606    /**
1607     * Method getStart.
1608     * gets the start url as string
1609     * @return String
1610     */

1611    public String JavaDoc getStart() {
1612        URL JavaDoc url = getStartURL();
1613        if (url != null) {
1614            return url.toExternalForm();
1615        } else {
1616            return null;
1617        }
1618    }
1619
    /**
     * This method finishes HttpTool, NoRobots and HttpDocManager,
     * releasing their resources (each only if present).
     */
    public void finish() {
        if (httpTool != null) {
            httpTool.finish();
        }
        if (robCheck != null) {
            robCheck.finish();
        }
        if (docManager != null) {
            docManager.finish();
        }
    }
1634
1635    public static void main(String JavaDoc[] args) {
1636        if (args.length > 0) System.err.println("Arguments will be ignored!");
1637        Field JavaDoc[] fields = WebRobot.class.getDeclaredFields();
1638        StringBuffer JavaDoc str = new StringBuffer JavaDoc(60);
1639        for (int i = 0; i < fields.length; i++) {
1640            if (!Modifier.isFinal(fields[i].getModifiers())
1641                && !Modifier.isStatic(fields[i].getModifiers())) {
1642                str.delete(0, str.length());
1643                str.append(" robot." + fields[i].getName() + " = " + fields[i].getName() + ";");
1644                while (str.length() < 50) {
1645                    str.append(" ");
1646                }
1647                System.out.println(str.toString()+"// ("+fields[i].getType().getName()+")");
1648            }
1649        }
1650    }
1651
    /** default expected count of documents */
    private static final int DEFAULT_EXPECTED_DOCUMENT_COUNT = 50000;

    /** expected count of documents */
    protected int expectedDocumentCount = DEFAULT_EXPECTED_DOCUMENT_COUNT;

    /** remembers visited content here: maps MD5 checksum -> URL string */
    protected HashMap content2UrlMap;

    /** counter for pages that were found in cache */
    long countCache = 0;

    /** counter for pages retrieved by web */
    long countWeb = 0;

    /** counter for pages that didn't need a refresh */
    long countNoRefresh = 0;

    /** counter for refreshed pages (=cache+web) */
    long countRefresh = 0;
1672    
1673    /**
1674     * Method getContentVisitedURL.
1675     * Checks if the content was visited before and retrieves the corresponding URL.
1676     * @param content
1677     * @return found url or null if not found
1678     */

1679    public String JavaDoc getContentVisitedURL(HttpDoc doc) {
1680        Object JavaDoc key = doc.getContentMD5();
1681        synchronized(content2UrlMap) {
1682            String JavaDoc url = (String JavaDoc) content2UrlMap.get(key);
1683            return url;
1684        }
1685    }
1686    
1687    /**
1688     * Method setContentVisitedURL.
1689     * Makes an URL retrievable by its content by entering it in content2UrlMap.
1690     * @param content
1691     * @param url
1692     */

1693    public void setContentVisitedURL(HttpDoc doc, String JavaDoc url) {
1694        Object JavaDoc key = doc.getContentMD5();
1695        synchronized(content2UrlMap) {
1696            content2UrlMap.put(key, url);
1697        }
1698    }
1699    
1700    private final RobotTask createRobotTask(URL JavaDoc url, int maxDepth, String JavaDoc startReferer) {
1701        url = removeWasteParameters(url);
1702        return new RobotTask(url, maxDepth, startReferer);
1703    }
1704
    /** only true if form-handlers are defined */
    boolean hasFormHandlers = false;

    /** list of wasteParameters; parameters matching these prefixes are
     *  removed from URLs (see removeWasteParameters) **/
    protected Vector wasteParameters = new Vector();
1710    
    /**
     * Sets the list of wasteParameters.
     * URL parameters are removed if their names begin with one of the
     * strings in this vector.
     * @param wasteParameters Vector of String prefixes
     */
    public void setWasteParameters(Vector wasteParameters) {
        this.wasteParameters = wasteParameters;
    }
1719
1720    /**
1721     * Gets the list of wasteParameters (will be removed from URLs)
1722     * @return a Vector containing Strings
1723     */

1724    public Vector JavaDoc getWasteParameters() {
1725        return this.wasteParameters;
1726    }
1727
1728    /** Removes wasteParameters from URL.
1729     * (eg. ID)
1730     * @param url
1731     * @return URL
1732     */

1733    public URL JavaDoc removeWasteParameters(URL JavaDoc url) {
1734        String JavaDoc urlString = url.toExternalForm();
1735        String JavaDoc newUrlString = removeParametersFromString(urlString, wasteParameters);
1736        if (urlString != newUrlString) {
1737            try {
1738                url = new URL JavaDoc(newUrlString);
1739            } catch (MalformedURLException JavaDoc ex) {
1740                ex.printStackTrace();
1741            }
1742        };
1743        return url;
1744    }
1745    
1746    /**
1747     * Remove passed Parameters from UrlString
1748     * @param urlString
1749     * @param wasteParameters
1750     * @return String
1751     */

1752    public static String JavaDoc removeParametersFromString(String JavaDoc urlString, Vector JavaDoc wasteParameters) {
1753        if (wasteParameters != null && wasteParameters.size() > 0) {
1754            int questionMark = urlString.indexOf("?");
1755            if (questionMark>0 && questionMark<urlString.length()) {
1756                int restPosition = urlString.indexOf("#", questionMark);
1757                String JavaDoc parameters;
1758                String JavaDoc rest;
1759                if (restPosition<0) {
1760                    parameters = urlString.substring(questionMark+1);
1761                    rest = null;
1762                } else {
1763                    parameters = urlString.substring(questionMark+1,restPosition);
1764                    rest = urlString.substring(restPosition);
1765                }
1766                
1767                StringBuffer JavaDoc filteredUrl = new StringBuffer JavaDoc(urlString.substring(0,questionMark));
1768                StringTokenizer JavaDoc tokenizer = new StringTokenizer JavaDoc(parameters, "&");
1769                String JavaDoc and = "?";
1770                boolean changed = false;
1771                while (tokenizer.hasMoreTokens()) {
1772                    String JavaDoc token = tokenizer.nextToken();
1773                    boolean keep = true;
1774                    for (int w=0; w<wasteParameters.size(); w++) {
1775                        String JavaDoc wasteParameter = (String JavaDoc) wasteParameters.elementAt(w);
1776                        if (token.startsWith(wasteParameter + "=")) {
1777                            keep = false;
1778                            changed = true;
1779                            break;
1780                        }
1781                    }
1782                    if (keep) {
1783                        filteredUrl.append(and);
1784                        filteredUrl.append(token);
1785                        and = "&";
1786                    }
1787                }
1788                if (rest != null) filteredUrl.append(rest);
1789                if (changed) {
1790                    urlString = filteredUrl.toString();
1791                }
1792            }
1793        }
1794        return urlString;
1795    }
1796    
    /** time of WebRobot start in milliseconds */
    protected long startTime = System.currentTimeMillis();

    /** number of allowed retries for document retrieval (0 = no retries) */
    protected int maxRetries = 0;
1802    
1803    /**
1804     * Set allowed retries for document retrieval
1805     * @param maxRetries
1806     */

1807    public void setMaxRetries(int maxRetries) { this.maxRetries = maxRetries; }
1808    
1809    /**
1810     * Get allowed retries for document retrieval
1811     * @return maxRetries
1812     */

1813    public int getMaxRetries() { return maxRetries; }
1814    
    /**
     * expiration age of documents in cache.
     * Documents older than expirationAge will be removed,
     * negative value means no limit.
     */
    protected long expirationAge = -1;
1821    
1822    /**
1823     * set expiration age of documents in cache.
1824     * Documents older than expirationAge will be removed,
1825     * negative value means no limit.
1826     * @param age
1827     */

1828    public void setExpirationAge(long age) { expirationAge = age; }
1829    
1830    /**
1831     * get expiration age of documents in cache.
1832     * @return long
1833     */

1834    public long getExpirationAge() { return expirationAge; }
1835    
1836    /**
1837     * Remove Parameters from Url
1838     * @param url
1839     * @return url without parameters
1840     */

1841    private final static String JavaDoc removeParameters(String JavaDoc url) {
1842        int pos = url.indexOf("?");
1843        return pos >= 0 ? url.substring(0,pos) : url;
1844    }
1845    
1846    /**
1847     * Reads a File to a byte array.
1848     * @param file
1849     * @return byte[]
1850     * @throws IOException
1851     */

1852    protected byte[] readFileToByteArray(File JavaDoc file) throws IOException JavaDoc
1853    {
1854        FileInputStream JavaDoc in = null;
1855
1856        try
1857        {
1858            byte[] buffer = new byte[(int) file.length()];
1859            in = new FileInputStream JavaDoc(file);
1860            in.read(buffer);
1861
1862            return buffer;
1863        }
1864        finally
1865        {
1866            if (in != null)
1867            {
1868                try
1869                {
1870                    in.close();
1871                }
1872                catch (IOException JavaDoc e)
1873                {
1874                }
1875            }
1876        }
1877    }
1878    
1879}
1880
1881
Popular Tags