package net.matuschek.spider;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.StringTokenizer;
import java.util.Vector;

import net.matuschek.html.FormFiller;
import net.matuschek.html.HtmlDocument;
import net.matuschek.http.DocManagerException;
import net.matuschek.http.DownloadRuleSet;
import net.matuschek.http.ExtendedURL;
import net.matuschek.http.HttpConstants;
import net.matuschek.http.HttpDoc;
import net.matuschek.http.HttpDocManager;
import net.matuschek.http.HttpException;
import net.matuschek.http.HttpHeader;
import net.matuschek.http.HttpTool;
import net.matuschek.http.HttpToolCallback;
import net.matuschek.http.NTLMAuthorization;
import net.matuschek.http.cookie.CookieManager;
import net.matuschek.spider.docfilter.FilterChain;
import net.matuschek.spider.docfilter.FilterException;

import org.apache.log4j.Category;
import org.w3c.dom.Element;

/**
 * A web-crawling robot ("JoBo"). Starting from {@link #setStartURL(URL)} it
 * walks the link graph breadth-first via a to-do {@link TaskList}, retrieves
 * documents through an {@link HttpTool}, and hands them to an optional
 * {@link HttpDocManager} for caching/processing. Crawl scope is constrained
 * by depth, host/domain rules, an optional {@link URLCheck}, and robots.txt
 * (via {@link NoRobots}) unless ignored.
 *
 * <p>Implements {@link Runnable} so a crawl can be run on its own thread
 * ({@link #run()} simply delegates to {@link #work()}).
 */
public class WebRobot implements Runnable, Cloneable {

    /** Robot product name reported to servers and robots.txt parsing. */
    private final static String ROBOT_NAME = "JoBo";

    /** Default User-Agent string (robot name + version + homepage). */
    private final static String AGENT_NAME =
        ROBOT_NAME + "/1.4 (http://www.matuschek.net/jobo.html)";

    /** Handler invoked on retrieval/processing errors; never null (see setter). */
    protected RobotExceptionHandler exceptionHandler =
        new DefaultRobotExceptionHandler();

    /** Default maximum link depth if none is configured. */
    private final static int DEFAULT_DEPTH = 10;

    /** URL the crawl starts from; also anchors the default scope checks. */
    protected URL startURL = null;

    /** host + directory prefix of startURL, used by basicURLCheck(). */
    protected String startDir = "";

    /** Maximum link depth to follow from the start URL. */
    protected int maxDepth = DEFAULT_DEPTH;

    /** If true, links to any host are followed (disables scope checks). */
    protected boolean walkToOtherHosts = false;

    /** Optional document cache/processor; may be null (no storage). */
    protected HttpDocManager docManager;

    /** HTTP client used for all retrievals. */
    protected HttpTool httpTool = new HttpTool();

    /** log4j logger, initialized per concrete class name in the constructor. */
    protected Category log;

    /** Referer header sent for the very first request. */
    protected String startReferer = "-";

    /** robots.txt checker; re-created whenever the agent name changes. */
    protected NoRobots robCheck;

    /** Queue of tasks still to visit (registered via registerToDoList). */
    protected TaskList todo = null;

    /** Set of already-visited tasks (registered via registerVisitedList). */
    protected TaskList visited = null;

    /**
     * Mirror flag for robots.txt ignoring.
     * NOTE(review): setIgnoreRobotsTxt() only updates robCheck, not this
     * field, so getIgnoreRobotsTxt() can return a stale value — confirm.
     */
    protected boolean ignoreRobotsTxt = false;

    /** Pause between retrievals, in seconds (see sleepNow()). */
    protected int sleepTime = 1;

    /** Fills HTML forms to generate additional POST/GET tasks. */
    protected FormFiller formFiller = new FormFiller();

    /** URLs (as strings) that may be visited more than once. */
    protected Vector visitMany = new Vector();

    /** Optional progress/status callback; may be null. */
    protected WebRobotCallback webRobotCallback = null;

    /** Set by stopRobot() to make walkTree() terminate. */
    protected boolean stopIt = false;

    /** Optional pluggable URL filter; may be null. */
    protected URLCheck urlCheck = null;

    /** While true, walkTree() pauses between documents (see setSleep). */
    protected boolean sleep;

    /** Extra URL-string prefixes that are always in scope. */
    protected Vector allowedURLs = new Vector();

    /** If true, the whole start host is in scope, not just startDir. */
    protected boolean allowWholeHost = true;

    /** Max age (seconds) before a cached document is re-retrieved; -1 = never re-scan. */
    protected long maxDocumentAge = -1;

    /** If true, any host ending with the start host's domain is in scope. */
    protected boolean allowWholeDomain = true;

    /** If true, "www." prefixes are ignored when comparing hosts. */
    protected boolean flexibleHostCheck = false;

    /** Optional document filter chain applied before processing; may be null. */
    protected FilterChain filters = null;

    /** If true, GET documents without parameters may be served from the cache. */
    protected boolean allowCaching = true;

    /** If true, documents with identical content (MD5) are detected and linked. */
    protected boolean duplicateCheck = false;

    /**
     * Creates a robot sized for the given expected number of documents
     * (used to pre-size the content map and the task lists).
     *
     * @param expectedDocumentCount expected total documents in this crawl
     */
    public WebRobot(int expectedDocumentCount) {
        log = Category.getInstance(getClass().getName());
        content2UrlMap = new HashMap(expectedDocumentCount);
        registerVisitedList(new HashedMemoryTaskList(false,
                expectedDocumentCount));
        registerToDoList(new HashedMemoryTaskList(true,
                expectedDocumentCount));
        this.expectedDocumentCount = expectedDocumentCount;
        this.setAgentName(AGENT_NAME);
    }

    /** Creates a robot with the default expected document count. */
    public WebRobot() {
        this(DEFAULT_EXPECTED_DOCUMENT_COUNT);
    }

    /**
     * Replaces the to-do task list implementation.
     *
     * @param todo list that will hold tasks still to be visited
     */
    public void registerToDoList(TaskList todo) {
        this.todo = todo;
    }

    /**
     * Replaces the visited task list implementation.
     *
     * @param visited list that will record already-visited tasks
     */
    public void registerVisitedList(TaskList visited) {
        this.visited = visited;
    }

    /** @return the configured start URL (may be null if not set yet). */
    public URL getStartURL() {
        return startURL;
    }

    /**
     * Sets the start URL and derives startDir (host + directory part of the
     * path) which basicURLCheck() uses as the default allowed prefix.
     *
     * @param startURL URL the crawl will start from
     */
    public void setStartURL(URL startURL) {
        String path = startURL.getPath();
        this.startURL = startURL;

        // Directory URL: keep the full path; otherwise strip the file name.
        if (path.endsWith("/")) {
            this.startDir = startURL.getHost() + path;
        } else {
            int pos = path.lastIndexOf("/");
            if (pos < 0) {
                // no path component at all
                this.startDir = startURL.getHost() + "/";
            } else {
                this.startDir = startURL.getHost() + path.substring(0, pos + 1);
            }
        }
    }

    /** @return maximum link depth followed from the start URL. */
    public int getMaxDepth() {
        return maxDepth;
    }

    /** @param maxDepth maximum link depth to follow. */
    public void setMaxDepth(int maxDepth) {
        this.maxDepth = maxDepth;
    }

    /** @return bandwidth limit as reported by the underlying HttpTool. */
    public int getBandwidth() {
        return httpTool.getBandwidth();
    }

    /** @param bandwidth bandwidth limit, delegated to the HttpTool. */
    public void setBandwidth(int bandwidth) {
        httpTool.setBandwidth(bandwidth);
    }

    /** @return true if links to foreign hosts are followed. */
    public boolean getWalkToOtherHosts() {
        return walkToOtherHosts;
    }

    /** @param walkToOtherHosts true to follow links to any host. */
    public void setWalkToOtherHosts(boolean walkToOtherHosts) {
        this.walkToOtherHosts = walkToOtherHosts;
    }

    /** @return true if the whole start host (not just startDir) is in scope. */
    public boolean getAllowWholeHost() {
        return allowWholeHost;
    }

    /** @param allowWholeHost true to allow the entire start host. */
    public void setAllowWholeHost(boolean allowWholeHost) {
        this.allowWholeHost = allowWholeHost;
    }

    /** @return true if any host in the start URL's domain is in scope. */
    public boolean getAllowWholeDomain() {
        return allowWholeDomain;
    }

    /** @param allowWholeDomain true to allow the entire domain. */
    public void setAllowWholeDomain(boolean allowWholeDomain) {
        this.allowWholeDomain = allowWholeDomain;
    }

    /** @return true if "www." prefixes are ignored in host comparison. */
    public boolean getFlexibleHostCheck() {
        return flexibleHostCheck;
    }

    /** @param flexibleHostCheck true to treat host and www.host as equal. */
    public void setFlexibleHostCheck(boolean flexibleHostCheck) {
        this.flexibleHostCheck = flexibleHostCheck;
    }

    /** @return true if cached documents may be reused for GET requests. */
    public boolean getAllowCaching() {
        return allowCaching;
    }

    /** @param allowCaching true to allow serving documents from the cache. */
    public void setAllowCaching(boolean allowCaching) {
        this.allowCaching = allowCaching;
    }

    /** @return the document manager, or null if none is set. */
    public HttpDocManager getDocManager() {
        return docManager;
    }

    /**
     * Sets the document manager used for caching, duplicate detection and
     * document processing. May be null to disable storage entirely.
     *
     * @param docManager the manager, or null
     */
    public void setDocManager(HttpDocManager docManager) {
        this.docManager = docManager;
    }

    /** @param cm cookie manager, delegated to the HttpTool. */
    public void setCookieManager(CookieManager cm) {
        httpTool.setCookieManager(cm);
    }

    /** @return the HttpTool's cookie manager. */
    public CookieManager getCookieManager() {
        return httpTool.getCookieManager();
    }

    /** @param rules download rule set, delegated to the HttpTool. */
    public void setDownloadRuleSet(DownloadRuleSet rules) {
        httpTool.setDownloadRuleSet(rules);
    }
// ---- crawl control, retrieval engine, and helpers (class interior continues) ----

    /** @param check optional URL filter consulted by isAllowed(); may be null. */
    public void setURLCheck(URLCheck check) {
        this.urlCheck = check;
    }

    /**
     * Configures an HTTP proxy on the underlying HttpTool.
     *
     * @param proxyDescr proxy description string understood by HttpTool
     * @throws HttpException if the description cannot be applied
     */
    public void setProxy(String proxyDescr) throws HttpException {
        httpTool.setProxy(proxyDescr);
    }

    /** @return the HttpTool's current proxy description. */
    public String getProxy() {
        return httpTool.getProxy();
    }

    /** @return the Referer value sent for the first request. */
    public String getStartReferer() {
        return startReferer;
    }

    /** @param startReferer Referer value for the first request. */
    public void setStartReferer(String startReferer) {
        this.startReferer = startReferer;
    }

    /**
     * Enables/disables robots.txt checking on the NoRobots helper.
     * NOTE(review): does not update the ignoreRobotsTxt field, so
     * getIgnoreRobotsTxt() may disagree with the actual behavior — confirm.
     *
     * @param ignoreRobotsTxt true to ignore robots.txt rules
     */
    public void setIgnoreRobotsTxt(boolean ignoreRobotsTxt) {
        robCheck.setIgnore(ignoreRobotsTxt);
    }

    /** @return pause between retrievals, in seconds. */
    public int getSleepTime() {
        return sleepTime;
    }

    /** @param sleepTime pause between retrievals, in seconds (0 disables). */
    public void setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
    }

    /** @param fromAddress From header value, delegated to the HttpTool. */
    public void setFromAddress(String fromAddress) {
        httpTool.setFromAddress(fromAddress);
    }

    /**
     * Sets the form handlers used to auto-fill and submit HTML forms.
     * NOTE(review): hasFormHandlers is latched to true here but never reset
     * to false when handlers are removed — confirm intended.
     *
     * @param handlers vector of form handlers (may be null or empty)
     */
    public void setFormHandlers(Vector handlers) {
        formFiller.setFormHandlers(handlers);
        if (handlers != null && handlers.size() > 0) {
            hasFormHandlers = true;
        }
    }

    /** @return the form handlers currently held by the FormFiller. */
    public Vector getFormHandlers() {
        return formFiller.getFormHandlers();
    }

    /** @return the User-Agent name, or null if no HttpTool is present. */
    public String getAgentName() {
        if (httpTool != null) {
            return httpTool.getAgentName();
        } else {
            return null;
        }
    }

    /**
     * Sets the User-Agent name and rebuilds the robots.txt checker, since
     * robots.txt rules are matched against the agent name.
     *
     * @param name new agent name
     */
    public void setAgentName(String name) {
        httpTool.setAgentName(name);
        robCheck = new NoRobots(name, httpTool);
    }

    /** @return HttpTool timeout, or -1 if no HttpTool is present. */
    public int getTimeout() {
        if (httpTool != null) {
            return httpTool.getTimeout();
        } else {
            return -1;
        }
    }

    /** @param timeout connection timeout, delegated to the HttpTool. */
    public void setTimeout(int timeout) {
        httpTool.setTimeout(timeout);
    }

    /** @return NTLM credentials from the HttpTool, or null if none/no tool. */
    public NTLMAuthorization getNtlmAuthorization() {
        if (httpTool != null) {
            return httpTool.getNtlmAuthorization();
        } else {
            return null;
        }
    }

    /** @param ntlmAuthorization NTLM credentials, delegated to the HttpTool. */
    public void setNtlmAuthorization(NTLMAuthorization ntlmAuthorization) {
        httpTool.setNtlmAuthorization(ntlmAuthorization);
    }

    /**
     * @return the ignoreRobotsTxt field (see NOTE on setIgnoreRobotsTxt:
     *         this may not reflect the NoRobots helper's actual setting).
     */
    public boolean getIgnoreRobotsTxt() {
        return ignoreRobotsTxt;
    }

    /** @return URL strings that may be visited more than once. */
    public Vector getVisitMany() {
        return visitMany;
    }

    /** @param visitMany URL strings that may be visited more than once. */
    public void setVisitMany(Vector visitMany) {
        this.visitMany = visitMany;
    }

    /** @param callback progress callback, delegated to the HttpTool. */
    public void setHttpToolCallback(HttpToolCallback callback) {
        httpTool.setCallback(callback);
    }

    /** @return the robot-level status callback, or null. */
    public WebRobotCallback getWebRobotCallback() {
        return webRobotCallback;
    }

    /** @param webRobotCallback robot-level status callback; may be null. */
    public void setWebRobotCallback(WebRobotCallback webRobotCallback) {
        this.webRobotCallback = webRobotCallback;
    }

    /**
     * Pauses (true) or resumes (false) the crawl; walkTree() polls this flag
     * once per second while sleeping.
     *
     * @param sleep true to pause the robot
     */
    public void setSleep(boolean sleep) {
        this.sleep = sleep;
    }

    /** @return true if the robot is currently paused. */
    public boolean isSleeping() {
        return this.sleep;
    }

    /** @param allowed URL-string prefixes that are always in scope. */
    public void setAllowedURLs(Vector allowed) {
        this.allowedURLs = allowed;
    }

    /** @return URL-string prefixes that are always in scope. */
    public Vector getAllowedURLs() {
        return this.allowedURLs;
    }

    /** @param enable true to enable cookie handling in the HttpTool. */
    public void setEnableCookies(boolean enable) {
        httpTool.setEnableCookies(enable);
    }

    /** @return true if the HttpTool handles cookies. */
    public boolean getEnableCookies() {
        return httpTool.getEnableCookies();
    }

    /**
     * @param maxAge maximum cached-document age in seconds before a re-scan;
     *               negative disables re-scanning (cache is always fresh)
     */
    public void setMaxDocumentAge(long maxAge) {
        this.maxDocumentAge = maxAge;
    }

    /** @return maximum cached-document age in seconds (-1 = unlimited). */
    public long getMaxDocumentAge() {
        return this.maxDocumentAge;
    }

    /** @param filters filter chain applied to documents before processing. */
    public void setFilters(FilterChain filters) {
        this.filters = filters;
    }

    /** Clears all cookies held by the HttpTool. */
    public void clearCookies() {
        httpTool.clearCookies();
    }

    /** Runnable entry point; simply delegates to work(). */
    public void run() {
        work();
    }

    /**
     * Runs a complete crawl: seeds the to-do list with the start URL,
     * walks the link tree, cleans up, and logs retrieval statistics.
     */
    public void work() {
        RobotTask task = createRobotTask(startURL, maxDepth, startReferer);
        todo.add(task);
        walkTree();
        cleanUp();
        log.info("Documents retrieved by: Web=" + countWeb + " Cache=" + countCache + " Refresh=" + countRefresh + " NoRefresh=" + countNoRefresh);
    }

    /** Asks the robot to stop; walkTree() exits after the current document. */
    public void stopRobot() {
        stopIt = true;
    }

    /** Escalation level of OutOfMemory handling (see handleMemoryError). */
    private int memoryLevel = 0;

    /** When false (low memory), no new tasks are queued. */
    protected boolean activatedNewTasks = true;

    /** When false (low memory), visited URLs are no longer recorded. */
    protected boolean activatedUrlHistory = true;

    /** When false (low memory), content MD5 -> URL mapping is disabled. */
    protected boolean activatedContentHistory = true;

    /** 200 KB ballast released on OutOfMemory to give the VM headroom. */
    private byte memoryBuffer[] = new byte[200 * 1024];

    /**
     * Main crawl loop: pops tasks from the to-do list until it is empty or
     * stopRobot() was called. Skips already-visited tasks (unless listed in
     * visitMany), retries a task after OutOfMemoryError degradation, honors
     * the sleep flag, and notifies the callback about queue status.
     */
    public void walkTree() {
        while ((todo.size() > 0) && (!stopIt)) {
            RobotTask task;
            // take + mark-visited must be atomic w.r.t. other threads
            synchronized (visited) {
                task = todo.removeFirst();
                if (visited.contains(task) && (!visitMany.contains(task.getUrl().toString()))) {
                    log.debug("already visited: " + task.getUrl());
                    continue;
                }
                if (activatedUrlHistory) {
                    visited.add(task);
                }
            }

            // retry the same task until it survives without OutOfMemoryError
            boolean repeat = true;
            while (repeat) {
                try {
                    retrieveURL(task);
                    repeat = false;
                } catch (OutOfMemoryError memoryError) {
                    handleMemoryError(memoryError);
                }
            }

            // paused: poll once per second until setSleep(false)
            while (sleep) {
                if (webRobotCallback != null) {
                    webRobotCallback.webRobotSleeping(true);
                }
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                };
            }

            if (webRobotCallback != null) {
                webRobotCallback.webRobotSleeping(false);
            }

            if (webRobotCallback != null) {
                webRobotCallback.webRobotUpdateQueueStatus(todo.size());
            }
            spawnThread();
        }

        if (webRobotCallback != null) {
            finishThreads();
        }
    }

    /**
     * Progressive degradation on OutOfMemoryError: level 1 drops the URL and
     * content histories, level 2 stops accepting new tasks and releases the
     * ballast buffer, level 3+ gives up and rethrows.
     *
     * @param memoryError the error being handled
     * @throws OutOfMemoryError when no further degradation is possible
     */
    protected void handleMemoryError(OutOfMemoryError memoryError)
        throws OutOfMemoryError {
        memoryLevel++;
        log.error("OutOfMemoryError level=" + memoryLevel + "! (visited=" + visited.size() + ", todo=" + todo.size() + ")");
        switch (memoryLevel) {
        case 1:
            // sacrifice duplicate-visit protection to free memory
            visited.clear(); activatedUrlHistory = false;
            content2UrlMap.clear(); activatedContentHistory = false;
            System.gc();
            break;
        case 2:
            // stop growing the queue and release the 200 KB ballast
            activatedNewTasks = false;
            memoryBuffer = null;
            System.gc();
            break;
        case 3:
            throw memoryError;
        default :
            // touch the buffer so it cannot be optimized away, then give up
            if (memoryBuffer != null) {
                System.err.println(memoryBuffer[0]);
            }
            throw memoryError;
        }
    }

    /** Notifies the callback that the crawl is done and finishes the doc manager. */
    protected void finishThreads() {
        webRobotCallback.webRobotDone();
        if (docManager != null) {
            docManager.finish();
        }
    }

    /** Hook for subclasses to spawn worker threads; no-op here. */
    protected synchronized void spawnThread() {
    }

    /** Iteration counter hook for subclasses (not used in this class). */
    protected int iteration = 0;

    /**
     * Retrieves a single task's URL, possibly from the cache, follows
     * redirects, extracts links/forms from HTML, performs duplicate
     * detection, and hands the document to the doc manager.
     *
     * @param task the task to retrieve; null is ignored
     */
    public void retrieveURL(RobotTask task) {
        if (task == null) {
            log.debug("Empty task found, ignoring");
            return;
        }

        long now = System.currentTimeMillis();

        updateProgressInfo();

        URL u = task.getUrl();
        String urlString = u.toString();
        String referer = task.getReferer();
        int depth = task.getMaxDepth();

        if (depth < 0) {
            log.info("Max search depth reached");
            return;
        }

        // scope / robots.txt / URLCheck filtering
        if (!isAllowed(u)) {
            log.info("Url '" + u + "' filtered out.");
            return;
        }

        // normalize bare host URLs to end with "/"
        if (u.getFile().equals("")) {
            try {
                urlString = urlString + "/";
                u = new URL(urlString);
                task.setUrl(u);
            } catch (MalformedURLException e) {
                log.error("URL not well formed: " + e.toString());
                exceptionHandler.handleException(this, u, e);
                return;
            }
        }

        log.info("retrieving " + urlString);
        httpTool.setReferer(referer);

        HttpDoc doc = null;
        Vector links = null;
        boolean cached = false;

        // cache lookup is only possible for parameterless GET requests
        boolean reScan = true;
        if ((docManager != null && allowCaching)
            && (task.getMethod() == HttpConstants.GET)
            && (task.getParamString() == null)) {
            doc = docManager.retrieveFromCache(u);

            if (doc != null) {
                countCache++;
                long lastRetrieved = doc.getDateAsMilliSeconds();
                // NOTE(review): integer division — sub-second precision is
                // truncated before the assignment to double; confirm intended.
                double ageInSeconds = (now - lastRetrieved) / 1000;
                if (ageInSeconds < 0) {
                    log.warn("DocumentAge < 0!");
                }
                reScan = maxDocumentAge >= 0 && ageInSeconds > maxDocumentAge;
                if (reScan) {
                    // conditional GET based on the cached copy's mtime
                    long lastModified = doc.getLastModifiedAsMilliSeconds();
                    Date lastModifiedDate = new Date(lastModified);
                    httpTool.setIfModifiedSince(lastModifiedDate);
                }
            } else {
                httpTool.setIfModifiedSince(null);
            }
        }

        if (reScan) {
            HttpDoc newDoc;
            boolean error = false;
            try {
                if (u.getProtocol().equalsIgnoreCase("file")) {
                    // local file:// URLs bypass the HttpTool
                    newDoc = retrieveFileURL(u, httpTool.getIfModifiedSince());
                } else {
                    newDoc = httpTool.retrieveDocument(u, task.getMethod(), task.getParamString());
                    if (newDoc != null) {
                        newDoc.setDate(now);
                    }
                    sleepNow();
                }

                if (newDoc != null && !newDoc.isNotModified()) {
                    if (!(newDoc.isOk() || newDoc.isRedirect())) {
                        error = true;
                    }
                } else {
                    // 304 Not Modified (or no doc): keep the cached copy
                    if (doc != null) {
                        doc.setDate(now);
                        doc.setCached(false);
                        newDoc = null;
                    }
                }
            } catch (HttpException hex) {
                error = true; newDoc = null;
            }
            if (error) {
                int retry = task.retry();
                if (retry <= maxRetries) {
                    // re-queue for another attempt
                    synchronized (visited) {
                        todo.add(task);
                        visited.remove(task);
                    }
                    log.info("Adding " + u + " for retry no. " + retry);
                    return;
                } else {
                    // retries exhausted: fall back to the cache if allowed
                    // NOTE(review): NPE if docManager is null here — confirm.
                    doc = docManager.retrieveFromCache(u);
                    if (doc == null) {
                        log.warn("Unsuccessfull retries for " + u);
                        return;
                    } else {
                        long docDate = doc.getDateAsMilliSeconds();
                        long age = (now - docDate);
                        age /= 1000;
                        if (expirationAge < 0 || age < expirationAge) {
                            newDoc = doc;
                            cached = true;
                            log.info("Cached document not expired: " + u);
                        } else {
                            log.warn("Cached document expired: " + u);
                            docManager.removeDocument(u);
                            return;
                        }
                    }
                }
            }

            if (newDoc != null) {
                countWeb++;
                doc = newDoc;
                // force link re-extraction from the fresh document
                links = null; countRefresh++;
            } else {
                cached = true;
                countNoRefresh++;
            }
        } else {
            cached = true;
            log.debug("Page " + u + " retrieved from cache");
        }

        if (doc == null) {
            log.info("not downloaded " + u);
            return;
        }

        // duplicate-content detection via MD5 (in-memory map, then cache)
        String duplicate = null;
        if (duplicateCheck) {
            duplicate = getContentVisitedURL(doc);
            if (duplicate != null) {
                log.info("URLs with same content found: " + urlString + " = " + duplicate);
            } else {
                try {
                    duplicate = docManager.findDuplicate(doc);
                    if (duplicate != null) {
                        log.info("URLs with same content found in cache: " + urlString + " = " + duplicate);
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }

            if (duplicate != null) {
                String pureDuplicate = removeParameters(duplicate);
                String pureUrl = removeParameters(urlString);
                if (!pureUrl.equals(pureDuplicate) && !cached) {
                    // store this URL too, reusing the duplicate's link list
                    try {
                        HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
                        if (linksDoc != null) {
                            doc.setLinks(linksDoc.getLinks());
                        }
                        docManager.storeDocument(doc);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
                // crawl onward from the canonical (duplicate) URL instead
                RobotTask newTask;
                try {
                    newTask = createRobotTask(new URL(duplicate), depth, referer);
                    if (!visited.contains(newTask)) {
                        addTask(newTask);
                    }
                } catch (MalformedURLException e) {
                    e.printStackTrace();
                }
                return;
            }
        }

        if (doc.isUnauthorized()) {
            log.info("got HTTP Unauthorized for URL " + u);
        }

        if (doc.isOk() || cached) {
            if (webRobotCallback != null) {
                int contentLength = 0;
                if (doc.getContent() != null) { contentLength = doc.getContent().length; }
                webRobotCallback.webRobotRetrievedDoc(urlString, contentLength);
            }

            try {
                if (doc.isHTML() && (depth > 0)) {
                    // parse HTML, honoring a charset= from the Content-type header
                    HtmlDocument htmlDoc = null;
                    HttpHeader contentTypeHeader = doc.getHeader("Content-type");
                    if (contentTypeHeader != null) {
                        String contentType = contentTypeHeader.getValue();
                        int index = contentType.toLowerCase().indexOf("charset=");
                        if (index > 0) {
                            htmlDoc = new HtmlDocument(u, doc.getContent(), contentType.substring(index + 8));
                        } else {
                            htmlDoc = new HtmlDocument(u, doc.getContent());
                        }
                    } else {
                        htmlDoc = new HtmlDocument(u, doc.getContent());
                    }

                    if (depth > 0) {
                        if (duplicate != null) {
                            HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
                            doc.setLinks(linksDoc.getLinks());
                        } else if (cached) {
                        }
                        if (links == null) {
                            links = htmlDoc.getLinks();
                            doc.setLinks(links);
                        }
                        if (duplicate == null) {
                            // queue each distinct link once
                            HashSet checkedLinks = new HashSet();
                            for (int i = 0; i < links.size(); i++) {
                                URL link = (URL) links.elementAt(i);
                                log.info("Link: " + link);
                                if (!checkedLinks.contains(link)) {
                                    checkedLinks.add(link);
                                    String myReferer = u.toString();
                                    if (u.getUserInfo() != null) {
                                        // strip user:password@ from the referer
                                        int endindex = myReferer.indexOf("@") + 1;
                                        myReferer = "http://" + myReferer.substring(endindex);
                                    }

                                    RobotTask newTask = createRobotTask((URL) links.elementAt(i), depth - 1, myReferer);
                                    if (!visited.contains(newTask)) {
                                        // NOTE(review): .jpg links jump the queue — confirm intended.
                                        if (newTask.urlString.endsWith(".jpg")) {
                                            addTaskAtStart(newTask);
                                        } else {
                                            addTask(newTask);
                                        }
                                    }
                                }
                            }
                        }
                    }

                    // generate tasks from auto-filled HTML forms
                    if (hasFormHandlers) {
                        Vector forms = htmlDoc.getElements("form");
                        for (int i = 0; i < forms.size(); i++) {
                            ExtendedURL eurl = formFiller.fillForm(u, (Element) forms.elementAt(i));
                            if (eurl != null) {
                                RobotTask newTask = createRobotTask(eurl.getURL(), depth - 1, u.toString());
                                newTask.setParamString(eurl.getParams());
                                newTask.setMethod(eurl.getRequestMethod());
                                addTask(newTask);
                            }
                        }
                    }

                }
            } catch (OutOfMemoryError e) {
                throw e;
            } catch (Throwable e) {
                // link extraction must never kill the crawl
                log.error("Unexpected error while extraction links from url '" + u + "':" + e);
                e.printStackTrace();
            }

            if ((docManager != null)) {
                try {
                    if (filters != null) {
                        doc = filters.process(doc);
                    } else {
                        log.debug("No filters defined");
                    }

                    if (isProcessingAllowed(doc)) {
                        docManager.processDocument(doc);
                    } else {
                        // replace content but keep the MD5 so duplicate
                        // detection still recognizes the original content
                        String md5 = doc.getHeaderValue(HttpHeader.CONTENT_MD5);
                        doc.setContent("Not for indexing".getBytes());
                        doc.setHeaderValue(HttpHeader.CONTENT_MD5, md5);
                    }

                    try {
                        docManager.storeDocument(doc);
                    } catch (Exception e) {
                        log.warn("could not store (not for indexing) " + urlString + ": " + e.getMessage());
                    }
                    if (activatedContentHistory && duplicate == null) {
                        setContentVisitedURL(doc, urlString);
                    }
                } catch (DocManagerException e1) {
                    log.error("could not process document: " + e1.getMessage());
                    exceptionHandler.handleException(this, u, e1);
                } catch (FilterException e2) {
                    log.error(e2.getMessage());
                }
            }

        } else {
            // non-OK, non-cached responses
            if (doc.isRedirect()) {
                String ref = doc.getLocation();
                log.info("Got redirect to " + ref);

                try {
                    URL u2 = new URL(u, ref);
                    // follow the redirect immediately (front of the queue)
                    RobotTask newTask = createRobotTask(u2, depth - 1, referer);
                    addTaskAtStart(newTask);
                } catch (MalformedURLException e) {
                    // NOTE(review): malformed redirect target silently dropped.
                }
            } else if (doc.isNotFound()) {
                exceptionHandler.handleException(this, u, new HttpException("Document not found"));
            } else if (doc.isUnauthorized()) {
                exceptionHandler.handleException(
                    this,
                    u,
                    new HttpException("No authorization for the document."));
            } else {
                exceptionHandler.handleException(this, u, new HttpException("Unknown document error (Http return code " + doc.getHttpCode() + ")."));
            }
        }
    }

    /** Hook for subclasses to report progress; no-op here. */
    public void updateProgressInfo() {
    }

    /**
     * Sleeps sleepTime seconds between retrievals (politeness delay),
     * notifying the callback before and after.
     */
    public void sleepNow() {
        if (sleepTime > 0) {
            synchronized (this) {
                if (webRobotCallback != null) {
                    webRobotCallback.webRobotSleeping(true);
                }

                try {
                    Thread.sleep(sleepTime * 1000);
                } catch (InterruptedException e) {
                }

                if (webRobotCallback != null) {
                    webRobotCallback.webRobotSleeping(false);
                }
            }
        }
    }

    /**
     * Retrieves a file:// URL from the local filesystem, emulating HTTP
     * status codes (200/304/404) and If-Modified-Since semantics.
     *
     * @param url             the file URL
     * @param ifModifiedSince only return content if newer; may be null
     * @return a synthetic HttpDoc for the file
     * @throws HttpException wrapping any underlying failure
     */
    private HttpDoc retrieveFileURL(URL url, Date ifModifiedSince) throws HttpException {
        HttpDoc doc = new HttpDoc();

        try {
            String host = url.getHost();
            String filename = url.getFile();
            if ((host == null) || (host.equals(""))) {
                // local path: strip a leading separator
                if ((filename.startsWith("\\")) || (filename.startsWith("/"))) {
                    filename = filename.substring(1);
                }
            } else {
                // UNC-style path for a remote host
                filename = "//" + host + filename;
            }
            String mimetypestr = getMimeTypeForFilename(filename);
            if (mimetypestr != null) {
                HttpHeader header = new HttpHeader("content-type", mimetypestr);
                doc.addHeader(header);
            }

            File file = new File(filename);
            if (!file.exists()) {
                doc.setHttpCode("httpcode " + HttpConstants.HTTP_NOTFOUND);
                return doc;
            }
            long fileLastModified = file.lastModified();
            long ifModifiedSinceTime = ifModifiedSince == null ? 0 : ifModifiedSince.getTime();
            if (fileLastModified > ifModifiedSinceTime) {
                byte[] content = readFileToByteArray(file);
                doc.setContent(content);
                doc.setHttpCode("httpcode " + HttpConstants.HTTP_OK);
            } else {
                doc.setHttpCode("httpcode " + HttpConstants.HTTP_NOTMODIFIED);
            }
            doc.setLastModified(fileLastModified);
            doc.setDate(System.currentTimeMillis());
            doc.setURL(url);

            return doc;
        } catch (Exception e) {
            // NOTE(review): cause is dropped; only the message survives.
            throw new HttpException(e.getMessage());
        }
    }

    /**
     * Very small extension-based MIME type guess.
     *
     * @param filename file name to inspect
     * @return "text/html" for .html/.htm, otherwise null
     */
    protected String getMimeTypeForFilename(String filename) {
        if (filename.endsWith(".html") || filename.endsWith(".htm")) {
            return "text/html";
        } else {
            return null;
        }
    }

    /** Resets stop flag and clears both task lists after a crawl. */
    protected void cleanUp() {
        stopIt = false;
        visited.clear();
        todo.clear();
    }

    /** Appends a task to the to-do list if allowed and tasks are accepted. */
    protected void addTask(RobotTask task) {
        if (taskAddAllowed(task) && activatedNewTasks) {
            todo.add(task);
        }
    }

    /** Prepends a task to the to-do list if allowed and tasks are accepted. */
    protected void addTaskAtStart(RobotTask task) {
        if (taskAddAllowed(task) && activatedNewTasks) {
            todo.addAtStart(task);
        }
    }

    /**
     * @param task candidate task
     * @return true if the task is non-null, its URL is in scope, and it is
     *         not already queued
     */
    protected boolean taskAddAllowed(RobotTask task) {
        if (task == null) {
            log.info("Null task not allowed");
            return false;
        }

        if (!isAllowed(task.getUrl())) {
            return false;
        }

        if (todo.contains(task)) {
            return false;
        }

        return true;
    }

    /**
     * Full admission check: basic scope rules, then the optional URLCheck,
     * then robots.txt.
     *
     * @param u URL to test
     * @return true if the URL may be visited
     */
    protected boolean isAllowed(URL u) {

        if (basicURLCheck(u)) {

            if ((urlCheck != null) && (!urlCheck.checkURL(u))) {
                log.debug("not allowed by URLCheck:" + u);
                return false;
            }

            if (robCheck.ok(u)) {
                return true;
            } else {
                log.debug("not allowed by robots.txt:" + u);
                return false;
            }
        }
        return false;
    }

    /**
     * Decides whether a retrieved document may be handed to the doc manager,
     * consulting the URLCheck and the HttpTool's download rules.
     *
     * @param doc the retrieved document
     * @return true if processing is allowed
     */
    protected boolean isProcessingAllowed(HttpDoc doc) {
        URL u = doc.getURL();
        if ((urlCheck != null) && (!urlCheck.checkURLForProcessing(u))) {
            log.debug("processing not allowed by URLCheck:" + u);
            return false;
        }

        DownloadRuleSet downloadRuleSet = httpTool.getDownloadRuleSet();
        if (downloadRuleSet != null && !downloadRuleSet.processAllowed(doc.getHttpHeaders())) {
            log.debug("processing not allowed by DownloadRuleSet:" + u);
            return false;
        }

        return true;
    }

    /**
     * Basic scope test against the start URL: other hosts, start directory,
     * whole host, www-insensitive host, whole domain, and the explicit
     * allowedURLs prefixes — in that order.
     *
     * @param currURL URL to test
     * @return true if the URL is inside the configured crawl scope
     */
    protected boolean basicURLCheck(URL currURL) {
        String currURLStr = currURL.getHost() + currURL.getPath();
        String currHost = currURL.getHost().toLowerCase();
        String startHost = startURL.getHost().toLowerCase();

        if (walkToOtherHosts) {
            return true;
        }

        if (currURLStr.startsWith(startDir)) {
            return true;
        }

        if (allowWholeHost && (currURL.getHost().equalsIgnoreCase(startURL.getHost()))) {
            return true;
        }

        if (flexibleHostCheck) {
            if (cutWWW(currHost).equalsIgnoreCase(cutWWW(startHost))) {
                return true;
            }
        }

        if (allowWholeDomain) {
            if (currHost.endsWith(getDomain(startHost))) {
                return true;
            }
        }

        for (int i = 0; i < allowedURLs.size(); i++) {
            String s = (String) allowedURLs.elementAt(i);
            if (currURLStr.startsWith(s)) {
                return true;
            }
        }
        log.debug("URL " + currURLStr + " not allowed");
        return false;
    }

    /**
     * @param hostname host name
     * @return the host name without a leading "www." (case-insensitive)
     */
    private String cutWWW(String hostname) {
        if (hostname.toLowerCase().startsWith("www.")) {
            return hostname.substring(4);
        } else {
            return hostname;
        }
    }

    /**
     * @param hostname host name
     * @return everything after the first "."; the host itself if it has none
     */
    private String getDomain(String hostname) {
        int pos = hostname.indexOf(".");
        if (pos < 0) {
            return hostname;
        } else {
            return hostname.substring(pos + 1);
        }
    }

    /** @return the current exception handler (never null). */
    public RobotExceptionHandler getExceptionHandler() {
        return exceptionHandler;
    }

    /**
     * Sets the exception handler; null is ignored so the handler is
     * guaranteed to stay non-null.
     *
     * @param newExceptionHandler replacement handler, or null to keep current
     */
    public void setExceptionHandler(RobotExceptionHandler newExceptionHandler) {
        if (newExceptionHandler != null) {
            exceptionHandler = newExceptionHandler;
        }
    }

    /**
     * String convenience for setStartURL().
     * NOTE(review): a malformed URL is only printed, not reported — confirm.
     *
     * @param startURL start URL as a string
     */
    public void setStart(String startURL) {
        try {
            setStartURL(new URL(startURL));
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }

    /** @return the start URL in external form, or null if unset. */
    public String getStart() {
        URL url = getStartURL();
        if (url != null) {
            return url.toExternalForm();
        } else {
            return null;
        }
    }

    /** Releases the HttpTool, robots.txt checker, and doc manager. */
    public void finish() {
        if (httpTool != null) {
            httpTool.finish();
        }
        if (robCheck != null) {
            robCheck.finish();
        }
        if (docManager != null) {
            docManager.finish();
        }
    }

    /**
     * Developer utility: prints a "robot.field = field;" assignment line for
     * every non-static, non-final field (e.g. to scaffold a copy method).
     * Command-line arguments are ignored.
     */
    public static void main(String[] args) {
        if (args.length > 0) System.err.println("Arguments will be ignored!");
        Field[] fields = WebRobot.class.getDeclaredFields();
        StringBuffer str = new StringBuffer(60);
        for (int i = 0; i < fields.length; i++) {
            if (!Modifier.isFinal(fields[i].getModifiers())
                && !Modifier.isStatic(fields[i].getModifiers())) {
                str.delete(0, str.length());
                str.append(" robot." + fields[i].getName() + " = " + fields[i].getName() + ";");
                // pad to a fixed column before the type comment
                while (str.length() < 50) {
                    str.append(" ");
                }
                System.out.println(str.toString() + "// (" + fields[i].getType().getName() + ")");
            }
        }
    }

    /** Default sizing hint for task lists and the content map. */
    private static final int DEFAULT_EXPECTED_DOCUMENT_COUNT = 50000;

    /** Sizing hint actually used by this instance. */
    protected int expectedDocumentCount = DEFAULT_EXPECTED_DOCUMENT_COUNT;

    /** Maps content MD5 -> first URL seen with that content. */
    protected HashMap content2UrlMap;

    /** Statistics: documents found in the cache. */
    long countCache = 0;

    /** Statistics: documents fetched from the network. */
    long countWeb = 0;

    /** Statistics: cached documents confirmed unchanged (304). */
    long countNoRefresh = 0;

    /** Statistics: cached documents that had to be re-fetched. */
    long countRefresh = 0;

    /**
     * Looks up a previously seen URL with the same content MD5.
     *
     * @param doc document whose content MD5 is the key
     * @return the earlier URL with identical content, or null
     */
    public String getContentVisitedURL(HttpDoc doc) {
        Object key = doc.getContentMD5();
        synchronized (content2UrlMap) {
            String url = (String) content2UrlMap.get(key);
            return url;
        }
    }

    /**
     * Records a URL as the canonical location for a document's content MD5.
     *
     * @param doc document providing the MD5 key
     * @param url URL to record for that content
     */
    public void setContentVisitedURL(HttpDoc doc, String url) {
        Object key = doc.getContentMD5();
        synchronized (content2UrlMap) {
            content2UrlMap.put(key, url);
        }
    }

    /** Builds a RobotTask after stripping configured waste parameters. */
    private final RobotTask createRobotTask(URL url, int maxDepth, String startReferer) {
        url = removeWasteParameters(url);
        return new RobotTask(url, maxDepth, startReferer);
    }

    /** True once setFormHandlers() received a non-empty handler list. */
    boolean hasFormHandlers = false;

    /** Query-parameter names to strip from URLs before queuing (e.g. session ids). */
    protected Vector wasteParameters = new Vector();

    /** @param wasteParameters parameter names to strip from queued URLs. */
    public void setWasteParameters(Vector wasteParameters) {
        this.wasteParameters = wasteParameters;
    }

    /** @return parameter names stripped from queued URLs. */
    public Vector getWasteParameters() {
        return this.wasteParameters;
    }

    /**
     * Returns the URL with all configured waste parameters removed.
     *
     * @param url URL to clean
     * @return cleaned URL (the same object if nothing changed)
     */
    public URL removeWasteParameters(URL url) {
        String urlString = url.toExternalForm();
        String newUrlString = removeParametersFromString(urlString, wasteParameters);
        // reference comparison is deliberate: removeParametersFromString
        // returns the identical String instance when nothing was removed
        if (urlString != newUrlString) {
            try {
                url = new URL(newUrlString);
            } catch (MalformedURLException ex) {
                ex.printStackTrace();
            }
        };
        return url;
    }

    /**
     * Removes every "name=value" pair whose name is in wasteParameters from
     * the query string, preserving any "#fragment" suffix.
     *
     * @param urlString       URL as a string
     * @param wasteParameters parameter names to drop (may be null/empty)
     * @return the filtered URL string; the original instance if unchanged
     */
    public static String removeParametersFromString(String urlString, Vector wasteParameters) {
        if (wasteParameters != null && wasteParameters.size() > 0) {
            int questionMark = urlString.indexOf("?");
            if (questionMark > 0 && questionMark < urlString.length()) {
                // split query from an optional trailing fragment
                int restPosition = urlString.indexOf("#", questionMark);
                String parameters;
                String rest;
                if (restPosition < 0) {
                    parameters = urlString.substring(questionMark + 1);
                    rest = null;
                } else {
                    parameters = urlString.substring(questionMark + 1, restPosition);
                    rest = urlString.substring(restPosition);
                }

                StringBuffer filteredUrl = new StringBuffer(urlString.substring(0, questionMark));
                StringTokenizer tokenizer = new StringTokenizer(parameters, "&");
                String and = "?";
                boolean changed = false;
                while (tokenizer.hasMoreTokens()) {
                    String token = tokenizer.nextToken();
                    boolean keep = true;
                    for (int w = 0; w < wasteParameters.size(); w++) {
                        String wasteParameter = (String) wasteParameters.elementAt(w);
                        if (token.startsWith(wasteParameter + "=")) {
                            keep = false;
                            changed = true;
                            break;
                        }
                    }
                    if (keep) {
                        filteredUrl.append(and);
                        filteredUrl.append(token);
                        and = "&";
                    }
                }
                if (rest != null) filteredUrl.append(rest);
                if (changed) {
                    urlString = filteredUrl.toString();
                }
            }
        }
        return urlString;
    }

    /** Wall-clock time when this robot instance was created. */
    protected long startTime = System.currentTimeMillis();

    /** How often a failed retrieval is retried before falling back to cache. */
    protected int maxRetries = 0;

    /** @param maxRetries number of retries for failed retrievals. */
    public void setMaxRetries(int maxRetries) { this.maxRetries = maxRetries; }

    /** @return number of retries for failed retrievals. */
    public int getMaxRetries() { return maxRetries; }

    /** Max age (seconds) a cached copy may have to substitute a failed fetch; -1 = unlimited. */
    protected long expirationAge = -1;

    /** @param age cache-substitute expiration age in seconds (-1 = unlimited). */
    public void setExpirationAge(long age) { expirationAge = age; }

    /** @return cache-substitute expiration age in seconds. */
    public long getExpirationAge() { return expirationAge; }

    /**
     * @param url a URL string
     * @return the URL without its "?query" part (fragment included in the cut)
     */
    private final static String removeParameters(String url) {
        int pos = url.indexOf("?");
        return pos >= 0 ? url.substring(0, pos) : url;
    }

    /**
     * Reads a whole file into a byte array.
     * NOTE(review): a single read() call is not guaranteed to fill the
     * buffer; a read loop would be safer — confirm before relying on this
     * for large files.
     *
     * @param file file to read
     * @return buffer of file.length() bytes
     * @throws IOException on read failure
     */
    protected byte[] readFileToByteArray(File file) throws IOException
    {
        FileInputStream in = null;

        try
        {
            byte[] buffer = new byte[(int) file.length()];
            in = new FileInputStream(file);
            in.read(buffer);

            return buffer;
        }
        finally
        {
            if (in != null)
            {
                try
                {
                    in.close();
                }
                catch (IOException e)
                {
                    // best-effort close; nothing useful to do here
                }
            }
        }
    }

}