package websphinx;

import rcm.util.PriorityQueue;
import rcm.util.Timer;
import java.util.Vector;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.IOException;
import java.io.Serializable;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

/**
 * Web crawler.
 * <p>
 * The crawl proceeds from a set of root links, expanding each downloaded
 * page into new links until the depth limit or the domain/type restrictions
 * stop it.  Pages are fetched by a pool of background {@link Worm} threads;
 * {@link #run()} drives the overall crawl and blocks until it stops.
 * <p>
 * Thread-safety: the crawl state is guarded by two monitors, always
 * acquired in the order {@code crawlQueue} then {@code fetchQueue}.
 * {@code crawlQueue} holds every link that has been queued but not fully
 * processed; {@code fetchQueue} holds the subset not yet downloaded.
 */
public class Crawler implements Runnable, Serializable {

    private static final long serialVersionUID = -3757789861952010450L;

    /** Domain constant: crawl anywhere on the Web (no restriction). */
    public static final String[] WEB = null;

    /** Domain constant: stay on the same Web server as the starting page. */
    public static final String[] SERVER = {"local"};

    /** Domain constant: stay within the directory subtree of the starting page. */
    public static final String[] SUBTREE = {"sibling", "descendent"};

    /** Link-type constant: follow only hyperlinks. */
    public static final String[] HYPERLINKS = {"hyperlink"};

    /** Link-type constant: follow hyperlinks and inline images. */
    public static final String[] HYPERLINKS_AND_IMAGES = {"hyperlink", "image"};

    /** Link-type constant: follow every kind of link (no restriction). */
    public static final String[] ALL_LINKS = null;

    // Persistent configuration (serialized).
    private String name = getClass().getName();
    private transient Link[] roots = null;
    private String[] rootHrefs = null;       // only used during (de)serialization
    private String[] domain = WEB;
    private boolean synchronous = false;
    private boolean depthFirst = true;
    private String[] type = HYPERLINKS;
    private boolean ignoreVisitedLinks = true;
    private int maxDepth = 5;
    private DownloadParameters dp = new DownloadParameters()
                                    .changeUserAgent(name);
    private Vector classifiers = new Vector();
    private LinkPredicate linkPredicate;
    private PagePredicate pagePredicate;
    private Action action;

    // Transient crawl state (rebuilt by init() after deserialization).
    private transient Link[] crawledRoots = null;
    private transient int state = CrawlEvent.CLEARED;
    private transient Worm[] worms;
    private transient PriorityQueue fetchQueue;   // links waiting to be downloaded
    private transient PriorityQueue crawlQueue;   // links queued but not fully processed
    private transient int numLinksTested;
    private transient int numPagesVisited;
    private transient int numPagesLeft;
    private transient Vector crawlListeners;
    private transient Vector linkListeners;
    private transient Hashtable visitedPages;     // keyed by page URL string
    private transient RobotExclusion robotExclusion;

    /**
     * Make a new Crawler with the standard classifier installed.
     */
    public Crawler() {
        addClassifier(new StandardClassifier());
        init();
    }

    /**
     * Initialize the transient crawl state.  Called by the constructor and
     * after deserialization.
     */
    private void init() {
        state = CrawlEvent.CLEARED;

        numLinksTested = 0;
        numPagesVisited = 0;
        numPagesLeft = 0;

        worms = null;
        crawlQueue = new PriorityQueue();
        fetchQueue = new PriorityQueue();

        crawlListeners = new Vector();
        linkListeners = new Vector();

        visitedPages = new Hashtable();
        robotExclusion = new RobotExclusion(getName());
    }

    /**
     * Write the crawler to a stream.  Links are not directly serializable,
     * so the roots are flattened to an array of URL strings first.
     */
    private void writeObject(ObjectOutputStream out)
            throws IOException {
        if (roots != null) {
            rootHrefs = new String[roots.length];
            for (int i = 0; i < roots.length; ++i)
                rootHrefs[i] = roots[i].getURL().toString();
        }
        else
            rootHrefs = null;

        out.defaultWriteObject();

        rootHrefs = null;   // don't keep the temporary copy alive
    }

    /**
     * Read the crawler from a stream: rebuild the root links from their
     * saved URL strings, canonicalize the domain/type arrays back to the
     * shared constants, and reattach the predicates and action.
     */
    private void readObject(ObjectInputStream in)
            throws IOException, ClassNotFoundException {
        in.defaultReadObject();

        if (rootHrefs != null) {
            roots = new Link[rootHrefs.length];
            for (int i = 0; i < rootHrefs.length; ++i)
                roots[i] = new Link(rootHrefs[i]);
        }
        else
            roots = null;

        // Replace value-equal arrays with the canonical constants so that
        // identity comparisons (e.g. domain == Crawler.WEB) keep working.
        domain = useStandard(WEB, domain);
        domain = useStandard(SERVER, domain);
        domain = useStandard(SUBTREE, domain);

        type = useStandard(HYPERLINKS, type);
        type = useStandard(HYPERLINKS_AND_IMAGES, type);
        type = useStandard(ALL_LINKS, type);

        init();

        if (linkPredicate != null)
            linkPredicate.connected(this);
        if (pagePredicate != null)
            pagePredicate.connected(this);
        if (action != null)
            action.connected(this);
    }

    /**
     * Return the canonical constant {@code standard} if {@code s} is
     * element-wise equal to it; otherwise return {@code s} unchanged.
     */
    private static String[] useStandard(String[] standard, String[] s) {
        if (s == null || standard == null || standard == s)
            return s;
        if (s.length != standard.length)
            return s;
        for (int i = 0; i < s.length; ++i)
            if (!s[i].equals(standard[i]))
                return s;
        return standard;
    }

    /**
     * Start crawling.  Blocks until the crawl is stopped, paused, cleared,
     * or times out.  A stopped crawl is cleared first; a cleared crawl is
     * (re)seeded from the roots.
     */
    public void run() {
        crawledRoots = roots;

        if (state == CrawlEvent.STOPPED)
            clear();

        if (state == CrawlEvent.CLEARED && crawledRoots != null) {
            // Give the roots evenly spaced priorities so they are fetched
            // in the order they were added.
            float priority = 0;
            float increment = 1.0f / crawledRoots.length;
            for (int i = 0; i < crawledRoots.length; ++i) {
                crawledRoots[i].setPriority(priority);
                priority += increment;
            }
            submit(crawledRoots);
        }

        state = CrawlEvent.STARTED;
        sendCrawlEvent(state);

        synchronized (crawlQueue) {
            Timer timer = new CrawlTimer(this);
            int timeout = dp.getCrawlTimeout();
            if (timeout > 0)
                timer.set(timeout * 1000, false);

            int nWorms = Math.max(dp.getMaxThreads(), 1);
            worms = new Worm[nWorms];
            for (int i = 0; i < nWorms; ++i) {
                worms[i] = new Worm(this, i);
                worms[i].start();
            }

            try {
                while (state == CrawlEvent.STARTED) {
                    if (numPagesLeft == 0) {
                        // Queue drained: the crawl finished on its own.
                        state = CrawlEvent.STOPPED;
                        sendCrawlEvent(state);
                    }
                    else if (synchronous) {
                        // Synchronous mode: process pages in priority order
                        // on this thread; worms only download.
                        Link link = (Link) crawlQueue.getMin();
                        if (link.getStatus() == LinkEvent.DOWNLOADED)
                            process(link);
                        else
                            crawlQueue.wait();
                    }
                    else
                        crawlQueue.wait();
                }
            } catch (InterruptedException e) {}

            timer.cancel();

            for (int i = 0; i < worms.length; ++i)
                worms[i].die();
            if (state == CrawlEvent.PAUSED) {
                // Push back any link a worm was fetching, so resuming the
                // crawl won't lose it.
                synchronized (fetchQueue) {
                    for (int i = 0; i < worms.length; ++i)
                        if (worms[i].link != null)
                            fetchQueue.put(worms[i].link);
                }
            }
            worms = null;
        }
    }

    /**
     * Stop the crawl (if running) and discard all crawling state: statistics,
     * the visited-pages table, and the crawled page graph.
     */
    public void clear() {
        stop();
        numPagesVisited = 0;
        numLinksTested = 0;
        clearVisited();
        if (crawledRoots != null)
            for (int i = 0; i < crawledRoots.length; ++i)
                crawledRoots[i].disconnect();
        crawledRoots = null;
        state = CrawlEvent.CLEARED;
        sendCrawlEvent(state);
    }

    /**
     * Pause the crawl.  The queues are preserved, so calling run() again
     * resumes where the crawl left off.
     */
    public void pause() {
        if (state == CrawlEvent.STARTED) {
            synchronized (crawlQueue) {
                state = CrawlEvent.PAUSED;
                crawlQueue.notify();    // wake run() so it can observe the new state
            }
            sendCrawlEvent(state);
        }
    }

    /**
     * Stop the crawl and empty both queues.  Unlike pause(), a stopped
     * crawl cannot be resumed.
     */
    public void stop() {
        if (state == CrawlEvent.STARTED || state == CrawlEvent.PAUSED) {
            synchronized (crawlQueue) {
                synchronized (fetchQueue) {
                    state = CrawlEvent.STOPPED;
                    fetchQueue.clear();
                    crawlQueue.clear();
                    numPagesLeft = 0;
                    crawlQueue.notify();
                }
            }
            sendCrawlEvent(state);
        }
    }

    /**
     * Called by CrawlTimer when the overall crawl timeout expires.
     * Behaves like stop() but reports TIMED_OUT.
     */
    void timedOut() {
        if (state == CrawlEvent.STARTED) {
            synchronized (crawlQueue) {
                synchronized (fetchQueue) {
                    state = CrawlEvent.TIMED_OUT;
                    fetchQueue.clear();
                    crawlQueue.clear();
                    numPagesLeft = 0;
                    crawlQueue.notify();
                }
            }
            sendCrawlEvent(state);
        }
    }

    /**
     * Get the current crawl state: one of the CrawlEvent constants
     * (CLEARED, STARTED, PAUSED, STOPPED, TIMED_OUT).
     */
    public int getState() {
        return state;
    }

    /**
     * Callback invoked for every page that passes the page predicate.
     * Default implementation does nothing; subclasses override.
     */
    public void visit(Page page) {
    }

    /**
     * Callback asked for every candidate link.  Default implementation
     * accepts everything; subclasses override to filter.
     */
    public boolean shouldVisit(Link l) {
        return true;
    }

    /**
     * Expand a downloaded page: test each of its links against the
     * type/domain/predicate filters and submit the survivors for crawling.
     */
    public void expand(Page page) {
        Link[] links = page.getLinks();

        if (links != null && links.length > 0) {
            // Depth-first search puts newer pages ahead of older ones
            // (more negative == higher priority in the min-queue);
            // breadth-first does the opposite.
            float priority = (depthFirst ? -numPagesVisited : numPagesVisited);
            float increment = 1.0f / links.length;

            for (int i = 0; i < links.length; ++i) {
                Link l = links[i];

                l.setPriority(priority);
                priority += increment;
                l.setDownloadParameters(dp);

                ++numLinksTested;
                if (ignoreVisitedLinks && visited(l))
                    sendLinkEvent(l, LinkEvent.ALREADY_VISITED);
                else if (!((type == null || l.hasAnyLabels(type))
                           && (domain == null || l.hasAnyLabels(domain))
                           && (linkPredicate == null || linkPredicate.shouldVisit(l))
                           && shouldVisit(l)))
                    sendLinkEvent(l, LinkEvent.SKIPPED);
                else if (page.getDepth() >= maxDepth)
                    sendLinkEvent(l, LinkEvent.TOO_DEEP);
                else
                    submit(l);
            }
        }
    }

    /*
     * Crawl statistics.
     */

    /** Get the number of pages visited so far in this crawl. */
    public int getPagesVisited() {
        return numPagesVisited;
    }

    /** Get the number of links tested so far in this crawl. */
    public int getLinksTested() {
        return numLinksTested;
    }

    /** Get the number of pages still queued for downloading or processing. */
    public int getPagesLeft() {
        return numPagesLeft;
    }

    /** Get the number of worm threads currently downloading a page. */
    public int getActiveThreads() {
        Worm[] w = worms;   // snapshot: run() may null the field concurrently

        if (w == null)
            return 0;

        int n = 0;
        for (int i = 0; i < w.length; ++i)
            if (w[i] != null && w[i].link != null)
                ++n;
        return n;
    }

    /*
     * Configuration accessors.
     */

    /** Get the crawler's name (used in the User-Agent header and robots.txt checks). */
    public String getName() {
        return name;
    }

    /** Set the crawler's name. */
    public void setName(String name) {
        this.name = name;
    }

    /** A crawler prints as its name. */
    public String toString() {
        return getName();
    }

    /** Get a copy of the starting links (never null; empty array if unset). */
    public Link[] getRoots() {
        if (roots == null)
            return new Link[0];

        Link[] result = new Link[roots.length];
        System.arraycopy(roots, 0, result, 0, roots.length);
        return result;
    }

    /**
     * Get a copy of the roots of the current/last crawl, or null if no
     * crawl has been started.
     */
    public Link[] getCrawledRoots() {
        if (crawledRoots == null)
            return null;

        Link[] result = new Link[crawledRoots.length];
        System.arraycopy(crawledRoots, 0, result, 0, crawledRoots.length);
        return result;
    }

    /** Get the root URLs as a newline-separated string. */
    public String getRootHrefs() {
        StringBuffer buf = new StringBuffer();
        if (roots != null) {
            for (int i = 0; i < roots.length; ++i) {
                if (buf.length() > 0)
                    buf.append('\n');
                buf.append(roots[i].getURL().toExternalForm());
            }
        }
        return buf.toString();
    }

    /**
     * Set the roots from a whitespace-separated list of URLs.
     * @throws MalformedURLException if any URL is invalid
     */
    public void setRootHrefs(String hrefs) throws MalformedURLException {
        Vector v = new Vector();
        StringTokenizer tok = new StringTokenizer(hrefs);
        while (tok.hasMoreElements())
            v.addElement(new Link(tok.nextToken()));
        roots = new Link[v.size()];
        v.copyInto(roots);
    }

    /** Set a single root link, replacing any existing roots. */
    public void setRoot(Link link) {
        roots = new Link[1];
        roots[0] = link;
    }

    /** Set the root links, replacing any existing roots (the array is copied). */
    public void setRoots(Link[] links) {
        roots = new Link[links.length];
        System.arraycopy(links, 0, roots, 0, links.length);
    }

    /** Add a link to the set of roots. */
    public void addRoot(Link link) {
        if (roots == null)
            setRoot(link);
        else {
            Link newroots[] = new Link[roots.length + 1];
            System.arraycopy(roots, 0, newroots, 0, roots.length);
            newroots[newroots.length - 1] = link;
            roots = newroots;
        }
    }

    /** Get the crawl domain (WEB, SERVER, or SUBTREE). */
    public String[] getDomain() {
        return domain;
    }

    /** Set the crawl domain (WEB, SERVER, or SUBTREE). */
    public void setDomain(String[] domain) {
        this.domain = domain;
    }

    /** Get the link types followed (e.g. HYPERLINKS, ALL_LINKS). */
    public String[] getLinkType() {
        return type;
    }

    /** Set the link types to follow. */
    public void setLinkType(String[] type) {
        this.type = type;
    }

    /** True if the crawl is depth-first, false if breadth-first. */
    public boolean getDepthFirst() {
        return depthFirst;
    }

    /** Choose depth-first (true) or breadth-first (false) crawling order. */
    public void setDepthFirst(boolean useDFS) {
        depthFirst = useDFS;
    }

    /** True if pages are processed synchronously in priority order by run(). */
    public boolean getSynchronous() {
        return synchronous;
    }

    /** Enable or disable synchronous page processing. */
    public void setSynchronous(boolean f) {
        synchronous = f;
    }

    /** True if links to already-visited pages are skipped. */
    public boolean getIgnoreVisitedLinks() {
        return ignoreVisitedLinks;
    }

    /** Set whether links to already-visited pages are skipped. */
    public void setIgnoreVisitedLinks(boolean f) {
        ignoreVisitedLinks = f;
    }

    /** Get the maximum crawl depth measured from a root. */
    public int getMaxDepth() {
        return maxDepth;
    }

    /** Set the maximum crawl depth. */
    public void setMaxDepth(int maxDepth) {
        this.maxDepth = maxDepth;
    }

    /** Get the default download parameters. */
    public DownloadParameters getDownloadParameters() {
        return dp;
    }

    /** Set the default download parameters. */
    public void setDownloadParameters(DownloadParameters dp) {
        this.dp = dp;
    }

    /**
     * Set the link predicate, disconnecting any previous one and
     * connecting the new one to this crawler.
     */
    public void setLinkPredicate(LinkPredicate pred) {
        if (pred == linkPredicate
            || (pred != null && pred.equals(linkPredicate)))
            return;
        if (linkPredicate != null)
            linkPredicate.disconnected(this);
        linkPredicate = pred;
        if (linkPredicate != null)
            linkPredicate.connected(this);
    }

    /** Get the link predicate, or null if none is set. */
    public LinkPredicate getLinkPredicate() {
        return linkPredicate;
    }

    /**
     * Set the page predicate, disconnecting any previous one and
     * connecting the new one to this crawler.
     */
    public void setPagePredicate(PagePredicate pred) {
        if (pred == pagePredicate
            || (pred != null && pred.equals(pagePredicate)))
            return;
        if (pagePredicate != null)
            pagePredicate.disconnected(this);
        pagePredicate = pred;
        if (pagePredicate != null)
            pagePredicate.connected(this);
    }

    /** Get the page predicate, or null if none is set. */
    public PagePredicate getPagePredicate() {
        return pagePredicate;
    }

    /**
     * Set the action, disconnecting any previous one and connecting the
     * new one to this crawler.
     */
    public void setAction(Action act) {
        if (act == action
            || (act != null && act.equals(action)))
            return;
        if (action != null)
            action.disconnected(this);
        action = act;
        if (action != null)
            action.connected(this);
    }

    /** Get the action, or null if none is set. */
    public Action getAction() {
        return action;
    }

    /*
     * Link queue management.
     */

    /**
     * Queue a link for crawling.  The link is marked visited immediately
     * so duplicates encountered later are skipped.
     */
    public void submit(Link link) {
        markVisited(link);
        sendLinkEvent(link, LinkEvent.QUEUED);
        synchronized (crawlQueue) {
            synchronized (fetchQueue) {
                crawlQueue.put(link);
                ++numPagesLeft;
                fetchQueue.put(link);
                fetchQueue.notifyAll();   // wake all idle worms
            }
        }
    }

    /** Queue an array of links for crawling. */
    public void submit(Link[] links) {
        for (int i = 0; i < links.length; ++i)
            submit(links[i]);
    }

    /**
     * Enumerate the links queued but not yet fully processed.
     * NOTE(review): iterates a live queue without synchronization — callers
     * presumably tolerate a racy snapshot; confirm before relying on it.
     */
    public Enumeration enumerateQueue() {
        return crawlQueue.elements();
    }

    /*
     * Classifiers.
     */

    /**
     * Add a classifier, keeping the list sorted by ascending priority.
     * Duplicates (by equals) are ignored.
     */
    public void addClassifier(Classifier c) {
        if (!classifiers.contains(c)) {
            float cpriority = c.getPriority();

            for (int i = 0; i < classifiers.size(); ++i) {
                Classifier d = (Classifier) classifiers.elementAt(i);
                if (cpriority < d.getPriority()) {
                    classifiers.insertElementAt(c, i);
                    return;
                }
            }
            classifiers.addElement(c);
        }
    }

    /** Remove a classifier. */
    public void removeClassifier(Classifier c) {
        classifiers.removeElement(c);
    }

    /** Remove all classifiers. */
    public void removeAllClassifiers() {
        classifiers.removeAllElements();
    }

    /** Enumerate the classifiers in priority order. */
    public Enumeration enumerateClassifiers() {
        return classifiers.elements();
    }

    /** Get the classifiers as an array, in priority order. */
    public Classifier[] getClassifiers() {
        Classifier[] c = new Classifier[classifiers.size()];
        classifiers.copyInto(c);
        return c;
    }

    /*
     * Event listeners.
     */

    /** Add a listener for crawl lifecycle events. */
    public void addCrawlListener(CrawlListener listen) {
        if (!crawlListeners.contains(listen))
            crawlListeners.addElement(listen);
    }

    /** Remove a crawl listener. */
    public void removeCrawlListener(CrawlListener listen) {
        crawlListeners.removeElement(listen);
    }

    /** Add a listener for per-link events. */
    public void addLinkListener(LinkListener listen) {
        if (!linkListeners.contains(listen))
            linkListeners.addElement(listen);
    }

    /** Remove a link listener. */
    public void removeLinkListener(LinkListener listen) {
        linkListeners.removeElement(listen);
    }

    /** Dispatch a CrawlEvent with the given id to all crawl listeners. */
    protected void sendCrawlEvent(int id) {
        CrawlEvent evt = new CrawlEvent(this, id);
        for (int j = 0, len = crawlListeners.size(); j < len; ++j) {
            CrawlListener listen = (CrawlListener) crawlListeners.elementAt(j);
            switch (id) {
            case CrawlEvent.STARTED:
                listen.started(evt);
                break;
            case CrawlEvent.STOPPED:
                listen.stopped(evt);
                break;
            case CrawlEvent.CLEARED:
                listen.cleared(evt);
                break;
            case CrawlEvent.TIMED_OUT:
                listen.timedOut(evt);
                break;
            case CrawlEvent.PAUSED:
                listen.paused(evt);
                break;
            }
        }
    }

    /**
     * Record the link's new status and dispatch a LinkEvent to all
     * link listeners.
     */
    protected void sendLinkEvent(Link l, int id) {
        LinkEvent evt = new LinkEvent(this, id, l);
        l.setStatus(id);
        for (int j = 0, len = linkListeners.size(); j < len; ++j) {
            LinkListener listen = (LinkListener) linkListeners.elementAt(j);
            listen.crawled(evt);
        }
    }

    /**
     * Record the link's new status and exception, and dispatch a
     * LinkEvent carrying the exception to all link listeners.
     */
    protected void sendLinkEvent(Link l, int id, Throwable exception) {
        LinkEvent evt = new LinkEvent(this, id, l, exception);
        l.setStatus(id);
        l.setLabel("exception", exception.toString());
        for (int j = 0, len = linkListeners.size(); j < len; ++j) {
            LinkListener listen = (LinkListener) linkListeners.elementAt(j);
            listen.crawled(evt);
        }
    }

    /*
     * Visited-pages table.
     */

    /** Test whether the page this link points to has already been visited. */
    public boolean visited(Link link) {
        return visitedPages.containsKey(link.getPageURL().toString());
    }

    /** Mark the page this link points to as visited. */
    protected void markVisited(Link link) {
        visitedPages.put(link.getPageURL().toString(), this);
    }

    /** Forget all visited pages. */
    protected void clearVisited() {
        visitedPages.clear();
    }

    /*
     * Fetch loop (run by each Worm thread).
     */

    /**
     * Fetch loop for one worm thread: repeatedly take the highest-priority
     * link off the fetch queue, download it (subject to the per-download
     * timeout and robots.txt), and either process it here (asynchronous
     * mode) or wake run() to process it (synchronous mode).
     */
    void fetch(Worm w) {
        Timer timer = new WormTimer(w);

        while (!w.dead) {
            // Block until a link is available or this worm is killed.
            synchronized (fetchQueue) {
                while (!w.dead
                       && (w.link = (Link) fetchQueue.deleteMin()) == null) {
                    try {
                        fetchQueue.wait();
                    } catch (InterruptedException e) {}
                }
            }

            if (w.dead)
                return;

            try {
                DownloadParameters dp;
                Page page;

                dp = w.link.getDownloadParameters();
                if (dp == null)
                    dp = this.dp;
                int timeout = dp.getDownloadTimeout();

                sendLinkEvent(w.link, LinkEvent.RETRIEVING);
                try {
                    if (timeout > 0)
                        timer.set(timeout * 1000, false);

                    if (dp.getObeyRobotExclusion()
                        && robotExclusion.disallowed(w.link.getURL()))
                        throw new IOException("disallowed by Robot Exclusion Standard (robots.txt)");

                    // Constructing the Page downloads it and attaches it to
                    // the link; the local variable itself is not used further.
                    page = new Page(w.link, dp);
                } finally {
                    timer.cancel();
                }

                if (w.dead)
                    return;

                sendLinkEvent(w.link, LinkEvent.DOWNLOADED);

                if (synchronous) {
                    // Let run() process pages in priority order.
                    synchronized (crawlQueue) {
                        crawlQueue.notify();
                    }
                }
                else {
                    process(w.link);
                }

                w.link = null;

            } catch (ThreadDeath e) {
                throw e;   // never swallow thread death (from Worm.die())
            } catch (Throwable e) {
                if (w.dead)
                    return;

                sendLinkEvent(w.link, LinkEvent.ERROR, e);
                synchronized (crawlQueue) {
                    crawlQueue.delete(w.link);
                    --numPagesLeft;
                    w.link = null;
                    crawlQueue.notify();
                }
            }
        }
    }

    /**
     * Process a downloaded page: run the classifiers, invoke the action
     * and visit() if the page predicate accepts it, expand its links, and
     * retire the link from the crawl queue.
     */
    void process(Link link) {
        Page page = link.getPage();

        for (int j = 0, len = classifiers.size(); j < len; ++j) {
            Classifier cl = (Classifier) classifiers.elementAt(j);
            cl.classify(page);
        }

        ++numPagesVisited;
        if (pagePredicate == null || pagePredicate.shouldActOn(page)) {
            if (action != null)
                action.visit(page);
            visit(page);
        }
        expand(page);

        sendLinkEvent(link, LinkEvent.VISITED);

        synchronized (crawlQueue) {
            crawlQueue.delete(link);
            --numPagesLeft;
            crawlQueue.notify();
        }
    }

    /**
     * Called by WormTimer when a single download exceeds its timeout:
     * kill the stuck worm, report the error, and start a replacement.
     */
    void fetchTimedOut(Worm w, int interval) {
        if (w.dead)
            return;

        w.die();
        sendLinkEvent(w.link, LinkEvent.ERROR,
                      new IOException("Timeout after " + interval + " seconds"));

        synchronized (crawlQueue) {
            crawlQueue.delete(w.link);
            --numPagesLeft;

            worms[w.i] = new Worm(this, w.i);
            worms[w.i].start();

            crawlQueue.notify();
        }
    }

    /**
     * Command-line entry point: deserialize a saved crawler from the file
     * named by args[0], attach an event log, and run it.
     */
    public static void main(String[] args) throws Exception {
        Crawler loadedCrawler;
        java.io.ObjectInputStream in =
            new java.io.ObjectInputStream(new java.io.FileInputStream(args[0]));
        try {
            loadedCrawler = (Crawler) in.readObject();
        } finally {
            // Close the stream even if deserialization fails.
            in.close();
        }

        EventLog.monitor(loadedCrawler).setOnlyNetworkEvents(false);
        loadedCrawler.run();
    }

}

/**
 * Background thread that downloads pages for a Crawler.
 */
class Worm extends Thread {
    Crawler crawler;   // owning crawler
    int i;             // index of this worm in crawler.worms
    Link link;         // link currently being fetched, or null if idle
    boolean dead = false;   // set by die(); checked throughout fetch()

    public Worm(Crawler crawler, int i) {
        super(crawler.getName() + " worm " + i);
        setDaemon(true);
        this.crawler = crawler;
        this.i = i;
    }

    public void run() {
        crawler.fetch(this);
    }

    public void die() {
        dead = true;
        // Thread.stop() is deprecated and unsafe in general, but it is the
        // only way to abort a worm blocked inside network I/O; fetch()
        // rethrows the resulting ThreadDeath.  TODO: revisit with
        // interruptible I/O.
        stop();
    }

}

/**
 * Timer that aborts a single download when its timeout expires.
 */
class WormTimer extends Timer {
    Worm worm;

    public WormTimer(Worm worm) {
        this.worm = worm;
    }

    protected void alarm() {
        worm.crawler.fetchTimedOut(worm, getInterval() / 1000);
    }
}

/**
 * Timer that stops the whole crawl when the crawl timeout expires.
 */
class CrawlTimer extends Timer {
    Crawler crawler;

    public CrawlTimer(Crawler crawler) {
        this.crawler = crawler;
    }

    protected void alarm() {
        crawler.timedOut();
    }
}