/*
 * WebSphinx web-crawling toolkit
 *
 * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

package websphinx;

import rcm.util.PriorityQueue;
import rcm.util.Timer;
import java.util.Vector;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.IOException;
//#ifdef JDK1.1
import java.io.Serializable;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
//#endif JDK1.1

/**
 * Web crawler.
 * <P>
 * To write a crawler, extend this class and override
 * shouldVisit () and visit() to create your own crawler.
 * <P>
 * To use a crawler:
 * <OL>
 * <LI>Initialize the crawler by calling
 * setRoot() (or one of its variants) and setting other
 * crawl parameters.
 * <LI>Register any classifiers you need with addClassifier().
 * <LI>Connect event listeners to monitor the crawler,
 * such as websphinx.EventLog, websphinx.workbench.WebGraph,
 * or websphinx.workbench.Statistics.
 * <LI>Call run() to start the crawler.
 * </OL>
 * A running crawler consists of a priority queue of
 * Links waiting to be visited and a set of threads
 * retrieving pages in parallel. When a page is downloaded,
 * it is processed as follows:
 * <OL>
 * <LI><B>classify()</B>: The page is passed to the classify() method of
 * every registered classifier, in increasing order of
 * their priority values. Classifiers typically attach
 * informative labels to the page and its links, such as "homepage"
 * or "root page".
 * <LI><B>visit()</B>: The page is passed to the crawler's
 * visit() method for user-defined processing.
 * <LI><B>expand()</B>: The page is passed to the crawler's
 * expand() method to be expanded. The default implementation
 * tests every unvisited hyperlink on the page with shouldVisit(),
 * and puts each link approved by shouldVisit() into the crawling queue.
 * </OL>
 * By default, when expanding the links of a page, the crawler
 * only considers hyperlinks (not applets or inline images, for instance) that
 * point to Web pages (not mailto: links, for instance). If you want
 * shouldVisit() to test every link on the page, use setLinkType(Crawler.ALL_LINKS).
 *
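 * <P>
 * For illustration only (this example is not part of the original source and
 * the root URL is a placeholder), a minimal crawler that prints the URL of
 * every page it visits might look like this:
 * <PRE>
 *     public static void main (String[] args) throws Exception {
 *         Crawler crawler = new Crawler () {
 *             public void visit (Page page) {
 *                 System.out.println (page.getURL ());
 *             }
 *         };
 *         crawler.setRoot (new Link ("http://www.example.com/"));
 *         crawler.setDomain (Crawler.SERVER);
 *         crawler.setMaxDepth (2);
 *         EventLog.monitor (crawler);   // attach a logging listener
 *         crawler.run ();
 *     }
 * </PRE>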
 */

public class Crawler implements Runnable
//#ifdef JDK1.1
    , Serializable
//#endif JDK1.1
{
    //#ifdef JDK1.1
    private static final long serialVersionUID = -3757789861952010450L;
    //#endif JDK1.1

    /**
     * Specify WEB as the crawl domain to allow the crawler
     * to visit any page on the World Wide Web.
     */
    public static final String[] WEB = null;

    /**
     * Specify SERVER as the crawl domain to limit the crawler
     * to visit only pages on the same Web server (hostname
     * and port number) as the root link from which it started.
     */
    public static final String[] SERVER = {"local"};

    /**
     * Specify SUBTREE as the crawl domain to limit the crawler
     * to visit only pages which are descendants of the root link
     * from which it started.
     */
    public static final String[] SUBTREE = {"sibling", "descendent"};

    /**
     * Specify HYPERLINKS as the link type to allow the crawler
     * to visit only hyperlinks (A, AREA, and FRAME tags which
     * point to http:, ftp:, file:, or gopher: URLs).
     */
    public static final String[] HYPERLINKS = {"hyperlink"};

    /**
     * Specify HYPERLINKS_AND_IMAGES as the link type to allow the crawler
     * to visit only hyperlinks and inline images.
     */
    public static final String[] HYPERLINKS_AND_IMAGES = {"hyperlink", "image"};

    /**
     * Specify ALL_LINKS as the link type to allow the crawler
     * to visit any kind of link
     */
    public static final String[] ALL_LINKS = null;

    // Crawler parameters
    private String name = getClass().getName(); // crawler's name
    private transient Link[] roots = null;
    private String[] rootHrefs = null; // exists only when serializing crawler
    private String[] domain = WEB;
    private boolean synchronous = false;
    private boolean depthFirst = true;
    private String[] type = HYPERLINKS;
    private boolean ignoreVisitedLinks = true;
    private int maxDepth = 5;
    private DownloadParameters dp = new DownloadParameters ()
                                  .changeUserAgent (name);
    private Vector classifiers = new Vector ();
    private LinkPredicate linkPredicate;
    private PagePredicate pagePredicate;
    private Action action;

    // Transient state

    private transient Link[] crawledRoots = null;

    private transient int state = CrawlEvent.CLEARED;

    private transient Worm[] worms;
        // background threads

    private transient PriorityQueue fetchQueue;
        // links waiting to be downloaded

    private transient PriorityQueue crawlQueue;
        // all links that have been expanded but not
        // processed (used only if crawler is in synchronous mode)

    private transient int numLinksTested;
        // number of links tested by shouldVisit()

    private transient int numPagesVisited;
        // number of pages passed to visit()

    private transient int numPagesLeft;
        // all links that have been expanded but not processed
        // == crawlQueue.size ()

    // FIX: convert to immutable linked lists
    private transient Vector crawlListeners;
        // list of CrawlListeners

    private transient Vector linkListeners;
        // list of LinkListeners

    private transient Hashtable visitedPages;
        // visited pages (a set of URLs)

    private transient RobotExclusion robotExclusion;
        // robot exclusion cache

    /**
     * Make a new Crawler.
     */
    public Crawler () {
        addClassifier (new StandardClassifier());
        init ();
    }

    /*
     * Initialize the transient fields of the crawler.
     */
    private void init () {
        state = CrawlEvent.CLEARED;

        numLinksTested = 0;
        numPagesVisited = 0;
        numPagesLeft = 0;

        worms = null;
        crawlQueue = new PriorityQueue();
        fetchQueue = new PriorityQueue();

        crawlListeners = new Vector ();
        linkListeners = new Vector ();

        visitedPages = new Hashtable ();
        robotExclusion = new RobotExclusion (getName ());
    }

    /*
     * Write a Crawler to an output stream.
     */
//#ifdef JDK1.1
    private void writeObject (ObjectOutputStream out)
            throws IOException {
        if (roots != null) {
            rootHrefs = new String[roots.length];
            for (int i=0; i<roots.length; ++i)
                rootHrefs[i] = roots[i].getURL().toString();
        }
        else
            rootHrefs = null;

        out.defaultWriteObject ();

        rootHrefs = null;
    }
//#endif JDK1.1

    /*
     * Read a Crawler from an input stream.
     */
//#ifdef JDK1.1
    private void readObject (ObjectInputStream in)
           throws IOException, ClassNotFoundException {
        in.defaultReadObject ();

        if (rootHrefs != null) {
            roots = new Link [rootHrefs.length];
            for (int i=0; i<rootHrefs.length; ++i)
                roots[i] = new Link (rootHrefs[i]);
        }
        else
            roots = null;

        domain = useStandard (WEB, domain);
        domain = useStandard (SERVER, domain);
        domain = useStandard (SUBTREE, domain);

        type = useStandard (HYPERLINKS, type);
        type = useStandard (HYPERLINKS_AND_IMAGES, type);
        type = useStandard (ALL_LINKS, type);

        init ();

        if (linkPredicate != null)
            linkPredicate.connected (this);
        if (pagePredicate != null)
            pagePredicate.connected (this);
        if (action != null)
            action.connected (this);
    }

    private static String[] useStandard (String[] standard, String[] s) {
        if (s == null || standard == null || standard == s)
            return s;
        if (s.length != standard.length)
            return s;
        for (int i=0; i<s.length; ++i)
            if (!s[i].equals (standard[i]))
                return s;
        return standard;
    }
//#endif JDK1.1


    /**
     * Start crawling. Returns either when the crawl is done, or
     * when pause() or stop() is called. Because this method implements the
     * java.lang.Runnable interface, a crawler can be run in a
     * background thread.
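     * <P>
     * For example (an illustrative sketch, not part of the original source),
     * to run a crawler in the background:
     * <PRE>
     *     Thread thread = new Thread (crawler, crawler.getName ());
     *     thread.start ();    // run () executes in the new thread
     * </PRE>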
     */

    public void run () {
        crawledRoots = roots;

        if (state == CrawlEvent.STOPPED)
            clear ();

        if (state == CrawlEvent.CLEARED && crawledRoots != null) {
            // give each root a default priority based on its position in the array
            float priority = 0;
            float increment = 1.0f/crawledRoots.length;
            for (int i=0; i<crawledRoots.length; ++i) {
                crawledRoots[i].setPriority (priority);
                priority += increment;
            }
            submit (crawledRoots);
        }

        state = CrawlEvent.STARTED;
        sendCrawlEvent (state);

        synchronized (crawlQueue) {
            Timer timer = new CrawlTimer (this);
            int timeout = dp.getCrawlTimeout();
            if (timeout > 0)
                timer.set (timeout*1000, false);

            int nWorms = Math.max (dp.getMaxThreads (), 1);
            worms = new Worm[nWorms];
            for (int i=0; i<nWorms; ++i) {
                worms[i] = new Worm (this, i);
                worms[i].start ();
            }

            try {
                while (state == CrawlEvent.STARTED) {
                    if (numPagesLeft == 0) {
                        // ran out of links to crawl
                        state = CrawlEvent.STOPPED;
                        sendCrawlEvent (state);
                    }
                    else if (synchronous) {
                        // Synchronous mode.
                        // Main thread calls process() on each link
                        // in crawlQueue, in priority order.
                        Link link = (Link)crawlQueue.getMin ();
                        if (link.getStatus () == LinkEvent.DOWNLOADED)
                            process (link);
                        else
                            crawlQueue.wait ();
                    }
                    else
                        // Asynchronous crawling.
                        // Main thread does nothing but wait, while
                        // background threads call process().
                        crawlQueue.wait ();
                }
            } catch (InterruptedException e) {}

            timer.cancel ();

            for (int i=0; i<worms.length; ++i)
                worms[i].die ();
            if (state == CrawlEvent.PAUSED) {
                // put partly-processed links back in fetchQueue
                synchronized (fetchQueue) {
                    for (int i=0; i<worms.length; ++i)
                        if (worms[i].link != null)
                            fetchQueue.put (worms[i].link);
                }
            }
            worms = null;
        }
    }

    /**
     * Initialize the crawler for a fresh crawl. Clears the crawling queue
     * and sets all crawling statistics to 0. Stops the crawler
     * if it is currently running.
     */
    public void clear () {
        stop ();
        numPagesVisited = 0;
        numLinksTested = 0;
        clearVisited ();
        if (crawledRoots != null)
            for (int i=0; i < crawledRoots.length; ++i)
                crawledRoots[i].disconnect ();
        crawledRoots = null;
        state = CrawlEvent.CLEARED;
        sendCrawlEvent (state);
    }

    /**
     * Pause the crawl in progress. If the crawler is running, then
     * it finishes processing the current page, then returns. The queues remain as-is,
     * so calling run() again will resume the crawl exactly where it left off.
     * pause() can be called from any thread.
     */
    public void pause () {
        if (state == CrawlEvent.STARTED) {
            synchronized (crawlQueue) {
                state = CrawlEvent.PAUSED;
                crawlQueue.notify ();
            }
            sendCrawlEvent (state);
        }
    }

    /**
     * Stop the crawl in progress. If the crawler is running, then
     * it finishes processing the current page, then returns.
     * Empties the crawling queue.
     */
    public void stop () {
        if (state == CrawlEvent.STARTED || state == CrawlEvent.PAUSED) {
            synchronized (crawlQueue) {
                synchronized (fetchQueue) {
                    state = CrawlEvent.STOPPED;
                    fetchQueue.clear ();
                    crawlQueue.clear ();
                    numPagesLeft = 0;
                    crawlQueue.notify ();
                }
            }
            sendCrawlEvent (state);
        }
    }

    /*
     * Timeout the crawl in progress. Used internally by
     * the CrawlTimer.
     */
    void timedOut () {
        if (state == CrawlEvent.STARTED) {
            synchronized (crawlQueue) {
                synchronized (fetchQueue) {
                    state = CrawlEvent.TIMED_OUT;
                    fetchQueue.clear ();
                    crawlQueue.clear ();
                    numPagesLeft = 0;
                    crawlQueue.notify ();
                }
            }
            sendCrawlEvent (state);
        }
    }

    /**
     * Get state of crawler.
     * @return one of CrawlEvent.STARTED, CrawlEvent.PAUSED, CrawlEvent.STOPPED,
     * or CrawlEvent.CLEARED.
     */
    public int getState () {
        return state;
    }

    /**
     * Callback for visiting a page. Default version does nothing.
     *
     * @param page Page retrieved by the crawler
     */
    public void visit (Page page) {
    }

    /**
     * Callback for testing whether a link should be traversed.
     * Default version returns true for all links. Override this method
     * for more interesting behavior.
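     * <P>
     * For example, a subclass might restrict the crawl to URLs containing a
     * particular substring (an illustrative sketch, not part of the original
     * source; the substring is arbitrary):
     * <PRE>
     *     public boolean shouldVisit (Link l) {
     *         return l.getURL ().toString ().indexOf ("sphinx") != -1;
     *     }
     * </PRE>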
     *
     * @param l Link encountered by the crawler
     * @return true if link should be followed, false if it should be ignored.
     */

    public boolean shouldVisit (Link l) {
        return true;
    }

    /**
     * Expand the crawl from a page. The default implementation of this
     * method tests every link on the page using shouldVisit (), and
     * submit()s the links that are approved. A subclass may want to override
     * this method if it's inconvenient to consider the links individually
     * with shouldVisit().
     * @param page Page to expand
     */
    public void expand (Page page) {
        // examine each link on the page
        Link[] links = page.getLinks();

        if (links != null && links.length > 0) {
            // give each link a default priority based on its page
            // and position on page
            float priority = (depthFirst ? -numPagesVisited : numPagesVisited);
            float increment = 1.0f/links.length;

            for (int i=0; i<links.length; ++i) {
                Link l = links[i];

                // set default download parameters
                l.setPriority (priority);
                priority += increment;
                l.setDownloadParameters (dp);

                ++numLinksTested;
                if (ignoreVisitedLinks && visited (l))
                    // FIX: use atomic test-and-set
                    // FIX: set l.page somehow?
                    sendLinkEvent (l, LinkEvent.ALREADY_VISITED);
                else if (!((type == null || l.hasAnyLabels (type))
                           && (domain == null || l.hasAnyLabels (domain))
                           && (linkPredicate == null || linkPredicate.shouldVisit (l))
                           && shouldVisit (l)))
                    sendLinkEvent (l, LinkEvent.SKIPPED);
                else if (page.getDepth() >= maxDepth)
                    sendLinkEvent (l, LinkEvent.TOO_DEEP);
                else
                    submit (l);
            }
        }
    }

    /*
     * Crawl statistics
     */

    /**
     * Get number of pages visited.
     * @return number of pages passed to visit() so far in this crawl
     */
    public int getPagesVisited() {
        return numPagesVisited;
    }

    /**
     * Get number of links tested.
     * @return number of links passed to shouldVisit() so far in this crawl
     */
    public int getLinksTested() {
        return numLinksTested;
    }

    /**
     * Get number of pages left to be visited.
     * @return number of links approved by shouldVisit() but not yet visited
     */
    public int getPagesLeft() {
        return numPagesLeft;
    }

    /**
     * Get number of threads currently working.
     * @return number of threads downloading pages
     */
    public int getActiveThreads () {
        Worm[] w = worms;

        if (w == null)
            return 0;

        int n = 0;
        for (int i=0; i<w.length; ++i)
            if (w[i] != null && w[i].link != null)
                ++n;
        return n;
    }

    /*
     * Crawler parameters
     */

    /**
     * Get human-readable name of crawler. Default value is the
     * class name, e.g., "Crawler". Useful for identifying the crawler in a
     * user interface; also used as the default User-agent for identifying
     * the crawler to a remote Web server. (The User-agent can be
     * changed independently of the crawler name with setDownloadParameters().)
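     * <P>
     * For instance, the User-agent could be changed without renaming the
     * crawler roughly like this (an illustrative sketch, not part of the
     * original source; the User-agent string is a placeholder):
     * <PRE>
     *     crawler.setDownloadParameters (
     *         crawler.getDownloadParameters ().changeUserAgent ("MyUserAgent/1.0"));
     * </PRE>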
     * @return human-readable name of crawler
     */

    public String getName () {
        return name;
    }

    /**
     * Set human-readable name of crawler.
     * @param name new name for crawler
     */
    public void setName (String name) {
        this.name = name;
    }

    /**
     * Convert the crawler to a String.
     * @return human-readable name of crawler.
     */
    public String toString () {
        return getName ();
    }

    /**
     * Get starting points of crawl as an array of Link objects.
     * @return array of Links from which crawler will start its next crawl.
     */
    public Link[] getRoots () {
        if (roots == null)
            return new Link[0];

        Link[] result = new Link[roots.length];
        System.arraycopy (roots, 0, result, 0, roots.length);
        return result;
    }

    /**
     * Get roots of last crawl. May differ from getRoots()
     * if new roots have been set.
     * @return array of Links from which crawler started its last crawl,
     * or null if the crawler was cleared.
     */
    public Link[] getCrawledRoots () {
        if (crawledRoots == null)
            return null;

        Link[] result = new Link[crawledRoots.length];
        System.arraycopy (crawledRoots, 0, result, 0, crawledRoots.length);
        return result;
    }

    /**
     * Get starting points of crawl as a String of newline-delimited URLs.
     * @return URLs where crawler will start, separated by newlines.
     */
    public String getRootHrefs () {
        StringBuffer buf = new StringBuffer ();
        if (roots != null) {
            for (int i=0; i<roots.length; ++i) {
                if (buf.length() > 0)
                    buf.append ('\n');
                buf.append (roots[i].getURL().toExternalForm());
            }
        }
        return buf.toString ();
    }

    /**
     * Set starting points of crawl as a string of whitespace-delimited URLs.
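     * <P>
     * A quick illustrative sketch (added here, not part of the original
     * source; the URLs are placeholders):
     * <PRE>
     *     crawler.setRootHrefs ("http://www.example.com/ http://www.example.org/");
     * </PRE>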
     * @param hrefs URLs of starting points, separated by space, \t, or \n
     * @exception java.net.MalformedURLException if any of the URLs is invalid,
     * leaving starting points unchanged
     */

    public void setRootHrefs (String hrefs) throws MalformedURLException {
        Vector v = new Vector ();
        StringTokenizer tok = new StringTokenizer (hrefs);
        while (tok.hasMoreElements ())
            v.addElement (new Link (tok.nextToken()));
        roots = new Link[v.size()];
        v.copyInto (roots);
    }

    /**
     * Set starting point of crawl as a single Link.
     * @param link starting point
     */
    public void setRoot (Link link) {
        roots = new Link[1];
        roots[0] = link;
    }

    /**
     * Set starting points of crawl as an array of Links.
     * @param links starting points
     */
    public void setRoots (Link[] links) {
        roots = new Link[links.length];
        System.arraycopy (links, 0, roots, 0, links.length);
    }

    /**
     * Add a root to the existing set of roots.
     * @param link starting point to add
     */
    public void addRoot (Link link) {
        if (roots == null)
            setRoot (link);
        else {
            Link newroots[] = new Link[roots.length+1];
            System.arraycopy (roots, 0, newroots, 0, roots.length);
            newroots[newroots.length-1] = link;
            roots = newroots;
        }
    }

    /**
     * Get crawl domain. Default value is WEB.
     * @return WEB, SERVER, or SUBTREE.
     */
    public String[] getDomain () {
        return domain;
    }

    /**
     * Set crawl domain.
     * @param domain one of WEB, SERVER, or SUBTREE.
     */
    public void setDomain (String[] domain) {
        this.domain = domain;
    }

    /**
     * Get legal link types to crawl. Default value is HYPERLINKS.
     * @return HYPERLINKS, HYPERLINKS_AND_IMAGES, or ALL_LINKS.
     */
    public String[] getLinkType () {
        return type;
    }

    /**
     * Set legal link types to crawl.
     * @param type one of HYPERLINKS, HYPERLINKS_AND_IMAGES, or ALL_LINKS.
     */
    public void setLinkType (String[] type) {
        this.type = type;
    }

    /**
     * Get depth-first search flag. Default value is true.
     * @return true if search is depth-first, false if search is breadth-first.
     */
    public boolean getDepthFirst() {
        return depthFirst;
    }

    /**
     * Set depth-first search flag. If neither depth-first nor breadth-first
     * is desired, then override shouldVisit() to set a custom priority on
     * each link.
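     * <P>
     * For example, an override along these lines (an illustrative sketch, not
     * part of the original source) gives shorter URLs smaller priority values,
     * so they are pulled from the fetch queue first:
     * <PRE>
     *     public boolean shouldVisit (Link l) {
     *         l.setPriority (l.getURL ().toString ().length ());
     *         return true;
     *     }
     * </PRE>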
     * @param useDFS true if search should be depth-first, false if search should be breadth-first.
     */

    public void setDepthFirst(boolean useDFS) {
        depthFirst = useDFS;
    }

    /**
     * Get synchronous flag. Default value is false.
     * @return true if crawler must visit the pages in priority order; false if crawler can visit
     * pages in any order.
     */
    public boolean getSynchronous() {
        return synchronous;
    }

    /**
     * Set synchronous flag.
     * @param f true if crawler must visit the pages in priority order; false if crawler can visit
     * pages in any order.
     */
    public void setSynchronous(boolean f) {
        synchronous = f;
    }

    /**
     * Get ignore-visited-links flag. Default value is true.
     * @return true if search skips links whose URLs have already been visited
     * (or queued for visiting).
     */
    public boolean getIgnoreVisitedLinks() {
        return ignoreVisitedLinks;
    }

    /**
     * Set ignore-visited-links flag.
     * @param f true if search skips links whose URLs have already been visited
     * (or queued for visiting).
     */
    public void setIgnoreVisitedLinks(boolean f) {
        ignoreVisitedLinks = f;
    }

    /**
     * Get maximum depth. Default value is 5.
     * @return maximum depth of crawl, in hops from starting point.
     */
    public int getMaxDepth() {
        return maxDepth;
    }

    /**
     * Set maximum depth.
     * @param maxDepth maximum depth of crawl, in hops from starting point
     */
    public void setMaxDepth(int maxDepth) {
        this.maxDepth = maxDepth;
    }
    /**
     * Get download parameters (such as number of threads, timeouts, maximum
     * page size, etc.)
     * @return current download parameters
     */
    public DownloadParameters getDownloadParameters() {
        return dp;
    }

    /**
     * Set download parameters (such as number of threads, timeouts, maximum
     * page size, etc.)
     * @param dp Download parameters
     */
    public void setDownloadParameters(DownloadParameters dp) {
        this.dp = dp;
    }

    /**
     * Set link predicate. This is an alternative way to
     * specify the links to walk. If the link predicate is
     * non-null, then only links that satisfy
     * the link predicate AND shouldVisit() are crawled.
     * @param pred Link predicate
     */
    public void setLinkPredicate (LinkPredicate pred) {
        if (pred == linkPredicate
            || (pred != null && pred.equals (linkPredicate)))
            return;
        if (linkPredicate != null)
            linkPredicate.disconnected (this);
        linkPredicate = pred;
        if (linkPredicate != null)
            linkPredicate.connected (this);
    }

    /**
     * Get link predicate.
     * @return current link predicate
     */
    public LinkPredicate getLinkPredicate () {
        return linkPredicate;
    }

    /**
     * Set page predicate. This is a way to filter the pages
     * passed to visit(). If the page predicate is
     * non-null, then only pages that satisfy it are passed to visit().
     * @param pred Page predicate
     */
    public void setPagePredicate (PagePredicate pred) {
        if (pred == pagePredicate
            || (pred != null && pred.equals (pagePredicate)))
            return;
        if (pagePredicate != null)
            pagePredicate.disconnected (this);
        pagePredicate = pred;
        if (pagePredicate != null)
            pagePredicate.connected (this);
    }

    /**
     * Get page predicate.
     * @return current page predicate
     */
    public PagePredicate getPagePredicate () {
        return pagePredicate;
    }

    /**
     * Set the action. This is an alternative way to specify
     * an action performed on every page. If act is non-null,
     * then every page passed to visit() is also passed to this
     * action.
     * @param act Action
     */
    public void setAction (Action act) {
        if (act == action
            || (act != null && act.equals (action)))
            return;
        if (action != null)
            action.disconnected (this);
        action = act;
        if (action != null)
            action.connected (this);
    }

    /**
     * Get action.
     * @return current action
     */
    public Action getAction () {
        return action;
    }

    /*
     * Link queue management
     *
     */

    /**
     * Puts a link into the crawling queue. If the crawler is running, the
     * link will eventually be retrieved and passed to visit().
     * @param link Link to put in queue
     */
    public void submit (Link link) {
        markVisited (link); // FIX: need atomic test-and-set of visited flag
        sendLinkEvent (link, LinkEvent.QUEUED);
        synchronized (crawlQueue) {
            synchronized (fetchQueue) {
                crawlQueue.put (link);
                ++numPagesLeft;
                fetchQueue.put (link);
                fetchQueue.notifyAll (); // wake up worms
            }
        }
    }

    /**
     * Submit an array of Links for crawling. If the crawler is running,
     * these links will eventually be retrieved and passed to visit().
     * @param links Links to put in queue
     */
    public void submit (Link[] links) {
        for (int i=0; i<links.length; ++i)
            submit (links[i]);
    }

    /**
     * Enumerate crawling queue.
     * @return an enumeration of Link objects which are waiting to be visited.
     */
    // FIX: enumerate in priority order
    public Enumeration enumerateQueue () {
        return crawlQueue.elements ();
    }

    /*
     * Classifiers
     *
     */

    /**
     * Adds a classifier to this crawler. If the
     * classifier is already found in the set, does nothing.
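     * <P>
     * For example (an illustrative sketch, not part of the original source;
     * the label name is made up, and Classifier is assumed to declare only
     * classify() and getPriority()):
     * <PRE>
     *     crawler.addClassifier (new Classifier () {
     *         public float getPriority () { return 0.5F; }
     *         public void classify (Page page) {
     *             page.setLabel ("my-label");
     *         }
     *     });
     * </PRE>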
     * @param c a classifier
     */

    public void addClassifier (Classifier c) {
        if (!classifiers.contains (c)) {
            float cpriority = c.getPriority ();

            for (int i=0; i<classifiers.size(); ++i) {
                Classifier d = (Classifier)classifiers.elementAt (i);
                if (cpriority < d.getPriority ()) {
                    classifiers.insertElementAt (c, i);
                    return;
                }
            }
            classifiers.addElement (c);
        }
    }

    /**
     * Removes a classifier from the set of classifiers.
     * If c is not found in the set, does nothing.
     *
     * @param c a classifier
     */
    public void removeClassifier (Classifier c) {
        classifiers.removeElement (c);
    }

    /**
     * Clears the set of classifiers.
     */
    public void removeAllClassifiers () {
        classifiers.removeAllElements ();
    }

    /**
     * Enumerates the set of classifiers.
     *
     * @return An enumeration of the classifiers.
     */
    public Enumeration enumerateClassifiers () {
        return classifiers.elements();
    }

    /**
     * Get the set of classifiers
     *
     * @return An array containing the registered classifiers.
     */
    public Classifier[] getClassifiers () {
        Classifier[] c = new Classifier[classifiers.size()];
        classifiers.copyInto (c);
        return c;
    }

    /*
     * Event listeners
     *
     */

    /**
     * Adds a listener to the set of CrawlListeners for this crawler.
     * If the listener is already found in the set, does nothing.
     *
     * @param listen a listener
     */
    public void addCrawlListener (CrawlListener listen) {
        if (!crawlListeners.contains (listen))
            crawlListeners.addElement (listen);
    }

    /**
     * Removes a listener from the set of CrawlListeners. If it is not found in the set,
     * does nothing.
     *
     * @param listen a listener
     */
    public void removeCrawlListener (CrawlListener listen) {
        crawlListeners.removeElement (listen);
    }

    /**
     * Adds a listener to the set of LinkListeners for this crawler.
     * If the listener is already found in the set, does nothing.
     *
     * @param listen a listener
     */
    public void addLinkListener (LinkListener listen) {
        if (!linkListeners.contains (listen))
            linkListeners.addElement (listen);
    }

    /**
     * Removes a listener from the set of LinkListeners. If it is not found in the set,
     * does nothing.
     *
     * @param listen a listener
     */
    public void removeLinkListener (LinkListener listen) {
        linkListeners.removeElement (listen);
    }

    /**
     * Send a CrawlEvent to all CrawlListeners registered with this crawler.
     * @param id Event id
     */
    protected void sendCrawlEvent (int id) {
        CrawlEvent evt = new CrawlEvent (this, id);
        for (int j=0, len=crawlListeners.size(); j<len; ++j) {
            CrawlListener listen = (CrawlListener)crawlListeners.elementAt(j);
            switch (id) {
              case CrawlEvent.STARTED:
                listen.started (evt);
                break;
              case CrawlEvent.STOPPED:
                listen.stopped (evt);
                break;
              case CrawlEvent.CLEARED:
                listen.cleared (evt);
                break;
              case CrawlEvent.TIMED_OUT:
                listen.timedOut (evt);
                break;
              case CrawlEvent.PAUSED:
                listen.paused (evt);
                break;
            }
        }
    }

    /**
     * Send a LinkEvent to all LinkListeners registered with this crawler.
     * @param l Link related to event
     * @param id Event id
     */
    protected void sendLinkEvent (Link l, int id) {
        LinkEvent evt = new LinkEvent (this, id, l);
        l.setStatus (id);
        for (int j=0, len=linkListeners.size(); j<len; ++j) {
            LinkListener listen = (LinkListener)linkListeners.elementAt(j);
            listen.crawled (evt);
        }
    }

    /**
     * Send an exceptional LinkEvent to all LinkListeners registered with this crawler.
     * @param l Link related to event
     * @param id Event id
     * @param exception Exception associated with event
     */
    protected void sendLinkEvent (Link l, int id, Throwable exception) {
        LinkEvent evt = new LinkEvent (this, id, l, exception);
        l.setStatus (id);
        l.setLabel ("exception", exception.toString ());
        for (int j=0, len=linkListeners.size(); j<len; ++j) {
            LinkListener listen = (LinkListener)linkListeners.elementAt(j);
            listen.crawled (evt);
        }
    }

    /*
     * Visited pages table
     *
     */

    /**
     * Test whether the page corresponding to a link has been visited
     * (or queued for visiting).
     * @param link Link to test
     * @return true if link has been passed to markVisited() during this crawl
     */
    public boolean visited (Link link) {
        return visitedPages.containsKey (link.getPageURL().toString());
    }

    /**
     * Register that a link has been visited.
     * @param link Link that has been visited
     */
    protected void markVisited (Link link) {
        visitedPages.put (link.getPageURL().toString(), this);
    }

    /**
     * Clear the set of visited links.
     */
    protected void clearVisited () {
        visitedPages.clear ();
    }

    /*
     * Fetch loop
     *
     */

    void fetch (Worm w) {
        Timer timer = new WormTimer (w);

        while (!w.dead) {
            //System.err.println (w + ": fetching a link");

            // pull the highest-priority link from the fetch queue
            synchronized (fetchQueue) {
                while (!w.dead
                       && (w.link = (Link)fetchQueue.deleteMin ()) == null) {
                    try {
                        fetchQueue.wait ();
                    } catch (InterruptedException e) {}
                }
            }

            if (w.dead)
                return;

            //System.err.println (w + ": processing " + w.link.toDescription());

            try {
                // download the link to get a page
                DownloadParameters dp;
                Page page;

                dp = w.link.getDownloadParameters();
                if (dp == null)
                    dp = this.dp;
                int timeout = dp.getDownloadTimeout();

                sendLinkEvent (w.link, LinkEvent.RETRIEVING);
                try {

                    if (timeout > 0)
                        timer.set (timeout*1000, false);

                    if (dp.getObeyRobotExclusion()
                        && robotExclusion.disallowed (w.link.getURL()))
                        throw new IOException ("disallowed by Robot Exclusion Standard (robots.txt)");

                    page = new Page (w.link, dp);

                } finally {
                    timer.cancel ();
                }

                if (w.dead)
                    return;

                sendLinkEvent (w.link, LinkEvent.DOWNLOADED);

                if (synchronous) {
                    // Synchronous mode.
                    // Main thread will call process() when
                    // this link's turn arrives (in priority order).
                    // Wake up the main thread.
                    synchronized (crawlQueue) {
                        crawlQueue.notify ();
                    }
                }
                else {
                    // Asynchronous mode.
                    // Each worm calls process() on its link.
                    process (w.link);
                }

                w.link = null;

                // loop around and fetch another link

            } catch (ThreadDeath e) {
                throw e; // have to continue dying
            } catch (Throwable e) {
                // Some other exception occurred, either during the page fetch
                // or in some user code. Mark up the link with the error.
                if (w.dead)
                    return;

                sendLinkEvent (w.link, LinkEvent.ERROR, e);
                synchronized (crawlQueue) {
                    crawlQueue.delete (w.link);
                    --numPagesLeft;
                    w.link = null;
                    crawlQueue.notify ();
                }
            }
        }
    }

    void process (Link link) {
        Page page = link.getPage ();

        // classify the page
        for (int j=0, len=classifiers.size(); j<len; ++j) {
            Classifier cl = (Classifier)classifiers.elementAt(j);
            cl.classify (page);
        }

        // invoke callbacks on the page
        ++numPagesVisited;
        if (pagePredicate == null || pagePredicate.shouldActOn (page)) {
            if (action != null)
                action.visit (page);
            visit (page);
        }
        expand (page);

        // send out the event
        sendLinkEvent (link, LinkEvent.VISITED);

        // discard link
        synchronized (crawlQueue) {
            crawlQueue.delete (link);
            --numPagesLeft;
            crawlQueue.notify ();
        }
    }

    void fetchTimedOut (Worm w, int interval) {
        if (w.dead)
            return;

        w.die ();
        sendLinkEvent (w.link, LinkEvent.ERROR,
                       new IOException ("Timeout after " + interval + " seconds"));

        synchronized (crawlQueue) {
            crawlQueue.delete (w.link);
            --numPagesLeft;

            worms[w.i] = new Worm (this, w.i);
            worms[w.i].start ();

            crawlQueue.notify ();
        }
    }

//#ifdef JDK1.1
    // FIX: more error checking here
    public static void main (String[] args) throws Exception {
        java.io.ObjectInputStream in =
            new java.io.ObjectInputStream (new java.io.FileInputStream (args[0]));
        Crawler loadedCrawler = (Crawler)in.readObject ();
        in.close ();

        EventLog.monitor (loadedCrawler).setOnlyNetworkEvents (false);
        loadedCrawler.run ();
    }
//#endif JDK1.1

}

/* Simple Thread subclass that invokes a crawler's fetch loop. */
class Worm extends Thread {
    Crawler crawler;      // crawler in charge of this worm
    int i;                // index of this worm in crawler.worms[]
    Link link;            // link this worm is currently working on
    boolean dead = false; // true if this worm has been killed

    public Worm (Crawler crawler, int i) {
        super (crawler.getName() + " worm " + i);
        setDaemon (true);
        this.crawler = crawler;
        this.i = i;
    }

    public void run () {
        crawler.fetch (this);
    }

    public void die () {
        dead = true;
        stop ();
    }

}

class WormTimer extends Timer {
    Worm worm;

    public WormTimer (Worm worm) {
        this.worm = worm;
    }

    protected void alarm () {
        worm.crawler.fetchTimedOut (worm, getInterval()/1000);
    }
}

class CrawlTimer extends Timer {
    Crawler crawler;

    public CrawlTimer (Crawler crawler) {
        this.crawler = crawler;
    }

    protected void alarm () {
        crawler.timedOut ();
    }
}