/*
 * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
 *
 * Copyright (c) 2001 Brian Pitcher
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

// $Header: /cvsroot/weblech/weblech/src/weblech/spider/Spider.java,v 1.8 2002/06/09 11:34:38 weblech Exp $

package weblech.spider;

import weblech.util.Logger;
import weblech.util.Log4j;

import java.util.*;
import java.io.*;
import java.net.URL;

import org.apache.log4j.Category;

public class Spider extends Logger implements Runnable, Constants
{
    /** Config for the spider */
    private SpiderConfig config;

    /**
     * Download queue.
     * Thread safety: To access the queue, first synchronize on it.
     */
    private DownloadQueue queue;

    /**
     * Set of URLs downloaded or scheduled, so we don't download a
     * URL more than once.
     * Thread safety: To access the set, first synchronize on it.
     */
    private Set urlsDownloadedOrScheduled;

    /**
     * Set of URLs currently being downloaded by Spider threads.
     * Thread safety: To access the set, first synchronize on it.
     */
    private Set urlsDownloading;

    /**
     * Number of downloads currently taking place.
     * Thread safety: To modify this value, first synchronize on
     * the download queue.
     */
    private int downloadsInProgress;

    /** Whether the spider should quit */
    private boolean quit;

    /** Count of running Spider threads. */
    private int running;

    /** Time we last checkpointed. */
    private long lastCheckpoint;

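    /**
     * Create a spider for the given configuration, seeding the download
     * queue with the configured start location.
     */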
    public Spider(SpiderConfig config)
    {
        this.config = config;
        queue = new DownloadQueue(config);
        queue.queueURL(new URLToDownload(config.getStartLocation(), 0));
        urlsDownloadedOrScheduled = new HashSet();
        urlsDownloading = new HashSet();
        downloadsInProgress = 0;
        lastCheckpoint = 0;
    }

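    /** Start the configured number of spider threads. */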
    public void start()
    {
        quit = false;
        running = 0;

        for(int i = 0; i < config.getSpiderThreads(); i++)
        {
            _logClass.info("Starting Spider thread");
            Thread t = new Thread(this, "Spider-Thread-" + (i + 1));
            t.start();
            running++;
        }
    }

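    /**
     * Signal the spider threads to stop; each thread finishes its current
     * loop iteration and then exits.
     */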
    public void stop()
    {
        quit = true;
    }

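    /** @return true while any spider threads are still running. */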
    public boolean isRunning()
    {
        return running > 0;
    }

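    /**
     * Write a checkpoint if the configured checkpoint interval has elapsed.
     * The interval test is repeated while holding the queue lock so that
     * only one thread actually writes the checkpoint.
     */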
    private void checkpointIfNeeded()
    {
        if(config.getCheckpointInterval() == 0)
        {
            return;
        }

        if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval())
        {
            synchronized(queue)
            {
                if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval())
                {
                    writeCheckpoint();
                    lastCheckpoint = System.currentTimeMillis();
                }
            }
        }
    }

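    /**
     * Serialize the download queue and the set of in-progress URLs to the
     * file "spider.checkpoint". Called with the queue lock held (see
     * checkpointIfNeeded()).
     */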
    private void writeCheckpoint()
    {
        _logClass.debug("writeCheckpoint()");
        try
        {
            FileOutputStream fos = new FileOutputStream("spider.checkpoint", false);
            ObjectOutputStream oos = new ObjectOutputStream(fos);
            oos.writeObject(queue);
            oos.writeObject(urlsDownloading);
            oos.close();
        }
        catch(IOException ioe)
        {
            _logClass.warn("IO Exception attempting checkpoint: " + ioe.getMessage(), ioe);
        }
    }

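    /**
     * Restore spider state from the "spider.checkpoint" file written by
     * writeCheckpoint(). URLs that were mid-download when the checkpoint
     * was taken are put back on the queue.
     */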
    public void readCheckpoint()
    {
        try
        {
            FileInputStream fis = new FileInputStream("spider.checkpoint");
            ObjectInputStream ois = new ObjectInputStream(fis);
            queue = (DownloadQueue) ois.readObject();
            urlsDownloading = (Set) ois.readObject();
            queue.queueURLs(urlsDownloading);
            urlsDownloading.clear();
        }
        catch(Exception e)
        {
            _logClass.error("Caught exception reading checkpoint: " + e.getMessage(), e);
        }
    }

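    /**
     * Worker loop executed by each spider thread: take the next URL from
     * the queue, download it, parse it for links, and queue any new URLs
     * that have not been seen and are within the configured depth limit.
     * The loop ends when the queue is empty and no downloads are in
     * progress, or when stop() has been called.
     */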
    public void run()
    {
        HTMLParser htmlParser = new HTMLParser(config);
        URLGetter urlGetter = new URLGetter(config);

        while((queueSize() > 0 || downloadsInProgress > 0) && !quit)
        {
            checkpointIfNeeded();
            if(queueSize() == 0 && downloadsInProgress > 0)
            {
                // Wait for a download to finish before seeing if this thread should stop
                try
                {
                    Thread.sleep(QUEUE_CHECK_INTERVAL);
                }
                catch(InterruptedException ignored)
                {
                }
                // Have another go at the loop
                continue;
            }
            else if(queueSize() == 0)
            {
                break;
            }
            URLToDownload nextURL;
            synchronized(queue)
            {
                nextURL = queue.getNextInQueue();
                downloadsInProgress++;
            }
            synchronized(urlsDownloading)
            {
                urlsDownloading.add(nextURL);
            }
            int newDepth = nextURL.getDepth() + 1;
            int maxDepth = config.getMaxDepth();
            List newURLs = downloadURL(nextURL, urlGetter, htmlParser);
            // Download finished: the URL no longer needs to be tracked as in progress
            synchronized(urlsDownloading)
            {
                urlsDownloading.remove(nextURL);
            }

            newURLs = filterURLs(newURLs);

            ArrayList u2dsToQueue = new ArrayList();
            for(Iterator i = newURLs.iterator(); i.hasNext(); )
            {
                URL u = (URL) i.next();
                // Download if not yet downloaded, and the new depth is less than the maximum
                synchronized(urlsDownloadedOrScheduled)
                {
                    if(!urlsDownloadedOrScheduled.contains(u)
                       && (maxDepth == 0 || newDepth <= maxDepth))
                    {
                        u2dsToQueue.add(new URLToDownload(u, nextURL.getURL(), newDepth));
                        urlsDownloadedOrScheduled.add(u);
                    }
                }
            }
            synchronized(queue)
            {
                queue.queueURLs(u2dsToQueue);
                downloadsInProgress--;
            }
        }
        _logClass.info("Spider thread stopping");
        running--;
    }

    /**
     * Get the size of the download queue in a thread-safe manner.
     */
    private int queueSize()
    {
        synchronized(queue)
        {
            return queue.size();
        }
    }

    /**
     * Get a URL, and return new URLs that are referenced from it.
     *
     * @return A List of URL objects.
     */
    private List downloadURL(URLToDownload url, URLGetter urlGetter, HTMLParser htmlParser)
    {
        _logClass.debug("downloadURL(" + url + ")");

        // If the object is already on disk, only re-fetch it when the config asks for a refresh
        URLObject obj = new URLObject(url.getURL(), config);
        if(obj.existsOnDisk())
        {
            if(config.refreshHTMLs() && (obj.isHTML() || obj.isXML()))
            {
                _logClass.info("Q: [" + queue + "] " + url);
                obj = urlGetter.getURL(url);
            }
            else if(config.refreshImages() && obj.isImage())
            {
                _logClass.info("Q: [" + queue + "] " + url);
                obj = urlGetter.getURL(url);
            }
        }
        else
        {
            _logClass.info("Q: [" + queue + "] " + url);
            obj = urlGetter.getURL(url);
        }

        if(obj == null)
        {
            return new ArrayList();
        }

        if(!obj.existsOnDisk())
        {
            obj.writeToFile();
        }

        if(obj.isHTML() || obj.isXML())
        {
            return htmlParser.parseLinksInDocument(url.getURL(), obj.getStringContent());
        }
        else if(obj.isImage())
        {
            return new ArrayList();
        }
        else
        {
            _logClass.warn("Unsupported content type received: " + obj.getContentType());
            _logClass.info("URL was " + url);
            return new ArrayList();
        }
    }

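    /**
     * Drop URLs that have already been downloaded or scheduled, and keep
     * only those whose external form contains the configured URL match
     * string.
     */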
    private List filterURLs(List URLs)
    {
        String match = config.getURLMatch();
        ArrayList retVal = new ArrayList();

        synchronized(urlsDownloadedOrScheduled)
        {
            for(Iterator i = URLs.iterator(); i.hasNext(); )
            {
                URL u = (URL) i.next();
                if(urlsDownloadedOrScheduled.contains(u))
                {
                    continue;
                }

                String s = u.toExternalForm();
                if(s.indexOf(match) != -1)
                {
                    retVal.add(u);
                }
            }
        }
        return retVal;
    }
}
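
For context only (not part of Spider.java): a minimal driver sketch showing how the class above could be exercised. It assumes SpiderConfig has a no-argument constructor that yields a usable default configuration; weblech's real entry point may construct and populate the config differently.

package weblech.spider;

public class SpiderDriverSketch
{
    public static void main(String[] args)
    {
        SpiderConfig config = new SpiderConfig();   // assumed constructor; real config setup may differ
        Spider spider = new Spider(config);
        spider.start();                             // launches config.getSpiderThreads() worker threads
        while(spider.isRunning())                   // poll until every worker thread has finished
        {
            try
            {
                Thread.sleep(1000);
            }
            catch(InterruptedException ignored)
            {
            }
        }
    }
}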