KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > weblech > spider > SpiderConfig


1 /*
2  * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
3  *
4  * Copyright (c) 2001 Brian Pitcher
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */

24
25 // $Header: /cvsroot/weblech/weblech/src/weblech/spider/SpiderConfig.java,v 1.9 2002/06/09 11:36:23 weblech Exp $
26

27 package weblech.spider;
28
29 import weblech.util.Logger;
30
31 import java.io.File JavaDoc;
32 import java.io.Serializable JavaDoc;
33 import java.util.*;
34 import java.net.URL JavaDoc;
35 import java.net.MalformedURLException JavaDoc;
36
37 public class SpiderConfig extends Logger implements Serializable JavaDoc
38 {
39     private File JavaDoc saveRootDirectory;
40     private File JavaDoc mailtoLogFile;
41
42     private boolean refreshHTMLs;
43     private boolean refreshImages;
44     private boolean refreshOthers;
45
46     private Set htmlExtensions;
47     private Set imageExtensions;
48
49     private URL JavaDoc startLocation;
50     private String JavaDoc urlMatch;
51
52     private List interestingURLSubstrings;
53     private List boringURLSubstrings;
54
55     private boolean depthFirst;
56     private int maxDepth;
57
58     private String JavaDoc userAgent;
59
60     private String JavaDoc basicAuthUser;
61     private String JavaDoc basicAuthPassword;
62
63     private int spiderThreads;
64
65     private long checkpointInterval;
66
67     /**
68      * Create a default config.
69      */

70     public SpiderConfig()
71     {
72         _logClass.debug("SpiderConfig()");
73
74         saveRootDirectory = new File JavaDoc(".");
75         mailtoLogFile = new File JavaDoc("mailto.txt");
76
77         refreshHTMLs = true;
78         refreshImages = false;
79         refreshOthers = false;
80
81         htmlExtensions = new HashSet();
82         htmlExtensions.add("htm");
83         htmlExtensions.add("html");
84         htmlExtensions.add("shtml");
85
86         imageExtensions = new HashSet();
87         imageExtensions.add("jpg");
88         imageExtensions.add("gif");
89         imageExtensions.add("png");
90
91         urlMatch = null;
92         interestingURLSubstrings = new ArrayList();
93         boringURLSubstrings = new ArrayList();
94         depthFirst = false;
95         maxDepth = 0;
96
97         userAgent = "WebLech Spider 0.01alpha";
98         basicAuthUser = "";
99         basicAuthPassword = "";
100
101         spiderThreads = 1;
102
103         checkpointInterval = 0;
104     }
105
106     /**
107      * Create a config from a java.util.Properties object.
108      */

109     public SpiderConfig(Properties props)
110     {
111         _logClass.debug("SpiderConfig(props)");
112
113         saveRootDirectory = new File JavaDoc(props.getProperty("saveRootDirectory", "."));
114         if(!saveRootDirectory.exists())
115         {
116             if(!saveRootDirectory.mkdirs())
117             {
118                 _logClass.error("Couldn't create root directory: " + saveRootDirectory);
119                 _logClass.info("Defaulting to . instead");
120                 saveRootDirectory = new File JavaDoc(".");
121             }
122         }
123         else if(!saveRootDirectory.isDirectory())
124         {
125             _logClass.error("Save root is not a directory: " + saveRootDirectory);
126             _logClass.info("Defaulting to . instead");
127             saveRootDirectory = new File JavaDoc(".");
128         }
129
130         String JavaDoc mailtoFileStr = props.getProperty("mailtoLogFile", "mailto.txt");
131         // Check if absolute or relative name given
132
if(mailtoFileStr.indexOf(":") != -1 || mailtoFileStr.startsWith("/") || mailtoFileStr.startsWith("\\"))
133         {
134             _logClass.debug("Using absolute file name " + mailtoFileStr);
135             mailtoLogFile = new File JavaDoc(mailtoFileStr);
136         }
137         else
138         {
139             _logClass.debug("Constructing relative file name " + saveRootDirectory.getPath() + "/" + mailtoFileStr);
140             mailtoLogFile = new File JavaDoc(saveRootDirectory.getPath() + "/" + mailtoFileStr);
141         }
142
143         refreshHTMLs = Boolean.valueOf(props.getProperty("refreshHTMLs", "true")).booleanValue();
144         refreshImages = Boolean.valueOf(props.getProperty("refreshImages", "false")).booleanValue();
145         refreshOthers = Boolean.valueOf(props.getProperty("refreshOthers", "false")).booleanValue();
146
147         htmlExtensions = parseSet(props.getProperty("htmlExtensions", "htm,html,shtml"));
148         imageExtensions = parseSet(props.getProperty("imageExtensions", "jpg,gif,png"));
149
150         String JavaDoc startLocStr = props.getProperty("startLocation");
151         if(startLocStr != null)
152         {
153             try
154             {
155                 startLocation = new URL JavaDoc(startLocStr);
156             }
157             catch(MalformedURLException JavaDoc murle)
158             {
159                 _logClass.error("Caught MalformedURLException parsing start URL '" + startLocStr + "' : " + murle.getMessage(), murle);
160             }
161         }
162         else
163         {
164             _logClass.warn("startLocation not found in properties");
165         }
166
167         urlMatch = props.getProperty("urlMatch");
168
169         interestingURLSubstrings = parsePropCommaSeparated(props.getProperty("interestingURLs"));
170         boringURLSubstrings = parsePropCommaSeparated(props.getProperty("boringURLs"));
171
172         depthFirst = Boolean.valueOf(props.getProperty("depthFirst", "false")).booleanValue();
173         try
174         {
175             String JavaDoc maxDepthStr = props.getProperty("maxDepth", "0");
176             maxDepth = Integer.parseInt(maxDepthStr);
177         }
178         catch(NumberFormatException JavaDoc nfe)
179         {
180             _logClass.error("Caught number format exception parsing max depth, defaulting to 1", nfe);
181             maxDepth = 1;
182         }
183
184         userAgent = props.getProperty("userAgent", "WebLech Spider 0.01alpha");
185         basicAuthUser = props.getProperty("basicAuthUser", "");
186         basicAuthPassword = props.getProperty("basicAuthPassword", "");
187
188         try
189         {
190             String JavaDoc threadsStr = props.getProperty("spiderThreads", "1");
191             spiderThreads = Integer.parseInt(threadsStr);
192         }
193         catch(NumberFormatException JavaDoc nfe)
194         {
195             _logClass.error("Caught number format exception parsing number of threads, defaulting to 1", nfe);
196             spiderThreads = 1;
197         }
198
199         try
200         {
201             String JavaDoc intervalStr = props.getProperty("checkpointInterval", "0");
202             checkpointInterval = Long.parseLong(intervalStr);
203         }
204         catch(NumberFormatException JavaDoc nfe)
205         {
206             _logClass.error("Caught number format exception parsing checkpoint interval, defaulting to 0", nfe);
207             spiderThreads = 1;
208         }
209     }
210
211     private List parsePropCommaSeparated(String JavaDoc str)
212     {
213         ArrayList result = new ArrayList();
214         if(str != null && str.length() > 0)
215         {
216             StringTokenizer tok = new StringTokenizer(str, ",");
217             while(tok.hasMoreTokens())
218             {
219                 result.add(tok.nextToken());
220             }
221         }
222         return result;
223     }
224
225
226     public void setRefreshHTMLs(boolean refreshHTMLs)
227     {
228         this.refreshHTMLs = refreshHTMLs;
229     }
230
231     public boolean refreshHTMLs()
232     {
233         return refreshHTMLs;
234     }
235
236     public void setRefreshImages(boolean refreshImages)
237     {
238         this.refreshImages = refreshImages;
239     }
240
241     public boolean refreshImages()
242     {
243         return refreshImages;
244     }
245
246     public void setRefreshOthers(boolean refreshOthers)
247     {
248         this.refreshOthers = refreshOthers;
249     }
250
251     public boolean refreshOthers()
252     {
253         return refreshOthers;
254     }
255
256     public void setSaveRootDirectory(File JavaDoc saveRootDirectory)
257     {
258         this.saveRootDirectory = saveRootDirectory;
259     }
260
261     public File JavaDoc getSaveRootDirectory()
262     {
263         return saveRootDirectory;
264     }
265
266     public void setMailtoLogFile(File JavaDoc mailtoLogFile)
267     {
268         this.mailtoLogFile = mailtoLogFile;
269     }
270
271     public File JavaDoc getMailtoLogFile()
272     {
273         return mailtoLogFile;
274     }
275
276     public void setStartLocation(URL JavaDoc startLocation)
277     {
278         this.startLocation = startLocation;
279     }
280
281     public URL JavaDoc getStartLocation()
282     {
283         return startLocation;
284     }
285
286     public void setURLMatch(String JavaDoc urlMatch)
287     {
288         this.urlMatch = urlMatch;
289     }
290
291     public String JavaDoc getURLMatch()
292     {
293         return urlMatch;
294     }
295
296     public List getInterestingURLSubstrings()
297     {
298         return interestingURLSubstrings;
299     }
300
301     public void setInterestingURLSubstrings(List interestingURLSubstrings)
302     {
303         this.interestingURLSubstrings = interestingURLSubstrings;
304     }
305
306     public List getBoringURLSubstrings()
307     {
308         return boringURLSubstrings;
309     }
310
311     public void setBoringURLSubstrings(List boringURLSubstrings)
312     {
313         this.boringURLSubstrings = boringURLSubstrings;
314     }
315
316     public boolean isInteresting(URL JavaDoc u)
317     {
318         return matchURL(u, interestingURLSubstrings);
319     }
320
321     public boolean isBoring(URL JavaDoc u)
322     {
323         return matchURL(u, boringURLSubstrings);
324     }
325
326     private boolean matchURL(URL JavaDoc u, List substrings)
327     {
328         String JavaDoc str = u.toExternalForm();
329         for(Iterator i = substrings.iterator(); i.hasNext(); )
330         {
331             String JavaDoc substr = (String JavaDoc) i.next();
332             if(str.indexOf(substr) != -1)
333             {
334                 return true;
335             }
336         }
337         return false;
338     }
339
340     public void setDepthFirstSearch(boolean depthFirst)
341     {
342         this.depthFirst = depthFirst;
343     }
344
345     public boolean isDepthFirstSearch()
346     {
347         return depthFirst;
348     }
349
350     public void setMaxDepth(int maxDepth)
351     {
352         this.maxDepth = maxDepth;
353     }
354
355     public int getMaxDepth()
356     {
357         return maxDepth;
358     }
359
360     public void setUserAgent(String JavaDoc userAgent)
361     {
362         this.userAgent = userAgent;
363     }
364
365     public String JavaDoc getUserAgent()
366     {
367         return userAgent;
368     }
369
370     public void setBasicAuthUser(String JavaDoc basicAuthUser)
371     {
372         this.basicAuthUser = basicAuthUser;
373     }
374
375     public String JavaDoc getBasicAuthUser()
376     {
377         return basicAuthUser;
378     }
379
380     public void setBasicAuthPassword(String JavaDoc basicAuthPassword)
381     {
382         this.basicAuthPassword = basicAuthPassword;
383     }
384
385     public String JavaDoc getBasicAuthPassword()
386     {
387         return basicAuthPassword;
388     }
389
390     public void setSpiderThreads(int spiderThreads)
391     {
392         this.spiderThreads = spiderThreads;
393     }
394
395     public int getSpiderThreads()
396     {
397         return spiderThreads;
398     }
399
400     public void setCheckpointInterval(long interval)
401     {
402         this.checkpointInterval = interval;
403     }
404
405     public long getCheckpointInterval()
406     {
407         return checkpointInterval;
408     }
409
410     public String JavaDoc toString()
411     {
412         return "depthFirst:\t" + depthFirst
413            + "\nmaxDepth:\t" + maxDepth
414            + "\nhtmlExtensions:\t" + fromSet(htmlExtensions)
415            + "\nimageExtensions:\t" + fromSet(imageExtensions)
416            + "\nrefreshHTMLs:\t" + refreshHTMLs
417            + "\nrefreshImages:\t" + refreshImages
418            + "\nrefreshOthers:\t" + refreshOthers
419            + "\nsaveRootDirectory:\t" + saveRootDirectory
420            + "\nstartLocation:\t" + startLocation
421            + "\nurlMatch:\t" + urlMatch
422            + "\nuserAgent:\t" + userAgent
423            + "\nbasicAuthUser:\t" + basicAuthUser
424            + "\nbasicAuthPassword:\t" + "***"
425            + "\nspiderThreads:\t" + spiderThreads
426            + "\ncheckpointInterval:\t" + checkpointInterval;
427     }
428
429     private Set parseSet(String JavaDoc str)
430     {
431         _logClass.debug("parseSet(" + str + ")");
432         HashSet result = new HashSet();
433         StringTokenizer sTok = new StringTokenizer(str, ",");
434         while(sTok.hasMoreTokens())
435         {
436             String JavaDoc tok = sTok.nextToken().trim();
437             result.add(tok);
438         }
439         return result;
440     }
441
442     private String JavaDoc fromSet(Set s)
443     {
444         StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
445         boolean first = true;
446         for(Iterator i = s.iterator(); i.hasNext(); )
447         {
448             String JavaDoc str = (String JavaDoc) i.next();
449             if(first)
450             {
451                 first = false;
452             }
453             else
454             {
455                 sb.append(",");
456             }
457             sb.append(str);
458         }
459         return sb.toString();
460     }
461
462 } // End class SpiderConfig
463
Popular Tags