KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > websphinx > DownloadParameters


1 /*
2  * WebSphinx web-crawling toolkit
3  *
4  * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
5  * reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * 2. Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in
16  * the documentation and/or other materials provided with the
17  * distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
20  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
23  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  */

32
33 package websphinx;
34
35 /**
36  * Download parameters. These parameters are limits on
37  * how Page can download a Link. A Crawler has a
38  * default set of download parameters, but the defaults
39  * can be overridden on individual links by calling
40  * Link.setDownloadParameters().
41  * <P>
42  * DownloadParameters is an immutable class (like String).
43  * "Changing" a parameter actually returns a new instance
44  * of the class with only the specified parameter changed.
45  */

/**
 * Download parameters. These parameters are limits on how Page can download
 * a Link. A Crawler has a default set of download parameters, but the
 * defaults can be overridden on individual links by calling
 * Link.setDownloadParameters().
 * <P>
 * DownloadParameters is an immutable class (like String). "Changing" a
 * parameter actually returns a new instance of the class with only the
 * specified parameter changed.
 */
public class DownloadParameters implements Cloneable, java.io.Serializable {
    // NOTE(review): no explicit serialVersionUID; adding one now would break
    // deserialization of streams written by earlier builds — confirm before adding.

    /** Number of background threads used by the crawler. */
    private int maxThreads = 4;

    /** Maximum page size in kilobytes (-1 for no maximum). */
    private int maxPageSize = 100;

    /** Timeout for a single page, in seconds (-1 for no timeout). */
    private int downloadTimeout = 60;

    /** Timeout for entire crawl in seconds (-1 for no timeout). */
    private int crawlTimeout = -1;

    /** Obey crawling rules in robots.txt. */
    private boolean obeyRobotExclusion = false;

    // Not implemented yet:
    // private int maxRequestsPerServer = 2;
    //     // maximum number of simultaneous requests to a server (-1 for no maximum)
    // private int delay = 500;
    //     // delay (in milliseconds) between starts of requests to same server (0 for no delay)

    /** User is available to answer dialog boxes, e.g. for authentication. */
    private boolean interactive = true;

    /** Use cached pages to satisfy requests wherever possible. */
    private boolean useCaches = true;

    /** Accept header for HTTP request, or null to use default. */
    private String acceptedMIMETypes = null;

    /** User-Agent header for HTTP request, or null to use default. */
    private String userAgent = null;

    /** Download parameters with all default settings. */
    public static final DownloadParameters DEFAULT = new DownloadParameters ();

    /** Download parameters with no size or time limits. */
    public static final DownloadParameters NO_LIMITS =
        DEFAULT
        .changeMaxPageSize (-1)
        .changeDownloadTimeout (-1)
        .changeCrawlTimeout (-1)
        ;

    /**
     * Make a DownloadParameters object with default settings.
     */
    public DownloadParameters () {
    }

    /**
     * Clone a DownloadParameters object.
     * @return a field-for-field copy of this object
     */
    public Object clone () {
        try {
            return super.clone ();
        } catch (CloneNotSupportedException e) {
            // Cannot happen: this class implements Cloneable.
            throw new RuntimeException ("Internal error: " + e);
        }
    }

    /**
     * Get maximum threads.
     * @return maximum number of background threads used by crawler.
     * Default is 4.
     */
    public int getMaxThreads() {
        return maxThreads;
    }

    /**
     * Set maximum threads.
     * @param maxthreads maximum number of background threads used by crawler
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeMaxThreads(int maxthreads) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.maxThreads = maxthreads;
        return dp;
    }

    /**
     * Get maximum page size. Pages larger than this limit are neither
     * downloaded nor parsed.
     * Default value is 100 (KB). 0 or negative values mean no limit.
     * @return maximum page size in kilobytes
     */
    public int getMaxPageSize() {
        return maxPageSize;
    }

    /**
     * Change maximum page size. Pages larger than this limit are treated as
     * leaves in the crawl graph -- neither downloaded nor parsed.
     * @param maxPageSize maximum page size in kilobytes
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeMaxPageSize(int maxPageSize) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.maxPageSize = maxPageSize;
        return dp;
    }

    /**
     * Get download timeout value.
     * @return length of time (in seconds) that crawler will wait for a page
     * to download before aborting it. Default is 60 seconds.
     */
    public int getDownloadTimeout() {
        return downloadTimeout;
    }

    /**
     * Change download timeout value.
     * @param timeout length of time (in seconds) to wait for a page to download.
     * Use a negative value to turn off timeout.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeDownloadTimeout(int timeout) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.downloadTimeout = timeout;
        return dp;
    }

    /**
     * Get timeout on entire crawl.
     * @return maximum length of time (in seconds) that crawler will run
     * before aborting. Default is -1 (no limit).
     */
    public int getCrawlTimeout() {
        return crawlTimeout;
    }

    /**
     * Change timeout value.
     * @param timeout maximum length of time (in seconds) that crawler will run.
     * Use a negative value to turn off timeout.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeCrawlTimeout(int timeout) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.crawlTimeout = timeout;
        return dp;
    }

    /**
     * Get obey-robot-exclusion flag.
     * @return true iff the crawler checks robots.txt on the remote Web site
     * before downloading a page. Default is false.
     */
    public boolean getObeyRobotExclusion() {
        return obeyRobotExclusion;
    }

    /**
     * Change obey-robot-exclusion flag.
     * @param f If true, then the crawler checks robots.txt on the remote
     * Web site before downloading a page.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeObeyRobotExclusion(boolean f) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.obeyRobotExclusion = f;
        return dp;
    }

    /**
     * Get interactive flag.
     * @return true if a user is available to respond to dialog boxes
     * (for instance, to enter passwords for authentication).
     * Default is true.
     */
    public boolean getInteractive() {
        return interactive;
    }

    /**
     * Change interactive flag.
     * @param f true if a user is available to respond to dialog boxes
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeInteractive(boolean f) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.interactive = f;
        return dp;
    }

    /**
     * Get use-caches flag.
     * @return true if cached pages should be used whenever possible
     */
    public boolean getUseCaches() {
        return useCaches;
    }

    /**
     * Change use-caches flag.
     * @param f true if cached pages should be used whenever possible
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeUseCaches(boolean f) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.useCaches = f;
        return dp;
    }

    /**
     * Get accepted MIME types.
     * @return list of MIME types that can be handled by the crawler
     * (which are passed as the Accept header in the HTTP request).
     * Default is null.
     */
    public String getAcceptedMIMETypes() {
        return acceptedMIMETypes;
    }

    /**
     * Change accepted MIME types.
     * @param types list of MIME types that can be handled by the crawler.
     * Use null if the crawler can handle anything.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeAcceptedMIMETypes(String types) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.acceptedMIMETypes = types;
        return dp;
    }

    /**
     * Get User-agent header used in HTTP requests.
     * @return user-agent field used in HTTP requests, or null if the Java
     * library's default user-agent is used. Default value is null (but for
     * a Crawler, the default DownloadParameters has the Crawler's name as
     * its default user-agent).
     */
    public String getUserAgent() {
        return userAgent;
    }

    /**
     * Change User-agent field used in HTTP requests.
     * @param userAgent user-agent field used in HTTP requests. Pass null
     * to use the Java library's default user-agent field.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeUserAgent(String userAgent) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.userAgent = userAgent;
        return dp;
    }
}
284
Popular Tags