KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > websphinx > DownloadParameters


1 /*
2  * WebSphinx web-crawling toolkit
3  *
4  * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
5  * reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * 2. Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in
16  * the documentation and/or other materials provided with the
17  * distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
20  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
23  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  */

32
33 package websphinx;
34
35 /**
36  * Download parameters. These parameters are limits on
37  * how Page can download a Link. A Crawler has a
38  * default set of download parameters, but the defaults
39  * can be overridden on individual links by calling
40  * Link.setDownloadParameters().
41  * <P>
42  * DownloadParameters is an immutable class (like String).
43  * "Changing" a parameter actually returns a new instance
44  * of the class with only the specified parameter changed.
45  */

/**
 * Download parameters. These parameters are limits on how Page can download
 * a Link. A Crawler has a default set of download parameters, but the
 * defaults can be overridden on individual links by calling
 * Link.setDownloadParameters().
 * <P>
 * DownloadParameters is an immutable class (like String). "Changing" a
 * parameter actually returns a new instance of the class with only the
 * specified parameter changed.
 */
public class DownloadParameters implements Cloneable, java.io.Serializable {
    // NOTE(review): no explicit serialVersionUID; adding one now would break
    // deserialization of streams written by earlier builds — confirm before adding.

    /** Number of background threads used by the crawler. */
    private int maxThreads = 4;

    /** Maximum page size in kilobytes (-1 for no maximum). */
    private int maxPageSize = 100;

    /** Timeout for a single page, in seconds (-1 for no timeout). */
    private int downloadTimeout = 60;

    /** Timeout for entire crawl in seconds (-1 for no timeout). */
    private int crawlTimeout = -1;

    /** Obey crawling rules in robots.txt. */
    private boolean obeyRobotExclusion = false;

    // Not implemented yet:
    // private int maxRequestsPerServer = 2;
    //     // maximum number of simultaneous requests to a server (-1 for no maximum)
    // private int delay = 500;
    //     // delay (in milliseconds) between starts of requests to same server (0 for no delay)

    /** User is available to answer dialog boxes, e.g. for authentication. */
    private boolean interactive = true;

    /** Use cached pages to satisfy requests wherever possible. */
    private boolean useCaches = true;

    /** Accept header for HTTP request, or null to use default. */
    private String acceptedMIMETypes = null;

    /** User-Agent header for HTTP request, or null to use default. */
    private String userAgent = null;

    /** Download parameters with all default settings. */
    public static final DownloadParameters DEFAULT = new DownloadParameters ();

    /** Download parameters with no size or time limits. */
    public static final DownloadParameters NO_LIMITS =
        DEFAULT
        .changeMaxPageSize (-1)
        .changeDownloadTimeout (-1)
        .changeCrawlTimeout (-1)
        ;

    /**
     * Make a DownloadParameters object with default settings.
     */
    public DownloadParameters () {
    }

    /**
     * Clone a DownloadParameters object.
     * @return a field-for-field copy of this object
     */
    public Object clone () {
        try {
            return super.clone ();
        } catch (CloneNotSupportedException e) {
            // Cannot happen: this class implements Cloneable.
            throw new RuntimeException ("Internal error: " + e);
        }
    }

    /**
     * Get maximum threads.
     * @return maximum number of background threads used by crawler.
     * Default is 4.
     */
    public int getMaxThreads() {
        return maxThreads;
    }

    /**
     * Set maximum threads.
     * @param maxthreads maximum number of background threads used by crawler
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeMaxThreads(int maxthreads) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.maxThreads = maxthreads;
        return dp;
    }

    /**
     * Get maximum page size. Pages larger than this limit are neither
     * downloaded nor parsed.
     * Default value is 100 (KB). 0 or negative values mean no limit.
     * @return maximum page size in kilobytes
     */
    public int getMaxPageSize() {
        return maxPageSize;
    }

    /**
     * Change maximum page size. Pages larger than this limit are treated as
     * leaves in the crawl graph -- neither downloaded nor parsed.
     * @param maxPageSize maximum page size in kilobytes
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeMaxPageSize(int maxPageSize) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.maxPageSize = maxPageSize;
        return dp;
    }

    /**
     * Get download timeout value.
     * @return length of time (in seconds) that crawler will wait for a page
     * to download before aborting it. Default is 60 seconds.
     */
    public int getDownloadTimeout() {
        return downloadTimeout;
    }

    /**
     * Change download timeout value.
     * @param timeout length of time (in seconds) to wait for a page to download.
     * Use a negative value to turn off timeout.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeDownloadTimeout(int timeout) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.downloadTimeout = timeout;
        return dp;
    }

    /**
     * Get timeout on entire crawl.
     * @return maximum length of time (in seconds) that crawler will run
     * before aborting. Default is -1 (no limit).
     */
    public int getCrawlTimeout() {
        return crawlTimeout;
    }

    /**
     * Change timeout value.
     * @param timeout maximum length of time (in seconds) that crawler will run.
     * Use a negative value to turn off timeout.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeCrawlTimeout(int timeout) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.crawlTimeout = timeout;
        return dp;
    }

    /**
     * Get obey-robot-exclusion flag.
     * @return true iff the crawler checks robots.txt on the remote Web site
     * before downloading a page. Default is false.
     */
    public boolean getObeyRobotExclusion() {
        return obeyRobotExclusion;
    }

    /**
     * Change obey-robot-exclusion flag.
     * @param f If true, then the crawler checks robots.txt on the remote
     * Web site before downloading a page.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeObeyRobotExclusion(boolean f) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.obeyRobotExclusion = f;
        return dp;
    }

    /**
     * Get interactive flag.
     * @return true if a user is available to respond to dialog boxes
     * (for instance, to enter passwords for authentication).
     * Default is true.
     */
    public boolean getInteractive() {
        return interactive;
    }

    /**
     * Change interactive flag.
     * @param f true if a user is available to respond to dialog boxes
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeInteractive(boolean f) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.interactive = f;
        return dp;
    }

    /**
     * Get use-caches flag.
     * @return true if cached pages should be used whenever possible
     */
    public boolean getUseCaches() {
        return useCaches;
    }

    /**
     * Change use-caches flag.
     * @param f true if cached pages should be used whenever possible
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeUseCaches(boolean f) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.useCaches = f;
        return dp;
    }

    /**
     * Get accepted MIME types.
     * @return list of MIME types that can be handled by the crawler
     * (which are passed as the Accept header in the HTTP request).
     * Default is null.
     */
    public String getAcceptedMIMETypes() {
        return acceptedMIMETypes;
    }

    /**
     * Change accepted MIME types.
     * @param types list of MIME types that can be handled by the crawler.
     * Use null if the crawler can handle anything.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeAcceptedMIMETypes(String types) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.acceptedMIMETypes = types;
        return dp;
    }

    /**
     * Get User-agent header used in HTTP requests.
     * @return user-agent field used in HTTP requests, or null if the Java
     * library's default user-agent is used. Default value is null (but for
     * a Crawler, the default DownloadParameters has the Crawler's name as
     * its default user-agent).
     */
    public String getUserAgent() {
        return userAgent;
    }

    /**
     * Change User-agent field used in HTTP requests.
     * @param userAgent user-agent field used in HTTP requests. Pass null
     * to use the Java library's default user-agent field.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeUserAgent(String userAgent) {
        DownloadParameters dp = (DownloadParameters)clone();
        dp.userAgent = userAgent;
        return dp;
    }
}
284
Popular Tags