KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > datamodel > CrawlOrder


1 /*
2  * CrawlOrder
3  *
4  * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/datamodel/CrawlOrder.java,v 1.57.4.1 2007/01/13 01:31:08 stack-sf Exp $
5  *
6  * Created on May 15, 2003
7  *
8  * Copyright (C) 2003 Internet Archive.
9  *
10  * This file is part of the Heritrix web crawler (crawler.archive.org).
11  *
12  * Heritrix is free software; you can redistribute it and/or modify
13  * it under the terms of the GNU Lesser Public License as published by
14  * the Free Software Foundation; either version 2.1 of the License, or
15  * any later version.
16  *
17  * Heritrix is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20  * GNU Lesser Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser Public License
23  * along with Heritrix; if not, write to the Free Software
24  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  *
26  */

27
28 package org.archive.crawler.datamodel;
29
import java.io.File;
import java.io.Serializable;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.canonicalize.BaseRule;
46
47 /**
48  * Represents the 'root' of the settings hierarchy. Contains those settings that
49  * do not belong to any specific module, but rather relate to the crawl as a
50  * whole (much of this is used by the CrawlController directly or indirectly).
51  *
52  * @see org.archive.crawler.settings.ModuleType
53  */

54 public class CrawlOrder extends ModuleType implements Serializable JavaDoc {
55
56     private static final long serialVersionUID = -6715840285961511669L;
57
58     private static Logger JavaDoc logger =
59         Logger.getLogger("org.archive.crawler.datamodel.CrawlOrder");
60
61     public static final String JavaDoc ATTR_NAME = "crawl-order";
62     public static final String JavaDoc ATTR_SETTINGS_DIRECTORY = "settings-directory";
63     public static final String JavaDoc ATTR_DISK_PATH = "disk-path";
64     public static final String JavaDoc ATTR_LOGS_PATH = "logs-path";
65     public static final String JavaDoc ATTR_CHECKPOINTS_PATH = "checkpoints-path";
66     public static final String JavaDoc ATTR_STATE_PATH = "state-path";
67     public static final String JavaDoc ATTR_SCRATCH_PATH = "scratch-path";
68     public static final String JavaDoc ATTR_RECOVER_PATH = "recover-path";
69     public static final String JavaDoc ATTR_RECOVER_RETAIN_FAILURES =
70         "recover-retain-failures";
71     public static final String JavaDoc ATTR_MAX_BYTES_DOWNLOAD = "max-bytes-download";
72     public static final String JavaDoc ATTR_MAX_DOCUMENT_DOWNLOAD =
73         "max-document-download";
74     public static final String JavaDoc ATTR_MAX_TIME_SEC = "max-time-sec";
75     public static final String JavaDoc ATTR_MAX_TOE_THREADS = "max-toe-threads";
76     public static final String JavaDoc ATTR_HTTP_HEADERS = "http-headers";
77     public static final String JavaDoc ATTR_USER_AGENT = "user-agent";
78     public static final String JavaDoc ATTR_FROM = "from";
79     public static final String JavaDoc ATTR_PRE_FETCH_PROCESSORS =
80         "pre-fetch-processors";
81     public static final String JavaDoc ATTR_FETCH_PROCESSORS = "fetch-processors";
82     public static final String JavaDoc ATTR_EXTRACT_PROCESSORS = "extract-processors";
83     public static final String JavaDoc ATTR_WRITE_PROCESSORS = "write-processors";
84     public static final String JavaDoc ATTR_POST_PROCESSORS = "post-processors";
85     public static final String JavaDoc ATTR_LOGGERS = "loggers";
86     public static final String JavaDoc ATTR_RULES = "uri-canonicalization-rules";
87     public static final String JavaDoc ATTR_RECORDER_OUT_BUFFER =
88         "recorder-out-buffer-bytes";
89     public static final String JavaDoc ATTR_RECORDER_IN_BUFFER =
90         "recorder-in-buffer-bytes";
91     
92     /** Percentage of heap to allocate to bdb cache */
93     public static final String JavaDoc ATTR_BDB_CACHE_PERCENT =
94         "bdb-cache-percent";
95     
96     /**
97      * When checkpointing, copy the bdb logs.
98      * Default is true. If false, then we do not copy logs on checkpoint AND
99      * we tell bdbje never to delete log files; instead it renames
100      * files-to-delete with a '.del' extension. Assumption is that when this
101      * setting is false, an external process is managing the removing of
102      * bdbje log files and that come time to recover from a checkpoint, the
103      * files that comprise a checkpoint are manually assembled.
104      */

105     public static final String JavaDoc ATTR_CHECKPOINT_COPY_BDBJE_LOGS =
106         "checkpoint-copy-bdbje-logs";
107     public static final Boolean JavaDoc DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS =
108         Boolean.TRUE;
109     
110     /**
111      * Default size of bdb cache.
112      */

113     private final static Integer JavaDoc DEFAULT_BDB_CACHE_PERCENT = new Integer JavaDoc(0);
114
115     private transient MapType httpHeaders;
116     private transient MapType loggers;
117
118     private transient CrawlController controller;
119
120     /**
121      * Regex for acceptable user-agent format.
122      */

123     private static String JavaDoc ACCEPTABLE_USER_AGENT =
124         "\\S+.*\\(.*\\+http(s)?://\\S+\\.\\S+.*\\).*";
125
126     /**
127      * Regex for acceptable from address.
128      */

129     private static String JavaDoc ACCEPTABLE_FROM = "\\S+@\\S+\\.\\S+";
130     
131
132     /** Construct a CrawlOrder.
133      */

134     public CrawlOrder() {
135         super(ATTR_NAME, "Heritrix crawl order. This forms the root of " +
136                 "the settings framework.");
137         Type e;
138
139         e = addElementToDefinition(new SimpleType(ATTR_SETTINGS_DIRECTORY,
140                 "Directory where override settings are kept. The settings " +
141                 "for many modules can be overridden based on the domain or " +
142                 "subdomain of the URI being processed. This setting specifies" +
143                 " a file level directory to store those settings. The path" +
144                 " is relative to 'disk-path' unless" +
145                 " an absolute path is provided.", "settings"));
146         e.setOverrideable(false);
147         e.setExpertSetting(true);
148
149         e = addElementToDefinition(new SimpleType(ATTR_DISK_PATH,
150                 "Directory where logs, arcs and other run time files will " +
151                 "be kept. If this path is a relative path, it will be " +
152                 "relative to the crawl order.", ""));
153         e.setOverrideable(false);
154         e.setExpertSetting(true);
155
156         e = addElementToDefinition(new SimpleType(ATTR_LOGS_PATH,
157                 "Directory where crawler log files will be kept. If this path " +
158                 "is a relative path, it will be relative to the 'disk-path'.",
159                 "logs"));
160         e.setOverrideable(false);
161         e.setExpertSetting(true);
162
163         e = addElementToDefinition(new SimpleType(ATTR_CHECKPOINTS_PATH,
164                 "Directory where crawler checkpoint files will be kept. " +
165                 "If this path " +
166                 "is a relative path, it will be relative to the 'disk-path'.",
167                 "checkpoints"));
168         e.setOverrideable(false);
169         e.setExpertSetting(true);
170
171         e = addElementToDefinition(new SimpleType(ATTR_STATE_PATH,
172                 "Directory where crawler-state files will be kept. If this path " +
173                 "is a relative path, it will be relative to the 'disk-path'.",
174                 "state"));
175         e.setOverrideable(false);
176         e.setExpertSetting(true);
177
178         e = addElementToDefinition(new SimpleType(ATTR_SCRATCH_PATH,
179                 "Directory where discardable temporary files will be kept. " +
180                 "If this path " +
181                 "is a relative path, it will be relative to the 'disk-path'.",
182                 "scratch"));
183         e.setOverrideable(false);
184         e.setExpertSetting(true);
185
186         e = addElementToDefinition(new SimpleType(ATTR_MAX_BYTES_DOWNLOAD,
187                 "Maximum number of bytes to download. Once this number is" +
188                 " exceeded the crawler will stop. " +
189                 "A value of zero means no upper limit.", new Long JavaDoc(0)));
190         e.setOverrideable(false);
191
192         e = addElementToDefinition(new SimpleType(ATTR_MAX_DOCUMENT_DOWNLOAD,
193                 "Maximum number of documents to download. Once this number" +
194                 " is exceeded the crawler will stop. " +
195                 "A value of zero means no upper limit.", new Long JavaDoc(0)));
196         e.setOverrideable(false);
197
198         e = addElementToDefinition(new SimpleType(ATTR_MAX_TIME_SEC,
199                 "Maximum amount of time to crawl (in seconds). Once this" +
200                 " much time has elapsed the crawler will stop. A value of" +
201                 " zero means no upper limit.",
202                 new Long JavaDoc(0)));
203         e.setOverrideable(false);
204         
205         e = addElementToDefinition(new SimpleType(ATTR_MAX_TOE_THREADS,
206                 "Maximum number of threads processing URIs at the same time.",
207                 new Integer JavaDoc(100)));
208         e.setOverrideable(false);
209
210         e = addElementToDefinition(new SimpleType(ATTR_RECORDER_OUT_BUFFER,
211                 "Size in bytes of in-memory buffer to record outbound " +
212                 "traffic. One such buffer is reserved for every ToeThread.",
213                 new Integer JavaDoc(4096)));
214         e.setOverrideable(false);
215         e.setExpertSetting(true);
216         
217         e = addElementToDefinition(new SimpleType(ATTR_RECORDER_IN_BUFFER,
218                 "Size in bytes of in-memory buffer to record inbound " +
219                 "traffic. One such buffer is reserved for every ToeThread.",
220                 new Integer JavaDoc(65536)));
221         e.setOverrideable(false);
222         e.setExpertSetting(true);
223         
224         e = addElementToDefinition(new SimpleType(ATTR_BDB_CACHE_PERCENT,
225                 "Percentage of heap to allocate to BerkeleyDB JE cache. " +
226                 "Default of zero means no preference (accept BDB's default, " +
227                 "usually 60%, or the je.maxMemoryPercent property value).",
228                 DEFAULT_BDB_CACHE_PERCENT));
229         e.setExpertSetting(true);
230         e.setOverrideable(false);
231         
232         addElementToDefinition(new CrawlScope());
233
234         httpHeaders = (MapType) addElementToDefinition(new MapType(
235                 ATTR_HTTP_HEADERS, "HTTP headers. Information that will " +
236                         "be used when constructing the HTTP headers of " +
237                         "the crawler's HTTP requests."));
238
239         e = httpHeaders.addElementToDefinition(new SimpleType(ATTR_USER_AGENT,
240                 "User agent to act as. Field must contain valid URL " +
241                 "that links to website of person or organization " +
242                 "running the crawl. Replace 'PROJECT_URL_HERE' in " +
243                 "initial template. E.g. If organization " +
244                 "is Library of Congress, a valid user agent would be:" +
245                 "'Mozilla/5.0 (compatible; loc-crawler/0.11.0 " +
246                 "+http://loc.gov)'. " +
247                 "Note, you must preserve the '+' before the 'http'.",
248           "Mozilla/5.0 (compatible; heritrix/@VERSION@ +PROJECT_URL_HERE)"));
249
250         e = httpHeaders.addElementToDefinition(new SimpleType(ATTR_FROM,
251                 "Contact information. This field must contain a valid " +
252                 "e-mail address for the person or organization responsible" +
253                 "for this crawl: e.g. 'webmaster@loc.gov'",
254                 "CONTACT_EMAIL_ADDRESS_HERE"));
255
256         addElementToDefinition(new RobotsHonoringPolicy());
257
258         e = addElementToDefinition(new ModuleType(
259                 Frontier.ATTR_NAME, "Frontier"));
260         e.setLegalValueType(Frontier.class);
261
262         e = (MapType) addElementToDefinition(new MapType(ATTR_RULES,
263             "Ordered list of url canonicalization rules. " +
264             "Rules are applied in the order listed from top to bottom.",
265             BaseRule.class));
266         e.setOverrideable(true);
267         e.setExpertSetting(true);
268         
269         e = addElementToDefinition(new MapType(
270                 ATTR_PRE_FETCH_PROCESSORS, "Processors to run prior to" +
271                         " fetching anything from the network.",
272                         Processor.class));
273         e.setOverrideable(false);
274
275         e = addElementToDefinition(new MapType(
276                 ATTR_FETCH_PROCESSORS, "Processors that fetch documents."
277                 , Processor.class));
278         e.setOverrideable(false);
279
280         e = addElementToDefinition(new MapType(
281                 ATTR_EXTRACT_PROCESSORS, "Processors that extract new URIs" +
282                         " from fetched documents.", Processor.class));
283         e.setOverrideable(false);
284
285         e = addElementToDefinition(new MapType(
286                 ATTR_WRITE_PROCESSORS, "Processors that write documents" +
287                         " to archives.", Processor.class));
288         e.setOverrideable(false);
289
290         e = addElementToDefinition(new MapType(
291                 ATTR_POST_PROCESSORS, "Processors that do cleanup and feed" +
292                         " the frontier with new URIs.", Processor.class));
293         e.setOverrideable(false);
294
295         loggers = (MapType) addElementToDefinition(new MapType(ATTR_LOGGERS,
296                 "Statistics tracking modules. Any number of specialized " +
297                 "statistics tracker that monitor a crawl and write logs, " +
298                 "reports and/or provide information to the user interface."));
299
300         e = addElementToDefinition(new SimpleType(ATTR_RECOVER_PATH,
301                 "Optional. Points at recover log (or recover.gz log) OR " +
302                 "the checkpoint directory to use recovering a crawl.", ""));
303         e.setOverrideable(false);
304         e.setExpertSetting(true);
305         
306         e = addElementToDefinition(new SimpleType(
307             ATTR_CHECKPOINT_COPY_BDBJE_LOGS,
308             "When true, on a checkpoint, we copy off the bdbje log files to " +
309             "the checkpoint directory. To recover a checkpoint, just " +
310             "set the " + ATTR_RECOVER_PATH + " to point at the checkpoint " +
311             "directory to recover. This is default setting. " +
312             "But if crawl is large, " +
313             "copying bdbje log files can take tens of minutes and even " +
314             "upwards of an hour (Copying bdbje log files will consume bulk " +
315             "of time checkpointing). If this setting is false, we do NOT copy " +
316             "bdbje logs on checkpoint AND we set bdbje to NEVER delete log " +
317             "files (instead we have it rename files-to-delete with a '.del'" +
318             "extension). Assumption is that when this setting is false, " +
319             "an external process is managing the removal of bdbje log files " +
320             "and that come time to recover from a checkpoint, the files that " +
321             "comprise a checkpoint are manually assembled. This is an expert " +
322             "setting.",
323             DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS));
324         e.setOverrideable(false);
325         e.setExpertSetting(true);
326
327         e = addElementToDefinition(new SimpleType(ATTR_RECOVER_RETAIN_FAILURES,
328                 "When recovering via the recover.log, should failures " +
329                 "in the log be retained in the recovered crawl, " +
330                 "preventing the corresponding URIs from being retried. " +
331                 "Default is false, meaning failures are forgotten, and " +
332                 "the corresponding URIs will be retried in the recovered " +
333                 "crawl.", Boolean.FALSE));
334         e.setOverrideable(false);
335         e.setExpertSetting(true);
336         
337         e = addElementToDefinition(
338            new CredentialStore(CredentialStore.ATTR_NAME));
339         e.setOverrideable(true);
340         e.setExpertSetting(true);
341     }
342
343     /**
344      * @param curi
345      * @return user-agent header value to use
346      */

347     public String JavaDoc getUserAgent(CrawlURI curi) {
348         return ((String JavaDoc) httpHeaders.getUncheckedAttribute(curi, ATTR_USER_AGENT));
349     }
350
351     /**
352      * @param curi
353      * @return from header value to use
354      */

355     public String JavaDoc getFrom(CrawlURI curi) {
356         String JavaDoc res = null;
357         try {
358             res = (String JavaDoc) httpHeaders.getAttribute(ATTR_FROM, curi);
359         } catch (AttributeNotFoundException JavaDoc e) {
360             logger.severe(e.getMessage());
361         }
362         return res;
363     }
364
365     /**
366      * Returns the set number of maximum toe threads.
367      * @return Number of maximum toe threads
368      */

369     public int getMaxToes() {
370         Integer JavaDoc res = null;
371         try {
372             res = (Integer JavaDoc) getAttribute(null, ATTR_MAX_TOE_THREADS);
373         } catch (AttributeNotFoundException JavaDoc e) {
374             logger.severe(e.getMessage());
375         }
376         return res.intValue();
377     }
378
379     /**
380      * This method gets the RobotsHonoringPolicy object from the orders file.
381      *
382      * @return the new RobotsHonoringPolicy
383      */

384     public RobotsHonoringPolicy getRobotsHonoringPolicy() {
385         try {
386             return (RobotsHonoringPolicy) getAttribute(null, RobotsHonoringPolicy.ATTR_NAME);
387         } catch (AttributeNotFoundException JavaDoc e) {
388             logger.severe(e.getMessage());
389             return null;
390         }
391     }
392
393     /** Get the name of the order file.
394      *
395      * @return the name of the order file.
396      */

397     public String JavaDoc getCrawlOrderName() {
398         return getSettingsHandler().getSettingsObject(null).getName();
399     }
400
401     /**
402      * @return The crawl controller.
403      */

404     public CrawlController getController() {
405         return controller;
406     }
407
408     /**
409      * @param controller
410      */

411     public void setController(CrawlController controller) {
412         this.controller = controller;
413     }
414
415     /**
416      * Returns the Map of the StatisticsTracking modules that are included in the
417      * configuration that the current instance of this class is representing.
418      * @return Map of the StatisticsTracking modules
419      */

420     public MapType getLoggers() {
421         return loggers;
422     }
423
424     /**
425      * Checks if the User Agent and From field are set 'correctly' in
426      * the specified Crawl Order.
427      *
428      * @throws FatalConfigurationException
429      */

430     public void checkUserAgentAndFrom() throws FatalConfigurationException {
431         // don't start the crawl if they're using the default user-agent
432
String JavaDoc userAgent = this.getUserAgent(null);
433         String JavaDoc from = this.getFrom(null);
434         if (!(userAgent.matches(ACCEPTABLE_USER_AGENT)
435             && from.matches(ACCEPTABLE_FROM))) {
436             throw new FatalConfigurationException("unacceptable user-agent " +
437                     " or from (Reedit your order file).");
438         }
439     }
440
441     /**
442      * @return Checkpoint directory.
443      */

444     public File JavaDoc getCheckpointsDirectory() {
445         try {
446             return getDirectoryRelativeToDiskPath((String JavaDoc) getAttribute(null,
447                     CrawlOrder.ATTR_CHECKPOINTS_PATH));
448         } catch (AttributeNotFoundException JavaDoc e) {
449             // TODO Auto-generated catch block
450
e.printStackTrace();
451             return null;
452         }
453     }
454
455     private File JavaDoc getDirectoryRelativeToDiskPath(String JavaDoc subpath) {
456         File JavaDoc disk;
457         try {
458             disk = getSettingsHandler().getPathRelativeToWorkingDirectory(
459                     (String JavaDoc) getAttribute(null, CrawlOrder.ATTR_DISK_PATH));
460             return new File JavaDoc(disk, subpath);
461         } catch (AttributeNotFoundException JavaDoc e) {
462             // TODO Auto-generated catch block
463
e.printStackTrace();
464             return null;
465         }
466     }
467     
468     /**
469      * Return fullpath to the directory named by <code>key</code>
470      * in settings.
471      * If directory does not exist, it and all intermediary dirs
472      * will be created.
473      * @param key Key to use going to settings.
474      * @return Full path to directory named by <code>key</code>.
475      * @throws AttributeNotFoundException
476      */

477     public File JavaDoc getSettingsDir(String JavaDoc key)
478     throws AttributeNotFoundException JavaDoc {
479         String JavaDoc path = (String JavaDoc)getAttribute(null, key);
480         File JavaDoc f = new File JavaDoc(path);
481         if (!f.isAbsolute()) {
482             f = getDirectoryRelativeToDiskPath(path);
483         }
484         if (!f.exists()) {
485             f.mkdirs();
486         }
487         return f;
488     }
489     
490     
491 }
492
Popular Tags