package org.archive.crawler.datamodel;

import java.io.File;
import java.io.Serializable;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.canonicalize.BaseRule;

/**
 * Represents the root of the settings hierarchy. Holds the settings that
 * do not belong to any specific module but rather relate to the crawl as
 * a whole: paths, limits, HTTP headers, processor chains and loggers.
 */
public class CrawlOrder extends ModuleType implements Serializable {

    private static final long serialVersionUID = -6715840285961511669L;

    private static Logger logger =
        Logger.getLogger("org.archive.crawler.datamodel.CrawlOrder");

    public static final String ATTR_NAME = "crawl-order";
    public static final String ATTR_SETTINGS_DIRECTORY = "settings-directory";
    public static final String ATTR_DISK_PATH = "disk-path";
    public static final String ATTR_LOGS_PATH = "logs-path";
    public static final String ATTR_CHECKPOINTS_PATH = "checkpoints-path";
    public static final String ATTR_STATE_PATH = "state-path";
    public static final String ATTR_SCRATCH_PATH = "scratch-path";
    public static final String ATTR_RECOVER_PATH = "recover-path";
    public static final String ATTR_RECOVER_RETAIN_FAILURES =
        "recover-retain-failures";
    public static final String ATTR_MAX_BYTES_DOWNLOAD = "max-bytes-download";
    public static final String ATTR_MAX_DOCUMENT_DOWNLOAD =
        "max-document-download";
    public static final String ATTR_MAX_TIME_SEC = "max-time-sec";
    public static final String ATTR_MAX_TOE_THREADS = "max-toe-threads";
    public static final String ATTR_HTTP_HEADERS = "http-headers";
    public static final String ATTR_USER_AGENT = "user-agent";
    public static final String ATTR_FROM = "from";
    public static final String ATTR_PRE_FETCH_PROCESSORS =
        "pre-fetch-processors";
    public static final String ATTR_FETCH_PROCESSORS = "fetch-processors";
    public static final String ATTR_EXTRACT_PROCESSORS = "extract-processors";
    public static final String ATTR_WRITE_PROCESSORS = "write-processors";
    public static final String ATTR_POST_PROCESSORS = "post-processors";
    public static final String ATTR_LOGGERS = "loggers";
    public static final String ATTR_RULES = "uri-canonicalization-rules";
    public static final String ATTR_RECORDER_OUT_BUFFER =
        "recorder-out-buffer-bytes";
    public static final String ATTR_RECORDER_IN_BUFFER =
        "recorder-in-buffer-bytes";

    /** Percentage of heap to allocate to the BerkeleyDB JE cache. */
    public static final String ATTR_BDB_CACHE_PERCENT =
        "bdb-cache-percent";

    /**
     * Whether to copy bdbje log files into the checkpoint directory on
     * every checkpoint. See the element description in the constructor
     * for the trade-offs involved.
     */
    public static final String ATTR_CHECKPOINT_COPY_BDBJE_LOGS =
        "checkpoint-copy-bdbje-logs";
    public static final Boolean DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS =
        Boolean.TRUE;

    /** Default of zero means no preference (accept BDB JE's own default). */
    private final static Integer DEFAULT_BDB_CACHE_PERCENT = new Integer(0);

    private transient MapType httpHeaders;
    private transient MapType loggers;

    private transient CrawlController controller;

    /**
     * Regex the 'user-agent' setting must match: it has to carry a
     * parenthesized project URL marked with a leading '+'.
     */
    private static String ACCEPTABLE_USER_AGENT =
        "\\S+.*\\(.*\\+http(s)?://\\S+\\.\\S+.*\\).*";

    /** Regex the 'from' setting must match: a plausible e-mail address. */
    private static String ACCEPTABLE_FROM = "\\S+@\\S+\\.\\S+";
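    /*
     * Illustrative check of the two patterns above (editor's example, not
     * part of the original source), using the sample value from the
     * user-agent setting's description below:
     *
     *   "Mozilla/5.0 (compatible; loc-crawler/0.11.0 +http://loc.gov)"
     *       .matches(ACCEPTABLE_USER_AGENT);   // true: has '+http' URL
     *   "Mozilla/5.0 (compatible; loc-crawler/0.11.0)"
     *       .matches(ACCEPTABLE_USER_AGENT);   // false: project URL missing
     *   "webmaster@loc.gov".matches(ACCEPTABLE_FROM);   // true
     */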
    /** Construct a CrawlOrder and define all of its settings. */
    public CrawlOrder() {
        super(ATTR_NAME, "Heritrix crawl order. This forms the root of " +
            "the settings framework.");
        Type e;

        e = addElementToDefinition(new SimpleType(ATTR_SETTINGS_DIRECTORY,
            "Directory where override settings are kept. The settings " +
            "for many modules can be overridden based on the domain or " +
            "subdomain of the URI being processed. This setting specifies" +
            " a file level directory to store those settings. The path" +
            " is relative to 'disk-path' unless" +
            " an absolute path is provided.", "settings"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_DISK_PATH,
            "Directory where logs, arcs and other run time files will " +
            "be kept. If this path is a relative path, it will be " +
            "relative to the crawl order.", ""));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_LOGS_PATH,
            "Directory where crawler log files will be kept. If this path " +
            "is a relative path, it will be relative to the 'disk-path'.",
            "logs"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_CHECKPOINTS_PATH,
            "Directory where crawler checkpoint files will be kept. " +
            "If this path " +
            "is a relative path, it will be relative to the 'disk-path'.",
            "checkpoints"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_STATE_PATH,
            "Directory where crawler-state files will be kept. If this path " +
            "is a relative path, it will be relative to the 'disk-path'.",
            "state"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_SCRATCH_PATH,
            "Directory where discardable temporary files will be kept. " +
            "If this path " +
            "is a relative path, it will be relative to the 'disk-path'.",
            "scratch"));
        e.setOverrideable(false);
        e.setExpertSetting(true);
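        /*
         * With the defaults above, a crawl's working area looks like the
         * sketch below (editor's illustration, derived from the default
         * values of the path settings just defined):
         *
         *   <disk-path>/
         *     settings/      override settings
         *     logs/          crawler log files
         *     checkpoints/   checkpoint files
         *     state/         crawler-state files
         *     scratch/       discardable temporary files
         */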
        e = addElementToDefinition(new SimpleType(ATTR_MAX_BYTES_DOWNLOAD,
            "Maximum number of bytes to download. Once this number is" +
            " exceeded the crawler will stop. " +
            "A value of zero means no upper limit.", new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_MAX_DOCUMENT_DOWNLOAD,
            "Maximum number of documents to download. Once this number" +
            " is exceeded the crawler will stop. " +
            "A value of zero means no upper limit.", new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_MAX_TIME_SEC,
            "Maximum amount of time to crawl (in seconds). Once this" +
            " much time has elapsed the crawler will stop. A value of" +
            " zero means no upper limit.",
            new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_MAX_TOE_THREADS,
            "Maximum number of threads processing URIs at the same time.",
            new Integer(100)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_RECORDER_OUT_BUFFER,
            "Size in bytes of in-memory buffer to record outbound " +
            "traffic. One such buffer is reserved for every ToeThread.",
            new Integer(4096)));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_RECORDER_IN_BUFFER,
            "Size in bytes of in-memory buffer to record inbound " +
            "traffic. One such buffer is reserved for every ToeThread.",
            new Integer(65536)));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_BDB_CACHE_PERCENT,
            "Percentage of heap to allocate to BerkeleyDB JE cache. " +
            "Default of zero means no preference (accept BDB's default, " +
            "usually 60%, or the je.maxMemoryPercent property value).",
            DEFAULT_BDB_CACHE_PERCENT));
        e.setExpertSetting(true);
        e.setOverrideable(false);

        addElementToDefinition(new CrawlScope());

        httpHeaders = (MapType) addElementToDefinition(new MapType(
            ATTR_HTTP_HEADERS, "HTTP headers. Information that will " +
            "be used when constructing the HTTP headers of " +
            "the crawler's HTTP requests."));

        e = httpHeaders.addElementToDefinition(new SimpleType(ATTR_USER_AGENT,
            "User agent to act as. Field must contain a valid URL " +
            "that links to the website of the person or organization " +
            "running the crawl. Replace 'PROJECT_URL_HERE' in the " +
            "initial template. E.g. if the organization " +
            "is the Library of Congress, a valid user agent would be: " +
            "'Mozilla/5.0 (compatible; loc-crawler/0.11.0 " +
            "+http://loc.gov)'. " +
            "Note, you must preserve the '+' before the 'http'.",
            "Mozilla/5.0 (compatible; heritrix/@VERSION@ +PROJECT_URL_HERE)"));

        e = httpHeaders.addElementToDefinition(new SimpleType(ATTR_FROM,
            "Contact information. This field must contain a valid " +
            "e-mail address for the person or organization responsible " +
            "for this crawl: e.g. 'webmaster@loc.gov'.",
            "CONTACT_EMAIL_ADDRESS_HERE"));

        addElementToDefinition(new RobotsHonoringPolicy());

        e = addElementToDefinition(new ModuleType(
            Frontier.ATTR_NAME, "Frontier"));
        e.setLegalValueType(Frontier.class);

        e = (MapType) addElementToDefinition(new MapType(ATTR_RULES,
            "Ordered list of URL canonicalization rules. " +
            "Rules are applied in the order listed from top to bottom.",
            BaseRule.class));
        e.setOverrideable(true);
        e.setExpertSetting(true);

        e = addElementToDefinition(new MapType(
            ATTR_PRE_FETCH_PROCESSORS, "Processors to run prior to" +
            " fetching anything from the network.",
            Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(
            ATTR_FETCH_PROCESSORS, "Processors that fetch documents.",
            Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(
            ATTR_EXTRACT_PROCESSORS, "Processors that extract new URIs" +
            " from fetched documents.", Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(
            ATTR_WRITE_PROCESSORS, "Processors that write documents" +
            " to archives.", Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(
            ATTR_POST_PROCESSORS, "Processors that do cleanup and feed" +
            " the frontier with new URIs.", Processor.class));
        e.setOverrideable(false);

        loggers = (MapType) addElementToDefinition(new MapType(ATTR_LOGGERS,
            "Statistics tracking modules. Any number of specialized " +
            "statistics trackers that monitor a crawl and write logs, " +
            "reports and/or provide information to the user interface."));
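        /*
         * The five processor maps above define the per-URI pipeline; as
         * their descriptions state, every URI flows through the chains in
         * order:
         *
         *   pre-fetch -> fetch -> extract -> write -> post
         */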
        e = addElementToDefinition(new SimpleType(ATTR_RECOVER_PATH,
            "Optional. Points at a recover log (or recover.gz log) OR " +
            "the checkpoint directory to use when recovering a crawl.", ""));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_CHECKPOINT_COPY_BDBJE_LOGS,
            "When true, on a checkpoint, we copy off the bdbje log files to " +
            "the checkpoint directory. To recover a checkpoint, just " +
            "set the " + ATTR_RECOVER_PATH + " to point at the checkpoint " +
            "directory to recover. This is the default setting. " +
            "But if the crawl is large, " +
            "copying bdbje log files can take tens of minutes and even " +
            "upwards of an hour (copying bdbje log files will consume the " +
            "bulk of checkpointing time). If this setting is false, we do " +
            "NOT copy bdbje logs on checkpoint AND we set bdbje to NEVER " +
            "delete log files (instead we have it rename files-to-delete " +
            "with a '.del' extension). The assumption is that when this " +
            "setting is false, an external process is managing the removal " +
            "of bdbje log files and that, come time to recover from a " +
            "checkpoint, the files that comprise a checkpoint are manually " +
            "assembled. This is an expert setting.",
            DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_RECOVER_RETAIN_FAILURES,
            "When recovering via the recover.log, should failures " +
            "in the log be retained in the recovered crawl, " +
            "preventing the corresponding URIs from being retried. " +
            "Default is false, meaning failures are forgotten, and " +
            "the corresponding URIs will be retried in the recovered " +
            "crawl.", Boolean.FALSE));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(
            new CredentialStore(CredentialStore.ATTR_NAME));
        e.setOverrideable(true);
        e.setExpertSetting(true);
    }

    /**
     * @param curi CrawlURI being processed, or null for the global setting.
     * @return The user-agent header value to send.
     */
    public String getUserAgent(CrawlURI curi) {
        return (String) httpHeaders.getUncheckedAttribute(curi,
            ATTR_USER_AGENT);
    }

    /**
     * @param curi CrawlURI being processed, or null for the global setting.
     * @return The 'from' (contact e-mail) header value to send.
     */
    public String getFrom(CrawlURI curi) {
        String res = null;
        try {
            res = (String) httpHeaders.getAttribute(ATTR_FROM, curi);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
        }
        return res;
    }

    /**
     * @return Maximum number of ToeThreads (URI-processing threads).
     */
    public int getMaxToes() {
        Integer res = null;
        try {
            res = (Integer) getAttribute(null, ATTR_MAX_TOE_THREADS);
        } catch (AttributeNotFoundException e) {
            // Should not happen: the attribute is defined with a default
            // in the constructor. If it does, the return below will NPE.
            logger.severe(e.getMessage());
        }
        return res.intValue();
    }

    /**
     * @return This crawl's robots honoring policy, or null if the
     * attribute is unexpectedly missing.
     */
    public RobotsHonoringPolicy getRobotsHonoringPolicy() {
        try {
            return (RobotsHonoringPolicy) getAttribute(null,
                RobotsHonoringPolicy.ATTR_NAME);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
            return null;
        }
    }

    /** @return The name of the global settings object (the crawl order). */
    public String getCrawlOrderName() {
        return getSettingsHandler().getSettingsObject(null).getName();
    }

    /** @return The controller running this crawl, if one has been set. */
    public CrawlController getController() {
        return controller;
    }

    /** @param controller The controller running this crawl. */
    public void setController(CrawlController controller) {
        this.controller = controller;
    }

    /**
     * @return The map of statistics tracking modules defined for this
     * crawl.
     */
    public MapType getLoggers() {
        return loggers;
    }
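    /*
     * Sketch of how the accessors above are typically used (editor's
     * illustration; 'order' stands for a CrawlOrder obtained from a
     * configured settings handler):
     *
     *   String ua   = order.getUserAgent(null);  // global user-agent
     *   String from = order.getFrom(null);       // global contact address
     *   int toes    = order.getMaxToes();        // ToeThread ceiling
     */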
    /**
     * Checks that the 'user-agent' and 'from' settings are of an
     * acceptable format before a crawl is allowed to start.
     * @throws FatalConfigurationException if either value is unacceptable.
     */
    public void checkUserAgentAndFrom() throws FatalConfigurationException {
        // Refuse to start the crawl unless both fields match the
        // required patterns.
        String userAgent = this.getUserAgent(null);
        String from = this.getFrom(null);
        if (!(userAgent.matches(ACCEPTABLE_USER_AGENT)
                && from.matches(ACCEPTABLE_FROM))) {
            throw new FatalConfigurationException("unacceptable user-agent" +
                " or from (re-edit your order file).");
        }
    }

    /**
     * @return The checkpoints directory, resolved relative to 'disk-path',
     * or null if the attribute is unexpectedly missing.
     */
    public File getCheckpointsDirectory() {
        try {
            return getDirectoryRelativeToDiskPath((String) getAttribute(null,
                CrawlOrder.ATTR_CHECKPOINTS_PATH));
        } catch (AttributeNotFoundException e) {
            // Should not happen: the attribute is defined in the constructor.
            e.printStackTrace();
            return null;
        }
    }

    private File getDirectoryRelativeToDiskPath(String subpath) {
        File disk;
        try {
            disk = getSettingsHandler().getPathRelativeToWorkingDirectory(
                (String) getAttribute(null, CrawlOrder.ATTR_DISK_PATH));
            return new File(disk, subpath);
        } catch (AttributeNotFoundException e) {
            // Should not happen: the attribute is defined in the constructor.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Returns the directory named by the given path attribute, resolving
     * relative paths against 'disk-path' and creating the directory if it
     * does not yet exist.
     * @param key Name of a path attribute, e.g. ATTR_SETTINGS_DIRECTORY.
     * @return The resolved directory.
     * @throws AttributeNotFoundException if the attribute is undefined.
     */
    public File getSettingsDir(String key)
    throws AttributeNotFoundException {
        String path = (String) getAttribute(null, key);
        File f = new File(path);
        if (!f.isAbsolute()) {
            f = getDirectoryRelativeToDiskPath(path);
        }
        if (!f.exists()) {
            f.mkdirs();
        }
        return f;
    }
}
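/*
 * Editor's note on path resolution (hypothetical values): with 'disk-path'
 * set to "/crawls/job1" and 'checkpoints-path' left at its default
 * "checkpoints", getCheckpointsDirectory() resolves to
 * "/crawls/job1/checkpoints". An absolute path passed through
 * getSettingsDir(...) is used as-is; a relative one is resolved against
 * 'disk-path' and the directory is created on first access.
 */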