package org.archive.crawler.datamodel;

import java.io.File;
import java.io.Serializable;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.canonicalize.BaseRule;

/**
 * Represents the root of the settings hierarchy. Holds the settings that
 * do not belong to any specific module but rather relate to the crawl as
 * a whole: paths, limits, HTTP headers, processor chains and loggers.
 */
public class CrawlOrder extends ModuleType implements Serializable {

    private static final long serialVersionUID = -6715840285961511669L;

    private static Logger logger =
        Logger.getLogger("org.archive.crawler.datamodel.CrawlOrder");

    public static final String ATTR_NAME = "crawl-order";
    public static final String ATTR_SETTINGS_DIRECTORY = "settings-directory";
    public static final String ATTR_DISK_PATH = "disk-path";
    public static final String ATTR_LOGS_PATH = "logs-path";
    public static final String ATTR_CHECKPOINTS_PATH = "checkpoints-path";
    public static final String ATTR_STATE_PATH = "state-path";
    public static final String ATTR_SCRATCH_PATH = "scratch-path";
    public static final String ATTR_RECOVER_PATH = "recover-path";
    public static final String ATTR_RECOVER_RETAIN_FAILURES =
        "recover-retain-failures";
    public static final String ATTR_MAX_BYTES_DOWNLOAD = "max-bytes-download";
    public static final String ATTR_MAX_DOCUMENT_DOWNLOAD =
        "max-document-download";
    public static final String ATTR_MAX_TIME_SEC = "max-time-sec";
    public static final String ATTR_MAX_TOE_THREADS = "max-toe-threads";
    public static final String ATTR_HTTP_HEADERS = "http-headers";
    public static final String ATTR_USER_AGENT = "user-agent";
    public static final String ATTR_FROM = "from";
    public static final String ATTR_PRE_FETCH_PROCESSORS =
        "pre-fetch-processors";
    public static final String ATTR_FETCH_PROCESSORS = "fetch-processors";
    public static final String ATTR_EXTRACT_PROCESSORS = "extract-processors";
    public static final String ATTR_WRITE_PROCESSORS = "write-processors";
    public static final String ATTR_POST_PROCESSORS = "post-processors";
    public static final String ATTR_LOGGERS = "loggers";
    public static final String ATTR_RULES = "uri-canonicalization-rules";
    public static final String ATTR_RECORDER_OUT_BUFFER =
        "recorder-out-buffer-bytes";
    public static final String ATTR_RECORDER_IN_BUFFER =
        "recorder-in-buffer-bytes";

    /** Percentage of heap to allocate to the BerkeleyDB JE cache. */
    public static final String ATTR_BDB_CACHE_PERCENT =
        "bdb-cache-percent";

    /**
     * Whether to copy bdbje log files into the checkpoint directory on
     * every checkpoint. See the element description in the constructor
     * for the trade-offs involved.
     */
    public static final String ATTR_CHECKPOINT_COPY_BDBJE_LOGS =
        "checkpoint-copy-bdbje-logs";
    public static final Boolean DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS =
        Boolean.TRUE;

    /** Default of zero means no preference (accept BDB JE's own default). */
    private final static Integer DEFAULT_BDB_CACHE_PERCENT = new Integer(0);

    private transient MapType httpHeaders;
    private transient MapType loggers;

    private transient CrawlController controller;

    /**
     * Regex the 'user-agent' setting must match: it has to carry a
     * parenthesized project URL marked with a leading '+'.
     */
    private static String ACCEPTABLE_USER_AGENT =
        "\\S+.*\\(.*\\+http(s)?://\\S+\\.\\S+.*\\).*";

    /** Regex the 'from' setting must match: a plausible e-mail address. */
    private static String ACCEPTABLE_FROM = "\\S+@\\S+\\.\\S+";
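    /*
     * Illustrative check of the two patterns above (editor's example, not
     * part of the original source), using the sample value from the
     * user-agent setting's description below:
     *
     *   "Mozilla/5.0 (compatible; loc-crawler/0.11.0 +http://loc.gov)"
     *       .matches(ACCEPTABLE_USER_AGENT);   // true: has '+http' URL
     *   "Mozilla/5.0 (compatible; loc-crawler/0.11.0)"
     *       .matches(ACCEPTABLE_USER_AGENT);   // false: project URL missing
     *   "webmaster@loc.gov".matches(ACCEPTABLE_FROM);   // true
     */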
    /** Construct a CrawlOrder and define all of its settings. */
    public CrawlOrder() {
        super(ATTR_NAME, "Heritrix crawl order. This forms the root of " +
            "the settings framework.");
        Type e;

        e = addElementToDefinition(new SimpleType(ATTR_SETTINGS_DIRECTORY,
            "Directory where override settings are kept. The settings " +
            "for many modules can be overridden based on the domain or " +
            "subdomain of the URI being processed. This setting specifies" +
            " a file level directory to store those settings. The path" +
            " is relative to 'disk-path' unless" +
            " an absolute path is provided.", "settings"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_DISK_PATH,
            "Directory where logs, arcs and other run time files will " +
            "be kept. If this path is a relative path, it will be " +
            "relative to the crawl order.", ""));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_LOGS_PATH,
            "Directory where crawler log files will be kept. If this path " +
            "is a relative path, it will be relative to the 'disk-path'.",
            "logs"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_CHECKPOINTS_PATH,
            "Directory where crawler checkpoint files will be kept. " +
            "If this path " +
            "is a relative path, it will be relative to the 'disk-path'.",
            "checkpoints"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_STATE_PATH,
            "Directory where crawler-state files will be kept. If this path " +
            "is a relative path, it will be relative to the 'disk-path'.",
            "state"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_SCRATCH_PATH,
            "Directory where discardable temporary files will be kept. " +
            "If this path " +
            "is a relative path, it will be relative to the 'disk-path'.",
            "scratch"));
        e.setOverrideable(false);
        e.setExpertSetting(true);
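        /*
         * With the defaults above, a crawl's working area looks like the
         * sketch below (editor's illustration, derived from the default
         * values of the path settings just defined):
         *
         *   <disk-path>/
         *     settings/      override settings
         *     logs/          crawler log files
         *     checkpoints/   checkpoint files
         *     state/         crawler-state files
         *     scratch/       discardable temporary files
         */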
        e = addElementToDefinition(new SimpleType(ATTR_MAX_BYTES_DOWNLOAD,
            "Maximum number of bytes to download. Once this number is" +
            " exceeded the crawler will stop. " +
            "A value of zero means no upper limit.", new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_MAX_DOCUMENT_DOWNLOAD,
            "Maximum number of documents to download. Once this number" +
            " is exceeded the crawler will stop. " +
            "A value of zero means no upper limit.", new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_MAX_TIME_SEC,
            "Maximum amount of time to crawl (in seconds). Once this" +
            " much time has elapsed the crawler will stop. A value of" +
            " zero means no upper limit.",
            new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_MAX_TOE_THREADS,
            "Maximum number of threads processing URIs at the same time.",
            new Integer(100)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_RECORDER_OUT_BUFFER,
            "Size in bytes of in-memory buffer to record outbound " +
            "traffic. One such buffer is reserved for every ToeThread.",
            new Integer(4096)));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_RECORDER_IN_BUFFER,
            "Size in bytes of in-memory buffer to record inbound " +
            "traffic. One such buffer is reserved for every ToeThread.",
            new Integer(65536)));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_BDB_CACHE_PERCENT,
            "Percentage of heap to allocate to BerkeleyDB JE cache. " +
            "Default of zero means no preference (accept BDB's default, " +
            "usually 60%, or the je.maxMemoryPercent property value).",
            DEFAULT_BDB_CACHE_PERCENT));
        e.setExpertSetting(true);
        e.setOverrideable(false);

        addElementToDefinition(new CrawlScope());

        httpHeaders = (MapType) addElementToDefinition(new MapType(
            ATTR_HTTP_HEADERS, "HTTP headers. Information that will " +
            "be used when constructing the HTTP headers of " +
            "the crawler's HTTP requests."));

        e = httpHeaders.addElementToDefinition(new SimpleType(ATTR_USER_AGENT,
            "User agent to act as. Field must contain a valid URL " +
            "that links to the website of the person or organization " +
            "running the crawl. Replace 'PROJECT_URL_HERE' in the " +
            "initial template. E.g. if the organization " +
            "is the Library of Congress, a valid user agent would be: " +
            "'Mozilla/5.0 (compatible; loc-crawler/0.11.0 " +
            "+http://loc.gov)'. " +
            "Note, you must preserve the '+' before the 'http'.",
            "Mozilla/5.0 (compatible; heritrix/@VERSION@ +PROJECT_URL_HERE)"));

        e = httpHeaders.addElementToDefinition(new SimpleType(ATTR_FROM,
            "Contact information. This field must contain a valid " +
            "e-mail address for the person or organization responsible " +
            "for this crawl: e.g. 'webmaster@loc.gov'.",
            "CONTACT_EMAIL_ADDRESS_HERE"));

        addElementToDefinition(new RobotsHonoringPolicy());

        e = addElementToDefinition(new ModuleType(
            Frontier.ATTR_NAME, "Frontier"));
        e.setLegalValueType(Frontier.class);

        e = (MapType) addElementToDefinition(new MapType(ATTR_RULES,
            "Ordered list of URL canonicalization rules. " +
            "Rules are applied in the order listed from top to bottom.",
            BaseRule.class));
        e.setOverrideable(true);
        e.setExpertSetting(true);

        e = addElementToDefinition(new MapType(
            ATTR_PRE_FETCH_PROCESSORS, "Processors to run prior to" +
            " fetching anything from the network.",
            Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(
            ATTR_FETCH_PROCESSORS, "Processors that fetch documents.",
            Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(
            ATTR_EXTRACT_PROCESSORS, "Processors that extract new URIs" +
            " from fetched documents.", Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(
            ATTR_WRITE_PROCESSORS, "Processors that write documents" +
            " to archives.", Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(
            ATTR_POST_PROCESSORS, "Processors that do cleanup and feed" +
            " the frontier with new URIs.", Processor.class));
        e.setOverrideable(false);

        loggers = (MapType) addElementToDefinition(new MapType(ATTR_LOGGERS,
            "Statistics tracking modules. Any number of specialized " +
            "statistics trackers that monitor a crawl and write logs, " +
            "reports and/or provide information to the user interface."));
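        /*
         * The five processor maps above define the per-URI pipeline; as
         * their descriptions state, every URI flows through the chains in
         * order:
         *
         *   pre-fetch -> fetch -> extract -> write -> post
         */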
        e = addElementToDefinition(new SimpleType(ATTR_RECOVER_PATH,
            "Optional. Points at a recover log (or recover.gz log) OR " +
            "the checkpoint directory to use when recovering a crawl.", ""));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_CHECKPOINT_COPY_BDBJE_LOGS,
            "When true, on a checkpoint, we copy off the bdbje log files to " +
            "the checkpoint directory. To recover a checkpoint, just " +
            "set the " + ATTR_RECOVER_PATH + " to point at the checkpoint " +
            "directory to recover. This is the default setting. " +
            "But if the crawl is large, " +
            "copying bdbje log files can take tens of minutes and even " +
            "upwards of an hour (copying bdbje log files will consume the " +
            "bulk of checkpointing time). If this setting is false, we do " +
            "NOT copy bdbje logs on checkpoint AND we set bdbje to NEVER " +
            "delete log files (instead we have it rename files-to-delete " +
            "with a '.del' extension). The assumption is that when this " +
            "setting is false, an external process is managing the removal " +
            "of bdbje log files and that, come time to recover from a " +
            "checkpoint, the files that comprise a checkpoint are manually " +
            "assembled. This is an expert setting.",
            DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_RECOVER_RETAIN_FAILURES,
            "When recovering via the recover.log, should failures " +
            "in the log be retained in the recovered crawl, " +
            "preventing the corresponding URIs from being retried. " +
            "Default is false, meaning failures are forgotten, and " +
            "the corresponding URIs will be retried in the recovered " +
            "crawl.", Boolean.FALSE));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(
            new CredentialStore(CredentialStore.ATTR_NAME));
        e.setOverrideable(true);
        e.setExpertSetting(true);
    }

    /**
     * @param curi CrawlURI being processed, or null for the global setting.
     * @return The user-agent header value to send.
     */
    public String getUserAgent(CrawlURI curi) {
        return (String) httpHeaders.getUncheckedAttribute(curi,
            ATTR_USER_AGENT);
    }

    /**
     * @param curi CrawlURI being processed, or null for the global setting.
     * @return The 'from' (contact e-mail) header value to send.
     */
    public String getFrom(CrawlURI curi) {
        String res = null;
        try {
            res = (String) httpHeaders.getAttribute(ATTR_FROM, curi);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
        }
        return res;
    }

    /**
     * @return Maximum number of ToeThreads (URI-processing threads).
     */
    public int getMaxToes() {
        Integer res = null;
        try {
            res = (Integer) getAttribute(null, ATTR_MAX_TOE_THREADS);
        } catch (AttributeNotFoundException e) {
            // Should not happen: the attribute is defined with a default
            // in the constructor. If it does, the return below will NPE.
            logger.severe(e.getMessage());
        }
        return res.intValue();
    }

    /**
     * @return This crawl's robots honoring policy, or null if the
     * attribute is unexpectedly missing.
     */
    public RobotsHonoringPolicy getRobotsHonoringPolicy() {
        try {
            return (RobotsHonoringPolicy) getAttribute(null,
                RobotsHonoringPolicy.ATTR_NAME);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
            return null;
        }
    }

    /** @return The name of the global settings object (the crawl order). */
    public String getCrawlOrderName() {
        return getSettingsHandler().getSettingsObject(null).getName();
    }

    /** @return The controller running this crawl, if one has been set. */
    public CrawlController getController() {
        return controller;
    }

    /** @param controller The controller running this crawl. */
    public void setController(CrawlController controller) {
        this.controller = controller;
    }

    /**
     * @return The map of statistics tracking modules defined for this
     * crawl.
     */
    public MapType getLoggers() {
        return loggers;
    }
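    /*
     * Sketch of how the accessors above are typically used (editor's
     * illustration; 'order' stands for a CrawlOrder obtained from a
     * configured settings handler):
     *
     *   String ua   = order.getUserAgent(null);  // global user-agent
     *   String from = order.getFrom(null);       // global contact address
     *   int toes    = order.getMaxToes();        // ToeThread ceiling
     */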
    /**
     * Checks that the 'user-agent' and 'from' settings are of an
     * acceptable format before a crawl is allowed to start.
     * @throws FatalConfigurationException if either value is unacceptable.
     */
    public void checkUserAgentAndFrom() throws FatalConfigurationException {
        // Refuse to start the crawl unless both fields match the
        // required patterns.
        String userAgent = this.getUserAgent(null);
        String from = this.getFrom(null);
        if (!(userAgent.matches(ACCEPTABLE_USER_AGENT)
                && from.matches(ACCEPTABLE_FROM))) {
            throw new FatalConfigurationException("unacceptable user-agent" +
                " or from (re-edit your order file).");
        }
    }

    /**
     * @return The checkpoints directory, resolved relative to 'disk-path',
     * or null if the attribute is unexpectedly missing.
     */
    public File getCheckpointsDirectory() {
        try {
            return getDirectoryRelativeToDiskPath((String) getAttribute(null,
                CrawlOrder.ATTR_CHECKPOINTS_PATH));
        } catch (AttributeNotFoundException e) {
            // Should not happen: the attribute is defined in the constructor.
            e.printStackTrace();
            return null;
        }
    }

    private File getDirectoryRelativeToDiskPath(String subpath) {
        File disk;
        try {
            disk = getSettingsHandler().getPathRelativeToWorkingDirectory(
                (String) getAttribute(null, CrawlOrder.ATTR_DISK_PATH));
            return new File(disk, subpath);
        } catch (AttributeNotFoundException e) {
            // Should not happen: the attribute is defined in the constructor.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Returns the directory named by the given path attribute, resolving
     * relative paths against 'disk-path' and creating the directory if it
     * does not yet exist.
     * @param key Name of a path attribute, e.g. ATTR_SETTINGS_DIRECTORY.
     * @return The resolved directory.
     * @throws AttributeNotFoundException if the attribute is undefined.
     */
    public File getSettingsDir(String key)
    throws AttributeNotFoundException {
        String path = (String) getAttribute(null, key);
        File f = new File(path);
        if (!f.isAbsolute()) {
            f = getDirectoryRelativeToDiskPath(path);
        }
        if (!f.exists()) {
            f.mkdirs();
        }
        return f;
    }
}
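/*
 * Editor's note on path resolution (hypothetical values): with 'disk-path'
 * set to "/crawls/job1" and 'checkpoints-path' left at its default
 * "checkpoints", getCheckpointsDirectory() resolves to
 * "/crawls/job1/checkpoints". An absolute path passed through
 * getSettingsDir(...) is used as-is; a relative one is resolved against
 * 'disk-path' and the directory is created on first access.
 */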