1 23 package org.archive.crawler.processor; 24 25 import java.util.regex.Matcher ; 26 27 import org.archive.crawler.datamodel.CandidateURI; 28 import org.archive.crawler.settings.SimpleType; 29 import org.archive.util.TextUtils; 30 31 import st.ata.util.FPGenerator; 32 33 40 public class HashCrawlMapper extends CrawlMapper { 41 private static final long serialVersionUID = 1L; 42 43 44 public static final String ATTR_CRAWLER_COUNT = "crawler-count"; 45 public static final Long DEFAULT_CRAWLER_COUNT = new Long (1); 46 47 48 public static final String ATTR_REDUCE_PATTERN = "reduce-prefix-pattern"; 49 public static final String DEFAULT_REDUCE_PATTERN = ""; 50 51 55 long bucketCount = 1; 56 String reducePattern = null; 57 59 63 public HashCrawlMapper(String name) { 64 super(name, "HashCrawlMapper. Maps URIs to a numerically named " + 65 "crawler by hashing the URI's (possibly transfored) " + 66 "classKey to one of the specified number of buckets."); 67 addElementToDefinition(new SimpleType(ATTR_CRAWLER_COUNT, 68 "Number of crawlers among which to split up the URIs. " + 69 "Their names are assumed to be 0..N-1.", 70 DEFAULT_CRAWLER_COUNT)); 71 addElementToDefinition(new SimpleType(ATTR_REDUCE_PATTERN, 72 "A regex pattern to apply to the classKey, using " + 73 "the first match as the mapping key. If empty (the" + 74 "default), use the full classKey.", 75 DEFAULT_REDUCE_PATTERN)); 76 } 77 78 85 protected String map(CandidateURI cauri) { 86 String key = getController().getFrontier().getClassKey(cauri); 88 return mapString(key, reducePattern, bucketCount); 89 } 90 91 protected void initialTasks() { 92 super.initialTasks(); 93 bucketCount = (Long ) getUncheckedAttribute(null,ATTR_CRAWLER_COUNT); 94 kickUpdate(); 95 } 96 97 @Override 98 public void kickUpdate() { 99 super.kickUpdate(); 100 reducePattern = (String )getUncheckedAttribute(null, ATTR_REDUCE_PATTERN); 101 } 102 103 public static String mapString(String key, String reducePattern, long bucketCount) { 104 if(reducePattern!=null && reducePattern.length()>0) { 105 Matcher matcher = TextUtils.getMatcher(reducePattern,key); 106 if(matcher.find()) { 107 key = matcher.group(); 108 } 109 TextUtils.recycleMatcher(matcher); 110 } 111 long fp = FPGenerator.std64.fp(key); 112 long bucket = fp % bucketCount; 113 return Long.toString(bucket >= 0 ? bucket : -bucket); 114 } 115 } | Popular Tags |