KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > processor > HashCrawlMapper


1 /* HashCrawlMapper
2  *
3  * Created on Sep 30, 2005
4  *
5  * Copyright (C) 2005 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.crawler.processor;
24
25 import java.util.regex.Matcher JavaDoc;
26
27 import org.archive.crawler.datamodel.CandidateURI;
28 import org.archive.crawler.settings.SimpleType;
29 import org.archive.util.TextUtils;
30
31 import st.ata.util.FPGenerator;
32
33 /**
34  * Maps URIs to one of N crawler names by applying a hash to the
35  * URI's (possibly-transformed) classKey.
36  *
37  * @author gojomo
38  * @version $Date: 2007/01/13 01:31:25 $, $Revision: 1.2.4.1 $
39  */

40 public class HashCrawlMapper extends CrawlMapper {
41     private static final long serialVersionUID = 1L;
42     
43     /** count of crawlers */
44     public static final String JavaDoc ATTR_CRAWLER_COUNT = "crawler-count";
45     public static final Long JavaDoc DEFAULT_CRAWLER_COUNT = new Long JavaDoc(1);
46
47     /** regex pattern for reducing classKey */
48     public static final String JavaDoc ATTR_REDUCE_PATTERN = "reduce-prefix-pattern";
49     public static final String JavaDoc DEFAULT_REDUCE_PATTERN = "";
50     
51 // /** replace pattern for reducing classKey */
52
// public static final String ATTR_REPLACE_PATTERN = "replace-pattern";
53
// public static final String DEFAULT_REPLACE_PATTERN = "";
54

55     long bucketCount = 1;
56     String JavaDoc reducePattern = null;
57 // String replacePattern = null;
58

59     /**
60      * Constructor.
61      * @param name Name of this processor.
62      */

63     public HashCrawlMapper(String JavaDoc name) {
64         super(name, "HashCrawlMapper. Maps URIs to a numerically named " +
65                 "crawler by hashing the URI's (possibly transfored) " +
66                 "classKey to one of the specified number of buckets.");
67         addElementToDefinition(new SimpleType(ATTR_CRAWLER_COUNT,
68             "Number of crawlers among which to split up the URIs. " +
69             "Their names are assumed to be 0..N-1.",
70             DEFAULT_CRAWLER_COUNT));
71         addElementToDefinition(new SimpleType(ATTR_REDUCE_PATTERN,
72                 "A regex pattern to apply to the classKey, using " +
73                 "the first match as the mapping key. If empty (the" +
74                 "default), use the full classKey.",
75                 DEFAULT_REDUCE_PATTERN));
76     }
77
78     /**
79      * Look up the crawler node name to which the given CandidateURI
80      * should be mapped.
81      *
82      * @param cauri CandidateURI to consider
83      * @return String node name which should handle URI
84      */

85     protected String JavaDoc map(CandidateURI cauri) {
86         // get classKey, via frontier to generate if necessary
87
String JavaDoc key = getController().getFrontier().getClassKey(cauri);
88         return mapString(key, reducePattern, bucketCount);
89     }
90
91     protected void initialTasks() {
92         super.initialTasks();
93         bucketCount = (Long JavaDoc) getUncheckedAttribute(null,ATTR_CRAWLER_COUNT);
94         kickUpdate();
95     }
96
97     @Override JavaDoc
98     public void kickUpdate() {
99         super.kickUpdate();
100         reducePattern = (String JavaDoc)getUncheckedAttribute(null, ATTR_REDUCE_PATTERN);
101     }
102     
103     public static String JavaDoc mapString(String JavaDoc key, String JavaDoc reducePattern, long bucketCount) {
104         if(reducePattern!=null && reducePattern.length()>0) {
105            Matcher JavaDoc matcher = TextUtils.getMatcher(reducePattern,key);
106            if(matcher.find()) {
107                key = matcher.group();
108            }
109            TextUtils.recycleMatcher(matcher);
110         }
111         long fp = FPGenerator.std64.fp(key);
112         long bucket = fp % bucketCount;
113         return Long.toString(bucket >= 0 ? bucket : -bucket);
114     }
115 }
Popular Tags