KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > processor > LexicalCrawlMapper


1 /* LexicalCrawlMapper
2  *
3  * Created on Sep 30, 2005
4  *
5  * Copyright (C) 2005 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.crawler.processor;
24
25 import java.io.BufferedReader JavaDoc;
26 import java.io.File JavaDoc;
27 import java.io.FileReader JavaDoc;
28 import java.io.IOException JavaDoc;
29 import java.io.InputStreamReader JavaDoc;
30 import java.io.Reader JavaDoc;
31 import java.net.URL JavaDoc;
32 import java.net.URLConnection JavaDoc;
33 import java.util.Iterator JavaDoc;
34 import java.util.SortedMap JavaDoc;
35 import java.util.TreeMap JavaDoc;
36
37 import org.archive.crawler.datamodel.CandidateURI;
38 import org.archive.crawler.settings.SimpleType;
39 import org.archive.util.iterator.LineReadingIterator;
40 import org.archive.util.iterator.RegexpLineIterator;
41
42
43 /**
44  * A simple crawl splitter/mapper, dividing up CandidateURIs/CrawlURIs
45  * between crawlers by diverting some range of URIs to local log files
46  * (which can then be imported to other crawlers).
47  *
48  * May operate on a CrawlURI (typically early in the processing chain) or
49  * its CandidateURI outlinks (late in the processing chain, after
50  * LinksScoper), or both (if inserted and configured in both places).
51  *
52  * <p>Uses lexical comparisons of classKeys to map URIs to crawlers. The
53  * 'map' is specified via either a local or HTTP-fetchable file. Each
54  * line of this file should contain two space-separated tokens, the
55  * first a key and the second a crawler node name (which should be
56  * legal as part of a filename). All URIs will be mapped to the crawler
57  * node name associated with the nearest mapping key equal or subsequent
58  * to the URI's own classKey. If there are no mapping keys equal or
59  * after the classKey, the mapping 'wraps around' to the first mapping key.
60  *
61  * <p>One crawler name is distinguished as the 'local name'; URIs mapped to
62  * this name are not diverted, but continue to be processed normally.
63  *
64  * <p>For example, assume a SurtAuthorityQueueAssignmentPolicy and
65  * a simple mapping file:
66  *
67  * <pre>
68  * d crawlerA
69  * ~ crawlerB
70  * </pre>
71  * <p>All URIs with "com," classKeys will find the 'd' key as the nearest
72  * subsequent mapping key, and thus be mapped to 'crawlerA'. If that's
73  * the 'local name', the URIs will be processed normally; otherwise, the
74  * URI will be written to a diversion log aimed for 'crawlerA'.
75  *
76  * <p>If using the JMX importUris operation importing URLs dropped by
77  * a {@link LexicalCrawlMapper} instance, use <code>recoveryLog</code> style.
78  *
79  * @author gojomo
80  * @version $Date: 2007/01/13 01:31:25 $, $Revision: 1.1.4.1 $
81  */

82 public class LexicalCrawlMapper extends CrawlMapper {
83     private static final long serialVersionUID = 1L;
84     
85     /** where to load map from */
86     public static final String JavaDoc ATTR_MAP_SOURCE = "map-source";
87     public static final String JavaDoc DEFAULT_MAP_SOURCE = "";
88     
89     /**
90      * Mapping of classKey ranges (as represented by their start) to
91      * crawlers (by abstract name/filename)
92      */

93     TreeMap JavaDoc<String JavaDoc, String JavaDoc> map = new TreeMap JavaDoc<String JavaDoc, String JavaDoc>();
94
95     /**
96      * Constructor.
97      * @param name Name of this processor.
98      */

99     public LexicalCrawlMapper(String JavaDoc name) {
100         super(name, "LexicalCrawlMapper. Maps URIs to a named " +
101                 "crawler by a lexical comparison of the URI's " +
102                 "classKey to a supplied ranges map.");
103         addElementToDefinition(new SimpleType(ATTR_MAP_SOURCE,
104             "Path (or HTTP URL) to map specification file. Each line " +
105             "should include 2 whitespace-separated tokens: the first a " +
106             "key indicating the end of a range, the second the crawler " +
107             "node to which URIs in the key range should be mapped.",
108             DEFAULT_MAP_SOURCE));
109     }
110
111     /**
112      * Look up the crawler node name to which the given CandidateURI
113      * should be mapped.
114      *
115      * @param cauri CandidateURI to consider
116      * @return String node name which should handle URI
117      */

118     protected String JavaDoc map(CandidateURI cauri) {
119         // get classKey, via frontier to generate if necessary
120
String JavaDoc classKey = getController().getFrontier().getClassKey(cauri);
121         SortedMap JavaDoc tail = map.tailMap(classKey);
122         if(tail.isEmpty()) {
123             // wraparound
124
tail = map;
125         }
126         // target node is value of nearest subsequent key
127
return (String JavaDoc) tail.get(tail.firstKey());
128     }
129
130     protected void initialTasks() {
131         super.initialTasks();
132         try {
133             loadMap();
134         } catch (IOException JavaDoc e) {
135             e.printStackTrace();
136             throw new RuntimeException JavaDoc(e);
137         }
138     }
139
140     /**
141      * Retrieve and parse the mapping specification from a local path or
142      * HTTP URL.
143      *
144      * @throws IOException
145      */

146     protected void loadMap() throws IOException JavaDoc {
147         map.clear();
148         String JavaDoc mapSource = (String JavaDoc) getUncheckedAttribute(null,ATTR_MAP_SOURCE);
149         Reader JavaDoc reader = null;
150         if(!mapSource.startsWith("http://")) {
151             // file-based source
152
File JavaDoc source = new File JavaDoc(mapSource);
153             if (!source.isAbsolute()) {
154                 source = new File JavaDoc(getSettingsHandler().getOrder()
155                         .getController().getDisk(), mapSource);
156             }
157             reader = new FileReader JavaDoc(source);
158         } else {
159             URLConnection JavaDoc conn = (new URL JavaDoc(mapSource)).openConnection();
160             reader = new InputStreamReader JavaDoc(conn.getInputStream());
161         }
162         reader = new BufferedReader JavaDoc(reader);
163         Iterator JavaDoc iter =
164             new RegexpLineIterator(
165                     new LineReadingIterator((BufferedReader JavaDoc) reader),
166                     RegexpLineIterator.COMMENT_LINE,
167                     RegexpLineIterator.TRIMMED_ENTRY_TRAILING_COMMENT,
168                     RegexpLineIterator.ENTRY);
169         while (iter.hasNext()) {
170             String JavaDoc[] entry = ((String JavaDoc) iter.next()).split("\\s+");
171             map.put(entry[0],entry[1]);
172         }
173         reader.close();
174     }
175 }
Popular Tags