LexicalCrawlMapper


1   /* LexicalCrawlMapper
2    * 
3    * Created on Sep 30, 2005
4    *
5    * Copyright (C) 2005 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.processor;
24  
25  import java.io.BufferedReader  ;
26  import java.io.File  ;
27  import java.io.FileReader  ;
28  import java.io.IOException  ;
29  import java.io.InputStreamReader  ;
30  import java.io.Reader  ;
31  import java.net.URL  ;
32  import java.net.URLConnection  ;
33  import java.util.Iterator  ;
34  import java.util.SortedMap  ;
35  import java.util.TreeMap  ;
36  
37  import org.archive.crawler.datamodel.CandidateURI;
38  import org.archive.crawler.settings.SimpleType;
39  import org.archive.util.iterator.LineReadingIterator;
40  import org.archive.util.iterator.RegexpLineIterator;
41  
42  
43  /**
44   * A simple crawl splitter/mapper, dividing up CandidateURIs/CrawlURIs
45   * between crawlers by diverting some range of URIs to local log files
46   * (which can then be imported to other crawlers). 
47   * 
48   * May operate on a CrawlURI (typically early in the processing chain) or
49   * its CandidateURI outlinks (late in the processing chain, after 
50   * LinksScoper), or both (if inserted and configured in both places). 
51   * 
52   * <p>Uses lexical comparisons of classKeys to map URIs to crawlers. The
53   * 'map' is specified via either a local or HTTP-fetchable file. Each
54   * line of this file should contain two space-separated tokens, the
55   * first a key and the second a crawler node name (which should be
56   * legal as part of a filename). All URIs will be mapped to the crawler
57   * node name associated with the nearest mapping key equal or subsequent 
58   * to the URI's own classKey. If there are no mapping keys equal or 
59   * after the classKey, the mapping 'wraps around' to the first mapping key.
60   * 
61   * <p>One crawler name is distinguished as the 'local name'; URIs mapped to
62   * this name are not diverted, but continue to be processed normally.
63   * 
64   * <p>For example, assume a SurtAuthorityQueueAssignmentPolicy and
65   * a simple mapping file:
66   * 
67   * <pre>
68   *  d crawlerA
69   *  ~ crawlerB
70   * </pre>
71   * <p>All URIs with "com," classKeys will find the 'd' key as the nearest
72   * subsequent mapping key, and thus be mapped to 'crawlerA'. If that's
73   * the 'local name', the URIs will be processed normally; otherwise, the
74   * URI will be written to a diversion log aimed for 'crawlerA'. 
75   * 
76   * <p>If using the JMX importUris operation importing URLs dropped by
77   * a {@link LexicalCrawlMapper} instance, use <code>recoveryLog</code> style.
78   * 
79   * @author gojomo
80   * @version $Date: 2007/01/13 01:31:25 $, $Revision: 1.1.4.1 $
81   */
82  public class LexicalCrawlMapper extends CrawlMapper {
83      private static final long serialVersionUID = 1L;
84      
85      /** where to load map from */
86      public static final String   ATTR_MAP_SOURCE = "map-source";
87      public static final String   DEFAULT_MAP_SOURCE = "";
88      
89      /**
90       * Mapping of classKey ranges (as represented by their start) to 
91       * crawlers (by abstract name/filename)
92       */
93      TreeMap  <String  , String  > map = new TreeMap  <String  , String  >();
94  
95      /**
96       * Constructor.
97       * @param name Name of this processor.
98       */
99      public LexicalCrawlMapper(String   name) {
100         super(name, "LexicalCrawlMapper. Maps URIs to a named " +
101                 "crawler by a lexical comparison of the URI's " +
102                 "classKey to a supplied ranges map.");
103         addElementToDefinition(new SimpleType(ATTR_MAP_SOURCE,
104             "Path (or HTTP URL) to map specification file. Each line " +
105             "should include 2 whitespace-separated tokens: the first a " +
106             "key indicating the end of a range, the second the crawler " +
107             "node to which URIs in the key range should be mapped.",
108             DEFAULT_MAP_SOURCE));
109     }
110 
111     /**
112      * Look up the crawler node name to which the given CandidateURI 
113      * should be mapped. 
114      * 
115      * @param cauri CandidateURI to consider
116      * @return String node name which should handle URI
117      */
118     protected String   map(CandidateURI cauri) {
119         // get classKey, via frontier to generate if necessary
120         String   classKey = getController().getFrontier().getClassKey(cauri);
121         SortedMap   tail = map.tailMap(classKey);
122         if(tail.isEmpty()) {
123             // wraparound
124             tail = map;
125         }
126         // target node is value of nearest subsequent key
127         return (String  ) tail.get(tail.firstKey());
128     }
129 
130     protected void initialTasks() {
131         super.initialTasks();
132         try {
133             loadMap();
134         } catch (IOException   e) {
135             e.printStackTrace();
136             throw new RuntimeException  (e);
137         }
138     }
139 
140     /**
141      * Retrieve and parse the mapping specification from a local path or
142      * HTTP URL. 
143      * 
144      * @throws IOException
145      */
146     protected void loadMap() throws IOException   {
147         map.clear();
148         String   mapSource = (String  ) getUncheckedAttribute(null,ATTR_MAP_SOURCE);
149         Reader   reader = null;
150         if(!mapSource.startsWith("http://")) {
151             // file-based source
152             File   source = new File  (mapSource);
153             if (!source.isAbsolute()) {
154                 source = new File  (getSettingsHandler().getOrder()
155                         .getController().getDisk(), mapSource);
156             }
157             reader = new FileReader  (source);
158         } else {
159             URLConnection   conn = (new URL  (mapSource)).openConnection();
160             reader = new InputStreamReader  (conn.getInputStream());
161         }
162         reader = new BufferedReader  (reader);
163         Iterator   iter = 
164             new RegexpLineIterator(
165                     new LineReadingIterator((BufferedReader  ) reader),
166                     RegexpLineIterator.COMMENT_LINE,
167                     RegexpLineIterator.TRIMMED_ENTRY_TRAILING_COMMENT,
168                     RegexpLineIterator.ENTRY);
169         while (iter.hasNext()) {
170             String  [] entry = ((String  ) iter.next()).split("\\s+");
171             map.put(entry[0],entry[1]);
172         }
173         reader.close();
174     }
175 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags