package org.archive.crawler.processor;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Iterator;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.settings.SimpleType;
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexpLineIterator;

/**
 * Maps URIs to a named crawler by a lexical comparison of the URI's
 * classKey to a supplied ranges map.
 *
 * <p>The ranges map is read from the location configured under
 * {@link #ATTR_MAP_SOURCE}. Each entry consists of two whitespace-separated
 * tokens: a key marking the end of a range, and the name of the crawler
 * node that URIs falling in that range are mapped to.
 */
public class LexicalCrawlMapper extends CrawlMapper {
    private static final long serialVersionUID = 1L;

    private static final Logger LOGGER =
        Logger.getLogger(LexicalCrawlMapper.class.getName());

    /** Name of the setting holding the path or URL of the map specification. */
    public static final String ATTR_MAP_SOURCE = "map-source";

    /** Default value of {@link #ATTR_MAP_SOURCE}. */
    public static final String DEFAULT_MAP_SOURCE = "";

    /**
     * Range-end key to crawler-node name, kept sorted so the range that
     * contains a given classKey can be found with a tail view.
     */
    TreeMap<String, String> map = new TreeMap<String, String>();

    /**
     * Creates the mapper and registers its {@link #ATTR_MAP_SOURCE} setting.
     *
     * @param name name passed through to the superclass constructor
     */
    public LexicalCrawlMapper(String name) {
        super(name, "LexicalCrawlMapper. Maps URIs to a named " +
            "crawler by a lexical comparison of the URI's " +
            "classKey to a supplied ranges map.");
        addElementToDefinition(new SimpleType(ATTR_MAP_SOURCE,
            "Path (or HTTP URL) to map specification file. Each line " +
            "should include 2 whitespace-separated tokens: the first a " +
            "key indicating the end of a range, the second the crawler " +
            "node to which URIs in the key range should be mapped.",
            DEFAULT_MAP_SOURCE));
    }

    /**
     * Looks up the crawler node for the given URI.
     *
     * <p>The URI's classKey is obtained from the frontier; the value of the
     * first map entry whose key is greater than or equal to that classKey is
     * returned. When the classKey sorts after every key, the lookup wraps
     * around to the first entry of the whole map.
     *
     * @param cauri URI to map
     * @return the value stored for the selected range-end key
     * @throws java.util.NoSuchElementException if no map entries are loaded
     */
    protected String map(CandidateURI cauri) {
        String classKey = getController().getFrontier().getClassKey(cauri);
        // tailMap is inclusive: a classKey equal to a range-end key belongs
        // to that range.
        SortedMap<String, String> tail = map.tailMap(classKey);
        if (tail.isEmpty()) {
            // Past the last range end: wrap around to the first range.
            tail = map;
        }
        return tail.get(tail.firstKey());
    }

    /**
     * Runs the superclass initial tasks, then loads the ranges map.
     *
     * @throws RuntimeException wrapping the {@link IOException} if the map
     *         cannot be loaded
     */
    @Override
    protected void initialTasks() {
        super.initialTasks();
        try {
            loadMap();
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE, "Unable to load crawl map", e);
            throw new RuntimeException(e);
        }
    }

    /**
     * Clears the current map and reloads it from the configured
     * {@link #ATTR_MAP_SOURCE}.
     *
     * <p>Lines are filtered through a {@link RegexpLineIterator}; each
     * resulting entry is split on whitespace and its first two tokens are
     * stored as key and value. The underlying reader is always closed, even
     * when reading or parsing fails.
     *
     * @throws IOException if the source cannot be opened or read, or if an
     *         entry has fewer than two whitespace-separated tokens
     */
    protected void loadMap() throws IOException {
        map.clear();
        String mapSource =
            (String) getUncheckedAttribute(null, ATTR_MAP_SOURCE);
        BufferedReader reader = new BufferedReader(openMapSource(mapSource));
        try {
            Iterator<?> iter =
                new RegexpLineIterator(
                    new LineReadingIterator(reader),
                    RegexpLineIterator.COMMENT_LINE,
                    RegexpLineIterator.TRIMMED_ENTRY_TRAILING_COMMENT,
                    RegexpLineIterator.ENTRY);
            while (iter.hasNext()) {
                String line = (String) iter.next();
                String[] entry = line.split("\\s+");
                if (entry.length < 2) {
                    throw new IOException("Malformed map entry in '"
                        + mapSource + "' (expected 2 whitespace-separated"
                        + " tokens): " + line);
                }
                map.put(entry[0], entry[1]);
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Opens the map specification for reading.
     *
     * <p>A source starting with {@code http://} or {@code https://} is
     * fetched as a URL. Anything else is treated as a file path; a relative
     * path is resolved against the controller's disk directory. Characters
     * are decoded with the platform default charset.
     *
     * @param mapSource configured path or URL of the map specification
     * @return an unbuffered reader over the map specification
     * @throws IOException if the file or URL cannot be opened
     */
    private Reader openMapSource(String mapSource) throws IOException {
        if (mapSource.startsWith("http://")
                || mapSource.startsWith("https://")) {
            URLConnection conn = new URL(mapSource).openConnection();
            return new InputStreamReader(conn.getInputStream());
        }
        File source = new File(mapSource);
        if (!source.isAbsolute()) {
            source = new File(getSettingsHandler().getOrder()
                .getController().getDisk(), mapSource);
        }
        return new FileReader(source);
    }
}