1 24 package org.archive.crawler.framework; 25 26 import java.io.BufferedReader ; 27 import java.io.File ; 28 import java.io.FileReader ; 29 import java.io.FileWriter ; 30 import java.io.IOException ; 31 import java.io.Writer ; 32 import java.util.HashSet ; 33 import java.util.Iterator ; 34 import java.util.List ; 35 import java.util.Set ; 36 import java.util.logging.Logger ; 37 38 import javax.management.AttributeNotFoundException ; 39 import javax.management.MBeanException ; 40 import javax.management.ReflectionException ; 41 42 import org.apache.commons.httpclient.URIException; 43 import org.archive.crawler.datamodel.CandidateURI; 44 import org.archive.crawler.scope.SeedFileIterator; 45 import org.archive.crawler.scope.SeedListener; 46 import org.archive.crawler.settings.CrawlerSettings; 47 import org.archive.crawler.settings.SimpleType; 48 import org.archive.crawler.settings.Type; 49 import org.archive.net.UURI; 50 import org.archive.util.DevUtils; 51 52 75 public class CrawlScope extends Filter { 76 77 private static final long serialVersionUID = -3321533224526211277L; 78 79 private static final Logger logger = 80 Logger.getLogger(CrawlScope.class.getName()); 81 public static final String ATTR_NAME = "scope"; 82 public static final String ATTR_SEEDS = "seedsfile"; 83 84 88 public static final String 89 ATTR_REREAD_SEEDS_ON_CONFIG = "reread-seeds-on-config"; 90 public static final Boolean 91 DEFAULT_REREAD_SEEDS_ON_CONFIG = Boolean.TRUE; 92 93 protected Set <SeedListener> seedListeners = new HashSet <SeedListener>(); 94 95 100 public CrawlScope(String name) { 101 super(ATTR_NAME, "Crawl scope"); 103 Type t; 104 t = addElementToDefinition(new SimpleType(ATTR_SEEDS, 105 "File from which to extract seeds.", "seeds.txt")); 106 t.setOverrideable(false); 107 t.setExpertSetting(true); 108 t = addElementToDefinition(new SimpleType(ATTR_REREAD_SEEDS_ON_CONFIG, 109 "Whether to reread the seeds specification, whether it has " + 110 "changed or not, every time any configuration change occurs. " + 111 "If true, seeds are reread even when (for example) new " + 112 "domain overrides are set. Rereading the seeds can take a " + 113 "long time with large seed lists.", 114 DEFAULT_REREAD_SEEDS_ON_CONFIG)); 115 t.setOverrideable(false); 116 t.setExpertSetting(true); 117 118 } 119 120 122 public CrawlScope() { 123 this(ATTR_NAME); 124 } 125 126 134 public void initialize(CrawlController controller) { 135 } 137 138 public String toString() { 139 return "CrawlScope<" + getName() + ">"; 140 } 141 142 146 public void refreshSeeds() { 147 } 149 150 153 public File getSeedfile() { 154 File file = null; 155 try { 156 file = getSettingsHandler().getPathRelativeToWorkingDirectory( 157 (String )getAttribute(ATTR_SEEDS)); 158 if (!file.exists() || !file.canRead()) { 159 throw new IOException ("Seeds file " + 160 file.getAbsolutePath() + " does not exist or unreadable."); 161 } 162 } catch (IOException e) { 163 DevUtils.warnHandle(e, "problem reading seeds"); 164 } catch (AttributeNotFoundException e) { 165 DevUtils.warnHandle(e, "problem reading seeds"); 166 } catch (MBeanException e) { 167 DevUtils.warnHandle(e, "problem reading seeds"); 168 e.printStackTrace(); 169 } catch (ReflectionException e) { 170 DevUtils.warnHandle(e, "problem reading seeds"); 171 e.printStackTrace(); 172 } 173 174 return file; 175 } 176 177 182 protected boolean isSeed(Object o) { 183 return o instanceof CandidateURI && ((CandidateURI) o).isSeed(); 184 } 185 186 191 protected boolean isSameHost(UURI a, UURI b) { 192 boolean isSameHost = false; 193 if (a != null && b != null) { 194 try { 197 if (a.getReferencedHost() != null && b.getReferencedHost() != null) { 198 if (a.getReferencedHost().equals(b.getReferencedHost())) { 199 isSameHost = true; 200 } 201 } 202 } 203 catch (URIException e) { 204 logger.severe("Failed compare of " + a + " " + b + ": " + 205 e.getMessage()); 206 } 207 } 208 return isSameHost; 209 } 210 211 212 213 216 public void listUsedFiles(List <String > list){ 217 try { 219 File file = getSettingsHandler().getPathRelativeToWorkingDirectory( 220 (String )getAttribute(ATTR_SEEDS)); 221 list.add(file.getAbsolutePath()); 222 } catch (AttributeNotFoundException e) { 223 e.printStackTrace(); 225 } catch (MBeanException e) { 226 e.printStackTrace(); 228 } catch (ReflectionException e) { 229 e.printStackTrace(); 231 } 232 } 233 234 239 public void kickUpdate() { 240 if (((Boolean ) getUncheckedAttribute(null, ATTR_REREAD_SEEDS_ON_CONFIG)) 244 .booleanValue()) { 245 refreshSeeds(); 246 getSettingsHandler().getOrder().getController().getFrontier().loadSeeds(); 247 } 248 } 249 250 257 public Iterator <UURI> seedsIterator() { 258 return seedsIterator(null); 259 } 260 261 269 public Iterator <UURI> seedsIterator(Writer ignoredItemWriter) { 270 BufferedReader br; 271 try { 272 br = new BufferedReader (new FileReader (getSeedfile())); 273 } catch (IOException e) { 274 throw new RuntimeException (e); 275 } 276 return new SeedFileIterator(br,ignoredItemWriter); 277 } 278 279 284 protected void checkClose(Iterator iter) { 285 if(iter instanceof SeedFileIterator) { 286 ((SeedFileIterator)iter).close(); 287 } 288 } 289 290 302 public boolean addSeed(final CandidateURI curi) { 303 File f = getSeedfile(); 304 if (f != null) { 305 try { 306 FileWriter fw = new FileWriter (f, true); 307 fw.write("\n"); 309 fw.write("# Heritrix added seed " + 310 ((curi.getVia() != null) ? "redirect from " + curi.getVia(): 311 "(JMX)") + ".\n"); 312 fw.write(curi.toString()); 313 fw.flush(); 314 fw.close(); 315 Iterator iter = seedListeners.iterator(); 316 while(iter.hasNext()) { 317 ((SeedListener)iter.next()).addedSeed(curi); 318 } 319 return true; 320 } catch (IOException e) { 321 DevUtils.warnHandle(e, "problem writing new seed"); 322 } 323 } 324 return false; 325 } 326 327 public void addSeedListener(SeedListener sl) { 328 seedListeners.add(sl); 329 } 330 } 331 | Popular Tags |