package org.archive.crawler.scope;

import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.deciderules.DecidingScope;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.util.SurtPrefixSet;

/**
 * A scope that accepts URIs whose SURT form begins with one of a configured
 * set of SURT prefixes.
 *
 * <p>Prefixes are collected from an optional source file
 * ({@link #ATTR_SURTS_SOURCE_FILE}) and from the seed file, and may be
 * written out to an optional dump file ({@link #ATTR_SURTS_DUMP_FILE}).
 *
 * <p>The user-visible description of this scope marks it as deprecated in
 * favour of {@link DecidingScope}.
 */
public class SurtPrefixScope extends RefinedScope {

    private static final long serialVersionUID = 2652008287322770123L;

    private static final Logger logger =
        Logger.getLogger(SurtPrefixScope.class.getName());

    /** Setting: path of a file from which SURT prefixes are read. */
    public static final String ATTR_SURTS_SOURCE_FILE = "surts-source-file";

    /** Setting: whether seeds are also interpreted as SURT prefixes. */
    public static final String ATTR_SEEDS_AS_SURT_PREFIXES =
        "seeds-as-surt-prefixes";

    /** Setting: path of a file to which the prefixes in use are written. */
    public static final String ATTR_SURTS_DUMP_FILE = "surts-dump-file";

    private static final Boolean DEFAULT_SEEDS_AS_SURT_PREFIXES = Boolean.TRUE;

    /**
     * Setting: whether a URI is also ruled in-scope when its 'via' URI
     * matches one of the prefixes.
     */
    public static final String ATTR_ALSO_CHECK_VIA = "also-check-via";
    public static final Boolean DEFAULT_ALSO_CHECK_VIA = Boolean.FALSE;

    /**
     * Prefixes currently in effect; {@code null} until the first successful
     * load. Only ever replaced with a completely loaded set.
     */
    SurtPrefixSet surtPrefixes = null;

    /**
     * Creates the scope and registers its settings.
     *
     * @param name name of this scope within the settings framework
     */
    public SurtPrefixScope(String name) {
        super(name);
        setDescription(
            "SurtPrefixScope: A scope for crawls limited to regions of " +
            "the web defined by a set of SURT prefixes *Deprecated* " +
            "Use DecidingScope instead. (The SURT form of " +
            "a URI has its hostname reordered to ease sorting and " +
            "grouping by domain hierarchies.)");
        addElementToDefinition(
            new SimpleType(ATTR_SURTS_SOURCE_FILE,
                "Source file from which to infer SURT prefixes. Any URLs " +
                "in file will be converted to the implied SURT prefix, and " +
                "literal SURT prefixes may be listed on lines beginning " +
                "with a '+' character.",
                ""));
        addElementToDefinition(
            new SimpleType(ATTR_SEEDS_AS_SURT_PREFIXES,
                "Should seeds also be interpreted as SURT prefixes.",
                DEFAULT_SEEDS_AS_SURT_PREFIXES));

        Type t = addElementToDefinition(
            new SimpleType(ATTR_SURTS_DUMP_FILE,
                "Dump file to save SURT prefixes actually used.",
                ""));
        t.setExpertSetting(true);
        t = addElementToDefinition(new SimpleType(ATTR_ALSO_CHECK_VIA,
            "Whether to also rule URI in-scope if a " +
            "URI's 'via' URI (the URI from which it was discovered) " +
            "in SURT form begins with any of the established prefixes. " +
            "For example, can be used to accept URIs that are 'one hop " +
            "off' URIs fitting the SURT prefixes. Default is false.",
            DEFAULT_ALSO_CHECK_VIA));
        t.setOverrideable(false);
        t.setExpertSetting(true);
    }

    /**
     * Initializes the superclass and then loads the SURT prefixes.
     *
     * @param controller the crawl controller this scope belongs to
     * @throws RuntimeException if a prefix file cannot be read or the dump
     *         file cannot be written
     */
    public void initialize(CrawlController controller) {
        super.initialize(controller);
        readPrefixes();
    }

    /**
     * Tests whether the given object falls under one of the SURT prefixes.
     *
     * <p>When {@link #ATTR_ALSO_CHECK_VIA} is enabled and the object is a
     * {@link CandidateURI}, its 'via' is tested first and a match there is
     * sufficient.
     *
     * @param object the candidate to test
     * @return {@code true} if the candidate (or, when enabled, its via) has
     *         a SURT beginning with a known prefix; {@code false} otherwise,
     *         including when no SURT can be derived from the object
     */
    protected synchronized boolean focusAccepts(Object object) {
        if (surtPrefixes == null) {
            // Lazy load, in case initialize() has not populated the set.
            readPrefixes();
        }
        if ((object instanceof CandidateURI)
                && ((Boolean) getUncheckedAttribute(null, ATTR_ALSO_CHECK_VIA))
                    .booleanValue()) {
            if (focusAccepts(((CandidateURI) object).getVia())) {
                return true;
            }
        }
        String candidateSurt = SurtPrefixSet.getCandidateSurt(object);
        if (candidateSurt == null) {
            return false;
        }
        return surtPrefixes.containsPrefixOf(candidateSurt);
    }

    /**
     * Rebuilds the prefix set from the configured source file and the seed
     * file, then writes it to the dump file if one is configured.
     *
     * <p>The new set is assembled in a local variable and only assigned to
     * {@link #surtPrefixes} once both imports have succeeded, so a failed
     * (re)load never leaves a partially populated set in effect.
     *
     * @throws RuntimeException wrapping the {@link IOException} if a file
     *         cannot be read or written
     */
    private void readPrefixes() {
        SurtPrefixSet loaded = new SurtPrefixSet();

        String sourcePath =
            (String) getUncheckedAttribute(null, ATTR_SURTS_SOURCE_FILE);
        if (sourcePath != null && sourcePath.length() > 0) {
            importPrefixes(loaded, resolveAgainstDisk(sourcePath), true);
        }

        boolean deduceFromSeeds =
            ((Boolean) getUncheckedAttribute(null, ATTR_SEEDS_AS_SURT_PREFIXES))
                .booleanValue();
        importPrefixes(loaded, getSeedfile(), deduceFromSeeds);

        // Publish before dumping: the set is complete at this point, and a
        // failure to write the dump should not discard it.
        surtPrefixes = loaded;

        String dumpPath =
            (String) getUncheckedAttribute(null, ATTR_SURTS_DUMP_FILE);
        if (dumpPath != null && dumpPath.length() > 0) {
            dumpPrefixes(loaded, resolveAgainstDisk(dumpPath));
        }
    }

    /**
     * Turns a configured path into a file, resolving relative paths against
     * the controller's disk directory.
     */
    private File resolveAgainstDisk(String path) {
        File file = new File(path);
        if (!file.isAbsolute()) {
            file = new File(getSettingsHandler().getOrder()
                .getController().getDisk(), path);
        }
        return file;
    }

    /**
     * Imports the contents of one file into the given prefix set, always
     * closing the reader.
     *
     * @param deduceFromUris flag handed through to
     *        {@code SurtPrefixSet.importFromMixed}
     */
    private void importPrefixes(SurtPrefixSet target, File source,
            boolean deduceFromUris) {
        try {
            FileReader reader = new FileReader(source);
            try {
                target.importFromMixed(reader, deduceFromUris);
            } finally {
                reader.close();
            }
        } catch (IOException e) {
            String message = "Unable to read SURT prefixes from " + source;
            logger.log(Level.SEVERE, message, e);
            throw new RuntimeException(message, e);
        }
    }

    /** Writes the given prefix set to a file, always closing the writer. */
    private void dumpPrefixes(SurtPrefixSet prefixes, File dump) {
        try {
            FileWriter writer = new FileWriter(dump);
            try {
                prefixes.exportTo(writer);
            } finally {
                writer.close();
            }
        } catch (IOException e) {
            String message = "Unable to write SURT prefixes to " + dump;
            logger.log(Level.SEVERE, message, e);
            throw new RuntimeException(message, e);
        }
    }

    /**
     * Runs the superclass update and then reloads the SURT prefixes.
     *
     * @throws RuntimeException if a prefix file cannot be read or the dump
     *         file cannot be written; the previously loaded prefixes stay in
     *         effect if reading fails
     */
    public synchronized void kickUpdate() {
        super.kickUpdate();
        readPrefixes();
    }
}