package org.archive.crawler.deciderules;

import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.scope.SeedListener;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.util.SurtPrefixSet;

/**
 * Decide rule that makes the configured decision for any URI which, when
 * expressed in SURT form, begins with one of the established prefixes.
 *
 * <p>Prefixes are collected from an optional external source file
 * ({@link #ATTR_SURTS_SOURCE_FILE}) and, when
 * {@link #ATTR_SEEDS_AS_SURT_PREFIXES} is true, from the crawl scope's seed
 * file. The prefix set is built lazily on first use and, when
 * {@link #ATTR_REBUILD_ON_RECONFIG} is true, rebuilt on every
 * {@link #kickUpdate()}.
 *
 * <p>All access to the prefix set from within this class happens inside
 * methods synchronized on this instance.
 */
public class SurtPrefixedDecideRule extends PredicatedDecideRule
        implements SeedListener {

    private static final long serialVersionUID = 2075790126085405015L;

    /** Setting: file from which SURT prefixes are inferred. */
    public static final String ATTR_SURTS_SOURCE_FILE = "surts-source-file";

    /** Setting: whether seeds are also interpreted as SURT prefixes. */
    public static final String ATTR_SEEDS_AS_SURT_PREFIXES =
        "seeds-as-surt-prefixes";

    /** Setting: file to which the prefixes actually in use are dumped. */
    public static final String ATTR_SURTS_DUMP_FILE = "surts-dump-file";

    // Boolean.TRUE rather than new Boolean(true): same value, no needless
    // allocation, and consistent with the other defaults below.
    private static final Boolean DEFAULT_SEEDS_AS_SURT_PREFIXES =
        Boolean.TRUE;

    /**
     * Setting: whether the prefix set is rebuilt from its sources on every
     * configuration change.
     */
    public static final String
        ATTR_REBUILD_ON_RECONFIG = "rebuild-on-reconfig";
    public static final Boolean
        DEFAULT_REBUILD_ON_RECONFIG = Boolean.TRUE;

    /**
     * Setting: whether a URI's 'via' (the URI from which it was discovered)
     * is also tested against the prefixes.
     */
    public static final String
        ATTR_ALSO_CHECK_VIA = "also-check-via";
    public static final Boolean
        DEFAULT_ALSO_CHECK_VIA = Boolean.FALSE;

    /** Prefixes currently in force; null until first built. */
    protected SurtPrefixSet surtPrefixes = null;

    /**
     * Creates the rule and registers its settings.
     *
     * @param name name handed to the superclass constructor
     */
    public SurtPrefixedDecideRule(String name) {
        super(name);
        setDescription("SurtPrefixedDecideRule. Makes the configured decision "
                + "for any URI which, when expressed in SURT form, begins "
                + "with any of the established prefixes (from either seeds "
                + "specification or an external file).");
        addElementToDefinition(new SimpleType(ATTR_SURTS_SOURCE_FILE,
                "Source file from which to infer SURT prefixes. Any URLs " +
                "in file will be converted to the implied SURT prefix, and " +
                "literal SURT prefixes may be listed on lines beginning " +
                "with a '+' character.",
                ""));
        addElementToDefinition(new SimpleType(ATTR_SEEDS_AS_SURT_PREFIXES,
                "Should seeds also be interpreted as SURT prefixes.",
                DEFAULT_SEEDS_AS_SURT_PREFIXES));
        Type t = addElementToDefinition(new SimpleType(ATTR_SURTS_DUMP_FILE,
                "Dump file to save SURT prefixes actually used: " +
                "Useful debugging SURTs.", ""));
        t.setExpertSetting(true);
        t = addElementToDefinition(new SimpleType(ATTR_ALSO_CHECK_VIA,
                "Whether to also make the configured decision if a " +
                "URI's 'via' URI (the URI from which it was discovered) " +
                "in SURT form begins with any of the established prefixes. " +
                "For example, can be used to ACCEPT URIs that are 'one hop " +
                "off' URIs fitting the SURT prefixes. Default is false.",
                DEFAULT_ALSO_CHECK_VIA));
        t.setOverrideable(false);
        t.setExpertSetting(true);
        // Fix: the original concatenation lacked the space after "large",
        // so the description read "largesource files".
        t = addElementToDefinition(new SimpleType(ATTR_REBUILD_ON_RECONFIG,
                "Whether to rebuild the internal structures from source " +
                "files (including seeds if appropriate) every time any " +
                "configuration change occurs. If true, " +
                "rule is rebuilt from sources even when (for example) " +
                "unrelated new domain overrides are set. Rereading large " +
                "source files can take a long time.",
                DEFAULT_REBUILD_ON_RECONFIG));
        t.setOverrideable(false);
        t.setExpertSetting(true);
    }

    /**
     * Tests whether the given object's SURT form begins with one of the
     * established prefixes.
     *
     * <p>When the object is a {@link CandidateURI} and
     * {@link #ATTR_ALSO_CHECK_VIA} is true, its 'via' is tested first and a
     * match there is sufficient.
     *
     * @param object item to test
     * @return true if the item (or, when enabled, its via) matches a prefix;
     *         false if no candidate SURT can be derived or nothing matches
     */
    protected boolean evaluate(Object object) {
        if ((object instanceof CandidateURI)
                && booleanAttribute(ATTR_ALSO_CHECK_VIA)) {
            if (evaluate(((CandidateURI) object).getVia())) {
                return true;
            }
        }
        String candidateSurt = SurtPrefixSet.getCandidateSurt(object);
        if (candidateSurt == null) {
            return false;
        }
        return getPrefixes().containsPrefixOf(candidateSurt);
    }

    /**
     * Returns the prefix set, building it on first use.
     *
     * @return the current prefix set
     */
    private synchronized SurtPrefixSet getPrefixes() {
        if (surtPrefixes == null) {
            readPrefixes();
        }
        return surtPrefixes;
    }

    /**
     * Builds the prefix set from its sources, then writes the dump file if
     * one is configured.
     */
    protected void readPrefixes() {
        buildSurtPrefixSet();
        dumpSurtPrefixSet();
    }

    /**
     * Writes the current prefixes to the file named by
     * {@link #ATTR_SURTS_DUMP_FILE}; does nothing when that setting is empty.
     *
     * @throws RuntimeException wrapping the IOException if the dump file
     *         cannot be written
     */
    protected void dumpSurtPrefixSet() {
        String dumpPath = (String) getUncheckedAttribute(null,
                ATTR_SURTS_DUMP_FILE);
        if (dumpPath.length() == 0) {
            return;
        }
        File dump = resolveAgainstDisk(dumpPath);
        try {
            FileWriter fw = new FileWriter(dump);
            try {
                surtPrefixes.exportTo(fw);
            } finally {
                fw.close();
            }
        } catch (IOException e) {
            // Rethrown once with its cause; the original also printed the
            // stack trace here, reporting the same failure twice.
            throw new RuntimeException(
                    "Unable to write SURT prefixes to " + dump, e);
        }
    }

    /**
     * Builds a fresh prefix set from the configured source file (if any) and
     * from the seed file (if {@link #ATTR_SEEDS_AS_SURT_PREFIXES} is true),
     * then installs it as {@link #surtPrefixes}.
     *
     * <p>The new set is only installed after every source was read, so a
     * failure leaves the previous set (or null) in place.
     *
     * @throws RuntimeException wrapping the IOException if a source cannot
     *         be read
     */
    protected void buildSurtPrefixSet() {
        SurtPrefixSet newSurtPrefixes = new SurtPrefixSet();

        String sourcePath = (String) getUncheckedAttribute(null,
                ATTR_SURTS_SOURCE_FILE);
        if (sourcePath.length() > 0) {
            importPrefixes(newSurtPrefixes, resolveAgainstDisk(sourcePath));
        }

        if (booleanAttribute(ATTR_SEEDS_AS_SURT_PREFIXES)) {
            importPrefixes(newSurtPrefixes, getSeedfile());
        }

        surtPrefixes = newSurtPrefixes;
    }

    /**
     * Reacts to a settings change: after the superclass update, rebuilds the
     * prefix set when {@link #ATTR_REBUILD_ON_RECONFIG} is true.
     */
    public synchronized void kickUpdate() {
        super.kickUpdate();
        if (booleanAttribute(ATTR_REBUILD_ON_RECONFIG)) {
            readPrefixes();
        }
    }

    /**
     * Returns the crawl scope's seed file and, as a side effect, registers
     * this rule as a seed listener on that scope.
     *
     * <p>NOTE(review): registration happens on every call, hence on every
     * rebuild; whether CrawlScope ignores repeat registrations is not
     * visible from this file - confirm.
     *
     * @return the seed file of the controller's scope
     */
    protected File getSeedfile() {
        CrawlScope scope =
            getSettingsHandler().getOrder().getController().getScope();
        scope.addSeedListener(this);
        return scope.getSeedfile();
    }

    /**
     * Seed-listener callback: adds the prefix implied by a newly added seed.
     *
     * <p>A modified copy replaces the current set instead of mutating it in
     * place.
     *
     * @param curi the seed that was added
     */
    public synchronized void addedSeed(final CandidateURI curi) {
        // Fix: go through getPrefixes() so a notification that arrives while
        // surtPrefixes is still null (e.g. the first build failed after the
        // listener was registered) builds the set instead of throwing a
        // NullPointerException.
        SurtPrefixSet newSurtPrefixes = (SurtPrefixSet) getPrefixes().clone();
        newSurtPrefixes.add(prefixFrom(curi.toString()));
        surtPrefixes = newSurtPrefixes;
    }

    /**
     * Converts a plain URI string to its SURT prefix.
     *
     * @param uri plain URI string
     * @return result of {@link SurtPrefixSet#prefixFromPlain(String)}
     */
    protected String prefixFrom(String uri) {
        return SurtPrefixSet.prefixFromPlain(uri);
    }

    /** Reads a Boolean-valued setting of this rule as a primitive. */
    private boolean booleanAttribute(String attributeName) {
        return ((Boolean) getUncheckedAttribute(null, attributeName))
                .booleanValue();
    }

    /** Resolves a non-absolute path against the controller's disk dir. */
    private File resolveAgainstDisk(String path) {
        File file = new File(path);
        if (!file.isAbsolute()) {
            file = new File(getSettingsHandler().getOrder().getController()
                    .getDisk(), path);
        }
        return file;
    }

    /** Imports the prefixes listed in a file into target, closing the file. */
    private void importPrefixes(SurtPrefixSet target, File source) {
        try {
            FileReader fr = new FileReader(source);
            try {
                // Both original call sites passed true as the second
                // argument; its meaning is defined by SurtPrefixSet.
                target.importFromMixed(fr, true);
            } finally {
                fr.close();
            }
        } catch (IOException e) {
            throw new RuntimeException(
                    "Unable to read SURT prefixes from " + source, e);
        }
    }
}