| 1 23 package org.archive.crawler.deciderules; 24 25 import java.util.logging.Level ; 26 import java.util.logging.Logger ; 27 28 import javax.management.AttributeNotFoundException ; 29 30 import org.archive.crawler.datamodel.CandidateURI; 31 import org.archive.crawler.settings.SimpleType; 32 import org.archive.net.UURI; 33 import org.archive.util.SurtPrefixSet; 34 35 43 public class ScopePlusOneDecideRule extends SurtPrefixedDecideRule { 44 45 private static final long serialVersionUID = -6344162369024146340L; 46 47 public static final String ATTR_SCOPE = "host-or-domain-scope"; 48 public static final String HOST = "Host"; 49 public static final String DOMAIN = "Domain"; 50 51 private static final Logger logger = 52 Logger.getLogger(ScopePlusOneDecideRule.class.getName()); 53 54 58 public ScopePlusOneDecideRule(String name) { 59 super(name); 60 setDescription( 61 "ScopePlusOneDecideRule. Rule allows one level of discovery " + 62 "beyond configured scope (e.g. Domain, plus the first " + 63 "otherwise out-of-scope link from an in-scope page, but " + 64 "no further hops from that first otherwise-out-of-scope page). " + 65 "surts-source-file is optional. Use surts-dump-file option " + 66 "when testing."); 67 addElementToDefinition(new SimpleType(ATTR_SCOPE, 68 "Restrict to host, e.g. archive.org excludes audio.archive.org, " + 69 "or expand to domain as well, e.g. archive.org includes all " + 70 "*.archive.org", DOMAIN, new String [] {HOST, DOMAIN})); 71 } 72 73 79 protected boolean evaluate(Object object) { 80 boolean result = false; 81 if (!(object instanceof CandidateURI)) { 82 return false; 84 } 85 SurtPrefixSet set = getPrefixes(object); 86 UURI u = UURI.from(object); 87 boolean firstResult = isInScope(u, set); 89 if (logger.isLoggable(Level.FINE)) { 90 logger.fine("Tested scope of UURI itself '" + u + 91 " and result was " + firstResult); 92 } 93 if (firstResult == true) { 94 result = true; 95 } else { 96 UURI via = getVia(object); 99 if (via == null) { 100 return false; 102 } 103 result = isInScope (via, set); 105 if (logger.isLoggable(Level.FINE)) { 106 logger.fine("Tested via UURI '" + via + 107 " and result was " + result); 108 } 109 } 110 return result; 111 } 112 113 119 protected synchronized SurtPrefixSet getPrefixes() { 120 return getPrefixes(null); 121 } 122 123 130 protected synchronized SurtPrefixSet getPrefixes(Object o) { 131 if (surtPrefixes == null) { 132 readPrefixes(o); 133 } 134 return surtPrefixes; 135 } 136 137 143 protected void readPrefixes(Object o) { 144 buildSurtPrefixSet(); 145 String scope = this.getScope(o); 147 if (scope.equals(HOST)){ 148 surtPrefixes.convertAllPrefixesToHosts(); 149 } else if (scope.equals(DOMAIN)) { 150 surtPrefixes.convertAllPrefixesToDomains(); 151 } 152 dumpSurtPrefixSet(); 153 } 154 155 private UURI getVia(Object o){ 156 return (o instanceof CandidateURI)? ((CandidateURI)o).getVia(): null; 157 } 158 159 165 protected String getScope(Object o) { 166 try { 167 String scope = (String )getAttribute(o, ATTR_SCOPE); 168 if (scope.equals(HOST)) { 169 return HOST; 170 } else if (scope.equals(DOMAIN)) { 171 return DOMAIN; 172 } else { 173 assert false : "Unrecognized scope " + scope 174 + ". Should never happen!"; 175 } 176 } catch (AttributeNotFoundException e) { 177 logger.severe(e.getMessage()); 178 } 179 return null; } 181 182 private boolean isInScope (Object o, SurtPrefixSet set) { 184 boolean iResult = false; 185 UURI u = (UURI)o; 186 if (u == null) { 187 return false; 188 } 189 String candidateSurt = u.getSurtForm(); 190 if (candidateSurt.startsWith("https:")) { 192 candidateSurt = "http:" + candidateSurt.substring(6); 193 } 194 if (set.containsPrefixOf(candidateSurt)){ 195 iResult = true; 196 } 197 return iResult; 198 } 199 } 200 | Popular Tags |