1 25 package org.archive.crawler.frontier; 26 27 import java.util.logging.Level ; 28 import java.util.logging.Logger ; 29 30 import org.apache.commons.httpclient.URIException; 31 import org.archive.crawler.datamodel.CandidateURI; 32 import org.archive.crawler.framework.CrawlController; 33 import org.archive.net.UURI; 34 import org.archive.net.UURIFactory; 35 36 39 public class SurtAuthorityQueueAssignmentPolicy 40 extends QueueAssignmentPolicy { 41 private static final Logger logger = Logger 42 .getLogger(SurtAuthorityQueueAssignmentPolicy.class.getName()); 43 46 private static String DEFAULT_CLASS_KEY = "default..."; 47 48 private static final String DNS = "dns"; 49 50 public String getClassKey(CrawlController controller, CandidateURI cauri) { 51 String scheme = cauri.getUURI().getScheme(); 52 String candidate = null; 53 try { 54 if (scheme.equals(DNS)) { 55 UURI effectiveuuri; 56 if (cauri.getVia() != null) { 57 effectiveuuri = UURIFactory.getInstance(cauri.flattenVia()); 64 } else { 65 effectiveuuri = UURIFactory.getInstance("http://" + 68 cauri.getUURI().getPath()); 69 } 70 candidate = getSurtAuthority(effectiveuuri.getSurtForm()); 71 } else { 72 candidate = getSurtAuthority(cauri.getUURI().getSurtForm()); 73 } 74 75 if(candidate == null || candidate.length() == 0) { 76 candidate = DEFAULT_CLASS_KEY; 77 } 78 } catch (URIException e) { 79 logger.log(Level.INFO, 80 "unable to extract class key; using default", e); 81 candidate = DEFAULT_CLASS_KEY; 82 } 83 return candidate.replace(':','#'); 85 } 86 87 protected String getSurtAuthority(String surt) { 88 int indexOfOpen = surt.indexOf("://("); 89 int indexOfClose = surt.indexOf(")"); 90 if (indexOfOpen == -1 || indexOfClose == -1 91 || ((indexOfOpen + 4) >= indexOfClose)) { 92 return DEFAULT_CLASS_KEY; 93 } 94 return surt.substring(indexOfOpen + 4, indexOfClose); 95 } 96 } 97 | Popular Tags |