1 25 package org.archive.crawler.frontier; 26 27 import java.util.logging.Level ; 28 import java.util.logging.Logger ; 29 30 import org.apache.commons.httpclient.URIException; 31 import org.archive.crawler.datamodel.CandidateURI; 32 import org.archive.crawler.framework.CrawlController; 33 import org.archive.net.UURI; 34 import org.archive.net.UURIFactory; 35 36 42 public class HostnameQueueAssignmentPolicy extends QueueAssignmentPolicy { 43 private static final Logger logger = Logger 44 .getLogger(HostnameQueueAssignmentPolicy.class.getName()); 45 48 private static String DEFAULT_CLASS_KEY = "default..."; 49 50 private static final String DNS = "dns"; 51 52 public String getClassKey(CrawlController controller, CandidateURI cauri) { 53 String scheme = cauri.getUURI().getScheme(); 54 String candidate = null; 55 try { 56 if (scheme.equals(DNS)){ 57 if (cauri.getVia() != null) { 58 UURI viaUuri = UURIFactory.getInstance(cauri.flattenVia()); 65 candidate = viaUuri.getAuthorityMinusUserinfo(); 66 scheme = viaUuri.getScheme(); 68 } else { 69 candidate= cauri.getUURI().getReferencedHost(); 70 } 71 } else { 72 candidate = cauri.getUURI().getAuthorityMinusUserinfo(); 73 } 74 75 if(candidate == null || candidate.length() == 0) { 76 candidate = DEFAULT_CLASS_KEY; 77 } 78 } catch (URIException e) { 79 logger.log(Level.INFO, 80 "unable to extract class key; using default", e); 81 candidate = DEFAULT_CLASS_KEY; 82 } 83 if (scheme != null && scheme.equals(UURIFactory.HTTPS)) { 84 if (!candidate.matches(".+:[0-9]+")) { 87 candidate += UURIFactory.HTTPS_PORT; 88 } 89 } 90 return candidate.replace(':','#'); 92 } 93 94 } 95 | Popular Tags |