KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > frontier > HostnameQueueAssignmentPolicy


1 /* HostnameQueueAssignmentPolicy
2 *
3 * $Id: HostnameQueueAssignmentPolicy.java,v 1.9 2005/09/21 23:00:47 gojomo Exp $
4 *
5 * Created on Oct 5, 2004
6 *
7 * Copyright (C) 2004 Internet Archive.
8 *
9 * This file is part of the Heritrix web crawler (crawler.archive.org).
10 *
11 * Heritrix is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser Public License as published by
13 * the Free Software Foundation; either version 2.1 of the License, or
14 * any later version.
15 *
16 * Heritrix is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser Public License
22 * along with Heritrix; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */

25 package org.archive.crawler.frontier;
26
27 import java.util.logging.Level JavaDoc;
28 import java.util.logging.Logger JavaDoc;
29
30 import org.apache.commons.httpclient.URIException;
31 import org.archive.crawler.datamodel.CandidateURI;
32 import org.archive.crawler.framework.CrawlController;
33 import org.archive.net.UURI;
34 import org.archive.net.UURIFactory;
35
36 /**
37  * QueueAssignmentPolicy based on the hostname:port evident in the given
38  * CrawlURI.
39  *
40  * @author gojomo
41  */

42 public class HostnameQueueAssignmentPolicy extends QueueAssignmentPolicy {
43     private static final Logger JavaDoc logger = Logger
44         .getLogger(HostnameQueueAssignmentPolicy.class.getName());
45     /**
46      * When neat host-based class-key fails us
47      */

48     private static String JavaDoc DEFAULT_CLASS_KEY = "default...";
49     
50     private static final String JavaDoc DNS = "dns";
51
52     public String JavaDoc getClassKey(CrawlController controller, CandidateURI cauri) {
53         String JavaDoc scheme = cauri.getUURI().getScheme();
54         String JavaDoc candidate = null;
55         try {
56             if (scheme.equals(DNS)){
57                 if (cauri.getVia() != null) {
58                     // Special handling for DNS: treat as being
59
// of the same class as the triggering URI.
60
// When a URI includes a port, this ensures
61
// the DNS lookup goes atop the host:port
62
// queue that triggered it, rather than
63
// some other host queue
64
UURI viaUuri = UURIFactory.getInstance(cauri.flattenVia());
65                     candidate = viaUuri.getAuthorityMinusUserinfo();
66                     // adopt scheme of triggering URI
67
scheme = viaUuri.getScheme();
68                 } else {
69                     candidate= cauri.getUURI().getReferencedHost();
70                 }
71             } else {
72                 candidate = cauri.getUURI().getAuthorityMinusUserinfo();
73             }
74             
75             if(candidate == null || candidate.length() == 0) {
76                 candidate = DEFAULT_CLASS_KEY;
77             }
78         } catch (URIException e) {
79             logger.log(Level.INFO,
80                     "unable to extract class key; using default", e);
81             candidate = DEFAULT_CLASS_KEY;
82         }
83         if (scheme != null && scheme.equals(UURIFactory.HTTPS)) {
84             // If https and no port specified, add default https port to
85
// distinguish https from http server without a port.
86
if (!candidate.matches(".+:[0-9]+")) {
87                 candidate += UURIFactory.HTTPS_PORT;
88             }
89         }
90         // Ensure classKeys are safe as filenames on NTFS
91
return candidate.replace(':','#');
92     }
93
94 }
95
Popular Tags