KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > frontier > SurtAuthorityQueueAssignmentPolicy


1 /* SurtAuthorityQueueAssignmentPolicy
2 *
3 * $Id: SurtAuthorityQueueAssignmentPolicy.java,v 1.5 2005/10/11 23:09:45 gojomo Exp $
4 *
5 * Created on Oct 5, 2004
6 *
7 * Copyright (C) 2004 Internet Archive.
8 *
9 * This file is part of the Heritrix web crawler (crawler.archive.org).
10 *
11 * Heritrix is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser Public License as published by
13 * the Free Software Foundation; either version 2.1 of the License, or
14 * any later version.
15 *
16 * Heritrix is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser Public License
22 * along with Heritrix; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */

25 package org.archive.crawler.frontier;
26
27 import java.util.logging.Level JavaDoc;
28 import java.util.logging.Logger JavaDoc;
29
30 import org.apache.commons.httpclient.URIException;
31 import org.archive.crawler.datamodel.CandidateURI;
32 import org.archive.crawler.framework.CrawlController;
33 import org.archive.net.UURI;
34 import org.archive.net.UURIFactory;
35
36 /**
37  * SurtAuthorityQueueAssignmentPolicy based on the surt form of hostname.
38  */

39 public class SurtAuthorityQueueAssignmentPolicy
40 extends QueueAssignmentPolicy {
41     private static final Logger JavaDoc logger = Logger
42             .getLogger(SurtAuthorityQueueAssignmentPolicy.class.getName());
43     /**
44      * When neat host-based class-key fails us
45      */

46     private static String JavaDoc DEFAULT_CLASS_KEY = "default...";
47     
48     private static final String JavaDoc DNS = "dns";
49
50     public String JavaDoc getClassKey(CrawlController controller, CandidateURI cauri) {
51         String JavaDoc scheme = cauri.getUURI().getScheme();
52         String JavaDoc candidate = null;
53         try {
54             if (scheme.equals(DNS)) {
55                 UURI effectiveuuri;
56                 if (cauri.getVia() != null) {
57                     // Special handling for DNS: treat as being
58
// of the same class as the triggering URI.
59
// When a URI includes a port, this ensures
60
// the DNS lookup goes atop the host:port
61
// queue that triggered it, rather than
62
// some other host queue
63
effectiveuuri = UURIFactory.getInstance(cauri.flattenVia());
64                 } else {
65                     // To get the dns surt form, create a fake http version
66
// (Gordon suggestion).
67
effectiveuuri = UURIFactory.getInstance("http://" +
68                         cauri.getUURI().getPath());
69                 }
70                 candidate = getSurtAuthority(effectiveuuri.getSurtForm());
71             } else {
72                 candidate = getSurtAuthority(cauri.getUURI().getSurtForm());
73             }
74             
75             if(candidate == null || candidate.length() == 0) {
76                 candidate = DEFAULT_CLASS_KEY;
77             }
78         } catch (URIException e) {
79             logger.log(Level.INFO,
80                     "unable to extract class key; using default", e);
81             candidate = DEFAULT_CLASS_KEY;
82         }
83         // Ensure classKeys are safe as filenames on NTFS
84
return candidate.replace(':','#');
85     }
86
87     protected String JavaDoc getSurtAuthority(String JavaDoc surt) {
88         int indexOfOpen = surt.indexOf("://(");
89         int indexOfClose = surt.indexOf(")");
90         if (indexOfOpen == -1 || indexOfClose == -1
91                 || ((indexOfOpen + 4) >= indexOfClose)) {
92             return DEFAULT_CLASS_KEY;
93         }
94         return surt.substring(indexOfOpen + 4, indexOfClose);
95     }
96 }
97
Popular Tags