KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > deciderules > ScopePlusOneDecideRule


1 /* ScopePlusOneDecideRule
2 *
3 * Created on Aug 22, 2005
4 *
5 * Copyright 2005 Regents of the University of California, All rights reserved
6 *
7 * This file is part of the Heritrix web crawler (crawler.archive.org).
8 *
9 * Heritrix is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser Public License as published by
11 * the Free Software Foundation; either version 2.1 of the License, or
12 * any later version.
13 *
14 * Heritrix is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser Public License
20 * along with Heritrix; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */

23 package org.archive.crawler.deciderules;
24
25 import java.util.logging.Level JavaDoc;
26 import java.util.logging.Logger JavaDoc;
27
28 import javax.management.AttributeNotFoundException JavaDoc;
29
30 import org.archive.crawler.datamodel.CandidateURI;
31 import org.archive.crawler.settings.SimpleType;
32 import org.archive.net.UURI;
33 import org.archive.util.SurtPrefixSet;
34
35 /**
36  * Rule allows one level of discovery beyond configured scope
37  * (e.g. Domain, plus the first otherwise out-of-scope link from an
38  * in-scope page, but not further hops from that first page)
39  *
40  * @author Shifra Raffel
41  * @version $Date: 2007/01/13 01:31:15 $ $Revision: 1.1.14.1 $
42  */

43 public class ScopePlusOneDecideRule extends SurtPrefixedDecideRule {
44
45     private static final long serialVersionUID = -6344162369024146340L;
46
47     public static final String JavaDoc ATTR_SCOPE = "host-or-domain-scope";
48     public static final String JavaDoc HOST = "Host";
49     public static final String JavaDoc DOMAIN = "Domain";
50     
51     private static final Logger JavaDoc logger =
52         Logger.getLogger(ScopePlusOneDecideRule.class.getName());
53     
54     /**
55      * Constructor.
56      * @param name
57      */

58     public ScopePlusOneDecideRule(String JavaDoc name) {
59         super(name);
60         setDescription(
61             "ScopePlusOneDecideRule. Rule allows one level of discovery " +
62             "beyond configured scope (e.g. Domain, plus the first " +
63             "otherwise out-of-scope link from an in-scope page, but " +
64             "no further hops from that first otherwise-out-of-scope page). " +
65             "surts-source-file is optional. Use surts-dump-file option " +
66             "when testing.");
67         addElementToDefinition(new SimpleType(ATTR_SCOPE,
68             "Restrict to host, e.g. archive.org excludes audio.archive.org, " +
69             "or expand to domain as well, e.g. archive.org includes all " +
70             "*.archive.org", DOMAIN, new String JavaDoc[] {HOST, DOMAIN}));
71     }
72
73     /**
74      * Evaluate whether given object comes from a URI which is in scope
75      *
76      * @param object to evaluate
77      * @return true if URI is either in scope or its via is
78      */

79     protected boolean evaluate(Object JavaDoc object) {
80         boolean result = false;
81         if (!(object instanceof CandidateURI)) {
82             // Can't evaluate if not a candidate URI
83
return false;
84         }
85         SurtPrefixSet set = getPrefixes(object);
86         UURI u = UURI.from(object);
87         // First, is the URI itself in scope?
88
boolean firstResult = isInScope(u, set);
89         if (logger.isLoggable(Level.FINE)) {
90             logger.fine("Tested scope of UURI itself '" + u +
91                         " and result was " + firstResult);
92         }
93         if (firstResult == true) {
94             result = true;
95         } else {
96             // This object is not itself within scope, but
97
// see whether its via might be
98
UURI via = getVia(object);
99             if (via == null) {
100                 // If there is no via and the URL doesn't match scope,reject it
101
return false;
102             }
103             // If the via is within scope, accept it
104
result = isInScope (via, set);
105             if (logger.isLoggable(Level.FINE)) {
106                 logger.fine("Tested via UURI '" + via +
107                         " and result was " + result);
108             }
109         }
110         return result;
111     }
112     
113     /**
114      * Synchronized get of prefix set to use
115      *
116      * @return SurtPrefixSet to use for check
117      *@see org.archive.crawler.deciderules.SurtPrefixedDecideRule#getPrefixes()
118      */

119     protected synchronized SurtPrefixSet getPrefixes() {
120         return getPrefixes(null);
121     }
122     
123     /**
124      * Synchronized get of prefix set to use.
125      * @param o Context object.
126      *
127      * @return SurtPrefixSet to use for check
128      * @see org.archive.crawler.deciderules.SurtPrefixedDecideRule#getPrefixes()
129      */

130     protected synchronized SurtPrefixSet getPrefixes(Object JavaDoc o) {
131         if (surtPrefixes == null) {
132             readPrefixes(o);
133         }
134         return surtPrefixes;
135     }
136     
137     /**
138      * Patch the SURT prefix set so that it only includes the appropriate
139      * prefixes.
140      * @param o Context object.
141      * @see org.archive.crawler.deciderules.SurtPrefixedDecideRule#readPrefixes()
142      */

143     protected void readPrefixes(Object JavaDoc o) {
144         buildSurtPrefixSet();
145         // See whether Host or Domain was chosen
146
String JavaDoc scope = this.getScope(o);
147         if (scope.equals(HOST)){
148             surtPrefixes.convertAllPrefixesToHosts();
149         } else if (scope.equals(DOMAIN)) {
150             surtPrefixes.convertAllPrefixesToDomains();
151         }
152         dumpSurtPrefixSet();
153     }
154     
155     private UURI getVia(Object JavaDoc o){
156         return (o instanceof CandidateURI)? ((CandidateURI)o).getVia(): null;
157     }
158
159     /**
160      * Decide whether using host or domain scope
161      * @param o Context
162      * @return String Host or domain
163      *
164      */

165     protected String JavaDoc getScope(Object JavaDoc o) {
166         try {
167             String JavaDoc scope = (String JavaDoc)getAttribute(o, ATTR_SCOPE);
168             if (scope.equals(HOST)) {
169                 return HOST;
170             } else if (scope.equals(DOMAIN)) {
171                 return DOMAIN;
172             } else {
173                 assert false : "Unrecognized scope " + scope
174                         + ". Should never happen!";
175             }
176         } catch (AttributeNotFoundException JavaDoc e) {
177             logger.severe(e.getMessage());
178         }
179         return null; // Basically the rule is inactive if this occurs.
180
}
181     
182     //check that the URI is in scope
183
private boolean isInScope (Object JavaDoc o, SurtPrefixSet set) {
184         boolean iResult = false;
185         UURI u = (UURI)o;
186         if (u == null) {
187             return false;
188         }
189         String JavaDoc candidateSurt = u.getSurtForm();
190         // also want to treat https as http
191
if (candidateSurt.startsWith("https:")) {
192             candidateSurt = "http:" + candidateSurt.substring(6);
193         }
194         if (set.containsPrefixOf(candidateSurt)){
195             iResult = true;
196         }
197         return iResult;
198     }
199 }
200
Popular Tags