KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > deciderules > TransclusionDecideRule


/* TransclusionDecideRule
 *
 * $Id: TransclusionDecideRule.java,v 1.4.18.1 2007/01/13 01:31:15 stack-sf Exp $
 *
 * Created on Apr 1, 2005
 *
 * Copyright (C) 2005 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

25 package org.archive.crawler.deciderules;
26
27 import org.archive.crawler.datamodel.CandidateURI;
28 import org.archive.crawler.extractor.Link;
29 import org.archive.crawler.settings.SimpleType;
30 import org.archive.crawler.settings.Type;
31
32
33
34 /**
35  * Rule ACCEPTs any CrawlURIs whose path-from-seed ('hopsPath' -- see
36  * {@link CandidateURI#getPathFromSeed()}) ends
37  * with at least one, but not more than, the given number of
38  * non-navlink ('L') hops.
39  *
40  * Otherwise, if the path-from-seed is empty or if a navlink ('L') occurs
41  * within max-trans-hops of the tail of the path-from-seed, this rule
42  * returns PASS.
43  *
44  * <p>Thus, it allows things like embedded resources (frames/images/media)
45  * and redirects to be transitively included ('transcluded') in a crawl,
46  * even if they otherwise would not, for some reasonable number of hops
47  * (1-4).
48  *
49  * @see <a HREF="http://www.google.com/search?q=define%3Atransclusion&sourceid=mozilla&start=0&start=0&ie=utf-8&oe=utf-8">Transclusion</a>
50  *
51  * @author gojomo
52  */

53 public class TransclusionDecideRule extends PredicatedDecideRule {
54
55     private static final long serialVersionUID = -3975688876990558918L;
56
57     private static final String JavaDoc ATTR_MAX_TRANS_HOPS = "max-trans-hops";
58     
59     /**
60      * Default maximum hops.
61      * Default access so can be accessed by unit tests.
62      */

63     static final Integer JavaDoc DEFAULT_MAX_TRANS_HOPS = new Integer JavaDoc(3);
64
65     /**
66      * Usual constructor.
67      * @param name Name of this DecideRule.
68      */

69     public TransclusionDecideRule(String JavaDoc name) {
70         super(name);
71         setDescription("TransclusionDecideRule. ACCEPTs URIs whose path " +
72                 "from the seed ends with up to (but not more than) the " +
73                 "configured '" + ATTR_MAX_TRANS_HOPS +
74                 "' number of non-navlink ('L') hops.");
75         // make default ACCEPT unchangeable
76
Type type = getElementFromDefinition(ATTR_DECISION);
77         type.setTransient(true);
78         addElementToDefinition(new SimpleType(ATTR_MAX_TRANS_HOPS,
79             "Maximum number of non-navlink ('L') hops.",
80             DEFAULT_MAX_TRANS_HOPS));
81         // TODO: add expert settings for further penalizing certain hop types,
82
// eg: make speculative hops ('X') count 3x, etc.
83
}
84
85     /**
86      * Evaluate whether given object is within the threshold number of
87      * transitive hops.
88      *
89      * @param object Object to make decision on.
90      * @return true if the transitive hops >0 and <= max
91      */

92     protected boolean evaluate(Object JavaDoc object) {
93         CandidateURI curi = null;
94         try {
95             curi = (CandidateURI)object;
96         } catch (ClassCastException JavaDoc e) {
97             // if not CrawlURI, always disregard.
98
return false;
99         }
100         String JavaDoc hopsPath = curi.getPathFromSeed();
101         if (hopsPath == null || hopsPath.length() == 0) {
102             return false;
103         }
104         int count = 0;
105         for (int i = hopsPath.length() - 1; i >= 0; i--) {
106             if (hopsPath.charAt(i) != Link.NAVLINK_HOP) {
107                 // TODO: count some hops for more (to bias against chains
108
// of them, eg 'X' speculative links that might really be
109
// navlinks)
110
count++;
111             } else {
112                 break;
113             }
114         }
115         return count > 0 && count <= getThresholdHops(object);
116     }
117
118     /**
119      * @param obj Context object.
120      * @return hops cutoff threshold
121      */

122     private int getThresholdHops(Object JavaDoc obj) {
123         return ((Integer JavaDoc)getUncheckedAttribute(obj,ATTR_MAX_TRANS_HOPS)).
124             intValue();
125     }
126 }
127
Popular Tags