KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > deciderules > PathologicalPathDecideRule


1 /* PathologicalPathDecideRule
2 *
3 * $Id: PathologicalPathDecideRule.java,v 1.3.16.1 2007/01/13 01:31:15 stack-sf Exp $
4 *
5 * Created on Apr 1, 2005
6 *
7 * Copyright (C) 2005 Internet Archive.
8 *
9 * This file is part of the Heritrix web crawler (crawler.archive.org).
10 *
11 * Heritrix is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser Public License as published by
13 * the Free Software Foundation; either version 2.1 of the License, or
14 * any later version.
15 *
16 * Heritrix is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser Public License
22 * along with Heritrix; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */

25 package org.archive.crawler.deciderules;
26
27 import java.util.logging.Logger JavaDoc;
28
29 import javax.management.AttributeNotFoundException JavaDoc;
30
31 import org.archive.crawler.settings.SimpleType;
32 import org.archive.crawler.settings.Type;
33
34
35
36 /**
37  * Rule REJECTs any URI which contains an excessive number of identical,
38  * consecutive path-segments (eg http://example.com/a/a/a/boo.html == 3 '/a'
39  * segments)
40  *
41  * @author gojomo
42  */

43 public class PathologicalPathDecideRule extends MatchesRegExpDecideRule {
44
45     private static final long serialVersionUID = -1803997581321178499L;
46
47     private static final Logger JavaDoc logger =
48         Logger.getLogger(PathologicalPathDecideRule.class.getName());
49
50     public static final String JavaDoc ATTR_REPETITIONS = "max-repetitions";
51
52     /**
53      * Default maximum repetitions.
54      * Default access so accessible by unit test.
55      */

56     static final Integer JavaDoc DEFAULT_REPETITIONS = new Integer JavaDoc(2);
57
58     protected String JavaDoc constructedRegexp;
59     
60     /** Constructs a new PathologicalPathFilter.
61      *
62      * @param name the name of the filter.
63      */

64     public PathologicalPathDecideRule(String JavaDoc name) {
65         super(name);
66         setDescription("PathologicalPathDecideRule. This rule" +
67                 " is used to avoid crawler traps by adding a constraint on" +
68                 " how many times a path-segment pattern in the URI may be" +
69                 " repeated. A URI will be REJECTed if the same path-segment" +
70                 " repeats more than '" + ATTR_REPETITIONS + "' in a row.");
71
72         // make default REJECT (overriding superclass) & always-default
73
Type type = addElementToDefinition(new SimpleType(ATTR_DECISION,
74                 "Decision to be applied", REJECT, ALLOWED_TYPES));
75         type.setTransient(true);
76         
77         // disable direct setting of regexp from superclass
78
type = getElementFromDefinition(ATTR_REGEXP);
79         type.setTransient(true);
80         
81         type = addElementToDefinition(new SimpleType(ATTR_REPETITIONS,
82                 "Number of times the pattern should be allowed to occur. " +
83                 "This rule returns its decision (usually REJECT) if a " +
84                 "path-segment is repeated more than number of times.",
85                 DEFAULT_REPETITIONS));
86         // overriding would require reconstruction of regexp every test
87
type.setOverrideable(false);
88     }
89
90     /**
91      * Construct the regexp string to be matched against the URI.
92      * @param o an object to extract a URI from.
93      * @return the regexp pattern.
94      */

95     protected String JavaDoc getRegexp(Object JavaDoc o) {
96         if (constructedRegexp == null) {
97             // race no concern: assignment is atomic, happy with any last value
98
constructedRegexp = constructRegexp();
99         }
100         return constructedRegexp;
101     }
102     
103     protected String JavaDoc constructRegexp() {
104         int rep = 0;
105         try {
106             rep = ((Integer JavaDoc) getAttribute(null, ATTR_REPETITIONS)).intValue();
107         } catch (AttributeNotFoundException JavaDoc e) {
108             logger.severe(e.getMessage());
109         }
110         return (rep == 0) ? null : ".*?/(.*?/)\\1{" + rep + ",}.*";
111     }
112     
113     
114     /**
115      * Repetitions may have changed; refresh constructedRegexp
116      *
117      * @see org.archive.crawler.deciderules.DecideRule#kickUpdate()
118      */

119     public void kickUpdate() {
120         super.kickUpdate();
121         constructedRegexp = constructRegexp();
122     }
123 }
124
Popular Tags