KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > filter > PathologicalPathFilter


1 /* PathologicalPathFilter
2  *
3  * $Id: PathologicalPathFilter.java,v 1.11.2.1 2007/01/13 01:31:21 stack-sf Exp $
4  *
5  * Created on Feb 20, 2004
6  *
7  * Copyright (C) 2004 Internet Archive.
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  */

25 package org.archive.crawler.filter;
26
27 import java.util.logging.Logger JavaDoc;
28
29 import javax.management.AttributeNotFoundException JavaDoc;
30
31 import org.archive.crawler.datamodel.CrawlURI;
32 import org.archive.crawler.deciderules.DecideRule;
33 import org.archive.crawler.deciderules.DecidingFilter;
34 import org.archive.crawler.settings.SimpleType;
35 import org.archive.crawler.settings.Type;
36
37 /**
38  * Checks if a URI contains a repeated pattern.
39  *
40  * This filter is checking if a pattern is repeated a specific number of times.
41  * The use is to avoid crawler traps where the server adds the same pattern to
42  * the requested URI like: <code>http://host/img/img/img/img....</code>. This
43  * filter returns TRUE if the path is pathological. FALSE otherwise.
44  *
45  * @author John Erik Halse
46  * @deprecated As of release 1.10.0. Replaced by {@link DecidingFilter} and
47  * equivalent {@link DecideRule}.
48  */

49 public class PathologicalPathFilter extends URIRegExpFilter {
50
51     private static final long serialVersionUID = 2797805167250054353L;
52
53     private static final Logger JavaDoc logger =
54         Logger.getLogger(PathologicalPathFilter.class.getName());
55
56     public static final String JavaDoc ATTR_REPETITIONS = "repetitions";
57
58     public static final Integer JavaDoc DEFAULT_REPETITIONS = new Integer JavaDoc(3);
59     
60     private final String JavaDoc REGEX_PREFIX = ".*?/(.*?/)\\1{";
61     private final String JavaDoc REGEX_SUFFIX = ",}.*";
62
63     /** Constructs a new PathologicalPathFilter.
64      *
65      * @param name the name of the filter.
66      */

67     public PathologicalPathFilter(String JavaDoc name) {
68         super(name);
69         setDescription("Pathological path filter *Deprecated* Use" +
70                 "DecidingFilter and equivalent DecideRule instead. " +
71                 "The Pathologicalpath filter" +
72                 " is used to avoid crawler traps by adding a constraint on" +
73                 " how many times a pattern in the URI could be repeated." +
74                 " Returns false if the path is NOT pathological (There" +
75                 " are no subpath reptitions or reptitions are less than" +
76                 " the '" + ATTR_REPETITIONS + "' limit).");
77
78         Type type = getElementFromDefinition(ATTR_MATCH_RETURN_VALUE);
79         type.setTransient(true);
80
81         type = getElementFromDefinition(ATTR_REGEXP);
82         type.setTransient(true);
83
84         addElementToDefinition(new SimpleType(ATTR_REPETITIONS,
85                 "Number of times the pattern should be allowed to occur. \n" +
86                 "This filter returns true if number of repetitions of a" +
87                 " pattern exceeds this value",
88                 DEFAULT_REPETITIONS));
89     }
90
91     /**
92      * Construct the regexp string to be matched aginst the URI.
93      * @param o an object to extract a URI from.
94      * @return the regexp pattern.
95      */

96     protected String JavaDoc getRegexp(Object JavaDoc o) {
97         int rep = 0;
98         try {
99             rep = ((Integer JavaDoc)getAttribute(o, ATTR_REPETITIONS)).intValue();
100         } catch (AttributeNotFoundException JavaDoc e) {
101             logger.severe(e.getMessage());
102         }
103         return rep == 0? null: REGEX_PREFIX + (rep - 1) + REGEX_SUFFIX;
104     }
105     
106     protected boolean getFilterOffPosition(CrawlURI curi) {
107         return false;
108     }
109 }
110
Popular Tags