KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > deciderules > TooManyPathSegmentsDecideRule


1 /* TooManyPathSegmentsDecideRule
2 *
3 * $Id: TooManyPathSegmentsDecideRule.java,v 1.3.18.1 2007/01/13 01:31:15 stack-sf Exp $
4 *
5 * Created on Apr 1, 2005
6 *
7 * Copyright (C) 2005 Internet Archive.
8 *
9 * This file is part of the Heritrix web crawler (crawler.archive.org).
10 *
11 * Heritrix is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser Public License as published by
13 * the Free Software Foundation; either version 2.1 of the License, or
14 * any later version.
15 *
16 * Heritrix is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser Public License
22 * along with Heritrix; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */

25 package org.archive.crawler.deciderules;
26
27 import org.archive.crawler.datamodel.CandidateURI;
28 import org.archive.crawler.settings.SimpleType;
29 import org.archive.crawler.settings.Type;
30
31
32
33 /**
34  * Rule REJECTs any CrawlURIs whose total number of path-segments (as
35  * indicated by the count of '/' characters not including the first '//')
36  * is over a given threshold.
37  *
38  * @author gojomo
39  */

40 public class TooManyPathSegmentsDecideRule extends PredicatedDecideRule {
41
42     private static final long serialVersionUID = 147079100367815075L;
43
44     public static final String JavaDoc ATTR_MAX_PATH_DEPTH = "max-path-depth";
45     
46     /**
47      * Default maximum value.
48      * Default access so available to unit test.
49      */

50     static final Integer JavaDoc DEFAULT_MAX_PATH_DEPTH = new Integer JavaDoc(20);
51
52     /**
53      * Usual constructor.
54      * @param name Name of this DecideRule.
55      */

56     public TooManyPathSegmentsDecideRule(String JavaDoc name) {
57         super(name);
58         setDescription("TooManyPathSegmentsDecideRule. REJECTs URIs with " +
59                 "more total path-segments (as indicated by '/' characters) " +
60                 "than the configured '" + ATTR_MAX_PATH_DEPTH + "'.");
61         
62         // make default REJECT (overriding superclass) & always-default
63
Type type = addElementToDefinition(new SimpleType(ATTR_DECISION,
64                 "Decision to be applied", REJECT, ALLOWED_TYPES));
65         type.setTransient(true);
66         
67         addElementToDefinition(new SimpleType(ATTR_MAX_PATH_DEPTH, "Number of" +
68                 " path segments beyond which this rule will reject URIs.",
69                 DEFAULT_MAX_PATH_DEPTH));
70         
71     }
72
73     /**
74      * Evaluate whether given object is over the threshold number of
75      * path-segments.
76      *
77      * @param object
78      * @return true if the path-segments is exceeded
79      */

80     protected boolean evaluate(Object JavaDoc object) {
81         boolean result = false;
82         CandidateURI curi = null;
83         try {
84             curi = (CandidateURI)object;
85         } catch (ClassCastException JavaDoc e) {
86             // if not CrawlURI, always disregard
87
return result;
88         }
89         String JavaDoc uri = curi.toString();
90         int count = 0;
91         int threshold = getThresholdSegments(object);
92         for (int i = 0; i < uri.length(); i++) {
93             if (uri.charAt(i) == '/') {
94                 count++;
95             }
96             if (count > threshold) {
97                 result = true;
98                 break;
99             }
100         }
101         return result;
102     }
103
104     /**
105      * @param obj
106      * @return path-segments cutoff threshold
107      */

108     private int getThresholdSegments(Object JavaDoc obj) {
109         // add 2 for start-of-authority slashes (not path segments)
110
return ((Integer JavaDoc) getUncheckedAttribute(obj, ATTR_MAX_PATH_DEPTH))
111                 .intValue() + 2;
112     }
113 }
114
Popular Tags