KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > filter > URIListRegExpFilter


1 /* URIListRegExpFilter
2  *
3  * $Id: URIListRegExpFilter.java,v 1.2.2.1 2007/01/13 01:31:21 stack-sf Exp $
4  *
5  * Created on 30.5.2005
6  *
7  * Copyright (C) 2004 Kristinn Sigurdsson.
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  */

25 package org.archive.crawler.filter;
26
27 import java.util.Iterator JavaDoc;
28 import java.util.List JavaDoc;
29 import java.util.logging.Level JavaDoc;
30 import java.util.logging.Logger JavaDoc;
31
32 import javax.management.AttributeNotFoundException JavaDoc;
33
34 import org.archive.crawler.deciderules.DecideRule;
35 import org.archive.crawler.deciderules.DecidingFilter;
36 import org.archive.crawler.framework.Filter;
37 import org.archive.crawler.settings.SimpleType;
38 import org.archive.crawler.settings.StringList;
39 import org.archive.util.TextUtils;
40
41
42 /**
43 * Compares passed object -- a CrawlURI, UURI, or String --
44 * against regular expressions, accepting matches.
45 * <p>
46 * Can be configured to logically OR or AND the regular expressions.
47 *
48 * @author Kristinn Sigurdsson
49 *
50 * @see org.archive.crawler.filter.URIRegExpFilter
51 * @deprecated As of release 1.10.0. Replaced by {@link DecidingFilter} and
52 * equivalent {@link DecideRule}.
53 */

54 public class URIListRegExpFilter
55 extends Filter {
56
57     private static final long serialVersionUID = -2587977969340783677L;
58     
59     private static final Logger JavaDoc logger =
60        Logger.getLogger(URIListRegExpFilter.class.getName());
61    public static final String JavaDoc ATTR_REGEXP_LIST = "regexp-list";
62    public static final String JavaDoc ATTR_LIST_LOGIC= "list-logic";
63    public static final String JavaDoc ATTR_MATCH_RETURN_VALUE = "if-match-return";
64    
65    public static final String JavaDoc DEFAULT_LIST_LOGIC = "OR";
66    public static final String JavaDoc[] LEGAL_LIST_LOGIC = {"OR","AND"};
67    public static final Boolean JavaDoc DEFAULT_MATCH_RETURN_VALUE = new Boolean JavaDoc(true);
68
69    /**
70     * @param name Filter name.
71     */

72    public URIListRegExpFilter(String JavaDoc name) {
73        super(name, "A filter that uses a list of regular expressions " +
74             "*Deprecated* Use DecidingFilter and equivalent DecideRule " +
75             "instead. Can be " +
76              "optionally either OR or AND based in its evaluation.");
77        addElementToDefinition(
78                new SimpleType(ATTR_MATCH_RETURN_VALUE, "What to return when" +
79                    " regular expression matches. \n",
80                    DEFAULT_MATCH_RETURN_VALUE));
81        addElementToDefinition(
82                new SimpleType(ATTR_LIST_LOGIC, "Should the list of regular " +
83                    "expressions be considered as logically AND or OR when " +
84                    "matching.",
85                    DEFAULT_LIST_LOGIC, LEGAL_LIST_LOGIC));
86        addElementToDefinition(new StringList(ATTR_REGEXP_LIST,"The list of " +
87             "regular expressions to evalute against the URI."));
88    }
89
90    protected boolean innerAccepts(Object JavaDoc o) {
91        List JavaDoc regexps = getRegexp(o);
92        if(regexps.size()==0){
93            return false;
94        }
95        String JavaDoc str = o.toString();
96        Iterator JavaDoc it = regexps.iterator();
97        
98        boolean listLogicOR = isListLogicOR(o);
99        // Result is initialized so that if OR based the default assumption is
100
// false (find no matches) but if AND based the default assumption is
101
// true (finds no non-matches)
102
boolean result = listLogicOR == false;
103        
104        while(it.hasNext()){
105            String JavaDoc regexp = (String JavaDoc)it.next();
106            boolean matches = TextUtils.matches(regexp, str);
107
108            if (logger.isLoggable(Level.FINER)) {
109                logger.finer("Tested '" + str + "' match with regex '" +
110                    regexp + " and result was " + matches);
111            }
112            
113            if(matches){
114                if(listLogicOR){
115                    // OR based and we just got a match, done!
116
result = true;
117                    break;
118                }
119            } else {
120                if(listLogicOR == false){
121                    // AND based and we just found a non-match, done!
122
result = false;
123                    break;
124                }
125            }
126        }
127        
128        result = getMatchReturnValue(o) ? result : !result;
129        
130        if (logger.isLoggable(Level.FINE) && result){
131            logger.fine("Matched: " + str);
132        }
133        
134        return result;
135    }
136
137    /**
138     * Get the regular expressions list to match the URI against.
139     *
140     * @param o the object for which the regular expression should be
141     * matched against.
142     * @return the regular expression to match against.
143     */

144    protected List JavaDoc getRegexp(Object JavaDoc o) {
145        try {
146            return (StringList) getAttribute(o, ATTR_REGEXP_LIST);
147        } catch (AttributeNotFoundException JavaDoc e) {
148            logger.severe(e.getMessage());
149            // Basically the filter is inactive if this occurs
150
// (The caller should be returning false when regexp is null).
151
return null;
152        }
153    }
154    
155    protected boolean getMatchReturnValue(Object JavaDoc o){
156        try {
157            return ((Boolean JavaDoc) getAttribute(o, ATTR_MATCH_RETURN_VALUE)).booleanValue();
158        } catch (AttributeNotFoundException JavaDoc e) {
159            logger.severe(e.getMessage());
160            return DEFAULT_MATCH_RETURN_VALUE.booleanValue();
161        }
162    }
163
164    protected boolean isListLogicOR(Object JavaDoc o){
165        String JavaDoc logic = DEFAULT_LIST_LOGIC;
166        try {
167            logic = (String JavaDoc) getAttribute(o, ATTR_LIST_LOGIC);
168        } catch (AttributeNotFoundException JavaDoc e) {
169            logger.severe(e.getMessage());
170        }
171        return logic.equals("OR") ? true : false;
172    }
173
174 }
175
Popular Tags