KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > deciderules > MatchesListRegExpDecideRule


1 /* MatchesListRegExpDecideRule
2  *
3  * $Id: MatchesListRegExpDecideRule.java,v 1.2.14.1 2007/01/13 01:31:14 stack-sf Exp $
4  *
5  * Created on 30.5.2005
6  *
7  * Copyright (C) 2005 Kristinn Sigurdsson
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  */

25 package org.archive.crawler.deciderules;
26
27 import java.util.Iterator JavaDoc;
28 import java.util.List JavaDoc;
29 import java.util.logging.Level JavaDoc;
30 import java.util.logging.Logger JavaDoc;
31
32 import javax.management.AttributeNotFoundException JavaDoc;
33
34 import org.archive.crawler.settings.SimpleType;
35 import org.archive.crawler.settings.StringList;
36 import org.archive.util.TextUtils;
37
38
39 /**
40  * Rule applies configured decision to any CrawlURIs whose String URI
41  * matches the supplied regexps.
42  * <p>
43  * The list of regular expressions can be considered logically AND or OR.
44  *
45  * @author Kristinn Sigurdsson
46  *
47  * @see MatchesRegExpDecideRule
48  */

49 public class MatchesListRegExpDecideRule extends PredicatedDecideRule {
50
51     private static final long serialVersionUID = 3011579758573454930L;
52
53     private static final Logger JavaDoc logger =
54         Logger.getLogger(MatchesListRegExpDecideRule.class.getName());
55     
56     public static final String JavaDoc ATTR_REGEXP_LIST = "regexp-list";
57     public static final String JavaDoc ATTR_LIST_LOGIC= "list-logic";
58     
59     public static final String JavaDoc DEFAULT_LIST_LOGIC = "OR";
60     public static final String JavaDoc[] LEGAL_LIST_LOGIC = {"OR","AND"};
61
62     /**
63      * Usual constructor.
64      * @param name
65      */

66     public MatchesListRegExpDecideRule(String JavaDoc name) {
67         super(name);
68         setDescription("MatchesListRegExpDecideRule. Applies the configured " +
69             "decision to URIs matching the supplied regular expressions.\n" +
70             "The list of regular expressions can be considered logically AND " +
71             "or OR.");
72         addElementToDefinition(
73                 new SimpleType(ATTR_LIST_LOGIC, "Should the list of regular " +
74                     "expressions be considered as logically AND or OR when " +
75                     "matching.",
76                     DEFAULT_LIST_LOGIC, LEGAL_LIST_LOGIC));
77         addElementToDefinition(new StringList(ATTR_REGEXP_LIST,"The list of " +
78              "regular expressions to evalute against the URI."));
79     }
80
81     /**
82      * Evaluate whether given object's string version
83      * matches configured regexps
84      *
85      * @param o
86      * @return true if regexps are matched
87      */

88     protected boolean evaluate(Object JavaDoc o) {
89         try {
90             List JavaDoc regexps = getRegexp(o);
91             if(regexps.size()==0){
92                 return false;
93             }
94             String JavaDoc str = o.toString();
95             Iterator JavaDoc it = regexps.iterator();
96             
97             boolean listLogicOR = isListLogicOR(o);
98             // Result is initialized so that if OR based the default assumption is
99
// false (find no matches) but if AND based the default assumption is
100
// true (finds no non-matches)
101
boolean result = listLogicOR == false;
102             
103             while(it.hasNext()){
104                 String JavaDoc regexp = (String JavaDoc)it.next();
105                 boolean matches = TextUtils.matches(regexp, str);
106
107                 if (logger.isLoggable(Level.FINER)) {
108                     logger.finer("Tested '" + str + "' match with regex '" +
109                         regexp + " and result was " + matches);
110                 }
111                 
112                 if(matches){
113                     if(listLogicOR){
114                         // OR based and we just got a match, done!
115
result = true;
116                         break;
117                     }
118                 } else {
119                     if(listLogicOR == false){
120                         // AND based and we just found a non-match, done!
121
result = false;
122                         break;
123                     }
124                 }
125             }
126             
127             if (logger.isLoggable(Level.FINE) && result){
128                 logger.fine("Matched: " + str);
129             }
130             
131             return result;
132         } catch (ClassCastException JavaDoc e) {
133             // if not CrawlURI, always disregard
134
return false;
135         }
136     }
137     
138     /**
139      * Get the regular expressions list to match the URI against.
140      *
141      * @param o the object for which the regular expression should be
142      * matched against.
143      * @return the regular expression to match against.
144      */

145     protected List JavaDoc getRegexp(Object JavaDoc o) {
146         try {
147             return (StringList) getAttribute(o, ATTR_REGEXP_LIST);
148         } catch (AttributeNotFoundException JavaDoc e) {
149             logger.severe(e.getMessage());
150             // Basically the filter is inactive if this occurs
151
// (The caller should be returning false when regexp is null).
152
return null;
153         }
154     }
155     
156     protected boolean isListLogicOR(Object JavaDoc o){
157         String JavaDoc logic = DEFAULT_LIST_LOGIC;
158         try {
159             logic = (String JavaDoc) getAttribute(o, ATTR_LIST_LOGIC);
160         } catch (AttributeNotFoundException JavaDoc e) {
161             logger.severe(e.getMessage());
162         }
163         return logic.equals("OR") ? true : false;
164     }
165 }
Popular Tags