KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > matuschek > spider > RegExpURLCheck


1 package net.matuschek.spider;
2
3 /************************************************
4     Copyright (c) 2001/2002 by Daniel Matuschek
5  *************************************************/

6
7
8 import java.io.BufferedReader JavaDoc;
9 import java.io.IOException JavaDoc;
10 import java.io.Reader JavaDoc;
11 import java.net.URL JavaDoc;
12 import java.util.StringTokenizer JavaDoc;
13 import java.util.Vector JavaDoc;
14
15 import org.apache.regexp.RESyntaxException;
16
17
18 /**
19  * This URLChecker checks a URL using a list of regular expressions
20  * that should be allowed or denied.
21  *
22  * @author Daniel Matuschek
23  * @version $Revision: 1.4 $
24  */

25 public class RegExpURLCheck
26 implements URLCheck
27 {
28     /** vector to store the rules */
29     private Vector JavaDoc<RegExpRule> rules = null;
30
31     /** default check result if no matching regexp was found */
32     private boolean defaultResult = true;
33
34     /** initializes the object with an empty rule set */
35     public RegExpURLCheck() {
36         rules = new Vector JavaDoc<RegExpRule>();
37     }
38
39     /**
40      * <p>initialized the object with a rule set from an
41      * input stream (e.g. a file)</p>
42      *
43      * <p>every line of this stream has the format
44      * <code>allow|deny expression</code></p>
45      *
46      * <p>default value can be set with
47      * <code>allow|deny .</code> at the end of the file</p>
48      *
49      * <p>lines that start with "#" and empty lines will be
50      * ignored</p>
51      */

52     public RegExpURLCheck(Reader JavaDoc r)
53     throws IOException JavaDoc,
54     org.apache.regexp.RESyntaxException
55     {
56         this();
57
58         BufferedReader JavaDoc reader =
59             new BufferedReader JavaDoc(r);
60
61         String JavaDoc line = "";
62         int lineno=0;
63
64         while (line != null) {
65             line=reader.readLine();
66             lineno++;
67
68             if ((line != null) &&
69                     (! line.trim().equals("")) &&
70                     (! line.startsWith("#"))) {
71                 StringTokenizer JavaDoc st = new StringTokenizer JavaDoc(line);
72                 // did we get 2 tokens ?
73
if (st.countTokens() != 2) {
74                     throw new IOException JavaDoc("line "+lineno+" don't consists of 2 fields");
75                 }
76
77                 String JavaDoc allowStr = st.nextToken();
78                 boolean allow = true;
79                 String JavaDoc expression = st.nextToken();
80
81                 // allow or deny ?
82
if (allowStr.equalsIgnoreCase("allow")) {
83                     allow=true;
84                 } else if (allowStr.equalsIgnoreCase("deny")) {
85                     allow=false;
86                 } else {
87                     throw new IOException JavaDoc("first token in line "+lineno+
88                     " has to be allow or deny");
89                 }
90
91                 addRule(expression,allow);
92             }
93         }
94     }
95
96
97     /**
98      * Sets the default result that will be returned if no matching
99      * regular expression was found
100      * @param default the default result
101      */

102     public void setDefaultResult(boolean defaultResult) {
103         this.defaultResult = defaultResult;
104     }
105
106     /**
107      * Gets the default result that will be returned if no matching
108      * regular expression was found
109      * @return the default result
110      */

111     public boolean getDefaultResult() {
112         return defaultResult;
113     }
114
115     /**
116      * Gets the list of rules
117      * @return a vector of RegExpRule objects
118      */

119     public Vector JavaDoc getRules() {
120         return rules;
121     }
122
123     /**
124      * Sets the list of rules
125      * @param rules a vector of RegExpRule objects
126      */

127     public void setRules(Vector JavaDoc<RegExpRule> rules) {
128         this.rules=rules;
129     }
130
131
132     /**
133      * adds a allow or deny rule
134      * @param regExp a String containing the regular expression
135      * @param allow allow (TRUE) or deny (FALSE)
136      */

137     public void addRule(String JavaDoc regExp, boolean allow)
138     throws RESyntaxException
139     {
140         RegExpRule rule = new RegExpRule();
141         rule.setPattern(regExp);
142         rule.setAllow(allow);
143         rules.add(rule);
144     }
145
146
147     /**
148      * Checks if a given URL is allowed or denied by the rules
149      *
150      * @return true if a matching "allow" rule was found,
151      * false if a matching "deny" rule was found,
152      * the default value if no rule was found
153      * @see #setDefaultResult(boolean)
154      */

155     public boolean checkURL(URL JavaDoc u) {
156         String JavaDoc urlStr = u.toString();
157
158         for (int i=0; i<rules.size(); i++) {
159             RegExpRule rule = rules.elementAt(i);
160
161             if (rule.match(urlStr)) {
162                 return rule.getAllow();
163             }
164         }
165
166         return defaultResult;
167     }
168
169     /**
170      * Checks if a given URL is allowed or denied by the rules for processing
171      *
172      * @return true if a matching "allow" rule was found,
173      * false if a matching "deny" rule was found,
174      * the default value if no rule was found
175      * @see #setDefaultResult(boolean)
176      */

177     public boolean checkURLForProcessing(URL JavaDoc u) {
178         String JavaDoc urlStr = u.toString();
179
180         for (int i=0; i<rules.size(); i++) {
181             RegExpRule rule = rules.elementAt(i);
182
183             if (rule.match(urlStr)) {
184                 return rule.getProcessAllowed();
185             }
186         }
187
188         return defaultResult;
189     }
190
191 } // RegExpURLCheck
192
Popular Tags