KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > net > RegexURLFilter


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.net;
5
6 import java.io.Reader JavaDoc;
7 import java.io.FileReader JavaDoc;
8 import java.io.BufferedReader JavaDoc;
9 import java.io.InputStreamReader JavaDoc;
10 import java.io.IOException JavaDoc;
11
12 import java.util.List JavaDoc;
13 import java.util.ArrayList JavaDoc;
14 import java.util.Iterator JavaDoc;
15 import java.util.logging.Logger JavaDoc;
16
17 import org.apache.oro.text.regex.Perl5Compiler;
18 import org.apache.oro.text.regex.Perl5Matcher;
19 import org.apache.oro.text.regex.Perl5Pattern;
20 import org.apache.oro.text.regex.Pattern;
21 import org.apache.oro.text.regex.PatternMatcher;
22 import org.apache.oro.text.regex.MalformedPatternException;
23
24 import net.nutch.util.*;
25
26 /** Filters URLs based on a file of regular expressions. The config file is
27  * named by the Nutch configuration property "urlfilter.regex.file".
28  *
29  * <p>The format of this file is:
30  * <pre>
31  * [+-]<regex>
32  * </pre>
33  * where plus means go ahead and index it and minus means no.
34  */

35
36 public class RegexURLFilter implements URLFilter {
37
38   private static final Logger JavaDoc LOG =
39     LogFormatter.getLogger("net.nutch.net.RegexURLFilter");
40
41   private static class Rule {
42     public Perl5Pattern pattern;
43     public boolean sign;
44     public String JavaDoc regex;
45   }
46
47   private List JavaDoc rules;
48   private PatternMatcher matcher = new Perl5Matcher();
49
50   public RegexURLFilter() throws IOException JavaDoc, MalformedPatternException {
51     String JavaDoc file = NutchConf.get("urlfilter.regex.file");
52     Reader JavaDoc reader = NutchConf.getConfResourceAsReader(file);
53
54     if (reader == null) {
55       LOG.severe("Can't find resource: " + file);
56     } else {
57       rules=readConfigurationFile(reader);
58     }
59   }
60
61   public RegexURLFilter(String JavaDoc filename)
62     throws IOException JavaDoc, MalformedPatternException {
63     rules = readConfigurationFile(new FileReader JavaDoc(filename));
64   }
65
66   public synchronized String JavaDoc filter(String JavaDoc url) {
67     Iterator JavaDoc i=rules.iterator();
68     while(i.hasNext()) {
69       Rule r=(Rule) i.next();
70       if (matcher.contains(url,r.pattern)) {
71         //System.out.println("Matched " + r.regex);
72
return r.sign ? url : null;
73       }
74     };
75         
76     return null; // assume no go
77
}
78
79   //
80
// Format of configuration file is
81
//
82
// [+-]<regex>
83
//
84
// where plus means go ahead and index it and minus means no.
85
//
86

87   private static List JavaDoc readConfigurationFile(Reader JavaDoc reader)
88     throws IOException JavaDoc, MalformedPatternException {
89
90     BufferedReader JavaDoc in=new BufferedReader JavaDoc(reader);
91     Perl5Compiler compiler=new Perl5Compiler();
92     List JavaDoc rules=new ArrayList JavaDoc();
93     String JavaDoc line;
94        
95     while((line=in.readLine())!=null) {
96       if (line.length() == 0)
97         continue;
98       char first=line.charAt(0);
99       boolean sign=false;
100       switch (first) {
101       case '+' :
102         sign=true;
103         break;
104       case '-' :
105         sign=false;
106         break;
107       case ' ' : case '\n' : case '#' : // skip blank & comment lines
108
continue;
109       default :
110         throw new IOException JavaDoc("Invalid first character: "+line);
111       }
112
113       String JavaDoc regex=line.substring(1);
114
115       Rule rule=new Rule();
116       rule.pattern=(Perl5Pattern) compiler.compile(regex);
117       rule.sign=sign;
118       rule.regex=regex;
119       rules.add(rule);
120     }
121
122     return rules;
123   }
124
125   public static void main(String JavaDoc args[])
126     throws IOException JavaDoc, MalformedPatternException {
127
128     RegexURLFilter filter=new RegexURLFilter();
129     BufferedReader JavaDoc in=new BufferedReader JavaDoc(new InputStreamReader JavaDoc(System.in));
130     String JavaDoc line;
131     while((line=in.readLine())!=null) {
132       String JavaDoc out=filter.filter(line);
133       if(out!=null) {
134         System.out.print("+");
135         System.out.println(out);
136       } else {
137         System.out.print("-");
138         System.out.println(line);
139       }
140     }
141   }
142
143 }
144
Popular Tags