1 2 3 4 package net.nutch.net; 5 6 import java.io.Reader ; 7 import java.io.FileReader ; 8 import java.io.BufferedReader ; 9 import java.io.InputStreamReader ; 10 import java.io.IOException ; 11 12 import java.util.List ; 13 import java.util.ArrayList ; 14 import java.util.Iterator ; 15 import java.util.logging.Logger ; 16 17 import org.apache.oro.text.regex.Perl5Compiler; 18 import org.apache.oro.text.regex.Perl5Matcher; 19 import org.apache.oro.text.regex.Perl5Pattern; 20 import org.apache.oro.text.regex.Pattern; 21 import org.apache.oro.text.regex.PatternMatcher; 22 import org.apache.oro.text.regex.MalformedPatternException; 23 24 import net.nutch.util.*; 25 26 35 36 public class RegexURLFilter implements URLFilter { 37 38 private static final Logger LOG = 39 LogFormatter.getLogger("net.nutch.net.RegexURLFilter"); 40 41 private static class Rule { 42 public Perl5Pattern pattern; 43 public boolean sign; 44 public String regex; 45 } 46 47 private List rules; 48 private PatternMatcher matcher = new Perl5Matcher(); 49 50 public RegexURLFilter() throws IOException , MalformedPatternException { 51 String file = NutchConf.get("urlfilter.regex.file"); 52 Reader reader = NutchConf.getConfResourceAsReader(file); 53 54 if (reader == null) { 55 LOG.severe("Can't find resource: " + file); 56 } else { 57 rules=readConfigurationFile(reader); 58 } 59 } 60 61 public RegexURLFilter(String filename) 62 throws IOException , MalformedPatternException { 63 rules = readConfigurationFile(new FileReader (filename)); 64 } 65 66 public synchronized String filter(String url) { 67 Iterator i=rules.iterator(); 68 while(i.hasNext()) { 69 Rule r=(Rule) i.next(); 70 if (matcher.contains(url,r.pattern)) { 71 return r.sign ? url : null; 73 } 74 }; 75 76 return null; } 78 79 87 private static List readConfigurationFile(Reader reader) 88 throws IOException , MalformedPatternException { 89 90 BufferedReader in=new BufferedReader (reader); 91 Perl5Compiler compiler=new Perl5Compiler(); 92 List rules=new ArrayList (); 93 String line; 94 95 while((line=in.readLine())!=null) { 96 if (line.length() == 0) 97 continue; 98 char first=line.charAt(0); 99 boolean sign=false; 100 switch (first) { 101 case '+' : 102 sign=true; 103 break; 104 case '-' : 105 sign=false; 106 break; 107 case ' ' : case '\n' : case '#' : continue; 109 default : 110 throw new IOException ("Invalid first character: "+line); 111 } 112 113 String regex=line.substring(1); 114 115 Rule rule=new Rule(); 116 rule.pattern=(Perl5Pattern) compiler.compile(regex); 117 rule.sign=sign; 118 rule.regex=regex; 119 rules.add(rule); 120 } 121 122 return rules; 123 } 124 125 public static void main(String args[]) 126 throws IOException , MalformedPatternException { 127 128 RegexURLFilter filter=new RegexURLFilter(); 129 BufferedReader in=new BufferedReader (new InputStreamReader (System.in)); 130 String line; 131 while((line=in.readLine())!=null) { 132 String out=filter.filter(line); 133 if(out!=null) { 134 System.out.print("+"); 135 System.out.println(out); 136 } else { 137 System.out.print("-"); 138 System.out.println(line); 139 } 140 } 141 } 142 143 } 144 | Popular Tags |