1 2 3 4 6 package net.nutch.net; 7 8 import java.io.Reader ; 9 import java.io.FileReader ; 10 import java.io.BufferedReader ; 11 import java.io.InputStreamReader ; 12 import java.io.IOException ; 13 14 import java.util.List ; 15 import java.util.ArrayList ; 16 import java.util.Iterator ; 17 import java.util.logging.Logger ; 18 19 import net.nutch.util.*; 20 21 26 public class PrefixURLFilter implements URLFilter { 27 28 private static final Logger LOG = 29 LogFormatter.getLogger("net.nutch.net.PrefixURLFilter"); 30 31 private TrieStringMatcher trie; 32 33 public PrefixURLFilter() throws IOException { 34 String file = NutchConf.get("urlfilter.prefix.file"); 35 Reader reader = NutchConf.getConfResourceAsReader(file); 36 37 if (reader == null) { 38 LOG.severe("Can't find resource: " + file); 39 } else { 40 trie = readConfigurationFile(reader); 41 } 42 } 43 44 public PrefixURLFilter(String filename) throws IOException { 45 trie = readConfigurationFile(new FileReader (filename)); 46 } 47 48 public String filter(String url) { 49 if (trie.shortestMatch(url) == null) 50 return null; 51 else 52 return url; 53 } 54 55 private static TrieStringMatcher readConfigurationFile(Reader reader) 56 throws IOException { 57 58 BufferedReader in=new BufferedReader (reader); 59 List urlprefixes = new ArrayList (); 60 String line; 61 62 while((line=in.readLine())!=null) { 63 if (line.length() == 0) 64 continue; 65 66 char first=line.charAt(0); 67 switch (first) { 68 case ' ' : case '\n' : case '#' : continue; 70 default : 71 urlprefixes.add(line); 72 } 73 } 74 75 return new PrefixStringMatcher(urlprefixes); 76 } 77 78 public static void main(String args[]) 79 throws IOException { 80 81 PrefixURLFilter filter; 82 if (args.length >= 1) 83 filter = new PrefixURLFilter(args[0]); 84 else 85 filter = new PrefixURLFilter(); 86 87 BufferedReader in=new BufferedReader (new InputStreamReader (System.in)); 88 String line; 89 while((line=in.readLine())!=null) { 90 String out=filter.filter(line); 91 if(out!=null) { 92 System.out.println(out); 93 } 94 } 95 } 96 97 } 98 | Popular Tags |