KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > net > PrefixURLFilter


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 // $Id: PrefixURLFilter.java,v 1.1 2004/05/20 19:26:02 cutting Exp $
5

6 package net.nutch.net;
7
8 import java.io.Reader JavaDoc;
9 import java.io.FileReader JavaDoc;
10 import java.io.BufferedReader JavaDoc;
11 import java.io.InputStreamReader JavaDoc;
12 import java.io.IOException JavaDoc;
13
14 import java.util.List JavaDoc;
15 import java.util.ArrayList JavaDoc;
16 import java.util.Iterator JavaDoc;
17 import java.util.logging.Logger JavaDoc;
18
19 import net.nutch.util.*;
20
21 /** Filters URLs based on a file of URL prefixes. The config file is
22  * named by the Nutch configuration property "urlfilter.prefix.file".
23  *
24  * <p>The format of this file is one URL per line.</p>
25  */

26 public class PrefixURLFilter implements URLFilter {
27
28   private static final Logger JavaDoc LOG =
29     LogFormatter.getLogger("net.nutch.net.PrefixURLFilter");
30
31   private TrieStringMatcher trie;
32
33   public PrefixURLFilter() throws IOException JavaDoc {
34     String JavaDoc file = NutchConf.get("urlfilter.prefix.file");
35     Reader JavaDoc reader = NutchConf.getConfResourceAsReader(file);
36
37     if (reader == null) {
38       LOG.severe("Can't find resource: " + file);
39     } else {
40       trie = readConfigurationFile(reader);
41     }
42   }
43
44   public PrefixURLFilter(String JavaDoc filename) throws IOException JavaDoc {
45     trie = readConfigurationFile(new FileReader JavaDoc(filename));
46   }
47
48   public String JavaDoc filter(String JavaDoc url) {
49     if (trie.shortestMatch(url) == null)
50       return null;
51     else
52       return url;
53   }
54
55   private static TrieStringMatcher readConfigurationFile(Reader JavaDoc reader)
56     throws IOException JavaDoc {
57     
58     BufferedReader JavaDoc in=new BufferedReader JavaDoc(reader);
59     List JavaDoc urlprefixes = new ArrayList JavaDoc();
60     String JavaDoc line;
61
62     while((line=in.readLine())!=null) {
63       if (line.length() == 0)
64         continue;
65
66       char first=line.charAt(0);
67       switch (first) {
68       case ' ' : case '\n' : case '#' : // skip blank & comment lines
69
continue;
70       default :
71     urlprefixes.add(line);
72       }
73     }
74
75     return new PrefixStringMatcher(urlprefixes);
76   }
77
78   public static void main(String JavaDoc args[])
79     throws IOException JavaDoc {
80     
81     PrefixURLFilter filter;
82     if (args.length >= 1)
83       filter = new PrefixURLFilter(args[0]);
84     else
85       filter = new PrefixURLFilter();
86     
87     BufferedReader JavaDoc in=new BufferedReader JavaDoc(new InputStreamReader JavaDoc(System.in));
88     String JavaDoc line;
89     while((line=in.readLine())!=null) {
90       String JavaDoc out=filter.filter(line);
91       if(out!=null) {
92         System.out.println(out);
93       }
94     }
95   }
96   
97 }
98
Popular Tags