package net.nutch.searcher.site;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import net.nutch.parse.Parse;

import net.nutch.indexer.IndexingFilter;
import net.nutch.indexer.IndexingException;

import net.nutch.fetcher.FetcherOutput;
import net.nutch.pagedb.FetchListEntry;

import java.util.logging.Logger;
import net.nutch.util.LogFormatter;

import java.net.URL;
import java.net.MalformedURLException;

/**
 * Indexing filter that adds a <code>site</code> field to each document,
 * holding the host part of the URL reported by the fetcher output.
 */
public class SiteIndexingFilter implements IndexingFilter {

  /** Logger for this filter, obtained from {@link LogFormatter} by class name. */
  public static final Logger LOG
    = LogFormatter.getLogger(SiteIndexingFilter.class.getName());

  /**
   * Adds the host of the fetched page's URL to <code>doc</code> as the
   * <code>site</code> field and returns the same document instance.
   *
   * @param doc   the document being built; it is modified in place
   * @param parse the parse result for the page (not read by this filter)
   * @param fo    the fetcher output; the page URL is taken from
   *              <code>fo.getUrl()</code>
   * @return <code>doc</code>, with the <code>site</code> field added
   * @throws IndexingException if the string form of <code>fo.getUrl()</code>
   *         cannot be parsed as a {@link URL}; the underlying
   *         {@link MalformedURLException} is passed to the exception
   */
  public Document filter(Document doc, Parse parse, FetcherOutput fo)
    throws IndexingException {

    final URL url;
    try {
      // Go through the string form: java.net.URL is only needed to extract
      // the host component.
      url = new URL(fo.getUrl().toString());
    } catch (MalformedURLException e) {
      throw new IndexingException(e);
    }

    final String host = url.getHost();

    // Field flags are (store, index, token) = (false, true, false):
    // the host is indexed as a single untokenized term and is not stored.
    doc.add(new Field("site", host, false, true, false));

    return doc;
  }

}