KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > searcher > site > SiteIndexingFilter


1 /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.searcher.site;
5
6 import org.apache.lucene.document.Document;
7 import org.apache.lucene.document.Field;
8
9 import net.nutch.parse.Parse;
10
11 import net.nutch.indexer.IndexingFilter;
12 import net.nutch.indexer.IndexingException;
13
14 import net.nutch.fetcher.FetcherOutput;
15 import net.nutch.pagedb.FetchListEntry;
16
17 import java.util.logging.Logger JavaDoc;
18 import net.nutch.util.LogFormatter;
19
20 import java.net.URL JavaDoc;
21 import java.net.MalformedURLException JavaDoc;
22
23 /** Adds the host name to a "site" field, so that it can be searched by
24  * SiteQueryFilter. */

25 public class SiteIndexingFilter implements IndexingFilter {
26   public static final Logger JavaDoc LOG
27     = LogFormatter.getLogger(SiteIndexingFilter.class.getName());
28
29   public Document filter(Document doc, Parse parse, FetcherOutput fo)
30     throws IndexingException {
31     
32     // parse the url to get the host name
33
URL JavaDoc url;
34     try {
35       url = new URL JavaDoc(fo.getUrl().toString());
36     } catch (MalformedURLException JavaDoc e) {
37       throw new IndexingException(e);
38     }
39
40     // add host as un-stored, indexed and un-tokenized
41
doc.add(new Field("site", url.getHost(), false, true, false));
42
43     // return the modified document
44
return doc;
45   }
46
47 }
48
Popular Tags