KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > indexer > basic > BasicIndexingFilter


1 /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.indexer.basic;
5
6 import org.apache.lucene.document.Document;
7 import org.apache.lucene.document.Field;
8
9 import net.nutch.parse.Parse;
10
11 import net.nutch.indexer.IndexingFilter;
12 import net.nutch.indexer.IndexingException;
13
14 import net.nutch.fetcher.FetcherOutput;
15 import net.nutch.pagedb.FetchListEntry;
16
17 import java.util.logging.Logger;
18 import net.nutch.util.LogFormatter;
19 import net.nutch.util.NutchConf;
20
21 /** Adds basic searchable fields to a document. */
22 public class BasicIndexingFilter implements IndexingFilter {
23   public static final Logger JavaDoc LOG
24     = LogFormatter.getLogger(BasicIndexingFilter.class.getName());
25
26   private static final int MAX_TITLE_LENGTH =
27     NutchConf.getInt("indexer.max.title.length", 100);
28
29   public Document filter(Document doc, Parse parse, FetcherOutput fo)
30     throws IndexingException {
31     
32     String JavaDoc url = fo.getUrl().toString();
33
34     // url is both stored and indexed, so it's both searchable and returned
35
doc.add(Field.Text("url", url));
36     
37     // content is indexed, so that it's searchable, but not stored in index
38
doc.add(Field.UnStored("content", parse.getText()));
39     
40     // anchors are indexed, so they're searchable, but not stored in index
41
String JavaDoc[] anchors = fo.getAnchors();
42     for (int i = 0; i < anchors.length; i++) {
43       doc.add(Field.UnStored("anchor", anchors[i]));
44     }
45
46     // title
47
String JavaDoc title = parse.getData().getTitle();
48     if (title.length() > MAX_TITLE_LENGTH) { // truncate title if needed
49
title = title.substring(0, MAX_TITLE_LENGTH);
50     }
51     // add title as anchor so it is searchable. doesn't warrant its own field.
52
doc.add(Field.UnStored("anchor", title));
53     // add title unindexed, so that it can be displayed
54
doc.add(Field.UnIndexed("title", title));
55
56     return doc;
57   }
58
59 }
60
Popular Tags