1 2 3 4 package net.nutch.indexer.basic; 5 6 import org.apache.lucene.document.Document; 7 import org.apache.lucene.document.Field; 8 9 import net.nutch.parse.Parse; 10 11 import net.nutch.indexer.IndexingFilter; 12 import net.nutch.indexer.IndexingException; 13 14 import net.nutch.fetcher.FetcherOutput; 15 import net.nutch.pagedb.FetchListEntry; 16 17 import java.util.logging.Logger ; 18 import net.nutch.util.LogFormatter; 19 import net.nutch.util.NutchConf; 20 21 22 public class BasicIndexingFilter implements IndexingFilter { 23 public static final Logger LOG 24 = LogFormatter.getLogger(BasicIndexingFilter.class.getName()); 25 26 private static final int MAX_TITLE_LENGTH = 27 NutchConf.getInt("indexer.max.title.length", 100); 28 29 public Document filter(Document doc, Parse parse, FetcherOutput fo) 30 throws IndexingException { 31 32 String url = fo.getUrl().toString(); 33 34 doc.add(Field.Text("url", url)); 36 37 doc.add(Field.UnStored("content", parse.getText())); 39 40 String [] anchors = fo.getAnchors(); 42 for (int i = 0; i < anchors.length; i++) { 43 doc.add(Field.UnStored("anchor", anchors[i])); 44 } 45 46 String title = parse.getData().getTitle(); 48 if (title.length() > MAX_TITLE_LENGTH) { title = title.substring(0, MAX_TITLE_LENGTH); 50 } 51 doc.add(Field.UnStored("anchor", title)); 53 doc.add(Field.UnIndexed("title", title)); 55 56 return doc; 57 } 58 59 } 60 | Popular Tags |