1 /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */ 2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ 3 4 package net.nutch.parse; 5 6 import net.nutch.protocol.Content; 7 8 import org.w3c.dom.DocumentFragment; 9 10 /** Extension point for DOM-based HTML parsers. Permits one to add additional 11 * metadata to HTML parses. All plugins found which implement this extension 12 * point are run sequentially on the parse. 13 */ 14 public interface HtmlParseFilter { 15 /** The name of the extension point. */ 16 final static String X_POINT_ID = HtmlParseFilter.class.getName(); 17 18 /** Adds metadata or otherwise modifies a parse of HTML content, given 19 * the DOM tree of a page. */ 20 Parse filter(Content content, Parse parse, DocumentFragment doc) 21 throws ParseException; 22 } 23