KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > extractor > LinkExtractor


/* LinkExtractor
*
* $Id: LinkExtractor.java,v 1.3 2005/07/18 17:30:19 stack-sf Exp $
*
* Created on Mar 16, 2005
*
* Copyright (C) 2005 Internet Archive.
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/

package org.archive.extractor;

import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Iterator;

import org.archive.crawler.extractor.Link;
import org.archive.net.UURI;
33
34 /**
35  * LinkExtractor is a general interface for classes which, when given an
36  * InputStream and Charset, can scan for Links and return them via
37  * an Iterator interface.
38  *
39  * Implementors may in fact complete all extraction on the first
40  * hasNext(), then trickle Links out from an internal collection,
41  * depending on whether the link-extraction technique used is amenable
42  * to incremental scanning.
43  *
44  * ROUGH DRAFT IN PROGRESS / incomplete... untested...
45  *
46  * @author gojomo
47  */

48 public interface LinkExtractor extends Iterator JavaDoc {
49     /**
50      * Setup the LinkExtractor to operate on the given stream and charset,
51      * considering the given contextURI as the initial 'base' URI for
52      * resolving relative URIs.
53      *
54      * May be called to 'reset' a LinkExtractor to start with new input.
55      *
56      * @param source source URI
57      * @param base base URI (usually the source URI) for URI derelativizing
58      * @param content input stream of content to scan for links
59      * @param charset Charset to consult to decode stream to characters
60      * @param listener ExtractErrorListener to notify, rather than raising
61      * exception through extraction loop
62      */

63     public void setup(UURI source, UURI base, InputStream JavaDoc content,
64             Charset JavaDoc charset, ExtractErrorListener listener);
65     
66     /**
67      * Convenience version of above for common case where source and base are
68      * same.
69      *
70      * @param sourceandbase URI to use as source and base for derelativizing
71      * @param content input stream of content to scan for links
72      * @param charset Charset to consult to decode stream to characters
73      * @param listener ExtractErrorListener to notify, rather than raising
74      * exception through extraction loop
75      */

76     public void setup(UURI sourceandbase, InputStream JavaDoc content,
77             Charset JavaDoc charset, ExtractErrorListener listener);
78     
79     /**
80      * Alternative to Iterator.next() which returns type Link.
81      * @return a discovered Link
82      */

83     public Link nextLink();
84
85     /**
86      * Discard all state and release any used resources.
87      */

88     public void reset();
89 }
90
Popular Tags