KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > datamodel > UriUniqFilter


1 /* UriUniqFilter
2  *
3  * Created on Apr 17, 2003
4  *
5  * Copyright (C) 2003 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.crawler.datamodel;
24
import java.io.File;
26
27 /**
28  * A UriUniqFilter passes URI objects to a destination
29  * (receiver) if the passed URI object has not been previously seen.
30  *
31  * If already seen, the passed URI object is dropped.
32  *
33  * <p>For efficiency in comparison against a large history of
34  * seen URIs, URI objects may not be passed immediately, unless
35  * the addNow() is used or a flush() is forced.
36  *
37  * @author gojomo
38  * @version $Date: 2005/12/16 03:10:54 $, $Revision: 1.10 $
39  */

40 public interface UriUniqFilter {
41     /**
42      * @return Count of already seen URIs.
43      */

44     public long count();
45     
46     /**
47      * Count of items added, but not yet filtered in or out.
48      *
49      * Some implementations may buffer up large numbers of pending
50      * items to be evaluated in a later large batch/scan/merge with
51      * disk files.
52      *
53      * @return Count of items added not yet evaluated
54      */

55     public long pending();
56
57     /**
58      * Receiver of uniq URIs.
59      *
60      * Items that have not been seen before are pass through to this object.
61      * @param receiver Object that will be passed items. Must implement
62      * HasUriReceiver interface.
63      */

64     public void setDestination(HasUriReceiver receiver);
65     
66     /**
67      * Add given uri, if not already present.
68      * @param key Usually a canonicalized version of <code>value</code>.
69      * This is the key used doing lookups, forgets and insertions on the
70      * already included list.
71      * @param value item to add.
72      */

73     public void add(String JavaDoc key, CandidateURI value);
74     
75     /**
76      * Immediately add uri.
77      * @param key Usually a canonicalized version of <code>uri</code>.
78      * This is the key used doing lookups, forgets and insertions on the
79      * already included list.
80      * @param value item to add.
81      */

82     public void addNow(String JavaDoc key, CandidateURI value);
83     
84     /**
85      * Add given uri, all the way through to underlying destination, even
86      * if already present.
87      *
88      * (Sometimes a URI must be fetched, or refetched, for example when
89      * DNS or robots info expires or the operator forces a refetch. A
90      * normal add() or addNow() would drop the URI without forwarding
91      * on once it is determmined to already be in the filter.)
92      *
93      * @param key Usually a canonicalized version of <code>uri</code>.
94      * This is the key used doing lookups, forgets and insertions on the
95      * already included list.
96      * @param value item to add.
97      */

98     public void addForce(String JavaDoc key, CandidateURI value);
99     
100     /**
101      * Note item as seen, without passing through to receiver.
102      * @param key Usually a canonicalized version of an <code>URI</code>.
103      * This is the key used doing lookups, forgets and insertions on the
104      * already included list.
105      */

106     public void note(String JavaDoc key);
107     
108     /**
109      * Forget item was seen
110      * @param key Usually a canonicalized version of an <code>URI</code>.
111      * This is the key used doing lookups, forgets and insertions on the
112      * already included list.
113      * @param value item to add.
114      */

115     public void forget(String JavaDoc key, CandidateURI value);
116     
117     /**
118      * Request that any pending items be added/dropped. Implementors
119      * may ignore the request if a flush would be too expensive/too
120      * soon.
121      *
122      * @return Number added.
123      */

124     public long requestFlush();
125     
126     /**
127      * Close down any allocated resources.
128      * Makes sense calling this when checkpointing.
129      */

130     public void close();
131     
132     /**
133      * Set a File to receive a log for replay profiling.
134      */

135     public void setProfileLog(File JavaDoc logfile);
136     
137     /**
138      * URIs that have not been seen before 'visit' this 'Visitor'.
139      *
140      * Usually implementations of Frontier implement this interface.
141      * @author gojomo
142      */

143     public interface HasUriReceiver {
144         /**
145          * @param item Candidate uri tem that is 'visiting'.
146          */

147         public void receive(CandidateURI item);
148     }
149 }
Popular Tags