KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > util > BloomUriUniqFilter


1 /* BloomUriUniqFilter
2 *
3 * $Id: BloomUriUniqFilter.java,v 1.8.2.1 2007/01/13 01:31:29 stack-sf Exp $
4 *
5 * Created on June 21, 2005
6 *
7 * Copyright (C) 2005 Internet Archive.
8 *
9 * This file is part of the Heritrix web crawler (crawler.archive.org).
10 *
11 * Heritrix is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser Public License as published by
13 * the Free Software Foundation; either version 2.1 of the License, or
14 * any later version.
15 *
16 * Heritrix is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser Public License
22 * along with Heritrix; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */

25 package org.archive.crawler.util;
26
27 import java.io.Serializable JavaDoc;
28 import java.util.logging.Logger JavaDoc;
29
30 import org.archive.crawler.datamodel.CandidateURI;
31 import org.archive.util.BloomFilter;
32 import org.archive.util.BloomFilter32bitSplit;
33
34
35 /**
36  * A MG4J BloomFilter-based implementation of an AlreadySeen list.
37  *
38  * This implementation performs adequately without blowing out
39  * the heap through to very large numbers of URIs. See
40  * <a HREF="http://crawler.archive.org/cgi-bin/wiki.pl?AlreadySeen">AlreadySeen</a>.
41  *
42  * It is inherent to Bloom filters that as they get 'saturated', their
43  * false-positive rate rises. The default parameters used by this class
44  * attempt to maintain a 1-in-4 million (1 in 2^22) false-positive chance
45  * through 125 million unique inserts, which creates a filter structure
46  * about 495MB in size.
47  *
48  * You may use the following system properties to tune the size and
49  * false-positive rate of the bloom filter structure used by this class:
50  *
51  * org.archive.crawler.util.BloomUriUniqFilter.expected-size (default 125000000)
52  * org.archive.crawler.util.BloomUriUniqFilter.hash-count (default 22)
53  *
54  * The resulting filter will take up approximately...
55  *
56  * 1.44 * expected-size * hash-count / 8
57  *
58  * ...bytes.
59  *
60  * The default size is very close to the maximum practical size of the
61  * Bloom filter implementation, BloomFilter32bitSplit, created in the
62  * initialize() method, due to integer arithmetic limits.
63  *
64  * If you need a larger filter, you should edit the initialize
65  * method to intantiate a BloomFilter64bit instead.
66  *
67  * @author gojomo
68  * @version $Date: 2007/01/13 01:31:29 $, $Revision: 1.8.2.1 $
69  */

70 public class BloomUriUniqFilter extends SetBasedUriUniqFilter
71 implements Serializable JavaDoc {
72     private static final long serialVersionUID = 1061526253773091309L;
73
74     private static Logger JavaDoc LOGGER =
75         Logger.getLogger(BloomUriUniqFilter.class.getName());
76
77     BloomFilter bloom; // package access for testing convenience
78
protected int expected_n; // remember bloom contruction param
79

80     protected static final String JavaDoc EXPECTED_SIZE_KEY = ".expected-size";
81     protected static final String JavaDoc HASH_COUNT_KEY = ".hash-count";
82
83     // these defaults create a bloom filter that is
84
// 1.44*125mil*22/8 ~= 495MB in size, and at full
85
// capacity will give a false contained indication
86
// 1/(2^22) ~= 1 in every 4 million probes
87
private static final int DEFAULT_EXPECTED_SIZE = 125000000; // 125 million
88
private static final int DEFAULT_HASH_COUNT = 22; // 1 in 4 million false pos
89

90     /**
91      * Default constructor
92      */

93     public BloomUriUniqFilter() {
94         super();
95         String JavaDoc ns = System.getProperty(this.getClass().getName() + EXPECTED_SIZE_KEY);
96         int n = (ns == null) ? DEFAULT_EXPECTED_SIZE : Integer.parseInt(ns);
97         String JavaDoc ds = System.getProperty(this.getClass().getName() + HASH_COUNT_KEY);
98         int d = (ds == null) ? DEFAULT_HASH_COUNT : Integer.parseInt(ds);
99         initialize(n,d);
100     }
101
102     /**
103      * Constructor.
104      *
105      * @param n the expected number of elements.
106      * @param d the number of hash functions; if the filter adds not more
107      * than <code>n</code> elements, false positives will happen with
108      * probability 2<sup>-<var>d</var></sup>.
109      */

110     public BloomUriUniqFilter( final int n, final int d ) {
111         super();
112         initialize(n, d);
113     }
114
115     /**
116      * Initializer shared by constructors.
117      *
118      * @param n the expected number of elements.
119      * @param d the number of hash functions; if the filter adds not more
120      * than <code>n</code> elements, false positives will happen with
121      * probability 2<sup>-<var>d</var></sup>.
122      */

123     protected void initialize(final int n, final int d) {
124         this.expected_n = n;
125         bloom = new BloomFilter32bitSplit(n,d);
126     }
127
128     public void forget(String JavaDoc canonical, CandidateURI item) {
129         // TODO? could use in-memory exception list of currently-forgotten items
130
LOGGER.severe("forget(\""+canonical+"\",CandidateURI) not supported");
131     }
132
133     
134     protected boolean setAdd(CharSequence JavaDoc uri) {
135         boolean added = bloom.add(uri);
136         // warn if bloom has reached its expected size (and its false-pos
137
// rate will now exceed the theoretical/designed level)
138
if( added && (count() == expected_n)) {
139             LOGGER.warning("Bloom has reached expected limit "+expected_n);
140         }
141         return added;
142     }
143
144     protected long setCount() {
145         return bloom.size();
146     }
147
148     protected boolean setRemove(CharSequence JavaDoc uri) {
149         throw new UnsupportedOperationException JavaDoc();
150     }
151 }
152
Popular Tags