KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > util > BloomFilter64bit


1 /* BloomFilter
2 *
3 * $Id: BloomFilter64bit.java,v 1.2.16.1 2007/01/13 01:31:39 stack-sf Exp $
4 *
5 * Created on Jun 21, 2005
6 *
7 * Copyright (C) 2005 Internet Archive; a slight adaptation of
8 * LGPL work (C) Sebastiano Vigna
9 *
10 * This file is part of the Heritrix web crawler (crawler.archive.org).
11 *
12 * Heritrix is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU Lesser Public License as published by
14 * the Free Software Foundation; either version 2.1 of the License, or
15 * any later version.
16 *
17 * Heritrix is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU Lesser Public License for more details.
21 *
22 * You should have received a copy of the GNU Lesser Public License
23 * along with Heritrix; if not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 */

26
27 package org.archive.util;
28
29 import java.io.Serializable JavaDoc;
30 import java.security.SecureRandom JavaDoc;
31
32 /** A Bloom filter.
33  *
34  * SLIGHTLY ADAPTED VERSION OF MG4J it.unimi.dsi.mg4j.util.BloomFilter
35  *
36  * <p>KEY CHANGES:
37  *
38  * <ul>
39  * <li>NUMBER_OF_WEIGHTS is 2083, to better avoid collisions between
40  * similar strings</li>
41  * <li>Removed dependence on cern.colt MersenneTwister (replaced with
42  * SecureRandom) and QuickBitVector (replaced with local methods).</li>
43  * <li>Adapted to allow long bit indices so long as the index/64 (used
44  * an array index in bit vector) fits within Integer.MAX_VALUE</li>
45  * </ul>
46  *
47  * <hr>
48  *
49  * <P>Instances of this class represent a set of character sequences (with false positives)
50  * using a Bloom filter. Because of the way Bloom filters work,
51  * you cannot remove elements.
52  *
53  * <P>Bloom filters have an expected error rate, depending on the number
54  * of hash functions used, on the filter size and on the number of elements in the filter. This implementation
55  * uses a variable optimal number of hash functions, depending on the expected
56  * number of elements. More precisely, a Bloom
57  * filter for <var>n</var> character sequences with <var>d</var> hash functions will use
58  * ln 2 <var>d</var><var>n</var> &#8776; 1.44 <var>d</var><var>n</var> bits;
59  * false positives will happen with probability 2<sup>-<var>d</var></sup>.
60  *
61  * <P>Hash functions are generated at creation time using universal hashing. Each hash function
62  * uses {@link #NUMBER_OF_WEIGHTS} random integers, which are cyclically multiplied by
63  * the character codes in a character sequence. The resulting integers are XOR-ed together.
64  *
65  * <P>This class exports access methods that are very similar to those of {@link java.util.Set},
66  * but it does not implement that interface, as too many non-optional methods
67  * would be unimplementable (e.g., iterators).
68  *
69  * @author Sebastiano Vigna
70  */

71 public class BloomFilter64bit implements Serializable JavaDoc, BloomFilter {
72
73     private static final long serialVersionUID = 2317000663009608403L;
74
75     /** The number of weights used to create hash functions. */
76     final public static int NUMBER_OF_WEIGHTS = 2083; // CHANGED FROM 16
77
/** The number of bits in this filter. */
78     final public long m;
79     /** The number of hash functions used by this filter. */
80     final public int d;
81     /** The underlying bit vector. */
82     final private long[] bits;
83     /** The random integers used to generate the hash functions. */
84     final private long[][] weight;
85
86     /** The number of elements currently in the filter. It may be
87      * smaller than the actual number of additions of distinct character
88      * sequences because of false positives.
89      */

90     private int size;
91
92     /** The natural logarithm of 2, used in the computation of the number of bits. */
93     private final static double NATURAL_LOG_OF_2 = Math.log( 2 );
94
95     private final static boolean DEBUG = false;
96
97     /** Creates a new Bloom filter with given number of hash functions and expected number of elements.
98      *
99      * @param n the expected number of elements.
100      * @param d the number of hash functions; if the filter add not more than <code>n</code> elements,
101      * false positives will happen with probability 2<sup>-<var>d</var></sup>.
102      */

103     public BloomFilter64bit( final int n, final int d ) {
104         this.d = d;
105         int len = (int)Math.ceil( ( (long)n * (long)d / NATURAL_LOG_OF_2 ) / 64L );
106         if ( len/64 > Integer.MAX_VALUE ) throw new IllegalArgumentException JavaDoc( "This filter would require " + len * 64L + " bits" );
107         bits = new long[ len ];
108         m = bits.length * 64;
109
110         if ( DEBUG ) System.err.println( "Number of bits: " + m );
111
112         // seeded for reproduceable behavior in repeated runs; BUT:
113
// SecureRandom's default implementation (as of 1.5)
114
// seems to mix in its own seeding.
115
final SecureRandom JavaDoc random = new SecureRandom JavaDoc(new byte[] {19,96});
116         weight = new long[ d ][];
117         for( int i = 0; i < d; i++ ) {
118             weight[ i ] = new long[ NUMBER_OF_WEIGHTS ];
119             for( int j = 0; j < NUMBER_OF_WEIGHTS; j++ )
120                  weight[ i ][ j ] = random.nextLong();
121         }
122     }
123
124     /** The number of character sequences in the filter.
125      *
126      * @return the number of character sequences in the filter (but see {@link #contains(CharSequence)}).
127      */

128
129     public int size() {
130         return size;
131     }
132
133     /** Hashes the given sequence with the given hash function.
134      *
135      * @param s a character sequence.
136      * @param l the length of <code>s</code>.
137      * @param k a hash function index (smaller than {@link #d}).
138      * @return the position in the filter corresponding to <code>s</code> for the hash function <code>k</code>.
139      */

140
141     private long hash( final CharSequence JavaDoc s, final int l, final int k ) {
142         final long[] w = weight[ k ];
143         long h = 0;
144         int i = l;
145         while( i-- != 0 ) h ^= s.charAt( i ) * w[ i % NUMBER_OF_WEIGHTS ];
146         return ( h & 0x7FFFFFFFFFFFFFFFL ) % m;
147     }
148
149     /** Checks whether the given character sequence is in this filter.
150      *
151      * <P>Note that this method may return true on a character sequence that is has
152      * not been added to the filter. This will happen with probability 2<sub>-<var>d</var></sub>,
153      * where <var>d</var> is the number of hash functions specified at creation time, if
154      * the number of the elements in the filter is less than <var>n</var>, the number
155      * of expected elements specified at creation time.
156      *
157      * @param s a character sequence.
158      * @return true if the sequence is in the filter (or if a sequence with the
159      * same hash sequence is in the filter).
160      */

161
162     public boolean contains( final CharSequence JavaDoc s ) {
163         int i = d, l = s.length();
164         while( i-- != 0 ) if ( ! getBit( hash( s, l, i ) ) ) return false;
165         return true;
166     }
167
168     /** Adds a character sequence to the filter.
169      *
170      * @param s a character sequence.
171      * @return true if the character sequence was not in the filter (but see {@link #contains(CharSequence)}).
172      */

173
174     public boolean add( final CharSequence JavaDoc s ) {
175         boolean result = false;
176         int i = d, l = s.length();
177         long h;
178         while( i-- != 0 ) {
179             h = hash( s, l, i );
180             if ( ! getBit( h ) ) result = true;
181             setBit( h );
182         }
183         if ( result ) size++;
184         return result;
185     }
186     
187     protected final static long ADDRESS_BITS_PER_UNIT = 6; // 64=2^6
188
protected final static long BIT_INDEX_MASK = 63; // = BITS_PER_UNIT - 1;
189

190     /**
191      * Returns from the local bitvector the value of the bit with
192      * the specified index. The value is <tt>true</tt> if the bit
193      * with the index <tt>bitIndex</tt> is currently set; otherwise,
194      * returns <tt>false</tt>.
195      *
196      * (adapted from cern.colt.bitvector.QuickBitVector)
197      *
198      * @param bitIndex the bit index.
199      * @return the value of the bit with the specified index.
200      */

201     protected boolean getBit(long bitIndex) {
202         return ((bits[(int)(bitIndex >> ADDRESS_BITS_PER_UNIT)] & (1L << (bitIndex & BIT_INDEX_MASK))) != 0);
203     }
204
205     /**
206      * Changes the bit with index <tt>bitIndex</tt> in local bitvector.
207      *
208      * (adapted from cern.colt.bitvector.QuickBitVector)
209      *
210      * @param bitIndex the index of the bit to be set.
211      */

212     protected void setBit( long bitIndex) {
213             bits[(int)(bitIndex >> ADDRESS_BITS_PER_UNIT)] |= 1L << (bitIndex & BIT_INDEX_MASK);
214     }
215     
216     /* (non-Javadoc)
217      * @see org.archive.util.BloomFilter#getSizeBytes()
218      */

219     public long getSizeBytes() {
220         return bits.length*8;
221     }
222 }
223
Popular Tags