1 25 package org.archive.crawler.util; 26 27 import java.io.Serializable ; 28 import java.util.logging.Logger ; 29 30 import org.archive.crawler.datamodel.CandidateURI; 31 import org.archive.util.BloomFilter; 32 import org.archive.util.BloomFilter32bitSplit; 33 34 35 70 public class BloomUriUniqFilter extends SetBasedUriUniqFilter 71 implements Serializable { 72 private static final long serialVersionUID = 1061526253773091309L; 73 74 private static Logger LOGGER = 75 Logger.getLogger(BloomUriUniqFilter.class.getName()); 76 77 BloomFilter bloom; protected int expected_n; 80 protected static final String EXPECTED_SIZE_KEY = ".expected-size"; 81 protected static final String HASH_COUNT_KEY = ".hash-count"; 82 83 private static final int DEFAULT_EXPECTED_SIZE = 125000000; private static final int DEFAULT_HASH_COUNT = 22; 90 93 public BloomUriUniqFilter() { 94 super(); 95 String ns = System.getProperty(this.getClass().getName() + EXPECTED_SIZE_KEY); 96 int n = (ns == null) ? DEFAULT_EXPECTED_SIZE : Integer.parseInt(ns); 97 String ds = System.getProperty(this.getClass().getName() + HASH_COUNT_KEY); 98 int d = (ds == null) ? DEFAULT_HASH_COUNT : Integer.parseInt(ds); 99 initialize(n,d); 100 } 101 102 110 public BloomUriUniqFilter( final int n, final int d ) { 111 super(); 112 initialize(n, d); 113 } 114 115 123 protected void initialize(final int n, final int d) { 124 this.expected_n = n; 125 bloom = new BloomFilter32bitSplit(n,d); 126 } 127 128 public void forget(String canonical, CandidateURI item) { 129 LOGGER.severe("forget(\""+canonical+"\",CandidateURI) not supported"); 131 } 132 133 134 protected boolean setAdd(CharSequence uri) { 135 boolean added = bloom.add(uri); 136 if( added && (count() == expected_n)) { 139 LOGGER.warning("Bloom has reached expected limit "+expected_n); 140 } 141 return added; 142 } 143 144 protected long setCount() { 145 return bloom.size(); 146 } 147 148 protected boolean setRemove(CharSequence uri) { 149 throw new UnsupportedOperationException (); 150 } 151 } 152 | Popular Tags |