1 25 package org.archive.crawler.util; 26 27 import java.io.File ; 28 import java.io.IOException ; 29 import java.io.Serializable ; 30 import java.util.logging.Level ; 31 import java.util.logging.Logger ; 32 33 import st.ata.util.FPGenerator; 34 35 import com.sleepycat.bind.tuple.LongBinding; 36 import com.sleepycat.je.Database; 37 import com.sleepycat.je.DatabaseConfig; 38 import com.sleepycat.je.DatabaseEntry; 39 import com.sleepycat.je.DatabaseException; 40 import com.sleepycat.je.DatabaseNotFoundException; 41 import com.sleepycat.je.Environment; 42 import com.sleepycat.je.EnvironmentConfig; 43 import com.sleepycat.je.OperationStatus; 44 45 46 64 public class BdbUriUniqFilter 65 extends SetBasedUriUniqFilter implements Serializable { 66 private static final long serialVersionUID = -8099357538178524011L; 67 68 private static Logger logger = 69 Logger.getLogger(BdbUriUniqFilter.class.getName()); 70 71 protected boolean createdEnvironment = false; 72 protected long lastCacheMiss = 0; 73 protected long lastCacheMissDiff = 0; 74 protected transient Database alreadySeen = null; 75 protected transient DatabaseEntry value = null; 76 private static final String DB_NAME = "alreadySeenUrl"; 77 protected long count = 0; 78 private long aggregatedLookupTime = 0; 79 80 private static final String COLON_SLASH_SLASH = "://"; 81 82 85 protected BdbUriUniqFilter() { 86 super(); 87 } 88 89 94 public BdbUriUniqFilter(Environment environment) 95 throws IOException { 96 super(); 97 try { 98 initialize(environment); 99 } catch (DatabaseException e) { 100 throw new IOException (e.getMessage()); 101 } 102 } 103 104 111 public BdbUriUniqFilter(File bdbEnv) 112 throws IOException { 113 this(bdbEnv, -1); 114 } 115 116 125 public BdbUriUniqFilter(File bdbEnv, final int cacheSizePercentage) 126 throws IOException { 127 super(); 128 if (!bdbEnv.exists()) { 129 bdbEnv.mkdirs(); 130 } 131 EnvironmentConfig envConfig = new EnvironmentConfig(); 132 envConfig.setAllowCreate(true); 133 if (cacheSizePercentage > 0 && cacheSizePercentage < 100) { 134 envConfig.setCachePercent(cacheSizePercentage); 135 } 136 try { 137 createdEnvironment = true; 138 initialize(new Environment(bdbEnv, envConfig)); 139 } catch (DatabaseException e) { 140 throw new IOException (e.getMessage()); 141 } 142 } 143 144 149 protected void initialize(Environment env) throws DatabaseException { 150 DatabaseConfig dbConfig = new DatabaseConfig(); 151 dbConfig.setAllowCreate(true); 152 try { 153 env.truncateDatabase(null, DB_NAME, false); 154 } catch (DatabaseNotFoundException e) { 155 } 157 open(env, dbConfig); 158 } 159 160 166 public void reopen(final Environment env) 167 throws DatabaseException { 168 open(env, null); 169 } 170 171 protected void open(final Environment env, final DatabaseConfig dbConfig) 172 throws DatabaseException { 173 this.alreadySeen = env.openDatabase(null, DB_NAME, dbConfig); 174 this.value = new DatabaseEntry("".getBytes()); 175 } 176 177 public synchronized void close() { 178 Environment env = null; 179 if (this.alreadySeen != null) { 180 try { 181 env = this.alreadySeen.getEnvironment(); 182 if (logger.isLoggable(Level.INFO)) { 183 logger.info("Count of alreadyseen on close " + 184 Long.toString(count)); 185 } 186 this.alreadySeen.close(); 187 } catch (DatabaseException e) { 188 logger.severe(e.getMessage()); 189 } 190 this.alreadySeen = null; 191 } 192 if (env != null && createdEnvironment) { 193 try { 194 env.sync(); 197 env.close(); 198 } catch (DatabaseException e) { 199 logger.severe(e.getMessage()); 200 } 201 } 202 } 203 204 public synchronized long getCacheMisses() throws DatabaseException { 205 long cacheMiss = this.alreadySeen.getEnvironment(). 206 getStats(null).getNCacheMiss(); 207 this.lastCacheMissDiff = cacheMiss - this.lastCacheMiss; 208 this.lastCacheMiss = cacheMiss; 209 return this.lastCacheMiss; 210 } 211 212 public long getLastCacheMissDiff() { 213 return this.lastCacheMissDiff; 214 } 215 216 222 public static long createKey(CharSequence uri) { 223 String url = uri.toString(); 224 int index = url.indexOf(COLON_SLASH_SLASH); 225 if (index > 0) { 226 index = url.indexOf('/', index + COLON_SLASH_SLASH.length()); 227 } 228 CharSequence hostPlusScheme = (index == -1)? url: url.subSequence(0, index); 229 long tmp = FPGenerator.std24.fp(hostPlusScheme); 230 return tmp | (FPGenerator.std40.fp(url) >>> 24); 231 } 232 233 234 235 protected boolean setAdd(CharSequence uri) { 236 DatabaseEntry key = new DatabaseEntry(); 237 LongBinding.longToEntry(createKey(uri), key); 238 long started = 0; 239 240 OperationStatus status = null; 241 try { 242 if (logger.isLoggable(Level.INFO)) { 243 started = System.currentTimeMillis(); 244 } 245 status = alreadySeen.putNoOverwrite(null, key, value); 246 if (logger.isLoggable(Level.INFO)) { 247 aggregatedLookupTime += 248 (System.currentTimeMillis() - started); 249 } 250 } catch (DatabaseException e) { 251 logger.severe(e.getMessage()); 252 } 253 if (status == OperationStatus.SUCCESS) { 254 count++; 255 if (logger.isLoggable(Level.INFO)) { 256 final int logAt = 10000; 257 if (count > 0 && ((count % logAt) == 0)) { 258 logger.info("Average lookup " + 259 (aggregatedLookupTime / logAt) + "ms."); 260 aggregatedLookupTime = 0; 261 } 262 } 263 } 264 if(status == OperationStatus.KEYEXIST) { 265 return false; } else { 267 return true; 268 } 269 } 270 271 protected long setCount() { 272 return count; 273 } 274 275 protected boolean setRemove(CharSequence uri) { 276 DatabaseEntry key = new DatabaseEntry(); 277 LongBinding.longToEntry(createKey(uri), key); 278 OperationStatus status = null; 279 try { 280 status = alreadySeen.delete(null, key); 281 } catch (DatabaseException e) { 282 logger.severe(e.getMessage()); 283 } 284 if (status == OperationStatus.SUCCESS) { 285 count--; 286 return true; } else { 288 return false; } 290 } 291 292 public long flush() { 293 return 0; 296 } 297 } | Popular Tags |