// KickJava   Java API By Example, From Geeks To Geeks.
//
// Java > Open Source Codes > org > archive > crawler > util > BdbUriUniqFilter


/* BdbUriUniqFilter
 *
 * $Id: BdbUriUniqFilter.java,v 1.21.14.1 2007/01/13 01:31:29 stack-sf Exp $
 *
 * Created on September 17, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.crawler.util;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.logging.Level;
import java.util.logging.Logger;

import st.ata.util.FPGenerator;

import com.sleepycat.bind.tuple.LongBinding;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DatabaseNotFoundException;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
import com.sleepycat.je.OperationStatus;


/**
 * A BDB implementation of an AlreadySeen list.
 *
 * This implementation performs adequately without blowing out
 * the heap. See
 * <a href="http://crawler.archive.org/cgi-bin/wiki.pl?AlreadySeen">AlreadySeen</a>.
 *
 * <p>Makes keys that have URIs from same server close to each other. Mercator
 * and 2.3.5 'Eliminating Already-Visited URLs' in 'Mining the Web' by Soumen
 * Chakrabarti talk of a two-level key with the first 24 bits a hash of the
 * host plus port and with the last 40 as a hash of the path. Testing
 * showed adoption of such a scheme halving lookup times (This implementation
 * actually fingerprints scheme + host for the leading 24 bits and the
 * whole URI for the trailing 40 bits).
 *
 * @author stack
 * @version $Date: 2007/01/13 01:31:29 $, $Revision: 1.21.14.1 $
 */

64 public class BdbUriUniqFilter
65 extends SetBasedUriUniqFilter implements Serializable JavaDoc {
66     private static final long serialVersionUID = -8099357538178524011L;
67
68     private static Logger JavaDoc logger =
69         Logger.getLogger(BdbUriUniqFilter.class.getName());
70
71     protected boolean createdEnvironment = false;
72     protected long lastCacheMiss = 0;
73     protected long lastCacheMissDiff = 0;
74     protected transient Database alreadySeen = null;
75     protected transient DatabaseEntry value = null;
76     private static final String JavaDoc DB_NAME = "alreadySeenUrl";
77     protected long count = 0;
78     private long aggregatedLookupTime = 0;
79     
80     private static final String JavaDoc COLON_SLASH_SLASH = "://";
81     
82     /**
83      * Shutdown default constructor.
84      */

85     protected BdbUriUniqFilter() {
86         super();
87     }
88     
89     /**
90      * Constructor.
91      * @param environment A bdb environment ready-configured.
92      * @throws IOException
93      */

94     public BdbUriUniqFilter(Environment environment)
95     throws IOException JavaDoc {
96         super();
97         try {
98             initialize(environment);
99         } catch (DatabaseException e) {
100             throw new IOException JavaDoc(e.getMessage());
101         }
102     }
103     
104     /**
105      * Constructor.
106      * @param bdbEnv The directory that holds the bdb environment. Will
107      * make a database under here if doesn't already exit. Otherwise
108      * reopens any existing dbs.
109      * @throws IOException
110      */

111     public BdbUriUniqFilter(File JavaDoc bdbEnv)
112     throws IOException JavaDoc {
113         this(bdbEnv, -1);
114     }
115     
116     /**
117      * Constructor.
118      * @param bdbEnv The directory that holds the bdb environment. Will
119      * make a database under here if doesn't already exit. Otherwise
120      * reopens any existing dbs.
121      * @param cacheSizePercentage Percentage of JVM bdb allocates as
122      * its cache. Pass -1 to get default cache size.
123      * @throws IOException
124      */

125     public BdbUriUniqFilter(File JavaDoc bdbEnv, final int cacheSizePercentage)
126     throws IOException JavaDoc {
127         super();
128         if (!bdbEnv.exists()) {
129             bdbEnv.mkdirs();
130         }
131         EnvironmentConfig envConfig = new EnvironmentConfig();
132         envConfig.setAllowCreate(true);
133         if (cacheSizePercentage > 0 && cacheSizePercentage < 100) {
134             envConfig.setCachePercent(cacheSizePercentage);
135         }
136         try {
137             createdEnvironment = true;
138             initialize(new Environment(bdbEnv, envConfig));
139         } catch (DatabaseException e) {
140             throw new IOException JavaDoc(e.getMessage());
141         }
142     }
143     
144     /**
145      * Method shared by constructors.
146      * @param env Environment to use.
147      * @throws DatabaseException
148      */

149     protected void initialize(Environment env) throws DatabaseException {
150         DatabaseConfig dbConfig = new DatabaseConfig();
151         dbConfig.setAllowCreate(true);
152         try {
153             env.truncateDatabase(null, DB_NAME, false);
154         } catch (DatabaseNotFoundException e) {
155             // Ignored
156
}
157         open(env, dbConfig);
158     }
159     
160     /**
161      * Call after deserializing an instance of this class. Will open the
162      * already seen in passed environment.
163      * @param env DB Environment to use.
164      * @throws DatabaseException
165      */

166     public void reopen(final Environment env)
167     throws DatabaseException {
168         open(env, null);
169     }
170     
171     protected void open(final Environment env, final DatabaseConfig dbConfig)
172     throws DatabaseException {
173         this.alreadySeen = env.openDatabase(null, DB_NAME, dbConfig);
174         this.value = new DatabaseEntry("".getBytes());
175     }
176     
177     public synchronized void close() {
178         Environment env = null;
179         if (this.alreadySeen != null) {
180             try {
181                 env = this.alreadySeen.getEnvironment();
182                 if (logger.isLoggable(Level.INFO)) {
183                     logger.info("Count of alreadyseen on close " +
184                         Long.toString(count));
185                 }
186                 this.alreadySeen.close();
187             } catch (DatabaseException e) {
188                 logger.severe(e.getMessage());
189             }
190             this.alreadySeen = null;
191         }
192         if (env != null && createdEnvironment) {
193             try {
194                 // This sync flushes whats in RAM. Its expensive operation.
195
// Without, data can be lost. Not for transactional operation.
196
env.sync();
197                 env.close();
198             } catch (DatabaseException e) {
199                 logger.severe(e.getMessage());
200             }
201         }
202     }
203     
204     public synchronized long getCacheMisses() throws DatabaseException {
205         long cacheMiss = this.alreadySeen.getEnvironment().
206             getStats(null).getNCacheMiss();
207         this.lastCacheMissDiff = cacheMiss - this.lastCacheMiss;
208         this.lastCacheMiss = cacheMiss;
209         return this.lastCacheMiss;
210     }
211     
212     public long getLastCacheMissDiff() {
213         return this.lastCacheMissDiff;
214     }
215     
216     /**
217      * Create fingerprint.
218      * Pubic access so test code can access createKey.
219      * @param uri URI to fingerprint.
220      * @return Fingerprint of passed <code>url</code>.
221      */

222     public static long createKey(CharSequence JavaDoc uri) {
223         String JavaDoc url = uri.toString();
224         int index = url.indexOf(COLON_SLASH_SLASH);
225         if (index > 0) {
226             index = url.indexOf('/', index + COLON_SLASH_SLASH.length());
227         }
228         CharSequence JavaDoc hostPlusScheme = (index == -1)? url: url.subSequence(0, index);
229         long tmp = FPGenerator.std24.fp(hostPlusScheme);
230         return tmp | (FPGenerator.std40.fp(url) >>> 24);
231     }
232
233
234
235     protected boolean setAdd(CharSequence JavaDoc uri) {
236         DatabaseEntry key = new DatabaseEntry();
237         LongBinding.longToEntry(createKey(uri), key);
238         long started = 0;
239         
240         OperationStatus status = null;
241         try {
242             if (logger.isLoggable(Level.INFO)) {
243                 started = System.currentTimeMillis();
244             }
245             status = alreadySeen.putNoOverwrite(null, key, value);
246             if (logger.isLoggable(Level.INFO)) {
247                 aggregatedLookupTime +=
248                     (System.currentTimeMillis() - started);
249             }
250         } catch (DatabaseException e) {
251             logger.severe(e.getMessage());
252         }
253         if (status == OperationStatus.SUCCESS) {
254             count++;
255             if (logger.isLoggable(Level.INFO)) {
256                 final int logAt = 10000;
257                 if (count > 0 && ((count % logAt) == 0)) {
258                     logger.info("Average lookup " +
259                         (aggregatedLookupTime / logAt) + "ms.");
260                     aggregatedLookupTime = 0;
261                 }
262             }
263         }
264         if(status == OperationStatus.KEYEXIST) {
265             return false; // not added
266
} else {
267             return true;
268         }
269     }
270
271     protected long setCount() {
272         return count;
273     }
274
275     protected boolean setRemove(CharSequence JavaDoc uri) {
276         DatabaseEntry key = new DatabaseEntry();
277         LongBinding.longToEntry(createKey(uri), key);
278             OperationStatus status = null;
279         try {
280             status = alreadySeen.delete(null, key);
281         } catch (DatabaseException e) {
282             logger.severe(e.getMessage());
283         }
284         if (status == OperationStatus.SUCCESS) {
285             count--;
286             return true; // removed
287
} else {
288             return false; // not present
289
}
290     }
291
292     public long flush() {
293             // We always write but this might be place to do the sync
294
// when checkpointing? TODO.
295
return 0;
296     }
297 }
// Popular Tags