KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > frontier > BdbFrontier


1 /* BdbFrontier
2  *
3  * $Id: BdbFrontier.java,v 1.88.4.1 2007/01/13 01:31:23 stack-sf Exp $
4 *
5  * Created on Sep 24, 2004
6  *
7  * Copyright (C) 2004 Internet Archive.
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  *
25   */

26 package org.archive.crawler.frontier;
27
28 import java.io.File JavaDoc;
29 import java.io.FileNotFoundException JavaDoc;
30 import java.io.IOException JavaDoc;
31 import java.io.Serializable JavaDoc;
32 import java.util.ArrayList JavaDoc;
33 import java.util.Iterator JavaDoc;
34 import java.util.List JavaDoc;
35 import java.util.logging.Logger JavaDoc;
36
37 import javax.management.AttributeNotFoundException JavaDoc;
38
39 import org.archive.crawler.datamodel.CrawlURI;
40 import org.archive.crawler.datamodel.UriUniqFilter;
41 import org.archive.crawler.framework.CrawlController;
42 import org.archive.crawler.framework.FrontierMarker;
43 import org.archive.crawler.framework.exceptions.FatalConfigurationException;
44 import org.archive.crawler.settings.SimpleType;
45 import org.archive.crawler.settings.Type;
46 import org.archive.crawler.util.BdbUriUniqFilter;
47 import org.archive.crawler.util.BloomUriUniqFilter;
48 import org.archive.crawler.util.CheckpointUtils;
49 import org.archive.crawler.util.DiskFPMergeUriUniqFilter;
50 import org.archive.crawler.util.MemFPMergeUriUniqFilter;
51 import org.archive.util.ArchiveUtils;
52
53 import com.sleepycat.je.DatabaseException;
54
55 /**
56  * A Frontier using several BerkeleyDB JE Databases to hold its record of
57  * known hosts (queues), and pending URIs.
58  *
59  * @author Gordon Mohr
60  */

61 public class BdbFrontier extends WorkQueueFrontier implements Serializable JavaDoc {
62     // be robust against trivial implementation changes
63
private static final long serialVersionUID = ArchiveUtils
64         .classnameBasedUID(BdbFrontier.class, 1);
65
66     private static final Logger JavaDoc logger =
67         Logger.getLogger(BdbFrontier.class.getName());
68
69     /** all URIs scheduled to be crawled */
70     protected transient BdbMultipleWorkQueues pendingUris;
71
72     /** all URI-already-included options available to be chosen */
73     private String JavaDoc[] AVAILABLE_INCLUDED_OPTIONS = new String JavaDoc[] {
74             BdbUriUniqFilter.class.getName(),
75             BloomUriUniqFilter.class.getName(),
76             MemFPMergeUriUniqFilter.class.getName(),
77             DiskFPMergeUriUniqFilter.class.getName()};
78     
79     /** URI-already-included to use (by class name) */
80     public final static String JavaDoc ATTR_INCLUDED = "uri-included-structure";
81     
82     private final static String JavaDoc DEFAULT_INCLUDED =
83         BdbUriUniqFilter.class.getName();
84     
85     /**
86      * Constructor.
87      * @param name Name for of this Frontier.
88      */

89     public BdbFrontier(String JavaDoc name) {
90         this(name, "BdbFrontier. "
91             + "A Frontier using BerkeleyDB Java Edition databases for "
92             + "persistence to disk.");
93         Type t = addElementToDefinition(new SimpleType(ATTR_INCLUDED,
94                 "Structure to use for tracking already-seen URIs. Non-default " +
95                 "options may require additional configuration via system " +
96                 "properties.", DEFAULT_INCLUDED, AVAILABLE_INCLUDED_OPTIONS));
97         t.setExpertSetting(true);
98     }
99
/**
 * Create the BdbFrontier with a caller-supplied description. Both
 * arguments are passed unchanged to the superclass constructor; no
 * additional settings are registered here.
 *
 * @param name name of this Frontier
 * @param description description of this Frontier
 */
public BdbFrontier(String name, String description) {
    super(name, description);
}
109     
110     /**
111      * Create the single object (within which is one BDB database)
112      * inside which all the other queues live.
113      *
114      * @return the created BdbMultipleWorkQueues
115      * @throws DatabaseException
116      */

117     private BdbMultipleWorkQueues createMultipleWorkQueues()
118     throws DatabaseException {
119         return new BdbMultipleWorkQueues(this.controller.getBdbEnvironment(),
120             this.controller.getClassCatalog(),
121             this.controller.isCheckpointRecover());
122     }
123
124     /**
125      * Create a UriUniqFilter that will serve as record
126      * of already seen URIs.
127      *
128      * @return A UURISet that will serve as a record of already seen URIs
129      * @throws IOException
130      */

131     protected UriUniqFilter createAlreadyIncluded() throws IOException JavaDoc {
132         UriUniqFilter uuf;
133         String JavaDoc c = null;
134         try {
135             c = (String JavaDoc)getAttribute(null, ATTR_INCLUDED);
136         } catch (AttributeNotFoundException JavaDoc e) {
137             // Do default action if attribute not in order.
138
}
139         // TODO: avoid all this special-casing; enable some common
140
// constructor interface usable for all alt implemenations
141
if (c != null && c.equals(BloomUriUniqFilter.class.getName())) {
142             uuf = this.controller.isCheckpointRecover()?
143                     deserializeAlreadySeen(BloomUriUniqFilter.class,
144                         this.controller.getCheckpointRecover().getDirectory()):
145                     new BloomUriUniqFilter();
146         } else if (c!=null && c.equals(MemFPMergeUriUniqFilter.class.getName())) {
147             // TODO: add checkpointing for MemFPMergeUriUniqFilter
148
uuf = new MemFPMergeUriUniqFilter();
149         } else if (c!=null && c.equals(DiskFPMergeUriUniqFilter.class.getName())) {
150             // TODO: add checkpointing for DiskFPMergeUriUniqFilter
151
uuf = new DiskFPMergeUriUniqFilter(controller.getScratchDisk());
152         } else {
153             // Assume its BdbUriUniqFilter.
154
uuf = this.controller.isCheckpointRecover()?
155                 deserializeAlreadySeen(BdbUriUniqFilter.class,
156                     this.controller.getCheckpointRecover().getDirectory()):
157                 new BdbUriUniqFilter(this.controller.getBdbEnvironment());
158             if (this.controller.isCheckpointRecover()) {
159                 // If recover, need to call reopen of the db.
160
try {
161                     ((BdbUriUniqFilter)uuf).
162                         reopen(this.controller.getBdbEnvironment());
163                 } catch (DatabaseException e) {
164                     throw new IOException JavaDoc(e.getMessage());
165                 }
166             }
167         }
168         uuf.setDestination(this);
169         return uuf;
170     }
171     
172     protected UriUniqFilter deserializeAlreadySeen(
173             final Class JavaDoc<? extends UriUniqFilter> cls,
174             final File JavaDoc dir)
175     throws FileNotFoundException JavaDoc, IOException JavaDoc {
176         UriUniqFilter uuf = null;
177         try {
178             logger.fine("Started deserializing " + cls.getName() +
179                 " of checkpoint recover.");
180             uuf = CheckpointUtils.readObjectFromFile(cls, dir);
181             logger.fine("Finished deserializing bdbje as part " +
182                 "of checkpoint recover.");
183         } catch (ClassNotFoundException JavaDoc e) {
184             throw new IOException JavaDoc("Failed to deserialize " +
185                 cls.getName() + ": " + e.getMessage());
186         }
187         return uuf;
188     }
189
190     /**
191      * Return the work queue for the given CrawlURI's classKey. URIs
192      * are ordered and politeness-delayed within their 'class'.
193      *
194      * @param curi CrawlURI to base queue on
195      * @return the found or created BdbWorkQueue
196      */

197     protected WorkQueue getQueueFor(CrawlURI curi) {
198         WorkQueue wq;
199         String JavaDoc classKey = curi.getClassKey();
200         synchronized (allQueues) {
201             wq = (WorkQueue)allQueues.get(classKey);
202             if (wq == null) {
203                 wq = new BdbWorkQueue(classKey, this);
204                 wq.setTotalBudget(((Long JavaDoc)getUncheckedAttribute(
205                     curi,ATTR_QUEUE_TOTAL_BUDGET)).longValue());
206                 allQueues.put(classKey, wq);
207             }
208         }
209         return wq;
210     }
211     
212     /**
213      * Return the work queue for the given classKey, or null
214      * if no such queue exists.
215      *
216      * @param classKey key to look for
217      * @return the found WorkQueue
218      */

219     protected WorkQueue getQueueFor(String JavaDoc classKey) {
220         WorkQueue wq;
221         synchronized (allQueues) {
222             wq = (WorkQueue)allQueues.get(classKey);
223         }
224         return wq;
225     }
226
227     public FrontierMarker getInitialMarker(String JavaDoc regexpr,
228             boolean inCacheOnly) {
229         return pendingUris.getInitialMarker(regexpr);
230     }
231
232     /**
233      * Return list of urls.
234      * @param marker
235      * @param numberOfMatches
236      * @param verbose
237      * @return List of URIs (strings).
238      */

239     public ArrayList JavaDoc<String JavaDoc> getURIsList(FrontierMarker marker,
240             int numberOfMatches, final boolean verbose) {
241         List JavaDoc curis;
242         try {
243             curis = pendingUris.getFrom(marker, numberOfMatches);
244         } catch (DatabaseException e) {
245             e.printStackTrace();
246             throw new RuntimeException JavaDoc(e);
247         }
248         ArrayList JavaDoc<String JavaDoc> results = new ArrayList JavaDoc<String JavaDoc>(curis.size());
249         Iterator JavaDoc iter = curis.iterator();
250         while(iter.hasNext()) {
251             CrawlURI curi = (CrawlURI) iter.next();
252             results.add("["+curi.getClassKey()+"] "+curi.singleLineReport());
253         }
254         return results;
255     }
256     
257     protected void initQueue() throws IOException JavaDoc {
258         try {
259             this.pendingUris = createMultipleWorkQueues();
260         } catch(DatabaseException e) {
261             throw (IOException JavaDoc)new IOException JavaDoc(e.getMessage()).initCause(e);
262         }
263     }
264     
265     protected void closeQueue() {
266         if (this.pendingUris != null) {
267             this.pendingUris.close();
268             this.pendingUris = null;
269         }
270     }
271         
272     protected BdbMultipleWorkQueues getWorkQueues() {
273         return pendingUris;
274     }
275
276     protected boolean workQueueDataOnDisk() {
277         return true;
278     }
279     
280     public void initialize(CrawlController c)
281     throws FatalConfigurationException, IOException JavaDoc {
282         super.initialize(c);
283         if (c.isCheckpointRecover()) {
284             // If a checkpoint recover, copy old values from serialized
285
// instance into this Frontier instance. Do it this way because
286
// though its possible to serialize BdbFrontier, its currently not
287
// possible to set/remove frontier attribute plugging the
288
// deserialized object back into the settings system.
289
// The below copying over is error-prone because its easy
290
// to miss a value. Perhaps there's a better way? Introspection?
291
BdbFrontier f = null;
292             try {
293                 f = (BdbFrontier)CheckpointUtils.
294                     readObjectFromFile(this.getClass(),
295                         this.controller.getCheckpointRecover().getDirectory());
296             } catch (FileNotFoundException JavaDoc e) {
297                 throw new FatalConfigurationException("Failed checkpoint " +
298                     "recover: " + e.getMessage());
299             } catch (IOException JavaDoc e) {
300                 throw new FatalConfigurationException("Failed checkpoint " +
301                     "recover: " + e.getMessage());
302             } catch (ClassNotFoundException JavaDoc e) {
303                 throw new FatalConfigurationException("Failed checkpoint " +
304                     "recover: " + e.getMessage());
305             }
306
307             this.nextOrdinal = f.nextOrdinal;
308             this.totalProcessedBytes = f.totalProcessedBytes;
309             this.disregardedUriCount = f.disregardedUriCount;
310             this.failedFetchCount = f.failedFetchCount;
311             this.processedBytesAfterLastEmittedURI =
312                 f.processedBytesAfterLastEmittedURI;
313             this.queuedUriCount = f.queuedUriCount;
314             this.succeededFetchCount = f.succeededFetchCount;
315             this.lastMaxBandwidthKB = f.lastMaxBandwidthKB;
316             this.readyClassQueues = f.readyClassQueues;
317             this.inactiveQueues = f.inactiveQueues;
318             this.retiredQueues = f.retiredQueues;
319             this.snoozedClassQueues = f.snoozedClassQueues;
320             this.inProcessQueues = f.inProcessQueues;
321             wakeQueues();
322         }
323     }
324
325     public void crawlCheckpoint(File JavaDoc checkpointDir) throws Exception JavaDoc {
326         super.crawlCheckpoint(checkpointDir);
327         logger.fine("Started serializing already seen as part "
328             + "of checkpoint. Can take some time.");
329         // An explicit sync on the any deferred write dbs is needed to make the
330
// db recoverable. Sync'ing the environment doesn't work.
331
if (this.pendingUris != null) {
332             this.pendingUris.sync();
333         }
334         CheckpointUtils .writeObjectToFile(this.alreadyIncluded, checkpointDir);
335         logger.fine("Finished serializing already seen as part "
336             + "of checkpoint.");
337         // Serialize ourselves.
338
CheckpointUtils.writeObjectToFile(this, checkpointDir);
339     }
340 }
341
Popular Tags