KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > framework > CrawlController


/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * CrawlController.java
 * Created on May 14, 2003
 *
 * $Id: CrawlController.java,v 1.155.2.1 2007/01/13 01:31:21 stack-sf Exp $
 */

package org.archive.crawler.framework;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.EventObject;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.locks.ReentrantLock;
import java.util.logging.FileHandler;
import java.util.logging.Formatter;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.admin.CrawlJob;
import org.archive.crawler.admin.StatisticsTracker;
import org.archive.crawler.datamodel.Checkpoint;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.ServerCache;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.event.CrawlURIDispositionListener;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InitializationException;
import org.archive.crawler.io.LocalErrorFormatter;
import org.archive.crawler.io.RuntimeErrorFormatter;
import org.archive.crawler.io.StatisticsLogFormatter;
import org.archive.crawler.io.UriErrorFormatter;
import org.archive.crawler.io.UriProcessingFormatter;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.util.CheckpointUtils;
import org.archive.io.GenerationFileHandler;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.CachedBdbMap;
import org.archive.util.FileUtils;
import org.archive.util.Reporter;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;
import org.xbill.DNS.Type;

import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.je.CheckpointConfig;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DbInternal;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
import com.sleepycat.je.dbi.EnvironmentImpl;
import com.sleepycat.je.utilint.DbLsn;
/**
 * CrawlController collects all the classes which cooperate to
 * perform a crawl and provides a high-level interface to the
 * running crawl.
 *
 * As the "global context" for a crawl, subcomponents will
 * often reach each other through the CrawlController.
 *
 * @author Gordon Mohr
 */

110 public class CrawlController implements Serializable JavaDoc, Reporter {
111     // be robust against trivial implementation changes
112
private static final long serialVersionUID =
113         ArchiveUtils.classnameBasedUID(CrawlController.class,1);
114
115     /**
116      * Messages from the crawlcontroller.
117      *
118      * They appear on console.
119      */

120     private final static Logger JavaDoc LOGGER =
121         Logger.getLogger(CrawlController.class.getName());
122
123     // manifest support
124
/** abbrieviation label for config files in manifest */
125     public static final char MANIFEST_CONFIG_FILE = 'C';
126     /** abbrieviation label for report files in manifest */
127     public static final char MANIFEST_REPORT_FILE = 'R';
128     /** abbrieviation label for log files in manifest */
129     public static final char MANIFEST_LOG_FILE = 'L';
130
131     // key log names
132
private static final String JavaDoc LOGNAME_PROGRESS_STATISTICS =
133         "progress-statistics";
134     private static final String JavaDoc LOGNAME_URI_ERRORS = "uri-errors";
135     private static final String JavaDoc LOGNAME_RUNTIME_ERRORS = "runtime-errors";
136     private static final String JavaDoc LOGNAME_LOCAL_ERRORS = "local-errors";
137     private static final String JavaDoc LOGNAME_CRAWL = "crawl";
138
139     // key subcomponents which define and implement a crawl in progress
140
private transient CrawlOrder order;
141     private transient CrawlScope scope;
142     private transient ProcessorChainList processorChains;
143     
144     private transient Frontier frontier;
145
146     private transient ToePool toePool;
147     
148     private transient ServerCache serverCache;
149     
150     // This gets passed into the initialize method.
151
private transient SettingsHandler settingsHandler;
152
153
154     // Used to enable/disable single-threaded operation after OOM
155
private volatile transient boolean singleThreadMode = false;
156     private transient ReentrantLock JavaDoc singleThreadLock = null;
157
158     // emergency reserve of memory to allow some progress/reporting after OOM
159
private transient LinkedList JavaDoc<char[]> reserveMemory;
160     private static final int RESERVE_BLOCKS = 1;
161     private static final int RESERVE_BLOCK_SIZE = 6*2^20; // 6MB
162

163     // crawl state: as requested or actual
164

165     /**
166      * Crawl exit status.
167      */

168     private transient String JavaDoc sExit;
169
170     private static final Object JavaDoc NASCENT = "NASCENT".intern();
171     private static final Object JavaDoc RUNNING = "RUNNING".intern();
172     private static final Object JavaDoc PAUSED = "PAUSED".intern();
173     private static final Object JavaDoc PAUSING = "PAUSING".intern();
174     private static final Object JavaDoc CHECKPOINTING = "CHECKPOINTING".intern();
175     private static final Object JavaDoc STOPPING = "STOPPING".intern();
176     private static final Object JavaDoc FINISHED = "FINISHED".intern();
177     private static final Object JavaDoc STARTED = "STARTED".intern();
178     private static final Object JavaDoc PREPARING = "PREPARING".intern();
179
180     transient private Object JavaDoc state = NASCENT;
181
182     // disk paths
183
private transient File JavaDoc disk; // overall disk path
184
private transient File JavaDoc logsDisk; // for log files
185

186     /**
187      * For temp files representing state of crawler (eg queues)
188      */

189     private transient File JavaDoc stateDisk;
190     
191     /**
192      * For discardable temp files (eg fetch buffers).
193      */

194     private transient File JavaDoc scratchDisk;
195
196     /**
197      * Directory that holds checkpoint.
198      */

199     private transient File JavaDoc checkpointsDisk;
200     
201     /**
202      * Checkpointer.
203      * Knows if checkpoint in progress and what name of checkpoint is. Also runs
204      * checkpoints.
205      */

206     private Checkpointer checkpointer;
207     
208     /**
209      * Gets set to checkpoint we're in recovering if in checkpoint recover
210      * mode. Gets setup by {@link #getCheckpointRecover()}.
211      */

212     private transient Checkpoint checkpointRecover = null;
213
214     // crawl limits
215
private long maxBytes;
216     private long maxDocument;
217     private long maxTime;
218
219     /**
220      * A manifest of all files used/created during this crawl. Written to file
221      * at the end of the crawl (the absolutely last thing done).
222      */

223     private StringBuffer JavaDoc manifest;
224
225     /**
226      * Record of fileHandlers established for loggers,
227      * assisting file rotation.
228      */

229     transient private Map JavaDoc<Logger JavaDoc,FileHandler JavaDoc> fileHandlers;
230
231     /** suffix to use on active logs */
232     public static final String JavaDoc CURRENT_LOG_SUFFIX = ".log";
233
234     /**
235      * Crawl progress logger.
236      *
237      * No exceptions. Logs summary result of each url processing.
238      */

239     public transient Logger JavaDoc uriProcessing;
240
241     /**
242      * This logger contains unexpected runtime errors.
243      *
244      * Would contain errors trying to set up a job or failures inside
245      * processors that they are not prepared to recover from.
246      */

247     public transient Logger JavaDoc runtimeErrors;
248
249     /**
250      * This logger is for job-scoped logging, specifically errors which
251      * happen and are handled within a particular processor.
252      *
253      * Examples would be socket timeouts, exceptions thrown by extractors, etc.
254      */

255     public transient Logger JavaDoc localErrors;
256
257     /**
258      * Special log for URI format problems, wherever they may occur.
259      */

260     public transient Logger JavaDoc uriErrors;
261
262     /**
263      * Statistics tracker writes here at regular intervals.
264      */

265     private transient Logger JavaDoc progressStats;
266
267     /**
268      * Logger to hold job summary report.
269      *
270      * Large state reports made at infrequent intervals (e.g. job ending) go
271      * here.
272      */

273     public transient Logger JavaDoc reports;
274
275     protected StatisticsTracking statistics = null;
276
277     /**
278      * List of crawl status listeners.
279      *
280      * All iterations need to synchronize on this object if they're to avoid
281      * concurrent modification exceptions.
282      * See {@link java.util.Collections#synchronizedList(List)}.
283      */

284     private transient List JavaDoc<CrawlStatusListener> registeredCrawlStatusListeners =
285         Collections.synchronizedList(new ArrayList JavaDoc<CrawlStatusListener>());
286     
287     // Since there is a high probability that there will only ever by one
288
// CrawlURIDispositionListner we will use this while there is only one:
289
private transient CrawlURIDispositionListener
290         registeredCrawlURIDispositionListener;
291
292     // And then switch to the array once there is more then one.
293
protected transient ArrayList JavaDoc<CrawlURIDispositionListener>
294      registeredCrawlURIDispositionListeners;
295     
296     /** Shared bdb Environment for Frontier subcomponents */
297     // TODO: investigate using multiple environments to split disk accesses
298
// across separate physical disks
299
private transient Environment bdbEnvironment = null;
300     
301     /**
302      * Shared class catalog database. Used by the
303      * {@link #classCatalog}.
304      */

305     private transient Database classCatalogDB = null;
306     
307     /**
308      * Class catalog instance.
309      * Used by bdb serialization.
310      */

311     private transient StoredClassCatalog classCatalog = null;
312     
313     /**
314      * Keep a list of all BigMap instance made -- shouldn't be many -- so that
315      * we can checkpoint.
316      */

317     private transient Map JavaDoc<String JavaDoc,CachedBdbMap<?,?>> bigmaps = null;
318     
319     /**
320      * Default constructor
321      */

322     public CrawlController() {
323         super();
324         // Defer most setup to initialize methods
325
}
326
327     /**
328      * Starting from nothing, set up CrawlController and associated
329      * classes to be ready for a first crawl.
330      *
331      * @param sH Settings handler.
332      * @throws InitializationException
333      */

334     public void initialize(SettingsHandler sH)
335     throws InitializationException {
336         sendCrawlStateChangeEvent(PREPARING, CrawlJob.STATUS_PREPARING);
337
338         this.singleThreadLock = new ReentrantLock JavaDoc();
339         this.settingsHandler = sH;
340         this.order = settingsHandler.getOrder();
341         this.order.setController(this);
342         this.bigmaps = new Hashtable JavaDoc<String JavaDoc,CachedBdbMap<?,?>>();
343         sExit = "";
344         this.manifest = new StringBuffer JavaDoc();
345         String JavaDoc onFailMessage = "";
346         try {
347             onFailMessage = "You must set the User-Agent and From HTTP" +
348             " header values to acceptable strings. \n" +
349             " User-Agent: [software-name](+[info-url])[misc]\n" +
350             " From: [email-address]\n";
351             order.checkUserAgentAndFrom();
352
353             onFailMessage = "Unable to setup disk";
354             if (disk == null) {
355                 setupDisk();
356             }
357
358             onFailMessage = "Unable to create log file(s)";
359             setupLogs();
360             
361             // Figure if we're to do a checkpoint restore. If so, get the
362
// checkpointRecover instance and then put into place the old bdb
363
// log files. If any of the log files already exist in target state
364
// diretory, WE DO NOT OVERWRITE (Makes for faster recovery).
365
// CrawlController checkpoint recovery code manages restoration of
366
// the old StatisticsTracker, any BigMaps used by the Crawler and
367
// the moving of bdb log files into place only. Other objects
368
// interested in recovery need to ask if
369
// CrawlController#isCheckpointRecover is set to figure if in
370
// recovery and then take appropriate recovery action
371
// (These objects can call CrawlController#getCheckpointRecover
372
// to get the directory that might hold files/objects dropped
373
// checkpointing). Such objects will need to use a technique other
374
// than object serialization restoring settings because they'll
375
// have already been constructed when comes time for object to ask
376
// if its to recover itself. See ARCWriterProcessor for example.
377
onFailMessage = "Unable to test/run checkpoint recover";
378             this.checkpointRecover = getCheckpointRecover();
379             if (this.checkpointRecover == null) {
380                 this.checkpointer =
381                     new Checkpointer(this, this.checkpointsDisk);
382             } else {
383                 setupCheckpointRecover();
384             }
385             
386             onFailMessage = "Unable to setup bdb environment.";
387             setupBdb();
388             
389             onFailMessage = "Unable to setup statistics";
390             setupStatTracking();
391             
392             onFailMessage = "Unable to setup crawl modules";
393             setupCrawlModules();
394         } catch (Exception JavaDoc e) {
395             String JavaDoc tmp = "On crawl: "
396                 + settingsHandler.getSettingsObject(null).getName() + " " +
397                 onFailMessage;
398             LOGGER.log(Level.SEVERE, tmp, e);
399             throw new InitializationException(tmp, e);
400         }
401
402         // force creation of DNS Cache now -- avoids CacheCleaner in toe-threads group
403
// also cap size at 1 (we never wanta cached value; 0 is non-operative)
404
Lookup.getDefaultCache(DClass.IN).setMaxEntries(1);
405         //dns.getRecords("localhost", Type.A, DClass.IN);
406

407         setupToePool();
408         setThresholds();
409         
410         reserveMemory = new LinkedList JavaDoc<char[]>();
411         for(int i = 1; i < RESERVE_BLOCKS; i++) {
412             reserveMemory.add(new char[RESERVE_BLOCK_SIZE]);
413         }
414     }
415     
416     /**
417      * Does setup of checkpoint recover.
418      * Copies bdb log files into state dir.
419      * @throws IOException
420      */

421     protected void setupCheckpointRecover()
422     throws IOException JavaDoc {
423         long started = System.currentTimeMillis();;
424         if (LOGGER.isLoggable(Level.FINE)) {
425             LOGGER.fine("Starting recovery setup -- copying into place " +
426                 "bdbje log files -- for checkpoint named " +
427                 this.checkpointRecover.getDisplayName());
428         }
429         // Mark context we're in a recovery.
430
this.checkpointer.recover(this);
431         this.progressStats.info("CHECKPOINT RECOVER " +
432             this.checkpointRecover.getDisplayName());
433         // Copy the bdb log files to the state dir so we don't damage
434
// old checkpoint. If thousands of log files, can take
435
// tens of minutes (1000 logs takes ~5 minutes to java copy,
436
// dependent upon hardware). If log file already exists over in the
437
// target state directory, we do not overwrite -- we assume the log
438
// file in the target same as one we'd copy from the checkpoint dir.
439
File JavaDoc bdbSubDir = CheckpointUtils.
440             getBdbSubDirectory(this.checkpointRecover.getDirectory());
441         FileUtils.copyFiles(bdbSubDir, CheckpointUtils.getJeLogsFilter(),
442             getStateDisk(), true,
443             false);
444         if (LOGGER.isLoggable(Level.INFO)) {
445             LOGGER.info("Finished recovery setup for checkpoint named " +
446                 this.checkpointRecover.getDisplayName() + " in " +
447                 (System.currentTimeMillis() - started) + "ms.");
448         }
449     }
450     
451     protected boolean getCheckpointCopyBdbjeLogs() {
452         return ((Boolean JavaDoc)this.order.getUncheckedAttribute(null,
453             CrawlOrder.ATTR_CHECKPOINT_COPY_BDBJE_LOGS)).booleanValue();
454     }
455     
456     private void setupBdb()
457     throws FatalConfigurationException, AttributeNotFoundException JavaDoc {
458         EnvironmentConfig envConfig = new EnvironmentConfig();
459         envConfig.setAllowCreate(true);
460         int bdbCachePercent = ((Integer JavaDoc)this.order.
461             getAttribute(null, CrawlOrder.ATTR_BDB_CACHE_PERCENT)).intValue();
462         if(bdbCachePercent > 0) {
463             // Operator has expressed a preference; override BDB default or
464
// je.properties value
465
envConfig.setCachePercent(bdbCachePercent);
466         }
467         envConfig.setLockTimeout(5000000); // 5 seconds
468
if (LOGGER.isLoggable(Level.FINEST)) {
469             envConfig.setConfigParam("java.util.logging.level", "SEVERE");
470             envConfig.setConfigParam("java.util.logging.level.evictor",
471                 "SEVERE");
472             envConfig.setConfigParam("java.util.logging.ConsoleHandler.on",
473                 "true");
474         }
475
476         if (!getCheckpointCopyBdbjeLogs()) {
477             // If we are not copying files on checkpoint, then set bdbje to not
478
// remove its log files so that its possible to later assemble
479
// (manually) all needed to run a recovery using mix of current
480
// bdbje logs and those its marked for deletion.
481
envConfig.setConfigParam("je.cleaner.expunge", "false");
482         }
483                 
484         try {
485             this.bdbEnvironment = new Environment(getStateDisk(), envConfig);
486             if (LOGGER.isLoggable(Level.FINE)) {
487                 // Write out the bdb configuration.
488
envConfig = bdbEnvironment.getConfig();
489                 LOGGER.fine("BdbConfiguration: Cache percentage " +
490                     envConfig.getCachePercent() +
491                     ", cache size " + envConfig.getCacheSize());
492             }
493             // Open the class catalog database. Create it if it does not
494
// already exist.
495
DatabaseConfig dbConfig = new DatabaseConfig();
496             dbConfig.setAllowCreate(true);
497             this.classCatalogDB = this.bdbEnvironment.
498                 openDatabase(null, "classes", dbConfig);
499             this.classCatalog = new StoredClassCatalog(classCatalogDB);
500         } catch (DatabaseException e) {
501             e.printStackTrace();
502             throw new FatalConfigurationException(e.getMessage());
503         }
504     }
505     
506     public Environment getBdbEnvironment() {
507         return this.bdbEnvironment;
508     }
509     
510     public StoredClassCatalog getClassCatalog() {
511         return this.classCatalog;
512     }
513
514     /**
515      * Register for CrawlStatus events.
516      *
517      * @param cl a class implementing the CrawlStatusListener interface
518      *
519      * @see CrawlStatusListener
520      */

521     public void addCrawlStatusListener(CrawlStatusListener cl) {
522         synchronized (this.registeredCrawlStatusListeners) {
523             this.registeredCrawlStatusListeners.add(cl);
524         }
525     }
526
527     /**
528      * Register for CrawlURIDisposition events.
529      *
530      * @param cl a class implementing the CrawlURIDispostionListener interface
531      *
532      * @see CrawlURIDispositionListener
533      */

534     public void addCrawlURIDispositionListener(CrawlURIDispositionListener cl) {
535         registeredCrawlURIDispositionListener = null;
536         if (registeredCrawlURIDispositionListeners == null) {
537             // First listener;
538
registeredCrawlURIDispositionListener = cl;
539             //Only used for the first one while it is the only one.
540
registeredCrawlURIDispositionListeners
541              = new ArrayList JavaDoc<CrawlURIDispositionListener>(1);
542             //We expect it to be very small.
543
}
544         registeredCrawlURIDispositionListeners.add(cl);
545     }
546
547     /**
548      * Allows an external class to raise a CrawlURIDispostion
549      * crawledURISuccessful event that will be broadcast to all listeners that
550      * have registered with the CrawlController.
551      *
552      * @param curi - The CrawlURI that will be sent with the event notification.
553      *
554      * @see CrawlURIDispositionListener#crawledURISuccessful(CrawlURI)
555      */

556     public void fireCrawledURISuccessfulEvent(CrawlURI curi) {
557         if (registeredCrawlURIDispositionListener != null) {
558             // Then we'll just use that.
559
registeredCrawlURIDispositionListener.crawledURISuccessful(curi);
560         } else {
561             // Go through the list.
562
if (registeredCrawlURIDispositionListeners != null
563                 && registeredCrawlURIDispositionListeners.size() > 0) {
564                 Iterator JavaDoc it = registeredCrawlURIDispositionListeners.iterator();
565                 while (it.hasNext()) {
566                     (
567                         (CrawlURIDispositionListener) it
568                             .next())
569                             .crawledURISuccessful(
570                         curi);
571                 }
572             }
573         }
574     }
575
576     /**
577      * Allows an external class to raise a CrawlURIDispostion
578      * crawledURINeedRetry event that will be broadcast to all listeners that
579      * have registered with the CrawlController.
580      *
581      * @param curi - The CrawlURI that will be sent with the event notification.
582      *
583      * @see CrawlURIDispositionListener#crawledURINeedRetry(CrawlURI)
584      */

585     public void fireCrawledURINeedRetryEvent(CrawlURI curi) {
586         if (registeredCrawlURIDispositionListener != null) {
587             // Then we'll just use that.
588
registeredCrawlURIDispositionListener.crawledURINeedRetry(curi);
589             return;
590         }
591         
592         // Go through the list.
593
if (registeredCrawlURIDispositionListeners != null
594                 && registeredCrawlURIDispositionListeners.size() > 0) {
595             for (Iterator JavaDoc i = registeredCrawlURIDispositionListeners.iterator();
596                     i.hasNext();) {
597                 ((CrawlURIDispositionListener)i.next()).crawledURINeedRetry(curi);
598             }
599         }
600     }
601
602     /**
603      * Allows an external class to raise a CrawlURIDispostion
604      * crawledURIDisregard event that will be broadcast to all listeners that
605      * have registered with the CrawlController.
606      *
607      * @param curi -
608      * The CrawlURI that will be sent with the event notification.
609      *
610      * @see CrawlURIDispositionListener#crawledURIDisregard(CrawlURI)
611      */

612     public void fireCrawledURIDisregardEvent(CrawlURI curi) {
613         if (registeredCrawlURIDispositionListener != null) {
614             // Then we'll just use that.
615
registeredCrawlURIDispositionListener.crawledURIDisregard(curi);
616         } else {
617             // Go through the list.
618
if (registeredCrawlURIDispositionListeners != null
619                 && registeredCrawlURIDispositionListeners.size() > 0) {
620                 Iterator JavaDoc it = registeredCrawlURIDispositionListeners.iterator();
621                 while (it.hasNext()) {
622                     (
623                         (CrawlURIDispositionListener) it
624                             .next())
625                             .crawledURIDisregard(
626                         curi);
627                 }
628             }
629         }
630     }
631
632     /**
633      * Allows an external class to raise a CrawlURIDispostion crawledURIFailure event
634      * that will be broadcast to all listeners that have registered with the CrawlController.
635      *
636      * @param curi - The CrawlURI that will be sent with the event notification.
637      *
638      * @see CrawlURIDispositionListener#crawledURIFailure(CrawlURI)
639      */

640     public void fireCrawledURIFailureEvent(CrawlURI curi) {
641         if (registeredCrawlURIDispositionListener != null) {
642             // Then we'll just use that.
643
registeredCrawlURIDispositionListener.crawledURIFailure(curi);
644         } else {
645             // Go through the list.
646
if (registeredCrawlURIDispositionListeners != null
647                 && registeredCrawlURIDispositionListeners.size() > 0) {
648                 Iterator JavaDoc it = registeredCrawlURIDispositionListeners.iterator();
649                 while (it.hasNext()) {
650                     ((CrawlURIDispositionListener)it.next())
651                         .crawledURIFailure(curi);
652                 }
653             }
654         }
655     }
656
    /**
     * Instantiate and initialize the core crawl modules from the crawl
     * order: scope, server cache, frontier (with optional recover-log
     * replay) and the processor chains. Modules that are already non-null
     * (e.g. restored during checkpoint recovery) are left untouched.
     *
     * @throws FatalConfigurationException if the ServerCache or frontier
     * cannot be set up.
     * @throws AttributeNotFoundException if a required order attribute is
     * missing.
     * @throws MBeanException
     * @throws ReflectionException
     */
    private void setupCrawlModules() throws FatalConfigurationException,
             AttributeNotFoundException, MBeanException, ReflectionException {
        if (scope == null) {
            scope = (CrawlScope) order.getAttribute(CrawlScope.ATTR_NAME);
            scope.initialize(this);
        }
        try {
            this.serverCache = new ServerCache(this);
        } catch (Exception e) {
            throw new FatalConfigurationException("Unable to" +
               " initialize frontier (Failed setup of ServerCache) " + e);
        }

        if (this.frontier == null) {
            this.frontier = (Frontier)order.getAttribute(Frontier.ATTR_NAME);
            try {
                frontier.initialize(this);
                frontier.pause(); // Pause until begun
                // Run recovery if recoverPath points to a file (If it points
                // to a directory, its a checkpoint recovery).
                // TODO: make recover path relative to job root dir.
                if (!isCheckpointRecover()) {
                    runFrontierRecover((String)order.
                        getAttribute(CrawlOrder.ATTR_RECOVER_PATH));
                }
            } catch (IOException e) {
                throw new FatalConfigurationException(
                    "unable to initialize frontier: " + e);
            }
        }

        // Setup processors
        if (processorChains == null) {
            processorChains = new ProcessorChainList(order);
        }
    }
693     
694     protected void runFrontierRecover(String JavaDoc recoverPath)
695             throws AttributeNotFoundException JavaDoc, MBeanException JavaDoc,
696             ReflectionException JavaDoc, FatalConfigurationException {
697         if (recoverPath == null || recoverPath.length() <= 0) {
698             return;
699         }
700         File JavaDoc f = new File JavaDoc(recoverPath);
701         if (!f.exists()) {
702             LOGGER.severe("Recover file does not exist " + recoverPath);
703             return;
704         }
705         if (!f.isFile()) {
706             // Its a directory if supposed to be doing a checkpoint recover.
707
return;
708         }
709         boolean retainFailures = ((Boolean JavaDoc)order.
710           getAttribute(CrawlOrder.ATTR_RECOVER_RETAIN_FAILURES)).booleanValue();
711         try {
712             frontier.importRecoverLog(recoverPath, retainFailures);
713         } catch (IOException JavaDoc e) {
714             e.printStackTrace();
715             throw (FatalConfigurationException) new FatalConfigurationException(
716                 "Recover.log " + recoverPath + " problem: " + e).initCause(e);
717         }
718     }
719
    /**
     * Resolve and create the on-disk directories the crawl writes into:
     * the main working directory plus the logs, checkpoints, state and
     * scratch subdirectories named in the crawl order.
     *
     * @throws AttributeNotFoundException if a path setting is missing.
     */
    private void setupDisk() throws AttributeNotFoundException {
        String diskPath
            = (String) order.getAttribute(null, CrawlOrder.ATTR_DISK_PATH);
        this.disk = getSettingsHandler().
            getPathRelativeToWorkingDirectory(diskPath);
        this.disk.mkdirs();
        this.logsDisk = getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
        this.checkpointsDisk = getSettingsDir(CrawlOrder.ATTR_CHECKPOINTS_PATH);
        this.stateDisk = getSettingsDir(CrawlOrder.ATTR_STATE_PATH);
        this.scratchDisk = getSettingsDir(CrawlOrder.ATTR_SCRATCH_PATH);
    }
731     
732     /**
733      * @return The logging directory or null if problem reading the settings.
734      */

735     public File JavaDoc getLogsDir() {
736         File JavaDoc f = null;
737         try {
738             f = getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
739         } catch (AttributeNotFoundException JavaDoc e) {
740             LOGGER.severe("Failed get of logs directory: " + e.getMessage());
741         }
742         return f;
743     }
744     
745     /**
746      * Return fullpath to the directory named by <code>key</code>
747      * in settings.
748      * If directory does not exist, it and all intermediary dirs
749      * will be created.
750      * @param key Key to use going to settings.
751      * @return Full path to directory named by <code>key</code>.
752      * @throws AttributeNotFoundException
753      */

754     public File JavaDoc getSettingsDir(String JavaDoc key)
755     throws AttributeNotFoundException JavaDoc {
756         String JavaDoc path = (String JavaDoc)order.getAttribute(null, key);
757         File JavaDoc f = new File JavaDoc(path);
758         if (!f.isAbsolute()) {
759             f = new File JavaDoc(disk.getPath(), path);
760         }
761         if (!f.exists()) {
762             f.mkdirs();
763         }
764         return f;
765     }
766
    /**
     * Setup the statistics tracker.
     * The statistics object must be created before modules can use it.
     * Do it here now so that when modules retrieve the object from the
     * controller during initialization (which some do), its in place.
     * @throws InvalidAttributeValueException
     * @throws FatalConfigurationException
     */
    private void setupStatTracking()
    throws InvalidAttributeValueException, FatalConfigurationException {
        MapType loggers = order.getLoggers();
        final String cstName = "crawl-statistics";
        if (loggers.isEmpty(null)) {
            // No trackers configured in the order: install a default
            // StatisticsTracker, unless one was already restored from a
            // checkpoint (statistics != null on checkpoint recovery).
            if (!isCheckpointRecover() && this.statistics == null) {
                this.statistics = new StatisticsTracker(cstName);
            }
            loggers.addElement(null, (StatisticsTracker)this.statistics);
        }

        if (isCheckpointRecover()) {
            // Swap the configured tracker for the deserialized one.
            restoreStatisticsTracker(loggers, cstName);
        }

        for (Iterator it = loggers.iterator(null); it.hasNext();) {
            StatisticsTracking tracker = (StatisticsTracking)it.next();
            tracker.initialize(this);
            // First tracker in the list becomes the controller's default.
            if (this.statistics == null) {
                this.statistics = tracker;
            }
        }
    }
798     
    /**
     * On checkpoint recovery, replace the tracker configured in the
     * settings system with the deserialized one held in
     * {@code this.statistics}.
     *
     * @param loggers Loggers map from the crawl order.
     * @param replaceName Name of the settings element to replace.
     * @throws FatalConfigurationException wrapping any failure.
     */
    protected void restoreStatisticsTracker(MapType loggers,
        String replaceName)
    throws FatalConfigurationException {
        try {
            // Add the deserialized statstracker to the settings system.
            loggers.removeElement(loggers.globalSettings(), replaceName);
            loggers.addElement(loggers.globalSettings(),
                (StatisticsTracker)this.statistics);
         } catch (Exception e) {
             throw convertToFatalConfigurationException(e);
         }
    }
811     
812     protected FatalConfigurationException
813             convertToFatalConfigurationException(Exception JavaDoc e) {
814         FatalConfigurationException fce =
815             new FatalConfigurationException("Converted exception: " +
816                e.getMessage());
817         fce.setStackTrace(e.getStackTrace());
818         return fce;
819     }
820
821     private void setupLogs() throws IOException JavaDoc {
822         String JavaDoc logsPath = logsDisk.getAbsolutePath() + File.separatorChar;
823         uriProcessing = Logger.getLogger(LOGNAME_CRAWL + "." + logsPath);
824         runtimeErrors = Logger.getLogger(LOGNAME_RUNTIME_ERRORS + "." +
825             logsPath);
826         localErrors = Logger.getLogger(LOGNAME_LOCAL_ERRORS + "." + logsPath);
827         uriErrors = Logger.getLogger(LOGNAME_URI_ERRORS + "." + logsPath);
828         progressStats = Logger.getLogger(LOGNAME_PROGRESS_STATISTICS + "." +
829             logsPath);
830
831         this.fileHandlers = new HashMap JavaDoc<Logger JavaDoc,FileHandler JavaDoc>();
832
833         setupLogFile(uriProcessing,
834             logsPath + LOGNAME_CRAWL + CURRENT_LOG_SUFFIX,
835             new UriProcessingFormatter(), true);
836
837         setupLogFile(runtimeErrors,
838             logsPath + LOGNAME_RUNTIME_ERRORS + CURRENT_LOG_SUFFIX,
839             new RuntimeErrorFormatter(), true);
840
841         setupLogFile(localErrors,
842             logsPath + LOGNAME_LOCAL_ERRORS + CURRENT_LOG_SUFFIX,
843             new LocalErrorFormatter(), true);
844
845         setupLogFile(uriErrors,
846             logsPath + LOGNAME_URI_ERRORS + CURRENT_LOG_SUFFIX,
847             new UriErrorFormatter(), true);
848
849         setupLogFile(progressStats,
850             logsPath + LOGNAME_PROGRESS_STATISTICS + CURRENT_LOG_SUFFIX,
851             new StatisticsLogFormatter(), true);
852
853     }
854
855     private void setupLogFile(Logger JavaDoc logger, String JavaDoc filename, Formatter JavaDoc f,
856             boolean shouldManifest) throws IOException JavaDoc, SecurityException JavaDoc {
857         GenerationFileHandler fh = new GenerationFileHandler(filename, true,
858             shouldManifest);
859         fh.setFormatter(f);
860         logger.addHandler(fh);
861         addToManifest(filename, MANIFEST_LOG_FILE, shouldManifest);
862         logger.setUseParentHandlers(false);
863         this.fileHandlers.put(logger, fh);
864     }
865     
866     protected void rotateLogFiles(String JavaDoc generationSuffix)
867     throws IOException JavaDoc {
868         if (this.state != PAUSED && this.state != CHECKPOINTING) {
869             throw new IllegalStateException JavaDoc("Pause crawl before requesting " +
870                 "log rotation.");
871         }
872         for (Iterator JavaDoc i = fileHandlers.keySet().iterator(); i.hasNext();) {
873             Logger JavaDoc l = (Logger JavaDoc)i.next();
874             GenerationFileHandler gfh =
875                 (GenerationFileHandler)fileHandlers.get(l);
876             GenerationFileHandler newGfh =
877                 gfh.rotate(generationSuffix, CURRENT_LOG_SUFFIX);
878             if (gfh.shouldManifest()) {
879                 addToManifest((String JavaDoc) newGfh.getFilenameSeries().get(1),
880                     MANIFEST_LOG_FILE, newGfh.shouldManifest());
881             }
882             l.removeHandler(gfh);
883             l.addHandler(newGfh);
884             fileHandlers.put(l, newGfh);
885         }
886     }
887
888     /**
889      * Close all log files and remove handlers from loggers.
890      */

891     public void closeLogFiles() {
892        for (Iterator JavaDoc i = fileHandlers.keySet().iterator(); i.hasNext();) {
893             Logger JavaDoc l = (Logger JavaDoc)i.next();
894             GenerationFileHandler gfh =
895                 (GenerationFileHandler)fileHandlers.get(l);
896             gfh.close();
897             l.removeHandler(gfh);
898         }
899     }
900
901     /**
902      * Sets the values for max bytes, docs and time based on crawl order.
903      */

904     private void setThresholds() {
905         try {
906             maxBytes =
907                 ((Long JavaDoc) order.getAttribute(CrawlOrder.ATTR_MAX_BYTES_DOWNLOAD))
908                     .longValue();
909         } catch (Exception JavaDoc e) {
910             maxBytes = 0;
911         }
912         try {
913             maxDocument =
914                 ((Long JavaDoc) order
915                     .getAttribute(CrawlOrder.ATTR_MAX_DOCUMENT_DOWNLOAD))
916                     .longValue();
917         } catch (Exception JavaDoc e) {
918             maxDocument = 0;
919         }
920         try {
921             maxTime =
922                 ((Long JavaDoc) order.getAttribute(CrawlOrder.ATTR_MAX_TIME_SEC))
923                     .longValue();
924         } catch (Exception JavaDoc e) {
925             maxTime = 0;
926         }
927     }
928
929     /**
930      * @return Object this controller is using to track crawl statistics
931      */

932     public StatisticsTracking getStatistics() {
933         return statistics==null ?
934             new StatisticsTracker("crawl-statistics"): this.statistics;
935     }
936     
    /**
     * Send crawl change event to all listeners.
     * Updates {@code this.state} and then, still holding the listener-list
     * lock, invokes on each registered CrawlStatusListener the callback
     * matching the new state.
     * @param newState State change we're to tell listeners' about.
     * @param message Message on state change.
     * @see #sendCheckpointEvent(File) for special case event sending
     * telling listeners to checkpoint.
     */
    protected void sendCrawlStateChangeEvent(Object newState, String message) {
        synchronized (this.registeredCrawlStatusListeners) {
            this.state = newState;
            for (Iterator i = this.registeredCrawlStatusListeners.iterator();
                    i.hasNext();) {
                CrawlStatusListener l = (CrawlStatusListener)i.next();
                // Dispatch to the callback matching the new state.
                if (newState.equals(PAUSED)) {
                   l.crawlPaused(message);
                } else if (newState.equals(RUNNING)) {
                    l.crawlResuming(message);
                } else if (newState.equals(PAUSING)) {
                   l.crawlPausing(message);
                } else if (newState.equals(STARTED)) {
                    l.crawlStarted(message);
                } else if (newState.equals(STOPPING)) {
                    l.crawlEnding(message);
                } else if (newState.equals(FINISHED)) {
                    l.crawlEnded(message);
                } else if (newState.equals(PREPARING)) {
                    // PREPARING is reported to listeners as a resume.
                    l.crawlResuming(message);
                } else {
                    throw new RuntimeException("Unknown state: " + newState);
                }
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine("Sent " + newState + " to " + l);
                }
            }
            LOGGER.fine("Sent " + newState);
        }
    }
974     
    /**
     * Send the checkpoint event.
     * Has its own method apart from
     * {@link #sendCrawlStateChangeEvent(Object, String)} because checkpointing
     * throws an Exception (Didn't want to have to wrap all of the
     * sendCrawlStateChangeEvent in try/catches).
     * @param checkpointDir Where to write checkpoint state to.
     * @throws Exception propagated from any listener's crawlCheckpoint.
     * @throws IllegalStateException if the crawl is not currently PAUSED.
     */
    protected void sendCheckpointEvent(File checkpointDir) throws Exception {
        synchronized (this.registeredCrawlStatusListeners) {
            if (this.state != PAUSED) {
                throw new IllegalStateException("Crawler must be completly " +
                    "paused before checkpointing can start");
            }
            this.state = CHECKPOINTING;
            for (Iterator i = this.registeredCrawlStatusListeners.iterator();
                    i.hasNext();) {
                CrawlStatusListener l = (CrawlStatusListener)i.next();
                // Each listener persists its own state into checkpointDir.
                l.crawlCheckpoint(checkpointDir);
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine("Sent " + CHECKPOINTING + " to " + l);
                }
            }
            LOGGER.fine("Sent " + CHECKPOINTING);
        }
    }
1002
1003    /**
1004     * Operator requested crawl begin
1005     */

1006    public void requestCrawlStart() {
1007        runProcessorInitialTasks();
1008
1009        sendCrawlStateChangeEvent(STARTED, CrawlJob.STATUS_PENDING);
1010        String JavaDoc jobState;
1011        state = RUNNING;
1012        jobState = CrawlJob.STATUS_RUNNING;
1013        sendCrawlStateChangeEvent(this.state, jobState);
1014
1015        // A proper exit will change this value.
1016
this.sExit = CrawlJob.STATUS_FINISHED_ABNORMAL;
1017        
1018        Thread JavaDoc statLogger = new Thread JavaDoc(statistics);
1019        statLogger.setName("StatLogger");
1020        statLogger.start();
1021        
1022        frontier.start();
1023    }
1024
    /**
     * Called when the last toethread exits.
     * Runs processors' final tasks, broadcasts FINISHED, then tears down
     * and nulls every module/resource the controller holds so the crawl
     * can be garbage collected. Order matters: listeners are notified
     * before they are dropped, and logs are closed before their handler
     * references are released.
     */
    protected void completeStop() {
        LOGGER.fine("Entered complete stop.");
        // Run processors' final tasks
        runProcessorFinalTasks();
        // Ok, now we are ready to exit.
        sendCrawlStateChangeEvent(FINISHED, this.sExit);
        synchronized (this.registeredCrawlStatusListeners) {
            // Remove all listeners now we're done with them.
            this.registeredCrawlStatusListeners.
                removeAll(this.registeredCrawlStatusListeners);
            this.registeredCrawlStatusListeners = null;
        }

        closeLogFiles();

        // Release reference to logger file handler instances.
        this.fileHandlers = null;
        this.uriErrors = null;
        this.uriProcessing = null;
        this.localErrors = null;
        this.runtimeErrors = null;
        this.progressStats = null;
        this.reports = null;
        this.manifest = null;

        // Do cleanup.
        this.statistics = null;
        this.frontier = null;
        this.disk = null;
        this.scratchDisk = null;
        this.order = null;
        this.scope = null;
        if (this.settingsHandler != null) {
            this.settingsHandler.cleanup();
        }
        this.settingsHandler = null;
        this.reserveMemory = null;
        this.processorChains = null;
        if (this.serverCache != null) {
            this.serverCache.cleanup();
            this.serverCache = null;
        }
        if (this.checkpointer != null) {
            this.checkpointer.cleanup();
            this.checkpointer = null;
        }
        if (this.classCatalogDB != null) {
            try {
                this.classCatalogDB.close();
            } catch (DatabaseException e) {
                // Best-effort close; crawl is over, just report it.
                e.printStackTrace();
            }
            this.classCatalogDB = null;
        }
        if (this.bdbEnvironment != null) {
            try {
                // Flush buffered writes before closing the environment.
                this.bdbEnvironment.sync();
                this.bdbEnvironment.close();
            } catch (DatabaseException e) {
                e.printStackTrace();
            }
            this.bdbEnvironment = null;
        }
        this.bigmaps = null;
        if (this.toePool != null) {
            this.toePool.cleanup();
            // I played with launching a thread here to do cleanup of the
            // ToePool ThreadGroup (making sure the cleanup thread was not
            // in the ToePool ThreadGroup). Did this because ToePools seemed
            // to be sticking around holding references to CrawlController at
            // least. Need to spend more time looking to see that this is
            // still the case even after adding the above toePool#cleanup call.
        }
        this.toePool = null;
        LOGGER.fine("Finished crawl.");
    }
1104    
    /**
     * Marks the transition into the fully-paused state: wakes any thread
     * waiting on this controller's monitor, then broadcasts PAUSED.
     */
    synchronized void completePause() {
        // Send a notifyAll. At least checkpointing thread may be waiting on a
        // complete pause.
        notifyAll();
        sendCrawlStateChangeEvent(PAUSED, CrawlJob.STATUS_PAUSED);
    }
1111
1112    private boolean shouldContinueCrawling() {
1113        if (frontier.isEmpty()) {
1114            this.sExit = CrawlJob.STATUS_FINISHED;
1115            return false;
1116        }
1117
1118        if (maxBytes > 0 && frontier.totalBytesWritten() >= maxBytes) {
1119            // Hit the max byte download limit!
1120
sExit = CrawlJob.STATUS_FINISHED_DATA_LIMIT;
1121            return false;
1122        } else if (maxDocument > 0
1123                && frontier.succeededFetchCount() >= maxDocument) {
1124            // Hit the max document download limit!
1125
this.sExit = CrawlJob.STATUS_FINISHED_DOCUMENT_LIMIT;
1126            return false;
1127        } else if (maxTime > 0 &&
1128                statistics.crawlDuration() >= maxTime * 1000) {
1129            // Hit the max byte download limit!
1130
this.sExit = CrawlJob.STATUS_FINISHED_TIME_LIMIT;
1131            return false;
1132        }
1133        return state == RUNNING;
1134    }
1135
    /**
     * Request a checkpoint.
     * Sets a checkpointing thread running. Silently a no-op when no
     * checkpointer is configured.
     * @throws IllegalStateException Thrown if crawl is not in paused state
     * (Crawl must be first paused before checkpointing) or if a checkpoint
     * is already in progress.
     */
    public synchronized void requestCrawlCheckpoint()
    throws IllegalStateException {
        if (this.checkpointer == null) {
            return;
        }
        if (this.checkpointer.isCheckpointing()) {
            throw new IllegalStateException("Checkpoint already running.");
        }
        this.checkpointer.checkpoint();
    }
1152    
    /**
     * @return True if checkpointing (identity comparison of the shared
     * CHECKPOINTING state token).
     */
    public boolean isCheckpointing() {
        return this.state == CHECKPOINTING;
    }
1159    
    /**
     * Run checkpointing.
     * CrawlController takes care of managing the checkpointing/serializing
     * of bdb, the StatisticsTracker, and the CheckpointContext. Other
     * modules that want to revive themselves on checkpoint recovery need to
     * save state during their {@link CrawlStatusListener#crawlCheckpoint(File)}
     * invocation and then in their #initialize if a module,
     * or in their #initialTask if a processor, check with the CrawlController
     * if its checkpoint recovery. If it is, read in their old state from the
     * pointed to checkpoint directory.
     * <p>Default access only to be called by Checkpointer.
     * @throws Exception from any of the checkpoint steps; state sequence
     * below is order-dependent.
     */
    void checkpoint()
    throws Exception {
        // Tell registered listeners to checkpoint (also moves state from
        // PAUSED to CHECKPOINTING — see sendCheckpointEvent).
        sendCheckpointEvent(this.checkpointer.
            getCheckpointInProgressDirectory());

        // Rotate off crawler logs.
        LOGGER.fine("Rotating log files.");
        rotateLogFiles(CURRENT_LOG_SUFFIX + "." +
            this.checkpointer.getNextCheckpointName());

        // Sync the BigMap contents to bdb, if their bdb bigmaps.
        LOGGER.fine("BigMaps.");
        checkpointBigMaps(this.checkpointer.getCheckpointInProgressDirectory());

        // Note, on deserialization, the super CrawlType#parent
        // needs to be restored. Parent is '/crawl-order/loggers'.
        // The settings handler for this module also needs to be
        // restored. Both of these fields are private in the
        // super class. Adding the restored ST to crawl order should take
        // care of this.

        // Checkpoint bdb environment.
        LOGGER.fine("Bdb environment.");
        checkpointBdb(this.checkpointer.getCheckpointInProgressDirectory());

        // Make copy of order, seeds, and settings.
        LOGGER.fine("Copying settings.");
        copySettings(this.checkpointer.getCheckpointInProgressDirectory());

        // Checkpoint this crawlcontroller.
        CheckpointUtils.writeObjectToFile(this,
            this.checkpointer.getCheckpointInProgressDirectory());
    }
1207    
1208    /**
1209     * Copy off the settings.
1210     * @param checkpointDir Directory to write checkpoint to.
1211     * @throws IOException
1212     */

1213    protected void copySettings(final File JavaDoc checkpointDir) throws IOException JavaDoc {
1214        final List JavaDoc files = this.settingsHandler.getListOfAllFiles();
1215        boolean copiedSettingsDir = false;
1216        final File JavaDoc settingsDir = new File JavaDoc(this.disk, "settings");
1217        for (final Iterator JavaDoc i = files.iterator(); i.hasNext();) {
1218            File JavaDoc f = new File JavaDoc((String JavaDoc)i.next());
1219            if (f.getAbsolutePath().startsWith(settingsDir.getAbsolutePath())) {
1220                if (copiedSettingsDir) {
1221                    // Skip. We've already copied this member of the
1222
// settings directory.
1223
continue;
1224                }
1225                // Copy 'settings' dir all in one lump, not a file at a time.
1226
copiedSettingsDir = true;
1227                FileUtils.copyFiles(settingsDir,
1228                    new File JavaDoc(checkpointDir, settingsDir.getName()));
1229                continue;
1230            }
1231            FileUtils.copyFiles(f, f.isDirectory()? checkpointDir:
1232                new File JavaDoc(checkpointDir, f.getName()));
1233        }
1234    }
1235    
    /**
     * Checkpoint bdb.
     * I used do a call to log cleaning as suggested in je-2.0 javadoc but takes
     * way too much time (20minutes for a crawl of 1million items). Assume
     * cleaner is keeping up. Below was log cleaning loop.
     * <pre>int totalCleaned = 0;
     * for (int cleaned = 0; (cleaned = this.bdbEnvironment.cleanLog()) != 0;
     * totalCleaned += cleaned) {
     * LOGGER.fine("Cleaned " + cleaned + " log files.");
     * }
     * </pre>
     * <p>I also used to do a sync. But, from Mark Hayes, sync and checkpoint
     * are effectively same thing only sync is not configurable. He suggests
     * doing one or the other:
     * <p>MS: Reading code, Environment.sync() is a checkpoint. Looks like
     * I don't need to call a checkpoint after calling a sync?
     * <p>MH: Right, they're almost the same thing -- just do one or the other,
     * not both. With the new API, you'll need to do a checkpoint not a
     * sync, because the sync() method has no config parameter. Don't worry
     * -- it's fine to do a checkpoint even though you're not using.
     * @param checkpointDir Directory to write checkpoint to.
     * @throws DatabaseException
     * @throws IOException
     * @throws RuntimeException Thrown if failed setup of new bdb environment.
     */
    protected void checkpointBdb(File checkpointDir)
    throws DatabaseException, IOException, RuntimeException {
        EnvironmentConfig envConfig = this.bdbEnvironment.getConfig();
        // The three je background daemons toggled off for the duration.
        final List bkgrdThreads = Arrays.asList(new String []
            {"je.env.runCheckpointer", "je.env.runCleaner",
                "je.env.runINCompressor"});
        try {
            // Disable background threads
            setBdbjeBkgrdThreads(envConfig, bkgrdThreads, "false");
            // Do a force checkpoint. Thats what a sync does (i.e. doSync).
            CheckpointConfig chkptConfig = new CheckpointConfig();
            chkptConfig.setForce(true);

            // Mark Hayes of sleepycat says:
            // "The default for this property is false, which gives the current
            // behavior (allow deltas). If this property is true, deltas are
            // prohibited -- full versions of internal nodes are always logged
            // during the checkpoint. When a full version of an internal node
            // is logged during a checkpoint, recovery does not need to process
            // it at all. It is only fetched if needed by the application,
            // during normal DB operations after recovery. When a delta of an
            // internal node is logged during a checkpoint, recovery must
            // process it by fetching the full version of the node from earlier
            // in the log, and then applying the delta to it. This can be
            // pretty slow, since it is potentially a large amount of
            // random I/O."
            chkptConfig.setMinimizeRecoveryTime(true);
            this.bdbEnvironment.checkpoint(chkptConfig);
            LOGGER.fine("Finished bdb checkpoint.");

            // From the sleepycat folks: A trick for flipping db logs.
            EnvironmentImpl envImpl =
                DbInternal.envGetEnvironmentImpl(this.bdbEnvironment);
            long firstFileInNextSet =
                DbLsn.getFileNumber(envImpl.forceLogFileFlip());
            // So the last file in the checkpoint is firstFileInNextSet - 1.
            // Write manifest of all log files into the bdb directory.
            final String lastBdbCheckpointLog =
                getBdbLogFileName(firstFileInNextSet - 1);
            processBdbLogs(checkpointDir, lastBdbCheckpointLog);
            LOGGER.fine("Finished processing bdb log files.");
        } finally {
            // Restore background threads.
            setBdbjeBkgrdThreads(envConfig, bkgrdThreads, "true");
        }
    }
1307    
1308    protected void processBdbLogs(final File JavaDoc checkpointDir,
1309            final String JavaDoc lastBdbCheckpointLog) throws IOException JavaDoc {
1310        File JavaDoc bdbDir = CheckpointUtils.getBdbSubDirectory(checkpointDir);
1311        if (!bdbDir.exists()) {
1312            bdbDir.mkdir();
1313        }
1314        PrintWriter JavaDoc pw = new PrintWriter JavaDoc(new FileOutputStream JavaDoc(new File JavaDoc(
1315             checkpointDir, "bdbje-logs-manifest.txt")));
1316        try {
1317            // Don't copy any beyond the last bdb log file (bdbje can keep
1318
// writing logs after checkpoint).
1319
boolean pastLastLogFile = false;
1320            Set JavaDoc<String JavaDoc> srcFilenames = null;
1321            final boolean copyFiles = getCheckpointCopyBdbjeLogs();
1322            do {
1323                FilenameFilter JavaDoc filter = CheckpointUtils.getJeLogsFilter();
1324                srcFilenames =
1325                    new HashSet JavaDoc<String JavaDoc>(Arrays.asList(
1326                            getStateDisk().list(filter)));
1327                List JavaDoc tgtFilenames = Arrays.asList(bdbDir.list(filter));
1328                if (tgtFilenames != null && tgtFilenames.size() > 0) {
1329                    srcFilenames.removeAll(tgtFilenames);
1330                }
1331                if (srcFilenames.size() > 0) {
1332                    // Sort files.
1333
srcFilenames = new TreeSet JavaDoc<String JavaDoc>(srcFilenames);
1334                    int count = 0;
1335                    for (final Iterator JavaDoc i = srcFilenames.iterator();
1336                            i.hasNext() && !pastLastLogFile;) {
1337                        String JavaDoc name = (String JavaDoc) i.next();
1338                        if (copyFiles) {
1339                            FileUtils.copyFiles(new File JavaDoc(getStateDisk(), name),
1340                                new File JavaDoc(bdbDir, name));
1341                        }
1342                        pw.println(name);
1343                        if (name.equals(lastBdbCheckpointLog)) {
1344                            // We're done.
1345
pastLastLogFile = true;
1346                        }
1347                        count++;
1348                    }
1349                    if (LOGGER.isLoggable(Level.FINE)) {
1350                        LOGGER.fine("Copied " + count);
1351                    }
1352                }
1353            } while (!pastLastLogFile && srcFilenames != null &&
1354                srcFilenames.size() > 0);
1355        } finally {
1356            pw.close();
1357        }
1358    }
1359 
1360    protected String JavaDoc getBdbLogFileName(final long index) {
1361        String JavaDoc lastBdbLogFileHex = Long.toHexString(index);
1362        StringBuffer JavaDoc buffer = new StringBuffer JavaDoc();
1363        for (int i = 0; i < (8 - lastBdbLogFileHex.length()); i++) {
1364            buffer.append('0');
1365        }
1366        buffer.append(lastBdbLogFileHex);
1367        buffer.append(".jdb");
1368        return buffer.toString();
1369    }
1370    
1371    protected void setBdbjeBkgrdThreads(final EnvironmentConfig config,
1372            final List JavaDoc threads, final String JavaDoc setting) {
1373        for (final Iterator JavaDoc i = threads.iterator(); i.hasNext();) {
1374            config.setConfigParam((String JavaDoc)i.next(), setting);
1375        }
1376    }
1377    
1378    /**
1379     * Get recover checkpoint.
1380     * Returns null if we're NOT in recover mode.
1381     * Looks at ATTR_RECOVER_PATH and if its a directory, assumes checkpoint
1382     * recover. If checkpoint mode, returns Checkpoint instance if
1383     * checkpoint was VALID (else null).
1384     * @return Checkpoint instance if we're in recover checkpoint
1385     * mode and the pointed-to checkpoint was valid.
1386     * @see #isCheckpointRecover()
1387     */

1388    public synchronized Checkpoint getCheckpointRecover() {
1389        if (this.checkpointRecover != null) {
1390            return this.checkpointRecover;
1391        }
1392        return getCheckpointRecover(this.order);
1393    }
1394    
1395    public static Checkpoint getCheckpointRecover(final CrawlOrder order) {
1396        String JavaDoc path = (String JavaDoc)order.getUncheckedAttribute(null,
1397            CrawlOrder.ATTR_RECOVER_PATH);
1398        if (path == null || path.length() <= 0) {
1399            return null;
1400        }
1401        File JavaDoc rp = new File JavaDoc(path);
1402        // Assume if path is to a directory, its a checkpoint recover.
1403
Checkpoint result = null;
1404        if (rp.exists() && rp.isDirectory()) {
1405            Checkpoint cp = new Checkpoint(rp);
1406            if (cp.isValid()) {
1407                // if valid, set as result.
1408
result = cp;
1409            }
1410        }
1411        return result;
1412    }
1413    
1414    public static boolean isCheckpointRecover(final CrawlOrder order) {
1415        return getCheckpointRecover(order) != null;
1416    }
1417    
1418    /**
1419     * @return True if we're in checkpoint recover mode. Call
1420     * {@link #getCheckpointRecover()} to get at Checkpoint instance
1421     * that has info on checkpoint directory being recovered from.
1422     */

1423    public boolean isCheckpointRecover() {
1424        return this.checkpointRecover != null;
1425    }
1426
1427    /**
1428     * Operator requested for crawl to stop.
1429     */

1430    public synchronized void requestCrawlStop() {
1431        requestCrawlStop(CrawlJob.STATUS_ABORTED);
1432    }
1433    
1434    /**
1435     * Operator requested for crawl to stop.
1436     * @param message
1437     */

1438    public synchronized void requestCrawlStop(String JavaDoc message) {
1439        if (state == STOPPING || state == FINISHED) {
1440            return;
1441        }
1442        if (message == null) {
1443            throw new IllegalArgumentException JavaDoc("Message cannot be null.");
1444        }
1445        this.sExit = message;
1446        beginCrawlStop();
1447    }
1448
1449    /**
1450     * Start the process of stopping the crawl.
1451     */

1452    public void beginCrawlStop() {
1453        LOGGER.fine("Started.");
1454        sendCrawlStateChangeEvent(STOPPING, this.sExit);
1455        if (this.frontier != null) {
1456            this.frontier.terminate();
1457            this.frontier.unpause();
1458        }
1459        LOGGER.fine("Finished.");
1460    }
1461    
    /**
     * Stop the crawl temporarily (pause).
     * No-op if a pause is already underway or complete. Announces PAUSING;
     * the transition to PAUSED completes once all toe threads come to
     * rest (see toePaused()).
     */
    public synchronized void requestCrawlPause() {
        if (state == PAUSING || state == PAUSED) {
            // Already about to pause
            return;
        }
        sExit = CrawlJob.STATUS_WAITING_FOR_PAUSE;
        frontier.pause();
        sendCrawlStateChangeEvent(PAUSING, this.sExit);
        if (toePool.getActiveToeCount() == 0) {
            // if all threads already held, complete pause now
            // (no chance to trigger off later held thread)
            completePause();
        }
    }
1479
1480    /**
1481     * Tell if the controller is paused
1482     * @return true if paused
1483     */

1484    public boolean isPaused() {
1485        return state == PAUSED;
1486    }
1487    
1488    public boolean isPausing() {
1489        return state == PAUSING;
1490    }
1491    
1492    public boolean isRunning() {
1493        return state == RUNNING;
1494    }
1495
    /**
     * Resume crawl from paused state.
     * No-op unless currently pausing, paused, or mid-checkpoint. Restores
     * multi-thread mode and unpauses the frontier before announcing
     * RUNNING.
     */
    public synchronized void requestCrawlResume() {
        if (state != PAUSING && state != PAUSED && state != CHECKPOINTING) {
            // Can't resume if not been told to pause or if we're in middle of
            // a checkpoint.
            return;
        }
        multiThreadMode();
        frontier.unpause();
        LOGGER.fine("Crawl resumed.");
        sendCrawlStateChangeEvent(RUNNING, CrawlJob.STATUS_RUNNING);
    }
1510
1511    /**
1512     * @return Active toe thread count.
1513     */

1514    public int getActiveToeCount() {
1515        if (toePool == null) {
1516            return 0;
1517        }
1518        return toePool.getActiveToeCount();
1519    }
1520
    // Create the toe-thread pool, sized from the order's max-toes setting.
    private void setupToePool() {
        toePool = new ToePool(this);
        // TODO: make # of toes self-optimizing
        toePool.setSize(order.getMaxToes());
    }
1526
1527    /**
1528     * @return The order file instance.
1529     */

1530    public CrawlOrder getOrder() {
1531        return order;
1532    }
1533
1534    /**
1535     * @return The server cache instance.
1536     */

1537    public ServerCache getServerCache() {
1538        return serverCache;
1539    }
1540
    /**
     * Set the crawl order this controller uses.
     * @param o New crawl order.
     */
    public void setOrder(CrawlOrder o) {
        order = o;
    }
1547
1548
1549    /**
1550     * @return The frontier.
1551     */

1552    public Frontier getFrontier() {
1553        return frontier;
1554    }
1555
1556    /**
1557     * @return This crawl scope.
1558     */

1559    public CrawlScope getScope() {
1560        return scope;
1561    }
1562
1563    /** Get the list of processor chains.
1564     *
1565     * @return the list of processor chains.
1566     */

1567    public ProcessorChainList getProcessorChainList() {
1568        return processorChains;
1569    }
1570
1571    /** Get the first processor chain.
1572     *
1573     * @return the first processor chain.
1574     */

1575    public ProcessorChain getFirstProcessorChain() {
1576        return processorChains.getFirstChain();
1577    }
1578
1579    /** Get the postprocessor chain.
1580     *
1581     * @return the postprocessor chain.
1582     */

1583    public ProcessorChain getPostprocessorChain() {
1584        return processorChains.getLastChain();
1585    }
1586
1587    /**
1588     * Get the 'working' directory of the current crawl.
1589     * @return the 'working' directory of the current crawl.
1590     */

1591    public File JavaDoc getDisk() {
1592        return disk;
1593    }
1594
1595    /**
1596     * @return Scratch disk location.
1597     */

1598    public File JavaDoc getScratchDisk() {
1599        return scratchDisk;
1600    }
1601
1602    /**
1603     * @return State disk location.
1604     */

1605    public File JavaDoc getStateDisk() {
1606        return stateDisk;
1607    }
1608
1609    /**
1610     * @return The number of ToeThreads
1611     *
1612     * @see ToePool#getToeCount()
1613     */

1614    public int getToeCount() {
1615        return this.toePool == null? 0: this.toePool.getToeCount();
1616    }
1617
1618    /**
1619     * @return The ToePool
1620     */

1621    public ToePool getToePool() {
1622        return toePool;
1623    }
1624    
    /**
     * @return One-line summary report of the toe pool.
     */
    public String oneLineReportThreads() {
        // NOTE(review): assumes toePool is non-null; sibling accessors such
        // as getActiveToeCount() guard against null -- confirm callers.
        return toePool.singleLineReport();
    }
1632
    /**
     * While many settings update automatically when the SettingsHandler is
     * modified, some need to be explicitly pushed to components. This
     * includes the number of toe threads and the seeds.
     */
    public void kickUpdate() {
        // Resize the thread pool to the (possibly changed) max-toes setting.
        toePool.setSize(order.getMaxToes());
        
        this.scope.kickUpdate();
        this.frontier.kickUpdate();
        this.processorChains.kickUpdate();
        
        // TODO: continue to generalize this, so that any major
        // component can get a kick when it may need to refresh its data

        setThresholds();
    }
1650
1651    /**
1652     * @return The settings handler.
1653     */

1654    public SettingsHandler getSettingsHandler() {
1655        return settingsHandler;
1656    }
1657
1658    /**
1659     * This method iterates through processor chains to run processors' initial
1660     * tasks.
1661     *
1662     */

1663    private void runProcessorInitialTasks(){
1664        for (Iterator JavaDoc ic = processorChains.iterator(); ic.hasNext(); ) {
1665            for (Iterator JavaDoc ip = ((ProcessorChain) ic.next()).iterator();
1666                    ip.hasNext(); ) {
1667                ((Processor) ip.next()).initialTasks();
1668            }
1669        }
1670    }
1671
1672    /**
1673     * This method iterates through processor chains to run processors' final
1674     * tasks.
1675     *
1676     */

1677    private void runProcessorFinalTasks(){
1678        for (Iterator JavaDoc ic = processorChains.iterator(); ic.hasNext(); ) {
1679            for (Iterator JavaDoc ip = ((ProcessorChain) ic.next()).iterator();
1680                    ip.hasNext(); ) {
1681                ((Processor) ip.next()).finalTasks();
1682            }
1683        }
1684    }
1685
1686    /**
1687     * Kills a thread. For details see
1688     * {@link org.archive.crawler.framework.ToePool#killThread(int, boolean)
1689     * ToePool.killThread(int, boolean)}.
1690     * @param threadNumber Thread to kill.
1691     * @param replace Should thread be replaced.
1692     * @see org.archive.crawler.framework.ToePool#killThread(int, boolean)
1693     */

1694    public void killThread(int threadNumber, boolean replace){
1695        toePool.killThread(threadNumber, replace);
1696    }
1697
1698    /**
1699     * Add a file to the manifest of files used/generated by the current
1700     * crawl.
1701     *
1702     * TODO: Its possible for a file to be added twice if reports are
1703     * force generated midcrawl. Fix.
1704     *
1705     * @param file The filename (with absolute path) of the file to add
1706     * @param type The type of the file
1707     * @param bundle Should the file be included in a typical bundling of
1708     * crawler files.
1709     *
1710     * @see #MANIFEST_CONFIG_FILE
1711     * @see #MANIFEST_LOG_FILE
1712     * @see #MANIFEST_REPORT_FILE
1713     */

1714    public void addToManifest(String JavaDoc file, char type, boolean bundle) {
1715        manifest.append(type + (bundle? "+": "-") + " " + file + "\n");
1716    }
1717
1718    /**
1719     * Evaluate if the crawl should stop because it is finished.
1720     */

1721    public void checkFinish() {
1722        if(atFinish()) {
1723            beginCrawlStop();
1724        }
1725    }
1726
1727    /**
1728     * Evaluate if the crawl should stop because it is finished,
1729     * without actually stopping the crawl.
1730     *
1731     * @return true if crawl is at a finish-possible state
1732     */

1733    public boolean atFinish() {
1734        return state == RUNNING && !shouldContinueCrawling();
1735    }
1736    
    /**
     * Custom deserialization: restore the transient listener list and
     * clear single-thread mode so a deserialized controller starts clean.
     * @param stream Stream to read this object from.
     * @throws IOException On read failure.
     * @throws ClassNotFoundException On missing class.
     */
    private void readObject(ObjectInputStream stream)
    throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        // Setup status listeners
        this.registeredCrawlStatusListeners =
            Collections.synchronizedList(new ArrayList<CrawlStatusListener>());
        // Ensure no holdover singleThreadMode
        singleThreadMode = false;
    }
1746
    /**
     * Go to single thread mode, where only one ToeThread may
     * proceed at a time. Also acquires the single lock, so
     * no further threads will proceed past an
     * acquireContinuePermission. Caller must be sure to release
     * lock to allow other threads to proceed one at a time.
     */
    public void singleThreadMode() {
        this.singleThreadLock.lock();
        singleThreadMode = true;
    }
1758
    /**
     * Go back to regular multi thread mode, where all
     * ToeThreads may proceed at once.
     */
    public void multiThreadMode() {
        this.singleThreadLock.lock();
        singleThreadMode = false;
        // Fully release the reentrant lock: this thread may hold it more
        // than once, so loop until no longer the holder.
        while(this.singleThreadLock.isHeldByCurrentThread()) {
            this.singleThreadLock.unlock();
        }
    }
1770    
    /**
     * Proceed only if allowed, giving CrawlController a chance
     * to enforce single-thread mode. In single-thread mode the caller
     * blocks here until it can take the single-thread lock.
     */
    public void acquireContinuePermission() {
        if (singleThreadMode) {
            this.singleThreadLock.lock();
            if(!singleThreadMode) {
                // If changed while waiting, ignore: fully release the
                // reentrant lock (may be held multiple times).
                while(this.singleThreadLock.isHeldByCurrentThread()) {
                    this.singleThreadLock.unlock();
                }
            }
        } // else, permission is automatic
    }
1786
    /**
     * Relinquish continue permission at end of processing (allowing
     * another thread to proceed if in single-thread mode).
     */
    public void releaseContinuePermission() {
        if (singleThreadMode) {
            // Fully release the reentrant lock (may be held multiple times).
            while(this.singleThreadLock.isHeldByCurrentThread()) {
                this.singleThreadLock.unlock();
            }
        } // else do nothing;
    }
1798    
    /**
     * Release one chunk of the reserve memory (if any remains) and
     * suggest a garbage collection.
     * NOTE(review): reserveMemory appears to be headroom kept for
     * low-memory situations -- confirm against its initialization.
     */
    public void freeReserveMemory() {
        if(!reserveMemory.isEmpty()) {
            reserveMemory.removeLast();
            // Deliberate: encourage the JVM to reclaim the chunk now.
            System.gc();
        }
    }
1805
1806    /**
1807     * Note that a ToeThread reached paused condition, possibly
1808     * completing the crawl-pause.
1809     */

1810    public synchronized void toePaused() {
1811        releaseContinuePermission();
1812        if (state == PAUSING && toePool.getActiveToeCount() == 0) {
1813            completePause();
1814        }
1815    }
1816    
1817    /**
1818     * Note that a ToeThread ended, possibly completing the crawl-stop.
1819     */

1820    public synchronized void toeEnded() {
1821        if (state == STOPPING && toePool.getActiveToeCount() == 0) {
1822            completeStop();
1823        }
1824    }
1825
1826    /**
1827     * Add order file contents to manifest.
1828     * Write configuration files and any files managed by CrawlController to
1829     * it - files managed by other classes, excluding the settings framework,
1830     * are responsible for adding their files to the manifest themselves.
1831     * by calling addToManifest.
1832     * Call before writing out reports.
1833     */

1834    public void addOrderToManifest() {
1835        for (Iterator JavaDoc it = getSettingsHandler().getListOfAllFiles().iterator();
1836                it.hasNext();) {
1837            addToManifest((String JavaDoc)it.next(),
1838                CrawlController.MANIFEST_CONFIG_FILE, true);
1839        }
1840    }
1841    
1842    /**
1843     * Log a URIException from deep inside other components to the crawl's
1844     * shared log.
1845     *
1846     * @param e URIException encountered
1847     * @param u CrawlURI where problem occurred
1848     * @param l String which could not be interpreted as URI without exception
1849     */

1850    public void logUriError(URIException e, UURI u, CharSequence JavaDoc l) {
1851        if (e.getReasonCode() == UURIFactory.IGNORED_SCHEME) {
1852            // don't log those that are intentionally ignored
1853
return;
1854        }
1855        Object JavaDoc[] array = {u, l};
1856        uriErrors.log(Level.INFO, e.getMessage(), array);
1857    }
1858    
    //
    // Reporter
    //

    /** Name of the processors report. */
    public final static String PROCESSORS_REPORT = "processors";
    /** Name of the crawl-files manifest report. */
    public final static String MANIFEST_REPORT = "manifest";
    /** All report names this Reporter can produce. */
    protected final static String[] REPORTS = {PROCESSORS_REPORT, MANIFEST_REPORT};
1865    
1866    /* (non-Javadoc)
1867     * @see org.archive.util.Reporter#getReports()
1868     */

1869    public String JavaDoc[] getReports() {
1870        return REPORTS;
1871    }
1872
1873    /* (non-Javadoc)
1874     * @see org.archive.util.Reporter#reportTo(java.io.Writer)
1875     */

1876    public void reportTo(PrintWriter JavaDoc writer) {
1877        reportTo(null,writer);
1878    }
1879
    /**
     * @return A one-line summary report for this controller.
     */
    public String singleLineReport() {
        return ArchiveUtils.singleLineReport(this);
    }
1883
1884    public void reportTo(String JavaDoc name, PrintWriter JavaDoc writer) {
1885        if(PROCESSORS_REPORT.equals(name)) {
1886            reportProcessorsTo(writer);
1887            return;
1888        } else if (MANIFEST_REPORT.equals(name)) {
1889            reportManifestTo(writer);
1890            return;
1891        } else if (name!=null) {
1892            writer.println("requested report unknown: "+name);
1893        }
1894        singleLineReportTo(writer);
1895    }
1896
1897    /**
1898     * @param writer Where to write report to.
1899     */

1900    protected void reportManifestTo(PrintWriter JavaDoc writer) {
1901        writer.print(manifest.toString());
1902    }
1903
1904    /**
1905     * Compiles and returns a human readable report on the active processors.
1906     * @param writer Where to write to.
1907     * @see org.archive.crawler.framework.Processor#report()
1908     */

1909    protected void reportProcessorsTo(PrintWriter JavaDoc writer) {
1910        writer.print(
1911            "Processors report - "
1912                + ArchiveUtils.TIMESTAMP12.format(new Date JavaDoc())
1913                + "\n");
1914        writer.print(" Job being crawled: " + getOrder().getCrawlOrderName()
1915                + "\n");
1916
1917        writer.print(" Number of Processors: " +
1918            processorChains.processorCount() + "\n");
1919        writer.print(" NOTE: Some processors may not return a report!\n\n");
1920
1921        for (Iterator JavaDoc ic = processorChains.iterator(); ic.hasNext(); ) {
1922            for (Iterator JavaDoc ip = ((ProcessorChain) ic.next()).iterator();
1923                    ip.hasNext(); ) {
1924                writer.print(((Processor) ip.next()).report());
1925            }
1926        }
1927    }
1928
    /**
     * Write a terse, single-line identification of this controller.
     * @param writer Where to write.
     */
    public void singleLineReportTo(PrintWriter writer) {
        // TODO: improve to be summary of crawl state
        writer.write("[Crawl Controller]\n");
    }
1933
    /**
     * @return Legend describing the fields of the single-line report.
     */
    public String singleLineLegend() {
        // TODO improve
        return "nothingYet";
    }
1938    
    /**
     * Call this method to get instance of the crawler BigMap implementation.
     * A "BigMap" is a Map that knows how to manage ever-growing sets of
     * key/value pairs. If we're in a checkpoint recovery, this method will
     * manage reinstantiation of checkpointed bigmaps.
     * @param dbName Name to give any associated database. Also used
     * as part of name serializing out bigmap. Needs to be unique to a crawl.
     * @param keyClass Class of keys we'll be using.
     * @param valueClass Class of values we'll be using.
     * @return Map that knows how to carry large sets of key/value pairs or
     * if none available, returns instance of HashMap.
     * @throws Exception
     */
    public <K,V> Map<K,V> getBigMap(final String dbName,
            final Class<? super K> keyClass,
            final Class<? super V> valueClass)
    throws Exception {
        CachedBdbMap<K,V> result = new CachedBdbMap<K,V>(dbName);
        if (isCheckpointRecover()) {
            // In recovery: replace the fresh map with the one serialized
            // into the checkpoint directory under this dbName.
            File baseDir = getCheckpointRecover().getDirectory();
            @SuppressWarnings("unchecked")
            CachedBdbMap<K,V> temp = CheckpointUtils.
                readObjectFromFile(result.getClass(), dbName, baseDir);
            result = temp;
        }
        result.initialize(getBdbEnvironment(), keyClass, valueClass,
                getClassCatalog());
        // Save reference to all big maps made so can manage their
        // checkpointing.
        this.bigmaps.put(dbName, result);
        return result;
    }
1971    
1972    protected void checkpointBigMaps(final File JavaDoc cpDir)
1973    throws Exception JavaDoc {
1974        for (final Iterator JavaDoc i = this.bigmaps.keySet().iterator(); i.hasNext();) {
1975            Object JavaDoc key = i.next();
1976            Object JavaDoc obj = this.bigmaps.get(key);
1977            // TODO: I tried adding sync to custom serialization of BigMap
1978
// implementation but data member counts of the BigMap
1979
// implementation were not being persisted properly. Look at
1980
// why. For now, do sync in advance of serialization for now.
1981
((CachedBdbMap)obj).sync();
1982            CheckpointUtils.writeObjectToFile(obj, (String JavaDoc)key, cpDir);
1983        }
1984    }
1985
    /**
     * Called whenever a progress statistics logging event occurs.
     * @param e Progress statistics event.
     */
    public void progressStatisticsEvent(final EventObject e) {
        // Default is to do nothing. Subclass if you want to catch this event.
        // Later, if demand, add publisher/listener support. Currently hacked
        // in so the subclass in CrawlJob added to support JMX can send
        // notifications of progressStatistics change.
    }
1996    
1997    /**
1998     * Log to the progress statistics log.
1999     * @param msg Message to write the progress statistics log.
2000     */

2001    public void logProgressStatistics(final String JavaDoc msg) {
2002        this.progressStats.info(msg);
2003    }
2004
2005    /**
2006     * @return CrawlController state.
2007     */

2008    public Object JavaDoc getState() {
2009        return this.state;
2010    }
2011
2012    public File JavaDoc getCheckpointsDisk() {
2013        return this.checkpointsDisk;
2014    }
2015}
2016
Popular Tags