/* CrawlJobHandler
 *
 * $Id: CrawlJobHandler.java,v 1.99.6.1 2007/01/13 01:31:07 stack-sf Exp $
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.crawler.admin;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URI;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.Attribute;
import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.Heritrix;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.FrontierMarker;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InitializationException;
import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
import org.archive.crawler.frontier.FrontierJournal;
import org.archive.crawler.frontier.RecoveryJournal;
import org.archive.crawler.settings.ComplexType;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;

/**
 * This class manages CrawlJobs. Submitted crawl jobs are queued up and run
 * in order when the crawler is running.
 * <p>Basically this provides a layer between any potential user interface and
 * the CrawlJobs. It keeps the lists of completed jobs, pending jobs, etc.
 * <p>
 * The jobs managed by the handler can be divided into the following:
 * <ul>
 * <li> <code>Pending</code> - Jobs that are ready to run and are waiting their
 *                             turn. These can be edited, viewed, deleted etc.
 * <li> <code>Running</code> - Only one job can be running at a time. There may
 *                             be no job running. The running job can be viewed
 *                             and edited to some extent. It can also be
 *                             terminated. This job should have a
 *                             StatisticsTracking module attached to it for more
 *                             details on the crawl.
 * <li> <code>Completed</code> - Jobs that have finished crawling or have been
 *                             deleted from the pending queue or terminated
 *                             while running. They can not be edited but can be
 *                             viewed. They retain the StatisticsTracking
 *                             module from their run.
 * <li> <code>New job</code> - At any given time there can be one 'new job'; the
 *                             new job is not considered ready to run. It can
 *                             be edited or discarded (in which case it will be
 *                             totally destroyed, including any files on disk).
 *                             Once an operator deems the job ready to run it
 *                             can be moved to the pending queue.
 * <li> <code>Profiles</code> - Jobs under profiles are not actual jobs. They can
 *                             be edited normally but can not be submitted to
 *                             the pending queue. New jobs can be created
 *                             using a profile as its template.
 * </ul>
 *
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.admin.CrawlJob
 */

public class CrawlJobHandler implements CrawlStatusListener {
    private static final Logger logger =
        Logger.getLogger(CrawlJobHandler.class.getName());

    /**
     * Name of system property whose specification overrides default profile
     * used.
     */
    public static final String DEFAULT_PROFILE_NAME
        = "heritrix.default.profile";

    /**
     * Default profile name.
     */
    public static final String DEFAULT_PROFILE = "default";

    /**
     * Name of the profiles directory.
     */
    public static final String PROFILES_DIR_NAME = "profiles";

    public static final String ORDER_FILE_NAME = "order.xml";

    /**
     * Job currently being crawled.
     */
    private CrawlJob currentJob = null;

    /**
     * A new job that is being created/configured. Not yet ready for crawling.
     */
    private CrawlJob newJob = null;

    /**
     * Thread to start the next job in the background.
     */
    private Thread startingNextJob = null;

    /**
     * A list of pending CrawlJobs.
     */
    private TreeSet<CrawlJob> pendingCrawlJobs;

    /**
     * A list of completed CrawlJobs.
     */
    //private Vector completedCrawlJobs = new Vector();
    private TreeSet<CrawlJob> completedCrawlJobs;

    /**
     * A list of profile CrawlJobs.
     */
    private TreeSet<CrawlJob> profileJobs;

    // The UIDs of profiles should NOT be timestamps. A descriptive name is
    // ideal.
    private String defaultProfile = null;

    /**
     * If true the crawler is 'running'. That is, the next pending job will
     * start crawling as soon as the current job (if any) is completed.
     */
    private boolean running = false;

    /**
     * String to indicate recovery should be based on the recovery log, not
     * based on checkpointing.
     */
    public static final String RECOVER_LOG = "recover";

    /**
     * Jobs directory.
     */
    private final File jobsDir;

    /**
     * Constructor.
     * @param jobsDir Jobs directory.
     */
    public CrawlJobHandler(final File jobsDir) {
        this(jobsDir, true, true);
    }

    /**
     * Constructor allowing for optional loading of profiles and jobs.
     * @param jobsDir Jobs directory.
     * @param loadJobs If true then any applicable jobs will be loaded.
     * @param loadProfiles If true then any applicable profiles will be loaded.
     */
    public CrawlJobHandler(final File jobsDir,
            final boolean loadJobs, final boolean loadProfiles) {
        this.jobsDir = jobsDir;
        // Make a comparator for CrawlJobs.
        Comparator<CrawlJob> comp = new Comparator<CrawlJob>(){
            public int compare(CrawlJob job1, CrawlJob job2) {
                if( job1.getJobPriority() < job2.getJobPriority() ){
                    return -1;
                } else if( job1.getJobPriority() > job2.getJobPriority() ){
                    return 1;
                } else {
                    // Same priority, use UID (which should be a timestamp).
                    // Lower UID (string compare) means earlier time.
                    return job1.getUID().compareTo(job2.getUID());
                }
            }
        };
        this.pendingCrawlJobs = new TreeSet<CrawlJob>(comp);
        this.completedCrawlJobs = new TreeSet<CrawlJob>(comp);
        // Profiles always have the same priority, so they will be sorted
        // by name.
        this.profileJobs = new TreeSet<CrawlJob>(comp);
        if (loadProfiles){
            loadProfiles();
        }
        if (loadJobs){
            loadJobs();
        }
    }
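
    // Illustrative note (not part of the original source): the comparator
    // above orders pending jobs by numeric priority first (a lower value
    // sorts, and therefore starts, first) and breaks ties by UID string.
    // Since getNextJobUID() returns a timestamp, equal-priority jobs run in
    // creation order.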

    /**
     * Find the state.job file in the job directory.
     * @param jobDir Directory to look in.
     * @return Full path to 'state.job' file or null if none found.
     */
    protected File getStateJobFile(final File jobDir) {
        // Need to find job file ('state.job').
        File[] jobFiles = jobDir.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.toLowerCase().endsWith(".job") &&
                    (new File(dir, name)).canRead();
            }
        });
        return (jobFiles.length == 1)? jobFiles[0]: null;
    }

    /**
     * Loads any available jobs in the jobs directory.
     * <p>
     * Available jobs are any directory containing a file called
     * <code>state.job</code>. The file must contain valid job information.
     */
    private void loadJobs() {
        this.jobsDir.mkdirs();
        File[] jobs = this.jobsDir.listFiles();
        for (int i = 0; i < jobs.length; i++) {
            if (jobs[i].isDirectory()) {
                File jobFile = getStateJobFile(jobs[i]);
                if (jobFile != null) {
                    loadJob(jobFile);
                }
            }
        }
    }

    /**
     * Loads a job given a specific job file. The loaded job will be placed in
     * the list of completed jobs or pending queue depending on its status.
     * Running jobs will have their status set to 'finished abnormally' and put
     * into the completed list.
     * @param job The job file of the job to load.
     */
    protected void loadJob(final File job) {
        CrawlJob cjob = null;
        try {
            // Load the CrawlJob.
            cjob = new CrawlJob(job, new CrawlJobErrorHandler());
        } catch (InvalidJobFileException e) {
            logger.log(Level.INFO,
                    "Invalid job file for " + job.getAbsolutePath(), e);
            return;
        } catch (IOException e) {
            logger.log(Level.INFO, "IOException for " + job.getName() +
                    ", " + job.getAbsolutePath(), e);
            return;
        }

        // TODO: Move test into CrawlJob.
        // Check job status and place it accordingly.
        if (cjob.getStatus().equals(CrawlJob.STATUS_RUNNING)
                || cjob.getStatus().equals(CrawlJob.STATUS_PAUSED)
                || cjob.getStatus().equals(CrawlJob.STATUS_CHECKPOINTING)
                || cjob.getStatus().equals(CrawlJob.STATUS_WAITING_FOR_PAUSE) ){
            // Was a running job.
            cjob.setStatus(CrawlJob.STATUS_FINISHED_ABNORMAL);
            this.completedCrawlJobs.add(cjob);
        } else if( cjob.getStatus().equals(CrawlJob.STATUS_PENDING) ) {
            // Was a pending job.
            this.pendingCrawlJobs.add(cjob);
        } else if( cjob.getStatus().equals(CrawlJob.STATUS_CREATED)
                || cjob.getStatus().equals(CrawlJob.STATUS_DELETED) ) {
            // Ignore for now. TODO: Add to 'recycle bin'.
        } else {
            // Must have been completed.
            this.completedCrawlJobs.add(cjob);
        }
    }

    /**
     * Looks in conf dir for a profiles dir.
     * @return the directory where profiles are stored else null if none
     * available
     * @throws IOException
     */
    private File getProfilesDirectory() throws IOException {
        URL webappProfilePath = Heritrix.class.getResource("/" +
            PROFILES_DIR_NAME);
        if (webappProfilePath != null) {
            try {
                return new File(new URI(webappProfilePath.toString()));
            } catch (java.lang.IllegalArgumentException e) {
                // e.g. "profiles" within a jar file;
                // try Heritrix.getConfdir() in this case.
            } catch (java.net.URISyntaxException e) {
                e.printStackTrace();
            }
        }
        return (Heritrix.getConfdir(false) == null)? null:
            new File(Heritrix.getConfdir().getAbsolutePath(),
                PROFILES_DIR_NAME);
    }

    /**
     * Loads the default profile and all other profiles found on disk.
     */
    private void loadProfiles() {
        boolean loadedDefault = false;
        File profileDir = null;
        try {
            profileDir = getProfilesDirectory();
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (profileDir != null) {
            File[] ps = profileDir.listFiles();
            if (ps != null && ps.length > 0) {
                for (int i = 0; i < ps.length; i++) {
                    File f = ps[i];
                    if (f.isDirectory()) {
                        // Each directory in the profiles directory should
                        // contain the file order.xml.
                        File profile = new File(f, ORDER_FILE_NAME);
                        if (profile.canRead()) {
                            boolean b = loadProfile(profile);
                            if (b) {
                                loadedDefault = b;
                            }
                        }
                    }
                }
            }
        }
        // Now add in the default profile. It's on the CLASSPATH and needs
        // special handling. Don't add if already a default present.
        String parent = File.separator + PROFILES_DIR_NAME + File.separator;
        if (!loadedDefault) {
            loadProfile(new File(parent + DEFAULT_PROFILE, ORDER_FILE_NAME));
        }
        // Look to see if a default profile system property has been
        // supplied. If so, use it instead.
        // TODO: Try and read default profile from some permanent storage.
        defaultProfile = DEFAULT_PROFILE;
    }

    /**
     * Load one profile.
     * @param profile Profile to load.
     * @return True if loaded profile was the default profile.
     */
    protected boolean loadProfile(File profile) {
        boolean loadedDefault = false;
        // Ok, got the order file for this profile.
        try {
            // The directory name denotes the profile's UID and name.
            XMLSettingsHandler newSettingsHandler =
                new XMLSettingsHandler(profile);
            CrawlJobErrorHandler cjseh =
                new CrawlJobErrorHandler(Level.SEVERE);
            newSettingsHandler.
                setErrorReportingLevel(cjseh.getLevel());
            newSettingsHandler.initialize();
            addProfile(new CrawlJob(profile.getParentFile().getName(),
                newSettingsHandler, cjseh));
            loadedDefault = profile.getParentFile().getName().
                equals(DEFAULT_PROFILE);
        } catch (InvalidAttributeValueException e) {
            System.err.println("Failed to load profile '" +
                    profile.getParentFile().getName() +
                    "'. InvalidAttributeValueException.");
        }
        return loadedDefault;
    }

    /**
     * Add a new profile.
     * @param profile The new profile.
     */
    public synchronized void addProfile(CrawlJob profile){
        profileJobs.add(profile);
    }

    public synchronized void deleteProfile(CrawlJob cj) throws IOException {
        File d = getProfilesDirectory();
        File p = new File(d, cj.getJobName());
        if (!p.exists()) {
            throw new IOException("No profile named " + cj.getJobName() +
                " at " + d.getAbsolutePath());
        }
        FileUtils.deleteDir(p);
        this.profileJobs.remove(cj);
    }

    /**
     * Returns a List of all known profiles.
     * @return a List of all known profiles.
     */
    public synchronized List<CrawlJob> getProfiles(){
        ArrayList<CrawlJob> tmp = new ArrayList<CrawlJob>(profileJobs.size());
        tmp.addAll(profileJobs);
        return tmp;
    }

    /**
     * Submit a job to the handler. Job will be scheduled for crawling. At
     * present it will not take the job's priority into consideration.
     *
     * @param job A new job for the handler
     * @return CrawlJob that was added or null.
     */
    public CrawlJob addJob(CrawlJob job) {
        if(job.isProfile()){
            return null; // Can't crawl profiles.
        }
        job.setStatus(CrawlJob.STATUS_PENDING);
        if(job.isNew()){
            // We're adding the new job to the pending queue.
            this.newJob = null;
            job.setNew(false);
        }
        this.pendingCrawlJobs.add(job);
        if(isCrawling() == false && isRunning()) {
            // Start crawling.
            startNextJob();
        }
        return job;
    }
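
    // Scheduling sketch (illustrative, not part of the original source;
    // assumes 'handler' and a configured 'job' exist):
    //
    //   handler.startCrawler();      // accept jobs for crawling
    //   handler.addJob(job);         // job goes PENDING; starts if idle
    //   handler.getPendingJobs();    // empty once the job has been started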

    /**
     * Returns the default profile. If no default profile has been set it will
     * return the first profile that was set/loaded and still exists. If no
     * profiles exist it will return null.
     * @return the default profile.
     */
    public synchronized CrawlJob getDefaultProfile() {
        if(defaultProfile != null){
            for(Iterator it = profileJobs.iterator(); it.hasNext();) {
                CrawlJob item = (CrawlJob)it.next();
                if(item.getJobName().equals(defaultProfile)){
                    // Found it.
                    return item;
                }
            }
        }
        if(profileJobs.size() > 0){
            return (CrawlJob)profileJobs.first();
        }
        return null;
    }

    /**
     * Set the default profile.
     * @param profile The new default profile. The following must apply to it:
     * profile.isProfile() should return true and
     * this.getProfiles() should contain it.
     */
    public void setDefaultProfile(CrawlJob profile) {
        defaultProfile = profile.getJobName();
        // TODO: Make changes to default profile durable across restarts.
    }

    /**
     * A List of all pending jobs.
     *
     * @return A List of all pending jobs.
     * No promises are made about the order of the list.
     */
    public List<CrawlJob> getPendingJobs() {
        ArrayList<CrawlJob> tmp
         = new ArrayList<CrawlJob>(pendingCrawlJobs.size());
        tmp.addAll(pendingCrawlJobs);
        return tmp;
    }

    /**
     * @return The job currently being crawled.
     */
    public CrawlJob getCurrentJob() {
        return currentJob;
    }

    /**
     * @return A List of all finished jobs.
     */
    public List<CrawlJob> getCompletedJobs() {
        ArrayList<CrawlJob> tmp
         = new ArrayList<CrawlJob>(completedCrawlJobs.size());
        tmp.addAll(completedCrawlJobs);
        return tmp;
    }

    /**
     * Return a job with the given UID.
     * Doesn't matter if it's pending, currently running, has finished
     * running, is new, or is a profile.
     *
     * @param jobUID The unique ID of the job.
     * @return The job with the UID or null if no such job is found
     */
    public CrawlJob getJob(String jobUID) {
        if (jobUID == null){
            return null; // UID can't be null.
        }
        // First check currently running job.
        if (currentJob != null && currentJob.getUID().equals(jobUID)) {
            return currentJob;
        } else if (newJob != null && newJob.getUID().equals(jobUID)) {
            // Then check the 'new job'.
            return newJob;
        } else {
            // Then check pending jobs.
            Iterator itPend = pendingCrawlJobs.iterator();
            while (itPend.hasNext()) {
                CrawlJob cj = (CrawlJob) itPend.next();
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }

            // Next check completed jobs.
            Iterator itComp = completedCrawlJobs.iterator();
            while (itComp.hasNext()) {
                CrawlJob cj = (CrawlJob) itComp.next();
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }

            // And finally check the profiles.
            for (Iterator i = getProfiles().iterator(); i.hasNext();) {
                CrawlJob cj = (CrawlJob) i.next();
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }
        }
        return null; // Nothing found, return null.
    }

    /**
     * @return True if we terminated a current job (False if no job to
     * terminate)
     */
    public boolean terminateCurrentJob() {
        if (this.currentJob == null) {
            return false;
        }
        // requestCrawlStop will cause crawlEnding to be invoked.
        // It will handle the clean up.
        this.currentJob.stopCrawling();
        synchronized (this) {
            try {
                // Take a few moments so that the controller can change
                // states before the UI updates. The CrawlEnding event
                // will wake us if it occurs sooner than this.
                wait(3000);
            } catch (InterruptedException e) {
                // Ignore.
            }
        }
        return true;
    }

    /**
     * The specified job will be removed from the pending queue or aborted if
     * currently running. It will be placed in the list of completed jobs with
     * appropriate status info. If the job is already in the completed list or
     * no job with the given UID is found, no action will be taken.
     *
     * @param jobUID The UID (unique ID) of the job that is to be deleted.
     */
    public void deleteJob(String jobUID) {
        // First check to see if we are deleting the current job.
        if (currentJob != null && jobUID.equals(currentJob.getUID())) {
            terminateCurrentJob();
            return; // We're not going to find another job with the same UID.
        }

        // Ok, it isn't the current job, let's check the pending jobs.
        for(Iterator it = pendingCrawlJobs.iterator(); it.hasNext();) {
            CrawlJob cj = (CrawlJob) it.next();
            if (cj.getUID().equals(jobUID)) {
                // Found the one to delete.
                cj.setStatus(CrawlJob.STATUS_DELETED);
                it.remove();
                return; // We're not going to find another job with the same UID.
            }
        }

        // And finally the completed jobs.
        for (Iterator it = completedCrawlJobs.iterator(); it.hasNext();) {
            CrawlJob cj = (CrawlJob) it.next();
            if (cj.getUID().equals(jobUID)) {
                // Found the one to delete.
                cj.setStatus(CrawlJob.STATUS_DELETED);
                it.remove();
                return; // No other job will have the same UID.
            }
        }
    }

    /**
     * Cause the current job to pause. If no current job is crawling this
     * method will have no effect.
     */
    public void pauseJob() {
        if (this.currentJob != null) {
            this.currentJob.pause();
        }
    }

    /**
     * Cause the current job to resume crawling if it was paused. Will have no
     * effect if the current job was not paused or if there is no current job.
     * If the current job is still waiting to pause, this will not take effect
     * until the job has actually paused. At which time it will immediately
     * resume crawling.
     */
    public void resumeJob() {
        if (this.currentJob != null) {
            this.currentJob.resume();
        }
    }

    /**
     * Cause the current job to write a checkpoint to disk. Currently
     * requires job to already be paused.
     * @throws IllegalStateException Thrown if crawl is not paused.
     */
    public void checkpointJob() throws IllegalStateException {
        if (this.currentJob != null) {
            this.currentJob.checkpoint();
        }
    }

    /**
     * Returns a unique job ID.
     * <p>
     * No two calls to this method (on the same instance of this class) can
     * ever return the same value. <br>
     * Currently implemented to return a time stamp. That is subject to change
     * though.
     *
     * @return A unique job ID.
     *
     * @see ArchiveUtils#TIMESTAMP17
     */
    public String getNextJobUID() {
        return ArchiveUtils.TIMESTAMP17.format(new Date());
    }
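
    // Illustrative note (not part of the original source): TIMESTAMP17 is a
    // 17-character timestamp format (year down to milliseconds), so a UID
    // looks something like "20070113013107123". Such UIDs sort
    // lexicographically by creation time, which is what the job comparator
    // relies on to break priority ties.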

    /**
     * Creates a new job. The new job will be returned and also registered as
     * the handler's 'new job'. The new job will be based on the settings
     * provided but created in a new location on disk.
     *
     * @param baseOn
     * A CrawlJob (with a valid settingshandler) to use as the
     * template for the new job.
     * @param recovery Whether to preinitialize new job as recovery of
     * <code>baseOn</code> job. String holds RECOVER_LOG if we are to
     * do the recovery based off the recover.gz log -- See RecoveryJournal in
     * the frontier package -- or it holds the name of
     * the checkpoint we're to use recovering.
     * @param name
     * The name of the new job.
     * @param description
     * Description of the job.
     * @param seeds
     * The contents of the new settings' seed file.
     * @param priority
     * The priority of the new job.
     *
     * @return The new crawl job.
     * @throws FatalConfigurationException If a problem occurs creating the
     * settings.
     */
    public CrawlJob newJob(CrawlJob baseOn, String recovery, String name,
            String description, String seeds, int priority)
    throws FatalConfigurationException {
        // See what the recover story is.
        File recover = null;
        try {
            if (recovery != null && recovery.length() > 0
                    && recovery.equals(RECOVER_LOG)) {
                // Then we're to do a recovery based off the RecoveryJournal
                // recover.gz log.
                File dir = baseOn.getSettingsHandler().getOrder()
                    .getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
                // Add name of recover file. We're hardcoding it as
                // 'recover.gz'.
                recover = new File(dir, FrontierJournal.LOGNAME_RECOVER +
                    RecoveryJournal.GZIP_SUFFIX);
            } else if (recovery != null && recovery.length() > 0) {
                // Must be name of a checkpoint to use.
                recover = new File(baseOn.getSettingsHandler().
                    getOrder().getSettingsDir(CrawlOrder.ATTR_CHECKPOINTS_PATH),
                        recovery);
            }
        } catch (AttributeNotFoundException e1) {
            throw new FatalConfigurationException(
                "AttributeNotFoundException occurred while setting up" +
                    " new job/profile " + name + " \n" + e1.getMessage());
        }

        CrawlJob cj = createNewJob(baseOn.getSettingsHandler().getOrderFile(),
            name, description, seeds, priority);

        updateRecoveryPaths(recover, cj.getSettingsHandler(), name);

        return cj;
    }
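
    // Usage sketch (illustrative, not part of the original source; the names
    // and seed URL below are made up):
    //
    //   CrawlJob job = handler.newJob(handler.getDefaultProfile(),
    //       null,                                  // no recovery
    //       "example-crawl", "a demo crawl", "http://example.com/\n",
    //       CrawlJob.PRIORITY_AVERAGE);
    //   handler.addJob(job);   // promote the 'new job' to the pending queue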

    /**
     * Creates a new job. The new job will be returned and also registered as
     * the handler's 'new job'. The new job will be based on the settings
     * provided but created in a new location on disk.
     * @param orderFile Order file to use as the template for the new job.
     * @param name The name of the new job.
     * @param description Description of the job.
     * @param seeds The contents of the new settings' seed file.
     *
     * @return The new crawl job.
     * @throws FatalConfigurationException If a problem occurs creating the
     * settings.
     */
    public CrawlJob newJob(final File orderFile, final String name,
        final String description, final String seeds)
    throws FatalConfigurationException {
        return createNewJob(orderFile, name, description, seeds,
            CrawlJob.PRIORITY_AVERAGE);
    }

    protected void checkDirectory(File dir)
    throws FatalConfigurationException {
        if (dir == null) {
            return;
        }
        if (!dir.exists() || !dir.canRead()) {
            throw new FatalConfigurationException(dir.getAbsolutePath() +
                " does not exist or is unreadable");
        }
    }

    protected CrawlJob createNewJob(final File orderFile, final String name,
            final String description, final String seeds, final int priority)
    throws FatalConfigurationException {
        if (newJob != null) {
            // There already is a new job. Discard it.
            discardNewJob();
        }
        String UID = getNextJobUID();
        File jobDir;
        jobDir = new File(this.jobsDir, name + "-" + UID);
        CrawlJobErrorHandler errorHandler = new CrawlJobErrorHandler();
        XMLSettingsHandler handler =
            createSettingsHandler(orderFile, name, description,
                seeds, jobDir, errorHandler, "order.xml", "seeds.txt");
        this.newJob = new CrawlJob(UID, name, handler, errorHandler, priority,
                jobDir);
        return this.newJob;
    }

    /**
     * Creates a new profile. The new profile will be returned and also
     * registered as the handler's 'new job'. The new profile will be based on
     * the settings provided but created in a new location on disk.
     *
     * @param baseOn
     * A CrawlJob (with a valid settingshandler) to use as the
     * template for the new profile.
     * @param name
     * The name of the new profile.
     * @param description
     * Description of the new profile
     * @param seeds
     * The contents of the new profile's seed file
     * @return The new profile.
     * @throws FatalConfigurationException
     * @throws IOException
     */
    public CrawlJob newProfile(CrawlJob baseOn, String name, String description,
            String seeds)
    throws FatalConfigurationException, IOException {
        File profileDir = new File(getProfilesDirectory().getAbsoluteFile()
            + File.separator + name);
        CrawlJobErrorHandler cjseh = new CrawlJobErrorHandler(Level.SEVERE);
        CrawlJob newProfile = new CrawlJob(name,
            createSettingsHandler(baseOn.getSettingsHandler().getOrderFile(),
                name, description, seeds, profileDir, cjseh, "order.xml",
                "seeds.txt"), cjseh);
        addProfile(newProfile);
        return newProfile;
    }

    /**
     * Creates a new settings handler based on an existing job. Basically all
     * the settings files for the 'base on' job will be copied to the specified
     * directory.
     *
     * @param orderFile Order file to base new order file on. Cannot be null.
     * @param name Name for the new settings
     * @param description Description of the new settings.
     * @param seeds The contents of the new settings' seed file.
     * @param newSettingsDir
     * @param errorHandler
     * @param filename Name of new order file.
     * @param seedfile Name of new seeds file.
     *
     * @return The new settings handler.
     * @throws FatalConfigurationException
     * If there are problems with reading the 'base on'
     * configuration, with writing the new configuration or its
     * seed file.
     */
    protected XMLSettingsHandler createSettingsHandler(
        final File orderFile, final String name, final String description,
        final String seeds, final File newSettingsDir,
        final CrawlJobErrorHandler errorHandler,
        final String filename, final String seedfile)
    throws FatalConfigurationException {
        XMLSettingsHandler newHandler = null;
        try {
            newHandler = new XMLSettingsHandler(orderFile);
            if(errorHandler != null){
                newHandler.registerValueErrorHandler(errorHandler);
            }
            newHandler.setErrorReportingLevel(errorHandler.getLevel());
            newHandler.initialize();
        } catch (InvalidAttributeValueException e2) {
            throw new FatalConfigurationException(
                "InvalidAttributeValueException occurred while creating" +
                " new settings handler for new job/profile\n" +
                e2.getMessage());
        }

        // Make sure the directory exists.
        newSettingsDir.mkdirs();

        try {
            // Set the seed file.
            ((ComplexType)newHandler.getOrder().getAttribute("scope"))
                .setAttribute(new Attribute("seedsfile", seedfile));
        } catch (AttributeNotFoundException e1) {
            throw new FatalConfigurationException(
                    "AttributeNotFoundException occurred while setting up" +
                    " new job/profile\n" + e1.getMessage());
        } catch (InvalidAttributeValueException e1) {
            throw new FatalConfigurationException(
                    "InvalidAttributeValueException occurred while setting" +
                    " up new job/profile\n" + e1.getMessage());
        } catch (MBeanException e1) {
            throw new FatalConfigurationException(
                    "MBeanException occurred while setting up new" +
                    " job/profile\n" + e1.getMessage());
        } catch (ReflectionException e1) {
            throw new FatalConfigurationException(
                    "ReflectionException occurred while setting up" +
                    " new job/profile\n" + e1.getMessage());
        }

        File newFile = new File(newSettingsDir.getAbsolutePath(), filename);

        try {
            newHandler.copySettings(newFile, (String)newHandler.getOrder()
                .getAttribute(CrawlOrder.ATTR_SETTINGS_DIRECTORY));
        } catch (IOException e3) {
            // Print stack trace to help debug issue where cannot create
            // new job from an old one that has overrides.
            e3.printStackTrace();
            throw new FatalConfigurationException(
                    "IOException occurred while writing new settings files" +
                    " for new job/profile\n" + e3.getMessage());
        } catch (AttributeNotFoundException e) {
            throw new FatalConfigurationException(
                    "AttributeNotFoundException occurred while writing new" +
                    " settings files for new job/profile\n" + e.getMessage());
        } catch (MBeanException e) {
            throw new FatalConfigurationException(
                    "MBeanException occurred while writing new settings files" +
                    " for new job/profile\n" + e.getMessage());
        } catch (ReflectionException e) {
            throw new FatalConfigurationException(
                    "ReflectionException occurred while writing new settings" +
                    " files for new job/profile\n" + e.getMessage());
        }
        CrawlerSettings orderfile = newHandler.getSettingsObject(null);

        orderfile.setName(name);
        orderfile.setDescription(description);

        if (seeds != null) {
            BufferedWriter writer = null;
            try {
                writer = new BufferedWriter(new FileWriter(newHandler
                    .getPathRelativeToWorkingDirectory(seedfile)));
                try {
                    writer.write(seeds);
                } finally {
                    writer.close();
                }
            } catch (IOException e) {
                throw new FatalConfigurationException(
                    "IOException occurred while writing seed file for new"
                        + " job/profile\n" + e.getMessage());
            }
        }
        return newHandler;
    }

    /**
     * @param recover
     * Source to use recovering. Can be full path to a recovery log
     * or full path to a checkpoint src dir.
     * @param sh
     * Settings Handler to update.
     * @param jobName
     * Name of this job.
     * @throws FatalConfigurationException
     */
    protected void updateRecoveryPaths(final File recover,
            final SettingsHandler sh, final String jobName)
    throws FatalConfigurationException {
        if (recover == null) {
            return;
        }
        checkDirectory(recover);
        try {
            // Set 'recover-path' to be old job's recovery log path.
            updateRecoveryPaths(recover, sh);
        } catch (AttributeNotFoundException e1) {
            throw new FatalConfigurationException(
                    "AttributeNotFoundException occurred while setting up"
                            + " new job/profile " + jobName + " \n"
                            + e1.getMessage());
        } catch (InvalidAttributeValueException e1) {
            throw new FatalConfigurationException(
                    "InvalidAttributeValueException occurred while setting up"
                            + " new job/profile " + jobName + " \n"
                            + e1.getMessage());
        } catch (MBeanException e1) {
            throw new FatalConfigurationException(
                    "MBeanException occurred while setting up"
                            + " new job/profile " + jobName + " \n"
                            + e1.getMessage());
        } catch (ReflectionException e1) {
            throw new FatalConfigurationException(
                    "ReflectionException occurred while setting up"
                            + " new job/profile " + jobName + " \n"
                            + e1.getMessage());
        } catch (IOException e) {
            throw new FatalConfigurationException(
                    "IOException occurred while setting up new job/profile "
                            + jobName + " \n" + e.getMessage());
        }
    }

    /**
     * @param recover
     * Source to use recovering. Can be full path to a recovery log
     * or full path to a checkpoint src dir.
     * @param newHandler
     * @throws ReflectionException
     * @throws MBeanException
     * @throws InvalidAttributeValueException
     * @throws AttributeNotFoundException
     * @throws IOException
     */
    private void updateRecoveryPaths(final File recover,
        SettingsHandler newHandler)
    throws AttributeNotFoundException, InvalidAttributeValueException,
    MBeanException, ReflectionException, IOException {
        if (recover == null || !recover.exists()) {
            throw new IOException("Recovery src does not exist: " + recover);
        }
        newHandler.getOrder().setAttribute(
            new Attribute(CrawlOrder.ATTR_RECOVER_PATH,
                recover.getAbsolutePath()));

        // Now, ensure that 'logs' and 'state' don't overlap with
        // previous job's files (ok for 'arcs' and 'scratch' to overlap).
        File newLogsDisk = null;
        final String RECOVERY_SUFFIX = "-R";
        while(true) {
            try {
                newLogsDisk = newHandler.getOrder().
                    getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
            } catch (AttributeNotFoundException e) {
                logger.log(Level.SEVERE, "Failed to get logs directory", e);
            }
            if (newLogsDisk.list().length > 0) {
                // 'new' directory is nonempty; rename with trailing '-R'.
                String logsPath = (String) newHandler.getOrder().
                    getAttribute(CrawlOrder.ATTR_LOGS_PATH);
                if(logsPath.endsWith("/")) {
                    logsPath = logsPath.substring(0,logsPath.length()-1);
                }
                newHandler.getOrder().setAttribute(
                    new Attribute(CrawlOrder.ATTR_LOGS_PATH,
                        logsPath + RECOVERY_SUFFIX));
            } else {
                // Directory is suitably empty; exit loop.
                break;
            }
        }
        File newStateDisk = null;
        while (true) {
            try {
                newStateDisk = newHandler.getOrder().getSettingsDir(
                        CrawlOrder.ATTR_STATE_PATH);
            } catch (AttributeNotFoundException e) {
                logger.log(Level.SEVERE, "Failed to get state directory", e);
            }
            if (newStateDisk.list().length>0) {
                // 'new' directory is nonempty; rename with trailing '-R'.
                String statePath = (String) newHandler.getOrder().
                    getAttribute(CrawlOrder.ATTR_STATE_PATH);
                if(statePath.endsWith("/")) {
                    statePath = statePath.substring(0,statePath.length()-1);
                }
                newHandler.getOrder().setAttribute(
                    new Attribute(CrawlOrder.ATTR_STATE_PATH,
                        statePath + RECOVERY_SUFFIX));
            } else {
                // Directory is suitably empty; exit loop.
                break;
            }
        }
    }
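
    // Illustrative note (not part of the original source): each loop above
    // keeps appending "-R" until it reaches an empty directory, so a job
    // recovered once writes to e.g. "logs-R", and recovering that job again
    // would yield "logs-R-R".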

    /**
     * Discard the handler's 'new job'. This will remove any files/directories
     * written to disk.
     */
    public void discardNewJob(){
        FileUtils.deleteDir(new File(newJob.getSettingsDirectory()));
    }

    /**
     * Get the handler's 'new job'.
     * @return the handler's 'new job'
     */
    public CrawlJob getNewJob(){
        return newJob;
    }

    /**
     * Is the crawler accepting crawl jobs to run?
     * @return True if the next available CrawlJob will be crawled. False otherwise.
     */
    public boolean isRunning() {
        return running;
    }

    /**
     * Is a crawl job being crawled?
     * @return True if a job is actually being crawled (even if it is paused).
     * False if no job is being crawled.
     */
    public boolean isCrawling() {
        return this.currentJob != null;
    }

    /**
     * Allow jobs to be crawled.
     */
    public void startCrawler() {
        running = true;
        if (pendingCrawlJobs.size() > 0 && isCrawling() == false) {
            // Ok, can just start the next job.
            startNextJob();
        }
    }

    /**
     * Stop future jobs from being crawled.
     *
     * This action will not affect the current job.
     */
    public void stopCrawler() {
        running = false;
    }

    /**
     * Start next crawl job.
     *
     * If a job is already running this method will do nothing.
     */
    protected final void startNextJob() {
        synchronized (this) {
            if(startingNextJob != null) {
                try {
                    startingNextJob.join();
                } catch (InterruptedException e) {
                    e.printStackTrace();
                    return;
                }
            }
            startingNextJob = new Thread(new Runnable() {
                public void run() {
                    startNextJobInternal();
                }
            }, "StartNextJob");
            startingNextJob.start();
        }
    }

    protected void startNextJobInternal() {
        if (pendingCrawlJobs.size() == 0 || isCrawling()) {
            // No job ready or already crawling.
            return;
        }
        this.currentJob = (CrawlJob)pendingCrawlJobs.first();
        assert pendingCrawlJobs.contains(currentJob) :
            "pendingCrawlJobs is in an illegal state";
        pendingCrawlJobs.remove(currentJob);
        try {
            this.currentJob.setupForCrawlStart();
            // This is ugly but needed so I can clear the currentJob
            // reference in crawlEnding and update the list of completed
            // jobs. Also, crawlEnded can start up the next job.
            this.currentJob.getController().addCrawlStatusListener(this);
            // Now, actually start.
            this.currentJob.getController().requestCrawlStart();
        } catch (InitializationException e) {
            loadJob(getStateJobFile(this.currentJob.getDirectory()));
            this.currentJob = null;
            startNextJobInternal(); // Load the next job if there is one.
        }
    }

    /**
     * Forward a 'kick' update to current job if any.
     */
    public void kickUpdate() {
        if(this.currentJob != null) {
            this.currentJob.kickUpdate();
        }
    }

    /**
     * Loads options from a file. Typically these are a list of available
     * modules that can be plugged into some part of the configuration.
     * For example Processors, Frontiers, Filters etc. Leading and trailing
     * spaces are trimmed from each line.
     *
     * <p>Options are loaded from the CLASSPATH.
     * @param file the name of the option file (without path!)
     * @return The option file with each option line as a separate entry in the
     * ArrayList.
     * @throws IOException when there is trouble reading the file.
     */
    public static ArrayList<String> loadOptions(String file)
    throws IOException {
        ArrayList<String> ret = new ArrayList<String>();
        Enumeration resources =
            CrawlJob.class.getClassLoader().getResources("modules/" + file);

        boolean noFileFound = true;
        while (resources.hasMoreElements()) {
            InputStream is = ((URL) resources.nextElement()).openStream();
            noFileFound = false;

            String line = null;
            BufferedReader bf =
                new BufferedReader(new InputStreamReader(is), 8192);
            try {
                while ((line = bf.readLine()) != null) {
                    line = line.trim();
                    if(line.indexOf('#')<0 && line.length()>0){
                        // Looks like a valid line.
                        ret.add(line);
                    }
                }
            } finally {
                bf.close();
            }
        }

        if (noFileFound) {
            throw new IOException("Failed to get " + file +
                " from the CLASSPATH");
        }

        return ret;
    }
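
    // Usage sketch (illustrative, not part of the original source; the
    // option file name below is hypothetical):
    //
    //   ArrayList<String> names = CrawlJobHandler.loadOptions("Frontier.options");
    //   for (String name : names) {
    //       System.out.println(name);   // one module class name per line
    //   }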

    /**
     * Returns a URIFrontierMarker for the current, paused, job. If there is no
     * current job or it is not paused null will be returned.
     *
     * @param regexpr
     * A regular expression that each URI must match in order to be
     * considered 'within' the marker.
     * @param inCacheOnly
     * Limit marker scope to 'cached' URIs.
     * @return a URIFrontierMarker for the current job.
     * @see #getPendingURIsList(FrontierMarker, int, boolean)
     * @see org.archive.crawler.framework.Frontier#getInitialMarker(String,
     * boolean)
     * @see org.archive.crawler.framework.FrontierMarker
     */
    public FrontierMarker getInitialMarker(String regexpr,
            boolean inCacheOnly) {
        return (this.currentJob != null)?
                this.currentJob.getInitialMarker(regexpr, inCacheOnly): null;
    }

    /**
     * Returns the frontier's URI list based on the provided marker. This
     * method will return null if there is no current job or if the current
     * job is not paused. Only when there is a paused current job will this
     * method return a URI list.
     *
     * @param marker
     * URIFrontier marker
     * @param numberOfMatches
     * maximum number of matches to return
     * @param verbose
     * should detailed info be provided on each URI?
     * @return the frontier's URI list based on the provided marker
     * @throws InvalidFrontierMarkerException
     * When marker is inconsistent with the current state of the
     * frontier.
     * @see #getInitialMarker(String, boolean)
     * @see org.archive.crawler.framework.FrontierMarker
     */
    public ArrayList getPendingURIsList(FrontierMarker marker,
            int numberOfMatches, boolean verbose)
    throws InvalidFrontierMarkerException {
        return (this.currentJob != null)?
           this.currentJob.getPendingURIsList(marker, numberOfMatches, verbose):
           null;
    }

    /**
     * Delete any URI from the frontier of the current (paused) job that matches
     * the specified regular expression. If the current job is not paused (or
     * there is no current job) nothing will be done.
     * @param regexpr Regular expression to delete URIs by.
     * @return the number of URIs deleted
     */
    public long deleteURIsFromPending(String regexpr) {
        return (this.currentJob != null)?
                this.currentJob.deleteURIsFromPending(regexpr): 0;
    }

    public String importUris(String file, String style, String force) {
        return importUris(file, style, "true".equals(force));
    }

    /**
     * @param fileOrUrl Name of file w/ seeds.
     * @param style What style of seeds -- crawl log (<code>crawlLog</code>
     * style) or recovery journal (<code>recoveryJournal</code> style), or
     * seeds file style (Pass <code>default</code> style).
     * @param forceRevisit Should we revisit even if seen before?
     * @return A display string that has a count of all added.
     */
    public String importUris(final String fileOrUrl, final String style,
            final boolean forceRevisit) {
        return (this.currentJob != null)?
            this.currentJob.importUris(fileOrUrl, style, forceRevisit): null;
    }

    protected int importUris(InputStream is, String style,
            boolean forceRevisit) {
        return (this.currentJob != null)?
                this.currentJob.importUris(is, style, forceRevisit): 0;
    }

    /**
     * Schedule a URI.
     * @param uri URI to schedule.
     * @param forceFetch Should it be force-fetched.
     * @param isSeed True if seed.
     * @throws URIException
     */
    public void importUri(final String uri, final boolean forceFetch,
            final boolean isSeed)
    throws URIException {
        importUri(uri, forceFetch, isSeed, true);
    }

    /**
     * Schedule a URI.
     * @param str String that can be: 1. a UURI, 2. a snippet of the
     * crawl.log line, or 3. a snippet from the recover log. See
     * {@link #importUris(InputStream, String, boolean)} for how it subparses
     * the lines from crawl.log and recover.log.
     * @param forceFetch Should it be force-fetched.
     * @param isSeed True if seed.
     * @param isFlush If true, flush the frontier IF it implements
     * flushing.
     * @throws URIException
     */
    public void importUri(final String str, final boolean forceFetch,
            final boolean isSeed, final boolean isFlush)
    throws URIException {
        if (this.currentJob != null) {
            this.currentJob.importUri(str, forceFetch, isSeed, isFlush);
        }
    }

    /**
     * If it's a HostQueuesFrontier, it needs to be flushed for the queued URIs.
     */
    protected void doFlush() {
        if (this.currentJob != null) {
            this.currentJob.flush();
        }
    }

    public void stop() {
        if (isCrawling()) {
            deleteJob(getCurrentJob().getUID());
        }
    }

    public void requestCrawlStop() {
        if (this.currentJob != null) {
            this.currentJob.stopCrawling();
        }
    }

    /**
     * Ensure order file with new name/desc is written.
     * See '[ 1066573 ] sometimes job based-on other job uses older job name'.
     * @param newJob Newly created job.
     * @param metaname Metaname for new job.
     * @param description Description for new job.
     * @return <code>newJob</code>
     */
    public static CrawlJob ensureNewJobWritten(CrawlJob newJob, String metaname,
            String description) {
        XMLSettingsHandler settingsHandler = newJob.getSettingsHandler();
        CrawlerSettings orderfile = settingsHandler.getSettingsObject(null);
        orderfile.setName(metaname);
        orderfile.setDescription(description);
        settingsHandler.writeSettingsObject(orderfile);
        return newJob;
    }

    public void crawlStarted(String message) {
        // TODO Auto-generated method stub
    }

    public void crawlEnding(String sExitMessage) {
        loadJob(getStateJobFile(this.currentJob.getDirectory()));
        currentJob = null;
        synchronized (this) {
            // If the GUI terminated the job then it is waiting for this event.
            notifyAll();
        }
    }

    public void crawlEnded(String sExitMessage) {
        if (this.running) {
            startNextJob();
        }
    }

    public void crawlPausing(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlPaused(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlResuming(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlCheckpoint(File checkpointDir) throws Exception {
        // TODO Auto-generated method stub
    }
}
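
// Minimal usage sketch (illustrative, not part of the original source). It
// assumes the Heritrix classes above are on the classpath; the "jobs"
// directory name is made up for the example.
class CrawlJobHandlerExample {
    public static void main(String[] args) {
        // Construct a handler over a jobs directory; the default constructor
        // loads any existing profiles and jobs found on disk.
        CrawlJobHandler handler = new CrawlJobHandler(new java.io.File("jobs"));
        // Allow queued jobs to run; the next pending job (if any) starts as
        // soon as no other job is crawling.
        handler.startCrawler();
        System.out.println("accepting jobs: " + handler.isRunning());
        System.out.println("currently crawling: " + handler.isCrawling());
    }
}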