KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > admin > CrawlJob


1 /* CrawlJob
2  *
3  * Copyright (C) 2003 Internet Archive.
4  *
5  * This file is part of the Heritrix web crawler (crawler.archive.org).
6  *
7  * Heritrix is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU Lesser Public License as published by
9  * the Free Software Foundation; either version 2.1 of the License, or
10  * any later version.
11  *
12  * Heritrix is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU Lesser Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser Public License
18  * along with Heritrix; if not, write to the Free Software
19  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20  */

21 package org.archive.crawler.admin;
22
23 import java.io.BufferedReader JavaDoc;
24 import java.io.File JavaDoc;
25 import java.io.FileNotFoundException JavaDoc;
26 import java.io.FileReader JavaDoc;
27 import java.io.FileWriter JavaDoc;
28 import java.io.IOException JavaDoc;
29 import java.io.InputStream JavaDoc;
30 import java.io.InputStreamReader JavaDoc;
31 import java.io.PrintWriter JavaDoc;
32 import java.io.Serializable JavaDoc;
33 import java.io.StringWriter JavaDoc;
34 import java.util.ArrayList JavaDoc;
35 import java.util.Arrays JavaDoc;
36 import java.util.Collection JavaDoc;
37 import java.util.EventObject JavaDoc;
38 import java.util.Hashtable JavaDoc;
39 import java.util.Iterator JavaDoc;
40 import java.util.List JavaDoc;
41 import java.util.Map JavaDoc;
42 import java.util.logging.Level JavaDoc;
43 import java.util.logging.Logger JavaDoc;
44
45 import javax.management.Attribute JavaDoc;
46 import javax.management.AttributeList JavaDoc;
47 import javax.management.AttributeNotFoundException JavaDoc;
48 import javax.management.DynamicMBean JavaDoc;
49 import javax.management.InstanceAlreadyExistsException JavaDoc;
50 import javax.management.InvalidAttributeValueException JavaDoc;
51 import javax.management.MBeanAttributeInfo JavaDoc;
52 import javax.management.MBeanException JavaDoc;
53 import javax.management.MBeanInfo JavaDoc;
54 import javax.management.MBeanNotificationInfo JavaDoc;
55 import javax.management.MBeanOperationInfo JavaDoc;
56 import javax.management.MBeanParameterInfo JavaDoc;
57 import javax.management.MBeanRegistration JavaDoc;
58 import javax.management.MBeanRegistrationException JavaDoc;
59 import javax.management.MBeanServer JavaDoc;
60 import javax.management.NotCompliantMBeanException JavaDoc;
61 import javax.management.Notification JavaDoc;
62 import javax.management.NotificationBroadcasterSupport JavaDoc;
63 import javax.management.ObjectName JavaDoc;
64 import javax.management.ReflectionException JavaDoc;
65 import javax.management.RuntimeOperationsException JavaDoc;
66 import javax.management.openmbean.CompositeData JavaDoc;
67 import javax.management.openmbean.CompositeDataSupport JavaDoc;
68 import javax.management.openmbean.CompositeType JavaDoc;
69 import javax.management.openmbean.OpenDataException JavaDoc;
70 import javax.management.openmbean.OpenMBeanAttributeInfo JavaDoc;
71 import javax.management.openmbean.OpenMBeanAttributeInfoSupport JavaDoc;
72 import javax.management.openmbean.OpenMBeanConstructorInfoSupport JavaDoc;
73 import javax.management.openmbean.OpenMBeanInfoSupport JavaDoc;
74 import javax.management.openmbean.OpenMBeanOperationInfo JavaDoc;
75 import javax.management.openmbean.OpenMBeanOperationInfoSupport JavaDoc;
76 import javax.management.openmbean.OpenMBeanParameterInfo JavaDoc;
77 import javax.management.openmbean.OpenMBeanParameterInfoSupport JavaDoc;
78 import javax.management.openmbean.SimpleType JavaDoc;
79
80 import org.apache.commons.httpclient.URIException;
81 import org.archive.crawler.Heritrix;
82 import org.archive.crawler.datamodel.CandidateURI;
83 import org.archive.crawler.datamodel.Checkpoint;
84 import org.archive.crawler.datamodel.CrawlOrder;
85 import org.archive.crawler.event.CrawlStatusListener;
86 import org.archive.crawler.framework.CrawlController;
87 import org.archive.crawler.framework.FrontierMarker;
88 import org.archive.crawler.framework.StatisticsTracking;
89 import org.archive.crawler.framework.exceptions.InitializationException;
90 import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
91 import org.archive.crawler.frontier.AbstractFrontier;
92 import org.archive.crawler.settings.ComplexType;
93 import org.archive.crawler.settings.ModuleAttributeInfo;
94 import org.archive.crawler.settings.TextField;
95 import org.archive.crawler.settings.XMLSettingsHandler;
96 import org.archive.crawler.util.CheckpointUtils;
97 import org.archive.crawler.util.IoUtils;
98 import org.archive.util.ArchiveUtils;
99 import org.archive.util.FileUtils;
100 import org.archive.util.JEMBeanHelper;
101 import org.archive.util.JmxUtils;
102 import org.archive.util.iterator.LineReadingIterator;
103 import org.archive.util.iterator.RegexpLineIterator;
104
105 import com.sleepycat.je.DatabaseException;
106 import com.sleepycat.je.Environment;
107
108 /**
109  * A CrawlJob encapsulates a 'crawl order' with any and all information and
110  * methods needed by a CrawlJobHandler to accept and execute them.
111  *
112  * <p>A given crawl job may also be a 'profile' for a crawl. In that case it
113  * should not be executed as a crawl but can be edited and used as a template
114  * for creating new CrawlJobs.
115  *
116  * <p>All of its constructors are protected since only a CrawlJobHandler
117  * should construct new CrawlJobs.
118  *
119  * @author Kristinn Sigurdsson
120  *
121  * @see org.archive.crawler.admin.CrawlJobHandler#newJob(CrawlJob, String,
122  * String, String, String, int)
123  * @see org.archive.crawler.admin.CrawlJobHandler#newProfile(CrawlJob,
124  * String, String, String)
125  */

126
127 public class CrawlJob extends NotificationBroadcasterSupport JavaDoc
128 implements DynamicMBean JavaDoc, MBeanRegistration JavaDoc, CrawlStatusListener, Serializable JavaDoc {
129     /**
130      * Eclipse generated serial number.
131      */

132     private static final long serialVersionUID = 3411161000452525856L;
133     
134     private static final Logger JavaDoc logger =
135         Logger.getLogger(CrawlJob.class.getName());
136     /*
137      * Possible values for Priority
138      */

139     /** lowest */
140     public static final int PRIORITY_MINIMAL = 0;
141     /** low */
142     public static final int PRIORITY_LOW = 1;
143     /** average */
144     public static final int PRIORITY_AVERAGE = 2;
145     /** high */
146     public static final int PRIORITY_HIGH = 3;
147     /** highest */
148     public static final int PRIORITY_CRITICAL = 4;
149
150     /*
151      * Possible states for a Job.
152      */

153     /** Initial value. May not be ready to run/incomplete. */
154     public static final String JavaDoc STATUS_CREATED = "Created";
155     /** Job has been successfully submitted to a CrawlJobHandler */
156     public static final String JavaDoc STATUS_PENDING = "Pending";
157     /** Job is being crawled */
158     public static final String JavaDoc STATUS_RUNNING = "Running";
159     /** Job was deleted by user, will not be displayed in UI. */
160     public static final String JavaDoc STATUS_DELETED = "Deleted";
161     /** Job was terminated by user input while crawling */
162     public static final String JavaDoc STATUS_ABORTED = "Finished - Ended by operator";
163     /** Something went very wrong */
164     public static final String JavaDoc STATUS_FINISHED_ABNORMAL =
165         "Finished - Abnormal exit from crawling";
166     /** Job finished normally having completed its crawl. */
167     public static final String JavaDoc STATUS_FINISHED = "Finished";
168     /** Job finished normally when the specified timelimit was hit. */
169     public static final String JavaDoc STATUS_FINISHED_TIME_LIMIT =
170         "Finished - Timelimit hit";
171     /** Job finished normally when the specified amount of
172      * data (MB) had been downloaded */

173     public static final String JavaDoc STATUS_FINISHED_DATA_LIMIT =
174         "Finished - Maximum amount of data limit hit";
175     /** Job finished normally when the specified number of documents had been
176      * fetched.
177      */

178     public static final String JavaDoc STATUS_FINISHED_DOCUMENT_LIMIT =
179         "Finished - Maximum number of documents limit hit";
180     /** Job is going to be temporarily stopped after active threads are finished. */
181     public static final String JavaDoc STATUS_WAITING_FOR_PAUSE = "Pausing - " +
182         "Waiting for threads to finish";
183     /** Job was temporarily stopped. State is kept so it can be resumed */
184     public static final String JavaDoc STATUS_PAUSED = "Paused";
185     /**
186      * Job is being checkpointed. When finished checkpointing, job is set
187      * back to STATUS_PAUSED (Job must be first paused before checkpointing
188      * will run).
189      */

190     public static final String JavaDoc STATUS_CHECKPOINTING = "Checkpointing";
191     /** Job could not be launched due to an InitializationException */
192     public static final String JavaDoc STATUS_MISCONFIGURED = "Could not launch job " +
193         "- Fatal InitializationException";
194     /** Job is actually a profile */
195     public static final String JavaDoc STATUS_PROFILE = "Profile";
196     
197     public static final String JavaDoc STATUS_PREPARING = "Preparing";
198
199     // Class variables
200
private String JavaDoc UID; //A UID issued by the CrawlJobHandler.
201
private String JavaDoc name;
202     private String JavaDoc status;
203     private boolean isReadOnly = false;
204     private boolean isNew = true;
205     private boolean isProfile = false;
206     private boolean isRunning = false;
207     private int priority;
208     private int numberOfJournalEntries = 0;
209     
210     private String JavaDoc statisticsFileSave = "";
211
212     private String JavaDoc errorMessage = null;
213
214     private File JavaDoc jobDir = null;
215
216     private transient CrawlJobErrorHandler errorHandler = null;
217
218     protected transient XMLSettingsHandler settingsHandler;
219     
220     private transient CrawlController controller = null;
221     
222     private static final String JavaDoc RECOVERY_JOURNAL_STYLE = "recoveryJournal";
223     private static final String JavaDoc CRAWL_LOG_STYLE = "crawlLog";
224     
225     // OpenMBean support.
226

227     /**
228      * Server we registered with. Maybe null.
229      */

230     private transient MBeanServer JavaDoc mbeanServer = null;
231     private transient ObjectName JavaDoc mbeanName = null;
232     private static final String JavaDoc CRAWLJOB_JMXMBEAN_TYPE =
233         JmxUtils.SERVICE + ".Job";
234     private transient JEMBeanHelper bdbjeMBeanHelper = null;
235     private transient List JavaDoc<String JavaDoc> bdbjeAttributeNameList = null;
236     private transient List JavaDoc<String JavaDoc> bdbjeOperationsNameList = null;
237     
238     
239     /**
240      * The MBean we've registered ourselves with (May be null
241      * throughout life of Heritrix).
242      */

243     private transient OpenMBeanInfoSupport JavaDoc openMBeanInfo;
244     
245     private final static String JavaDoc NAME_ATTR = "Name";
246     private final static String JavaDoc UID_ATTR = "UID";
247     private final static String JavaDoc STATUS_ATTR = "Status";
248     private final static String JavaDoc FRONTIER_SHORT_REPORT_ATTR =
249         "FrontierShortReport";
250     private final static String JavaDoc THREADS_SHORT_REPORT_ATTR =
251         "ThreadsShortReport";
252     private final static String JavaDoc TOTAL_DATA_ATTR = "TotalData";
253     private final static String JavaDoc CRAWL_TIME_ATTR = "CrawlTime";
254     private final static String JavaDoc DOC_RATE_ATTR = "DocRate";
255     private final static String JavaDoc CURRENT_DOC_RATE_ATTR = "CurrentDocRate";
256     private final static String JavaDoc KB_RATE_ATTR = "KbRate";
257     private final static String JavaDoc CURRENT_KB_RATE_ATTR = "CurrentKbRate";
258     private final static String JavaDoc THREAD_COUNT_ATTR = "ThreadCount";
259     private final static String JavaDoc DOWNLOAD_COUNT_ATTR = "DownloadedCount";
260     private final static String JavaDoc DISCOVERED_COUNT_ATTR = "DiscoveredCount";
261     private final static String JavaDoc [] ATTRIBUTE_ARRAY = {NAME_ATTR, UID_ATTR,
262         STATUS_ATTR, FRONTIER_SHORT_REPORT_ATTR, THREADS_SHORT_REPORT_ATTR,
263         TOTAL_DATA_ATTR, CRAWL_TIME_ATTR, DOC_RATE_ATTR,
264         CURRENT_DOC_RATE_ATTR, KB_RATE_ATTR, CURRENT_KB_RATE_ATTR,
265         THREAD_COUNT_ATTR, DOWNLOAD_COUNT_ATTR, DISCOVERED_COUNT_ATTR};
266     private final static List JavaDoc ATTRIBUTE_LIST = Arrays.asList(ATTRIBUTE_ARRAY);
267     
268     private final static String JavaDoc IMPORT_URI_OPER = "importUri";
269     private final static String JavaDoc IMPORT_URIS_OPER = "importUris";
270     private final static String JavaDoc PAUSE_OPER = "pause";
271     private final static String JavaDoc RESUME_OPER = "resume";
272     private final static String JavaDoc FRONTIER_REPORT_OPER = "frontierReport";
273     private final static String JavaDoc THREADS_REPORT_OPER = "threadsReport";
274     private final static String JavaDoc SEEDS_REPORT_OPER = "seedsReport";
275     private final static String JavaDoc CHECKPOINT_OPER = "startCheckpoint";
276     private final static String JavaDoc PROGRESS_STATISTICS_OPER =
277         "progressStatistics";
278     private final static String JavaDoc PROGRESS_STATISTICS_LEGEND_OPER =
279         "progressStatisticsLegend";
280     
281     private final static String JavaDoc PROG_STATS = "progressStatistics";
282     
283     // Same as JEMBeanHelper.OP_DB_STAT
284
private final static String JavaDoc OP_DB_STAT = "getDatabaseStats";
285     
286     /**
287      * Don't add the following crawl-order items.
288      */

289     private final static List JavaDoc ORDER_EXCLUDE;
290     static {
291         ORDER_EXCLUDE = Arrays.asList(new String JavaDoc [] {"bdb-cache-percent",
292             "extract-processors", "DNS", "uri-included-structure"});
293     }
294     
295     /**
296      * Sequence number for jmx notifications.
297      */

298     private static int notificationsSequenceNumber = 1;
299     
300     /**
301      * A shutdown Constructor.
302      */

303     protected CrawlJob() {
304         super();
305     }
306
307     /**
308      * A constructor for jobs.
309      *
310      * <p> Create, ready to crawl, jobs.
311      * @param UID A unique ID for this job. Typically emitted by the
312      * CrawlJobHandler.
313      * @param name The name of the job
314      * @param settingsHandler The associated settings
315      * @param errorHandler The crawl jobs settings error handler.
316      * <tt>null</tt> means none is set
317      * @param priority job priority.
318      * @param dir The directory that is considered this jobs working directory.
319      */

320     public CrawlJob(final String JavaDoc UID,
321             final String JavaDoc name, final XMLSettingsHandler settingsHandler,
322             final CrawlJobErrorHandler errorHandler, final int priority,
323             final File JavaDoc dir) {
324         this(UID, name, settingsHandler, errorHandler,
325                 priority, dir, null, false, true);
326     }
327
328     /**
329      * A constructor for profiles.
330      *
331      * <p> Any job created with this constructor will be
332      * considered a profile. Profiles are not stored on disk (only their
333      * settings files are stored on disk). This is because their data is
334      * predictible given any settings files.
335      * @param UIDandName A unique ID for this job. For profiles this is the same
336      * as name
337      * @param settingsHandler The associated settings
338      * @param errorHandler The crawl jobs settings error handler.
339      * <tt>null</tt> means none is set
340      */

341     protected CrawlJob(final String JavaDoc UIDandName,
342             final XMLSettingsHandler settingsHandler,
343             final CrawlJobErrorHandler errorHandler) {
344         this(UIDandName, UIDandName, settingsHandler, errorHandler,
345             PRIORITY_AVERAGE, null, STATUS_PROFILE, true, false);
346     }
347     
348     public CrawlJob(final String JavaDoc UID,
349             final String JavaDoc name, final XMLSettingsHandler settingsHandler,
350             final CrawlJobErrorHandler errorHandler, final int priority,
351             final File JavaDoc dir, final String JavaDoc status, final boolean isProfile,
352             final boolean isNew) {
353         super();
354         this.UID = UID;
355         this.name = name;
356         this.settingsHandler = settingsHandler;
357         this.errorHandler = errorHandler;
358         this.status = status;
359         this.isProfile = isProfile;
360         this.isNew = isNew;
361         this.jobDir = dir;
362         this.priority = priority;
363     }
364
365     /**
366      * A constructor for reloading jobs from disk. Jobs (not profiles) have
367      * their data written to persistent storage in the file system. This method
368      * is used to load the job from such storage. This is done by the
369      * <code>CrawlJobHandler</code>.
370      * <p>
371      * Proper structure of a job file (TODO: Maybe one day make this an XML file)
372      * Line 1. UID <br>
373      * Line 2. Job name (string) <br>
374      * Line 3. Job status (string) <br>
375      * Line 4. is job read only (true/false) <br>
376      * Line 5. is job running (true/false) <br>
377      * Line 6. job priority (int) <br>
378      * Line 7. number of journal entries <br>
379      * Line 8. setting file (with path) <br>
380      * Line 9. statistics tracker file (with path) <br>
381      * Line 10-?. error message (String, empty for null), can be many lines <br>
382      * @param jobFile
383      * a file containing information about the job to load.
384      * @param errorHandler The crawl jobs settings error handler.
385      * null means none is set
386      * @throws InvalidJobFileException
387      * if the specified file does not refer to a valid job file.
388      * @throws IOException
389      * if io operations fail
390      */

391     protected CrawlJob(final File JavaDoc jobFile,
392             final CrawlJobErrorHandler errorHandler)
393             throws InvalidJobFileException, IOException JavaDoc {
394         this(null, null, null, errorHandler,
395                 PRIORITY_AVERAGE, null, null, false, true);
396         this.jobDir = jobFile.getParentFile();
397         
398         // Check for corrupt job.state files (can be corrupt if we crash).
399
if (jobFile.length() == 0) {
400             throw new InvalidJobFileException(jobFile.getCanonicalPath() +
401                 " is corrupt (length is zero)");
402         }
403         
404         // Open file. Read data and set up class variables accordingly...
405
BufferedReader JavaDoc jobReader =
406             new BufferedReader JavaDoc(new FileReader JavaDoc(jobFile), 4096);
407         // UID
408
this.UID = jobReader.readLine();
409         // name
410
this.name = jobReader.readLine();
411         // status
412
this.status = jobReader.readLine();
413         if(status.equals(STATUS_ABORTED)==false
414                 && status.equals(STATUS_CREATED)==false
415                 && status.equals(STATUS_DELETED)==false
416                 && status.equals(STATUS_FINISHED)==false
417                 && status.equals(STATUS_FINISHED_ABNORMAL)==false
418                 && status.equals(STATUS_FINISHED_DATA_LIMIT)==false
419                 && status.equals(STATUS_FINISHED_DOCUMENT_LIMIT)==false
420                 && status.equals(STATUS_FINISHED_TIME_LIMIT)==false
421                 && status.equals(STATUS_MISCONFIGURED)==false
422                 && status.equals(STATUS_PAUSED)==false
423                 && status.equals(STATUS_CHECKPOINTING)==false
424                 && status.equals(STATUS_PENDING)==false
425                 && status.equals(STATUS_RUNNING)==false
426                 && status.equals(STATUS_WAITING_FOR_PAUSE)==false
427                 && status.equals(STATUS_PREPARING)==false){
428             // status is invalid. Must be one of the above
429
throw new InvalidJobFileException("Status (line 3) in job file " +
430                     "is not valid: '" + status + "'");
431         }
432         // isReadOnly
433
String JavaDoc tmp = jobReader.readLine();
434         if(tmp.equals("true")){
435             isReadOnly = true;
436         } else if(tmp.equals("false")){
437             isReadOnly = false;
438         } else {
439             throw new InvalidJobFileException("isReadOnly (line 4) in job" +
440                     " file '" + jobFile.getAbsolutePath() + "' is not " +
441                     "valid: '" + tmp + "'");
442         }
443         // isRunning
444
tmp = jobReader.readLine();
445         if(tmp.equals("true")){
446             this.isRunning = true;
447         } else if(tmp.equals("false")){
448             this.isRunning = false;
449         } else {
450             throw new InvalidJobFileException("isRunning (line 5) in job " +
451                     "file '" + jobFile.getAbsolutePath() + "' is not valid: " +
452                     "'" + tmp + "'");
453         }
454         // priority
455
tmp = jobReader.readLine();
456         try{
457             this.priority = Integer.parseInt(tmp);
458         } catch(NumberFormatException JavaDoc e){
459             throw new InvalidJobFileException("priority (line 5) in job " +
460                     "file '" + jobFile.getAbsolutePath() + "' is not valid: " +
461                     "'" + tmp + "'");
462         }
463         // numberOfJournalEntries
464
tmp = jobReader.readLine();
465         try{
466             this.numberOfJournalEntries = Integer.parseInt(tmp);
467         } catch(NumberFormatException JavaDoc e){
468             throw new InvalidJobFileException("numberOfJournalEntries " +
469                     "(line 5) in job file '" + jobFile.getAbsolutePath() +
470                     "' is not valid: " + "'" + tmp + "'");
471         }
472         // settingsHandler
473
tmp = jobReader.readLine();
474         try {
475             File JavaDoc f = new File JavaDoc(tmp);
476             this.settingsHandler = new XMLSettingsHandler((f.isAbsolute())?
477                 f: new File JavaDoc(jobDir, f.getName()));
478             if(this.errorHandler != null){
479                 this.settingsHandler.registerValueErrorHandler(errorHandler);
480             }
481             this.settingsHandler.initialize();
482         } catch (InvalidAttributeValueException JavaDoc e1) {
483             throw new InvalidJobFileException("Problem reading from settings " +
484                     "file (" + tmp + ") specified in job file '" +
485                     jobFile.getAbsolutePath() + "'\n" + e1.getMessage());
486         }
487         // Statistics tracker.
488
jobReader.readLine();
489         // errorMessage
490
// TODO: Multilines
491
tmp = jobReader.readLine();
492         errorMessage = "";
493         while(tmp!=null){
494             errorMessage+=tmp+'\n';
495             tmp = jobReader.readLine();
496         }
497         if(errorMessage.length()==0){
498             // Empty error message should be null
499
errorMessage = null;
500         }
501         // TODO: Load stattrack if needed.
502

503         // TODO: This should be inside a finally block.
504
jobReader.close();
505     }
506
507     /**
508      * Cause the job to be written to persistent storage.
509      * This will also save the statistics tracker if it is not null and the
510      * job status is finished (regardless of how it's finished)
511      */

512     private void writeJobFile() {
513         if (isProfile) {
514             return;
515         }
516         
517         final String JavaDoc jobDirAbsolute = jobDir.getAbsolutePath();
518         if (!jobDir.exists() || !jobDir.canWrite()) {
519             logger.warning("Can't update status on " +
520                 jobDirAbsolute + " because file does not" +
521                 " exist (or is unwriteable)");
522             return;
523         }
524         File JavaDoc f = new File JavaDoc(jobDirAbsolute, "state.job");
525
526         String JavaDoc settingsFile = getSettingsDirectory();
527         // Make settingsFile's path relative if order.xml is somewhere in the
528
// job's directory tree
529
if(settingsFile.startsWith(jobDirAbsolute.concat(File.separator))) {
530             settingsFile = settingsFile.substring(jobDirAbsolute.length()+1);
531         }
532         try {
533             FileWriter JavaDoc jobWriter = new FileWriter JavaDoc(f, false);
534             try {
535                 jobWriter.write(UID + "\n");
536                 jobWriter.write(name + "\n");
537                 jobWriter.write(status + "\n");
538                 jobWriter.write(isReadOnly + "\n");
539                 jobWriter.write(isRunning + "\n");
540                 jobWriter.write(priority + "\n");
541                 jobWriter.write(numberOfJournalEntries + "\n");
542                 jobWriter.write(settingsFile + "\n");
543                 jobWriter.write(statisticsFileSave + "\n");// TODO: Is this
544
// right?
545
// Can be multiple lines so we keep it last
546
if (errorMessage != null) {
547                     jobWriter.write(errorMessage + "\n");
548                 }
549             } finally {
550                 if (jobWriter != null) {
551                     jobWriter.close();
552                 }
553             }
554         } catch (IOException JavaDoc e) {
555             logger.log(Level.WARNING, "An IOException occured saving job " +
556                     name + " (" + UID + ")", e);
557         }
558     }
559   
560     /**
561      * Returns this jobs unique ID (UID) that was issued by the
562      * CrawlJobHandler() when this job was first created.
563      *
564      * @return Job This jobs UID.
565      * @see CrawlJobHandler#getNextJobUID()
566      */

567     public String JavaDoc getUID(){
568         return UID;
569     }
570
571     /**
572      * Returns this job's 'name'. The name comes from the settings for this job,
573      * need not be unique and may change. For a unique identifier use
574      * {@link #getUID() getUID()}.
575      * <p>
576      * The name corrisponds to the value of the 'name' tag in the 'meta' section
577      * of the settings file.
578      *
579      * @return This job's 'name'
580      */

581     public String JavaDoc getJobName(){
582         return name;
583     }
584
585     /**
586      * Return the combination of given name and UID most commonly
587      * used in administrative interface.
588      *
589      * @return Job's name with UID notation
590      */

591     public String JavaDoc getDisplayName() {
592         return getJobName()+" ["+getUID()+"]";
593     }
594
595     /**
596      * Set this job's level of priority.
597      *
598      * @param priority The level of priority
599      *
600      * @see #getJobPriority()
601      * @see #PRIORITY_MINIMAL
602      * @see #PRIORITY_LOW
603      * @see #PRIORITY_AVERAGE
604      * @see #PRIORITY_HIGH
605      * @see #PRIORITY_CRITICAL
606      */

607     public void setJobPriority(int priority) {
608         this.priority = priority;
609     }
610
611     /**
612      * Get this job's level of priority.
613      *
614      * @return this job's priority
615      * @see #setJobPriority(int)
616      * @see #PRIORITY_MINIMAL
617      * @see #PRIORITY_LOW
618      * @see #PRIORITY_AVERAGE
619      * @see #PRIORITY_HIGH
620      * @see #PRIORITY_CRITICAL
621      */

622     public int getJobPriority() {
623         return priority;
624     }
625
626     /**
627      * Once called no changes can be made to the settings for this job.
628      * Typically this is done once a crawl is completed and further changes
629      * to the crawl order are therefor meaningless.
630      */

631     public void setReadOnly() {
632         isReadOnly = true;
633         writeJobFile(); //Save changes
634
}
635
636     /**
637      * Is job read only?
638      * @return false until setReadOnly has been invoked, after that it returns true.
639      */

640     public boolean isReadOnly(){
641         return isReadOnly;
642     }
643
644     /**
645      * Set the status of this CrawlJob.
646      *
647      * @param status Current status of CrawlJob
648      * (see constants defined here beginning with STATUS)
649      */

650     public void setStatus(String JavaDoc status) {
651         this.status = status;
652         writeJobFile(); //Save changes
653
// TODO: If job finished, save StatisticsTracker!
654
}
655
656     /**
657      * @return Status of the crawler (Used by JMX).
658      */

659     public String JavaDoc getCrawlStatus() {
660         return this.controller != null?
661             this.controller.getState().toString(): "Illegal State";
662     }
663     
664     /**
665      * Get the current status of this CrawlJob
666      *
667      * @return The current status of this CrawlJob
668      * (see constants defined here beginning with STATUS)
669      */

670     public String JavaDoc getStatus() {
671         return this.status;
672     }
673
674     /**
675      * Returns the settings handler for this job. It will have been initialized.
676      * @return the settings handler for this job.
677      */

678     public XMLSettingsHandler getSettingsHandler() {
679         return this.settingsHandler;
680     }
681     /**
682      * Is this a new job?
683      * @return True if is new.
684      */

685     public boolean isNew() {
686         return isNew;
687     }
688
689     /**
690      * Set if the job is considered to be a profile
691      * @return True if is a profile.
692      */

693     public boolean isProfile() {
694         return isProfile;
695     }
696
697     /**
698      * Set if the job is considered a new job or not.
699      * @param b Is the job considered to be new.
700      */

701     public void setNew(boolean b) {
702         isNew = b;
703         writeJobFile(); //Save changes
704
}
705
706     /**
707      * Returns true if the job is being crawled.
708      * @return true if the job is being crawled
709      */

710     public boolean isRunning() {
711         return isRunning;
712     }
713
714     /**
715      * Set if job is being crawled.
716      * @param b Is job being crawled.
717      */

718     protected void setRunning(boolean b) {
719         isRunning = b;
720         writeJobFile(); // Save changes
721
//TODO: Job ending -> Save statistics tracker.
722
//TODO: This is likely to happen as the CrawlEnding event occurs,
723
// need to ensure that the StatisticsTracker is saved to disk on
724
// CrawlEnded. Maybe move responsibility for this into the
725
// StatisticsTracker?
726
}
727     
728     protected void unregisterMBean() {
729         // Unregister current job from JMX agent, if there one.
730
if (this.mbeanServer == null) {
731             return;
732         }
733         try {
734             this.mbeanServer.unregisterMBean(this.mbeanName);
735             this.mbeanServer = null;
736         } catch (Exception JavaDoc e) {
737             logger.log(Level.SEVERE, "Failed with " + this.mbeanName, e);
738         }
739     }
740     
741     /**
742      * Subclass of crawlcontroller that unregisters beans when stopped.
743      * Done as subclass so CrawlController doesn't get any JMX (or 'CrawlJob')
744      * pollution, so for sure CrawlJob is unregistered with JMX and so any
745      * listeners on the CrawlJob get a chance to get crawl ended message
746      * (These latter notifications may not actually be getting through -- TBD).
747      * <p>TODO: This override dirtys the data model since CC knows about CJs.
748      * The facility provided by this class emitting events and statistics so
749      * they can be read by JMX needs to go back into CC. Probably best to
750      * registering in JMX the CC, rather than CJ. Lets do this in Heritrix 2.0
751      * since means changing the JMX API some.
752      */

753     public class MBeanCrawlController extends CrawlController
754     implements Serializable JavaDoc {
755         private static final long serialVersionUID = -4608537998168407222L;
756         private CrawlJob cj = null;
757         private CompositeType JavaDoc ct = null;
758         
759         public CrawlJob getCrawlJob() {
760             return this.cj;
761         }
762
763         public void setCrawlJob(CrawlJob cj) {
764             this.cj = cj;
765         }
766         
767         public void progressStatisticsEvent(final EventObject JavaDoc e) {
768             super.progressStatisticsEvent(e);
769             if (this.cj.getMbeanName() == null) {
770                 // Can be null around job startup. Return w/o doing anything.
771
return;
772             }
773                 
774             Map JavaDoc s = ((StatisticsTracking)e.getSource()).getProgressStatistics();
775             // Convert the statistics to OpenType CompositeData and add as
776
// user data to Notification.
777
CompositeData JavaDoc cd = null;
778             try {
779                 if (this.ct == null) {
780                     this.ct = JmxUtils.createCompositeType(s, PROG_STATS,
781                         PROG_STATS + " for " + this.cj.getMbeanName());
782                 }
783                 cd = new CompositeDataSupport JavaDoc(this.ct, s);
784             } catch (OpenDataException JavaDoc ode) {
785                 ode.printStackTrace();
786             }
787             if (cd != null) {
788                 Notification JavaDoc n = new Notification JavaDoc(PROG_STATS,
789                     this.cj.getMbeanName(), getNotificationsSequenceNumber(),
790                     ((StatisticsTracking)e.getSource()).
791                         getProgressStatisticsLine());
792                 n.setUserData(cd);
793                 this.cj.sendNotification(n);
794             }
795         }
796         
797         protected void completeStop() {
798             try {
799                 super.completeStop();
800             } finally {
801                 if (this.cj != null) {
802                     this.cj.unregisterMBean();
803                 }
804                 this.cj = null;
805             }
806         }
807     }
808     
809     protected CrawlController setupCrawlController()
810     throws InitializationException {
811         CrawlController controller = null;
812         
813         // Check if we're to do a checkpoint recover. If so, deserialize
814
// the checkpoint's CrawlController and use that in place of a new
815
// CrawlController instance.
816
Checkpoint cp = CrawlController.
817             getCheckpointRecover(getSettingsHandler().getOrder());
818         if (cp != null) {
819             try {
820                 controller = (MBeanCrawlController)CheckpointUtils.
821                     readObjectFromFile(MBeanCrawlController.class,
822                         cp.getDirectory());
823             } catch (FileNotFoundException JavaDoc e) {
824                 throw new InitializationException(e);
825             } catch (IOException JavaDoc e) {
826                 throw new InitializationException(e);
827             } catch (ClassNotFoundException JavaDoc e) {
828                 throw new InitializationException(e);
829             }
830         } else {
831             controller = new MBeanCrawlController();
832         }
833         return controller;
834     }
835     
836     protected CrawlController createCrawlController() {
837         return new MBeanCrawlController();
838     }
839     
840     public void setupForCrawlStart()
841     throws InitializationException {
842         try {
843             this.controller = setupCrawlController();
844             // Register as listener to get job finished notice.
845
this.controller.addCrawlStatusListener(this);
846             this.controller.initialize(getSettingsHandler());
847             // Set the crawl job this MBeanCrawlController needs to worry about.
848
((MBeanCrawlController)this.controller).setCrawlJob(this);
849             // Create our mbean description and register our crawljob.
850
this.openMBeanInfo = buildMBeanInfo();
851             try {
852                 Heritrix.registerMBean(this, getJmxJobName(),
853                     CRAWLJOB_JMXMBEAN_TYPE);
854             } catch (InstanceAlreadyExistsException JavaDoc e) {
855                 throw new InitializationException(e);
856             } catch (MBeanRegistrationException JavaDoc e) {
857                 throw new InitializationException(e);
858             } catch (NotCompliantMBeanException JavaDoc e) {
859                 throw new InitializationException(e);
860             }
861         } catch (InitializationException e) {
862             // Can't load current job since it is misconfigured.
863
setStatus(CrawlJob.STATUS_MISCONFIGURED);
864             setErrorMessage("A fatal InitializationException occured when "
865                     + "loading job:\n" + e.getMessage());
866             // Log to stdout so its seen in logs as well as in UI.
867
e.printStackTrace();
868             this.controller = null;
869             throw e;
870         }
871         setStatus(CrawlJob.STATUS_RUNNING);
872         setRunning(true);
873     }
874     
875     public void stopCrawling() {
876         if(this.controller != null) {
877             this.controller.requestCrawlStop();
878         }
879     }
880
881     /**
882      * @return One-line Frontier report.
883      */

884     public String JavaDoc getFrontierOneLine() {
885         if (this.controller == null || this.controller.getFrontier() == null) {
886             return "Crawler not running";
887         }
888         return this.controller.getFrontier().singleLineReport();
889     }
890     
891     /**
892      * @param reportName Name of report to write.
893      * @return A report of the frontier's status.
894      */

895     public String JavaDoc getFrontierReport(final String JavaDoc reportName) {
896         if (this.controller == null || this.controller.getFrontier() == null) {
897             return "Crawler not running";
898         }
899         return ArchiveUtils.writeReportToString(this.controller.getFrontier(),
900                 reportName);
901     }
902     
903     /**
904      * Write the requested frontier report to the given PrintWriter
905      * @param reportName Name of report to write.
906      * @param writer Where to write to.
907      */

908     public void writeFrontierReport(String JavaDoc reportName, PrintWriter JavaDoc writer) {
909         if (this.controller == null || this.controller.getFrontier() == null) {
910             writer.println("Crawler not running.");
911             return;
912         }
913         this.controller.getFrontier().reportTo(reportName,writer);
914     }
915
916     /**
917      * @return One-line threads report.
918      */

919     public String JavaDoc getThreadOneLine() {
920         if (this.controller == null) {
921             return "Crawler not running";
922         }
923         return this.controller.oneLineReportThreads();
924     }
925     
926     /**
927      * Get the CrawlControllers ToeThreads report for the running crawl.
928      * @return The CrawlControllers ToeThreads report
929      */

930     public String JavaDoc getThreadsReport() {
931         if (this.controller == null) {
932             return "Crawler not running";
933         }
934         return ArchiveUtils.writeReportToString(this.controller.getToePool(),
935                 null);
936     }
937     
938     /**
939      * Write the requested threads report to the given PrintWriter
940      * @param reportName Name of report to write.
941      * @param writer Where to write to.
942      */

943     public void writeThreadsReport(String JavaDoc reportName, PrintWriter JavaDoc writer) {
944         if (this.controller == null || this.controller.getFrontier() == null) {
945             writer.println("Crawler not running.");
946             return;
947         }
948         this.controller.getToePool().reportTo(reportName, writer);
949     }
950     
951     /**
952      * Kills a thread. For details see
953      * {@link org.archive.crawler.framework.ToePool#killThread(int, boolean)
954      * ToePool.killThread(int, boolean)}.
955      * @param threadNumber Thread to kill.
956      * @param replace Should thread be replaced.
957      * @see org.archive.crawler.framework.ToePool#killThread(int, boolean)
958      */

959     public void killThread(int threadNumber, boolean replace) {
960         if (this.controller == null) {
961             return;
962         }
963         this.controller.killThread(threadNumber, replace);
964     }
965
966     /**
967      * Get the Processors report for the running crawl.
968      * @return The Processors report for the running crawl.
969      */

970     public String JavaDoc getProcessorsReport() {
971         if (this.controller == null) {
972             return "Crawler not running";
973         }
974         return ArchiveUtils.writeReportToString(this.controller,
975                 CrawlController.PROCESSORS_REPORT);
976     }
977     
978     /**
979      * Returns the directory where the configuration files for this job are
980      * located.
981      *
982      * @return the directory where the configuration files for this job are
983      * located
984      */

985     public String JavaDoc getSettingsDirectory() {
986         return settingsHandler.getOrderFile().getPath();
987     }
988
989     /**
990      * Returns the path of the job's base directory. For profiles this is always
991      * equal to <code>new File(getSettingsDirectory())</code>.
992      * @return the path of the job's base directory.
993      */

994     public File JavaDoc getDirectory(){
995         return isProfile? new File JavaDoc(getSettingsDirectory()): jobDir;
996     }
997
998     /**
999      * Get the error message associated with this job. Will return null if there
1000     * is no error message.
1001     * @return the error message associated with this job
1002     */

1003    public String JavaDoc getErrorMessage() {
1004        return errorMessage;
1005    }
1006
1007    /**
1008     * Set an error message for this job. Generally this only occurs if the job
1009     * is misconfigured.
1010     * @param string the error message associated with this job
1011     */

1012    public void setErrorMessage(String JavaDoc string) {
1013        errorMessage = string;
1014        writeJobFile(); //Save changes
1015
}
1016
1017    /**
1018     * @return Returns the number of journal entries.
1019     */

1020    public int getNumberOfJournalEntries() {
1021        return numberOfJournalEntries;
1022    }
1023
1024    /**
1025     * @param numberOfJournalEntries The number of journal entries to set.
1026     */

1027    public void setNumberOfJournalEntries(int numberOfJournalEntries) {
1028        this.numberOfJournalEntries = numberOfJournalEntries;
1029        writeJobFile();
1030    }
1031
1032    /**
1033     * @return Returns the error handler for this crawl job
1034     */

1035    public CrawlJobErrorHandler getErrorHandler() {
1036        return errorHandler;
1037    }
1038
1039    /**
1040     * Read all the checkpoints found in the job's checkpoints
1041     * directory into Checkpoint instances
1042     * @return Collection containing list of all checkpoints.
1043     */

1044    public Collection JavaDoc scanCheckpoints() {
1045        File JavaDoc checkpointsDirectory =
1046            settingsHandler.getOrder().getCheckpointsDirectory();
1047        File JavaDoc[] perCheckpointDirs = checkpointsDirectory.listFiles();
1048        Collection JavaDoc<Checkpoint> checkpoints = new ArrayList JavaDoc<Checkpoint>();
1049        if (perCheckpointDirs != null) {
1050            for (int i = 0; i < perCheckpointDirs.length; i++) {
1051                Checkpoint cp = new Checkpoint(perCheckpointDirs[i]);
1052                checkpoints.add(cp);
1053            }
1054        }
1055        return checkpoints;
1056    }
1057
1058    /**
1059     * Returns the absolute path of the specified log.
1060     * Note: If crawl has not begun, this file may not exist.
1061     * @param log
1062     * @return the absolute path for the specified log.
1063     * @throws AttributeNotFoundException
1064     * @throws ReflectionException
1065     * @throws MBeanException
1066     */

1067    public String JavaDoc getLogPath(String JavaDoc log)
1068    throws AttributeNotFoundException JavaDoc, MBeanException JavaDoc, ReflectionException JavaDoc {
1069        String JavaDoc logsPath = (String JavaDoc)settingsHandler.getOrder().
1070            getAttribute(CrawlOrder.ATTR_LOGS_PATH);
1071        CrawlOrder order = settingsHandler.getOrder();
1072        String JavaDoc diskPath = (String JavaDoc) order.getAttribute(null,
1073            CrawlOrder.ATTR_DISK_PATH);
1074        File JavaDoc disk = settingsHandler.
1075            getPathRelativeToWorkingDirectory(diskPath);
1076        File JavaDoc f = new File JavaDoc(logsPath, log);
1077        if (!f.isAbsolute()) {
1078            f = new File JavaDoc(disk.getPath(), f.getPath());
1079        }
1080        return f.getAbsolutePath();
1081    }
1082
1083    // OpenMBean implementation.
1084

1085    protected void pause() {
1086        if (this.controller != null && this.controller.isPaused() == false) {
1087            this.controller.requestCrawlPause();
1088        }
1089    }
1090    
1091    protected void resume() {
1092        if (this.controller != null) {
1093            this.controller.requestCrawlResume();
1094        }
1095    }
1096
1097    /**
1098     * @throws IllegalStateException Thrown if crawl is not paused.
1099     */

1100    protected void checkpoint() throws IllegalStateException JavaDoc {
1101        if (this.controller != null) {
1102            this.controller.requestCrawlCheckpoint();
1103        }
1104    }
1105    
1106    /**
1107     * @return True if checkpointing.
1108     */

1109    public boolean isCheckpointing() {
1110        return this.controller != null? this.controller.isCheckpointing(): false;
1111    }
1112    
1113    /**
1114     * If its a HostQueuesFrontier, needs to be flushed for the queued.
1115     */

1116    protected void flush() {
1117        // Nothing to do.
1118
}
1119
1120    /**
1121     * Delete any URI from the frontier of the current (paused) job that match
1122     * the specified regular expression. If the current job is not paused (or
1123     * there is no current job) nothing will be done.
1124     * @param regexpr Regular expression to delete URIs by.
1125     * @return the number of URIs deleted
1126     */

1127    public long deleteURIsFromPending(String JavaDoc regexpr){
1128        return (this.controller != null &&
1129                this.controller.getFrontier() != null &&
1130                this.controller.isPaused())?
1131            this.controller.getFrontier().deleteURIs(regexpr): 0;
1132    }
1133    
1134    public String JavaDoc importUris(String JavaDoc file, String JavaDoc style, String JavaDoc force) {
1135        return importUris(file, style, "true".equals(force));
1136    }
1137    
1138    public String JavaDoc importUris(final String JavaDoc fileOrUrl, final String JavaDoc style,
1139            final boolean forceRevisit) {
1140        return importUris(fileOrUrl, style, forceRevisit, false);
1141    }
1142
1143    /**
1144     * @param fileOrUrl Name of file w/ seeds.
1145     * @param style What style of seeds -- crawl log, recovery journal, or
1146     * seeds file.
1147     * @param forceRevisit Should we revisit even if seen before?
1148     * @param areSeeds Is the file exclusively seeds?
1149     * @return A display string that has a count of all added.
1150     */

1151    public String JavaDoc importUris(final String JavaDoc fileOrUrl, final String JavaDoc style,
1152            final boolean forceRevisit, final boolean areSeeds) {
1153        InputStream JavaDoc is =
1154            IoUtils.getInputStream(this.controller.getDisk(), fileOrUrl);
1155        String JavaDoc message = null;
1156        // Do we have an inputstream?
1157
if (is == null) {
1158            message = "Failed to get inputstream from " + fileOrUrl;
1159            logger.severe(message);
1160        } else {
1161            int addedCount = importUris(is, style, forceRevisit, areSeeds);
1162            message = Integer.toString(addedCount) + " URIs added from " +
1163                fileOrUrl;
1164        }
1165        return message;
1166    }
1167    
1168    protected int importUris(InputStream JavaDoc is, String JavaDoc style,
1169            boolean forceRevisit) {
1170        return importUris(is, style, forceRevisit, false);
1171    }
1172    
1173    /**
1174     * Import URIs.
1175     * @param is Stream to use as URI source.
1176     * @param style Style in which URIs are rendored. Currently support for
1177     * <code>recoveryJournal</code>, <code>crawlLog</code>, and seeds file
1178     * format (i.e <code>default</code>) where <code>default</code> style is
1179     * a UURI per line (comments allowed).
1180     * @param forceRevisit Whether we should revisit this URI even if we've
1181     * visited it previously.
1182     * @param areSeeds Are the imported URIs seeds?
1183     * @return Count of added URIs.
1184     */

1185    protected int importUris(InputStream JavaDoc is, String JavaDoc style,
1186            boolean forceRevisit, final boolean areSeeds) {
1187        // Figure the regex to use parsing each line of input stream.
1188
String JavaDoc extractor;
1189        String JavaDoc output;
1190        if(CRAWL_LOG_STYLE.equals(style)) {
1191            // Skip first 3 fields
1192
extractor = "\\S+\\s+\\S+\\s+\\S+\\s+(\\S+\\s+\\S+\\s+\\S+\\s+).*";
1193            output = "$1";
1194        } else if (RECOVERY_JOURNAL_STYLE.equals(style)) {
1195            // Skip the begin-of-line directive
1196
extractor = "\\S+\\s+((\\S+)(?:\\s+\\S+\\s+\\S+)?)\\s*";
1197            output = "$1";
1198        } else {
1199            extractor =
1200                RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT;
1201            output = RegexpLineIterator.ENTRY;
1202        }
1203        
1204        // Read the input stream.
1205
BufferedReader JavaDoc br = null;
1206        int addedCount = 0;
1207        try {
1208            br = new BufferedReader JavaDoc(new InputStreamReader JavaDoc(is));
1209            Iterator JavaDoc iter = new RegexpLineIterator(new LineReadingIterator(br),
1210                RegexpLineIterator.COMMENT_LINE, extractor, output);
1211            while(iter.hasNext()) {
1212                try {
1213                    importUri((String JavaDoc)iter.next(), forceRevisit, areSeeds,
1214                        false);
1215                    addedCount++;
1216                } catch (URIException e) {
1217                    e.printStackTrace();
1218                }
1219            }
1220            br.close();
1221            flush();
1222        } catch (IOException JavaDoc e) {
1223            e.printStackTrace();
1224        }
1225        return addedCount;
1226    }
1227    
1228    /**
1229     * Schedule a uri.
1230     * @param uri Uri to schedule.
1231     * @param forceFetch Should it be forcefetched.
1232     * @param isSeed True if seed.
1233     * @throws URIException
1234     */

1235    public void importUri(final String JavaDoc uri, final boolean forceFetch,
1236            final boolean isSeed)
1237    throws URIException {
1238        importUri(uri, forceFetch, isSeed, true);
1239    }
1240    
1241    /**
1242     * Schedule a uri.
1243     * @param str String that can be: 1. a UURI, 2. a snippet of the
1244     * crawl.log line, or 3. a snippet from recover log. See
1245     * {@link #importUris(InputStream, String, boolean)} for how it subparses
1246     * the lines from crawl.log and recover.log.
1247     * @param forceFetch Should it be forcefetched.
1248     * @param isSeed True if seed.
1249     * @param isFlush If true, flush the frontier IF it implements
1250     * flushing.
1251     * @throws URIException
1252     */

1253    public void importUri(final String JavaDoc str, final boolean forceFetch,
1254            final boolean isSeed, final boolean isFlush)
1255    throws URIException {
1256        CandidateURI caUri = CandidateURI.fromString(str);
1257        caUri.setForceFetch(forceFetch);
1258        if (isSeed) {
1259            caUri.setIsSeed(isSeed);
1260            if (caUri.getVia() == null || caUri.getVia().length() <= 0) {
1261                // Danger of double-add of seeds because of this code here.
1262
// Only call addSeed if no via. If a via, the schedule will
1263
// take care of updating scope.
1264
this.controller.getScope().addSeed(caUri);
1265            }
1266        }
1267        this.controller.getFrontier().schedule(caUri);
1268        if (isFlush) {
1269            flush();
1270        }
1271    }
1272    
1273    
1274    /**
1275     * @return Our mbean info (Needed for CrawlJob to qualify as a
1276     * DynamicMBean).
1277     */

1278    public MBeanInfo JavaDoc getMBeanInfo() {
1279        return this.openMBeanInfo;
1280    }
1281    
1282    /**
1283     * Build up the MBean info for Heritrix main.
1284     * @return Return created mbean info instance.
1285     * @throws InitializationException
1286     */

1287    protected OpenMBeanInfoSupport JavaDoc buildMBeanInfo()
1288    throws InitializationException {
1289        // Start adding my attributes.
1290
List JavaDoc<OpenMBeanAttributeInfo JavaDoc> attributes
1291         = new ArrayList JavaDoc<OpenMBeanAttributeInfo JavaDoc>();
1292
1293        // Attributes.
1294
attributes.add(new OpenMBeanAttributeInfoSupport JavaDoc(NAME_ATTR,
1295            "Crawl job name", SimpleType.STRING, true, false, false));
1296        attributes.add(new OpenMBeanAttributeInfoSupport JavaDoc(STATUS_ATTR,
1297            "Short basic status message", SimpleType.STRING, true, false,
1298            false));
1299        attributes.add(
1300                new OpenMBeanAttributeInfoSupport JavaDoc(FRONTIER_SHORT_REPORT_ATTR,
1301                "Short frontier report", SimpleType.STRING, true,
1302                false, false));
1303        attributes.add(
1304                new OpenMBeanAttributeInfoSupport JavaDoc(THREADS_SHORT_REPORT_ATTR,
1305                "Short threads report", SimpleType.STRING, true,
1306                false, false));
1307        attributes.add(new OpenMBeanAttributeInfoSupport JavaDoc(UID_ATTR,
1308            "Crawl job UID", SimpleType.STRING, true, false, false));
1309        attributes.add(new OpenMBeanAttributeInfoSupport JavaDoc(TOTAL_DATA_ATTR,
1310            "Total data received", SimpleType.LONG, true, false, false));
1311        attributes.add(new OpenMBeanAttributeInfoSupport JavaDoc(CRAWL_TIME_ATTR,
1312            "Crawl time", SimpleType.LONG, true, false, false));
1313        attributes.add(new OpenMBeanAttributeInfoSupport JavaDoc(CURRENT_DOC_RATE_ATTR,
1314            "Current crawling rate (Docs/sec)", SimpleType.DOUBLE,
1315            true, false, false));
1316        attributes.add(new OpenMBeanAttributeInfoSupport JavaDoc(CURRENT_KB_RATE_ATTR,
1317            "Current crawling rate (Kb/sec)", SimpleType.LONG,
1318            true, false, false));
1319        attributes.add(new OpenMBeanAttributeInfoSupport JavaDoc(THREAD_COUNT_ATTR,
1320            "Active thread count", SimpleType.INTEGER, true, false, false));
1321        attributes.add(new OpenMBeanAttributeInfoSupport JavaDoc(DOC_RATE_ATTR,
1322            "Crawling rate (Docs/sec)", SimpleType.DOUBLE,
1323            true, false, false));
1324        attributes.add(new OpenMBeanAttributeInfoSupport JavaDoc(KB_RATE_ATTR,
1325            "Current crawling rate (Kb/sec)", SimpleType.LONG,
1326            true, false, false));
1327        attributes.add(new OpenMBeanAttributeInfoSupport JavaDoc(DOWNLOAD_COUNT_ATTR,
1328            "Count of downloaded documents", SimpleType.LONG,
1329            true, false, false));
1330        attributes.add(new OpenMBeanAttributeInfoSupport JavaDoc(DISCOVERED_COUNT_ATTR,
1331            "Count of discovered documents", SimpleType.LONG,
1332            true, false, false));
1333        
1334        // Add in the crawl order attributes.
1335
addCrawlOrderAttributes(this.getController().getOrder(), attributes);
1336        
1337        // Add the bdbje attributes. Convert to open mbean attributes.
1338
// First do bdbeje setup. Then add a subset of the bdbje attributes.
1339
// Keep around the list of names as a convenience for when it comes
1340
// time to test if attribute is supported.
1341
Environment env = this.controller.getBdbEnvironment();
1342        try {
1343            this.bdbjeMBeanHelper =
1344                new JEMBeanHelper(env.getConfig(), env.getHome(), true);
1345        } catch (DatabaseException e) {
1346            e.printStackTrace();
1347            InitializationException ie =
1348                new InitializationException(e.getMessage());
1349            ie.setStackTrace(e.getStackTrace());
1350            throw ie;
1351        }
1352        this.bdbjeAttributeNameList = Arrays.asList(new String JavaDoc [] {
1353                JEMBeanHelper.ATT_ENV_HOME,
1354                JEMBeanHelper.ATT_OPEN,
1355                JEMBeanHelper.ATT_IS_READ_ONLY,
1356                JEMBeanHelper.ATT_IS_TRANSACTIONAL,
1357                JEMBeanHelper.ATT_CACHE_SIZE,
1358                JEMBeanHelper.ATT_CACHE_PERCENT,
1359                JEMBeanHelper.ATT_LOCK_TIMEOUT,
1360                JEMBeanHelper.ATT_IS_SERIALIZABLE,
1361                JEMBeanHelper.ATT_SET_READ_ONLY,
1362        });
1363        addBdbjeAttributes(attributes,
1364                this.bdbjeMBeanHelper.getAttributeList(env),
1365                this.bdbjeAttributeNameList);
1366
1367        // Operations.
1368
List JavaDoc<OpenMBeanOperationInfo JavaDoc> operations
1369         = new ArrayList JavaDoc<OpenMBeanOperationInfo JavaDoc>();
1370        OpenMBeanParameterInfo JavaDoc[] args = new OpenMBeanParameterInfoSupport JavaDoc[3];
1371        args[0] = new OpenMBeanParameterInfoSupport JavaDoc("url",
1372            "URL to add to the frontier", SimpleType.STRING);
1373        args[1] = new OpenMBeanParameterInfoSupport JavaDoc("forceFetch",
1374            "True if URL is to be force fetched", SimpleType.BOOLEAN);
1375        args[2] = new OpenMBeanParameterInfoSupport JavaDoc("seed",
1376            "True if URL is a seed", SimpleType.BOOLEAN);
1377        operations.add(new OpenMBeanOperationInfoSupport JavaDoc(IMPORT_URI_OPER,
1378            "Add passed URL to the frontier", args, SimpleType.VOID,
1379                MBeanOperationInfo.ACTION));
1380        
1381        args = new OpenMBeanParameterInfoSupport JavaDoc[4];
1382        args[0] = new OpenMBeanParameterInfoSupport JavaDoc("pathOrUrl",
1383            "Path or URL to file of URLs", SimpleType.STRING);
1384        args[1] = new OpenMBeanParameterInfoSupport JavaDoc("style",
1385            "Format format:default|crawlLog|recoveryJournal",
1386            SimpleType.STRING);
1387        args[2] = new OpenMBeanParameterInfoSupport JavaDoc("forceFetch",
1388            "True if URLs are to be force fetched", SimpleType.BOOLEAN);
1389        args[3] = new OpenMBeanParameterInfoSupport JavaDoc("seed",
1390            "True if all content are seeds.", SimpleType.BOOLEAN);
1391        operations.add(new OpenMBeanOperationInfoSupport JavaDoc(IMPORT_URIS_OPER,
1392            "Add file of passed URLs to the frontier", args, SimpleType.STRING,
1393                MBeanOperationInfo.ACTION));
1394        
1395        operations.add(new OpenMBeanOperationInfoSupport JavaDoc(PAUSE_OPER,
1396            "Pause crawling (noop if already paused)", null, SimpleType.VOID,
1397            MBeanOperationInfo.ACTION));
1398        
1399        operations.add(new OpenMBeanOperationInfoSupport JavaDoc(RESUME_OPER,
1400            "Resume crawling (noop if already resumed)", null,
1401            SimpleType.VOID, MBeanOperationInfo.ACTION));
1402        
1403        args = new OpenMBeanParameterInfoSupport JavaDoc[1];
1404        args[0] = new OpenMBeanParameterInfoSupport JavaDoc("name",
1405            "Name of report ('all', 'standard', etc.).", SimpleType.STRING);
1406        operations.add(new OpenMBeanOperationInfoSupport JavaDoc(FRONTIER_REPORT_OPER,
1407             "Full frontier report", args, SimpleType.STRING,
1408             MBeanOperationInfo.INFO));
1409        
1410        operations.add(new OpenMBeanOperationInfoSupport JavaDoc(THREADS_REPORT_OPER,
1411             "Full thread report", null, SimpleType.STRING,
1412             MBeanOperationInfo.INFO));
1413        
1414        operations.add(new OpenMBeanOperationInfoSupport JavaDoc(SEEDS_REPORT_OPER,
1415             "Seeds report", null, SimpleType.STRING, MBeanOperationInfo.INFO));
1416 
1417        operations.add(
1418                new OpenMBeanOperationInfoSupport JavaDoc(PROGRESS_STATISTICS_OPER,
1419                "Progress statistics at time of invocation", null,
1420                SimpleType.STRING, MBeanOperationInfo.INFO));
1421        
1422        operations.add(new OpenMBeanOperationInfoSupport JavaDoc(
1423            PROGRESS_STATISTICS_LEGEND_OPER,
1424                "Progress statistics legend", null,
1425                SimpleType.STRING, MBeanOperationInfo.INFO));
1426        
1427        operations.add(new OpenMBeanOperationInfoSupport JavaDoc(CHECKPOINT_OPER,
1428                "Start a checkpoint", null, SimpleType.VOID,
1429                MBeanOperationInfo.ACTION));
1430                
1431        // Add bdbje operations. Add subset only. Keep around the list so have
1432
// it to hand when figuring what operations are supported. Usual actual
1433
// Strings because not accessible from JEMBeanHelper.
1434
this.bdbjeOperationsNameList = Arrays.asList(new String JavaDoc[] { "cleanLog",
1435                "evictMemory", "checkpoint", "sync",
1436                "getEnvironmentStatsToString", "getLockStatsToString",
1437                "getDatabaseNames", OP_DB_STAT
1438        });
1439        addBdbjeOperations(operations,
1440                this.bdbjeMBeanHelper.getOperationList(env),
1441                this.bdbjeOperationsNameList);
1442        
1443        // Register notifications
1444
List JavaDoc<MBeanNotificationInfo JavaDoc> notifications
1445         = new ArrayList JavaDoc<MBeanNotificationInfo JavaDoc>();
1446        notifications.add(
1447            new MBeanNotificationInfo JavaDoc(new String JavaDoc [] {"crawlStarted",
1448                    "crawlEnding", "crawlPaused", "crawlResuming", PROG_STATS},
1449                this.getClass().getName() + ".notifications",
1450                "CrawlStatusListener events and progress statistics as " +
1451                    "notifications"));
1452        MBeanNotificationInfo JavaDoc [] notificationsArray =
1453            new MBeanNotificationInfo JavaDoc[notifications.size()];
1454        notifications.toArray(notificationsArray);
1455        
1456        // Build the info object.
1457
OpenMBeanAttributeInfoSupport JavaDoc[] attributesArray =
1458            new OpenMBeanAttributeInfoSupport JavaDoc[attributes.size()];
1459        attributes.toArray(attributesArray);
1460        OpenMBeanOperationInfoSupport JavaDoc[] operationsArray =
1461            new OpenMBeanOperationInfoSupport JavaDoc[operations.size()];
1462        operations.toArray(operationsArray);
1463        return new OpenMBeanInfoSupport JavaDoc(this.getClass().getName(),
1464            "Current Crawl Job as OpenMBean",
1465            attributesArray,
1466            new OpenMBeanConstructorInfoSupport JavaDoc [] {},
1467            operationsArray,
1468            notificationsArray);
1469    }
1470    
1471    protected void addBdbjeAttributes(
1472            final List JavaDoc<OpenMBeanAttributeInfo JavaDoc> attributes,
1473            final List JavaDoc<MBeanAttributeInfo JavaDoc> bdbjeAttributes,
1474            final List JavaDoc<String JavaDoc> bdbjeNamesToAdd) {
1475        for (MBeanAttributeInfo JavaDoc info: bdbjeAttributes) {
1476            if (bdbjeNamesToAdd.contains(info.getName())) {
1477                attributes.add(JmxUtils.convertToOpenMBeanAttribute(info));
1478            }
1479        }
1480    }
1481    
1482    protected void addBdbjeOperations(
1483            final List JavaDoc<OpenMBeanOperationInfo JavaDoc> operations,
1484            final List JavaDoc<MBeanOperationInfo JavaDoc> bdbjeOperations,
1485            final List JavaDoc<String JavaDoc> bdbjeNamesToAdd) {
1486        for (MBeanOperationInfo JavaDoc info: bdbjeOperations) {
1487            if (bdbjeNamesToAdd.contains(info.getName())) {
1488                OpenMBeanOperationInfo JavaDoc omboi = null;
1489                if (info.getName().equals(OP_DB_STAT)) {
1490                    // Db stats needs special handling. The published
1491
// signature is wrong and its return type is awkward.
1492
// Handle it.
1493
omboi = JmxUtils.convertToOpenMBeanOperation(info, null,
1494                        SimpleType.STRING);
1495                    MBeanParameterInfo JavaDoc[] params = omboi.getSignature();
1496                    OpenMBeanParameterInfo JavaDoc[] args =
1497                        new OpenMBeanParameterInfoSupport JavaDoc[params.length + 1];
1498                    for (int ii = 0; ii < params.length; ii++) {
1499                        args[ii] = (OpenMBeanParameterInfo JavaDoc) params[ii];
1500                    }
1501                    args[params.length] = new OpenMBeanParameterInfoSupport JavaDoc(
1502                            "name", "Database name", SimpleType.STRING);
1503                    omboi = new OpenMBeanOperationInfoSupport JavaDoc(omboi.getName(),
1504                        omboi.getDescription(), args, omboi.getReturnOpenType(),
1505                        omboi.getImpact());
1506                } else {
1507                    omboi = JmxUtils.convertToOpenMBeanOperation(info);
1508                }
1509                operations.add(omboi);
1510            }
1511        }
1512    }
1513    
1514    protected void addCrawlOrderAttributes(final ComplexType type,
1515            final List JavaDoc<OpenMBeanAttributeInfo JavaDoc> attributes) {
1516        for (final Iterator JavaDoc i = type.getAttributeInfoIterator(null);
1517                i.hasNext();) {
1518            ModuleAttributeInfo info = (ModuleAttributeInfo)i.next();
1519            if (ORDER_EXCLUDE.contains(info.getName())) {
1520                // Skip.
1521
continue;
1522            }
1523            String JavaDoc absoluteName = type.getAbsoluteName() + "/" + info.getName();
1524            if (JmxUtils.isOpenType(info.getType())) {
1525                String JavaDoc description = info.getDescription();
1526                if (description == null || description.length() <= 0) {
1527                    // Description can't be empty.
1528
description = info.getName();
1529                }
1530                attributes.add(new OpenMBeanAttributeInfoSupport JavaDoc(
1531                    absoluteName, description,
1532                    JmxUtils.getOpenType(info.getType()), true, true, false));
1533            } else if(info.isComplexType()) {
1534                try {
1535                    ComplexType c =
1536                        (ComplexType)type.getAttribute(info.getName());
1537                    addCrawlOrderAttributes(c, attributes);
1538                } catch (AttributeNotFoundException JavaDoc e) {
1539                    logger.log(Level.SEVERE, "Failed get of attribute", e);
1540                } catch (MBeanException JavaDoc e) {
1541                    logger.log(Level.SEVERE, "Failed get of attribute", e);
1542                } catch (ReflectionException JavaDoc e) {
1543                    logger.log(Level.SEVERE, "Failed get of attribute", e);
1544                }
1545            } else if (info.getType().equals(TextField.class.getName())) {
1546                // Special handling for TextField. Use the STRING OpenType.
1547
attributes.add(new OpenMBeanAttributeInfoSupport JavaDoc(
1548                        absoluteName, info.getDescription(),
1549                        SimpleType.STRING, true, true, false));
1550            } else {
1551                // Looks like only type we don't currently handle is StringList.
1552
// Figure how to do it. Add as AttributeList?
1553
logger.fine(info.getType());
1554            }
1555        }
1556    }
1557    
1558    public Object JavaDoc getAttribute(String JavaDoc attribute_name)
1559    throws AttributeNotFoundException JavaDoc {
1560        if (attribute_name == null) {
1561            throw new RuntimeOperationsException JavaDoc(
1562                 new IllegalArgumentException JavaDoc("Attribute name cannot be null"),
1563                 "Cannot call getAttribute with null attribute name");
1564        }
1565        
1566        // If no controller, we can't do any work in here.
1567
if (this.controller == null) {
1568            throw new RuntimeOperationsException JavaDoc(
1569                 new NullPointerException JavaDoc("Controller is null"),
1570                 "Controller is null");
1571        }
1572        
1573        // Is it a bdbje attribute?
1574
if (this.bdbjeAttributeNameList.contains(attribute_name)) {
1575            try {
1576                return this.bdbjeMBeanHelper.getAttribute(
1577                        this.controller.getBdbEnvironment(), attribute_name);
1578            } catch (MBeanException JavaDoc e) {
1579                throw new RuntimeOperationsException JavaDoc(new RuntimeException JavaDoc(e));
1580            }
1581        }
1582        
1583        // Is it a crawl-order attribute?
1584
if (attribute_name.
1585                startsWith(this.controller.getOrder().getAbsoluteName())) {
1586            return getCrawlOrderAttribute(attribute_name);
1587        }
1588        
1589        if (!ATTRIBUTE_LIST.contains(attribute_name)) {
1590            throw new AttributeNotFoundException JavaDoc("Attribute " +
1591                    attribute_name + " is unimplemented.");
1592        }
1593
1594        // The pattern in the below is to match an attribute and when found
1595
// do a return out of if clause. Doing it this way, I can fall
1596
// on to the AttributeNotFoundException for case where we've an
1597
// attribute but no handler.
1598
if (attribute_name.equals(STATUS_ATTR)) {
1599            return getCrawlStatus();
1600        }
1601        if (attribute_name.equals(NAME_ATTR)) {
1602            return getJobName();
1603        }
1604        if (attribute_name.equals(UID_ATTR)) {
1605            return getUID();
1606        }
1607        if (attribute_name.equals(TOTAL_DATA_ATTR)) {
1608            return new Long JavaDoc(this.controller == null &&
1609                    this.controller.getStatistics() != null? 0:
1610                this.controller.getStatistics().totalBytesWritten());
1611        }
1612        if (attribute_name.equals(CRAWL_TIME_ATTR)) {
1613            return new Long JavaDoc(this.controller == null &&
1614                    this.controller.getStatistics() != null? 0:
1615                this.controller.getStatistics().getCrawlerTotalElapsedTime() /
1616                    1000);
1617        }
1618        if (attribute_name.equals(CURRENT_DOC_RATE_ATTR)) {
1619            return new Double JavaDoc(this.controller == null &&
1620                    this.controller.getStatistics() != null? 0:
1621                this.controller.getStatistics().currentProcessedDocsPerSec());
1622        }
1623        if (attribute_name.equals(DOC_RATE_ATTR)) {
1624            return new Double JavaDoc(this.controller == null &&
1625                    this.controller.getStatistics() != null? 0:
1626                this.controller.getStatistics().processedDocsPerSec());
1627        }
1628        if (attribute_name.equals(KB_RATE_ATTR)) {
1629            return new Long JavaDoc(this.controller == null &&
1630                    this.controller.getStatistics() != null? 0:
1631                this.controller.getStatistics().currentProcessedKBPerSec());
1632        }
1633        if (attribute_name.equals(CURRENT_KB_RATE_ATTR)) {
1634            return new Long JavaDoc(this.controller == null &&
1635                    this.controller.getStatistics() != null? 0:
1636                this.controller.getStatistics().processedKBPerSec());
1637        }
1638        if (attribute_name.equals(THREAD_COUNT_ATTR)) {
1639            return new Integer JavaDoc(this.controller == null &&
1640                    this.controller.getStatistics() != null? 0:
1641                this.controller.getStatistics().activeThreadCount());
1642        }
1643        if (attribute_name.equals(FRONTIER_SHORT_REPORT_ATTR)) {
1644            return getFrontierOneLine();
1645        }
1646        if (attribute_name.equals(THREADS_SHORT_REPORT_ATTR)) {
1647            return getThreadOneLine();
1648        }
1649        if (attribute_name.equals(DISCOVERED_COUNT_ATTR)) {
1650            return new Long JavaDoc(this.controller == null &&
1651                    this.controller.getStatistics() != null? 0:
1652                this.controller.getStatistics().totalCount());
1653        }
1654        if (attribute_name.equals(DOWNLOAD_COUNT_ATTR)) {
1655            return new Long JavaDoc(this.controller == null &&
1656                    this.controller.getStatistics() != null? 0:
1657                this.controller.getStatistics().successfullyFetchedCount());
1658        }
1659        
1660        throw new AttributeNotFoundException JavaDoc("Attribute " +
1661            attribute_name + " not found.");
1662    }
1663    
1664    protected Object JavaDoc getCrawlOrderAttribute(final String JavaDoc attribute_name) {
1665        CrawlOrder order = this.getController().getOrder();
1666        Object JavaDoc result = null;
1667        try {
1668            result = getCrawlOrderAttribute(attribute_name.substring(order
1669                    .getAbsoluteName().length()), order);
1670        } catch (NullPointerException JavaDoc e) {
1671            logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
1672        } catch (AttributeNotFoundException JavaDoc e) {
1673            logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
1674        } catch (MBeanException JavaDoc e) {
1675            logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
1676        } catch (ReflectionException JavaDoc e) {
1677            logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
1678        }
1679        return result;
1680    }
1681
1682    protected Object JavaDoc getCrawlOrderAttribute(final String JavaDoc attribute_name,
1683            final ComplexType ct)
1684    throws AttributeNotFoundException JavaDoc, MBeanException JavaDoc, ReflectionException JavaDoc {
1685        String JavaDoc subName = attribute_name.startsWith("/") ? attribute_name
1686                .substring(1) : attribute_name;
1687        int index = subName.indexOf("/");
1688        if (index <= 0) {
1689            MBeanAttributeInfo JavaDoc info = ct.getAttributeInfo(subName);
1690            // Special handling for TextField.
1691
return info.getType().equals(TextField.class.getName()) ? ct
1692                    .getAttribute(subName).toString() : ct
1693                    .getAttribute(subName);
1694        }
1695        return getCrawlOrderAttribute(subName.substring(index + 1),
1696                (ComplexType) ct.getAttribute(subName.substring(0, index)));
1697    }
1698    
1699    public AttributeList JavaDoc getAttributes(String JavaDoc [] attributeNames) {
1700        if (attributeNames == null) {
1701            throw new RuntimeOperationsException JavaDoc(
1702                new IllegalArgumentException JavaDoc("attributeNames[] cannot be " +
1703                "null"), "Cannot call getAttributes with null attribute " +
1704                "names");
1705        }
1706        
1707        // If no controller, we can't do any work in here.
1708
if (this.controller == null) {
1709            throw new RuntimeOperationsException JavaDoc(
1710                 new NullPointerException JavaDoc("Controller is null"),
1711                 "Controller is null");
1712        }
1713        
1714        AttributeList JavaDoc resultList = new AttributeList JavaDoc();
1715        if (attributeNames.length == 0) {
1716            return resultList;
1717        }
1718        for (int i = 0; i < attributeNames.length; i++) {
1719            try {
1720                Object JavaDoc value = getAttribute(attributeNames[i]);
1721                resultList.add(new Attribute JavaDoc(attributeNames[i], value));
1722            } catch (Exception JavaDoc e) {
1723                e.printStackTrace();
1724            }
1725        }
1726        return(resultList);
1727    }
1728
1729    public void setAttribute(Attribute JavaDoc attribute)
1730            throws AttributeNotFoundException JavaDoc {
1731        // Is it a crawl order attribute?
1732
CrawlOrder order = this.getController().getOrder();
1733        String JavaDoc attName = attribute.getName();
1734        if (attName.startsWith(order.getAbsoluteName())) {
1735            try {
1736                setCrawlOrderAttribute(attribute.getName().substring(
1737                        order.getAbsoluteName().length()), order, attribute);
1738            } catch (NullPointerException JavaDoc e) {
1739                logger.log(Level.SEVERE, "Failed set of " + attName, e);
1740            } catch (AttributeNotFoundException JavaDoc e) {
1741                logger.log(Level.SEVERE, "Failed set of " + attName, e);
1742            } catch (MBeanException JavaDoc e) {
1743                logger.log(Level.SEVERE, "Failed set of " + attName, e);
1744            } catch (ReflectionException JavaDoc e) {
1745                logger.log(Level.SEVERE, "Failed set of " + attName, e);
1746            } catch (InvalidAttributeValueException JavaDoc e) {
1747                logger.log(Level.SEVERE, "Failed set of " + attName, e);
1748            }
1749            return;
1750        }
1751        
1752        // Is it a bdbje attribute?
1753
if (this.bdbjeAttributeNameList.contains(attName)) {
1754            try {
1755                this.bdbjeMBeanHelper.setAttribute(this.controller
1756                        .getBdbEnvironment(), attribute);
1757            } catch (AttributeNotFoundException JavaDoc e) {
1758                throw new RuntimeOperationsException JavaDoc(new RuntimeException JavaDoc(e));
1759            } catch (InvalidAttributeValueException JavaDoc e) {
1760                throw new RuntimeOperationsException JavaDoc(new RuntimeException JavaDoc(e));
1761            }
1762            return;
1763        }
1764        
1765        // Else, we don't know how to handle this attribute.
1766
throw new AttributeNotFoundException JavaDoc("Attribute " + attName +
1767            " can not be set.");
1768    }
1769    
1770    protected void setCrawlOrderAttribute(final String JavaDoc attribute_name,
1771            final ComplexType ct, final Attribute JavaDoc attribute)
1772    throws AttributeNotFoundException JavaDoc, InvalidAttributeValueException JavaDoc,
1773            MBeanException JavaDoc, ReflectionException JavaDoc {
1774        String JavaDoc subName = attribute_name.startsWith("/") ? attribute_name
1775                .substring(1) : attribute_name;
1776        int index = subName.indexOf("/");
1777        if (index <= 0) {
1778            ct.setAttribute(new Attribute JavaDoc(subName, attribute.getValue()));
1779            return;
1780        }
1781        setCrawlOrderAttribute(subName.substring(index + 1), (ComplexType) ct
1782                .getAttribute(subName.substring(0, index)), attribute);
1783    }
1784
1785    public AttributeList JavaDoc setAttributes(AttributeList JavaDoc attributes) {
1786        if (attributes == null) {
1787            throw new RuntimeOperationsException JavaDoc(
1788                new IllegalArgumentException JavaDoc("attributeNames[] cannot be " +
1789                "null"), "Cannot call getAttributes with null attribute " +
1790                "names");
1791        }
1792        
1793        AttributeList JavaDoc resultList = new AttributeList JavaDoc();
1794        if (attributes.size() == 0) {
1795            return resultList;
1796        }
1797        for (int i = 0; i < attributes.size(); i++) {
1798            try {
1799                Attribute JavaDoc attr = (Attribute JavaDoc)attributes.get(i);
1800                setAttribute(attr);
1801                String JavaDoc an = attr.getName();
1802                Object JavaDoc newValue = getAttribute(an);
1803                resultList.add(new Attribute JavaDoc(an, newValue));
1804            } catch (Exception JavaDoc e) {
1805                e.printStackTrace();
1806            }
1807        }
1808        return resultList;
1809    }
1810
1811    public Object JavaDoc invoke(String JavaDoc operationName, Object JavaDoc[] params,
1812        String JavaDoc[] signature)
1813    throws ReflectionException JavaDoc {
1814        if (operationName == null) {
1815            throw new RuntimeOperationsException JavaDoc(
1816                new IllegalArgumentException JavaDoc("Operation name cannot be null"),
1817                "Cannot call invoke with null operation name");
1818        }
1819        
1820        if (this.bdbjeOperationsNameList.contains(operationName)) {
1821            try {
1822                Object JavaDoc o = this.bdbjeMBeanHelper.invoke(
1823                        this.controller.getBdbEnvironment(),
1824                        operationName, params, signature);
1825                // If OP_DB_ST, return String version of result.
1826
if (operationName.equals(OP_DB_STAT)) {
1827                    return o.toString();
1828                }
1829                return o;
1830            } catch (MBeanException JavaDoc e) {
1831                throw new RuntimeOperationsException JavaDoc(new RuntimeException JavaDoc(e));
1832            }
1833        }
1834        
1835        // TODO: Exploit passed signature.
1836

1837        // The pattern in the below is to match an operation and when found
1838
// do a return out of if clause. Doing it this way, I can fall
1839
// on to the MethodNotFoundException for case where we've an
1840
// attribute but no handler.
1841
if (operationName.equals(IMPORT_URI_OPER)) {
1842            JmxUtils.checkParamsCount(IMPORT_URI_OPER, params, 3);
1843            mustBeCrawling();
1844            try {
1845                importUri((String JavaDoc)params[0],
1846                    ((Boolean JavaDoc)params[1]).booleanValue(),
1847                    ((Boolean JavaDoc)params[2]).booleanValue());
1848            } catch (URIException e) {
1849                throw new RuntimeOperationsException JavaDoc(new RuntimeException JavaDoc(e));
1850            }
1851            return null;
1852        }
1853        
1854        if (operationName.equals(IMPORT_URIS_OPER)) {
1855            JmxUtils.checkParamsCount(IMPORT_URIS_OPER, params, 4);
1856            mustBeCrawling();
1857            return importUris((String JavaDoc)params[0],
1858                ((String JavaDoc)params[1]).toString(),
1859                ((Boolean JavaDoc)params[2]).booleanValue(),
1860                ((Boolean JavaDoc)params[3]).booleanValue());
1861        }
1862        
1863        if (operationName.equals(PAUSE_OPER)) {
1864            JmxUtils.checkParamsCount(PAUSE_OPER, params, 0);
1865            mustBeCrawling();
1866            pause();
1867            return null;
1868        }
1869        
1870        if (operationName.equals(RESUME_OPER)) {
1871            JmxUtils.checkParamsCount(RESUME_OPER, params, 0);
1872            mustBeCrawling();
1873            resume();
1874            return null;
1875        }
1876        
1877        if (operationName.equals(FRONTIER_REPORT_OPER)) {
1878            JmxUtils.checkParamsCount(FRONTIER_REPORT_OPER, params, 1);
1879            mustBeCrawling();
1880            return getFrontierReport((String JavaDoc)params[0]);
1881        }
1882        
1883        if (operationName.equals(THREADS_REPORT_OPER)) {
1884            JmxUtils.checkParamsCount(THREADS_REPORT_OPER, params, 0);
1885            mustBeCrawling();
1886            return getThreadsReport();
1887        }
1888        
1889        if (operationName.equals(SEEDS_REPORT_OPER)) {
1890            JmxUtils.checkParamsCount(SEEDS_REPORT_OPER, params, 0);
1891            mustBeCrawling();
1892            StringWriter JavaDoc sw = new StringWriter JavaDoc();
1893            if (getStatisticsTracking() != null &&
1894                    getStatisticsTracking() instanceof StatisticsTracker) {
1895                ((StatisticsTracker)getStatisticsTracking()).
1896                    writeSeedsReportTo(new PrintWriter JavaDoc(sw));
1897            } else {
1898                sw.write("Unsupported");
1899            }
1900            return sw.toString();
1901        }
1902        
1903        if (operationName.equals(CHECKPOINT_OPER)) {
1904            JmxUtils.checkParamsCount(CHECKPOINT_OPER, params, 0);
1905            mustBeCrawling();
1906            try {
1907                checkpoint();
1908            } catch (IllegalStateException JavaDoc e) {
1909                throw new RuntimeOperationsException JavaDoc(e);
1910            }
1911            return null;
1912        }
1913        
1914        if (operationName.equals(PROGRESS_STATISTICS_OPER)) {
1915            JmxUtils.checkParamsCount(PROGRESS_STATISTICS_OPER, params, 0);
1916            mustBeCrawling();
1917            return getStatisticsTracking().getProgressStatisticsLine();
1918        }
1919        
1920        if (operationName.equals(PROGRESS_STATISTICS_LEGEND_OPER)) {
1921            JmxUtils.checkParamsCount(PROGRESS_STATISTICS_LEGEND_OPER,
1922                    params, 0);
1923            return getStatisticsTracking().progressStatisticsLegend();
1924        }
1925        
1926        throw new ReflectionException JavaDoc(
1927            new NoSuchMethodException JavaDoc(operationName),
1928                "Cannot find the operation " + operationName);
1929    }
1930    
1931    public void mustBeCrawling() {
1932        if (!isCrawling()) {
1933            throw new RuntimeOperationsException JavaDoc(
1934                new IllegalArgumentException JavaDoc("Not " +
1935                "crawling (Shouldn't ever be the case)"),
1936                "Not current crawling job?");
1937        }
1938    }
1939    
1940    public boolean isCrawling() {
1941        return this.controller != null;
1942    }
1943    
1944    /**
1945     * Utility method to get the stored list of ignored seed items (if any),
1946     * from the last time the seeds were imported to the frontier.
1947     *
1948     * @return String of all ignored seed items, or null if none
1949     */

1950    public String JavaDoc getIgnoredSeeds() {
1951        File JavaDoc ignoredFile = new File JavaDoc(getDirectory(),
1952                AbstractFrontier.IGNORED_SEEDS_FILENAME);
1953        if(!ignoredFile.exists()) {
1954            return null;
1955        }
1956        try {
1957            return FileUtils.readFileAsString(ignoredFile);
1958        } catch (IOException JavaDoc e) {
1959            // TODO Auto-generated catch block
1960
e.printStackTrace();
1961            return null;
1962        }
1963    }
1964    
1965    /**
1966     * Forward a 'kick' update to current controller if any.
1967     * @see CrawlController#kickUpdate()
1968     */

1969    public void kickUpdate(){
1970        if (this.controller != null){
1971            this.controller.kickUpdate();
1972        }
1973    }
1974    
1975    /**
1976     * Returns a URIFrontierMarker for the current, paused, job. If there is no
1977     * current job or it is not paused null will be returned.
1978     *
1979     * @param regexpr A regular expression that each URI must match in order to
1980     * be considered 'within' the marker.
1981     * @param inCacheOnly Limit marker scope to 'cached' URIs.
1982     * @return a URIFrontierMarker for the current job.
1983     * @see #getPendingURIsList(FrontierMarker, int, boolean)
1984     * @see org.archive.crawler.framework.Frontier#getInitialMarker(String,
1985     * boolean)
1986     * @see org.archive.crawler.framework.FrontierMarker
1987     */

1988    public FrontierMarker getInitialMarker(String JavaDoc regexpr,
1989            boolean inCacheOnly) {
1990        return (this.controller != null && this.controller.isPaused())?
1991           this.controller.getFrontier().getInitialMarker(regexpr, inCacheOnly):
1992               null;
1993    }
1994    
1995    /**
1996     * Returns the frontiers URI list based on the provided marker. This method
1997     * will return null if there is not current job or if the current job is
1998     * not paused. Only when there is a paused current job will this method
1999     * return a URI list.
2000     *
2001     * @param marker URIFrontier marker
2002     * @param numberOfMatches Maximum number of matches to return
2003     * @param verbose Should detailed info be provided on each URI?
2004     * @return the frontiers URI list based on the provided marker
2005     * @throws InvalidFrontierMarkerException
2006     * When marker is inconsistent with the current state of the
2007     * frontier.
2008     * @see #getInitialMarker(String, boolean)
2009     * @see org.archive.crawler.framework.FrontierMarker
2010     */

2011    public ArrayList JavaDoc getPendingURIsList(FrontierMarker marker,
2012            int numberOfMatches, boolean verbose)
2013    throws InvalidFrontierMarkerException {
2014        return (this.controller != null && this.controller.isPaused())?
2015            this.controller.getFrontier().getURIsList(marker, numberOfMatches,
2016                    verbose):
2017            null;
2018    }
2019
2020    public void crawlStarted(String JavaDoc message) {
2021        if (this.mbeanName != null) {
2022            // Can be null around job startup.
2023
sendNotification(new Notification JavaDoc("crawlStarted",
2024                this.mbeanName, getNotificationsSequenceNumber(), message));
2025        }
2026    }
2027
2028    public void crawlEnding(String JavaDoc sExitMessage) {
2029        setRunning(false);
2030        setStatus(sExitMessage);
2031        setReadOnly();
2032        if (this.mbeanName != null) {
2033            sendNotification(new Notification JavaDoc("crawlEnding", this.mbeanName,
2034                getNotificationsSequenceNumber(), sExitMessage));
2035        }
2036    }
2037
2038    public void crawlEnded(String JavaDoc sExitMessage) {
2039        // Let the settings handler be cleaned up by the crawl controller
2040
// completeStop. Just let go of our reference in here.
2041
// if (this.settingsHandler != null) {
2042
// this.settingsHandler.cleanup();
2043
// }
2044

2045        // We used to zero-out datamembers but no longer needed now CrawlJobs
2046
// no longer persist after completion (They used to be kept around in
2047
// a list so operator could view CrawlJob finish state and reports --
2048
// but we now dump actual job and create a new uninitialized CrawlJob
2049
// that points at old CrawlJob data.
2050
}
2051
2052    public void crawlPausing(String JavaDoc statusMessage) {
2053        setStatus(statusMessage);
2054    }
2055
2056    public void crawlPaused(String JavaDoc statusMessage) {
2057        setStatus(statusMessage);
2058        if (this.mbeanName != null) {
2059            // Can be null around job startup.
2060
sendNotification(new Notification JavaDoc("crawlPaused", this.mbeanName,
2061                getNotificationsSequenceNumber(), statusMessage));
2062        }
2063    }
2064
2065    public void crawlResuming(String JavaDoc statusMessage) {
2066        setStatus(statusMessage);
2067        if (this.mbeanName != null) {
2068            // Can be null around job startup.
2069
sendNotification(new Notification JavaDoc("crawlResuming", this.mbeanName,
2070                getNotificationsSequenceNumber(), statusMessage));
2071        }
2072    }
2073
2074    public void crawlCheckpoint(File JavaDoc checkpointDir) throws Exception JavaDoc {
2075        setStatus(CrawlJob.STATUS_CHECKPOINTING);
2076    }
2077
2078    public CrawlController getController() {
2079        return this.controller;
2080    }
2081    
2082    public ObjectName JavaDoc preRegister(final MBeanServer JavaDoc server, ObjectName JavaDoc on)
2083    throws Exception JavaDoc {
2084        this.mbeanServer = server;
2085        @SuppressWarnings JavaDoc("unchecked")
2086        Hashtable JavaDoc<String JavaDoc,String JavaDoc> ht = on.getKeyPropertyList();
2087        if (!ht.containsKey(JmxUtils.NAME)) {
2088            throw new IllegalArgumentException JavaDoc("Name property required" +
2089                on.getCanonicalName());
2090        }
2091        // Now append key/values from hosting heritrix JMX ObjectName so it can be
2092
// found just by examination of the CrawlJob JMX ObjectName. Add heritrix
2093
// name attribute as 'mother' attribute.
2094
Heritrix h = getHostingHeritrix();
2095        if (h == null || h.getMBeanName() == null) {
2096            throw new IllegalArgumentException JavaDoc("Hosting heritrix not found " +
2097                "or not registered with JMX: " + on.getCanonicalName());
2098        }
2099        @SuppressWarnings JavaDoc("unchecked")
2100        Map JavaDoc<String JavaDoc,String JavaDoc> hht = h.getMBeanName().getKeyPropertyList();
2101        ht.put(JmxUtils.MOTHER, hht.get(JmxUtils.NAME));
2102        String JavaDoc port = hht.get(JmxUtils.JMX_PORT);
2103        if (port != null) {
2104            ht.put(JmxUtils.JMX_PORT, port);
2105        }
2106        ht.put(JmxUtils.HOST, hht.get(JmxUtils.HOST));
2107        if (!ht.containsKey(JmxUtils.TYPE)) {
2108            ht.put(JmxUtils.TYPE, CRAWLJOB_JMXMBEAN_TYPE);
2109        }
2110        this.mbeanName = new ObjectName JavaDoc(on.getDomain(), ht);
2111        return this.mbeanName;
2112    }
2113
2114    public void postRegister(Boolean JavaDoc registrationDone) {
2115        if (logger.isLoggable(Level.INFO)) {
2116            logger.info(
2117                JmxUtils.getLogRegistrationMsg(this.mbeanName.getCanonicalName(),
2118                this.mbeanServer, registrationDone.booleanValue()));
2119        }
2120    }
2121
2122    public void preDeregister() throws Exception JavaDoc {
2123        // Nothing to do.
2124
}
2125
2126    public void postDeregister() {
2127        if (mbeanName == null) {
2128            return;
2129        }
2130        if (logger.isLoggable(Level.INFO)) {
2131            logger.info(JmxUtils.getLogUnregistrationMsg(
2132                    this.mbeanName.getCanonicalName(), this.mbeanServer));
2133        }
2134        this.mbeanName = null;
2135    }
2136    
2137    /**
2138     * @return Heritrix that is hosting this job.
2139     */

2140    protected Heritrix getHostingHeritrix() {
2141        Heritrix hostingHeritrix = null;
2142        Map JavaDoc heritrice = Heritrix.getInstances();
2143        for (final Iterator JavaDoc i = heritrice.keySet().iterator(); i.hasNext();) {
2144            Heritrix h = (Heritrix)heritrice.get(i.next());
2145            if (h.getJobHandler().getCurrentJob() == this) {
2146                hostingHeritrix = h;
2147                break;
2148            }
2149        }
2150        return hostingHeritrix;
2151    }
2152    
2153    /**
2154     * @return Unique name for job that is safe to use in jmx (Like display
2155     * name but without spaces).
2156     */

2157    public String JavaDoc getJmxJobName() {
2158        return getJobName() + "-" + getUID();
2159    }
2160
2161    /**
2162     * @return Notification sequence number (Does increment after each access).
2163     */

2164    protected static int getNotificationsSequenceNumber() {
2165        return notificationsSequenceNumber++;
2166    }
2167
2168    protected ObjectName JavaDoc getMbeanName() {
2169        return this.mbeanName;
2170    }
2171    
2172    /**
2173     * @return the statistics tracking instance (of null if none yet available).
2174     */

2175    public StatisticsTracking getStatisticsTracking() {
2176        return this.controller == null ||
2177            this.controller.getStatistics() == null? null:
2178                this.controller.getStatistics();
2179    }
2180}
2181
Popular Tags