1 21 package org.archive.crawler.admin; 22 23 import java.io.BufferedReader ; 24 import java.io.File ; 25 import java.io.FileNotFoundException ; 26 import java.io.FileReader ; 27 import java.io.FileWriter ; 28 import java.io.IOException ; 29 import java.io.InputStream ; 30 import java.io.InputStreamReader ; 31 import java.io.PrintWriter ; 32 import java.io.Serializable ; 33 import java.io.StringWriter ; 34 import java.util.ArrayList ; 35 import java.util.Arrays ; 36 import java.util.Collection ; 37 import java.util.EventObject ; 38 import java.util.Hashtable ; 39 import java.util.Iterator ; 40 import java.util.List ; 41 import java.util.Map ; 42 import java.util.logging.Level ; 43 import java.util.logging.Logger ; 44 45 import javax.management.Attribute ; 46 import javax.management.AttributeList ; 47 import javax.management.AttributeNotFoundException ; 48 import javax.management.DynamicMBean ; 49 import javax.management.InstanceAlreadyExistsException ; 50 import javax.management.InvalidAttributeValueException ; 51 import javax.management.MBeanAttributeInfo ; 52 import javax.management.MBeanException ; 53 import javax.management.MBeanInfo ; 54 import javax.management.MBeanNotificationInfo ; 55 import javax.management.MBeanOperationInfo ; 56 import javax.management.MBeanParameterInfo ; 57 import javax.management.MBeanRegistration ; 58 import javax.management.MBeanRegistrationException ; 59 import javax.management.MBeanServer ; 60 import javax.management.NotCompliantMBeanException ; 61 import javax.management.Notification ; 62 import javax.management.NotificationBroadcasterSupport ; 63 import javax.management.ObjectName ; 64 import javax.management.ReflectionException ; 65 import javax.management.RuntimeOperationsException ; 66 import javax.management.openmbean.CompositeData ; 67 import javax.management.openmbean.CompositeDataSupport ; 68 import javax.management.openmbean.CompositeType ; 69 import javax.management.openmbean.OpenDataException ; 70 import javax.management.openmbean.OpenMBeanAttributeInfo ; 71 import javax.management.openmbean.OpenMBeanAttributeInfoSupport ; 72 import javax.management.openmbean.OpenMBeanConstructorInfoSupport ; 73 import javax.management.openmbean.OpenMBeanInfoSupport ; 74 import javax.management.openmbean.OpenMBeanOperationInfo ; 75 import javax.management.openmbean.OpenMBeanOperationInfoSupport ; 76 import javax.management.openmbean.OpenMBeanParameterInfo ; 77 import javax.management.openmbean.OpenMBeanParameterInfoSupport ; 78 import javax.management.openmbean.SimpleType ; 79 80 import org.apache.commons.httpclient.URIException; 81 import org.archive.crawler.Heritrix; 82 import org.archive.crawler.datamodel.CandidateURI; 83 import org.archive.crawler.datamodel.Checkpoint; 84 import org.archive.crawler.datamodel.CrawlOrder; 85 import org.archive.crawler.event.CrawlStatusListener; 86 import org.archive.crawler.framework.CrawlController; 87 import org.archive.crawler.framework.FrontierMarker; 88 import org.archive.crawler.framework.StatisticsTracking; 89 import org.archive.crawler.framework.exceptions.InitializationException; 90 import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException; 91 import org.archive.crawler.frontier.AbstractFrontier; 92 import org.archive.crawler.settings.ComplexType; 93 import org.archive.crawler.settings.ModuleAttributeInfo; 94 import org.archive.crawler.settings.TextField; 95 import org.archive.crawler.settings.XMLSettingsHandler; 96 import org.archive.crawler.util.CheckpointUtils; 97 import org.archive.crawler.util.IoUtils; 98 import org.archive.util.ArchiveUtils; 99 import org.archive.util.FileUtils; 100 import org.archive.util.JEMBeanHelper; 101 import org.archive.util.JmxUtils; 102 import org.archive.util.iterator.LineReadingIterator; 103 import org.archive.util.iterator.RegexpLineIterator; 104 105 import com.sleepycat.je.DatabaseException; 106 import com.sleepycat.je.Environment; 107 108 126 127 public class CrawlJob extends NotificationBroadcasterSupport 128 implements DynamicMBean , MBeanRegistration , CrawlStatusListener, Serializable { 129 132 private static final long serialVersionUID = 3411161000452525856L; 133 134 private static final Logger logger = 135 Logger.getLogger(CrawlJob.class.getName()); 136 139 140 public static final int PRIORITY_MINIMAL = 0; 141 142 public static final int PRIORITY_LOW = 1; 143 144 public static final int PRIORITY_AVERAGE = 2; 145 146 public static final int PRIORITY_HIGH = 3; 147 148 public static final int PRIORITY_CRITICAL = 4; 149 150 153 154 public static final String STATUS_CREATED = "Created"; 155 156 public static final String STATUS_PENDING = "Pending"; 157 158 public static final String STATUS_RUNNING = "Running"; 159 160 public static final String STATUS_DELETED = "Deleted"; 161 162 public static final String STATUS_ABORTED = "Finished - Ended by operator"; 163 164 public static final String STATUS_FINISHED_ABNORMAL = 165 "Finished - Abnormal exit from crawling"; 166 167 public static final String STATUS_FINISHED = "Finished"; 168 169 public static final String STATUS_FINISHED_TIME_LIMIT = 170 "Finished - Timelimit hit"; 171 173 public static final String STATUS_FINISHED_DATA_LIMIT = 174 "Finished - Maximum amount of data limit hit"; 175 178 public static final String STATUS_FINISHED_DOCUMENT_LIMIT = 179 "Finished - Maximum number of documents limit hit"; 180 181 public static final String STATUS_WAITING_FOR_PAUSE = "Pausing - " + 182 "Waiting for threads to finish"; 183 184 public static final String STATUS_PAUSED = "Paused"; 185 190 public static final String STATUS_CHECKPOINTING = "Checkpointing"; 191 192 public static final String STATUS_MISCONFIGURED = "Could not launch job " + 193 "- Fatal InitializationException"; 194 195 public static final String STATUS_PROFILE = "Profile"; 196 197 public static final String STATUS_PREPARING = "Preparing"; 198 199 private String UID; private String name; 202 private String status; 203 private boolean isReadOnly = false; 204 private boolean isNew = true; 205 private boolean isProfile = false; 206 private boolean isRunning = false; 207 private int priority; 208 private int numberOfJournalEntries = 0; 209 210 private String statisticsFileSave = ""; 211 212 private String errorMessage = null; 213 214 private File jobDir = null; 215 216 private transient CrawlJobErrorHandler errorHandler = null; 217 218 protected transient XMLSettingsHandler settingsHandler; 219 220 private transient CrawlController controller = null; 221 222 private static final String RECOVERY_JOURNAL_STYLE = "recoveryJournal"; 223 private static final String CRAWL_LOG_STYLE = "crawlLog"; 224 225 227 230 private transient MBeanServer mbeanServer = null; 231 private transient ObjectName mbeanName = null; 232 private static final String CRAWLJOB_JMXMBEAN_TYPE = 233 JmxUtils.SERVICE + ".Job"; 234 private transient JEMBeanHelper bdbjeMBeanHelper = null; 235 private transient List <String > bdbjeAttributeNameList = null; 236 private transient List <String > bdbjeOperationsNameList = null; 237 238 239 243 private transient OpenMBeanInfoSupport openMBeanInfo; 244 245 private final static String NAME_ATTR = "Name"; 246 private final static String UID_ATTR = "UID"; 247 private final static String STATUS_ATTR = "Status"; 248 private final static String FRONTIER_SHORT_REPORT_ATTR = 249 "FrontierShortReport"; 250 private final static String THREADS_SHORT_REPORT_ATTR = 251 "ThreadsShortReport"; 252 private final static String TOTAL_DATA_ATTR = "TotalData"; 253 private final static String CRAWL_TIME_ATTR = "CrawlTime"; 254 private final static String DOC_RATE_ATTR = "DocRate"; 255 private final static String CURRENT_DOC_RATE_ATTR = "CurrentDocRate"; 256 private final static String KB_RATE_ATTR = "KbRate"; 257 private final static String CURRENT_KB_RATE_ATTR = "CurrentKbRate"; 258 private final static String THREAD_COUNT_ATTR = "ThreadCount"; 259 private final static String DOWNLOAD_COUNT_ATTR = "DownloadedCount"; 260 private final static String DISCOVERED_COUNT_ATTR = "DiscoveredCount"; 261 private final static String [] ATTRIBUTE_ARRAY = {NAME_ATTR, UID_ATTR, 262 STATUS_ATTR, FRONTIER_SHORT_REPORT_ATTR, THREADS_SHORT_REPORT_ATTR, 263 TOTAL_DATA_ATTR, CRAWL_TIME_ATTR, DOC_RATE_ATTR, 264 CURRENT_DOC_RATE_ATTR, KB_RATE_ATTR, CURRENT_KB_RATE_ATTR, 265 THREAD_COUNT_ATTR, DOWNLOAD_COUNT_ATTR, DISCOVERED_COUNT_ATTR}; 266 private final static List ATTRIBUTE_LIST = Arrays.asList(ATTRIBUTE_ARRAY); 267 268 private final static String IMPORT_URI_OPER = "importUri"; 269 private final static String IMPORT_URIS_OPER = "importUris"; 270 private final static String PAUSE_OPER = "pause"; 271 private final static String RESUME_OPER = "resume"; 272 private final static String FRONTIER_REPORT_OPER = "frontierReport"; 273 private final static String THREADS_REPORT_OPER = "threadsReport"; 274 private final static String SEEDS_REPORT_OPER = "seedsReport"; 275 private final static String CHECKPOINT_OPER = "startCheckpoint"; 276 private final static String PROGRESS_STATISTICS_OPER = 277 "progressStatistics"; 278 private final static String PROGRESS_STATISTICS_LEGEND_OPER = 279 "progressStatisticsLegend"; 280 281 private final static String PROG_STATS = "progressStatistics"; 282 283 private final static String OP_DB_STAT = "getDatabaseStats"; 285 286 289 private final static List ORDER_EXCLUDE; 290 static { 291 ORDER_EXCLUDE = Arrays.asList(new String [] {"bdb-cache-percent", 292 "extract-processors", "DNS", "uri-included-structure"}); 293 } 294 295 298 private static int notificationsSequenceNumber = 1; 299 300 303 protected CrawlJob() { 304 super(); 305 } 306 307 320 public CrawlJob(final String UID, 321 final String name, final XMLSettingsHandler settingsHandler, 322 final CrawlJobErrorHandler errorHandler, final int priority, 323 final File dir) { 324 this(UID, name, settingsHandler, errorHandler, 325 priority, dir, null, false, true); 326 } 327 328 341 protected CrawlJob(final String UIDandName, 342 final XMLSettingsHandler settingsHandler, 343 final CrawlJobErrorHandler errorHandler) { 344 this(UIDandName, UIDandName, settingsHandler, errorHandler, 345 PRIORITY_AVERAGE, null, STATUS_PROFILE, true, false); 346 } 347 348 public CrawlJob(final String UID, 349 final String name, final XMLSettingsHandler settingsHandler, 350 final CrawlJobErrorHandler errorHandler, final int priority, 351 final File dir, final String status, final boolean isProfile, 352 final boolean isNew) { 353 super(); 354 this.UID = UID; 355 this.name = name; 356 this.settingsHandler = settingsHandler; 357 this.errorHandler = errorHandler; 358 this.status = status; 359 this.isProfile = isProfile; 360 this.isNew = isNew; 361 this.jobDir = dir; 362 this.priority = priority; 363 } 364 365 391 protected CrawlJob(final File jobFile, 392 final CrawlJobErrorHandler errorHandler) 393 throws InvalidJobFileException, IOException { 394 this(null, null, null, errorHandler, 395 PRIORITY_AVERAGE, null, null, false, true); 396 this.jobDir = jobFile.getParentFile(); 397 398 if (jobFile.length() == 0) { 400 throw new InvalidJobFileException(jobFile.getCanonicalPath() + 401 " is corrupt (length is zero)"); 402 } 403 404 BufferedReader jobReader = 406 new BufferedReader (new FileReader (jobFile), 4096); 407 this.UID = jobReader.readLine(); 409 this.name = jobReader.readLine(); 411 this.status = jobReader.readLine(); 413 if(status.equals(STATUS_ABORTED)==false 414 && status.equals(STATUS_CREATED)==false 415 && status.equals(STATUS_DELETED)==false 416 && status.equals(STATUS_FINISHED)==false 417 && status.equals(STATUS_FINISHED_ABNORMAL)==false 418 && status.equals(STATUS_FINISHED_DATA_LIMIT)==false 419 && status.equals(STATUS_FINISHED_DOCUMENT_LIMIT)==false 420 && status.equals(STATUS_FINISHED_TIME_LIMIT)==false 421 && status.equals(STATUS_MISCONFIGURED)==false 422 && status.equals(STATUS_PAUSED)==false 423 && status.equals(STATUS_CHECKPOINTING)==false 424 && status.equals(STATUS_PENDING)==false 425 && status.equals(STATUS_RUNNING)==false 426 && status.equals(STATUS_WAITING_FOR_PAUSE)==false 427 && status.equals(STATUS_PREPARING)==false){ 428 throw new InvalidJobFileException("Status (line 3) in job file " + 430 "is not valid: '" + status + "'"); 431 } 432 String tmp = jobReader.readLine(); 434 if(tmp.equals("true")){ 435 isReadOnly = true; 436 } else if(tmp.equals("false")){ 437 isReadOnly = false; 438 } else { 439 throw new InvalidJobFileException("isReadOnly (line 4) in job" + 440 " file '" + jobFile.getAbsolutePath() + "' is not " + 441 "valid: '" + tmp + "'"); 442 } 443 tmp = jobReader.readLine(); 445 if(tmp.equals("true")){ 446 this.isRunning = true; 447 } else if(tmp.equals("false")){ 448 this.isRunning = false; 449 } else { 450 throw new InvalidJobFileException("isRunning (line 5) in job " + 451 "file '" + jobFile.getAbsolutePath() + "' is not valid: " + 452 "'" + tmp + "'"); 453 } 454 tmp = jobReader.readLine(); 456 try{ 457 this.priority = Integer.parseInt(tmp); 458 } catch(NumberFormatException e){ 459 throw new InvalidJobFileException("priority (line 5) in job " + 460 "file '" + jobFile.getAbsolutePath() + "' is not valid: " + 461 "'" + tmp + "'"); 462 } 463 tmp = jobReader.readLine(); 465 try{ 466 this.numberOfJournalEntries = Integer.parseInt(tmp); 467 } catch(NumberFormatException e){ 468 throw new InvalidJobFileException("numberOfJournalEntries " + 469 "(line 5) in job file '" + jobFile.getAbsolutePath() + 470 "' is not valid: " + "'" + tmp + "'"); 471 } 472 tmp = jobReader.readLine(); 474 try { 475 File f = new File (tmp); 476 this.settingsHandler = new XMLSettingsHandler((f.isAbsolute())? 477 f: new File (jobDir, f.getName())); 478 if(this.errorHandler != null){ 479 this.settingsHandler.registerValueErrorHandler(errorHandler); 480 } 481 this.settingsHandler.initialize(); 482 } catch (InvalidAttributeValueException e1) { 483 throw new InvalidJobFileException("Problem reading from settings " + 484 "file (" + tmp + ") specified in job file '" + 485 jobFile.getAbsolutePath() + "'\n" + e1.getMessage()); 486 } 487 jobReader.readLine(); 489 tmp = jobReader.readLine(); 492 errorMessage = ""; 493 while(tmp!=null){ 494 errorMessage+=tmp+'\n'; 495 tmp = jobReader.readLine(); 496 } 497 if(errorMessage.length()==0){ 498 errorMessage = null; 500 } 501 503 jobReader.close(); 505 } 506 507 512 private void writeJobFile() { 513 if (isProfile) { 514 return; 515 } 516 517 final String jobDirAbsolute = jobDir.getAbsolutePath(); 518 if (!jobDir.exists() || !jobDir.canWrite()) { 519 logger.warning("Can't update status on " + 520 jobDirAbsolute + " because file does not" + 521 " exist (or is unwriteable)"); 522 return; 523 } 524 File f = new File (jobDirAbsolute, "state.job"); 525 526 String settingsFile = getSettingsDirectory(); 527 if(settingsFile.startsWith(jobDirAbsolute.concat(File.separator))) { 530 settingsFile = settingsFile.substring(jobDirAbsolute.length()+1); 531 } 532 try { 533 FileWriter jobWriter = new FileWriter (f, false); 534 try { 535 jobWriter.write(UID + "\n"); 536 jobWriter.write(name + "\n"); 537 jobWriter.write(status + "\n"); 538 jobWriter.write(isReadOnly + "\n"); 539 jobWriter.write(isRunning + "\n"); 540 jobWriter.write(priority + "\n"); 541 jobWriter.write(numberOfJournalEntries + "\n"); 542 jobWriter.write(settingsFile + "\n"); 543 jobWriter.write(statisticsFileSave + "\n"); if (errorMessage != null) { 547 jobWriter.write(errorMessage + "\n"); 548 } 549 } finally { 550 if (jobWriter != null) { 551 jobWriter.close(); 552 } 553 } 554 } catch (IOException e) { 555 logger.log(Level.WARNING, "An IOException occured saving job " + 556 name + " (" + UID + ")", e); 557 } 558 } 559 560 567 public String getUID(){ 568 return UID; 569 } 570 571 581 public String getJobName(){ 582 return name; 583 } 584 585 591 public String getDisplayName() { 592 return getJobName()+" ["+getUID()+"]"; 593 } 594 595 607 public void setJobPriority(int priority) { 608 this.priority = priority; 609 } 610 611 622 public int getJobPriority() { 623 return priority; 624 } 625 626 631 public void setReadOnly() { 632 isReadOnly = true; 633 writeJobFile(); } 635 636 640 public boolean isReadOnly(){ 641 return isReadOnly; 642 } 643 644 650 public void setStatus(String status) { 651 this.status = status; 652 writeJobFile(); } 655 656 659 public String getCrawlStatus() { 660 return this.controller != null? 661 this.controller.getState().toString(): "Illegal State"; 662 } 663 664 670 public String getStatus() { 671 return this.status; 672 } 673 674 678 public XMLSettingsHandler getSettingsHandler() { 679 return this.settingsHandler; 680 } 681 685 public boolean isNew() { 686 return isNew; 687 } 688 689 693 public boolean isProfile() { 694 return isProfile; 695 } 696 697 701 public void setNew(boolean b) { 702 isNew = b; 703 writeJobFile(); } 705 706 710 public boolean isRunning() { 711 return isRunning; 712 } 713 714 718 protected void setRunning(boolean b) { 719 isRunning = b; 720 writeJobFile(); } 727 728 protected void unregisterMBean() { 729 if (this.mbeanServer == null) { 731 return; 732 } 733 try { 734 this.mbeanServer.unregisterMBean(this.mbeanName); 735 this.mbeanServer = null; 736 } catch (Exception e) { 737 logger.log(Level.SEVERE, "Failed with " + this.mbeanName, e); 738 } 739 } 740 741 753 public class MBeanCrawlController extends CrawlController 754 implements Serializable { 755 private static final long serialVersionUID = -4608537998168407222L; 756 private CrawlJob cj = null; 757 private CompositeType ct = null; 758 759 public CrawlJob getCrawlJob() { 760 return this.cj; 761 } 762 763 public void setCrawlJob(CrawlJob cj) { 764 this.cj = cj; 765 } 766 767 public void progressStatisticsEvent(final EventObject e) { 768 super.progressStatisticsEvent(e); 769 if (this.cj.getMbeanName() == null) { 770 return; 772 } 773 774 Map s = ((StatisticsTracking)e.getSource()).getProgressStatistics(); 775 CompositeData cd = null; 778 try { 779 if (this.ct == null) { 780 this.ct = JmxUtils.createCompositeType(s, PROG_STATS, 781 PROG_STATS + " for " + this.cj.getMbeanName()); 782 } 783 cd = new CompositeDataSupport (this.ct, s); 784 } catch (OpenDataException ode) { 785 ode.printStackTrace(); 786 } 787 if (cd != null) { 788 Notification n = new Notification (PROG_STATS, 789 this.cj.getMbeanName(), getNotificationsSequenceNumber(), 790 ((StatisticsTracking)e.getSource()). 791 getProgressStatisticsLine()); 792 n.setUserData(cd); 793 this.cj.sendNotification(n); 794 } 795 } 796 797 protected void completeStop() { 798 try { 799 super.completeStop(); 800 } finally { 801 if (this.cj != null) { 802 this.cj.unregisterMBean(); 803 } 804 this.cj = null; 805 } 806 } 807 } 808 809 protected CrawlController setupCrawlController() 810 throws InitializationException { 811 CrawlController controller = null; 812 813 Checkpoint cp = CrawlController. 817 getCheckpointRecover(getSettingsHandler().getOrder()); 818 if (cp != null) { 819 try { 820 controller = (MBeanCrawlController)CheckpointUtils. 821 readObjectFromFile(MBeanCrawlController.class, 822 cp.getDirectory()); 823 } catch (FileNotFoundException e) { 824 throw new InitializationException(e); 825 } catch (IOException e) { 826 throw new InitializationException(e); 827 } catch (ClassNotFoundException e) { 828 throw new InitializationException(e); 829 } 830 } else { 831 controller = new MBeanCrawlController(); 832 } 833 return controller; 834 } 835 836 protected CrawlController createCrawlController() { 837 return new MBeanCrawlController(); 838 } 839 840 public void setupForCrawlStart() 841 throws InitializationException { 842 try { 843 this.controller = setupCrawlController(); 844 this.controller.addCrawlStatusListener(this); 846 this.controller.initialize(getSettingsHandler()); 847 ((MBeanCrawlController)this.controller).setCrawlJob(this); 849 this.openMBeanInfo = buildMBeanInfo(); 851 try { 852 Heritrix.registerMBean(this, getJmxJobName(), 853 CRAWLJOB_JMXMBEAN_TYPE); 854 } catch (InstanceAlreadyExistsException e) { 855 throw new InitializationException(e); 856 } catch (MBeanRegistrationException e) { 857 throw new InitializationException(e); 858 } catch (NotCompliantMBeanException e) { 859 throw new InitializationException(e); 860 } 861 } catch (InitializationException e) { 862 setStatus(CrawlJob.STATUS_MISCONFIGURED); 864 setErrorMessage("A fatal InitializationException occured when " 865 + "loading job:\n" + e.getMessage()); 866 e.printStackTrace(); 868 this.controller = null; 869 throw e; 870 } 871 setStatus(CrawlJob.STATUS_RUNNING); 872 setRunning(true); 873 } 874 875 public void stopCrawling() { 876 if(this.controller != null) { 877 this.controller.requestCrawlStop(); 878 } 879 } 880 881 884 public String getFrontierOneLine() { 885 if (this.controller == null || this.controller.getFrontier() == null) { 886 return "Crawler not running"; 887 } 888 return this.controller.getFrontier().singleLineReport(); 889 } 890 891 895 public String getFrontierReport(final String reportName) { 896 if (this.controller == null || this.controller.getFrontier() == null) { 897 return "Crawler not running"; 898 } 899 return ArchiveUtils.writeReportToString(this.controller.getFrontier(), 900 reportName); 901 } 902 903 908 public void writeFrontierReport(String reportName, PrintWriter writer) { 909 if (this.controller == null || this.controller.getFrontier() == null) { 910 writer.println("Crawler not running."); 911 return; 912 } 913 this.controller.getFrontier().reportTo(reportName,writer); 914 } 915 916 919 public String getThreadOneLine() { 920 if (this.controller == null) { 921 return "Crawler not running"; 922 } 923 return this.controller.oneLineReportThreads(); 924 } 925 926 930 public String getThreadsReport() { 931 if (this.controller == null) { 932 return "Crawler not running"; 933 } 934 return ArchiveUtils.writeReportToString(this.controller.getToePool(), 935 null); 936 } 937 938 943 public void writeThreadsReport(String reportName, PrintWriter writer) { 944 if (this.controller == null || this.controller.getFrontier() == null) { 945 writer.println("Crawler not running."); 946 return; 947 } 948 this.controller.getToePool().reportTo(reportName, writer); 949 } 950 951 959 public void killThread(int threadNumber, boolean replace) { 960 if (this.controller == null) { 961 return; 962 } 963 this.controller.killThread(threadNumber, replace); 964 } 965 966 970 public String getProcessorsReport() { 971 if (this.controller == null) { 972 return "Crawler not running"; 973 } 974 return ArchiveUtils.writeReportToString(this.controller, 975 CrawlController.PROCESSORS_REPORT); 976 } 977 978 985 public String getSettingsDirectory() { 986 return settingsHandler.getOrderFile().getPath(); 987 } 988 989 994 public File getDirectory(){ 995 return isProfile? new File (getSettingsDirectory()): jobDir; 996 } 997 998 1003 public String getErrorMessage() { 1004 return errorMessage; 1005 } 1006 1007 1012 public void setErrorMessage(String string) { 1013 errorMessage = string; 1014 writeJobFile(); } 1016 1017 1020 public int getNumberOfJournalEntries() { 1021 return numberOfJournalEntries; 1022 } 1023 1024 1027 public void setNumberOfJournalEntries(int numberOfJournalEntries) { 1028 this.numberOfJournalEntries = numberOfJournalEntries; 1029 writeJobFile(); 1030 } 1031 1032 1035 public CrawlJobErrorHandler getErrorHandler() { 1036 return errorHandler; 1037 } 1038 1039 1044 public Collection scanCheckpoints() { 1045 File checkpointsDirectory = 1046 settingsHandler.getOrder().getCheckpointsDirectory(); 1047 File [] perCheckpointDirs = checkpointsDirectory.listFiles(); 1048 Collection <Checkpoint> checkpoints = new ArrayList <Checkpoint>(); 1049 if (perCheckpointDirs != null) { 1050 for (int i = 0; i < perCheckpointDirs.length; i++) { 1051 Checkpoint cp = new Checkpoint(perCheckpointDirs[i]); 1052 checkpoints.add(cp); 1053 } 1054 } 1055 return checkpoints; 1056 } 1057 1058 1067 public String getLogPath(String log) 1068 throws AttributeNotFoundException , MBeanException , ReflectionException { 1069 String logsPath = (String )settingsHandler.getOrder(). 1070 getAttribute(CrawlOrder.ATTR_LOGS_PATH); 1071 CrawlOrder order = settingsHandler.getOrder(); 1072 String diskPath = (String ) order.getAttribute(null, 1073 CrawlOrder.ATTR_DISK_PATH); 1074 File disk = settingsHandler. 1075 getPathRelativeToWorkingDirectory(diskPath); 1076 File f = new File (logsPath, log); 1077 if (!f.isAbsolute()) { 1078 f = new File (disk.getPath(), f.getPath()); 1079 } 1080 return f.getAbsolutePath(); 1081 } 1082 1083 1085 protected void pause() { 1086 if (this.controller != null && this.controller.isPaused() == false) { 1087 this.controller.requestCrawlPause(); 1088 } 1089 } 1090 1091 protected void resume() { 1092 if (this.controller != null) { 1093 this.controller.requestCrawlResume(); 1094 } 1095 } 1096 1097 1100 protected void checkpoint() throws IllegalStateException { 1101 if (this.controller != null) { 1102 this.controller.requestCrawlCheckpoint(); 1103 } 1104 } 1105 1106 1109 public boolean isCheckpointing() { 1110 return this.controller != null? this.controller.isCheckpointing(): false; 1111 } 1112 1113 1116 protected void flush() { 1117 } 1119 1120 1127 public long deleteURIsFromPending(String regexpr){ 1128 return (this.controller != null && 1129 this.controller.getFrontier() != null && 1130 this.controller.isPaused())? 1131 this.controller.getFrontier().deleteURIs(regexpr): 0; 1132 } 1133 1134 public String importUris(String file, String style, String force) { 1135 return importUris(file, style, "true".equals(force)); 1136 } 1137 1138 public String importUris(final String fileOrUrl, final String style, 1139 final boolean forceRevisit) { 1140 return importUris(fileOrUrl, style, forceRevisit, false); 1141 } 1142 1143 1151 public String importUris(final String fileOrUrl, final String style, 1152 final boolean forceRevisit, final boolean areSeeds) { 1153 InputStream is = 1154 IoUtils.getInputStream(this.controller.getDisk(), fileOrUrl); 1155 String message = null; 1156 if (is == null) { 1158 message = "Failed to get inputstream from " + fileOrUrl; 1159 logger.severe(message); 1160 } else { 1161 int addedCount = importUris(is, style, forceRevisit, areSeeds); 1162 message = Integer.toString(addedCount) + " URIs added from " + 1163 fileOrUrl; 1164 } 1165 return message; 1166 } 1167 1168 protected int importUris(InputStream is, String style, 1169 boolean forceRevisit) { 1170 return importUris(is, style, forceRevisit, false); 1171 } 1172 1173 1185 protected int importUris(InputStream is, String style, 1186 boolean forceRevisit, final boolean areSeeds) { 1187 String extractor; 1189 String output; 1190 if(CRAWL_LOG_STYLE.equals(style)) { 1191 extractor = "\\S+\\s+\\S+\\s+\\S+\\s+(\\S+\\s+\\S+\\s+\\S+\\s+).*"; 1193 output = "$1"; 1194 } else if (RECOVERY_JOURNAL_STYLE.equals(style)) { 1195 extractor = "\\S+\\s+((\\S+)(?:\\s+\\S+\\s+\\S+)?)\\s*"; 1197 output = "$1"; 1198 } else { 1199 extractor = 1200 RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT; 1201 output = RegexpLineIterator.ENTRY; 1202 } 1203 1204 BufferedReader br = null; 1206 int addedCount = 0; 1207 try { 1208 br = new BufferedReader (new InputStreamReader (is)); 1209 Iterator iter = new RegexpLineIterator(new LineReadingIterator(br), 1210 RegexpLineIterator.COMMENT_LINE, extractor, output); 1211 while(iter.hasNext()) { 1212 try { 1213 importUri((String )iter.next(), forceRevisit, areSeeds, 1214 false); 1215 addedCount++; 1216 } catch (URIException e) { 1217 e.printStackTrace(); 1218 } 1219 } 1220 br.close(); 1221 flush(); 1222 } catch (IOException e) { 1223 e.printStackTrace(); 1224 } 1225 return addedCount; 1226 } 1227 1228 1235 public void importUri(final String uri, final boolean forceFetch, 1236 final boolean isSeed) 1237 throws URIException { 1238 importUri(uri, forceFetch, isSeed, true); 1239 } 1240 1241 1253 public void importUri(final String str, final boolean forceFetch, 1254 final boolean isSeed, final boolean isFlush) 1255 throws URIException { 1256 CandidateURI caUri = CandidateURI.fromString(str); 1257 caUri.setForceFetch(forceFetch); 1258 if (isSeed) { 1259 caUri.setIsSeed(isSeed); 1260 if (caUri.getVia() == null || caUri.getVia().length() <= 0) { 1261 this.controller.getScope().addSeed(caUri); 1265 } 1266 } 1267 this.controller.getFrontier().schedule(caUri); 1268 if (isFlush) { 1269 flush(); 1270 } 1271 } 1272 1273 1274 1278 public MBeanInfo getMBeanInfo() { 1279 return this.openMBeanInfo; 1280 } 1281 1282 1287 protected OpenMBeanInfoSupport buildMBeanInfo() 1288 throws InitializationException { 1289 List <OpenMBeanAttributeInfo > attributes 1291 = new ArrayList <OpenMBeanAttributeInfo >(); 1292 1293 attributes.add(new OpenMBeanAttributeInfoSupport (NAME_ATTR, 1295 "Crawl job name", SimpleType.STRING, true, false, false)); 1296 attributes.add(new OpenMBeanAttributeInfoSupport (STATUS_ATTR, 1297 "Short basic status message", SimpleType.STRING, true, false, 1298 false)); 1299 attributes.add( 1300 new OpenMBeanAttributeInfoSupport (FRONTIER_SHORT_REPORT_ATTR, 1301 "Short frontier report", SimpleType.STRING, true, 1302 false, false)); 1303 attributes.add( 1304 new OpenMBeanAttributeInfoSupport (THREADS_SHORT_REPORT_ATTR, 1305 "Short threads report", SimpleType.STRING, true, 1306 false, false)); 1307 attributes.add(new OpenMBeanAttributeInfoSupport (UID_ATTR, 1308 "Crawl job UID", SimpleType.STRING, true, false, false)); 1309 attributes.add(new OpenMBeanAttributeInfoSupport (TOTAL_DATA_ATTR, 1310 "Total data received", SimpleType.LONG, true, false, false)); 1311 attributes.add(new OpenMBeanAttributeInfoSupport (CRAWL_TIME_ATTR, 1312 "Crawl time", SimpleType.LONG, true, false, false)); 1313 attributes.add(new OpenMBeanAttributeInfoSupport (CURRENT_DOC_RATE_ATTR, 1314 "Current crawling rate (Docs/sec)", SimpleType.DOUBLE, 1315 true, false, false)); 1316 attributes.add(new OpenMBeanAttributeInfoSupport (CURRENT_KB_RATE_ATTR, 1317 "Current crawling rate (Kb/sec)", SimpleType.LONG, 1318 true, false, false)); 1319 attributes.add(new OpenMBeanAttributeInfoSupport (THREAD_COUNT_ATTR, 1320 "Active thread count", SimpleType.INTEGER, true, false, false)); 1321 attributes.add(new OpenMBeanAttributeInfoSupport (DOC_RATE_ATTR, 1322 "Crawling rate (Docs/sec)", SimpleType.DOUBLE, 1323 true, false, false)); 1324 attributes.add(new OpenMBeanAttributeInfoSupport (KB_RATE_ATTR, 1325 "Current crawling rate (Kb/sec)", SimpleType.LONG, 1326 true, false, false)); 1327 attributes.add(new OpenMBeanAttributeInfoSupport (DOWNLOAD_COUNT_ATTR, 1328 "Count of downloaded documents", SimpleType.LONG, 1329 true, false, false)); 1330 attributes.add(new OpenMBeanAttributeInfoSupport (DISCOVERED_COUNT_ATTR, 1331 "Count of discovered documents", SimpleType.LONG, 1332 true, false, false)); 1333 1334 addCrawlOrderAttributes(this.getController().getOrder(), attributes); 1336 1337 Environment env = this.controller.getBdbEnvironment(); 1342 try { 1343 this.bdbjeMBeanHelper = 1344 new JEMBeanHelper(env.getConfig(), env.getHome(), true); 1345 } catch (DatabaseException e) { 1346 e.printStackTrace(); 1347 InitializationException ie = 1348 new InitializationException(e.getMessage()); 1349 ie.setStackTrace(e.getStackTrace()); 1350 throw ie; 1351 } 1352 this.bdbjeAttributeNameList = Arrays.asList(new String [] { 1353 JEMBeanHelper.ATT_ENV_HOME, 1354 JEMBeanHelper.ATT_OPEN, 1355 JEMBeanHelper.ATT_IS_READ_ONLY, 1356 JEMBeanHelper.ATT_IS_TRANSACTIONAL, 1357 JEMBeanHelper.ATT_CACHE_SIZE, 1358 JEMBeanHelper.ATT_CACHE_PERCENT, 1359 JEMBeanHelper.ATT_LOCK_TIMEOUT, 1360 JEMBeanHelper.ATT_IS_SERIALIZABLE, 1361 JEMBeanHelper.ATT_SET_READ_ONLY, 1362 }); 1363 addBdbjeAttributes(attributes, 1364 this.bdbjeMBeanHelper.getAttributeList(env), 1365 this.bdbjeAttributeNameList); 1366 1367 List <OpenMBeanOperationInfo > operations 1369 = new ArrayList <OpenMBeanOperationInfo >(); 1370 OpenMBeanParameterInfo [] args = new OpenMBeanParameterInfoSupport [3]; 1371 args[0] = new OpenMBeanParameterInfoSupport ("url", 1372 "URL to add to the frontier", SimpleType.STRING); 1373 args[1] = new OpenMBeanParameterInfoSupport ("forceFetch", 1374 "True if URL is to be force fetched", SimpleType.BOOLEAN); 1375 args[2] = new OpenMBeanParameterInfoSupport ("seed", 1376 "True if URL is a seed", SimpleType.BOOLEAN); 1377 operations.add(new OpenMBeanOperationInfoSupport (IMPORT_URI_OPER, 1378 "Add passed URL to the frontier", args, SimpleType.VOID, 1379 MBeanOperationInfo.ACTION)); 1380 1381 args = new OpenMBeanParameterInfoSupport [4]; 1382 args[0] = new OpenMBeanParameterInfoSupport ("pathOrUrl", 1383 "Path or URL to file of URLs", SimpleType.STRING); 1384 args[1] = new OpenMBeanParameterInfoSupport ("style", 1385 "Format format:default|crawlLog|recoveryJournal", 1386 SimpleType.STRING); 1387 args[2] = new OpenMBeanParameterInfoSupport ("forceFetch", 1388 "True if URLs are to be force fetched", SimpleType.BOOLEAN); 1389 args[3] = new OpenMBeanParameterInfoSupport ("seed", 1390 "True if all content are seeds.", SimpleType.BOOLEAN); 1391 operations.add(new OpenMBeanOperationInfoSupport (IMPORT_URIS_OPER, 1392 "Add file of passed URLs to the frontier", args, SimpleType.STRING, 1393 MBeanOperationInfo.ACTION)); 1394 1395 operations.add(new OpenMBeanOperationInfoSupport (PAUSE_OPER, 1396 "Pause crawling (noop if already paused)", null, SimpleType.VOID, 1397 MBeanOperationInfo.ACTION)); 1398 1399 operations.add(new OpenMBeanOperationInfoSupport (RESUME_OPER, 1400 "Resume crawling (noop if already resumed)", null, 1401 SimpleType.VOID, MBeanOperationInfo.ACTION)); 1402 1403 args = new OpenMBeanParameterInfoSupport [1]; 1404 args[0] = new OpenMBeanParameterInfoSupport ("name", 1405 "Name of report ('all', 'standard', etc.).", SimpleType.STRING); 1406 operations.add(new OpenMBeanOperationInfoSupport (FRONTIER_REPORT_OPER, 1407 "Full frontier report", args, SimpleType.STRING, 1408 MBeanOperationInfo.INFO)); 1409 1410 operations.add(new OpenMBeanOperationInfoSupport (THREADS_REPORT_OPER, 1411 "Full thread report", null, SimpleType.STRING, 1412 MBeanOperationInfo.INFO)); 1413 1414 operations.add(new OpenMBeanOperationInfoSupport (SEEDS_REPORT_OPER, 1415 "Seeds report", null, SimpleType.STRING, MBeanOperationInfo.INFO)); 1416 1417 operations.add( 1418 new OpenMBeanOperationInfoSupport (PROGRESS_STATISTICS_OPER, 1419 "Progress statistics at time of invocation", null, 1420 SimpleType.STRING, MBeanOperationInfo.INFO)); 1421 1422 operations.add(new OpenMBeanOperationInfoSupport ( 1423 PROGRESS_STATISTICS_LEGEND_OPER, 1424 "Progress statistics legend", null, 1425 SimpleType.STRING, MBeanOperationInfo.INFO)); 1426 1427 operations.add(new OpenMBeanOperationInfoSupport (CHECKPOINT_OPER, 1428 "Start a checkpoint", null, SimpleType.VOID, 1429 MBeanOperationInfo.ACTION)); 1430 1431 this.bdbjeOperationsNameList = Arrays.asList(new String [] { "cleanLog", 1435 "evictMemory", "checkpoint", "sync", 1436 "getEnvironmentStatsToString", "getLockStatsToString", 1437 "getDatabaseNames", OP_DB_STAT 1438 }); 1439 addBdbjeOperations(operations, 1440 this.bdbjeMBeanHelper.getOperationList(env), 1441 this.bdbjeOperationsNameList); 1442 1443 List <MBeanNotificationInfo > notifications 1445 = new ArrayList <MBeanNotificationInfo >(); 1446 notifications.add( 1447 new MBeanNotificationInfo (new String [] {"crawlStarted", 1448 "crawlEnding", "crawlPaused", "crawlResuming", PROG_STATS}, 1449 this.getClass().getName() + ".notifications", 1450 "CrawlStatusListener events and progress statistics as " + 1451 "notifications")); 1452 MBeanNotificationInfo [] notificationsArray = 1453 new MBeanNotificationInfo [notifications.size()]; 1454 notifications.toArray(notificationsArray); 1455 1456 OpenMBeanAttributeInfoSupport [] attributesArray = 1458 new OpenMBeanAttributeInfoSupport [attributes.size()]; 1459 attributes.toArray(attributesArray); 1460 OpenMBeanOperationInfoSupport [] operationsArray = 1461 new OpenMBeanOperationInfoSupport [operations.size()]; 1462 operations.toArray(operationsArray); 1463 return new OpenMBeanInfoSupport (this.getClass().getName(), 1464 "Current Crawl Job as OpenMBean", 1465 attributesArray, 1466 new OpenMBeanConstructorInfoSupport [] {}, 1467 operationsArray, 1468 notificationsArray); 1469 } 1470 1471 protected void addBdbjeAttributes( 1472 final List <OpenMBeanAttributeInfo > attributes, 1473 final List <MBeanAttributeInfo > bdbjeAttributes, 1474 final List <String > bdbjeNamesToAdd) { 1475 for (MBeanAttributeInfo info: bdbjeAttributes) { 1476 if (bdbjeNamesToAdd.contains(info.getName())) { 1477 attributes.add(JmxUtils.convertToOpenMBeanAttribute(info)); 1478 } 1479 } 1480 } 1481 1482 protected void addBdbjeOperations( 1483 final List <OpenMBeanOperationInfo > operations, 1484 final List <MBeanOperationInfo > bdbjeOperations, 1485 final List <String > bdbjeNamesToAdd) { 1486 for (MBeanOperationInfo info: bdbjeOperations) { 1487 if (bdbjeNamesToAdd.contains(info.getName())) { 1488 OpenMBeanOperationInfo omboi = null; 1489 if (info.getName().equals(OP_DB_STAT)) { 1490 omboi = JmxUtils.convertToOpenMBeanOperation(info, null, 1494 SimpleType.STRING); 1495 MBeanParameterInfo [] params = omboi.getSignature(); 1496 OpenMBeanParameterInfo [] args = 1497 new OpenMBeanParameterInfoSupport [params.length + 1]; 1498 for (int ii = 0; ii < params.length; ii++) { 1499 args[ii] = (OpenMBeanParameterInfo ) params[ii]; 1500 } 1501 args[params.length] = new OpenMBeanParameterInfoSupport ( 1502 "name", "Database name", SimpleType.STRING); 1503 omboi = new OpenMBeanOperationInfoSupport (omboi.getName(), 1504 omboi.getDescription(), args, omboi.getReturnOpenType(), 1505 omboi.getImpact()); 1506 } else { 1507 omboi = JmxUtils.convertToOpenMBeanOperation(info); 1508 } 1509 operations.add(omboi); 1510 } 1511 } 1512 } 1513 1514 protected void addCrawlOrderAttributes(final ComplexType type, 1515 final List <OpenMBeanAttributeInfo > attributes) { 1516 for (final Iterator i = type.getAttributeInfoIterator(null); 1517 i.hasNext();) { 1518 ModuleAttributeInfo info = (ModuleAttributeInfo)i.next(); 1519 if (ORDER_EXCLUDE.contains(info.getName())) { 1520 continue; 1522 } 1523 String absoluteName = type.getAbsoluteName() + "/" + info.getName(); 1524 if (JmxUtils.isOpenType(info.getType())) { 1525 String description = info.getDescription(); 1526 if (description == null || description.length() <= 0) { 1527 description = info.getName(); 1529 } 1530 attributes.add(new OpenMBeanAttributeInfoSupport ( 1531 absoluteName, description, 1532 JmxUtils.getOpenType(info.getType()), true, true, false)); 1533 } else if(info.isComplexType()) { 1534 try { 1535 ComplexType c = 1536 (ComplexType)type.getAttribute(info.getName()); 1537 addCrawlOrderAttributes(c, attributes); 1538 } catch (AttributeNotFoundException e) { 1539 logger.log(Level.SEVERE, "Failed get of attribute", e); 1540 } catch (MBeanException e) { 1541 logger.log(Level.SEVERE, "Failed get of attribute", e); 1542 } catch (ReflectionException e) { 1543 logger.log(Level.SEVERE, "Failed get of attribute", e); 1544 } 1545 } else if (info.getType().equals(TextField.class.getName())) { 1546 attributes.add(new OpenMBeanAttributeInfoSupport ( 1548 absoluteName, info.getDescription(), 1549 SimpleType.STRING, true, true, false)); 1550 } else { 1551 logger.fine(info.getType()); 1554 } 1555 } 1556 } 1557 1558 public Object getAttribute(String attribute_name) 1559 throws AttributeNotFoundException { 1560 if (attribute_name == null) { 1561 throw new RuntimeOperationsException ( 1562 new IllegalArgumentException ("Attribute name cannot be null"), 1563 "Cannot call getAttribute with null attribute name"); 1564 } 1565 1566 if (this.controller == null) { 1568 throw new RuntimeOperationsException ( 1569 new NullPointerException ("Controller is null"), 1570 "Controller is null"); 1571 } 1572 1573 if (this.bdbjeAttributeNameList.contains(attribute_name)) { 1575 try { 1576 return this.bdbjeMBeanHelper.getAttribute( 1577 this.controller.getBdbEnvironment(), attribute_name); 1578 } catch (MBeanException e) { 1579 throw new RuntimeOperationsException (new RuntimeException (e)); 1580 } 1581 } 1582 1583 if (attribute_name. 1585 startsWith(this.controller.getOrder().getAbsoluteName())) { 1586 return getCrawlOrderAttribute(attribute_name); 1587 } 1588 1589 if (!ATTRIBUTE_LIST.contains(attribute_name)) { 1590 throw new AttributeNotFoundException ("Attribute " + 1591 attribute_name + " is unimplemented."); 1592 } 1593 1594 if (attribute_name.equals(STATUS_ATTR)) { 1599 return getCrawlStatus(); 1600 } 1601 if (attribute_name.equals(NAME_ATTR)) { 1602 return getJobName(); 1603 } 1604 if (attribute_name.equals(UID_ATTR)) { 1605 return getUID(); 1606 } 1607 if (attribute_name.equals(TOTAL_DATA_ATTR)) { 1608 return new Long (this.controller == null && 1609 this.controller.getStatistics() != null? 0: 1610 this.controller.getStatistics().totalBytesWritten()); 1611 } 1612 if (attribute_name.equals(CRAWL_TIME_ATTR)) { 1613 return new Long (this.controller == null && 1614 this.controller.getStatistics() != null? 0: 1615 this.controller.getStatistics().getCrawlerTotalElapsedTime() / 1616 1000); 1617 } 1618 if (attribute_name.equals(CURRENT_DOC_RATE_ATTR)) { 1619 return new Double (this.controller == null && 1620 this.controller.getStatistics() != null? 0: 1621 this.controller.getStatistics().currentProcessedDocsPerSec()); 1622 } 1623 if (attribute_name.equals(DOC_RATE_ATTR)) { 1624 return new Double (this.controller == null && 1625 this.controller.getStatistics() != null? 0: 1626 this.controller.getStatistics().processedDocsPerSec()); 1627 } 1628 if (attribute_name.equals(KB_RATE_ATTR)) { 1629 return new Long (this.controller == null && 1630 this.controller.getStatistics() != null? 0: 1631 this.controller.getStatistics().currentProcessedKBPerSec()); 1632 } 1633 if (attribute_name.equals(CURRENT_KB_RATE_ATTR)) { 1634 return new Long (this.controller == null && 1635 this.controller.getStatistics() != null? 0: 1636 this.controller.getStatistics().processedKBPerSec()); 1637 } 1638 if (attribute_name.equals(THREAD_COUNT_ATTR)) { 1639 return new Integer (this.controller == null && 1640 this.controller.getStatistics() != null? 0: 1641 this.controller.getStatistics().activeThreadCount()); 1642 } 1643 if (attribute_name.equals(FRONTIER_SHORT_REPORT_ATTR)) { 1644 return getFrontierOneLine(); 1645 } 1646 if (attribute_name.equals(THREADS_SHORT_REPORT_ATTR)) { 1647 return getThreadOneLine(); 1648 } 1649 if (attribute_name.equals(DISCOVERED_COUNT_ATTR)) { 1650 return new Long (this.controller == null && 1651 this.controller.getStatistics() != null? 0: 1652 this.controller.getStatistics().totalCount()); 1653 } 1654 if (attribute_name.equals(DOWNLOAD_COUNT_ATTR)) { 1655 return new Long (this.controller == null && 1656 this.controller.getStatistics() != null? 0: 1657 this.controller.getStatistics().successfullyFetchedCount()); 1658 } 1659 1660 throw new AttributeNotFoundException ("Attribute " + 1661 attribute_name + " not found."); 1662 } 1663 1664 protected Object getCrawlOrderAttribute(final String attribute_name) { 1665 CrawlOrder order = this.getController().getOrder(); 1666 Object result = null; 1667 try { 1668 result = getCrawlOrderAttribute(attribute_name.substring(order 1669 .getAbsoluteName().length()), order); 1670 } catch (NullPointerException e) { 1671 logger.log(Level.SEVERE, "Failed get of " + attribute_name, e); 1672 } catch (AttributeNotFoundException e) { 1673 logger.log(Level.SEVERE, "Failed get of " + attribute_name, e); 1674 } catch (MBeanException e) { 1675 logger.log(Level.SEVERE, "Failed get of " + attribute_name, e); 1676 } catch (ReflectionException e) { 1677 logger.log(Level.SEVERE, "Failed get of " + attribute_name, e); 1678 } 1679 return result; 1680 } 1681 1682 protected Object getCrawlOrderAttribute(final String attribute_name, 1683 final ComplexType ct) 1684 throws AttributeNotFoundException , MBeanException , ReflectionException { 1685 String subName = attribute_name.startsWith("/") ? attribute_name 1686 .substring(1) : attribute_name; 1687 int index = subName.indexOf("/"); 1688 if (index <= 0) { 1689 MBeanAttributeInfo info = ct.getAttributeInfo(subName); 1690 return info.getType().equals(TextField.class.getName()) ? ct 1692 .getAttribute(subName).toString() : ct 1693 .getAttribute(subName); 1694 } 1695 return getCrawlOrderAttribute(subName.substring(index + 1), 1696 (ComplexType) ct.getAttribute(subName.substring(0, index))); 1697 } 1698 1699 public AttributeList getAttributes(String [] attributeNames) { 1700 if (attributeNames == null) { 1701 throw new RuntimeOperationsException ( 1702 new IllegalArgumentException ("attributeNames[] cannot be " + 1703 "null"), "Cannot call getAttributes with null attribute " + 1704 "names"); 1705 } 1706 1707 if (this.controller == null) { 1709 throw new RuntimeOperationsException ( 1710 new NullPointerException ("Controller is null"), 1711 "Controller is null"); 1712 } 1713 1714 AttributeList resultList = new AttributeList (); 1715 if (attributeNames.length == 0) { 1716 return resultList; 1717 } 1718 for (int i = 0; i < attributeNames.length; i++) { 1719 try { 1720 Object value = getAttribute(attributeNames[i]); 1721 resultList.add(new Attribute (attributeNames[i], value)); 1722 } catch (Exception e) { 1723 e.printStackTrace(); 1724 } 1725 } 1726 return(resultList); 1727 } 1728 1729 public void setAttribute(Attribute attribute) 1730 throws AttributeNotFoundException { 1731 CrawlOrder order = this.getController().getOrder(); 1733 String attName = attribute.getName(); 1734 if (attName.startsWith(order.getAbsoluteName())) { 1735 try { 1736 setCrawlOrderAttribute(attribute.getName().substring( 1737 order.getAbsoluteName().length()), order, attribute); 1738 } catch (NullPointerException e) { 1739 logger.log(Level.SEVERE, "Failed set of " + attName, e); 1740 } catch (AttributeNotFoundException e) { 1741 logger.log(Level.SEVERE, "Failed set of " + attName, e); 1742 } catch (MBeanException e) { 1743 logger.log(Level.SEVERE, "Failed set of " + attName, e); 1744 } catch (ReflectionException e) { 1745 logger.log(Level.SEVERE, "Failed set of " + attName, e); 1746 } catch (InvalidAttributeValueException e) { 1747 logger.log(Level.SEVERE, "Failed set of " + attName, e); 1748 } 1749 return; 1750 } 1751 1752 if (this.bdbjeAttributeNameList.contains(attName)) { 1754 try { 1755 this.bdbjeMBeanHelper.setAttribute(this.controller 1756 .getBdbEnvironment(), attribute); 1757 } catch (AttributeNotFoundException e) { 1758 throw new RuntimeOperationsException (new RuntimeException (e)); 1759 } catch (InvalidAttributeValueException e) { 1760 throw new RuntimeOperationsException (new RuntimeException (e)); 1761 } 1762 return; 1763 } 1764 1765 throw new AttributeNotFoundException ("Attribute " + attName + 1767 " can not be set."); 1768 } 1769 1770 protected void setCrawlOrderAttribute(final String attribute_name, 1771 final ComplexType ct, final Attribute attribute) 1772 throws AttributeNotFoundException , InvalidAttributeValueException , 1773 MBeanException , ReflectionException { 1774 String subName = attribute_name.startsWith("/") ? attribute_name 1775 .substring(1) : attribute_name; 1776 int index = subName.indexOf("/"); 1777 if (index <= 0) { 1778 ct.setAttribute(new Attribute (subName, attribute.getValue())); 1779 return; 1780 } 1781 setCrawlOrderAttribute(subName.substring(index + 1), (ComplexType) ct 1782 .getAttribute(subName.substring(0, index)), attribute); 1783 } 1784 1785 public AttributeList setAttributes(AttributeList attributes) { 1786 if (attributes == null) { 1787 throw new RuntimeOperationsException ( 1788 new IllegalArgumentException ("attributeNames[] cannot be " + 1789 "null"), "Cannot call getAttributes with null attribute " + 1790 "names"); 1791 } 1792 1793 AttributeList resultList = new AttributeList (); 1794 if (attributes.size() == 0) { 1795 return resultList; 1796 } 1797 for (int i = 0; i < attributes.size(); i++) { 1798 try { 1799 Attribute attr = (Attribute )attributes.get(i); 1800 setAttribute(attr); 1801 String an = attr.getName(); 1802 Object newValue = getAttribute(an); 1803 resultList.add(new Attribute (an, newValue)); 1804 } catch (Exception e) { 1805 e.printStackTrace(); 1806 } 1807 } 1808 return resultList; 1809 } 1810 1811 public Object invoke(String operationName, Object [] params, 1812 String [] signature) 1813 throws ReflectionException { 1814 if (operationName == null) { 1815 throw new RuntimeOperationsException ( 1816 new IllegalArgumentException ("Operation name cannot be null"), 1817 "Cannot call invoke with null operation name"); 1818 } 1819 1820 if (this.bdbjeOperationsNameList.contains(operationName)) { 1821 try { 1822 Object o = this.bdbjeMBeanHelper.invoke( 1823 this.controller.getBdbEnvironment(), 1824 operationName, params, signature); 1825 if (operationName.equals(OP_DB_STAT)) { 1827 return o.toString(); 1828 } 1829 return o; 1830 } catch (MBeanException e) { 1831 throw new RuntimeOperationsException (new RuntimeException (e)); 1832 } 1833 } 1834 1835 1837 if (operationName.equals(IMPORT_URI_OPER)) { 1842 JmxUtils.checkParamsCount(IMPORT_URI_OPER, params, 3); 1843 mustBeCrawling(); 1844 try { 1845 importUri((String )params[0], 1846 ((Boolean )params[1]).booleanValue(), 1847 ((Boolean )params[2]).booleanValue()); 1848 } catch (URIException e) { 1849 throw new RuntimeOperationsException (new RuntimeException (e)); 1850 } 1851 return null; 1852 } 1853 1854 if (operationName.equals(IMPORT_URIS_OPER)) { 1855 JmxUtils.checkParamsCount(IMPORT_URIS_OPER, params, 4); 1856 mustBeCrawling(); 1857 return importUris((String )params[0], 1858 ((String )params[1]).toString(), 1859 ((Boolean )params[2]).booleanValue(), 1860 ((Boolean )params[3]).booleanValue()); 1861 } 1862 1863 if (operationName.equals(PAUSE_OPER)) { 1864 JmxUtils.checkParamsCount(PAUSE_OPER, params, 0); 1865 mustBeCrawling(); 1866 pause(); 1867 return null; 1868 } 1869 1870 if (operationName.equals(RESUME_OPER)) { 1871 JmxUtils.checkParamsCount(RESUME_OPER, params, 0); 1872 mustBeCrawling(); 1873 resume(); 1874 return null; 1875 } 1876 1877 if (operationName.equals(FRONTIER_REPORT_OPER)) { 1878 JmxUtils.checkParamsCount(FRONTIER_REPORT_OPER, params, 1); 1879 mustBeCrawling(); 1880 return getFrontierReport((String )params[0]); 1881 } 1882 1883 if (operationName.equals(THREADS_REPORT_OPER)) { 1884 JmxUtils.checkParamsCount(THREADS_REPORT_OPER, params, 0); 1885 mustBeCrawling(); 1886 return getThreadsReport(); 1887 } 1888 1889 if (operationName.equals(SEEDS_REPORT_OPER)) { 1890 JmxUtils.checkParamsCount(SEEDS_REPORT_OPER, params, 0); 1891 mustBeCrawling(); 1892 StringWriter sw = new StringWriter (); 1893 if (getStatisticsTracking() != null && 1894 getStatisticsTracking() instanceof StatisticsTracker) { 1895 ((StatisticsTracker)getStatisticsTracking()). 1896 writeSeedsReportTo(new PrintWriter (sw)); 1897 } else { 1898 sw.write("Unsupported"); 1899 } 1900 return sw.toString(); 1901 } 1902 1903 if (operationName.equals(CHECKPOINT_OPER)) { 1904 JmxUtils.checkParamsCount(CHECKPOINT_OPER, params, 0); 1905 mustBeCrawling(); 1906 try { 1907 checkpoint(); 1908 } catch (IllegalStateException e) { 1909 throw new RuntimeOperationsException (e); 1910 } 1911 return null; 1912 } 1913 1914 if (operationName.equals(PROGRESS_STATISTICS_OPER)) { 1915 JmxUtils.checkParamsCount(PROGRESS_STATISTICS_OPER, params, 0); 1916 mustBeCrawling(); 1917 return getStatisticsTracking().getProgressStatisticsLine(); 1918 } 1919 1920 if (operationName.equals(PROGRESS_STATISTICS_LEGEND_OPER)) { 1921 JmxUtils.checkParamsCount(PROGRESS_STATISTICS_LEGEND_OPER, 1922 params, 0); 1923 return getStatisticsTracking().progressStatisticsLegend(); 1924 } 1925 1926 throw new ReflectionException ( 1927 new NoSuchMethodException (operationName), 1928 "Cannot find the operation " + operationName); 1929 } 1930 1931 public void mustBeCrawling() { 1932 if (!isCrawling()) { 1933 throw new RuntimeOperationsException ( 1934 new IllegalArgumentException ("Not " + 1935 "crawling (Shouldn't ever be the case)"), 1936 "Not current crawling job?"); 1937 } 1938 } 1939 1940 public boolean isCrawling() { 1941 return this.controller != null; 1942 } 1943 1944 1950 public String getIgnoredSeeds() { 1951 File ignoredFile = new File (getDirectory(), 1952 AbstractFrontier.IGNORED_SEEDS_FILENAME); 1953 if(!ignoredFile.exists()) { 1954 return null; 1955 } 1956 try { 1957 return FileUtils.readFileAsString(ignoredFile); 1958 } catch (IOException e) { 1959 e.printStackTrace(); 1961 return null; 1962 } 1963 } 1964 1965 1969 public void kickUpdate(){ 1970 if (this.controller != null){ 1971 this.controller.kickUpdate(); 1972 } 1973 } 1974 1975 1988 public FrontierMarker getInitialMarker(String regexpr, 1989 boolean inCacheOnly) { 1990 return (this.controller != null && this.controller.isPaused())? 1991 this.controller.getFrontier().getInitialMarker(regexpr, inCacheOnly): 1992 null; 1993 } 1994 1995 2011 public ArrayList getPendingURIsList(FrontierMarker marker, 2012 int numberOfMatches, boolean verbose) 2013 throws InvalidFrontierMarkerException { 2014 return (this.controller != null && this.controller.isPaused())? 2015 this.controller.getFrontier().getURIsList(marker, numberOfMatches, 2016 verbose): 2017 null; 2018 } 2019 2020 public void crawlStarted(String message) { 2021 if (this.mbeanName != null) { 2022 sendNotification(new Notification ("crawlStarted", 2024 this.mbeanName, getNotificationsSequenceNumber(), message)); 2025 } 2026 } 2027 2028 public void crawlEnding(String sExitMessage) { 2029 setRunning(false); 2030 setStatus(sExitMessage); 2031 setReadOnly(); 2032 if (this.mbeanName != null) { 2033 sendNotification(new Notification ("crawlEnding", this.mbeanName, 2034 getNotificationsSequenceNumber(), sExitMessage)); 2035 } 2036 } 2037 2038 public void crawlEnded(String sExitMessage) { 2039 2045 } 2051 2052 public void crawlPausing(String statusMessage) { 2053 setStatus(statusMessage); 2054 } 2055 2056 public void crawlPaused(String statusMessage) { 2057 setStatus(statusMessage); 2058 if (this.mbeanName != null) { 2059 sendNotification(new Notification ("crawlPaused", this.mbeanName, 2061 getNotificationsSequenceNumber(), statusMessage)); 2062 } 2063 } 2064 2065 public void crawlResuming(String statusMessage) { 2066 setStatus(statusMessage); 2067 if (this.mbeanName != null) { 2068 sendNotification(new Notification ("crawlResuming", this.mbeanName, 2070 getNotificationsSequenceNumber(), statusMessage)); 2071 } 2072 } 2073 2074 public void crawlCheckpoint(File checkpointDir) throws Exception { 2075 setStatus(CrawlJob.STATUS_CHECKPOINTING); 2076 } 2077 2078 public CrawlController getController() { 2079 return this.controller; 2080 } 2081 2082 public ObjectName preRegister(final MBeanServer server, ObjectName on) 2083 throws Exception { 2084 this.mbeanServer = server; 2085 @SuppressWarnings ("unchecked") 2086 Hashtable <String ,String > ht = on.getKeyPropertyList(); 2087 if (!ht.containsKey(JmxUtils.NAME)) { 2088 throw new IllegalArgumentException ("Name property required" + 2089 on.getCanonicalName()); 2090 } 2091 Heritrix h = getHostingHeritrix(); 2095 if (h == null || h.getMBeanName() == null) { 2096 throw new IllegalArgumentException ("Hosting heritrix not found " + 2097 "or not registered with JMX: " + on.getCanonicalName()); 2098 } 2099 @SuppressWarnings ("unchecked") 2100 Map <String ,String > hht = h.getMBeanName().getKeyPropertyList(); 2101 ht.put(JmxUtils.MOTHER, hht.get(JmxUtils.NAME)); 2102 String port = hht.get(JmxUtils.JMX_PORT); 2103 if (port != null) { 2104 ht.put(JmxUtils.JMX_PORT, port); 2105 } 2106 ht.put(JmxUtils.HOST, hht.get(JmxUtils.HOST)); 2107 if (!ht.containsKey(JmxUtils.TYPE)) { 2108 ht.put(JmxUtils.TYPE, CRAWLJOB_JMXMBEAN_TYPE); 2109 } 2110 this.mbeanName = new ObjectName (on.getDomain(), ht); 2111 return this.mbeanName; 2112 } 2113 2114 public void postRegister(Boolean registrationDone) { 2115 if (logger.isLoggable(Level.INFO)) { 2116 logger.info( 2117 JmxUtils.getLogRegistrationMsg(this.mbeanName.getCanonicalName(), 2118 this.mbeanServer, registrationDone.booleanValue())); 2119 } 2120 } 2121 2122 public void preDeregister() throws Exception { 2123 } 2125 2126 public void postDeregister() { 2127 if (mbeanName == null) { 2128 return; 2129 } 2130 if (logger.isLoggable(Level.INFO)) { 2131 logger.info(JmxUtils.getLogUnregistrationMsg( 2132 this.mbeanName.getCanonicalName(), this.mbeanServer)); 2133 } 2134 this.mbeanName = null; 2135 } 2136 2137 2140 protected Heritrix getHostingHeritrix() { 2141 Heritrix hostingHeritrix = null; 2142 Map heritrice = Heritrix.getInstances(); 2143 for (final Iterator i = heritrice.keySet().iterator(); i.hasNext();) { 2144 Heritrix h = (Heritrix)heritrice.get(i.next()); 2145 if (h.getJobHandler().getCurrentJob() == this) { 2146 hostingHeritrix = h; 2147 break; 2148 } 2149 } 2150 return hostingHeritrix; 2151 } 2152 2153 2157 public String getJmxJobName() { 2158 return getJobName() + "-" + getUID(); 2159 } 2160 2161 2164 protected static int getNotificationsSequenceNumber() { 2165 return notificationsSequenceNumber++; 2166 } 2167 2168 protected ObjectName getMbeanName() { 2169 return this.mbeanName; 2170 } 2171 2172 2175 public StatisticsTracking getStatisticsTracking() { 2176 return this.controller == null || 2177 this.controller.getStatistics() == null? null: 2178 this.controller.getStatistics(); 2179 } 2180} 2181 | Popular Tags |