1 25 package org.archive.crawler; 26 27 import java.io.File ; 28 import java.io.FileInputStream ; 29 import java.io.FileNotFoundException ; 30 import java.io.FileOutputStream ; 31 import java.io.IOException ; 32 import java.io.InputStream ; 33 import java.io.PrintStream ; 34 import java.io.PrintWriter ; 35 import java.net.HttpURLConnection ; 36 import java.net.InetAddress ; 37 import java.net.URL ; 38 import java.net.URLConnection ; 39 import java.net.UnknownHostException ; 40 import java.util.ArrayList ; 41 import java.util.Arrays ; 42 import java.util.Collection ; 43 import java.util.Collections ; 44 import java.util.Enumeration ; 45 import java.util.Hashtable ; 46 import java.util.Iterator ; 47 import java.util.List ; 48 import java.util.Map ; 49 import java.util.Properties ; 50 import java.util.StringTokenizer ; 51 import java.util.TimeZone ; 52 import java.util.Vector ; 53 import java.util.logging.Level ; 54 import java.util.logging.LogManager ; 55 import java.util.logging.Logger ; 56 57 import javax.management.Attribute ; 58 import javax.management.AttributeList ; 59 import javax.management.AttributeNotFoundException ; 60 import javax.management.DynamicMBean ; 61 import javax.management.InstanceAlreadyExistsException ; 62 import javax.management.InstanceNotFoundException ; 63 import javax.management.InvalidAttributeValueException ; 64 import javax.management.MBeanInfo ; 65 import javax.management.MBeanNotificationInfo ; 66 import javax.management.MBeanOperationInfo ; 67 import javax.management.MBeanRegistration ; 68 import javax.management.MBeanRegistrationException ; 69 import javax.management.MBeanServer ; 70 import javax.management.MBeanServerFactory ; 71 import javax.management.MalformedObjectNameException ; 72 import javax.management.NotCompliantMBeanException ; 73 import javax.management.ObjectName ; 74 import javax.management.ReflectionException ; 75 import javax.management.RuntimeOperationsException ; 76 import javax.management.openmbean.CompositeData ; 77 import javax.management.openmbean.CompositeDataSupport ; 78 import javax.management.openmbean.CompositeType ; 79 import javax.management.openmbean.OpenDataException ; 80 import javax.management.openmbean.OpenMBeanAttributeInfoSupport ; 81 import javax.management.openmbean.OpenMBeanConstructorInfoSupport ; 82 import javax.management.openmbean.OpenMBeanInfoSupport ; 83 import javax.management.openmbean.OpenMBeanOperationInfoSupport ; 84 import javax.management.openmbean.OpenMBeanParameterInfo ; 85 import javax.management.openmbean.OpenMBeanParameterInfoSupport ; 86 import javax.management.openmbean.OpenType ; 87 import javax.management.openmbean.SimpleType ; 88 import javax.management.openmbean.TabularData ; 89 import javax.management.openmbean.TabularDataSupport ; 90 import javax.management.openmbean.TabularType ; 91 import javax.naming.CompoundName ; 92 import javax.naming.Context ; 93 import javax.naming.NameNotFoundException ; 94 import javax.naming.NamingException ; 95 import javax.naming.NoInitialContextException ; 96 97 import org.apache.commons.cli.Option; 98 import org.archive.crawler.admin.CrawlJob; 99 import org.archive.crawler.admin.CrawlJobErrorHandler; 100 import org.archive.crawler.admin.CrawlJobHandler; 101 import org.archive.crawler.datamodel.CredentialStore; 102 import org.archive.crawler.datamodel.credential.Credential; 103 import org.archive.crawler.event.CrawlStatusListener; 104 import org.archive.crawler.framework.AlertManager; 105 import org.archive.crawler.framework.CrawlController; 106 import org.archive.crawler.framework.exceptions.FatalConfigurationException; 107 import org.archive.crawler.framework.exceptions.InitializationException; 108 import org.archive.crawler.selftest.SelfTestCrawlJobHandler; 109 import org.archive.crawler.settings.XMLSettingsHandler; 110 import org.archive.io.SinkHandler; 111 import org.archive.io.SinkHandlerLogRecord; 112 import org.archive.net.UURI; 113 import org.archive.util.FileUtils; 114 import org.archive.util.IoUtils; 115 import org.archive.util.JmxUtils; 116 import org.archive.util.JndiUtils; 117 import org.archive.util.PropertyUtils; 118 import org.archive.util.TextUtils; 119 120 import sun.net.www.protocol.file.FileURLConnection; 121 122 123 148 public class Heritrix implements DynamicMBean , MBeanRegistration { 149 152 private static final Logger logger = 153 Logger.getLogger(Heritrix.class.getName()); 154 155 private static final File TMPDIR = 156 new File (System.getProperty("java.io.tmpdir", "/tmp")); 157 158 161 private static final String PROPERTIES = "heritrix.properties"; 162 163 167 private static final String PROPERTIES_KEY = PROPERTIES; 168 169 172 private static final String HERITRIX_PROPERTIES_PREFIX = "heritrix."; 173 174 177 private static SimpleHttpServer httpServer = null; 178 179 182 private CrawlJobHandler jobHandler = null; 183 184 192 private static final String STARTLOG = "heritrix_dmesg.log"; 193 194 199 public static final String DEFAULT_ENCODING = "ISO-8859-1"; 200 201 210 private static String DEFAULT_HERITRIX_OUT = "heritrix_out.log"; 211 212 218 private static PrintWriter out = null; 219 220 223 private static final String ARCHIVE_PACKAGE = "org.archive."; 224 225 228 private static final String CRAWLER_PACKAGE = Heritrix.class.getName(). 229 substring(0, Heritrix.class.getName().lastIndexOf('.')); 230 231 234 private static final String ROOT_CONTEXT = "/"; 235 236 239 private static boolean commandLine = false; 240 241 244 private static boolean containerInitialized = false; 245 246 249 private static boolean propertiesLoaded = false; 250 251 private static final String JAR_SUFFIX = ".jar"; 252 253 private AlertManager alertManager; 254 255 258 private static String adminContext = ROOT_CONTEXT; 259 260 264 private static boolean gui = 265 !PropertyUtils.getBooleanProperty("heritrix.cmdline.nowui"); 266 267 271 private static int guiPort = SimpleHttpServer.DEFAULT_PORT; 272 273 274 278 final private static Collection <String > LOCALHOST_ONLY = 279 Collections.unmodifiableList(Arrays.asList(new String [] { "127.0.0.1" })); 280 281 282 288 private static Collection <String > guiHosts = LOCALHOST_ONLY; 289 290 291 294 private static String ADMIN = "admin"; 295 296 300 private MBeanServer mbeanServer = null; 301 302 305 private ObjectName mbeanName = null; 306 307 319 private static Map <String ,Heritrix> instances 320 = new Hashtable <String ,Heritrix>(); 321 322 private OpenMBeanInfoSupport openMBeanInfo; 323 private final static String STATUS_ATTR = "Status"; 324 private final static String VERSION_ATTR = "Version"; 325 private final static List ATTRIBUTE_LIST; 326 static { 327 ATTRIBUTE_LIST = Arrays.asList(new String [] {STATUS_ATTR, 328 VERSION_ATTR}); 329 } 330 331 private final static String START_OPER = "start"; 332 private final static String STOP_OPER = "stop"; 333 private final static String DESTROY_OPER = "destroy"; 334 private final static String INTERRUPT_OPER = "interrupt"; 335 private final static String START_CRAWLING_OPER = "startCrawling"; 336 private final static String STOP_CRAWLING_OPER = "stopCrawling"; 337 private final static String ADD_CRAWL_JOB_OPER = "addJob"; 338 private final static String TERMINATE_CRAWL_JOB_OPER = 339 "terminateCurrentJob"; 340 private final static String DELETE_CRAWL_JOB_OPER = "deleteJob"; 341 private final static String ALERT_OPER = "alert"; 342 private final static String ADD_CRAWL_JOB_BASEDON_OPER = "addJobBasedon"; 343 private final static String PENDING_JOBS_OPER = "pendingJobs"; 344 private final static String COMPLETED_JOBS_OPER = "completedJobs"; 345 private final static String CRAWLEND_REPORT_OPER = "crawlendReport"; 346 private final static String SHUTDOWN_OPER = "shutdown"; 347 private final static String LOG_OPER = "log"; 348 private final static String REBIND_JNDI_OPER = "rebindJNDI"; 349 private final static List OPERATION_LIST; 350 static { 351 OPERATION_LIST = Arrays.asList(new String [] {START_OPER, STOP_OPER, 352 INTERRUPT_OPER, START_CRAWLING_OPER, STOP_CRAWLING_OPER, 353 ADD_CRAWL_JOB_OPER, ADD_CRAWL_JOB_BASEDON_OPER, 354 DELETE_CRAWL_JOB_OPER, ALERT_OPER, PENDING_JOBS_OPER, 355 COMPLETED_JOBS_OPER, CRAWLEND_REPORT_OPER, SHUTDOWN_OPER, 356 LOG_OPER, DESTROY_OPER, TERMINATE_CRAWL_JOB_OPER, 357 REBIND_JNDI_OPER}); 358 } 359 private CompositeType jobCompositeType = null; 360 private TabularType jobsTabularType = null; 361 private static final String [] JOB_KEYS = 362 new String [] {"uid", "name", "status"}; 363 364 private static String adminUsername; 365 366 private static String adminPassword; 367 368 376 public Heritrix() throws IOException { 377 this(null, false); 378 } 379 380 public Heritrix(final boolean jmxregister) throws IOException { 381 this(null, jmxregister); 382 } 383 384 391 public Heritrix(final String name, final boolean jmxregister) 392 throws IOException { 393 this(name, jmxregister, new CrawlJobHandler(getJobsdir())); 394 } 395 396 404 public Heritrix(final String name, final boolean jmxregister, 405 final CrawlJobHandler cjh) 406 throws IOException { 407 super(); 408 containerInitialization(); 409 this.jobHandler = cjh; 410 this.openMBeanInfo = buildMBeanInfo(); 411 final SinkHandler sinkHandler = SinkHandler.getInstance(); 415 if (sinkHandler == null) { 416 throw new NullPointerException ("SinkHandler not found."); 417 } 418 this.alertManager = new AlertManager() { 420 public void add(SinkHandlerLogRecord record) { 421 sinkHandler.publish(record); 422 } 423 424 public Vector getAll() { 425 return sinkHandler.getAll(); 426 } 427 428 public Vector getNewAll() { 429 return sinkHandler.getAllUnread(); 430 } 431 432 public SinkHandlerLogRecord get(String alertID) { 433 return sinkHandler.get(Long.parseLong(alertID)); 434 } 435 436 public int getCount() { 437 return sinkHandler.getCount(); 438 } 439 440 public int getNewCount() { 441 return sinkHandler.getUnreadCount(); 442 } 443 444 public void remove(String alertID) { 445 sinkHandler.remove(Long.parseLong(alertID)); 446 } 447 448 public void read(String alertID) { 449 sinkHandler.read(Long.parseLong(alertID)); 450 } 451 }; 452 453 try { 454 Heritrix.registerHeritrix(this, name, jmxregister); 455 } catch (InstanceAlreadyExistsException e) { 456 throw new RuntimeException (e); 457 } catch (MBeanRegistrationException e) { 458 throw new RuntimeException (e); 459 } catch (NotCompliantMBeanException e) { 460 throw new RuntimeException (e); 461 } catch (MalformedObjectNameException e) { 462 throw new RuntimeException (e); 463 } 464 } 465 466 471 protected static void containerInitialization() throws IOException { 472 if (Heritrix.containerInitialized) { 473 return; 474 } 475 Heritrix.containerInitialized = true; 476 Heritrix.loadProperties(); 481 Heritrix.patchLogging(); 482 Heritrix.configureTrustStore(); 483 Runtime.getRuntime().addShutdownHook( 487 Heritrix.getShutdownThread(false, 0, "Heritrix shutdown hook")); 488 try { 491 registerContainerJndi(); 492 } catch (Exception e) { 493 logger.log(Level.WARNING, "Failed jndi container registration.", e); 494 } 495 } 496 497 506 public void destroy() { 507 stop(); 508 try { 509 Heritrix.unregisterHeritrix(this); 510 } catch (InstanceNotFoundException e) { 511 e.printStackTrace(); 512 } catch (MBeanRegistrationException e) { 513 e.printStackTrace(); 514 } catch (NullPointerException e) { 515 e.printStackTrace(); 516 } 517 this.jobHandler = null; 518 this.openMBeanInfo = null; 519 } 520 521 530 public static void main(String [] args) 531 throws Exception { 532 Heritrix.commandLine = true; 533 534 TimeZone.setDefault(TimeZone.getTimeZone("GMT")); 537 538 File startLog = new File (getHeritrixHome(), STARTLOG); 539 Heritrix.out = new PrintWriter (isDevelopment()? 540 System.out: new PrintStream (new FileOutputStream (startLog))); 541 542 try { 543 containerInitialization(); 544 String status = doCmdLineArgs(args); 545 if (status != null) { 546 Heritrix.out.println(status); 547 } 548 } 549 550 catch(Exception e) { 551 e.printStackTrace(Heritrix.out); 553 throw e; 554 } 555 556 finally { 557 if (!isDevelopment()) { 561 if (Heritrix.out != null) { 562 Heritrix.out.close(); 563 } 564 System.out.println("Heritrix version: " + 565 Heritrix.getVersion()); 566 } else { 567 if (Heritrix.out != null) { 568 Heritrix.out.flush(); 569 } 570 } 571 } 572 } 573 574 protected static String doCmdLineArgs(final String [] args) 575 throws Exception { 576 String tmpStr = PropertyUtils. 578 getPropertyOrNull("heritrix.context"); 579 if (tmpStr != null) { 580 Heritrix.adminContext = tmpStr; 581 } 582 tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.port"); 583 if (tmpStr != null) { 584 Heritrix.guiPort = Integer.parseInt(tmpStr); 585 } 586 tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.admin"); 587 String adminLoginPassword = (tmpStr == null)? "": tmpStr; 588 String crawlOrderFile = 589 PropertyUtils.getPropertyOrNull("heritrix.cmdline.order"); 590 tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.run"); 591 boolean runMode = 592 PropertyUtils.getBooleanProperty("heritrix.cmdline.run"); 593 boolean selfTest = false; 594 String selfTestName = null; 595 CommandLineParser clp = new CommandLineParser(args, Heritrix.out, 596 Heritrix.getVersion()); 597 List arguments = clp.getCommandLineArguments(); 598 Option [] options = clp.getCommandLineOptions(); 599 600 if (arguments.size() > 1) { 603 clp.usage(1); 604 } else if (arguments.size() == 1) { 605 crawlOrderFile = (String )arguments.get(0); 606 if (!(new File (crawlOrderFile).exists())) { 607 clp.usage("ORDER.XML <" + crawlOrderFile + 608 "> specified does not exist.", 1); 609 } 610 if (crawlOrderFile.length() > 4 && 612 !crawlOrderFile.substring(crawlOrderFile.length() - 4). 613 equalsIgnoreCase(".xml")) { 614 clp.usage("ORDER.XML <" + crawlOrderFile + 615 "> does not have required '.xml' suffix.", 1); 616 } 617 } 618 619 for (int i = 0; i < options.length; i++) { 621 switch(options[i].getId()) { 622 case 'h': 623 clp.usage(); 624 break; 625 626 case 'a': 627 adminLoginPassword = options[i].getValue(); 628 break; 629 630 case 'n': 631 if (crawlOrderFile == null) { 632 clp.usage("You must specify an ORDER_FILE with" + 633 " '--nowui' option.", 1); 634 } 635 Heritrix.gui = false; 636 break; 637 638 case 'b': 639 Heritrix.guiHosts = parseHosts(options[i].getValue()); 640 break; 641 642 case 'p': 643 try { 644 Heritrix.guiPort = 645 Integer.parseInt(options[i].getValue()); 646 } catch (NumberFormatException e) { 647 clp.usage("Failed parse of port number: " + 648 options[i].getValue(), 1); 649 } 650 if (Heritrix.guiPort <= 0) { 651 clp.usage("Nonsensical port number: " + 652 options[i].getValue(), 1); 653 } 654 break; 655 656 case 'r': 657 runMode = true; 658 break; 659 660 case 's': 661 selfTestName = options[i].getValue(); 662 selfTest = true; 663 break; 664 665 default: 666 assert false: options[i].getId(); 667 } 668 } 669 670 String status = null; 672 if (selfTest) { 673 for (int i = 0; i < options.length; i++) { 677 if (options[i].getId() != 'p' && options[i].getId() != 's') { 678 clp.usage(1); 679 } 680 } 681 682 if (arguments.size() > 0) { 683 clp.usage(1); 685 } 686 status = selftest(selfTestName, Heritrix.guiPort); 687 } else { 688 if (!isValidLoginPasswordString(adminLoginPassword)) { 689 clp.usage("Invalid admin login:password value, or none " 690 + "specified. ", 1); 691 } 692 693 if (!Heritrix.gui) { 694 if (options.length > 1) { 695 clp.usage(1); 699 } 700 Heritrix h = new Heritrix(true); 701 status = h.doOneCrawl(crawlOrderFile); 702 } else { 703 status = startEmbeddedWebserver( 704 Heritrix.guiHosts, Heritrix.guiPort, 705 adminLoginPassword); 706 Heritrix h = new Heritrix(true); 707 708 String tmp = h.launch(crawlOrderFile, runMode); 709 if (tmp != null) { 710 status += ('\n' + tmp); 711 } 712 } 713 } 714 return status; 715 } 716 717 720 public static String getHeritrixOut() { 721 String tmp = System.getProperty("heritrix.out"); 722 if (tmp == null || tmp.length() == 0) { 723 tmp = Heritrix.DEFAULT_HERITRIX_OUT; 724 } 725 return tmp; 726 } 727 728 734 protected static File getHeritrixHome() 735 throws IOException { 736 File heritrixHome = null; 737 String home = System.getProperty("heritrix.home"); 738 if (home != null && home.length() > 0) { 739 heritrixHome = new File (home); 740 if (!heritrixHome.exists()) { 741 throw new IOException ("HERITRIX_HOME <" + home + 742 "> does not exist."); 743 } 744 } else { 745 heritrixHome = new File (new File ("").getAbsolutePath()); 746 } 747 return heritrixHome; 748 } 749 750 756 public static File getJobsdir() throws IOException { 757 Heritrix.loadProperties(); String jobsdirStr = System.getProperty("heritrix.jobsdir", "jobs"); 759 File jobsdir = new File (jobsdirStr); 760 return (jobsdir.isAbsolute())? 761 jobsdir: 762 new File (getHeritrixHome(), jobsdirStr); 763 } 764 765 775 protected static File getSubDir(String subdirName) 776 throws IOException { 777 return getSubDir(subdirName, true); 778 } 779 780 792 protected static File getSubDir(String subdirName, boolean fail) 793 throws IOException { 794 String path = isDevelopment()? 795 "src" + File.separator + subdirName: 796 subdirName; 797 File dir = new File (getHeritrixHome(), path); 798 if (!dir.exists()) { 799 if (fail) { 800 throw new IOException ("Cannot find subdir: " + subdirName); 801 } 802 dir = null; 803 } 804 return dir; 805 } 806 807 816 protected static boolean isValidLoginPasswordString(String str) { 817 boolean isValid = false; 818 StringTokenizer tokenizer = new StringTokenizer (str, ":"); 819 if (tokenizer.countTokens() == 2) { 820 String login = ((String )tokenizer.nextElement()).trim(); 821 String password = ((String )tokenizer.nextElement()).trim(); 822 if (login.length() > 0 && password.length() > 0) { 823 isValid = true; 824 } 825 } 826 return isValid; 827 } 828 829 protected static boolean isDevelopment() { 830 return System.getProperty("heritrix.development") != null; 831 } 832 833 843 protected static Properties loadProperties() 844 throws IOException { 845 if (Heritrix.propertiesLoaded) { 846 return System.getProperties(); 847 } 848 Heritrix.propertiesLoaded = true; 849 850 Properties properties = new Properties (); 851 properties.load(getPropertiesInputStream()); 852 853 for (Enumeration e = properties.keys(); e.hasMoreElements();) { 858 String key = ((String )e.nextElement()).trim(); 859 if (key.startsWith(ARCHIVE_PACKAGE) || 860 key.startsWith(HERITRIX_PROPERTIES_PREFIX)) { 861 if (key.indexOf(".level") < 0) { 864 if (System.getProperty(key) == null || 865 System.getProperty(key).length() == 0) { 866 System.setProperty(key, 867 properties.getProperty(key).trim()); 868 } 869 } 870 } 871 } 872 return properties; 873 } 874 875 protected static InputStream getPropertiesInputStream() 876 throws IOException { 877 File file = null; 878 String alternateProperties = System.getProperty(PROPERTIES_KEY); 880 if (alternateProperties != null && alternateProperties.length() > 0) { 881 file = new File (alternateProperties); 882 } 883 if ((file == null || !file.exists()) && getConfdir(false) != null) { 885 file = new File (getConfdir(), PROPERTIES); 886 if (!file.exists()) { 887 file = null; 890 } 891 } 892 InputStream is = (file != null)? 896 new FileInputStream (file): 897 Heritrix.class.getResourceAsStream("/" + PROPERTIES_KEY); 898 if (is == null) { 899 throw new IOException ("Failed to load properties file from" + 900 " filesystem or from classpath."); 901 } 902 return is; 903 } 904 905 917 protected static void patchLogging() 918 throws SecurityException , IOException { 919 if (System.getProperty("java.util.logging.config.class") != null) { 920 return; 921 } 922 923 if (System.getProperty("java.util.logging.config.file") != null) { 924 return; 925 } 926 927 LogManager.getLogManager(). 930 readConfiguration(getPropertiesInputStream()); 931 } 932 933 944 protected static void configureTrustStore() { 945 final String TRUSTSTORE_KEY = "javax.net.ssl.trustStore"; 947 String value = System.getProperty(TRUSTSTORE_KEY); 948 File confdir = null; 949 try { 950 confdir = getConfdir(false); 951 } catch (IOException e) { 952 logger.log(Level.WARNING, "Failed to get confdir.", e); 953 } 954 if ((value == null || value.length() <= 0) && confdir != null) { 955 File heritrixStore = new File (confdir, "heritrix.cacerts"); 957 if(heritrixStore.exists()) { 958 value = heritrixStore.getAbsolutePath(); 959 } 960 } 961 962 if (value != null && value.length() > 0) { 963 System.setProperty(TRUSTSTORE_KEY, value); 964 } 965 } 966 967 977 protected static String selftest(final String oneSelfTestName, 978 final int port) 979 throws Exception { 980 final String SELFTEST = "selftest"; 982 Heritrix.httpServer = new SimpleHttpServer(SELFTEST, 983 Heritrix.adminContext, LOCALHOST_ONLY, port, true); 984 Heritrix.httpServer.setAuthentication(SELFTEST, Heritrix.adminContext, 990 SELFTEST, SELFTEST, SELFTEST); 991 Heritrix.httpServer.startServer(); 992 File selftestDir = (isDevelopment())? 995 new File (getConfdir(), SELFTEST): 996 new File (File.separator + SELFTEST); 997 File crawlOrderFile = new File (selftestDir, "order.xml"); 998 final String ROOTURI = "127.0.0.1:" + Integer.toString(port); 1003 String selfTestUrl = "http://" + ROOTURI + '/'; 1004 if (oneSelfTestName != null && oneSelfTestName.length() > 0) { 1005 selfTestUrl += (oneSelfTestName + '/'); 1006 } 1007 CrawlJobHandler cjh = new SelfTestCrawlJobHandler(getJobsdir(), 1008 oneSelfTestName, selfTestUrl); 1009 Heritrix h = new Heritrix("Selftest", true, cjh); 1010 CrawlJob job = createCrawlJob(cjh, crawlOrderFile, "Template"); 1011 job = h.getJobHandler().newJob(job, null, SELFTEST, 1012 "Integration self test", selfTestUrl, CrawlJob.PRIORITY_CRITICAL); 1013 h.getJobHandler().addJob(job); 1014 CredentialStore cs = (CredentialStore)job.getSettingsHandler(). 1016 getOrder().getAttribute(CredentialStore.ATTR_NAME); 1017 for (Iterator i = cs.iterator(null); i.hasNext();) { 1018 ((Credential)i.next()).setCredentialDomain(null, ROOTURI); 1019 } 1020 h.getJobHandler().startCrawler(); 1021 StringBuffer buffer = new StringBuffer (); 1022 buffer.append("Heritrix " + Heritrix.getVersion() + 1023 " selftest started."); 1024 buffer.append("\nSelftest first crawls " + selfTestUrl + 1025 " and then runs an analysis."); 1026 buffer.append("\nResult of analysis printed to " + 1027 getHeritrixOut() + " when done."); 1028 buffer.append("\nSelftest job directory for logs and arcs:\n" + 1029 job.getDirectory().getAbsolutePath()); 1030 return buffer.toString(); 1031 } 1032 1033 1043 protected String doOneCrawl(String crawlOrderFile) 1044 throws InitializationException, InvalidAttributeValueException { 1045 return doOneCrawl(crawlOrderFile, null); 1046 } 1047 1048 1060 protected String doOneCrawl(String crawlOrderFile, 1061 CrawlStatusListener listener) 1062 throws InitializationException, InvalidAttributeValueException { 1063 XMLSettingsHandler handler = 1064 new XMLSettingsHandler(new File (crawlOrderFile)); 1065 handler.initialize(); 1066 CrawlController controller = new CrawlController(); 1067 controller.initialize(handler); 1068 if (listener != null) { 1069 controller.addCrawlStatusListener(listener); 1070 } 1071 controller.requestCrawlStart(); 1072 return "Crawl started using " + crawlOrderFile + "."; 1073 } 1074 1075 1084 public String launch() throws Exception { 1085 return launch(null, false); 1086 } 1087 1088 1099 public String launch(String crawlOrderFile, boolean runMode) 1100 throws Exception { 1101 String status = null; 1102 if (crawlOrderFile != null) { 1103 addCrawlJob(crawlOrderFile, "Autolaunched", "", ""); 1104 if(runMode) { 1105 this.jobHandler.startCrawler(); 1106 status = "Job being crawled: " + crawlOrderFile; 1107 } else { 1108 status = "Crawl job ready and pending: " + crawlOrderFile; 1109 } 1110 } else if(runMode) { 1111 this.jobHandler.startCrawler(); 1115 status = "Crawler set to run mode."; 1116 } 1117 return status; 1118 } 1119 1120 1129 protected static String startEmbeddedWebserver(final int port, 1130 final boolean lho, final String adminLoginPassword) 1131 throws Exception { 1132 ArrayList <String > hosts = new ArrayList <String >(); 1133 if (lho) { 1134 hosts.add("127.0.0.1"); 1135 } 1136 return startEmbeddedWebserver(hosts, port, adminLoginPassword); 1137 } 1138 1139 1140 1153 private static Collection <String > parseHosts(String hosts) { 1154 hosts = hosts.trim(); 1155 if (hosts.equals("/")) { 1156 return new ArrayList <String >(1); 1157 } 1158 String [] hostArray = hosts.split(","); 1159 for (int i = 0; i < hostArray.length; i++) { 1160 hostArray[i] = hostArray[i].trim(); 1161 } 1162 return Arrays.asList(hostArray); 1163 } 1164 1165 1177 protected static String startEmbeddedWebserver(Collection <String > hosts, 1178 int port, String adminLoginPassword) 1179 throws Exception { 1180 adminUsername = adminLoginPassword. 1181 substring(0, adminLoginPassword.indexOf(":")); 1182 adminPassword = adminLoginPassword. 1183 substring(adminLoginPassword.indexOf(":") + 1); 1184 Heritrix.httpServer = new SimpleHttpServer("admin", 1185 Heritrix.adminContext, hosts, port, false); 1186 1187 final String DOTWAR = ".war"; 1188 final String SELFTEST = "selftest"; 1189 1190 File [] wars = getWarsdir().listFiles(); 1192 for(int i = 0; i < wars.length; i++) { 1193 if(wars[i].isFile()) { 1194 final String warName = wars[i].getName(); 1195 final String warNameNC = warName.toLowerCase(); 1196 if(warNameNC.endsWith(DOTWAR) && 1197 !warNameNC.equals(ADMIN + DOTWAR) && 1198 !warNameNC.equals(SELFTEST + DOTWAR)) { 1199 int dot = warName.indexOf('.'); 1200 Heritrix.httpServer.addWebapp(warName.substring(0, dot), 1201 null, true); 1202 } 1203 } 1204 } 1205 1206 final String ROLE = ADMIN; 1209 Heritrix.httpServer.setAuthentication(ROLE, Heritrix.adminContext, 1210 adminUsername, adminPassword, ROLE); 1211 Heritrix.httpServer.startServer(); 1212 StringBuffer buffer = new StringBuffer (); 1213 buffer.append("Heritrix " + Heritrix.getVersion() + " is running."); 1214 for (String host: httpServer.getHosts()) { 1215 buffer.append("\nWeb console is at: http://"); 1216 buffer.append(host).append(':').append(port); 1217 } 1218 buffer.append("\nWeb console login and password: " + 1219 adminUsername + "/" + adminPassword); 1220 return buffer.toString(); 1221 } 1222 1223 1229 public static void resetAuthentication(String newUsername, 1230 String newPassword) { 1231 Heritrix.httpServer.resetAuthentication(ADMIN, adminUsername, 1232 newUsername, newPassword); 1233 adminUsername = newUsername; 1234 adminPassword = newPassword; 1235 logger.info("administrative login changed to " 1236 +newUsername+":"+newPassword); 1237 } 1238 1239 protected static CrawlJob createCrawlJob(CrawlJobHandler handler, 1240 File crawlOrderFile, String name) 1241 throws InvalidAttributeValueException { 1242 XMLSettingsHandler settings = new XMLSettingsHandler(crawlOrderFile); 1243 settings.initialize(); 1244 return new CrawlJob(handler.getNextJobUID(), name, settings, 1245 new CrawlJobErrorHandler(Level.SEVERE), 1246 CrawlJob.PRIORITY_HIGH, 1247 crawlOrderFile.getAbsoluteFile().getParentFile()); 1248 } 1249 1250 1262 public String addCrawlJob(String orderPathOrUrl, String name, 1263 String description, String seeds) 1264 throws IOException , FatalConfigurationException { 1265 if (!UURI.hasScheme(orderPathOrUrl)) { 1266 return addCrawlJob(new File (orderPathOrUrl), name, description, 1268 seeds); 1269 } 1270 1271 URL url = new URL (orderPathOrUrl); 1273 1274 String result = null; 1278 URLConnection connection = url.openConnection(); 1279 if (connection instanceof HttpURLConnection ) { 1280 result = addCrawlJob(url, (HttpURLConnection )connection, name, 1281 description, seeds); 1282 } else if (connection instanceof FileURLConnection) { 1283 result = addCrawlJob(new File (url.getPath()), name, description, 1284 seeds); 1285 } else { 1286 throw new UnsupportedOperationException ("No support for " 1287 + connection); 1288 } 1289 1290 return result; 1291 } 1292 1293 protected String addCrawlJob(final URL url, 1294 final HttpURLConnection connection, 1295 final String name, final String description, final String seeds) 1296 throws IOException , FatalConfigurationException { 1297 boolean isJar = url.getPath() != null && 1299 url.getPath().toLowerCase().endsWith(JAR_SUFFIX); 1300 File localFile = File.createTempFile(Heritrix.class.getName(), 1302 isJar? JAR_SUFFIX: null, TMPDIR); 1303 connection.connect(); 1304 String result = null; 1305 try { 1306 IoUtils.readFullyToFile(connection.getInputStream(), localFile); 1307 result = addCrawlJob(localFile, name, description, seeds); 1308 } catch (IOException ioe) { 1309 localFile.delete(); 1311 localFile = null; 1312 } finally { 1313 connection.disconnect(); 1314 if (isJar && localFile != null && localFile.exists()) { 1318 localFile.delete(); 1319 } 1320 } 1321 return result; 1322 } 1323 1324 protected String addCrawlJob(final File order, final String name, 1325 final String description, final String seeds) 1326 throws FatalConfigurationException, IOException { 1327 CrawlJob addedJob = null; 1328 if (this.jobHandler == null) { 1329 throw new NullPointerException ("Heritrix jobhandler is null."); 1330 } 1331 try { 1332 if (order.getName().toLowerCase().endsWith(JAR_SUFFIX)) { 1333 return addCrawlJobBasedonJar(order, name, description, seeds); 1334 } 1335 addedJob = this.jobHandler. 1336 addJob(createCrawlJob(this.jobHandler, order, name)); 1337 } catch (InvalidAttributeValueException e) { 1338 FatalConfigurationException fce = new FatalConfigurationException( 1339 "Converted InvalidAttributeValueException on " + 1340 order.getAbsolutePath() + ": " + e.getMessage()); 1341 fce.setStackTrace(e.getStackTrace()); 1342 } 1343 return addedJob != null? addedJob.getUID(): null; 1344 } 1345 1346 1356 protected String addCrawlJobBasedonJar(final File jarFile, 1357 final String name, final String description, final String seeds) 1358 throws IOException , FatalConfigurationException { 1359 if (jarFile == null || !jarFile.exists()) { 1360 throw new FileNotFoundException (jarFile.getAbsolutePath()); 1361 } 1362 File dir = File.createTempFile(Heritrix.class.getName(), ".expandedjar", 1367 TMPDIR); 1368 dir.delete(); 1369 dir.mkdir(); 1370 try { 1371 org.archive.crawler.util.IoUtils.unzip(jarFile, dir); 1372 File orderFile = new File (dir, "order.xml"); 1374 if (!orderFile.exists()) { 1375 throw new IOException ("Missing order: " + 1376 orderFile.getAbsolutePath()); 1377 } 1378 CrawlJob job = 1379 createCrawlJobBasedOn(orderFile, name, description, seeds); 1380 File seedsFile = new File (dir, "seeds.txt"); 1383 if (seedsFile.exists()) { 1384 FileUtils.copyFiles(seedsFile, new File (job.getDirectory(), 1385 seedsFile.getName())); 1386 } 1387 File settingsDir = new File (dir, "settings"); 1388 if (settingsDir.exists()) { 1389 FileUtils.copyFiles(settingsDir, job.getDirectory()); 1390 } 1391 addCrawlJob(job); 1392 return job.getUID(); 1393 } finally { 1394 org.archive.util.FileUtils.deleteDir(dir); 1400 } 1401 } 1402 1403 public String addCrawlJobBasedOn(String jobUidOrProfile, 1404 String name, String description, String seeds) { 1405 try { 1406 CrawlJob cj = getJobHandler().getJob(jobUidOrProfile); 1407 if (cj == null) { 1408 throw new InvalidAttributeValueException (jobUidOrProfile + 1409 " is not a job UID or profile name (Job UIDs are " + 1410 " usually the 14 digit date portion of job name)."); 1411 } 1412 CrawlJob job = addCrawlJobBasedOn( 1413 cj.getSettingsHandler().getOrderFile(), name, description, 1414 seeds); 1415 return job.getUID(); 1416 } catch (Exception e) { 1417 e.printStackTrace(); 1418 return "Exception on " + jobUidOrProfile + ": " + e.getMessage(); 1419 } 1420 } 1421 1422 protected CrawlJob addCrawlJobBasedOn(final File orderFile, 1423 final String name, final String description, final String seeds) 1424 throws FatalConfigurationException { 1425 return addCrawlJob(createCrawlJobBasedOn(orderFile, name, description, 1426 seeds)); 1427 } 1428 1429 protected CrawlJob createCrawlJobBasedOn(final File orderFile, 1430 final String name, final String description, final String seeds) 1431 throws FatalConfigurationException { 1432 CrawlJob job = getJobHandler().newJob(orderFile, name, description, 1433 seeds); 1434 return CrawlJobHandler.ensureNewJobWritten(job, name, description); 1435 } 1436 1437 protected CrawlJob addCrawlJob(final CrawlJob job) { 1438 return getJobHandler().addJob(job); 1439 } 1440 1441 public void startCrawling() { 1442 if (getJobHandler() == null) { 1443 throw new NullPointerException ("Heritrix jobhandler is null."); 1444 } 1445 getJobHandler().startCrawler(); 1446 } 1447 1448 public void stopCrawling() { 1449 if (getJobHandler() == null) { 1450 throw new NullPointerException ("Heritrix jobhandler is null."); 1451 } 1452 getJobHandler().stopCrawler(); 1453 } 1454 1455 1460 public static String getVersion() { 1461 return System.getProperty("heritrix.version"); 1462 } 1463 1464 1469 public CrawlJobHandler getJobHandler() { 1470 return this.jobHandler; 1471 } 1472 1473 1479 public static File getConfdir() 1480 throws IOException { 1481 return getConfdir(true); 1482 } 1483 1484 1492 public static File getConfdir(final boolean fail) 1493 throws IOException { 1494 final String key = "heritrix.conf"; 1495 String tmp = System.getProperty(key); 1497 if (tmp == null || tmp.length() == 0) { 1499 return getSubDir("conf", fail); 1500 } 1501 File dir = new File (tmp); 1502 if (!dir.exists()) { 1503 if (fail) { 1504 throw new IOException ("Cannot find conf dir: " + tmp); 1505 } else { 1506 logger.log(Level.WARNING, "Specified " + key + 1507 " dir does not exist. Falling back on default"); 1508 } 1509 dir = getSubDir("conf", fail); 1510 } 1511 return dir; 1512 } 1513 1514 1517 public static SimpleHttpServer getHttpServer() { 1518 return Heritrix.httpServer; 1519 } 1520 1521 1526 public static File getWarsdir() 1527 throws IOException { 1528 return getSubDir("webapps"); 1529 } 1530 1531 1539 public static void prepareHeritrixShutDown() { 1540 final Object [] keys = Heritrix.instances.keySet().toArray(); 1544 for (int i = 0; i < keys.length; i++) { 1545 ((Heritrix)Heritrix.instances.get(keys[i])).destroy(); 1546 } 1547 1548 try { 1549 deregisterJndi(getJndiContainerName()); 1550 } catch (NameNotFoundException e) { 1551 logger.log(Level.WARNING, "deregistration of jndi", e); 1553 } catch (Exception e) { 1554 e.printStackTrace(); 1555 } 1556 1557 if(Heritrix.httpServer != null) { 1558 try { 1560 Heritrix.httpServer.stopServer(); 1561 } catch (InterruptedException e) { 1562 e.printStackTrace(); 1565 } finally { 1566 Heritrix.httpServer = null; 1567 } 1568 } 1569 } 1570 1571 1575 public static void performHeritrixShutDown() { 1576 performHeritrixShutDown(0); 1577 } 1578 1579 1586 public static void performHeritrixShutDown(int exitCode) { 1587 System.exit(exitCode); 1588 } 1589 1590 1595 public static void shutdown(final int exitCode) { 1596 getShutdownThread(true, exitCode, "Heritrix shutdown").start(); 1597 } 1598 1599 protected static Thread getShutdownThread(final boolean sysexit, 1600 final int exitCode, final String name) { 1601 Thread t = new Thread (name) { 1602 public void run() { 1603 Heritrix.prepareHeritrixShutDown(); 1604 if (sysexit) { 1605 Heritrix.performHeritrixShutDown(exitCode); 1606 } 1607 } 1608 }; 1609 t.setDaemon(true); 1610 return t; 1611 } 1612 1613 public static void shutdown() { 1614 shutdown(0); 1615 } 1616 1617 1651 protected static void registerHeritrix(final Heritrix h, 1652 final String name, final boolean jmxregister) 1653 throws MalformedObjectNameException , InstanceAlreadyExistsException , 1654 MBeanRegistrationException , NotCompliantMBeanException { 1655 MBeanServer server = getMBeanServer(); 1656 if (server != null) { 1657 if (jmxregister) { 1661 ObjectName objName = (name == null || name.length() <= 0)? 1662 getJmxObjectName(): getJmxObjectName(name); 1663 registerMBean(server, h, objName); 1664 } 1665 } else { 1666 Heritrix.instances.put(h.getNoJmxName(), h); 1671 } 1672 } 1673 1674 protected static void unregisterHeritrix(final Heritrix h) 1675 throws InstanceNotFoundException , MBeanRegistrationException , 1676 NullPointerException { 1677 MBeanServer server = getMBeanServer(); 1678 if (server != null) { 1679 server.unregisterMBean(h.mbeanName); 1680 } else { 1681 Heritrix.instances.remove(h.getNoJmxName()); 1684 } 1685 } 1686 1687 1694 public static MBeanServer getMBeanServer() { 1695 MBeanServer result = null; 1696 List servers = MBeanServerFactory.findMBeanServer(null); 1697 if (servers == null) { 1698 return result; 1699 } 1700 for (Iterator i = servers.iterator(); i.hasNext();) { 1701 MBeanServer server = (MBeanServer )i.next(); 1702 if (server == null) { 1703 continue; 1704 } 1705 result = server; 1706 break; 1707 } 1708 return result; 1709 } 1710 1711 public static MBeanServer registerMBean(final Object objToRegister, 1712 final String name, final String type) 1713 throws InstanceAlreadyExistsException , MBeanRegistrationException , 1714 NotCompliantMBeanException { 1715 MBeanServer server = getMBeanServer(); 1716 if (server != null) { 1717 server = registerMBean(server, objToRegister, name, type); 1718 } 1719 return server; 1720 } 1721 1722 public static MBeanServer registerMBean(final MBeanServer server, 1723 final Object objToRegister, final String name, final String type) 1724 throws InstanceAlreadyExistsException , MBeanRegistrationException , 1725 NotCompliantMBeanException { 1726 try { 1727 Hashtable <String ,String > ht = new Hashtable <String ,String >(); 1728 ht.put(JmxUtils.NAME, name); 1729 ht.put(JmxUtils.TYPE, type); 1730 registerMBean(server, objToRegister, 1731 new ObjectName (CRAWLER_PACKAGE, ht)); 1732 } catch (MalformedObjectNameException e) { 1733 e.printStackTrace(); 1734 } 1735 return server; 1736 } 1737 1738 public static MBeanServer registerMBean(final MBeanServer server, 1739 final Object objToRegister, final ObjectName objName) 1740 throws InstanceAlreadyExistsException , MBeanRegistrationException , 1741 NotCompliantMBeanException { 1742 server.registerMBean(objToRegister, objName); 1743 return server; 1744 } 1745 1746 public static void unregisterMBean(final MBeanServer server, 1747 final String name, final String type) { 1748 if (server == null) { 1749 return; 1750 } 1751 try { 1752 unregisterMBean(server, getJmxObjectName(name, type)); 1753 } catch (MalformedObjectNameException e) { 1754 e.printStackTrace(); 1755 } 1756 } 1757 1758 public static void unregisterMBean(final MBeanServer server, 1759 final ObjectName name) { 1760 try { 1761 server.unregisterMBean(name); 1762 logger.info("Unregistered bean " + name.getCanonicalName()); 1763 } catch (InstanceNotFoundException e) { 1764 e.printStackTrace(); 1765 } catch (MBeanRegistrationException e) { 1766 e.printStackTrace(); 1767 } catch (NullPointerException e) { 1768 e.printStackTrace(); 1769 } 1770 } 1771 1772 1775 protected String getNoJmxName() { 1776 return this.getClass().getName(); 1777 } 1778 1779 public static ObjectName getJmxObjectName() 1780 throws MalformedObjectNameException , NullPointerException { 1781 return getJmxObjectName("Heritrix", JmxUtils.SERVICE); 1782 } 1783 1784 public static ObjectName getJmxObjectName(final String name) 1785 throws MalformedObjectNameException , NullPointerException { 1786 return getJmxObjectName(name, JmxUtils.SERVICE); 1787 } 1788 1789 public static ObjectName getJmxObjectName(final String name, 1790 final String type) 1791 throws MalformedObjectNameException , NullPointerException { 1792 Hashtable <String ,String > ht = new Hashtable <String ,String >(); 1793 ht.put(JmxUtils.NAME, name); 1794 ht.put(JmxUtils.TYPE, type); 1795 return new ObjectName (CRAWLER_PACKAGE, ht); 1796 } 1797 1798 1804 public static boolean isCommandLine() { 1805 return Heritrix.commandLine; 1806 } 1807 1808 1811 public boolean isStarted() { 1812 return this.jobHandler != null; 1813 } 1814 1815 public String getStatus() { 1816 StringBuffer buffer = new StringBuffer (); 1817 if (this.getJobHandler() != null) { 1818 buffer.append("isRunning="); 1819 buffer.append(this.getJobHandler().isRunning()); 1820 buffer.append(" isCrawling="); 1821 buffer.append(this.getJobHandler().isCrawling()); 1822 buffer.append(" alertCount="); 1823 buffer.append(getAlertsCount()); 1824 buffer.append(" newAlertCount="); 1825 buffer.append(getNewAlertsCount()); 1826 if (this.getJobHandler().isCrawling()) { 1827 buffer.append(" currentJob="); 1828 buffer.append(this.getJobHandler().getCurrentJob(). 1829 getJmxJobName()); 1830 } 1831 } 1832 return buffer.toString(); 1833 } 1834 1835 public int getAlertsCount() { 1837 return this.alertManager.getCount(); 1838 } 1839 1840 public int getNewAlertsCount() { 1841 return this.alertManager.getNewCount(); 1842 } 1843 1844 public Vector getAlerts() { 1845 return this.alertManager.getAll(); 1846 } 1847 1848 public Vector getNewAlerts() { 1849 return this.alertManager.getNewAll(); 1850 } 1851 1852 public SinkHandlerLogRecord getAlert(final String id) { 1853 return this.alertManager.get(id); 1854 } 1855 1856 public void readAlert(final String id) { 1857 this.alertManager.read(id); 1858 } 1859 1860 public void removeAlert(final String id) { 1861 this.alertManager.remove(id); 1862 } 1863 1864 1873 public void start() { 1874 if (!Heritrix.isCommandLine() && !isStarted()) { 1877 try { 1878 logger.info(launch()); 1879 } catch (Exception e) { 1880 e.printStackTrace(); 1881 } 1882 } 1883 } 1884 1885 1890 public void stop() { 1891 if (this.jobHandler != null) { 1892 this.jobHandler.stop(); 1893 } 1894 } 1895 1896 public String interrupt(String threadName) { 1897 String result = "Thread " + threadName + " not found"; 1898 ThreadGroup group = Thread.currentThread().getThreadGroup(); 1899 if (group == null) { 1900 return result; 1901 } 1902 ThreadGroup parent = null; 1905 while((parent = group.getParent()) != null) { 1906 group = parent; 1907 } 1908 final int max = group.activeCount() * 2; 1911 Thread [] threads = new Thread [max]; 1912 int threadCount = group.enumerate(threads, true); 1913 if (threadCount >= max) { 1914 logger.info("Some threads not found...array too small: " + 1915 max); 1916 } 1917 for (int j = 0; j < threadCount; j++) { 1918 if (threads[j].getName().equals(threadName)) { 1919 threads[j].interrupt(); 1920 result = "Interrupt sent to " + threadName; 1921 break; 1922 } 1923 } 1924 return result; 1925 } 1926 1927 1929 1933 protected OpenMBeanInfoSupport buildMBeanInfo() { 1934 OpenMBeanAttributeInfoSupport [] attributes = 1935 new OpenMBeanAttributeInfoSupport [Heritrix.ATTRIBUTE_LIST.size()]; 1936 OpenMBeanConstructorInfoSupport [] constructors = 1937 new OpenMBeanConstructorInfoSupport [1]; 1938 OpenMBeanOperationInfoSupport [] operations = 1939 new OpenMBeanOperationInfoSupport [Heritrix.OPERATION_LIST.size()]; 1940 MBeanNotificationInfo [] notifications = 1941 new MBeanNotificationInfo [0]; 1942 1943 attributes[0] = 1945 new OpenMBeanAttributeInfoSupport (Heritrix.STATUS_ATTR, 1946 "Short basic status message", SimpleType.STRING, true, 1947 false, false); 1948 attributes[1] = 1950 new OpenMBeanAttributeInfoSupport (Heritrix.VERSION_ATTR, 1951 "Heritrix version", SimpleType.STRING, true, false, false); 1952 1953 constructors[0] = new OpenMBeanConstructorInfoSupport ( 1955 "HeritrixOpenMBean", "Constructs Heritrix OpenMBean instance ", 1956 new OpenMBeanParameterInfoSupport [0]); 1957 1958 operations[0] = new OpenMBeanOperationInfoSupport ( 1960 Heritrix.START_OPER, "Start Heritrix instance", null, 1961 SimpleType.VOID, MBeanOperationInfo.ACTION); 1962 1963 operations[1] = new OpenMBeanOperationInfoSupport ( 1964 Heritrix.STOP_OPER, "Stop Heritrix instance", null, 1965 SimpleType.VOID, MBeanOperationInfo.ACTION); 1966 1967 OpenMBeanParameterInfo [] args = new OpenMBeanParameterInfoSupport [1]; 1968 args[0] = new OpenMBeanParameterInfoSupport ("threadName", 1969 "Name of thread to send interrupt", SimpleType.STRING); 1970 operations[2] = new OpenMBeanOperationInfoSupport ( 1971 Heritrix.INTERRUPT_OPER, "Send thread an interrupt " + 1972 "(Used debugging)", args, SimpleType.STRING, 1973 MBeanOperationInfo.ACTION_INFO); 1974 1975 operations[3] = new OpenMBeanOperationInfoSupport ( 1976 Heritrix.START_CRAWLING_OPER, "Set Heritrix instance " + 1977 "into crawling mode", null, SimpleType.VOID, 1978 MBeanOperationInfo.ACTION); 1979 1980 operations[4] = new OpenMBeanOperationInfoSupport ( 1981 Heritrix.STOP_CRAWLING_OPER, "Unset Heritrix instance " + 1982 " crawling mode", null, SimpleType.VOID, 1983 MBeanOperationInfo.ACTION); 1984 1985 args = new OpenMBeanParameterInfoSupport [4]; 1986 args[0] = new OpenMBeanParameterInfoSupport ("pathOrURL", 1987 "Path/URL to order or jar of order+seed", 1988 SimpleType.STRING); 1989 args[1] = new OpenMBeanParameterInfoSupport ("name", 1990 "Basename for new job", SimpleType.STRING); 1991 args[2] = new OpenMBeanParameterInfoSupport ("description", 1992 "Description to save with new job", SimpleType.STRING); 1993 args[3] = new OpenMBeanParameterInfoSupport ("seeds", 1994 "Initial seed(s)", SimpleType.STRING); 1995 operations[5] = new OpenMBeanOperationInfoSupport ( 1996 Heritrix.ADD_CRAWL_JOB_OPER, "Add new crawl job", args, 1997 SimpleType.STRING, MBeanOperationInfo.ACTION_INFO); 1998 1999 args = new OpenMBeanParameterInfoSupport [4]; 2000 args[0] = new OpenMBeanParameterInfoSupport ("uidOrName", 2001 "Job UID or profile name", SimpleType.STRING); 2002 args[1] = new OpenMBeanParameterInfoSupport ("name", 2003 "Basename for new job", SimpleType.STRING); 2004 args[2] = new OpenMBeanParameterInfoSupport ("description", 2005 "Description to save with new job", SimpleType.STRING); 2006 args[3] = new OpenMBeanParameterInfoSupport ("seeds", 2007 "Initial seed(s)", SimpleType.STRING); 2008 operations[6] = new OpenMBeanOperationInfoSupport ( 2009 Heritrix.ADD_CRAWL_JOB_BASEDON_OPER, 2010 "Add a new crawl job based on passed Job UID or profile", 2011 args, SimpleType.STRING, MBeanOperationInfo.ACTION_INFO); 2012 2013 args = new OpenMBeanParameterInfoSupport [1]; 2014 args[0] = new OpenMBeanParameterInfoSupport ("UID", 2015 "Job UID", SimpleType.STRING); 2016 operations[7] = new OpenMBeanOperationInfoSupport (DELETE_CRAWL_JOB_OPER, 2017 "Delete/stop this crawl job", args, SimpleType.VOID, 2018 MBeanOperationInfo.ACTION); 2019 2020 args = new OpenMBeanParameterInfoSupport [1]; 2021 args[0] = new OpenMBeanParameterInfoSupport ("index", 2022 "Zero-based index into array of alerts", SimpleType.INTEGER); 2023 operations[8] = new OpenMBeanOperationInfoSupport ( 2024 Heritrix.ALERT_OPER, "Return alert at passed index", args, 2025 SimpleType.STRING, MBeanOperationInfo.ACTION_INFO); 2026 2027 try { 2028 this.jobCompositeType = new CompositeType ("job", 2029 "Job attributes", JOB_KEYS, 2030 new String [] {"Job unique ID", "Job name", "Job status"}, 2031 new OpenType [] {SimpleType.STRING, SimpleType.STRING, 2032 SimpleType.STRING}); 2033 this.jobsTabularType = new TabularType ("jobs", "List of jobs", 2034 this.jobCompositeType, new String [] {"uid"}); 2035 } catch (OpenDataException e) { 2036 throw new RuntimeException (e); 2038 } 2039 operations[9] = new OpenMBeanOperationInfoSupport ( 2040 Heritrix.PENDING_JOBS_OPER, 2041 "List of pending jobs (or null if none)", null, 2042 this.jobsTabularType, MBeanOperationInfo.INFO); 2043 operations[10] = new OpenMBeanOperationInfoSupport ( 2044 Heritrix.COMPLETED_JOBS_OPER, 2045 "List of completed jobs (or null if none)", null, 2046 this.jobsTabularType, MBeanOperationInfo.INFO); 2047 2048 args = new OpenMBeanParameterInfoSupport [2]; 2049 args[0] = new OpenMBeanParameterInfoSupport ("uid", 2050 "Job unique ID", SimpleType.STRING); 2051 args[1] = new OpenMBeanParameterInfoSupport ("name", 2052 "Report name (e.g. crawl-report, etc.)", 2053 SimpleType.STRING); 2054 operations[11] = new OpenMBeanOperationInfoSupport ( 2055 Heritrix.CRAWLEND_REPORT_OPER, "Return crawl-end report", args, 2056 SimpleType.STRING, MBeanOperationInfo.ACTION_INFO); 2057 2058 operations[12] = new OpenMBeanOperationInfoSupport ( 2059 Heritrix.SHUTDOWN_OPER, "Shutdown container", null, 2060 SimpleType.VOID, MBeanOperationInfo.ACTION); 2061 2062 args = new OpenMBeanParameterInfoSupport [2]; 2063 args[0] = new OpenMBeanParameterInfoSupport ("level", 2064 "Log level: e.g. SEVERE, WARNING, etc.", SimpleType.STRING); 2065 args[1] = new OpenMBeanParameterInfoSupport ("message", 2066 "Log message", SimpleType.STRING); 2067 operations[13] = new OpenMBeanOperationInfoSupport (Heritrix.LOG_OPER, 2068 "Add a log message", args, SimpleType.VOID, 2069 MBeanOperationInfo.ACTION); 2070 2071 operations[14] = new OpenMBeanOperationInfoSupport ( 2072 Heritrix.DESTROY_OPER, "Destroy Heritrix instance", null, 2073 SimpleType.VOID, MBeanOperationInfo.ACTION); 2074 2075 operations[15] = new OpenMBeanOperationInfoSupport ( 2076 Heritrix.TERMINATE_CRAWL_JOB_OPER, 2077 "Returns false if no current job", null, SimpleType.BOOLEAN, 2078 MBeanOperationInfo.ACTION); 2079 2080 operations[16] = new OpenMBeanOperationInfoSupport ( 2081 Heritrix.REBIND_JNDI_OPER, 2082 "Rebinds this Heritrix with JNDI.", null, 2083 SimpleType.VOID, MBeanOperationInfo.ACTION); 2084 2085 return new OpenMBeanInfoSupport (this.getClass().getName(), 2087 "Heritrix Main OpenMBean", attributes, constructors, operations, 2088 notifications); 2089 } 2090 2091 public Object getAttribute(String attribute_name) 2092 throws AttributeNotFoundException { 2093 if (attribute_name == null) { 2094 throw new RuntimeOperationsException ( 2095 new IllegalArgumentException ("Attribute name cannot be null"), 2096 "Cannot call getAttribute with null attribute name"); 2097 } 2098 if (!Heritrix.ATTRIBUTE_LIST.contains(attribute_name)) { 2099 throw new AttributeNotFoundException ("Attribute " + 2100 attribute_name + " is unimplemented."); 2101 } 2102 if (attribute_name.equals(STATUS_ATTR)) { 2107 return getStatus(); 2108 } 2109 if (attribute_name.equals(VERSION_ATTR)) { 2110 return getVersion(); 2111 } 2112 throw new AttributeNotFoundException ("Attribute " + 2113 attribute_name + " not found."); 2114 } 2115 2116 public void setAttribute(Attribute attribute) 2117 throws AttributeNotFoundException { 2118 throw new AttributeNotFoundException ("No attribute can be set in " + 2119 "this MBean"); 2120 } 2121 2122 public AttributeList getAttributes(String [] attributeNames) { 2123 if (attributeNames == null) { 2124 throw new RuntimeOperationsException ( 2125 new IllegalArgumentException ("attributeNames[] cannot be " + 2126 "null"), "Cannot call getAttributes with null attribute " + 2127 "names"); 2128 } 2129 AttributeList resultList = new AttributeList (); 2130 if (attributeNames.length == 0) { 2131 return resultList; 2132 } 2133 for (int i = 0; i < attributeNames.length; i++) { 2134 try { 2135 Object value = getAttribute(attributeNames[i]); 2136 resultList.add(new Attribute (attributeNames[i], value)); 2137 } catch (Exception e) { 2138 e.printStackTrace(); 2139 } 2140 } 2141 return(resultList); 2142 } 2143 2144 public AttributeList setAttributes(AttributeList attributes) { 2145 return new AttributeList (); } 2147 2148 public Object invoke(final String operationName, final Object [] params, 2149 final String [] signature) 2150 throws ReflectionException { 2151 if (operationName == null) { 2152 throw new RuntimeOperationsException ( 2153 new IllegalArgumentException ("Operation name cannot be null"), 2154 "Cannot call invoke with null operation name"); 2155 } 2156 if (operationName.equals(START_OPER)) { 2161 JmxUtils.checkParamsCount(START_OPER, params, 0); 2162 start(); 2163 return null; 2164 } 2165 if (operationName.equals(STOP_OPER)) { 2166 JmxUtils.checkParamsCount(STOP_OPER, params, 0); 2167 stop(); 2168 return null; 2169 } 2170 if (operationName.equals(DESTROY_OPER)) { 2171 JmxUtils.checkParamsCount(DESTROY_OPER, params, 0); 2172 destroy(); 2173 return null; 2174 } 2175 if (operationName.equals(TERMINATE_CRAWL_JOB_OPER)) { 2176 JmxUtils.checkParamsCount(TERMINATE_CRAWL_JOB_OPER, params, 0); 2177 return new Boolean (this.jobHandler.terminateCurrentJob()); 2178 } 2179 if (operationName.equals(REBIND_JNDI_OPER)) { 2180 JmxUtils.checkParamsCount(REBIND_JNDI_OPER, params, 0); 2181 try { 2182 registerContainerJndi(); 2183 } catch (MalformedObjectNameException e) { 2184 throw new RuntimeOperationsException (new RuntimeException (e)); 2185 } catch (UnknownHostException e) { 2186 throw new RuntimeOperationsException (new RuntimeException (e)); 2187 } catch (NamingException e) { 2188 throw new RuntimeOperationsException (new RuntimeException (e)); 2189 } 2190 return null; 2191 } 2192 if (operationName.equals(SHUTDOWN_OPER)) { 2193 JmxUtils.checkParamsCount(SHUTDOWN_OPER, params, 0); 2194 Heritrix.shutdown(); 2195 return null; 2196 } 2197 if (operationName.equals(LOG_OPER)) { 2198 JmxUtils.checkParamsCount(LOG_OPER, params, 2); 2199 logger.log(Level.parse((String )params[0]), (String )params[1]); 2200 return null; 2201 } 2202 if (operationName.equals(INTERRUPT_OPER)) { 2203 JmxUtils.checkParamsCount(INTERRUPT_OPER, params, 1); 2204 return interrupt((String )params[0]); 2205 } 2206 if (operationName.equals(START_CRAWLING_OPER)) { 2207 JmxUtils.checkParamsCount(START_CRAWLING_OPER, params, 0); 2208 startCrawling(); 2209 return null; 2210 } 2211 if (operationName.equals(STOP_CRAWLING_OPER)) { 2212 JmxUtils.checkParamsCount(STOP_CRAWLING_OPER, params, 0); 2213 stopCrawling(); 2214 return null; 2215 } 2216 if (operationName.equals(ADD_CRAWL_JOB_OPER)) { 2217 JmxUtils.checkParamsCount(ADD_CRAWL_JOB_OPER, params, 4); 2218 try { 2219 return addCrawlJob((String )params[0], (String )params[1], 2220 checkForEmptyPlaceHolder((String )params[2]), 2221 checkForEmptyPlaceHolder((String )params[3])); 2222 } catch (IOException e) { 2223 throw new RuntimeOperationsException (new RuntimeException (e)); 2224 } catch (FatalConfigurationException e) { 2225 throw new RuntimeOperationsException (new RuntimeException (e)); 2226 } 2227 } 2228 if (operationName.equals(DELETE_CRAWL_JOB_OPER)) { 2229 JmxUtils.checkParamsCount(DELETE_CRAWL_JOB_OPER, params, 1); 2230 this.jobHandler.deleteJob((String )params[0]); 2231 return null; 2232 } 2233 2234 if (operationName.equals(ADD_CRAWL_JOB_BASEDON_OPER)) { 2235 JmxUtils.checkParamsCount(ADD_CRAWL_JOB_BASEDON_OPER, params, 4); 2236 return addCrawlJobBasedOn((String )params[0], (String )params[1], 2237 checkForEmptyPlaceHolder((String )params[2]), 2238 checkForEmptyPlaceHolder((String )params[3])); 2239 } 2240 if (operationName.equals(ALERT_OPER)) { 2241 JmxUtils.checkParamsCount(ALERT_OPER, params, 1); 2242 SinkHandlerLogRecord slr = null; 2243 if (this.alertManager.getCount() > 0) { 2244 slr = (SinkHandlerLogRecord)this.alertManager.getAll(). 2248 get(((Integer )params[0]).intValue()); 2249 } 2250 return (slr != null)? slr.toString(): null; 2251 } 2252 2253 if (operationName.equals(PENDING_JOBS_OPER)) { 2254 JmxUtils.checkParamsCount(PENDING_JOBS_OPER, params, 0); 2255 try { 2256 return makeJobsTabularData(getJobHandler().getPendingJobs()); 2257 } catch (OpenDataException e) { 2258 throw new RuntimeOperationsException (new RuntimeException (e)); 2259 } 2260 } 2261 2262 if (operationName.equals(COMPLETED_JOBS_OPER)) { 2263 JmxUtils.checkParamsCount(COMPLETED_JOBS_OPER, params, 0); 2264 try { 2265 return makeJobsTabularData(getJobHandler().getCompletedJobs()); 2266 } catch (OpenDataException e) { 2267 throw new RuntimeOperationsException (new RuntimeException (e)); 2268 } 2269 } 2270 2271 if (operationName.equals(CRAWLEND_REPORT_OPER)) { 2272 JmxUtils.checkParamsCount(CRAWLEND_REPORT_OPER, params, 2); 2273 try { 2274 return getCrawlendReport((String )params[0], (String ) params[1]); 2275 } catch (IOException e) { 2276 throw new RuntimeOperationsException (new RuntimeException (e)); 2277 } 2278 } 2279 2280 throw new ReflectionException ( 2281 new NoSuchMethodException (operationName), 2282 "Cannot find the operation " + operationName); 2283 } 2284 2285 2295 protected String getCrawlendReport(String jobUid, String reportName) 2296 throws IOException { 2297 CrawlJob job = getJobHandler().getJob(jobUid); 2298 if (job == null) { 2299 throw new IOException ("No such job: " + jobUid); 2300 } 2301 File report = new File (job.getDirectory(), reportName + ".txt"); 2302 if (!report.exists()) { 2303 throw new FileNotFoundException (report.getAbsolutePath()); 2304 } 2305 return FileUtils.readFileAsString(report); 2306 } 2307 2308 protected TabularData makeJobsTabularData(List jobs) 2309 throws OpenDataException { 2310 if (jobs == null || jobs.size() == 0) { 2311 return null; 2312 } 2313 TabularData td = new TabularDataSupport (this.jobsTabularType); 2314 for (Iterator i = jobs.iterator(); i.hasNext();) { 2315 CrawlJob job = (CrawlJob)i.next(); 2316 CompositeData cd = new CompositeDataSupport (this.jobCompositeType, 2317 JOB_KEYS, 2318 new String [] {job.getUID(), job.getJobName(), job.getStatus()}); 2319 td.put(cd); 2320 } 2321 return td; 2322 } 2323 2324 2334 protected String checkForEmptyPlaceHolder(String str) { 2335 return TextUtils.matches("-| +", str)? "": str; 2336 } 2337 2338 public MBeanInfo getMBeanInfo() { 2339 return this.openMBeanInfo; 2340 } 2341 2342 2346 public ObjectName getMBeanName() { 2347 return this.mbeanName; 2348 } 2349 2350 public ObjectName preRegister(MBeanServer server, ObjectName name) 2351 throws Exception { 2352 this.mbeanServer = server; 2353 @SuppressWarnings ("unchecked") 2354 Hashtable <String ,String > ht = name.getKeyPropertyList(); 2355 if (!ht.containsKey(JmxUtils.NAME)) { 2356 throw new IllegalArgumentException ("Name property required" + 2357 name.getCanonicalName()); 2358 } 2359 if (!ht.containsKey(JmxUtils.TYPE)) { 2360 ht.put(JmxUtils.TYPE, JmxUtils.SERVICE); 2361 name = new ObjectName (name.getDomain(), ht); 2362 } 2363 this.mbeanName = addGuiPort(addVitals(name)); 2364 Heritrix.instances.put(this.mbeanName. 2365 getCanonicalKeyPropertyListString(), this); 2366 return this.mbeanName; 2367 } 2368 2369 2377 protected static ObjectName addVitals(ObjectName name) 2378 throws UnknownHostException , MalformedObjectNameException , 2379 NullPointerException { 2380 @SuppressWarnings ("unchecked") 2381 Hashtable <String ,String > ht = name.getKeyPropertyList(); 2382 if (!ht.containsKey(JmxUtils.HOST)) { 2383 ht.put(JmxUtils.HOST, InetAddress.getLocalHost().getHostName()); 2384 name = new ObjectName (name.getDomain(), ht); 2385 } 2386 if (!ht.containsKey(JmxUtils.JMX_PORT)) { 2387 String p = System.getProperty("com.sun.management.jmxremote.port"); 2392 if (p != null && p.length() > 0) { 2393 ht.put(JmxUtils.JMX_PORT, p); 2394 name = new ObjectName (name.getDomain(), ht); 2395 } 2396 } 2397 return name; 2398 } 2399 2400 protected static ObjectName addGuiPort(ObjectName name) 2401 throws MalformedObjectNameException , NullPointerException { 2402 @SuppressWarnings ("unchecked") 2403 Hashtable <String ,String > ht = name.getKeyPropertyList(); 2404 if (!ht.containsKey(JmxUtils.GUI_PORT)) { 2405 if (Heritrix.gui) { 2407 ht.put(JmxUtils.GUI_PORT, Integer.toString(Heritrix.guiPort)); 2408 name = new ObjectName (name.getDomain(), ht); 2409 } 2410 } 2411 return name; 2412 } 2413 2414 public void postRegister(Boolean registrationDone) { 2415 if (logger.isLoggable(Level.INFO)) { 2416 logger.info( 2417 JmxUtils.getLogRegistrationMsg(this.mbeanName.getCanonicalName(), 2418 this.mbeanServer, registrationDone.booleanValue())); 2419 } 2420 try { 2421 registerJndi(this.mbeanName); 2422 } catch (Exception e) { 2423 logger.log(Level.SEVERE, "Failed jndi registration", e); 2424 } 2425 } 2426 2427 public void preDeregister() throws Exception { 2428 deregisterJndi(this.mbeanName); 2429 } 2430 2431 public void postDeregister() { 2432 Heritrix.instances. 2433 remove(this.mbeanName.getCanonicalKeyPropertyListString()); 2434 if (logger.isLoggable(Level.INFO)) { 2435 logger.info(JmxUtils.getLogUnregistrationMsg( 2436 this.mbeanName.getCanonicalName(), this.mbeanServer)); 2437 } 2438 } 2439 2440 protected static void registerContainerJndi() 2441 throws MalformedObjectNameException , NullPointerException , 2442 UnknownHostException , NamingException { 2443 registerJndi(getJndiContainerName()); 2444 } 2445 2446 protected static void registerJndi(final ObjectName name) 2447 throws NullPointerException , NamingException { 2448 Context c = getJndiContext(); 2449 if (c == null) { 2450 return; 2451 } 2452 CompoundName key = JndiUtils.bindObjectName(c, name); 2453 if (logger.isLoggable(Level.FINE)) { 2454 logger.fine("Bound '" + key + "' to '" + JndiUtils. 2455 getCompoundName(c.getNameInNamespace()).toString() 2456 + "' jndi context"); 2457 } 2458 } 2459 2460 protected static void deregisterJndi(final ObjectName name) 2461 throws NullPointerException , NamingException { 2462 Context c = getJndiContext(); 2463 if (c == null) { 2464 return; 2465 } 2466 CompoundName key = JndiUtils.unbindObjectName(c, name); 2467 if (logger.isLoggable(Level.FINE)) { 2468 logger.fine("Unbound '" + key + "' from '" + 2469 JndiUtils.getCompoundName(c.getNameInNamespace()).toString() + 2470 "' jndi context"); 2471 } 2472 } 2473 2474 2478 protected static Context getJndiContext() throws NamingException { 2479 Context c = null; 2480 try { 2481 c = JndiUtils.getSubContext(CRAWLER_PACKAGE); 2482 } catch (NoInitialContextException e) { 2483 logger.fine("No JNDI Context: " + e.toString()); 2484 } 2485 return c; 2486 } 2487 2488 2497 protected static ObjectName getJndiContainerName() 2498 throws MalformedObjectNameException , NullPointerException , 2499 UnknownHostException { 2500 ObjectName objName = new ObjectName (CRAWLER_PACKAGE, "type", 2501 "container"); 2502 return addVitals(objName); 2503 } 2504 2505 2509 public static Map getInstances() { 2510 return Heritrix.instances; 2511 } 2512 2513 2516 public static boolean isSingleInstance() { 2517 return Heritrix.instances != null && Heritrix.instances.size() == 1; 2518 } 2519 2520 2523 public static Heritrix getSingleInstance() { 2524 return !isSingleInstance()? 2525 null: 2526 (Heritrix)Heritrix.instances. 2527 get(Heritrix.instances.keySet().iterator().next()); 2528 } 2529} 2530 | Popular Tags |