package org.archive.crawler.admin;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.Attribute;
import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.Heritrix;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.FrontierMarker;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InitializationException;
import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
import org.archive.crawler.frontier.FrontierJournal;
import org.archive.crawler.frontier.RecoveryJournal;
import org.archive.crawler.settings.ComplexType;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;

/**
 * Manages the crawler's jobs and profiles: keeps ordered sets of pending,
 * completed and profile jobs, tracks the job currently being crawled, and
 * starts the next pending job when a crawl ends.
 */
public class CrawlJobHandler implements CrawlStatusListener {
    private static final Logger logger =
        Logger.getLogger(CrawlJobHandler.class.getName());

    /** Name of the system property that may designate an alternate default
     * profile. */
    public static final String DEFAULT_PROFILE_NAME
        = "heritrix.default.profile";

    /** Name of the profile used as default when none other is designated. */
    public static final String DEFAULT_PROFILE = "default";

    /** Name of the directory where profiles are stored. */
    public static final String PROFILES_DIR_NAME = "profiles";

    public static final String ORDER_FILE_NAME = "order.xml";

    /** Job currently being crawled, if any. */
    private CrawlJob currentJob = null;

    /** A job being configured that has not yet been submitted. */
    private CrawlJob newJob = null;

    /** Thread used to start the next job asynchronously. */
    private Thread startingNextJob = null;

    /** Jobs waiting to be crawled, ordered by priority then UID. */
    private TreeSet<CrawlJob> pendingCrawlJobs;

    /** Jobs that have finished crawling, in the same ordering. */
    private TreeSet<CrawlJob> completedCrawlJobs;

    /** Available profiles. */
    private TreeSet<CrawlJob> profileJobs;

    /** Name of the current default profile. */
    private String defaultProfile = null;

    /** True while the handler is accepting and running jobs. */
    private boolean running = false;

    /** Token used to request recovery from the frontier's recover log. */
    public static final String RECOVER_LOG = "recover";

    /** Directory under which job directories are created. */
    private final File jobsDir;

    public CrawlJobHandler(final File jobsDir) {
        this(jobsDir, true, true);
    }

    public CrawlJobHandler(final File jobsDir,
            final boolean loadJobs, final boolean loadProfiles) {
        this.jobsDir = jobsDir;
        // Order jobs by priority; fall back on UID so that equal-priority
        // jobs stay distinct and sort by creation time.
        Comparator<CrawlJob> comp = new Comparator<CrawlJob>() {
            public int compare(CrawlJob job1, CrawlJob job2) {
                if (job1.getJobPriority() < job2.getJobPriority()) {
                    return -1;
                } else if (job1.getJobPriority() > job2.getJobPriority()) {
                    return 1;
                } else {
                    return job1.getUID().compareTo(job2.getUID());
                }
            }
        };
        this.pendingCrawlJobs = new TreeSet<CrawlJob>(comp);
        this.completedCrawlJobs = new TreeSet<CrawlJob>(comp);
        this.profileJobs = new TreeSet<CrawlJob>(comp);
        if (loadProfiles) {
            loadProfiles();
        }
        if (loadJobs) {
            loadJobs();
        }
    }
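    /*
     * Illustrative usage only (hypothetical values, not from the original
     * source): a handler is built against a jobs directory, started, and
     * then fed jobs derived from a profile.
     *
     *   CrawlJobHandler handler = new CrawlJobHandler(new File("jobs"));
     *   handler.startCrawler();
     *   CrawlJob job = handler.newJob(handler.getDefaultProfile(), null,
     *       "my-crawl", "example crawl", "http://example.com/",
     *       CrawlJob.PRIORITY_AVERAGE);
     *   handler.addJob(job);   // queued; starts when the handler is idle
     */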
    /**
     * Returns the single readable '.job' file in the given job directory,
     * or null if there is not exactly one.
     */
    protected File getStateJobFile(final File jobDir) {
        File[] jobFiles = jobDir.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.toLowerCase().endsWith(".job") &&
                    (new File(dir, name)).canRead();
            }
        });
        return (jobFiles.length == 1)? jobFiles[0]: null;
    }

    /**
     * Scan the jobs directory and load any job found in its subdirectories.
     */
    private void loadJobs() {
        this.jobsDir.mkdirs();
        File[] jobs = this.jobsDir.listFiles();
        for (int i = 0; i < jobs.length; i++) {
            if (jobs[i].isDirectory()) {
                File jobFile = getStateJobFile(jobs[i]);
                if (jobFile != null) {
                    loadJob(jobFile);
                }
            }
        }
    }

    /**
     * Load a single job from its '.job' state file and file it into the
     * pending or completed set according to its recorded status.
     */
    protected void loadJob(final File job) {
        CrawlJob cjob = null;
        try {
            cjob = new CrawlJob(job, new CrawlJobErrorHandler());
        } catch (InvalidJobFileException e) {
            logger.log(Level.INFO,
                "Invalid job file for " + job.getAbsolutePath(), e);
            return;
        } catch (IOException e) {
            logger.log(Level.INFO, "IOException for " + job.getName() +
                ", " + job.getAbsolutePath(), e);
            return;
        }

        // A job recorded as in progress must have been interrupted by a
        // crash or shutdown; mark it as having ended abnormally.
        if (cjob.getStatus().equals(CrawlJob.STATUS_RUNNING)
                || cjob.getStatus().equals(CrawlJob.STATUS_PAUSED)
                || cjob.getStatus().equals(CrawlJob.STATUS_CHECKPOINTING)
                || cjob.getStatus().equals(CrawlJob.STATUS_WAITING_FOR_PAUSE)) {
            cjob.setStatus(CrawlJob.STATUS_FINISHED_ABNORMAL);
            this.completedCrawlJobs.add(cjob);
        } else if (cjob.getStatus().equals(CrawlJob.STATUS_PENDING)) {
            this.pendingCrawlJobs.add(cjob);
        } else if (cjob.getStatus().equals(CrawlJob.STATUS_CREATED)
                || cjob.getStatus().equals(CrawlJob.STATUS_DELETED)) {
            // Never-submitted or deleted jobs are not filed anywhere.
        } else {
            this.completedCrawlJobs.add(cjob);
        }
    }

    /**
     * Returns the directory holding profiles: the webapp's bundled
     * profiles directory if present, otherwise 'profiles' under the
     * configuration directory.
     */
    private File getProfilesDirectory() throws IOException {
        URL webappProfilePath = Heritrix.class.getResource("/" +
            PROFILES_DIR_NAME);
        if (webappProfilePath != null) {
            try {
                return new File(new URI(webappProfilePath.toString()));
            } catch (java.lang.IllegalArgumentException e) {
                // Resource is not a file: URL (e.g. inside a jar); fall
                // through to the configuration directory below.
            } catch (java.net.URISyntaxException e) {
                e.printStackTrace();
            }
        }
        return (Heritrix.getConfdir(false) == null)? null:
            new File(Heritrix.getConfdir().getAbsolutePath(),
                PROFILES_DIR_NAME);
    }

    /**
     * Load all profiles found in the profiles directory, falling back on
     * the bundled default profile if none of them was the default.
     */
    private void loadProfiles() {
        boolean loadedDefault = false;
        File profileDir = null;
        try {
            profileDir = getProfilesDirectory();
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (profileDir != null) {
            File[] ps = profileDir.listFiles();
            if (ps != null && ps.length > 0) {
                for (int i = 0; i < ps.length; i++) {
                    File f = ps[i];
                    if (f.isDirectory()) {
                        // A profile directory holds an order file.
                        File profile = new File(f, ORDER_FILE_NAME);
                        if (profile.canRead()) {
                            if (loadProfile(profile)) {
                                loadedDefault = true;
                            }
                        }
                    }
                }
            }
        }
        String parent = File.separator + PROFILES_DIR_NAME + File.separator;
        if (!loadedDefault) {
            loadProfile(new File(parent + DEFAULT_PROFILE, ORDER_FILE_NAME));
        }
        defaultProfile = DEFAULT_PROFILE;
    }
    /**
     * Load one profile from its order file.
     *
     * @return true if the loaded profile is the default profile.
     */
    protected boolean loadProfile(File profile) {
        boolean loadedDefault = false;
        try {
            XMLSettingsHandler newSettingsHandler =
                new XMLSettingsHandler(profile);
            CrawlJobErrorHandler cjseh =
                new CrawlJobErrorHandler(Level.SEVERE);
            newSettingsHandler.setErrorReportingLevel(cjseh.getLevel());
            newSettingsHandler.initialize();
            addProfile(new CrawlJob(profile.getParentFile().getName(),
                newSettingsHandler, cjseh));
            loadedDefault = profile.getParentFile().getName().
                equals(DEFAULT_PROFILE);
        } catch (InvalidAttributeValueException e) {
            System.err.println("Failed to load profile '" +
                profile.getParentFile().getName() +
                "'. InvalidAttributeValueException.");
        }
        return loadedDefault;
    }

    /**
     * Add a profile to the set of known profiles.
     */
    public synchronized void addProfile(CrawlJob profile) {
        profileJobs.add(profile);
    }

    /**
     * Delete a profile's directory and remove it from the known profiles.
     */
    public synchronized void deleteProfile(CrawlJob cj) throws IOException {
        File d = getProfilesDirectory();
        File p = new File(d, cj.getJobName());
        if (!p.exists()) {
            throw new IOException("No profile named " + cj.getJobName() +
                " at " + d.getAbsolutePath());
        }
        FileUtils.deleteDir(p);
        this.profileJobs.remove(cj);
    }

    /**
     * Returns a snapshot list of the known profiles.
     */
    public synchronized List<CrawlJob> getProfiles() {
        ArrayList<CrawlJob> tmp = new ArrayList<CrawlJob>(profileJobs.size());
        tmp.addAll(profileJobs);
        return tmp;
    }

    /**
     * Submit a job to the pending queue. If the crawler is running and
     * idle, the job is started at once. Profiles cannot be submitted.
     */
    public CrawlJob addJob(CrawlJob job) {
        if (job.isProfile()) {
            return null;
        }
        job.setStatus(CrawlJob.STATUS_PENDING);
        if (job.isNew()) {
            this.newJob = null;
            job.setNew(false);
        }
        this.pendingCrawlJobs.add(job);
        if (!isCrawling() && isRunning()) {
            startNextJob();
        }
        return job;
    }

    /**
     * Returns the default profile, or the first known profile if the
     * named default cannot be found, or null if there are no profiles.
     */
    public synchronized CrawlJob getDefaultProfile() {
        if (defaultProfile != null) {
            for (Iterator<CrawlJob> it = profileJobs.iterator(); it.hasNext();) {
                CrawlJob item = it.next();
                if (item.getJobName().equals(defaultProfile)) {
                    return item;
                }
            }
        }
        if (profileJobs.size() > 0) {
            return profileJobs.first();
        }
        return null;
    }

    public void setDefaultProfile(CrawlJob profile) {
        defaultProfile = profile.getJobName();
    }

    /**
     * Returns a snapshot list of the pending jobs.
     */
    public List<CrawlJob> getPendingJobs() {
        ArrayList<CrawlJob> tmp
            = new ArrayList<CrawlJob>(pendingCrawlJobs.size());
        tmp.addAll(pendingCrawlJobs);
        return tmp;
    }

    public CrawlJob getCurrentJob() {
        return currentJob;
    }

    /**
     * Returns a snapshot list of the completed jobs.
     */
    public List<CrawlJob> getCompletedJobs() {
        ArrayList<CrawlJob> tmp
            = new ArrayList<CrawlJob>(completedCrawlJobs.size());
        tmp.addAll(completedCrawlJobs);
        return tmp;
    }

    /**
     * Find a job by UID, searching the current job, the job being
     * configured, then the pending, completed and profile sets.
     */
    public CrawlJob getJob(String jobUID) {
        if (jobUID == null) {
            return null;
        }
        if (currentJob != null && currentJob.getUID().equals(jobUID)) {
            return currentJob;
        } else if (newJob != null && newJob.getUID().equals(jobUID)) {
            return newJob;
        } else {
            for (Iterator<CrawlJob> it = pendingCrawlJobs.iterator();
                    it.hasNext();) {
                CrawlJob cj = it.next();
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }
            for (Iterator<CrawlJob> it = completedCrawlJobs.iterator();
                    it.hasNext();) {
                CrawlJob cj = it.next();
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }
            for (Iterator<CrawlJob> it = getProfiles().iterator();
                    it.hasNext();) {
                CrawlJob cj = it.next();
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }
        }
        return null;
    }
    /**
     * Request that the current job stop crawling, waiting up to three
     * seconds for the termination to be noticed.
     *
     * @return false if there was no current job.
     */
    public boolean terminateCurrentJob() {
        if (this.currentJob == null) {
            return false;
        }
        // This eventually causes crawlEnding to be invoked, which clears
        // currentJob and notifies waiters.
        this.currentJob.stopCrawling();
        synchronized (this) {
            try {
                // Wait up to 3 seconds for the crawl to report ending.
                wait(3000);
            } catch (InterruptedException e) {
                // Ignore: returning early is harmless here.
            }
        }
        return true;
    }

    /**
     * Delete a job by UID. A currently-running job is terminated instead;
     * pending and completed jobs are marked deleted and removed.
     */
    public void deleteJob(String jobUID) {
        if (currentJob != null && jobUID.equals(currentJob.getUID())) {
            terminateCurrentJob();
            return;
        }

        for (Iterator<CrawlJob> it = pendingCrawlJobs.iterator();
                it.hasNext();) {
            CrawlJob cj = it.next();
            if (cj.getUID().equals(jobUID)) {
                cj.setStatus(CrawlJob.STATUS_DELETED);
                it.remove();
                return;
            }
        }

        for (Iterator<CrawlJob> it = completedCrawlJobs.iterator();
                it.hasNext();) {
            CrawlJob cj = it.next();
            if (cj.getUID().equals(jobUID)) {
                cj.setStatus(CrawlJob.STATUS_DELETED);
                it.remove();
                return;
            }
        }
    }

    public void pauseJob() {
        if (this.currentJob != null) {
            this.currentJob.pause();
        }
    }

    public void resumeJob() {
        if (this.currentJob != null) {
            this.currentJob.resume();
        }
    }

    public void checkpointJob() throws IllegalStateException {
        if (this.currentJob != null) {
            this.currentJob.checkpoint();
        }
    }

    /**
     * Returns a unique job identifier: a 17-digit timestamp.
     */
    public String getNextJobUID() {
        return ArchiveUtils.TIMESTAMP17.format(new Date());
    }
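    /*
     * Note on the recovery parameter of newJob below (summarizing the
     * code that follows): passing RECOVER_LOG selects the gzipped
     * frontier recover log in the base job's logs directory, while any
     * other non-empty value is taken as the name of a checkpoint under
     * the base job's checkpoints directory.
     */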
    /**
     * Create a new job based on an existing job or profile, optionally
     * configured to recover from a recover log or a checkpoint.
     */
    public CrawlJob newJob(CrawlJob baseOn, String recovery, String name,
            String description, String seeds, int priority)
    throws FatalConfigurationException {
        File recover = null;
        try {
            if (recovery != null && recovery.length() > 0
                    && recovery.equals(RECOVER_LOG)) {
                // Recover from the gzipped recover log in the base job's
                // logs directory.
                File dir = baseOn.getSettingsHandler().getOrder()
                    .getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
                recover = new File(dir, FrontierJournal.LOGNAME_RECOVER +
                    RecoveryJournal.GZIP_SUFFIX);
            } else if (recovery != null && recovery.length() > 0) {
                // Any other non-empty value names a checkpoint directory.
                recover = new File(baseOn.getSettingsHandler().
                    getOrder().getSettingsDir(CrawlOrder.ATTR_CHECKPOINTS_PATH),
                    recovery);
            }
        } catch (AttributeNotFoundException e1) {
            throw new FatalConfigurationException(
                "AttributeNotFoundException occurred while setting up" +
                " new job/profile " + name + "\n" + e1.getMessage());
        }

        CrawlJob cj = createNewJob(baseOn.getSettingsHandler().getOrderFile(),
            name, description, seeds, priority);

        updateRecoveryPaths(recover, cj.getSettingsHandler(), name);

        return cj;
    }

    /**
     * Create a new job from an order file, with average priority.
     */
    public CrawlJob newJob(final File orderFile, final String name,
            final String description, final String seeds)
    throws FatalConfigurationException {
        return createNewJob(orderFile, name, description, seeds,
            CrawlJob.PRIORITY_AVERAGE);
    }

    protected void checkDirectory(File dir)
    throws FatalConfigurationException {
        if (dir == null) {
            return;
        }
        // Use '||' so an existing-but-unreadable directory is rejected
        // too, as the message says (the original '&&' missed that case).
        if (!dir.exists() || !dir.canRead()) {
            throw new FatalConfigurationException(dir.getAbsolutePath() +
                " does not exist or is unreadable");
        }
    }

    protected CrawlJob createNewJob(final File orderFile, final String name,
            final String description, final String seeds, final int priority)
    throws FatalConfigurationException {
        if (newJob != null) {
            // Only one job may be under construction at a time.
            discardNewJob();
        }
        String UID = getNextJobUID();
        File jobDir = new File(this.jobsDir, name + "-" + UID);
        CrawlJobErrorHandler errorHandler = new CrawlJobErrorHandler();
        XMLSettingsHandler handler =
            createSettingsHandler(orderFile, name, description,
                seeds, jobDir, errorHandler, "order.xml", "seeds.txt");
        this.newJob = new CrawlJob(UID, name, handler, errorHandler, priority,
            jobDir);
        return this.newJob;
    }

    /**
     * Create a new profile based on an existing job or profile.
     */
    public CrawlJob newProfile(CrawlJob baseOn, String name, String description,
            String seeds)
    throws FatalConfigurationException, IOException {
        File profileDir = new File(getProfilesDirectory().getAbsoluteFile(),
            name);
        CrawlJobErrorHandler cjseh = new CrawlJobErrorHandler(Level.SEVERE);
        CrawlJob newProfile = new CrawlJob(name,
            createSettingsHandler(baseOn.getSettingsHandler().getOrderFile(),
                name, description, seeds, profileDir, cjseh, "order.xml",
                "seeds.txt"), cjseh);
        addProfile(newProfile);
        return newProfile;
    }
    /**
     * Create and initialize an XMLSettingsHandler for a new job or
     * profile: copy the base order file into the new settings directory,
     * point the scope at the new seeds file, set name and description,
     * and write out the seeds.
     */
    protected XMLSettingsHandler createSettingsHandler(
            final File orderFile, final String name, final String description,
            final String seeds, final File newSettingsDir,
            final CrawlJobErrorHandler errorHandler,
            final String filename, final String seedfile)
    throws FatalConfigurationException {
        XMLSettingsHandler newHandler = null;
        try {
            newHandler = new XMLSettingsHandler(orderFile);
            if (errorHandler != null) {
                newHandler.registerValueErrorHandler(errorHandler);
            }
            newHandler.setErrorReportingLevel(errorHandler.getLevel());
            newHandler.initialize();
        } catch (InvalidAttributeValueException e2) {
            throw new FatalConfigurationException(
                "InvalidAttributeValueException occurred while creating" +
                " new settings handler for new job/profile\n" +
                e2.getMessage());
        }

        newSettingsDir.mkdirs();

        try {
            // Point the scope at the new seeds file name.
            ((ComplexType)newHandler.getOrder().getAttribute("scope"))
                .setAttribute(new Attribute("seedsfile", seedfile));
        } catch (AttributeNotFoundException e1) {
            throw new FatalConfigurationException(
                "AttributeNotFoundException occurred while setting up" +
                " new job/profile\n" + e1.getMessage());
        } catch (InvalidAttributeValueException e1) {
            throw new FatalConfigurationException(
                "InvalidAttributeValueException occurred while setting" +
                " up new job/profile\n" + e1.getMessage());
        } catch (MBeanException e1) {
            throw new FatalConfigurationException(
                "MBeanException occurred while setting up new" +
                " job/profile\n" + e1.getMessage());
        } catch (ReflectionException e1) {
            throw new FatalConfigurationException(
                "ReflectionException occurred while setting up" +
                " new job/profile\n" + e1.getMessage());
        }

        File newFile = new File(newSettingsDir.getAbsolutePath(), filename);

        try {
            newHandler.copySettings(newFile, (String)newHandler.getOrder()
                .getAttribute(CrawlOrder.ATTR_SETTINGS_DIRECTORY));
        } catch (IOException e3) {
            e3.printStackTrace();
            throw new FatalConfigurationException(
                "IOException occurred while writing new settings files" +
                " for new job/profile\n" + e3.getMessage());
        } catch (AttributeNotFoundException e) {
            throw new FatalConfigurationException(
                "AttributeNotFoundException occurred while writing new" +
                " settings files for new job/profile\n" + e.getMessage());
        } catch (MBeanException e) {
            throw new FatalConfigurationException(
                "MBeanException occurred while writing new settings files" +
                " for new job/profile\n" + e.getMessage());
        } catch (ReflectionException e) {
            throw new FatalConfigurationException(
                "ReflectionException occurred while writing new settings" +
                " files for new job/profile\n" + e.getMessage());
        }
        CrawlerSettings orderfile = newHandler.getSettingsObject(null);

        orderfile.setName(name);
        orderfile.setDescription(description);

        if (seeds != null) {
            BufferedWriter writer = null;
            try {
                writer = new BufferedWriter(new FileWriter(newHandler
                    .getPathRelativeToWorkingDirectory(seedfile)));
                try {
                    writer.write(seeds);
                } finally {
                    writer.close();
                }
            } catch (IOException e) {
                throw new FatalConfigurationException(
                    "IOException occurred while writing seed file for new"
                    + " job/profile\n" + e.getMessage());
            }
        }
        return newHandler;
    }

    /**
     * Validate the recovery source and apply it to the new job's
     * settings, wrapping any settings exception as a
     * FatalConfigurationException.
     */
    protected void updateRecoveryPaths(final File recover,
            final SettingsHandler sh, final String jobName)
    throws FatalConfigurationException {
        if (recover == null) {
            return;
        }
        checkDirectory(recover);
        try {
            updateRecoveryPaths(recover, sh);
        } catch (AttributeNotFoundException e1) {
            throw new FatalConfigurationException(
                "AttributeNotFoundException occurred while setting up"
                + " new job/profile " + jobName + "\n"
                + e1.getMessage());
        } catch (InvalidAttributeValueException e1) {
            throw new FatalConfigurationException(
                "InvalidAttributeValueException occurred while setting up"
                + " new job/profile " + jobName + "\n"
                + e1.getMessage());
        } catch (MBeanException e1) {
            throw new FatalConfigurationException(
                "MBeanException occurred while setting up"
                + " new job/profile " + jobName + "\n"
                + e1.getMessage());
        } catch (ReflectionException e1) {
            throw new FatalConfigurationException(
                "ReflectionException occurred while setting up"
                + " new job/profile " + jobName + "\n"
                + e1.getMessage());
        } catch (IOException e) {
            throw new FatalConfigurationException(
                "IOException occurred while setting up new job/profile "
                + jobName + "\n" + e.getMessage());
        }
    }
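    /*
     * The private overload below points the new job's recover-path
     * attribute at the recovery source, then appends "-R" to the logs and
     * state directory names (repeatedly, if necessary) until each names
     * an empty directory, so the recovered crawl cannot clobber the
     * directories of the crawl it is recovering from.
     */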
    private void updateRecoveryPaths(final File recover,
            SettingsHandler newHandler)
    throws AttributeNotFoundException, InvalidAttributeValueException,
            MBeanException, ReflectionException, IOException {
        if (recover == null || !recover.exists()) {
            throw new IOException("Recovery src does not exist: " + recover);
        }
        newHandler.getOrder().setAttribute(
            new Attribute(CrawlOrder.ATTR_RECOVER_PATH,
                recover.getAbsolutePath()));

        // Find an empty logs directory, appending the recovery suffix to
        // the configured path until one is found.
        File newLogsDisk = null;
        final String RECOVERY_SUFFIX = "-R";
        while (true) {
            try {
                newLogsDisk = newHandler.getOrder().
                    getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
            } catch (AttributeNotFoundException e) {
                logger.log(Level.SEVERE, "Failed to get logs directory", e);
            }
            if (newLogsDisk.list().length > 0) {
                // Directory is in use; try adding the recovery suffix.
                String logsPath = (String)newHandler.getOrder().
                    getAttribute(CrawlOrder.ATTR_LOGS_PATH);
                if (logsPath.endsWith("/")) {
                    logsPath = logsPath.substring(0, logsPath.length() - 1);
                }
                newHandler.getOrder().setAttribute(
                    new Attribute(CrawlOrder.ATTR_LOGS_PATH,
                        logsPath + RECOVERY_SUFFIX));
            } else {
                // The logs directory is empty; use it.
                break;
            }
        }
        // Do the same for the state directory.
        File newStateDisk = null;
        while (true) {
            try {
                newStateDisk = newHandler.getOrder().getSettingsDir(
                    CrawlOrder.ATTR_STATE_PATH);
            } catch (AttributeNotFoundException e) {
                logger.log(Level.SEVERE, "Failed to get state directory", e);
            }
            if (newStateDisk.list().length > 0) {
                String statePath = (String)newHandler.getOrder().
                    getAttribute(CrawlOrder.ATTR_STATE_PATH);
                if (statePath.endsWith("/")) {
                    statePath = statePath.substring(0, statePath.length() - 1);
                }
                newHandler.getOrder().setAttribute(
                    new Attribute(CrawlOrder.ATTR_STATE_PATH,
                        statePath + RECOVERY_SUFFIX));
            } else {
                break;
            }
        }
    }

    /**
     * Discard the job currently being configured, deleting its settings
     * directory.
     */
    public void discardNewJob() {
        FileUtils.deleteDir(new File(newJob.getSettingsDirectory()));
    }

    public CrawlJob getNewJob() {
        return newJob;
    }

    /**
     * Returns true if the handler has been started and not stopped.
     */
    public boolean isRunning() {
        return running;
    }

    /**
     * Returns true if a job is currently being crawled.
     */
    public boolean isCrawling() {
        return this.currentJob != null;
    }

    /**
     * Start the handler: pending jobs will be run, beginning at once if
     * none is crawling.
     */
    public void startCrawler() {
        running = true;
        if (pendingCrawlJobs.size() > 0 && !isCrawling()) {
            startNextJob();
        }
    }

    /**
     * Stop the handler: the current job finishes but no new job starts.
     */
    public void stopCrawler() {
        running = false;
    }

    /**
     * Start the next pending job on a separate thread, first waiting for
     * any previous start attempt to complete.
     */
    protected final void startNextJob() {
        synchronized (this) {
            if (startingNextJob != null) {
                try {
                    startingNextJob.join();
                } catch (InterruptedException e) {
                    e.printStackTrace();
                    return;
                }
            }
            startingNextJob = new Thread(new Runnable() {
                public void run() {
                    startNextJobInternal();
                }
            }, "StartNextJob");
            startingNextJob.start();
        }
    }
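    /*
     * startNextJob above serializes start attempts by joining the
     * previous starter thread before spawning a new one; the actual job
     * startup work happens in startNextJobInternal below, which retries
     * with the next pending job if initialization fails.
     */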
    protected void startNextJobInternal() {
        if (pendingCrawlJobs.size() == 0 || isCrawling()) {
            // No job ready or already crawling.
            return;
        }
        this.currentJob = pendingCrawlJobs.first();
        assert pendingCrawlJobs.contains(currentJob) :
            "pendingCrawlJobs is in an illegal state";
        pendingCrawlJobs.remove(currentJob);
        try {
            this.currentJob.setupForCrawlStart();
            // This handler must be registered as a listener before the
            // crawl starts so that crawlEnding/crawlEnded are received.
            this.currentJob.getController().addCrawlStatusListener(this);
            this.currentJob.getController().requestCrawlStart();
        } catch (InitializationException e) {
            // Can't load the job; refile it from disk and try the next one.
            loadJob(getStateJobFile(this.currentJob.getDirectory()));
            this.currentJob = null;
            startNextJobInternal();
        }
    }

    /**
     * Forward a settings-changed notification to the current job.
     */
    public void kickUpdate() {
        if (this.currentJob != null) {
            this.currentJob.kickUpdate();
        }
    }

    /**
     * Load a list of options from 'modules/&lt;file&gt;' resources on the
     * classpath, one option per line; lines containing '#' are treated as
     * comments and skipped.
     */
    public static ArrayList<String> loadOptions(String file)
    throws IOException {
        ArrayList<String> ret = new ArrayList<String>();
        Enumeration<URL> resources =
            CrawlJob.class.getClassLoader().getResources("modules/" + file);

        boolean noFileFound = true;
        while (resources.hasMoreElements()) {
            InputStream is = resources.nextElement().openStream();
            noFileFound = false;

            String line = null;
            BufferedReader bf =
                new BufferedReader(new InputStreamReader(is), 8192);
            try {
                while ((line = bf.readLine()) != null) {
                    line = line.trim();
                    if (line.indexOf('#') < 0 && line.length() > 0) {
                        // A non-empty, uncommented line is an option.
                        ret.add(line);
                    }
                }
            } finally {
                bf.close();
            }
        }

        if (noFileFound) {
            throw new IOException("Failed to get " + file + " from the" +
                " CLASSPATH");
        }

        return ret;
    }

    /**
     * Returns an initial frontier marker for the current job, or null if
     * no job is crawling.
     */
    public FrontierMarker getInitialMarker(String regexpr,
            boolean inCacheOnly) {
        return (this.currentJob != null)?
            this.currentJob.getInitialMarker(regexpr, inCacheOnly): null;
    }

    /**
     * Returns pending URIs of the current job matching the marker, or
     * null if no job is crawling.
     */
    public ArrayList getPendingURIsList(FrontierMarker marker,
            int numberOfMatches, boolean verbose)
    throws InvalidFrontierMarkerException {
        return (this.currentJob != null)?
            this.currentJob.getPendingURIsList(marker, numberOfMatches,
                verbose): null;
    }

    /**
     * Delete pending URIs of the current job matching the expression.
     *
     * @return the number of URIs deleted; 0 if no job is crawling.
     */
    public long deleteURIsFromPending(String regexpr) {
        return (this.currentJob != null)?
            this.currentJob.deleteURIsFromPending(regexpr): 0;
    }

    public String importUris(String file, String style, String force) {
        return importUris(file, style, "true".equals(force));
    }

    /**
     * Import URIs from a file or URL into the current job's frontier.
     */
    public String importUris(final String fileOrUrl, final String style,
            final boolean forceRevisit) {
        return (this.currentJob != null)?
            this.currentJob.importUris(fileOrUrl, style, forceRevisit): null;
    }
    protected int importUris(InputStream is, String style,
            boolean forceRevisit) {
        return (this.currentJob != null)?
            this.currentJob.importUris(is, style, forceRevisit): 0;
    }

    public void importUri(final String uri, final boolean forceFetch,
            final boolean isSeed)
    throws URIException {
        importUri(uri, forceFetch, isSeed, true);
    }

    /**
     * Import a single URI into the current job, optionally flushing it
     * straight to the frontier.
     */
    public void importUri(final String str, final boolean forceFetch,
            final boolean isSeed, final boolean isFlush)
    throws URIException {
        if (this.currentJob != null) {
            this.currentJob.importUri(str, forceFetch, isSeed, isFlush);
        }
    }

    protected void doFlush() {
        if (this.currentJob != null) {
            this.currentJob.flush();
        }
    }

    /**
     * Stop the current job, if any.
     */
    public void stop() {
        if (isCrawling()) {
            deleteJob(getCurrentJob().getUID());
        }
    }

    public void requestCrawlStop() {
        if (this.currentJob != null) {
            this.currentJob.stopCrawling();
        }
    }

    /**
     * Ensure a new job's name and description are written to its order
     * file on disk.
     */
    public static CrawlJob ensureNewJobWritten(CrawlJob newJob, String metaname,
            String description) {
        XMLSettingsHandler settingsHandler = newJob.getSettingsHandler();
        CrawlerSettings orderfile = settingsHandler.getSettingsObject(null);
        orderfile.setName(metaname);
        orderfile.setDescription(description);
        settingsHandler.writeSettingsObject(orderfile);
        return newJob;
    }

    public void crawlStarted(String message) {
        // No action taken.
    }

    public void crawlEnding(String sExitMessage) {
        // Refile the job's on-disk state into the completed set, clear the
        // current job, and wake any thread waiting in terminateCurrentJob.
        loadJob(getStateJobFile(this.currentJob.getDirectory()));
        currentJob = null;
        synchronized (this) {
            notifyAll();
        }
    }

    public void crawlEnded(String sExitMessage) {
        if (this.running) {
            startNextJob();
        }
    }

    public void crawlPausing(String statusMessage) {
        // No action taken.
    }

    public void crawlPaused(String statusMessage) {
        // No action taken.
    }

    public void crawlResuming(String statusMessage) {
        // No action taken.
    }

    public void crawlCheckpoint(File checkpointDir) throws Exception {
        // No action taken.
    }
}