package org.archive.crawler.settings;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.Attribute;
import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanAttributeInfo;
import javax.management.MBeanException;
import javax.management.MBeanInfo;
import javax.management.ReflectionException;
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;

import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;

/**
 * A {@link SettingsHandler} that keeps crawler settings in XML files.
 *
 * <p>Settings are written with a JAXP {@link Transformer} fed by a
 * {@code CrawlSettingsSAXSource} and read back with a SAX parser driving a
 * {@code CrawlSettingsSAXHandler}. The global settings live in the order
 * file; per-scope settings and refinements live below the settings directory
 * named by the order's {@code CrawlOrder.ATTR_SETTINGS_DIRECTORY} attribute.
 */
public class XMLSettingsHandler extends SettingsHandler {
    private static final Logger logger =
        Logger.getLogger(
            "org.archive.crawler.settings.XMLSettingsHandler");

    // XML vocabulary constants. They are not referenced inside this class;
    // presumably the SAX source/handler classes use them -- verify there.
    protected static final String XML_SCHEMA = "heritrix_settings.xsd";
    protected static final String XML_ROOT_ORDER = "crawl-order";
    protected static final String XML_ROOT_HOST_SETTINGS = "crawl-settings";
    protected static final String XML_ROOT_REFINEMENT = "crawl-refinement";
    protected static final String XML_ELEMENT_CONTROLLER = "controller";
    protected static final String XML_ELEMENT_META = "meta";
    protected static final String XML_ELEMENT_NAME = "name";
    protected static final String XML_ELEMENT_DESCRIPTION = "description";
    protected static final String XML_ELEMENT_OPERATOR = "operator";
    protected static final String XML_ELEMENT_ORGANIZATION = "organization";
    protected static final String XML_ELEMENT_AUDIENCE = "audience";
    protected static final String XML_ELEMENT_DATE = "date";
    protected static final String XML_ELEMENT_REFINEMENTLIST = "refinement-list";
    protected static final String XML_ELEMENT_REFINEMENT = "refinement";
    protected static final String XML_ELEMENT_REFERENCE = "reference";
    protected static final String XML_ELEMENT_LIMITS = "limits";
    protected static final String XML_ELEMENT_TIMESPAN = "timespan";
    protected static final String XML_ELEMENT_PORTNUMBER = "portnumber";
    protected static final String XML_ELEMENT_URIMATCHES = "uri-matches";
    protected static final String XML_ELEMENT_CONTENTMATCHES = "content-type-matches";
    protected static final String XML_ELEMENT_OBJECT = "object";
    protected static final String XML_ELEMENT_NEW_OBJECT = "newObject";
    protected static final String XML_ATTRIBUTE_NAME = "name";
    protected static final String XML_ATTRIBUTE_CLASS = "class";
    protected static final String XML_ATTRIBUTE_FROM = "from";
    protected static final String XML_ATTRIBUTE_TO = "to";

    /** The order file; always stored as an absolute path by the constructor. */
    private File orderFile;
    /** Base name (without suffix) of a per-scope settings file. */
    private final static String settingsFilename = "settings";
    /** File name suffix (without the dot) of every settings file. */
    private final static String settingsFilenameSuffix = "xml";
    /** Name of the sub directory that holds refinement files. */
    private final static String REFINEMENT_DIR = "_refinements";

    /**
     * Creates a handler bound to the given order file.
     *
     * @param orderFile the order file; its absolute form is retained.
     * @throws InvalidAttributeValueException declared for the superclass
     *         constructor.
     */
    public XMLSettingsHandler(File orderFile)
            throws InvalidAttributeValueException {
        super();
        this.orderFile = orderFile.getAbsoluteFile();
    }

    /**
     * Initializes the handler by delegating to the superclass.
     */
    @Override
    public void initialize() {
        super.initialize();
    }

    /**
     * Initializes the handler from another order file without permanently
     * switching to it.
     *
     * <p>The handler's own order file is swapped out for the duration of
     * {@link #initialize()} and restored afterwards, even when initialization
     * throws.
     *
     * @param source the order file to initialize from.
     */
    public void initialize(File source) {
        File tmpOrderFile = orderFile;
        orderFile = source.getAbsoluteFile();
        try {
            this.initialize();
        } finally {
            // Restore in finally so a failed initialization cannot leave the
            // handler pointing at the temporary source file.
            orderFile = tmpOrderFile;
        }
    }

    /**
     * Resolves the settings directory configured on the crawl order.
     *
     * @return the settings directory, resolved relative to the order file's
     *         directory when the configured path is not absolute.
     * @throws IllegalStateException if the settings directory attribute
     *         cannot be read from the order or is {@code null}.
     */
    private File getSettingsDirectory() {
        String settingsDirectoryName = null;
        Exception failure = null;
        try {
            settingsDirectoryName =
                (String) getOrder().getAttribute(
                    CrawlOrder.ATTR_SETTINGS_DIRECTORY);
        } catch (AttributeNotFoundException e) {
            failure = e;
        } catch (MBeanException e) {
            failure = e;
        } catch (ReflectionException e) {
            failure = e;
        }

        if (settingsDirectoryName == null) {
            // Previously a null name fell through to new File(null) and
            // surfaced as an unexplained NullPointerException.
            throw new IllegalStateException(
                "Could not determine the settings directory from the order",
                failure);
        }
        return getPathRelativeToWorkingDirectory(settingsDirectoryName);
    }

    /**
     * Maps a settings object to the file it is stored in.
     *
     * <p>Settings without a scope map to the order file, or, for a
     * refinement, to {@code _refinements/&lt;name&gt;.xml} directly below the
     * settings directory. Scoped settings map to a directory path built from
     * the scope's dot-separated elements in reverse order (for example
     * {@code a.b.c} becomes {@code c/b/a}) containing either
     * {@code settings.xml} or {@code _refinements/&lt;name&gt;.xml}.
     *
     * @param settings the settings object to locate.
     * @return the file backing the settings object.
     */
    protected final File settingsToFilename(CrawlerSettings settings) {
        String scope = settings.getScope();

        if (scope == null || scope.equals("")) {
            if (!settings.isRefinement()) {
                return orderFile;
            }
            return new File(getSettingsDirectory(), File.separatorChar
                    + REFINEMENT_DIR + File.separatorChar
                    + settings.getName() + '.' + settingsFilenameSuffix);
        }

        String[] elements = scope.split("\\.");
        if (elements.length == 0) {
            // A scope consisting only of dots splits to nothing.
            return orderFile;
        }

        // Reverse the scope elements so the most general one comes first.
        StringBuilder path = new StringBuilder();
        for (int i = elements.length - 1; i > 0; i--) {
            path.append(elements[i]);
            path.append(File.separatorChar);
        }
        path.append(elements[0]);

        if (settings.isRefinement()) {
            return new File(getSettingsDirectory(), path.toString()
                    + File.separatorChar + REFINEMENT_DIR
                    + File.separatorChar + settings.getName() + '.'
                    + settingsFilenameSuffix);
        }
        return new File(getSettingsDirectory(), path.toString()
                + File.separatorChar + settingsFilename + "."
                + settingsFilenameSuffix);
    }

    /**
     * Writes a settings object to the file {@link #settingsToFilename}
     * assigns to it.
     *
     * @param settings the settings object to write.
     */
    public final void writeSettingsObject(CrawlerSettings settings) {
        File filename = settingsToFilename(settings);
        writeSettingsObject(settings, filename);
    }

    /**
     * Writes a settings object to the given file as XML.
     *
     * <p>When the order has a controller and the target file already exists,
     * the existing file is first copied to a backup named
     * {@code &lt;name&gt;_&lt;14-digit date of last save&gt;.xml}. That
     * backup is deleted again after a successful write if the previous save
     * happened less than two minutes ago. Failures are logged, not thrown.
     *
     * @param settings the settings object to write.
     * @param filename the file to write to; parent directories are created
     *        as needed.
     */
    public final void writeSettingsObject(
            CrawlerSettings settings, File filename) {

        logger.fine("Writing " + filename.getAbsolutePath());
        File parent = filename.getParentFile();
        if (parent != null) {
            parent.mkdirs();
        }

        try {
            long lastSaved = 0L;
            File backup = null;
            if (getOrder().getController() != null && filename.exists()) {
                String name = filename.getName();
                lastSaved = settings.getLastSavedTime().getTime();
                int dot = name.lastIndexOf('.');
                String stem = (dot < 0) ? name : name.substring(0, dot);
                name = stem + '_'
                        + ArchiveUtils.get14DigitDate(lastSaved) + "."
                        + settingsFilenameSuffix;
                backup = new File(parent, name);
                FileUtils.copyFiles(filename, backup);
            }

            BufferedOutputStream out =
                new BufferedOutputStream(new FileOutputStream(filename));
            try {
                StreamResult result = new StreamResult(out);
                Transformer transformer =
                    TransformerFactory.newInstance().newTransformer();
                Source source = new CrawlSettingsSAXSource(settings);
                transformer.transform(source, result);
            } finally {
                // Always close: flushes buffered output and releases the
                // file handle, which the transformer does not do for us.
                out.close();
            }

            // Drop the backup when the previous save is under two minutes old.
            if (backup != null
                    && lastSaved > (System.currentTimeMillis() - 2 * 60 * 1000)) {
                backup.delete();
            }
        } catch (Exception e) {
            logger.log(Level.SEVERE,
                    "Failed writing " + filename.getAbsolutePath(), e);
        }
    }

    /**
     * Populates a settings object from the given XML file.
     *
     * <p>If the file does not exist and its name does not start with
     * {@code settings}, the path is looked up as a class path resource
     * instead. Parse and I/O problems are logged as warnings.
     *
     * @param settings the settings object to fill.
     * @param f the file to read.
     * @return {@code settings} when a source was found and parsed,
     *         {@code null} otherwise.
     */
    protected final CrawlerSettings readSettingsObject(CrawlerSettings settings,
            File f) {
        CrawlerSettings result = null;
        InputStream is = null;
        try {
            if (f.exists()) {
                is = new FileInputStream(f);
            } else if (!f.getName().startsWith(settingsFilename)) {
                is = XMLSettingsHandler.class.
                    getResourceAsStream(f.getPath());
            }
            if (is != null) {
                XMLReader parser = SAXParserFactory.newInstance()
                    .newSAXParser().getXMLReader();
                InputStream file = new BufferedInputStream(is);
                parser.setContentHandler(new CrawlSettingsSAXHandler(settings));
                InputSource source = new InputSource(file);
                source.setSystemId(f.toURL().toExternalForm());
                parser.parse(source);
                result = settings;
            }
        } catch (SAXParseException e) {
            logger.warning(e.getMessage() + " in '" + e.getSystemId()
                + "', line: " + e.getLineNumber() + ", column: "
                + e.getColumnNumber());
        } catch (SAXException e) {
            logger.warning(withNestedMessage(e.getMessage(), e.getException()));
        } catch (ParserConfigurationException e) {
            logger.warning(withNestedMessage(e.getMessage(), e.getCause()));
        } catch (FactoryConfigurationError e) {
            logger.warning(withNestedMessage(e.getMessage(), e.getException()));
        } catch (IOException e) {
            logger.warning("Could not access file '"
                + f.getAbsolutePath() + "': " + e.getMessage());
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (IOException e) {
                    logger.log(Level.FINE, "Could not close '"
                        + f.getAbsolutePath() + "'", e);
                }
            }
        }
        return result;
    }

    /**
     * Appends the nested throwable's message to a log message, if present.
     *
     * @param message the primary message.
     * @param nested the nested throwable, possibly {@code null}.
     * @return {@code message}, followed by {@code ": "} and the nested
     *         message when {@code nested} is not {@code null}.
     */
    private static String withNestedMessage(String message, Throwable nested) {
        if (nested == null) {
            return message;
        }
        return message + ": " + nested.getMessage();
    }

    /**
     * Populates a settings object from the file {@link #settingsToFilename}
     * assigns to it.
     *
     * @param settings the settings object to fill.
     * @return {@code settings} when read successfully, {@code null} otherwise.
     */
    protected final CrawlerSettings readSettingsObject(CrawlerSettings settings) {
        File filename = settingsToFilename(settings);
        return readSettingsObject(settings, filename);
    }

    /**
     * Returns the order file this handler currently works with.
     *
     * @return the order file.
     */
    public File getOrderFile() {
        return orderFile;
    }

    /**
     * Re-targets this handler at a new order file and settings directory and
     * copies the existing settings there.
     *
     * <p>The order file reference is switched first, the order's settings
     * directory attribute is updated, the global settings are written to the
     * new order file, and finally the old settings directory is copied to
     * the new one if the two differ.
     *
     * @param newOrderFileName the new order file.
     * @param newSettingsDirectory the new settings directory; resolved
     *        relative to the new order file's directory if not absolute.
     * @throws IOException if the settings directory attribute cannot be
     *         updated or the directory copy fails.
     */
    public void copySettings(File newOrderFileName, String newSettingsDirectory)
            throws IOException {
        File oldSettingsDirectory = getSettingsDirectory();

        orderFile = newOrderFileName;
        try {
            getOrder().setAttribute(
                new Attribute(
                    CrawlOrder.ATTR_SETTINGS_DIRECTORY, newSettingsDirectory));
        } catch (Exception e) {
            IOException wrapped = new IOException(
                "Could not update settings with new location: "
                + e.getMessage());
            // initCause keeps the original stack trace without relying on the
            // IOException(String, Throwable) constructor added in Java 6.
            wrapped.initCause(e);
            throw wrapped;
        }
        writeSettingsObject(getSettingsObject(null));

        File newDir = getPathRelativeToWorkingDirectory(newSettingsDirectory);

        if (oldSettingsDirectory.compareTo(newDir) != 0) {
            FileUtils.copyFiles(oldSettingsDirectory, newDir);
        }
    }

    /**
     * Resolves a path against the directory containing the order file.
     *
     * @param path an absolute or relative path.
     * @return {@code path} unchanged if absolute, otherwise {@code path}
     *         resolved against the order file's parent directory.
     */
    public File getPathRelativeToWorkingDirectory(String path) {
        File f = new File(path);
        if (!f.isAbsolute()) {
            f = new File(this.getOrderFile().getParent(), path);
        }
        return f;
    }

    /**
     * Lists the immediate sub domains of a domain that have settings
     * overrides stored on disk.
     *
     * <p>The domain is mapped to a directory below the settings directory by
     * reversing its dot-separated labels; each sub directory that contains a
     * {@code settings.xml} file somewhere beneath it counts as an override.
     *
     * @param rootDomain the domain to inspect; {@code null} or empty means
     *        the settings directory itself.
     * @return a sorted collection of sub directory names (as {@code String}s).
     */
    public Collection getDomainOverrides(String rootDomain) {
        File settingsDir = getSettingsDirectory();

        // Split the domain into its labels, left to right.
        ArrayList<String> domains = new ArrayList<String>();
        String remaining = rootDomain;
        while (remaining != null && remaining.length() > 0) {
            int dot = remaining.indexOf('.');
            if (dot < 0) {
                domains.add(remaining);
                break;
            }
            domains.add(remaining.substring(0, dot));
            remaining = remaining.substring(dot + 1);
        }

        // Directories are nested most-general label first, so walk backwards.
        StringBuilder subDir = new StringBuilder();
        for (int i = domains.size() - 1; i >= 0; i--) {
            subDir.append(File.separator).append(domains.get(i));
        }
        settingsDir = new File(settingsDir.getPath() + subDir);

        TreeSet<String> confirmedSubDomains = new TreeSet<String>();
        if (settingsDir.exists()) {
            // listFiles() returns null for non-directories and on I/O errors.
            File[] possibleSubDomains = settingsDir.listFiles();
            if (possibleSubDomains != null) {
                for (int i = 0; i < possibleSubDomains.length; i++) {
                    if (possibleSubDomains[i].isDirectory()
                            && isOverride(possibleSubDomains[i])) {
                        confirmedSubDomains.add(possibleSubDomains[i].getName());
                    }
                }
            }
        }
        return confirmedSubDomains;
    }

    /**
     * Tells whether a file is, or a directory contains at any depth, a
     * {@code settings.xml} file.
     *
     * @param f the file or directory to check.
     * @return {@code true} if a settings file was found.
     */
    private boolean isOverride(File f) {
        if (f.isDirectory()) {
            File[] subs = f.listFiles();
            if (subs == null) {
                // Unreadable directory: treat as containing no overrides.
                return false;
            }
            for (int i = 0; i < subs.length; i++) {
                if (isOverride(subs[i])) {
                    return true;
                }
            }
            return false;
        }
        return f.getName().equals(
                settingsFilename + "." + settingsFilenameSuffix);
    }

    /**
     * Deletes a settings object and its backing file.
     *
     * <p>After the file is removed, its parent directories are deleted from
     * the bottom up for as long as they are empty, stopping at the settings
     * directory.
     *
     * @param settings the settings object to delete.
     */
    @Override
    public void deleteSettingsObject(CrawlerSettings settings) {
        super.deleteSettingsObject(settings);
        File settingsDirectory = getSettingsDirectory();
        File settingsFile = settingsToFilename(settings);

        settingsFile.delete();
        File dir = settingsFile.getParentFile();
        while (dir != null && dir.isDirectory()
                && !dir.equals(settingsDirectory)) {
            String[] children = dir.list();
            if (children == null || children.length != 0) {
                // Unreadable or non-empty directory: stop pruning.
                break;
            }
            dir.delete();
            dir = dir.getParentFile();
        }
    }

    /**
     * Lists every file that belongs to the current settings.
     *
     * @return absolute paths of the order file and of all files below the
     *         settings directory whose name ends with the settings suffix,
     *         plus whatever the order's modules add via
     *         {@code listUsedFiles}.
     */
    public List<String> getListOfAllFiles() {
        ArrayList<String> list = new ArrayList<String>();
        list.add(getOrderFile().getAbsolutePath());
        File settingsDirectory = getSettingsDirectory();
        if (settingsDirectory.exists()) {
            recursiveFindFiles(settingsDirectory, list);
        }
        recursiveFindSecondaryFiles(getOrder(), list);
        return list;
    }

    /**
     * Walks a complex type and all complex-typed attributes beneath it,
     * letting each {@code ModuleType} encountered add its files to the list.
     *
     * @param mbean the complex type to start from.
     * @param list the list that modules append to.
     */
    private void recursiveFindSecondaryFiles(ComplexType mbean,
            ArrayList<String> list) {
        MBeanInfo info = mbean.getMBeanInfo();
        MBeanAttributeInfo[] a = info.getAttributes();
        if (mbean instanceof ModuleType) {
            ((ModuleType) mbean).listUsedFiles(list);
        }

        for (int n = 0; n < a.length; n++) {
            if (a[n] == null) {
                continue;
            }
            try {
                // Only the attribute name is needed, so no downcast to the
                // project's attribute info subtype is required.
                Object currentAttribute = mbean.getAttribute(a[n].getName());
                if (currentAttribute instanceof ComplexType) {
                    recursiveFindSecondaryFiles(
                            (ComplexType) currentAttribute, list);
                }
            } catch (AttributeNotFoundException e) {
                logger.log(Level.WARNING,
                        "Could not read attribute " + a[n].getName(), e);
            } catch (MBeanException e) {
                logger.log(Level.WARNING,
                        "Could not read attribute " + a[n].getName(), e);
            } catch (ReflectionException e) {
                logger.log(Level.WARNING,
                        "Could not read attribute " + a[n].getName(), e);
            }
        }
    }

    /**
     * Collects, at any depth below a directory, the absolute paths of all
     * files whose name ends with the settings file suffix.
     *
     * @param dir the directory to search.
     * @param list the list to append matching paths to.
     */
    private void recursiveFindFiles(File dir, ArrayList<String> list) {
        File[] subs = dir.listFiles();
        if (subs == null) {
            return;
        }
        for (int i = 0; i < subs.length; i++) {
            if (subs[i].isDirectory()) {
                recursiveFindFiles(subs[i], list);
            } else if (subs[i].getName().endsWith(settingsFilenameSuffix)) {
                list.add(subs[i].getAbsolutePath());
            }
        }
    }
}