KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > settings > XMLSettingsHandler


1 /* XMLSettingsHandler
2  *
3  * $Id: XMLSettingsHandler.java,v 1.13.6.1 2007/01/13 01:31:28 stack-sf Exp $
4  *
5  * Created on Dec 18, 2003
6  *
7  * Copyright (C) 2004 Internet Archive.
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  */

25 package org.archive.crawler.settings;
26
27 import java.io.BufferedInputStream JavaDoc;
28 import java.io.BufferedOutputStream JavaDoc;
29 import java.io.File JavaDoc;
30 import java.io.FileInputStream JavaDoc;
31 import java.io.FileOutputStream JavaDoc;
32 import java.io.IOException JavaDoc;
33 import java.io.InputStream JavaDoc;
34 import java.util.ArrayList JavaDoc;
35 import java.util.Collection JavaDoc;
36 import java.util.List JavaDoc;
37 import java.util.TreeSet JavaDoc;
38 import java.util.logging.Logger JavaDoc;
39
40 import javax.management.Attribute JavaDoc;
41 import javax.management.AttributeNotFoundException JavaDoc;
42 import javax.management.InvalidAttributeValueException JavaDoc;
43 import javax.management.MBeanAttributeInfo JavaDoc;
44 import javax.management.MBeanException JavaDoc;
45 import javax.management.MBeanInfo JavaDoc;
46 import javax.management.ReflectionException JavaDoc;
47 import javax.xml.parsers.FactoryConfigurationError JavaDoc;
48 import javax.xml.parsers.ParserConfigurationException JavaDoc;
49 import javax.xml.parsers.SAXParserFactory JavaDoc;
50 import javax.xml.transform.Source JavaDoc;
51 import javax.xml.transform.Transformer JavaDoc;
52 import javax.xml.transform.TransformerFactory JavaDoc;
53 import javax.xml.transform.stream.StreamResult JavaDoc;
54
55 import org.archive.crawler.datamodel.CrawlOrder;
56 import org.archive.util.ArchiveUtils;
57 import org.archive.util.FileUtils;
58 import org.xml.sax.InputSource JavaDoc;
59 import org.xml.sax.SAXException JavaDoc;
60 import org.xml.sax.SAXParseException JavaDoc;
61 import org.xml.sax.XMLReader JavaDoc;
62
63 /** A SettingsHandler which uses XML files as persistent storage.
64  *
65  * @author John Erik Halse
66  */

67 public class XMLSettingsHandler extends SettingsHandler {
68     private static Logger JavaDoc logger =
69         Logger.getLogger(
70             "org.archive.crawler.settings.XMLSettingsHandler");
71
72     // XML element name constants
73
protected static final String JavaDoc XML_SCHEMA = "heritrix_settings.xsd";
74     protected static final String JavaDoc XML_ROOT_ORDER = "crawl-order";
75     protected static final String JavaDoc XML_ROOT_HOST_SETTINGS = "crawl-settings";
76     protected static final String JavaDoc XML_ROOT_REFINEMENT = "crawl-refinement";
77     protected static final String JavaDoc XML_ELEMENT_CONTROLLER = "controller";
78     protected static final String JavaDoc XML_ELEMENT_META = "meta";
79     protected static final String JavaDoc XML_ELEMENT_NAME = "name";
80     protected static final String JavaDoc XML_ELEMENT_DESCRIPTION = "description";
81     protected static final String JavaDoc XML_ELEMENT_OPERATOR = "operator";
82     protected static final String JavaDoc XML_ELEMENT_ORGANIZATION = "organization";
83     protected static final String JavaDoc XML_ELEMENT_AUDIENCE = "audience";
84     protected static final String JavaDoc XML_ELEMENT_DATE = "date";
85     protected static final String JavaDoc XML_ELEMENT_REFINEMENTLIST = "refinement-list";
86     protected static final String JavaDoc XML_ELEMENT_REFINEMENT = "refinement";
87     protected static final String JavaDoc XML_ELEMENT_REFERENCE = "reference";
88     protected static final String JavaDoc XML_ELEMENT_LIMITS = "limits";
89     protected static final String JavaDoc XML_ELEMENT_TIMESPAN = "timespan";
90     protected static final String JavaDoc XML_ELEMENT_PORTNUMBER = "portnumber";
91     protected static final String JavaDoc XML_ELEMENT_URIMATCHES = "uri-matches";
92     protected static final String JavaDoc XML_ELEMENT_CONTENTMATCHES = "content-type-matches";
93     protected static final String JavaDoc XML_ELEMENT_OBJECT = "object";
94     protected static final String JavaDoc XML_ELEMENT_NEW_OBJECT = "newObject";
95     protected static final String JavaDoc XML_ATTRIBUTE_NAME = "name";
96     protected static final String JavaDoc XML_ATTRIBUTE_CLASS = "class";
97     protected static final String JavaDoc XML_ATTRIBUTE_FROM = "from";
98     protected static final String JavaDoc XML_ATTRIBUTE_TO = "to";
99
100     private File JavaDoc orderFile;
101     private final static String JavaDoc settingsFilename = "settings";
102     private final static String JavaDoc settingsFilenameSuffix = "xml";
103     private final static String JavaDoc REFINEMENT_DIR = "_refinements";
104
105     /** Create a new XMLSettingsHandler object.
106      *
107      * @param orderFile where the order file is located.
108      * @throws InvalidAttributeValueException
109      */

110     public XMLSettingsHandler(File JavaDoc orderFile)
111     throws InvalidAttributeValueException JavaDoc {
112         super();
113         this.orderFile = orderFile.getAbsoluteFile();
114     }
115
116     /** Initialize the SettingsHandler.
117      *
118      * This method builds the settings data structure and initializes it with
119      * settings from the order file given to the constructor.
120      */

121     public void initialize() {
122         super.initialize();
123     }
124
125     /**
126      * Initialize the SettingsHandler from a source.
127      *
128      * This method builds the settings data structure and initializes it with
129      * settings from the order file given as a parameter. The intended use is
130      * to create a new order file based on a default (template) order file.
131      *
132      * @param source the order file to initialize from.
133      */

134     public void initialize(File JavaDoc source) {
135         File JavaDoc tmpOrderFile = orderFile;
136         orderFile = source.getAbsoluteFile();
137         this.initialize();
138         orderFile = tmpOrderFile;
139     }
140
141     private File JavaDoc getSettingsDirectory() {
142         String JavaDoc settingsDirectoryName = null;
143         try {
144             settingsDirectoryName =
145                     (String JavaDoc) getOrder().getAttribute(
146                         CrawlOrder.ATTR_SETTINGS_DIRECTORY);
147         } catch (AttributeNotFoundException JavaDoc e) {
148             e.printStackTrace();
149         } catch (MBeanException JavaDoc e) {
150             e.printStackTrace();
151         } catch (ReflectionException JavaDoc e) {
152             e.printStackTrace();
153         }
154
155         return getPathRelativeToWorkingDirectory(settingsDirectoryName);
156     }
157
158     /** Resolves the filename for a settings object into a file path.
159      *
160      * It will also create the directory structure leading to this file
161      * if it doesn't exist.
162      *
163      * @param settings the settings object to get file path for.
164      * @return the file path for this settings object.
165      */

166     protected final File JavaDoc settingsToFilename(CrawlerSettings settings) {
167         File JavaDoc file;
168
169         if (settings.getScope() == null || settings.getScope().equals("")) {
170             if (settings.isRefinement()) {
171                 file = new File JavaDoc(getSettingsDirectory(), File.separatorChar
172                         + REFINEMENT_DIR + File.separatorChar
173                         + settings.getName() + '.' + settingsFilenameSuffix);
174             } else {
175                 file = orderFile;
176             }
177         } else {
178             String JavaDoc elements[] = settings.getScope().split("\\.");
179             if (elements.length == 0) {
180                 return orderFile;
181             }
182
183             StringBuffer JavaDoc path = new StringBuffer JavaDoc();
184             for (int i = elements.length - 1; i > 0; i--) {
185                 path.append(elements[i]);
186                 path.append(File.separatorChar);
187             }
188             path.append(elements[0]);
189
190             if (settings.isRefinement()) {
191                 file = new File JavaDoc(getSettingsDirectory(), path.toString()
192                         + File.separatorChar + REFINEMENT_DIR
193                         + File.separatorChar + settings.getName() + '.'
194                         + settingsFilenameSuffix);
195             } else {
196                 file = new File JavaDoc(getSettingsDirectory(), path.toString()
197                         + File.separatorChar + settingsFilename + "."
198                         + settingsFilenameSuffix);
199             }
200         }
201         return file;
202     }
203
204     public final void writeSettingsObject(CrawlerSettings settings) {
205         File JavaDoc filename = settingsToFilename(settings);
206         writeSettingsObject(settings, filename);
207     }
208
209     /** Write a CrawlerSettings object to a specified file.
210      *
211      * This method is similar to {@link #writeSettingsObject(CrawlerSettings)}
212      * except that it uses the submitted File object instead of trying to
213      * resolve where the file should be written.
214      *
215      * @param settings the settings object to be serialized.
216      * @param filename the file to which the settings object should be written.
217      */

218     public final void writeSettingsObject(
219             CrawlerSettings settings, File JavaDoc filename) {
220
221         logger.fine("Writing " + filename.getAbsolutePath());
222         filename.getParentFile().mkdirs();
223
224         try {
225             long lastSaved = 0L;
226             File JavaDoc backup = null;
227             if (getOrder().getController() != null && filename.exists()) {
228                 // The crawler is running and file exists - make backup first.
229
String JavaDoc name = filename.getName();
230                 lastSaved = settings.getLastSavedTime().getTime();
231                 name = name.substring(0, name.lastIndexOf('.')) + '_'
232                         + ArchiveUtils.get14DigitDate(lastSaved) + "."
233                         + settingsFilenameSuffix;
234                 backup = new File JavaDoc(filename.getParentFile(), name);
235                 FileUtils.copyFiles(filename, backup);
236             }
237
238             StreamResult JavaDoc result =
239                 new StreamResult JavaDoc(
240                     new BufferedOutputStream JavaDoc(new FileOutputStream JavaDoc(filename)));
241             Transformer JavaDoc transformer =
242                 TransformerFactory.newInstance().newTransformer();
243             Source JavaDoc source = new CrawlSettingsSAXSource(settings);
244             transformer.transform(source, result);
245
246             // Hack to get rid of unnesessary backupfiles.
247
// What happens is that the WUI often saves settings files
248
// several times during a settings change. This code removes the
249
// last backup file if its no more than 2 minutes old.
250
if (lastSaved > (System.currentTimeMillis() - 2 * 60 * 1000)) {
251                 backup.delete();
252             }
253         } catch (Exception JavaDoc e) {
254             e.printStackTrace();
255         }
256     }
257
258     /** Read the CrawlerSettings object from a specific file.
259      *
260      * @param settings the settings object to be updated with data from the
261      * persistent storage.
262      * @param f the file to read from.
263      * @return the updated settings object or null if there was no data for this
264      * in the persistent storage.
265      */

266     protected final CrawlerSettings readSettingsObject(CrawlerSettings settings,
267             File JavaDoc f) {
268         CrawlerSettings result = null;
269         try {
270             InputStream JavaDoc is = null;
271             if (!f.exists()) {
272                 // Perhaps the file we're looking for is on the CLASSPATH.
273
// DON'T look on the CLASSPATH for 'settings.xml' files. The
274
// look for 'settings.xml' files happens frequently. Not looking
275
// on classpath for 'settings.xml' is an optimization based on
276
// ASSUMPTION that there will never be a 'settings.xml' saved
277
// on classpath.
278
if (!f.getName().startsWith(settingsFilename)) {
279                     is = XMLSettingsHandler.class.
280                         getResourceAsStream(f.getPath());
281                 }
282             } else {
283                 is = new FileInputStream JavaDoc(f);
284             }
285             if (is != null) {
286                 XMLReader JavaDoc parser = SAXParserFactory.newInstance()
287                     .newSAXParser().getXMLReader();
288                 InputStream JavaDoc file = new BufferedInputStream JavaDoc(is);
289                 parser.setContentHandler(new CrawlSettingsSAXHandler(settings));
290                 InputSource JavaDoc source = new InputSource JavaDoc(file);
291                 source.setSystemId(f.toURL().toExternalForm());
292                 parser.parse(source);
293                 result = settings;
294             }
295         } catch (SAXParseException JavaDoc e) {
296             logger.warning(e.getMessage() + " in '" + e.getSystemId()
297                 + "', line: " + e.getLineNumber() + ", column: "
298                 + e.getColumnNumber());
299         } catch (SAXException JavaDoc e) {
300             logger.warning(e.getMessage() + ": "
301                 + e.getException().getMessage());
302         } catch (ParserConfigurationException JavaDoc e) {
303             logger.warning(e.getMessage() + ": "
304                 + e.getCause().getMessage());
305         } catch (FactoryConfigurationError JavaDoc e) {
306             logger.warning(e.getMessage() + ": "
307                 + e.getException().getMessage());
308         } catch (IOException JavaDoc e) {
309             logger.warning("Could not access file '"
310                 + f.getAbsolutePath() + "': " + e.getMessage());
311         }
312         return result;
313     }
314
315     protected final CrawlerSettings readSettingsObject(CrawlerSettings settings) {
316         File JavaDoc filename = settingsToFilename(settings);
317         return readSettingsObject(settings, filename);
318     }
319
320     /** Get the <code>File</code> object pointing to the order file.
321      *
322      * @return File object for the order file.
323      */

324     public File JavaDoc getOrderFile() {
325         return orderFile;
326     }
327
328     /** Creates a replica of the settings file structure in another directory
329      * (fully recursive, includes all per host settings). The SettingsHandler
330      * will then refer to the new files.
331      *
332      * Observe that this method should only be called after the SettingsHandler
333      * has been initialized.
334      *
335      * @param newOrderFileName where the new order file should be saved.
336      * @param newSettingsDirectory the top level directory of the per host/domain
337      * settings files.
338      * @throws IOException
339      */

340     public void copySettings(File JavaDoc newOrderFileName, String JavaDoc newSettingsDirectory)
341       throws IOException JavaDoc {
342         File JavaDoc oldSettingsDirectory = getSettingsDirectory();
343
344         // Write new orderfile and point the settingshandler to it
345
orderFile = newOrderFileName;
346         try {
347             getOrder().setAttribute(
348                 new Attribute JavaDoc(
349                     CrawlOrder.ATTR_SETTINGS_DIRECTORY, newSettingsDirectory));
350         } catch (Exception JavaDoc e) {
351             throw new IOException JavaDoc("Could not update settings with new location: "
352                 + e.getMessage());
353         }
354         writeSettingsObject(getSettingsObject(null));
355
356         File JavaDoc newDir = getPathRelativeToWorkingDirectory(newSettingsDirectory);
357
358         // Copy the per host files if src and dest directories are different.
359
if (oldSettingsDirectory.compareTo(newDir) != 0) {
360             FileUtils.copyFiles(oldSettingsDirectory, newDir);
361         }
362     }
363
364     /**
365      * Transforms a relative path so that it is relative to the location of the
366      * order file. If an absolute path is given, it will be returned unchanged.<p>
367      * The location of it's order file is always considered as the 'working'
368      * directory for any given settings.
369      * @param path A relative path to a file (or directory)
370      * @return The same path modified so that it is relative to the file level
371      * location of the order file for the settings handler.
372      */

373     public File JavaDoc getPathRelativeToWorkingDirectory(String JavaDoc path) {
374         File JavaDoc f = new File JavaDoc(path);
375         // If path is not absolute, set f's directory
376
// relative to the path of the order file
377
if (!f.isAbsolute()) {
378             f = new File JavaDoc(this.getOrderFile().getParent(), path);
379         }
380         return f;
381     }
382
383     public Collection JavaDoc getDomainOverrides(String JavaDoc rootDomain) {
384         File JavaDoc settingsDir = getSettingsDirectory();
385
386         //Find the right start directory.
387
ArrayList JavaDoc<String JavaDoc> domains = new ArrayList JavaDoc<String JavaDoc>();
388         //First we deconstruct the rootDomain string
389
while(rootDomain != null && rootDomain.length()>0){
390             if(rootDomain.indexOf('.')<0){
391                 // Last level.
392
domains.add(rootDomain);
393                 break; //We're done.
394
} else {
395                 // Got more then one level left.
396
domains.add(rootDomain.substring(0,rootDomain.indexOf('.')));
397                 // Strip down rootDomain.
398
rootDomain = rootDomain.substring(rootDomain.indexOf('.')+1);
399             }
400         }
401         //Build up a proper path
402
//Since the domains are right to left, we start at the end of the array.
403
StringBuffer JavaDoc subDir = new StringBuffer JavaDoc();
404         for(int i=(domains.size()-1) ; i>=0 ; i--){
405             subDir.append(File.separator+domains.get(i));
406         }
407         //Then we move to the approprite directory.
408
settingsDir = new File JavaDoc(settingsDir.getPath()+subDir);
409         TreeSet JavaDoc<String JavaDoc> confirmedSubDomains = new TreeSet JavaDoc<String JavaDoc>();
410         if(settingsDir.exists()){
411             // Found our place! Search through it's subdirs.
412
File JavaDoc[] possibleSubDomains = settingsDir.listFiles();
413             for (int i = 0; i < possibleSubDomains.length; i++) {
414                 if (possibleSubDomains[i].isDirectory()
415                     && isOverride(possibleSubDomains[i])) {
416                     // Found one!
417
confirmedSubDomains.add(possibleSubDomains[i].getName());
418                 }
419             }
420         }
421         return confirmedSubDomains;
422     }
423
424     /**
425      * Checks if a file is a a 'per host' override or if it's a directory if it
426      * or it's subdirectories contains a 'per host' override file.
427      * @param f The file or directory to check
428      * @return True if the file is an override or it's a directory that contains
429      * such a file.
430      */

431     private boolean isOverride(File JavaDoc f){
432         if(f.isDirectory()){
433             // Have a directory, check it's contents.
434
File JavaDoc[] subs = f.listFiles();
435             for(int i=0 ; i < subs.length ; i++){
436                 if(isOverride(subs[i])){
437                     // Found one. Can stop looking.
438
return true;
439                 }
440             }
441         } else if (f.getName().equals(
442                 settingsFilename + "." + settingsFilenameSuffix)) {
443             // This is an override file (or sure looks like one in any case).
444
return true;
445         }
446         // Didn't find an override.
447
return false;
448     }
449
450     /** Delete a settings object from persistent storage.
451      *
452      * Deletes the file represented by the submitted settings object. All empty
453      * directories that are parents to the files path are also deleted.
454      *
455      * @param settings the settings object to delete.
456      */

457     public void deleteSettingsObject(CrawlerSettings settings) {
458         super.deleteSettingsObject(settings);
459         File JavaDoc settingsDirectory = getSettingsDirectory();
460         File JavaDoc settingsFile = settingsToFilename(settings);
461
462         settingsFile.delete();
463         settingsFile = settingsFile.getParentFile();
464         while (settingsFile.isDirectory() && settingsFile.list().length == 0
465                 && !settingsFile.equals(settingsDirectory)) {
466             settingsFile.delete();
467             settingsFile = settingsFile.getParentFile();
468         }
469     }
470
471     /* (non-Javadoc)
472      * @see org.archive.crawler.settings.SettingsHandler#getListOfAllFiles()
473      */

474     public List JavaDoc<String JavaDoc> getListOfAllFiles() {
475         ArrayList JavaDoc<String JavaDoc> list = new ArrayList JavaDoc<String JavaDoc>();
476         // Add CrawlOrder.
477
list.add(getOrderFile().getAbsolutePath());
478         // Iterate through the entire override hierarchy
479
if (getSettingsDirectory().exists()) {
480             recursiveFindFiles(getSettingsDirectory(),list);
481         }
482         // Get files used by settings modules.
483
recursiveFindSecondaryFiles(getOrder(),list);
484         return list;
485     }
486
487     /**
488      * Add any files being used by any of the Modules making up the settings to
489      * the list.
490      *
491      * @param mbean A ModuleType to interrogate for files. Any child modules
492      * will be recursively interrogated.
493      * @param list The list to add found files to.
494      */

495     private void recursiveFindSecondaryFiles(ComplexType mbean,
496             ArrayList JavaDoc<String JavaDoc> list) {
497         MBeanInfo JavaDoc info = mbean.getMBeanInfo();
498         MBeanAttributeInfo JavaDoc[] a = info.getAttributes();
499         // Interrogate the current module
500
if(mbean instanceof ModuleType){
501             ((ModuleType)mbean).listUsedFiles(list);
502         }
503
504         // Recursively interrogate all sub modules that are of ModuleType
505
for(int n=0; n<a.length; n++) {
506             if(a[n] == null) {
507                 // Error null attribute.
508
} else {
509                 ModuleAttributeInfo att = (ModuleAttributeInfo)a[n];
510                 Object JavaDoc currentAttribute;
511                 try {
512                     currentAttribute = mbean.getAttribute(att.getName());
513                     if(currentAttribute instanceof ComplexType) {
514                         recursiveFindSecondaryFiles((ComplexType)currentAttribute,list);
515                     }
516                 } catch (AttributeNotFoundException JavaDoc e) {
517                     // TODO Auto-generated catch block
518
e.printStackTrace();
519                 } catch (MBeanException JavaDoc e) {
520                     // TODO Auto-generated catch block
521
e.printStackTrace();
522                 } catch (ReflectionException JavaDoc e) {
523                     // TODO Auto-generated catch block
524
e.printStackTrace();
525                 }
526             }
527         }
528     }
529
530     /**
531      * Starting at the specific directory this method will iterate through all
532      * sub directories and add each file (as absolute name, with path as a
533      * string) to the provided ArrayList. Any file found under the settings
534      * directory with the proper suffix will be considered valid and added to
535      * the list.
536      * @param dir Starting directory
537      * @param list The list to add to
538      */

539     private void recursiveFindFiles(File JavaDoc dir, ArrayList JavaDoc<String JavaDoc> list){
540         File JavaDoc[] subs = dir.listFiles();
541         if (subs != null) {
542             for(int i=0 ; i < subs.length ; i++){
543                 if(subs[i].isDirectory()){
544                     recursiveFindFiles(subs[i],list);
545                 } else {
546                     if(subs[i].getName().endsWith(settingsFilenameSuffix)){
547                         // Add it to list
548
list.add(subs[i].getAbsolutePath());
549                     }
550                 }
551             }
552         }
553     }
554 }
555
Popular Tags