1 17 18 19 20 package org.apache.lenya.search.crawler; 21 22 import java.io.File ; 23 import java.io.IOException ; 24 25 import org.apache.avalon.excalibur.io.FileUtil; 26 import org.apache.lenya.xml.DOMUtil; 27 import org.apache.lenya.xml.DocumentHelper; 28 import org.apache.lenya.xml.XPath; 29 import org.apache.log4j.Category; 30 import org.w3c.dom.Document ; 31 import org.w3c.dom.Element ; 32 33 34 37 public class CrawlerConfiguration { 38 static Category log = Category.getInstance(CrawlerConfiguration.class); 39 private String configurationFilePath; 40 private String base_url; 41 private String user_agent; 42 private String scope_url; 43 private String uri_list; 44 private String htdocs_dump_dir; 45 private String robots_file; 46 private String robots_domain; 47 48 53 public CrawlerConfiguration(String configurationFilePath) { 54 this.configurationFilePath = configurationFilePath; 55 56 File configurationFile = new File (configurationFilePath); 57 58 try { 59 Document document = DocumentHelper.readDocument(configurationFile); 60 configure(document.getDocumentElement()); 61 } catch (Exception e) { 62 log.error("Cannot load publishing configuration! ", e); 63 } 64 } 65 66 71 public static void main(String [] args) { 72 if (args.length == 0) { 73 System.err.println( 74 "Usage: org.apache.lenya.search.crawler.CrawlerConfiguration crawler.xconf [-name <name>]"); 75 76 return; 77 } 78 79 CrawlerConfiguration ce = new CrawlerConfiguration(args[0]); 80 String parameter; 81 82 String name = null; 83 84 for (int i = 0; i < args.length; i++) { 85 if (args[i].equals("-name")) { 86 if ((i + 1) < args.length) { 87 name = args[i + 1]; 88 } 89 } 90 } 91 92 if (name != null) { 93 if (name.equals("htdocs-dump-dir")) { 94 parameter = ce.getHTDocsDumpDir(); 95 System.out.println(ce.resolvePath(parameter)); 96 } else { 97 System.out.println("No such element: " + name); 98 } 99 } else { 100 parameter = ce.getBaseURL(); 101 System.out.println("Crawler Config: Base URL: " + parameter); 102 103 parameter = ce.getScopeURL(); 104 System.out.println("Crawler Config: Scope URL: " + parameter); 105 106 parameter = ce.getUserAgent(); 107 System.out.println("Crawler Config: User Agent: " + parameter); 108 109 parameter = ce.getURIList(); 110 System.out.println("Crawler Config: URI List: " + ce.resolvePath(parameter) + " (" + parameter + ")"); 111 112 parameter = ce.getHTDocsDumpDir(); 113 System.out.println("Crawler Config: HTDocs Dump Dir: " + ce.resolvePath(parameter) + " (" + parameter + ")"); 114 115 parameter = ce.getRobotsFile(); 116 if (parameter != null) { 117 System.out.println("Crawler Config: Robots File: " + ce.resolvePath(parameter + " (" + parameter + ")")); 118 } 119 120 parameter = ce.getRobotsDomain(); 121 if (parameter != null) { 122 System.out.println("Crawler Config: Robots Domain: " + parameter); 123 } 124 } 125 } 126 127 134 public void configure(Element root) throws Exception { 135 DOMUtil du = new DOMUtil(); 136 137 base_url = du.getAttributeValue(root, new XPath("base-url/@href")); 138 scope_url = du.getAttributeValue(root, new XPath("scope-url/@href")); 139 user_agent = du.getElementValue(root, new XPath("user-agent")); 140 uri_list = du.getAttributeValue(root, new XPath("uri-list/@src")); 141 htdocs_dump_dir = du.getAttributeValue(root, new XPath("htdocs-dump-dir/@src")); 142 if (du.elementExists(root, new XPath("robots"))) { 143 robots_file = du.getAttributeValue(root, new XPath("robots/@src")); 144 robots_domain = du.getAttributeValue(root, new XPath("robots/@domain")); 145 } 146 } 147 148 153 public String getBaseURL() { 154 log.debug(".getBaseURL(): " + base_url); 155 156 return base_url; 157 } 158 159 164 public String getScopeURL() { 165 log.debug(".getScopeURL(): " + scope_url); 166 167 return scope_url; 168 } 169 170 175 public String getUserAgent() { 176 log.debug(".getUserAgent(): " + user_agent); 177 178 return user_agent; 179 } 180 181 186 public String getURIList() { 187 log.debug(".getURIList(): " + uri_list); 188 189 return uri_list; 190 } 191 192 197 public String getURIListResolved() { 198 log.debug(".getURIList(): " + uri_list); 199 200 return resolvePath(uri_list); 201 } 202 203 208 public String getHTDocsDumpDir() { 209 log.debug(".getHTDocsDumpDir(): " + htdocs_dump_dir); 210 211 return htdocs_dump_dir; 212 } 213 214 219 public String getHTDocsDumpDirResolved() { 220 221 return resolvePath(htdocs_dump_dir); 222 } 223 224 229 public String getRobotsFile() { 230 log.debug(robots_file); 231 232 return robots_file; 233 } 234 235 240 public String getRobotsFileResolved() { 241 log.debug(robots_file); 242 243 return resolvePath(robots_file); 244 } 245 246 251 public String getRobotsDomain() { 252 log.debug(robots_domain); 253 254 return robots_domain; 255 } 256 257 264 public String resolvePath(String path) { 265 266 if ( new File (path) .isAbsolute() ) { 268 return path; 269 } 270 271 try { 274 String configDir = new File (configurationFilePath) .getAbsoluteFile() .getParent(); 275 return new File (configDir, path) .getCanonicalPath(); 276 277 } catch (java.io.IOException e) { 278 e.printStackTrace(); 280 return null; 281 } 282 283 } 284 } 285 | Popular Tags |