KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lenya > search > crawler > CrawlerConfiguration


1 /*
2  * Copyright 1999-2004 The Apache Software Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  */

17
18 /* $Id: CrawlerConfiguration.java 190175 2005-06-11 20:56:34Z gregor $ */
19
20 package org.apache.lenya.search.crawler;
21
22 import java.io.File JavaDoc;
23 import java.io.IOException JavaDoc;
24
25 import org.apache.avalon.excalibur.io.FileUtil;
26 import org.apache.lenya.xml.DOMUtil;
27 import org.apache.lenya.xml.DocumentHelper;
28 import org.apache.lenya.xml.XPath;
29 import org.apache.log4j.Category;
30 import org.w3c.dom.Document JavaDoc;
31 import org.w3c.dom.Element JavaDoc;
32
33
34 /**
35  * Web-Crawler (it might make sense to replace this by Nutch)
36  */

37 public class CrawlerConfiguration {
38     static Category log = Category.getInstance(CrawlerConfiguration.class);
39     private String JavaDoc configurationFilePath;
40     private String JavaDoc base_url;
41     private String JavaDoc user_agent;
42     private String JavaDoc scope_url;
43     private String JavaDoc uri_list;
44     private String JavaDoc htdocs_dump_dir;
45     private String JavaDoc robots_file;
46     private String JavaDoc robots_domain;
47
48     /**
49      * Creates a new CrawlerConfiguration object.
50      *
51      * @param configurationFilePath DOCUMENT ME!
52      */

53     public CrawlerConfiguration(String JavaDoc configurationFilePath) {
54         this.configurationFilePath = configurationFilePath;
55
56         File JavaDoc configurationFile = new File JavaDoc(configurationFilePath);
57
58         try {
59             Document JavaDoc document = DocumentHelper.readDocument(configurationFile);
60             configure(document.getDocumentElement());
61         } catch (Exception JavaDoc e) {
62             log.error("Cannot load publishing configuration! ", e);
63         }
64     }
65
66     /**
67      * DOCUMENT ME!
68      *
69      * @param args DOCUMENT ME!
70      */

71     public static void main(String JavaDoc[] args) {
72         if (args.length == 0) {
73             System.err.println(
74                 "Usage: org.apache.lenya.search.crawler.CrawlerConfiguration crawler.xconf [-name <name>]");
75
76             return;
77         }
78
79         CrawlerConfiguration ce = new CrawlerConfiguration(args[0]);
80         String JavaDoc parameter;
81
82         String JavaDoc name = null;
83
84         for (int i = 0; i < args.length; i++) {
85             if (args[i].equals("-name")) {
86                 if ((i + 1) < args.length) {
87                     name = args[i + 1];
88                 }
89             }
90         }
91
92         if (name != null) {
93             if (name.equals("htdocs-dump-dir")) {
94                 parameter = ce.getHTDocsDumpDir();
95                 System.out.println(ce.resolvePath(parameter));
96             } else {
97                 System.out.println("No such element: " + name);
98             }
99         } else {
100             parameter = ce.getBaseURL();
101             System.out.println("Crawler Config: Base URL: " + parameter);
102
103             parameter = ce.getScopeURL();
104             System.out.println("Crawler Config: Scope URL: " + parameter);
105
106             parameter = ce.getUserAgent();
107             System.out.println("Crawler Config: User Agent: " + parameter);
108
109             parameter = ce.getURIList();
110             System.out.println("Crawler Config: URI List: " + ce.resolvePath(parameter) + " (" + parameter + ")");
111
112             parameter = ce.getHTDocsDumpDir();
113             System.out.println("Crawler Config: HTDocs Dump Dir: " + ce.resolvePath(parameter) + " (" + parameter + ")");
114
115             parameter = ce.getRobotsFile();
116             if (parameter != null) {
117                 System.out.println("Crawler Config: Robots File: " + ce.resolvePath(parameter + " (" + parameter + ")"));
118             }
119
120             parameter = ce.getRobotsDomain();
121             if (parameter != null) {
122                 System.out.println("Crawler Config: Robots Domain: " + parameter);
123             }
124         }
125     }
126
127     /**
128      * Extract parameters from configuration
129      *
130      * @param configuration DOCUMENT ME!
131      *
132      * @throws Exception DOCUMENT ME!
133      */

134     public void configure(Element JavaDoc root) throws Exception JavaDoc {
135         DOMUtil du = new DOMUtil();
136
137         base_url = du.getAttributeValue(root, new XPath("base-url/@href"));
138         scope_url = du.getAttributeValue(root, new XPath("scope-url/@href"));
139         user_agent = du.getElementValue(root, new XPath("user-agent"));
140         uri_list = du.getAttributeValue(root, new XPath("uri-list/@src"));
141         htdocs_dump_dir = du.getAttributeValue(root, new XPath("htdocs-dump-dir/@src"));
142         if (du.elementExists(root, new XPath("robots"))) {
143             robots_file = du.getAttributeValue(root, new XPath("robots/@src"));
144             robots_domain = du.getAttributeValue(root, new XPath("robots/@domain"));
145         }
146     }
147
148     /**
149      * DOCUMENT ME!
150      *
151      * @return DOCUMENT ME!
152      */

153     public String JavaDoc getBaseURL() {
154         log.debug(".getBaseURL(): " + base_url);
155
156         return base_url;
157     }
158
159     /**
160      * DOCUMENT ME!
161      *
162      * @return DOCUMENT ME!
163      */

164     public String JavaDoc getScopeURL() {
165         log.debug(".getScopeURL(): " + scope_url);
166
167         return scope_url;
168     }
169
170     /**
171      * DOCUMENT ME!
172      *
173      * @return DOCUMENT ME!
174      */

175     public String JavaDoc getUserAgent() {
176         log.debug(".getUserAgent(): " + user_agent);
177
178         return user_agent;
179     }
180
181     /**
182      * Get URI list path
183      *
184      * @return URI list path
185      */

186     public String JavaDoc getURIList() {
187         log.debug(".getURIList(): " + uri_list);
188
189         return uri_list;
190     }
191
192     /**
193      * Get URI list path as absolute path
194      *
195      * @return URI list path
196      */

197     public String JavaDoc getURIListResolved() {
198         log.debug(".getURIList(): " + uri_list);
199
200         return resolvePath(uri_list);
201     }
202
203     /**
204      * Get htdocs-dump-dir/@src
205      *
206      * @return htdocs-dump-dir/@src
207      */

208     public String JavaDoc getHTDocsDumpDir() {
209         log.debug(".getHTDocsDumpDir(): " + htdocs_dump_dir);
210
211         return htdocs_dump_dir;
212     }
213
214     /**
215      * Get htdocs-dump-dir/@src as absolute path
216      *
217      * @return htdocs-dump-dir/@src
218      */

219     public String JavaDoc getHTDocsDumpDirResolved() {
220
221         return resolvePath(htdocs_dump_dir);
222     }
223
224     /**
225      * Get robots/@src
226      *
227      * @return robots/@src
228      */

229     public String JavaDoc getRobotsFile() {
230         log.debug(robots_file);
231
232         return robots_file;
233     }
234
235     /**
236      * Get robots/@src as absolute path
237      *
238      * @return robots/@src
239      */

240     public String JavaDoc getRobotsFileResolved() {
241         log.debug(robots_file);
242
243         return resolvePath(robots_file);
244     }
245
246     /**
247      * Get robots/@domain
248      *
249      * @return robots/@domain
250      */

251     public String JavaDoc getRobotsDomain() {
252         log.debug(robots_domain);
253
254         return robots_domain;
255     }
256
257     /**
258      * Resolve path
259      *
260      * @param path Original path
261      *
262      * @return Resolved path
263      */

264     public String JavaDoc resolvePath(String JavaDoc path) {
265
266         // nothing to do if we already have an absolute pathname
267
if ( new File JavaDoc(path) .isAbsolute() ) {
268             return path;
269         }
270
271         // from the Java API doc: "A canonical pathname is both absolute and unique."
272
// however we may get an exception while converting a path to it's canonical form
273
try {
274             String JavaDoc configDir = new File JavaDoc(configurationFilePath) .getAbsoluteFile() .getParent();
275             return new File JavaDoc(configDir, path) .getCanonicalPath();
276
277         } catch (java.io.IOException JavaDoc e) {
278             // FIXME: maybe this Exception should be thrown to the caller ?
279
e.printStackTrace();
280             return null;
281         }
282
283     }
284 }
285
Popular Tags