/*
 * Copyright 1999-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

/* $Id: IterativeHTMLCrawler.java 164996 2005-04-27 15:52:29Z michi $ */
package org.apache.lenya.search.crawler;

import java.io.File;
import java.io.FileOutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.StringTokenizer;

import org.apache.log4j.Category;

import websphinx.RobotExclusion;
/**
 * Crawls a website iteratively (breadth-first over a growing URL list),
 * restricted to a configured scope, and dumps the fetched documents to disk.
 */

37 public class IterativeHTMLCrawler {
38     static Category log = Category.getInstance(IterativeHTMLCrawler.class);
39
40     java.util.Vector JavaDoc urlsToCrawl;
41     java.util.TreeSet JavaDoc urlsToCrawlLowerCase;
42     String JavaDoc url_list_file = "url_file.txt";
43     String JavaDoc html_dump_directory = "html_dump";
44     private String JavaDoc rootURL;
45     private String JavaDoc[] scopeURL;
46     private RobotExclusion robot;
47
48     /**
49      * Command line interface
50      *
51      * @param args Configuration file crawler.xconf
52      */

53     public static void main(String JavaDoc[] args) {
54         if (args.length == 0) {
55             System.err.println("Usage: IterativeHTMLCrawler crawler.xconf");
56
57             return;
58         }
59
60         try {
61             if (args.length == 1) {
62                 CrawlerConfiguration ce = new CrawlerConfiguration(args[0]);
63                 new IterativeHTMLCrawler(new File JavaDoc(args[0])).crawl(new URL JavaDoc(ce.getBaseURL()), ce.getScopeURL());
64         } else {
65                 System.err.println("Usage: IterativeHTMLCrawler crawler.xconf");
66             }
67         } catch (MalformedURLException JavaDoc e) {
68             log.error("" + e);
69         }
70     }
71
72     /**
73      * Creates a new IterativeHTMLCrawler object.
74      *
75      * @param url_list_file File where all dumped files will be listed
76      * @param html_dump_directory Directory where htdocs should be dumped
77      * @param userAgent User-agent for robots.txt
78      */

79     public IterativeHTMLCrawler(String JavaDoc url_list_file, String JavaDoc html_dump_directory, String JavaDoc userAgent) {
80         this.url_list_file = url_list_file;
81         this.html_dump_directory = html_dump_directory;
82
83         robot = new RobotExclusion(userAgent);
84     }
85
86     /**
87      * Creates a new IterativeHTMLCrawler object.
88      * @param config Configuration File
89      */

90     public IterativeHTMLCrawler(File JavaDoc config) {
91         CrawlerConfiguration ce = new CrawlerConfiguration(config.getAbsolutePath());
92
93
94         this.url_list_file = ce.getURIListResolved();
95         log.debug("URI list file: " + this.url_list_file);
96
97         this.html_dump_directory = ce.getHTDocsDumpDirResolved();
98         log.debug("HTDocs Dump Dir: " + this.html_dump_directory);
99
100         robot = new RobotExclusion(ce.getUserAgent());
101
102         String JavaDoc robots_file = ce.getRobotsFileResolved();
103         log.debug("Robots File: " + robots_file);
104         String JavaDoc robots_domain = ce.getRobotsDomain();
105         if (robots_file != null && robots_domain != null) {
106             log.debug(robots_file + " " + robots_domain);
107             robot.addLocalEntries(robots_domain, new File JavaDoc(robots_file));
108         }
109     }
110
111     /**
112      * Crawl
113      *
114      * @param start Start crawling at this URL
115      * @param scope Limit crawling to this scope
116      */

117     public void crawl(URL JavaDoc start, String JavaDoc scope) {
118         scopeURL = new String JavaDoc[1];
119         scopeURL[0] = scope;
120
121         String JavaDoc seedURL = start.toString();
122         this.rootURL = seedURL.substring(0, seedURL.indexOf("/", 8));
123
124         urlsToCrawl = new java.util.Vector JavaDoc();
125         urlsToCrawlLowerCase = new java.util.TreeSet JavaDoc();
126
127         String JavaDoc currentURLPath = start.toString().substring(0, start.toString().lastIndexOf("/"));
128
129         try {
130             log.info("Start crawling at: " + start);
131
132             if (addURL(start.getFile(), currentURLPath) != null) {
133                 dumpHTDoc(start);
134             } else {
135                 log.warn("Start URL has not been dumped: " + start);
136             }
137         } catch (MalformedURLException JavaDoc e) {
138             log.error("" + e);
139         }
140
141         int currentPosition = 0;
142
143         while (currentPosition < urlsToCrawl.size()) {
144             URL JavaDoc currentURL = (URL JavaDoc) urlsToCrawl.elementAt(currentPosition);
145             currentURLPath = currentURL.toString().substring(0, currentURL.toString().lastIndexOf("/"));
146
147             log.info("INFO: Current Array Size: " + urlsToCrawl.size() + ", Current Position: " + currentPosition + ", Current URL: " + currentURL.toString());
148
149
150             java.util.List JavaDoc urlsWithinPage = parsePage(currentURL.toString());
151
152             if (urlsWithinPage != null) {
153                 java.util.Iterator JavaDoc iterator = urlsWithinPage.iterator();
154
155                 while (iterator.hasNext()) {
156                     String JavaDoc urlCandidate = (String JavaDoc) iterator.next();
157
158                     try {
159                         URL JavaDoc urlToCrawl = null;
160
161                         if ((urlToCrawl = addURL(urlCandidate, currentURLPath)) != null) {
162                             dumpHTDoc(urlToCrawl);
163                         }
164                     } catch (MalformedURLException JavaDoc e) {
165                         log.warn("" + e + " " + urlCandidate);
166                     }
167                 }
168             }
169
170             currentPosition = currentPosition + 1;
171         }
172
173         log.info("Stop crawling at: " + urlsToCrawl.elementAt(urlsToCrawl.size()-1));
174
175
176
177         // Write all crawled URLs into file
178
try {
179             File JavaDoc parent = new File JavaDoc(new File JavaDoc(url_list_file).getParent());
180             if (!parent.isDirectory()) {
181                 parent.mkdirs();
182                 log.warn("Directory has been created: " + parent);
183             }
184             java.io.PrintWriter JavaDoc out = new java.io.PrintWriter JavaDoc(new FileOutputStream JavaDoc(url_list_file));
185
186             for (int i = 0; i < urlsToCrawl.size(); i++) {
187                 out.println("" + urlsToCrawl.elementAt(i));
188             }
189
190             out.close();
191         } catch (java.io.FileNotFoundException JavaDoc e) {
192             log.error("" + e);
193         }
194     }
195
196     /**
197      * Add URLs to crawl
198      *
199      * @param urlCandidate DOCUMENT ME!
200      * @param currentURLPath DOCUMENT ME!
201      *
202      * @return DOCUMENT ME!
203      *
204      * @throws MalformedURLException DOCUMENT ME!
205      */

206     public URL JavaDoc addURL(String JavaDoc urlCandidate, String JavaDoc currentURLPath)
207         throws MalformedURLException JavaDoc {
208         URL JavaDoc url = new URL JavaDoc(parseHREF(urlCandidate, urlCandidate.toLowerCase(), currentURLPath));
209         //completeURL(currentURL,urlCandidate) new URL(currentURLPath+"/"+urlCandidate);
210

211         if (filterURL(urlCandidate, currentURLPath, urlsToCrawlLowerCase)) {
212             if (!robot.disallowed(url)) {
213                 if (url.getQuery() == null) {
214                     urlsToCrawl.add(url);
215                     urlsToCrawlLowerCase.add(url.toString().toLowerCase());
216                     log.debug("URL added: " + url);
217                 } else {
218                     log.info("Don't crawl URLs with query string: " + url);
219                 }
220
221                 return url;
222             } else {
223                 log.info("Disallowed by robots.txt: " + urlCandidate);
224             }
225         }
226
227         return null;
228     }
229
230     /**
231      * DOCUMENT ME!
232      *
233      * @param urlString DOCUMENT ME!
234      *
235      * @return ok, 404
236      */

237     public java.util.List JavaDoc parsePage(String JavaDoc urlString) {
238         String JavaDoc status = "ok";
239
240         try {
241             URL JavaDoc currentURL = new java.net.URL JavaDoc(urlString);
242             String JavaDoc currentURLPath = urlString.substring(0, urlString.lastIndexOf("/"));
243             HttpURLConnection JavaDoc httpCon = (HttpURLConnection JavaDoc) currentURL.openConnection();
244
245             httpCon.setRequestProperty("User-Agent", "Lenya Lucene Crawler");
246
247             httpCon.connect();
248
249             long lastModified = httpCon.getLastModified();
250
251             if (httpCon.getResponseCode() == HttpURLConnection.HTTP_OK) {
252                 String JavaDoc contentType = httpCon.getContentType();
253
254                 if (contentType.indexOf("text/html") != -1) {
255                     return handleHTML(httpCon);
256                 } else if (contentType.indexOf("application/pdf") != -1) {
257                     handlePDF(httpCon);
258                 } else {
259                     status = "Not an excepted content type : " + contentType;
260                 }
261             } else {
262                 status = "bad";
263             }
264
265             httpCon.disconnect();
266         } catch (java.net.MalformedURLException JavaDoc mue) {
267             status = mue.toString();
268         } catch (java.net.UnknownHostException JavaDoc uh) {
269             status = uh.toString(); // Mark as a bad URL
270
} catch (java.io.IOException JavaDoc ioe) {
271             status = ioe.toString(); // Mark as a bad URL
272
} catch (Exception JavaDoc e) {
273             status = e.toString(); // Mark as a bad URL
274
}
275
276         //return status;
277
return null;
278     }
279
280     /**
281      * DOCUMENT ME!
282      *
283      * @param httpCon DOCUMENT ME!
284      *
285      * @return DOCUMENT ME!
286      *
287      * @throws java.io.IOException DOCUMENT ME!
288      */

289     public static java.util.List JavaDoc handleHTML(HttpURLConnection JavaDoc httpCon)
290         throws java.io.IOException JavaDoc {
291         ContentHandler handler = new HTMLHandler();
292         handler.parse(httpCon.getInputStream());
293
294         if (handler.getRobotFollow()) {
295             java.util.List JavaDoc links = handler.getLinks();
296
297             return links;
298         }
299
300         return null;
301     }
302
303     /**
304      * Parse PDF for links
305      *
306      * @param httpCon DOCUMENT ME!
307      */

308     public void handlePDF(HttpURLConnection JavaDoc httpCon) {
309         log.debug(".handlePDF(): Not handled yet!");
310     }
311
312     /**
313      * DOCUMENT ME!
314      *
315      * @param url DOCUMENT ME!
316      * @param currentURLPath DOCUMENT ME!
317      * @param links DOCUMENT ME!
318      *
319      * @return DOCUMENT ME!
320      */

321     public boolean filterURL(String JavaDoc url, String JavaDoc currentURLPath, java.util.TreeSet JavaDoc links) {
322         String JavaDoc urlLowCase = url.toLowerCase();
323
324         if (!(urlLowCase.startsWith("http://") || urlLowCase.startsWith("https://"))) {
325             url = parseHREF(url, urlLowCase, currentURLPath);
326
327             if (url != null) {
328                 urlLowCase = url.toLowerCase();
329             }
330         }
331
332         if ((url != null) && inScope(url)) {
333             if (!links.contains(urlLowCase)) {
334                 return true;
335             }
336         } else {
337             log.debug("Not in scope: " + url);
338         }
339
340         return false;
341     }
342
343     /**
344      * Parse URL and complete if necessary
345      *
346      * @param url URL from href
347      * @param urlLowCase url is lower case
348      * @param currentURLPath URL of current page
349      *
350      * @return Completed URL
351      */

352     public String JavaDoc parseHREF(String JavaDoc url, String JavaDoc urlLowCase, String JavaDoc currentURLPath) {
353         if (urlLowCase.startsWith("http://") || urlLowCase.startsWith("https://")) {
354             return url;
355         }
356
357         // Looks for incomplete URL and completes them
358
if (urlLowCase.startsWith("/")) {
359             url = rootURL + url;
360         } else if (urlLowCase.startsWith("./")) {
361             url = currentURLPath + url.substring(1, url.length());
362         } else if (urlLowCase.startsWith("../")) {
363             int back = 1;
364
365             // Count number of "../"s
366
while (urlLowCase.indexOf("../", back * 3) != -1)
367                 back++;
368
369             int pos = currentURLPath.length();
370             int count = back;
371
372             while (count-- > 0) {
373                 pos = currentURLPath.lastIndexOf("/", pos) - 1;
374             }
375
376             String JavaDoc dotsRemoved = url.substring(3 * back, url.length());
377             if (dotsRemoved.length() > 0 && dotsRemoved.charAt(0) == '.') {
378                 log.error("Parsing failed: " + url + " (" + currentURLPath + ")");
379                 url = null;
380             } else {
381                 url = currentURLPath.substring(0, pos + 2) + dotsRemoved;
382             }
383         } else if (urlLowCase.startsWith("javascript:")) {
384             // handle javascript:...
385
log.debug("\"javascript:\" is not implemented yet!");
386             url = null;
387         } else if (urlLowCase.startsWith("#")) {
388             log.debug("\"#\" (anchor) will be ignored!");
389
390             // internal anchor... ignore.
391
url = null;
392         } else if (urlLowCase.startsWith("mailto:")) {
393             log.debug("\"mailto:\" is not a URL to be followed!");
394
395             // handle mailto:...
396
url = null;
397         } else {
398             url = currentURLPath + "/" + url;
399         }
400
401         // strip anchor if exists otherwise crawler may index content multiple times
402
// links to the same url but with unique anchors would be considered unique
403
// by the crawler when they should not be
404
if (url != null) {
405             int i;
406
407             if ((i = url.indexOf("#")) != -1) {
408                 url = url.substring(0, i);
409             }
410         }
411
412
413         return url;
414     }
415
416     /**
417      * DOCUMENT ME!
418      *
419      * @param url DOCUMENT ME!
420      *
421      * @return DOCUMENT ME!
422      */

423     public boolean inScope(String JavaDoc url) {
424         for (int i = 0; i < scopeURL.length; i++) {
425             if (url.startsWith(scopeURL[i])) {
426                 return true;
427             }
428         }
429
430         return false;
431     }
432
433     /**
434      * DOCUMENT ME!
435      *
436      * @param parent DOCUMENT ME!
437      * @param child DOCUMENT ME!
438      *
439      * @return DOCUMENT ME!
440      *
441      * @throws MalformedURLException DOCUMENT ME!
442      */

443     public URL JavaDoc completeURL(URL JavaDoc parent, String JavaDoc child) throws MalformedURLException JavaDoc {
444         return parent;
445     }
446
447     /**
448      * DOCUMENT ME!
449      *
450      * @param url DOCUMENT ME!
451      */

452     public void dumpHTDoc(URL JavaDoc url) {
453         String JavaDoc ext = getExtension(url);
454
455         String JavaDoc filename = html_dump_directory + url.getFile();
456         File JavaDoc file = new File JavaDoc(filename);
457
458         if (filename.charAt(filename.length() - 1) == '/') {
459             file = new File JavaDoc(filename + "index.html");
460             ext = getExtension(file);
461         }
462
463         if (ext.equals("html") || ext.equals("htm") || ext.equals("txt") || ext.equals("pdf")) {
464             try {
465                 File JavaDoc parent = new File JavaDoc(file.getParent());
466
467                 if (!parent.exists()) {
468                     parent.mkdirs();
469                 }
470
471                 HttpURLConnection JavaDoc httpConnection = (HttpURLConnection JavaDoc) url.openConnection();
472                 java.io.InputStream JavaDoc in = httpConnection.getInputStream();
473
474         FileOutputStream JavaDoc out = new FileOutputStream JavaDoc(file);
475                 byte[] buffer = new byte[1024];
476                 int bytesRead = -1;
477                 while ((bytesRead = in.read(buffer)) >= 0) {
478                     out.write(buffer, 0, bytesRead);
479                 }
480                 out.close();
481
482 /*
483                 BufferedInputStream bin = new BufferedInputStream(in);
484                 BufferedReader reader = new BufferedReader(new InputStreamReader(bin));
485
486                 java.io.FileWriter fw = new java.io.FileWriter(file);
487                 int i;
488
489                 while ((i = reader.read()) != -1) {
490                     fw.write(i);
491                 }
492
493                 fw.close();
494
495                 bin.close();
496 */

497                 in.close();
498                 httpConnection.disconnect();
499
500                 log.info("URL dumped: " + url + " (" + file + ")");
501             } catch (Exception JavaDoc e) {
502                 log.error("" + e);
503                 log.error("URL not dumped: " + url);
504             }
505         } else {
506             log.info("URL not dumped: " + url);
507         }
508     }
509
510     /**
511      *
512      */

513 /*
514     public void saveToFile(String filename, byte[] bytes)
515         throws FileNotFoundException, IOException {
516         File file = new File(filename);
517
518         if (filename.charAt(filename.length() - 1) == '/') {
519             file = new File(filename + "index.html");
520         }
521
522         File parent = new File(file.getParent());
523
524         if (!parent.exists()) {
525             log.warn("Directory will be created: " + parent.getAbsolutePath());
526             parent.mkdirs();
527         }
528
529         FileOutputStream out = new FileOutputStream(file.getAbsolutePath());
530         out.write(bytes);
531         out.close();
532     }
533 */

534
535     /**
536      * DOCUMENT ME!
537      *
538      * @param url DOCUMENT ME!
539      *
540      * @return DOCUMENT ME!
541      */

542     public String JavaDoc getExtension(URL JavaDoc url) {
543         return getExtension(new File JavaDoc(url.getPath()));
544     }
545
546     /**
547      * DOCUMENT ME!
548      *
549      * @param file DOCUMENT ME!
550      *
551      * @return DOCUMENT ME!
552      */

553     public String JavaDoc getExtension(File JavaDoc file) {
554         StringTokenizer JavaDoc st = new StringTokenizer JavaDoc(file.getPath(), ".");
555         String JavaDoc extension = null;
556
557         while (st.hasMoreElements()) {
558             extension = st.nextToken();
559         }
560
561         return extension;
562     }
563 }
564