IterativeHTMLCrawler


1   /*
2    * Copyright  1999-2004 The Apache Software Foundation
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *
16   */
17  
18  /* $Id: IterativeHTMLCrawler.java 164996 2005-04-27 15:52:29Z michi $  */
19  
20  package org.apache.lenya.search.crawler;
21  
22  import java.io.File  ;
23  import java.io.FileOutputStream  ;
24  import java.net.HttpURLConnection  ;
25  import java.net.MalformedURLException  ;
26  import java.net.URL  ;
27  import java.util.StringTokenizer  ;
28  
29  import websphinx.RobotExclusion;
30  
31  import org.apache.log4j.Category;
32  
33  
34  /**
35   * Crawl iteratively
36   */
37  public class IterativeHTMLCrawler {
38      static Category log = Category.getInstance(IterativeHTMLCrawler.class);
39  
40      java.util.Vector   urlsToCrawl;
41      java.util.TreeSet   urlsToCrawlLowerCase;
42      String   url_list_file = "url_file.txt";
43      String   html_dump_directory = "html_dump";
44      private String   rootURL;
45      private String  [] scopeURL;
46      private RobotExclusion robot;
47  
48      /**
49       * Command line interface
50       *
51       * @param args Configuration file crawler.xconf
52       */
53      public static void main(String  [] args) {
54          if (args.length == 0) {
55              System.err.println("Usage: IterativeHTMLCrawler crawler.xconf");
56  
57              return;
58          }
59  
60          try {
61              if (args.length == 1) {
62                  CrawlerConfiguration ce = new CrawlerConfiguration(args[0]);
63                  new IterativeHTMLCrawler(new File  (args[0])).crawl(new URL  (ce.getBaseURL()), ce.getScopeURL());
64          } else {
65                  System.err.println("Usage: IterativeHTMLCrawler crawler.xconf");
66              }
67          } catch (MalformedURLException   e) {
68              log.error("" + e);
69          }
70      }
71  
72      /**
73       * Creates a new IterativeHTMLCrawler object.
74       *
75       * @param url_list_file File where all dumped files will be listed
76       * @param html_dump_directory Directory where htdocs should be dumped
77       * @param userAgent User-agent for robots.txt
78       */
79      public IterativeHTMLCrawler(String   url_list_file, String   html_dump_directory, String   userAgent) {
80          this.url_list_file = url_list_file;
81          this.html_dump_directory = html_dump_directory;
82  
83          robot = new RobotExclusion(userAgent);
84      }
85  
86      /**
87       * Creates a new IterativeHTMLCrawler object.
88       * @param config Configuration File
89       */
90      public IterativeHTMLCrawler(File   config) {
91          CrawlerConfiguration ce = new CrawlerConfiguration(config.getAbsolutePath());
92  
93  
94          this.url_list_file = ce.getURIListResolved();
95          log.debug("URI list file: " + this.url_list_file);
96  
97          this.html_dump_directory = ce.getHTDocsDumpDirResolved();
98          log.debug("HTDocs Dump Dir: " + this.html_dump_directory);
99  
100         robot = new RobotExclusion(ce.getUserAgent());
101 
102         String   robots_file = ce.getRobotsFileResolved();
103         log.debug("Robots File: " + robots_file);
104         String   robots_domain = ce.getRobotsDomain();
105         if (robots_file != null && robots_domain != null) {
106             log.debug(robots_file + " " + robots_domain);
107             robot.addLocalEntries(robots_domain, new File  (robots_file));
108         }
109     }
110 
111     /**
112      * Crawl
113      *
114      * @param start Start crawling at this URL
115      * @param scope Limit crawling to this scope
116      */
117     public void crawl(URL   start, String   scope) {
118         scopeURL = new String  [1];
119         scopeURL[0] = scope;
120 
121         String   seedURL = start.toString();
122         this.rootURL = seedURL.substring(0, seedURL.indexOf("/", 8));
123 
124         urlsToCrawl = new java.util.Vector  ();
125         urlsToCrawlLowerCase = new java.util.TreeSet  ();
126 
127         String   currentURLPath = start.toString().substring(0, start.toString().lastIndexOf("/"));
128 
129         try {
130             log.info("Start crawling at: " + start);
131 
132             if (addURL(start.getFile(), currentURLPath) != null) {
133                 dumpHTDoc(start);
134             } else {
135                 log.warn("Start URL has not been dumped: " + start);
136             }
137         } catch (MalformedURLException   e) {
138             log.error("" + e);
139         }
140 
141         int currentPosition = 0;
142 
143         while (currentPosition < urlsToCrawl.size()) {
144             URL   currentURL = (URL  ) urlsToCrawl.elementAt(currentPosition);
145             currentURLPath = currentURL.toString().substring(0, currentURL.toString().lastIndexOf("/"));
146 
147             log.info("INFO: Current Array Size: " + urlsToCrawl.size() + ", Current Position: " + currentPosition + ", Current URL: " + currentURL.toString());
148 
149 
150             java.util.List   urlsWithinPage = parsePage(currentURL.toString());
151 
152             if (urlsWithinPage != null) {
153                 java.util.Iterator   iterator = urlsWithinPage.iterator();
154 
155                 while (iterator.hasNext()) {
156                     String   urlCandidate = (String  ) iterator.next();
157 
158                     try {
159                         URL   urlToCrawl = null;
160 
161                         if ((urlToCrawl = addURL(urlCandidate, currentURLPath)) != null) {
162                             dumpHTDoc(urlToCrawl);
163                         }
164                     } catch (MalformedURLException   e) {
165                         log.warn("" + e + " " + urlCandidate);
166                     }
167                 }
168             }
169 
170             currentPosition = currentPosition + 1;
171         }
172 
173         log.info("Stop crawling at: " + urlsToCrawl.elementAt(urlsToCrawl.size()-1));
174 
175 
176 
177         // Write all crawled URLs into file
178         try {
179             File   parent = new File  (new File  (url_list_file).getParent());
180             if (!parent.isDirectory()) {
181                 parent.mkdirs();
182                 log.warn("Directory has been created: " + parent);
183             }
184             java.io.PrintWriter   out = new java.io.PrintWriter  (new FileOutputStream  (url_list_file));
185 
186             for (int i = 0; i < urlsToCrawl.size(); i++) {
187                 out.println("" + urlsToCrawl.elementAt(i));
188             }
189 
190             out.close();
191         } catch (java.io.FileNotFoundException   e) {
192             log.error("" + e);
193         }
194     }
195 
196     /**
197      * Add URLs to crawl
198      *
199      * @param urlCandidate DOCUMENT ME!
200      * @param currentURLPath DOCUMENT ME!
201      *
202      * @return DOCUMENT ME!
203      *
204      * @throws MalformedURLException DOCUMENT ME!
205      */
206     public URL   addURL(String   urlCandidate, String   currentURLPath)
207         throws MalformedURLException   {
208         URL   url = new URL  (parseHREF(urlCandidate, urlCandidate.toLowerCase(), currentURLPath));
209         //completeURL(currentURL,urlCandidate)  new URL(currentURLPath+"/"+urlCandidate);
210 
211         if (filterURL(urlCandidate, currentURLPath, urlsToCrawlLowerCase)) {
212             if (!robot.disallowed(url)) {
213                 if (url.getQuery() == null) {
214                     urlsToCrawl.add(url);
215                     urlsToCrawlLowerCase.add(url.toString().toLowerCase());
216                     log.debug("URL added: " + url);
217                 } else {
218                     log.info("Don't crawl URLs with query string: " + url);
219                 }
220 
221                 return url;
222             } else {
223                 log.info("Disallowed by robots.txt: " + urlCandidate);
224             }
225         }
226 
227         return null;
228     }
229 
230     /**
231      * DOCUMENT ME!
232      *
233      * @param urlString DOCUMENT ME!
234      *
235      * @return ok, 404
236      */
237     public java.util.List   parsePage(String   urlString) {
238         String   status = "ok";
239 
240         try {
241             URL   currentURL = new java.net.URL  (urlString);
242             String   currentURLPath = urlString.substring(0, urlString.lastIndexOf("/"));
243             HttpURLConnection   httpCon = (HttpURLConnection  ) currentURL.openConnection();
244 
245             httpCon.setRequestProperty("User-Agent", "Lenya Lucene Crawler");
246 
247             httpCon.connect();
248 
249             long lastModified = httpCon.getLastModified();
250 
251             if (httpCon.getResponseCode() == HttpURLConnection.HTTP_OK) {
252                 String   contentType = httpCon.getContentType();
253 
254                 if (contentType.indexOf("text/html") != -1) {
255                     return handleHTML(httpCon);
256                 } else if (contentType.indexOf("application/pdf") != -1) {
257                     handlePDF(httpCon);
258                 } else {
259                     status = "Not an excepted content type : " + contentType;
260                 }
261             } else {
262                 status = "bad";
263             }
264 
265             httpCon.disconnect();
266         } catch (java.net.MalformedURLException   mue) {
267             status = mue.toString();
268         } catch (java.net.UnknownHostException   uh) {
269             status = uh.toString(); // Mark as a bad URL
270         } catch (java.io.IOException   ioe) {
271             status = ioe.toString(); // Mark as a bad URL
272         } catch (Exception   e) {
273             status = e.toString(); // Mark as a bad URL
274         }
275 
276         //return status;
277         return null;
278     }
279 
280     /**
281      * DOCUMENT ME!
282      *
283      * @param httpCon DOCUMENT ME!
284      *
285      * @return DOCUMENT ME!
286      *
287      * @throws java.io.IOException DOCUMENT ME!
288      */
289     public static java.util.List   handleHTML(HttpURLConnection   httpCon)
290         throws java.io.IOException   {
291         ContentHandler handler = new HTMLHandler();
292         handler.parse(httpCon.getInputStream());
293 
294         if (handler.getRobotFollow()) {
295             java.util.List   links = handler.getLinks();
296 
297             return links;
298         }
299 
300         return null;
301     }
302 
303     /**
304      * Parse PDF for links
305      *
306      * @param httpCon DOCUMENT ME!
307      */
308     public void handlePDF(HttpURLConnection   httpCon) {
309         log.debug(".handlePDF(): Not handled yet!");
310     }
311 
312     /**
313      * DOCUMENT ME!
314      *
315      * @param url DOCUMENT ME!
316      * @param currentURLPath DOCUMENT ME!
317      * @param links DOCUMENT ME!
318      *
319      * @return DOCUMENT ME!
320      */
321     public boolean filterURL(String   url, String   currentURLPath, java.util.TreeSet   links) {
322         String   urlLowCase = url.toLowerCase();
323 
324         if (!(urlLowCase.startsWith("http://") || urlLowCase.startsWith("https://"))) {
325             url = parseHREF(url, urlLowCase, currentURLPath);
326 
327             if (url != null) {
328                 urlLowCase = url.toLowerCase();
329             }
330         }
331 
332         if ((url != null) && inScope(url)) {
333             if (!links.contains(urlLowCase)) {
334                 return true;
335             }
336         } else {
337             log.debug("Not in scope: " + url);
338         }
339 
340         return false;
341     }
342 
343     /**
344      * Parse URL and complete if necessary
345      *
346      * @param url URL from href
347      * @param urlLowCase url is lower case
348      * @param currentURLPath URL of current page
349      *
350      * @return Completed URL
351      */
352     public String   parseHREF(String   url, String   urlLowCase, String   currentURLPath) {
353         if (urlLowCase.startsWith("http://") || urlLowCase.startsWith("https://")) {
354             return url;
355         }
356 
357         // Looks for incomplete URL and completes them
358         if (urlLowCase.startsWith("/")) {
359             url = rootURL + url;
360         } else if (urlLowCase.startsWith("./")) {
361             url = currentURLPath + url.substring(1, url.length());
362         } else if (urlLowCase.startsWith("../")) {
363             int back = 1;
364 
365             // Count number of "../"s
366             while (urlLowCase.indexOf("../", back * 3) != -1)
367                 back++;
368 
369             int pos = currentURLPath.length();
370             int count = back;
371 
372             while (count-- > 0) {
373                 pos = currentURLPath.lastIndexOf("/", pos) - 1;
374             }
375 
376             String   dotsRemoved = url.substring(3 * back, url.length());
377             if (dotsRemoved.length() > 0 && dotsRemoved.charAt(0) == '.') {
378                 log.error("Parsing failed: " + url + " (" + currentURLPath + ")");
379                 url = null;
380             } else {
381                 url = currentURLPath.substring(0, pos + 2) + dotsRemoved;
382             }
383         } else if (urlLowCase.startsWith("javascript:")) {
384             // handle javascript:...
385             log.debug("\"javascript:\" is not implemented yet!");
386             url = null;
387         } else if (urlLowCase.startsWith("#")) {
388             log.debug("\"#\" (anchor) will be ignored!");
389 
390             // internal anchor... ignore.
391             url = null;
392         } else if (urlLowCase.startsWith("mailto:")) {
393             log.debug("\"mailto:\" is not a URL to be followed!");
394 
395             // handle mailto:...
396             url = null;
397         } else {
398             url = currentURLPath + "/" + url;
399         }
400 
401         // strip anchor if exists otherwise crawler may index content multiple times
402         // links to the same url but with unique anchors would be considered unique
403         // by the crawler when they should not be
404         if (url != null) {
405             int i;
406 
407             if ((i = url.indexOf("#")) != -1) {
408                 url = url.substring(0, i);
409             }
410         }
411 
412 
413         return url;
414     }
415 
416     /**
417      * DOCUMENT ME!
418      *
419      * @param url DOCUMENT ME!
420      *
421      * @return DOCUMENT ME!
422      */
423     public boolean inScope(String   url) {
424         for (int i = 0; i < scopeURL.length; i++) {
425             if (url.startsWith(scopeURL[i])) {
426                 return true;
427             }
428         }
429 
430         return false;
431     }
432 
433     /**
434      * DOCUMENT ME!
435      *
436      * @param parent DOCUMENT ME!
437      * @param child DOCUMENT ME!
438      *
439      * @return DOCUMENT ME!
440      *
441      * @throws MalformedURLException DOCUMENT ME!
442      */
443     public URL   completeURL(URL   parent, String   child) throws MalformedURLException   {
444         return parent;
445     }
446 
447     /**
448      * DOCUMENT ME!
449      *
450      * @param url DOCUMENT ME!
451      */
452     public void dumpHTDoc(URL   url) {
453         String   ext = getExtension(url);
454 
455         String   filename = html_dump_directory + url.getFile();
456         File   file = new File  (filename);
457 
458         if (filename.charAt(filename.length() - 1) == '/') {
459             file = new File  (filename + "index.html");
460             ext = getExtension(file);
461         }
462 
463         if (ext.equals("html") || ext.equals("htm") || ext.equals("txt") || ext.equals("pdf")) {
464             try {
465                 File   parent = new File  (file.getParent());
466 
467                 if (!parent.exists()) {
468                     parent.mkdirs();
469                 }
470 
471                 HttpURLConnection   httpConnection = (HttpURLConnection  ) url.openConnection();
472                 java.io.InputStream   in = httpConnection.getInputStream();
473 
474         FileOutputStream   out = new FileOutputStream  (file);
475                 byte[] buffer = new byte[1024];
476                 int bytesRead = -1;
477                 while ((bytesRead = in.read(buffer)) >= 0) {
478                     out.write(buffer, 0, bytesRead);
479                 }
480                 out.close();
481 
482 /*
483                 BufferedInputStream bin = new BufferedInputStream(in);
484                 BufferedReader reader = new BufferedReader(new InputStreamReader(bin));
485 
486                 java.io.FileWriter fw = new java.io.FileWriter(file);
487                 int i;
488 
489                 while ((i = reader.read()) != -1) {
490                     fw.write(i);
491                 }
492 
493                 fw.close();
494 
495                 bin.close();
496 */
497                 in.close();
498                 httpConnection.disconnect();
499 
500                 log.info("URL dumped: " + url + " (" + file + ")");
501             } catch (Exception   e) {
502                 log.error("" + e);
503                 log.error("URL not dumped: " + url);
504             }
505         } else {
506             log.info("URL not dumped: " + url);
507         }
508     }
509 
510     /**
511      *
512      */
513 /*
514     public void saveToFile(String filename, byte[] bytes)
515         throws FileNotFoundException, IOException {
516         File file = new File(filename);
517 
518         if (filename.charAt(filename.length() - 1) == '/') {
519             file = new File(filename + "index.html");
520         }
521 
522         File parent = new File(file.getParent());
523 
524         if (!parent.exists()) {
525             log.warn("Directory will be created: " + parent.getAbsolutePath());
526             parent.mkdirs();
527         }
528 
529         FileOutputStream out = new FileOutputStream(file.getAbsolutePath());
530         out.write(bytes);
531         out.close();
532     }
533 */
534 
535     /**
536      * DOCUMENT ME!
537      *
538      * @param url DOCUMENT ME!
539      *
540      * @return DOCUMENT ME!
541      */
542     public String   getExtension(URL   url) {
543         return getExtension(new File  (url.getPath()));
544     }
545 
546     /**
547      * DOCUMENT ME!
548      *
549      * @param file DOCUMENT ME!
550      *
551      * @return DOCUMENT ME!
552      */
553     public String   getExtension(File   file) {
554         StringTokenizer   st = new StringTokenizer  (file.getPath(), ".");
555         String   extension = null;
556 
557         while (st.hasMoreElements()) {
558             extension = st.nextToken();
559         }
560 
561         return extension;
562     }
563 }
564
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags