|                                                                                                              1
 17
 18
 19
 20  package org.apache.lenya.search.crawler;
 21
 22  import java.io.File
  ; 23  import java.io.FileOutputStream
  ; 24  import java.net.HttpURLConnection
  ; 25  import java.net.MalformedURLException
  ; 26  import java.net.URL
  ; 27  import java.util.StringTokenizer
  ; 28
 29  import websphinx.RobotExclusion;
 30
 31  import org.apache.log4j.Category;
 32
 33
 34
 37  public class IterativeHTMLCrawler {
 38      static Category log = Category.getInstance(IterativeHTMLCrawler.class);
 39
 40      java.util.Vector
  urlsToCrawl; 41      java.util.TreeSet
  urlsToCrawlLowerCase; 42      String
  url_list_file = "url_file.txt"; 43      String
  html_dump_directory = "html_dump"; 44      private String
  rootURL; 45      private String
  [] scopeURL; 46      private RobotExclusion robot;
 47
 48
 53      public static void main(String
  [] args) { 54          if (args.length == 0) {
 55              System.err.println("Usage: IterativeHTMLCrawler crawler.xconf");
 56
 57              return;
 58          }
 59
 60          try {
 61              if (args.length == 1) {
 62                  CrawlerConfiguration ce = new CrawlerConfiguration(args[0]);
 63                  new IterativeHTMLCrawler(new File
  (args[0])).crawl(new URL  (ce.getBaseURL()), ce.getScopeURL()); 64          } else {
 65                  System.err.println("Usage: IterativeHTMLCrawler crawler.xconf");
 66              }
 67          } catch (MalformedURLException
  e) { 68              log.error("" + e);
 69          }
 70      }
 71
 72
 79      public IterativeHTMLCrawler(String
  url_list_file, String  html_dump_directory, String  userAgent) { 80          this.url_list_file = url_list_file;
 81          this.html_dump_directory = html_dump_directory;
 82
 83          robot = new RobotExclusion(userAgent);
 84      }
 85
 86
 90      public IterativeHTMLCrawler(File
  config) { 91          CrawlerConfiguration ce = new CrawlerConfiguration(config.getAbsolutePath());
 92
 93
 94          this.url_list_file = ce.getURIListResolved();
 95          log.debug("URI list file: " + this.url_list_file);
 96
 97          this.html_dump_directory = ce.getHTDocsDumpDirResolved();
 98          log.debug("HTDocs Dump Dir: " + this.html_dump_directory);
 99
 100         robot = new RobotExclusion(ce.getUserAgent());
 101
 102         String
  robots_file = ce.getRobotsFileResolved(); 103         log.debug("Robots File: " + robots_file);
 104         String
  robots_domain = ce.getRobotsDomain(); 105         if (robots_file != null && robots_domain != null) {
 106             log.debug(robots_file + " " + robots_domain);
 107             robot.addLocalEntries(robots_domain, new File
  (robots_file)); 108         }
 109     }
 110
 111
 117     public void crawl(URL
  start, String  scope) { 118         scopeURL = new String
  [1]; 119         scopeURL[0] = scope;
 120
 121         String
  seedURL = start.toString(); 122         this.rootURL = seedURL.substring(0, seedURL.indexOf("/", 8));
 123
 124         urlsToCrawl = new java.util.Vector
  (); 125         urlsToCrawlLowerCase = new java.util.TreeSet
  (); 126
 127         String
  currentURLPath = start.toString().substring(0, start.toString().lastIndexOf("/")); 128
 129         try {
 130             log.info("Start crawling at: " + start);
 131
 132             if (addURL(start.getFile(), currentURLPath) != null) {
 133                 dumpHTDoc(start);
 134             } else {
 135                 log.warn("Start URL has not been dumped: " + start);
 136             }
 137         } catch (MalformedURLException
  e) { 138             log.error("" + e);
 139         }
 140
 141         int currentPosition = 0;
 142
 143         while (currentPosition < urlsToCrawl.size()) {
 144             URL
  currentURL = (URL  ) urlsToCrawl.elementAt(currentPosition); 145             currentURLPath = currentURL.toString().substring(0, currentURL.toString().lastIndexOf("/"));
 146
 147             log.info("INFO: Current Array Size: " + urlsToCrawl.size() + ", Current Position: " + currentPosition + ", Current URL: " + currentURL.toString());
 148
 149
 150             java.util.List
  urlsWithinPage = parsePage(currentURL.toString()); 151
 152             if (urlsWithinPage != null) {
 153                 java.util.Iterator
  iterator = urlsWithinPage.iterator(); 154
 155                 while (iterator.hasNext()) {
 156                     String
  urlCandidate = (String  ) iterator.next(); 157
 158                     try {
 159                         URL
  urlToCrawl = null; 160
 161                         if ((urlToCrawl = addURL(urlCandidate, currentURLPath)) != null) {
 162                             dumpHTDoc(urlToCrawl);
 163                         }
 164                     } catch (MalformedURLException
  e) { 165                         log.warn("" + e + " " + urlCandidate);
 166                     }
 167                 }
 168             }
 169
 170             currentPosition = currentPosition + 1;
 171         }
 172
 173         log.info("Stop crawling at: " + urlsToCrawl.elementAt(urlsToCrawl.size()-1));
 174
 175
 176
 177                 try {
 179             File
  parent = new File  (new File  (url_list_file).getParent()); 180             if (!parent.isDirectory()) {
 181                 parent.mkdirs();
 182                 log.warn("Directory has been created: " + parent);
 183             }
 184             java.io.PrintWriter
  out = new java.io.PrintWriter  (new FileOutputStream  (url_list_file)); 185
 186             for (int i = 0; i < urlsToCrawl.size(); i++) {
 187                 out.println("" + urlsToCrawl.elementAt(i));
 188             }
 189
 190             out.close();
 191         } catch (java.io.FileNotFoundException
  e) { 192             log.error("" + e);
 193         }
 194     }
 195
 196
 206     public URL
  addURL(String  urlCandidate, String  currentURLPath) 207         throws MalformedURLException
  { 208         URL
  url = new URL  (parseHREF(urlCandidate, urlCandidate.toLowerCase(), currentURLPath)); 209
 211         if (filterURL(urlCandidate, currentURLPath, urlsToCrawlLowerCase)) {
 212             if (!robot.disallowed(url)) {
 213                 if (url.getQuery() == null) {
 214                     urlsToCrawl.add(url);
 215                     urlsToCrawlLowerCase.add(url.toString().toLowerCase());
 216                     log.debug("URL added: " + url);
 217                 } else {
 218                     log.info("Don't crawl URLs with query string: " + url);
 219                 }
 220
 221                 return url;
 222             } else {
 223                 log.info("Disallowed by robots.txt: " + urlCandidate);
 224             }
 225         }
 226
 227         return null;
 228     }
 229
 230
 237     public java.util.List
  parsePage(String  urlString) { 238         String
  status = "ok"; 239
 240         try {
 241             URL
  currentURL = new java.net.URL  (urlString); 242             String
  currentURLPath = urlString.substring(0, urlString.lastIndexOf("/")); 243             HttpURLConnection
  httpCon = (HttpURLConnection  ) currentURL.openConnection(); 244
 245             httpCon.setRequestProperty("User-Agent", "Lenya Lucene Crawler");
 246
 247             httpCon.connect();
 248
 249             long lastModified = httpCon.getLastModified();
 250
 251             if (httpCon.getResponseCode() == HttpURLConnection.HTTP_OK) {
 252                 String
  contentType = httpCon.getContentType(); 253
 254                 if (contentType.indexOf("text/html") != -1) {
 255                     return handleHTML(httpCon);
 256                 } else if (contentType.indexOf("application/pdf") != -1) {
 257                     handlePDF(httpCon);
 258                 } else {
 259                     status = "Not an excepted content type : " + contentType;
 260                 }
 261             } else {
 262                 status = "bad";
 263             }
 264
 265             httpCon.disconnect();
 266         } catch (java.net.MalformedURLException
  mue) { 267             status = mue.toString();
 268         } catch (java.net.UnknownHostException
  uh) { 269             status = uh.toString();         } catch (java.io.IOException
  ioe) { 271             status = ioe.toString();         } catch (Exception
  e) { 273             status = e.toString();         }
 275
 276                 return null;
 278     }
 279
 280
 289     public static java.util.List
  handleHTML(HttpURLConnection  httpCon) 290         throws java.io.IOException
  { 291         ContentHandler handler = new HTMLHandler();
 292         handler.parse(httpCon.getInputStream());
 293
 294         if (handler.getRobotFollow()) {
 295             java.util.List
  links = handler.getLinks(); 296
 297             return links;
 298         }
 299
 300         return null;
 301     }
 302
 303
 308     public void handlePDF(HttpURLConnection
  httpCon) { 309         log.debug(".handlePDF(): Not handled yet!");
 310     }
 311
 312
 321     public boolean filterURL(String
  url, String  currentURLPath, java.util.TreeSet  links) { 322         String
  urlLowCase = url.toLowerCase(); 323
 324         if (!(urlLowCase.startsWith("http://") || urlLowCase.startsWith("https://"))) {
 325             url = parseHREF(url, urlLowCase, currentURLPath);
 326
 327             if (url != null) {
 328                 urlLowCase = url.toLowerCase();
 329             }
 330         }
 331
 332         if ((url != null) && inScope(url)) {
 333             if (!links.contains(urlLowCase)) {
 334                 return true;
 335             }
 336         } else {
 337             log.debug("Not in scope: " + url);
 338         }
 339
 340         return false;
 341     }
 342
 343
 352     public String
  parseHREF(String  url, String  urlLowCase, String  currentURLPath) { 353         if (urlLowCase.startsWith("http://") || urlLowCase.startsWith("https://")) {
 354             return url;
 355         }
 356
 357                 if (urlLowCase.startsWith("/")) {
 359             url = rootURL + url;
 360         } else if (urlLowCase.startsWith("./")) {
 361             url = currentURLPath + url.substring(1, url.length());
 362         } else if (urlLowCase.startsWith("../")) {
 363             int back = 1;
 364
 365                         while (urlLowCase.indexOf("../", back * 3) != -1)
 367                 back++;
 368
 369             int pos = currentURLPath.length();
 370             int count = back;
 371
 372             while (count-- > 0) {
 373                 pos = currentURLPath.lastIndexOf("/", pos) - 1;
 374             }
 375
 376             String
  dotsRemoved = url.substring(3 * back, url.length()); 377             if (dotsRemoved.length() > 0 && dotsRemoved.charAt(0) == '.') {
 378                 log.error("Parsing failed: " + url + " (" + currentURLPath + ")");
 379                 url = null;
 380             } else {
 381                 url = currentURLPath.substring(0, pos + 2) + dotsRemoved;
 382             }
 383         } else if (urlLowCase.startsWith("javascript:")) {
 384                         log.debug("\"javascript:\" is not implemented yet!");
 386             url = null;
 387         } else if (urlLowCase.startsWith("#")) {
 388             log.debug("\"#\" (anchor) will be ignored!");
 389
 390                         url = null;
 392         } else if (urlLowCase.startsWith("mailto:")) {
 393             log.debug("\"mailto:\" is not a URL to be followed!");
 394
 395                         url = null;
 397         } else {
 398             url = currentURLPath + "/" + url;
 399         }
 400
 401                                 if (url != null) {
 405             int i;
 406
 407             if ((i = url.indexOf("#")) != -1) {
 408                 url = url.substring(0, i);
 409             }
 410         }
 411
 412
 413         return url;
 414     }
 415
 416
 423     public boolean inScope(String
  url) { 424         for (int i = 0; i < scopeURL.length; i++) {
 425             if (url.startsWith(scopeURL[i])) {
 426                 return true;
 427             }
 428         }
 429
 430         return false;
 431     }
 432
 433
 443     public URL
  completeURL(URL  parent, String  child) throws MalformedURLException  { 444         return parent;
 445     }
 446
 447
 452     public void dumpHTDoc(URL
  url) { 453         String
  ext = getExtension(url); 454
 455         String
  filename = html_dump_directory + url.getFile(); 456         File
  file = new File  (filename); 457
 458         if (filename.charAt(filename.length() - 1) == '/') {
 459             file = new File
  (filename + "index.html"); 460             ext = getExtension(file);
 461         }
 462
 463         if (ext.equals("html") || ext.equals("htm") || ext.equals("txt") || ext.equals("pdf")) {
 464             try {
 465                 File
  parent = new File  (file.getParent()); 466
 467                 if (!parent.exists()) {
 468                     parent.mkdirs();
 469                 }
 470
 471                 HttpURLConnection
  httpConnection = (HttpURLConnection  ) url.openConnection(); 472                 java.io.InputStream
  in = httpConnection.getInputStream(); 473
 474         FileOutputStream
  out = new FileOutputStream  (file); 475                 byte[] buffer = new byte[1024];
 476                 int bytesRead = -1;
 477                 while ((bytesRead = in.read(buffer)) >= 0) {
 478                     out.write(buffer, 0, bytesRead);
 479                 }
 480                 out.close();
 481
 482
 497                 in.close();
 498                 httpConnection.disconnect();
 499
 500                 log.info("URL dumped: " + url + " (" + file + ")");
 501             } catch (Exception
  e) { 502                 log.error("" + e);
 503                 log.error("URL not dumped: " + url);
 504             }
 505         } else {
 506             log.info("URL not dumped: " + url);
 507         }
 508     }
 509
 510
 513
 534
 535
 542     public String
  getExtension(URL  url) { 543         return getExtension(new File
  (url.getPath())); 544     }
 545
 546
 553     public String
  getExtension(File  file) { 554         StringTokenizer
  st = new StringTokenizer  (file.getPath(), "."); 555         String
  extension = null; 556
 557         while (st.hasMoreElements()) {
 558             extension = st.nextToken();
 559         }
 560
 561         return extension;
 562     }
 563 }
 564
                                                                                                                                                                                                             |                                                                       
 
 
 
 
 
                                                                                   Popular Tags                                                                                                                                                                                              |