1 17 18 19 20 package org.apache.lenya.search.crawler; 21 22 import java.io.File ; 23 import java.io.FileOutputStream ; 24 import java.net.HttpURLConnection ; 25 import java.net.MalformedURLException ; 26 import java.net.URL ; 27 import java.util.StringTokenizer ; 28 29 import websphinx.RobotExclusion; 30 31 import org.apache.log4j.Category; 32 33 34 37 public class IterativeHTMLCrawler { 38 static Category log = Category.getInstance(IterativeHTMLCrawler.class); 39 40 java.util.Vector urlsToCrawl; 41 java.util.TreeSet urlsToCrawlLowerCase; 42 String url_list_file = "url_file.txt"; 43 String html_dump_directory = "html_dump"; 44 private String rootURL; 45 private String [] scopeURL; 46 private RobotExclusion robot; 47 48 53 public static void main(String [] args) { 54 if (args.length == 0) { 55 System.err.println("Usage: IterativeHTMLCrawler crawler.xconf"); 56 57 return; 58 } 59 60 try { 61 if (args.length == 1) { 62 CrawlerConfiguration ce = new CrawlerConfiguration(args[0]); 63 new IterativeHTMLCrawler(new File (args[0])).crawl(new URL (ce.getBaseURL()), ce.getScopeURL()); 64 } else { 65 System.err.println("Usage: IterativeHTMLCrawler crawler.xconf"); 66 } 67 } catch (MalformedURLException e) { 68 log.error("" + e); 69 } 70 } 71 72 79 public IterativeHTMLCrawler(String url_list_file, String html_dump_directory, String userAgent) { 80 this.url_list_file = url_list_file; 81 this.html_dump_directory = html_dump_directory; 82 83 robot = new RobotExclusion(userAgent); 84 } 85 86 90 public IterativeHTMLCrawler(File config) { 91 CrawlerConfiguration ce = new CrawlerConfiguration(config.getAbsolutePath()); 92 93 94 this.url_list_file = ce.getURIListResolved(); 95 log.debug("URI list file: " + this.url_list_file); 96 97 this.html_dump_directory = ce.getHTDocsDumpDirResolved(); 98 log.debug("HTDocs Dump Dir: " + this.html_dump_directory); 99 100 robot = new RobotExclusion(ce.getUserAgent()); 101 102 String robots_file = ce.getRobotsFileResolved(); 103 log.debug("Robots File: " + robots_file); 104 String robots_domain = ce.getRobotsDomain(); 105 if (robots_file != null && robots_domain != null) { 106 log.debug(robots_file + " " + robots_domain); 107 robot.addLocalEntries(robots_domain, new File (robots_file)); 108 } 109 } 110 111 117 public void crawl(URL start, String scope) { 118 scopeURL = new String [1]; 119 scopeURL[0] = scope; 120 121 String seedURL = start.toString(); 122 this.rootURL = seedURL.substring(0, seedURL.indexOf("/", 8)); 123 124 urlsToCrawl = new java.util.Vector (); 125 urlsToCrawlLowerCase = new java.util.TreeSet (); 126 127 String currentURLPath = start.toString().substring(0, start.toString().lastIndexOf("/")); 128 129 try { 130 log.info("Start crawling at: " + start); 131 132 if (addURL(start.getFile(), currentURLPath) != null) { 133 dumpHTDoc(start); 134 } else { 135 log.warn("Start URL has not been dumped: " + start); 136 } 137 } catch (MalformedURLException e) { 138 log.error("" + e); 139 } 140 141 int currentPosition = 0; 142 143 while (currentPosition < urlsToCrawl.size()) { 144 URL currentURL = (URL ) urlsToCrawl.elementAt(currentPosition); 145 currentURLPath = currentURL.toString().substring(0, currentURL.toString().lastIndexOf("/")); 146 147 log.info("INFO: Current Array Size: " + urlsToCrawl.size() + ", Current Position: " + currentPosition + ", Current URL: " + currentURL.toString()); 148 149 150 java.util.List urlsWithinPage = parsePage(currentURL.toString()); 151 152 if (urlsWithinPage != null) { 153 java.util.Iterator iterator = urlsWithinPage.iterator(); 154 155 while (iterator.hasNext()) { 156 String urlCandidate = (String ) iterator.next(); 157 158 try { 159 URL urlToCrawl = null; 160 161 if ((urlToCrawl = addURL(urlCandidate, currentURLPath)) != null) { 162 dumpHTDoc(urlToCrawl); 163 } 164 } catch (MalformedURLException e) { 165 log.warn("" + e + " " + urlCandidate); 166 } 167 } 168 } 169 170 currentPosition = currentPosition + 1; 171 } 172 173 log.info("Stop crawling at: " + urlsToCrawl.elementAt(urlsToCrawl.size()-1)); 174 175 176 177 try { 179 File parent = new File (new File (url_list_file).getParent()); 180 if (!parent.isDirectory()) { 181 parent.mkdirs(); 182 log.warn("Directory has been created: " + parent); 183 } 184 java.io.PrintWriter out = new java.io.PrintWriter (new FileOutputStream (url_list_file)); 185 186 for (int i = 0; i < urlsToCrawl.size(); i++) { 187 out.println("" + urlsToCrawl.elementAt(i)); 188 } 189 190 out.close(); 191 } catch (java.io.FileNotFoundException e) { 192 log.error("" + e); 193 } 194 } 195 196 206 public URL addURL(String urlCandidate, String currentURLPath) 207 throws MalformedURLException { 208 URL url = new URL (parseHREF(urlCandidate, urlCandidate.toLowerCase(), currentURLPath)); 209 211 if (filterURL(urlCandidate, currentURLPath, urlsToCrawlLowerCase)) { 212 if (!robot.disallowed(url)) { 213 if (url.getQuery() == null) { 214 urlsToCrawl.add(url); 215 urlsToCrawlLowerCase.add(url.toString().toLowerCase()); 216 log.debug("URL added: " + url); 217 } else { 218 log.info("Don't crawl URLs with query string: " + url); 219 } 220 221 return url; 222 } else { 223 log.info("Disallowed by robots.txt: " + urlCandidate); 224 } 225 } 226 227 return null; 228 } 229 230 237 public java.util.List parsePage(String urlString) { 238 String status = "ok"; 239 240 try { 241 URL currentURL = new java.net.URL (urlString); 242 String currentURLPath = urlString.substring(0, urlString.lastIndexOf("/")); 243 HttpURLConnection httpCon = (HttpURLConnection ) currentURL.openConnection(); 244 245 httpCon.setRequestProperty("User-Agent", "Lenya Lucene Crawler"); 246 247 httpCon.connect(); 248 249 long lastModified = httpCon.getLastModified(); 250 251 if (httpCon.getResponseCode() == HttpURLConnection.HTTP_OK) { 252 String contentType = httpCon.getContentType(); 253 254 if (contentType.indexOf("text/html") != -1) { 255 return handleHTML(httpCon); 256 } else if (contentType.indexOf("application/pdf") != -1) { 257 handlePDF(httpCon); 258 } else { 259 status = "Not an excepted content type : " + contentType; 260 } 261 } else { 262 status = "bad"; 263 } 264 265 httpCon.disconnect(); 266 } catch (java.net.MalformedURLException mue) { 267 status = mue.toString(); 268 } catch (java.net.UnknownHostException uh) { 269 status = uh.toString(); } catch (java.io.IOException ioe) { 271 status = ioe.toString(); } catch (Exception e) { 273 status = e.toString(); } 275 276 return null; 278 } 279 280 289 public static java.util.List handleHTML(HttpURLConnection httpCon) 290 throws java.io.IOException { 291 ContentHandler handler = new HTMLHandler(); 292 handler.parse(httpCon.getInputStream()); 293 294 if (handler.getRobotFollow()) { 295 java.util.List links = handler.getLinks(); 296 297 return links; 298 } 299 300 return null; 301 } 302 303 308 public void handlePDF(HttpURLConnection httpCon) { 309 log.debug(".handlePDF(): Not handled yet!"); 310 } 311 312 321 public boolean filterURL(String url, String currentURLPath, java.util.TreeSet links) { 322 String urlLowCase = url.toLowerCase(); 323 324 if (!(urlLowCase.startsWith("http://") || urlLowCase.startsWith("https://"))) { 325 url = parseHREF(url, urlLowCase, currentURLPath); 326 327 if (url != null) { 328 urlLowCase = url.toLowerCase(); 329 } 330 } 331 332 if ((url != null) && inScope(url)) { 333 if (!links.contains(urlLowCase)) { 334 return true; 335 } 336 } else { 337 log.debug("Not in scope: " + url); 338 } 339 340 return false; 341 } 342 343 352 public String parseHREF(String url, String urlLowCase, String currentURLPath) { 353 if (urlLowCase.startsWith("http://") || urlLowCase.startsWith("https://")) { 354 return url; 355 } 356 357 if (urlLowCase.startsWith("/")) { 359 url = rootURL + url; 360 } else if (urlLowCase.startsWith("./")) { 361 url = currentURLPath + url.substring(1, url.length()); 362 } else if (urlLowCase.startsWith("../")) { 363 int back = 1; 364 365 while (urlLowCase.indexOf("../", back * 3) != -1) 367 back++; 368 369 int pos = currentURLPath.length(); 370 int count = back; 371 372 while (count-- > 0) { 373 pos = currentURLPath.lastIndexOf("/", pos) - 1; 374 } 375 376 String dotsRemoved = url.substring(3 * back, url.length()); 377 if (dotsRemoved.length() > 0 && dotsRemoved.charAt(0) == '.') { 378 log.error("Parsing failed: " + url + " (" + currentURLPath + ")"); 379 url = null; 380 } else { 381 url = currentURLPath.substring(0, pos + 2) + dotsRemoved; 382 } 383 } else if (urlLowCase.startsWith("javascript:")) { 384 log.debug("\"javascript:\" is not implemented yet!"); 386 url = null; 387 } else if (urlLowCase.startsWith("#")) { 388 log.debug("\"#\" (anchor) will be ignored!"); 389 390 url = null; 392 } else if (urlLowCase.startsWith("mailto:")) { 393 log.debug("\"mailto:\" is not a URL to be followed!"); 394 395 url = null; 397 } else { 398 url = currentURLPath + "/" + url; 399 } 400 401 if (url != null) { 405 int i; 406 407 if ((i = url.indexOf("#")) != -1) { 408 url = url.substring(0, i); 409 } 410 } 411 412 413 return url; 414 } 415 416 423 public boolean inScope(String url) { 424 for (int i = 0; i < scopeURL.length; i++) { 425 if (url.startsWith(scopeURL[i])) { 426 return true; 427 } 428 } 429 430 return false; 431 } 432 433 443 public URL completeURL(URL parent, String child) throws MalformedURLException { 444 return parent; 445 } 446 447 452 public void dumpHTDoc(URL url) { 453 String ext = getExtension(url); 454 455 String filename = html_dump_directory + url.getFile(); 456 File file = new File (filename); 457 458 if (filename.charAt(filename.length() - 1) == '/') { 459 file = new File (filename + "index.html"); 460 ext = getExtension(file); 461 } 462 463 if (ext.equals("html") || ext.equals("htm") || ext.equals("txt") || ext.equals("pdf")) { 464 try { 465 File parent = new File (file.getParent()); 466 467 if (!parent.exists()) { 468 parent.mkdirs(); 469 } 470 471 HttpURLConnection httpConnection = (HttpURLConnection ) url.openConnection(); 472 java.io.InputStream in = httpConnection.getInputStream(); 473 474 FileOutputStream out = new FileOutputStream (file); 475 byte[] buffer = new byte[1024]; 476 int bytesRead = -1; 477 while ((bytesRead = in.read(buffer)) >= 0) { 478 out.write(buffer, 0, bytesRead); 479 } 480 out.close(); 481 482 497 in.close(); 498 httpConnection.disconnect(); 499 500 log.info("URL dumped: " + url + " (" + file + ")"); 501 } catch (Exception e) { 502 log.error("" + e); 503 log.error("URL not dumped: " + url); 504 } 505 } else { 506 log.info("URL not dumped: " + url); 507 } 508 } 509 510 513 534 535 542 public String getExtension(URL url) { 543 return getExtension(new File (url.getPath())); 544 } 545 546 553 public String getExtension(File file) { 554 StringTokenizer st = new StringTokenizer (file.getPath(), "."); 555 String extension = null; 556 557 while (st.hasMoreElements()) { 558 extension = st.nextToken(); 559 } 560 561 return extension; 562 } 563 } 564 | Popular Tags |