package org.apache.cocoon.components.crawler;

import org.apache.avalon.excalibur.pool.Recyclable;
import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.logger.AbstractLogEnabled;
import org.apache.cocoon.Constants;
import org.apache.commons.lang.StringUtils;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;

/**
 * A simple Cocoon crawler. Starting from a given URL, each page is requested
 * with the configured link-view query appended, the reported links are read
 * (one per line), and they are traversed up to a configurable depth.
 */
public class SimpleCocoonCrawlerImpl extends AbstractLogEnabled
    implements CocoonCrawler, Configurable, Disposable, Recyclable {

    /**
     * Config element name specifying the expected link content-type.
     */
    public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type";

    /**
     * Default value of the <code>link-content-type</code> configuration value.
     */
    public final String LINK_CONTENT_TYPE_DEFAULT = Constants.LINK_CONTENT_TYPE;

    /**
     * Config element name specifying the query string appended to a URL for
     * requesting its links.
     */
    public final static String LINK_VIEW_QUERY_CONFIG = "link-view-query";

    /**
     * Default value of the <code>link-view-query</code> configuration value.
     */
    public final static String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";

    /**
     * Config element name specifying excluding regular expression patterns.
     */
    public final static String EXCLUDE_CONFIG = "exclude";

    /**
     * Config element name specifying including regular expression patterns.
     */
    public final static String INCLUDE_CONFIG = "include";

    /**
     * Config element name specifying the HTTP User-Agent header value.
     */
    public final static String USER_AGENT_CONFIG = "user-agent";

    /**
     * Default value of the <code>user-agent</code> configuration value.
     */
    public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;

    /**
     * Config element name specifying the HTTP Accept header value.
     */
    public final static String ACCEPT_CONFIG = "accept";

    /**
     * Default value of the <code>accept</code> configuration value.
     */
    public final static String ACCEPT_DEFAULT = "*/*";

    private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
    private String linkContentType = LINK_CONTENT_TYPE_DEFAULT;
    private HashSet excludeCrawlingURL;
    private HashSet includeCrawlingURL;
    private String userAgent = USER_AGENT_DEFAULT;
    private String accept = ACCEPT_DEFAULT;

    private int depth;

    private HashSet crawled;
    private HashSet urlsToProcess;
    private HashSet urlsNextDepth;

    /**
     * Constructor for the SimpleCocoonCrawlerImpl object.
     */
    public SimpleCocoonCrawlerImpl() {
        // by default include all URLs
        includeCrawlingURL = null;
        // by default exclude only the default patterns (see configure)
        excludeCrawlingURL = null;
    }

    /**
     * Configure the crawler. The configuration may specify include and exclude
     * patterns, the link content-type, the link-view query, and the User-Agent
     * and Accept headers used when requesting pages.
     *
     * @param configuration XML configuration of this Avalon component.
     * @exception ConfigurationException if the configuration is invalid.
     */
    public void configure(Configuration configuration)
        throws ConfigurationException {

        Configuration[] children;
        children = configuration.getChildren(INCLUDE_CONFIG);
        if (children.length > 0) {
            includeCrawlingURL = new HashSet();
            for (int i = 0; i < children.length; i++) {
                String pattern = children[i].getValue();
                try {
                    String params[] = StringUtils.split(pattern, ", ");
                    for (int index = 0; index < params.length; index++) {
                        String tokenized_pattern = params[index];
                        this.includeCrawlingURL.add(new RE(tokenized_pattern));
                    }
                } catch (RESyntaxException rese) {
                    getLogger().error("Cannot create including regular-expression for " +
                        pattern, rese);
                }
            }
        } else {
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("Include all URLs");
            }
        }

        children = configuration.getChildren(EXCLUDE_CONFIG);
        if (children.length > 0) {
            excludeCrawlingURL = new HashSet();
            for (int i = 0; i < children.length; i++) {
                String pattern = children[i].getValue();
                try {
                    String params[] = StringUtils.split(pattern, ", ");
                    for (int index = 0; index < params.length; index++) {
                        String tokenized_pattern = params[index];
                        this.excludeCrawlingURL.add(new RE(tokenized_pattern));
                    }
                } catch (RESyntaxException rese) {
                    getLogger().error("Cannot create excluding regular-expression for " +
                        pattern, rese);
                }
            }
        } else {
            excludeCrawlingURL = new HashSet();
            setDefaultExcludeFromCrawling();
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("Exclude default URLs only");
            }
        }

        Configuration child;
        String value;
        child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.linkContentType = value.trim();
            }
        }
        child = configuration.getChild(LINK_VIEW_QUERY_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.linkViewQuery = value.trim();
            }
        }

        child = configuration.getChild(USER_AGENT_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.userAgent = value;
            }
        }

        child = configuration.getChild(ACCEPT_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.accept = value;
            }
        }
    }

    /**
     * Dispose this component, releasing all crawling state.
     */
    public void dispose() {
        crawled = null;
        urlsToProcess = null;
        urlsNextDepth = null;
        excludeCrawlingURL = null;
        includeCrawlingURL = null;
    }

    /**
     * Recycle this component so it can be reused from the pool.
     */
    public void recycle() {
        crawled = null;
        urlsToProcess = null;
        urlsNextDepth = null;
        depth = -1;
    }

    /**
     * Start crawling a URL with unlimited depth.
     *
     * @param url crawl this URL, getting all links from it.
     */
    public void crawl(URL url) {
        crawl(url, -1);
    }

    /**
     * Start crawling a URL, following links up to maxDepth levels deep.
     *
     * @param url      crawl this URL, getting all links from it.
     * @param maxDepth maximum depth of links to follow, or -1 for no limit.
     */
    public void crawl(URL url, int maxDepth) {
        crawled = new HashSet();
        urlsToProcess = new HashSet();
        urlsNextDepth = new HashSet();
        depth = maxDepth;

        if (getLogger().isDebugEnabled()) {
            getLogger().debug("crawl URL " + url + " to depth " + maxDepth);
        }

        urlsToProcess.add(url);
    }

    /**
     * Return an iterator over the crawled URLs. URLs are fetched and their
     * links extracted lazily as the iterator is advanced.
     *
     * @return iterator of URL objects.
     */
    public Iterator iterator() {
        return new CocoonCrawlerIterator(this);
    }

    /**
     * Set the default exclude patterns; images, javascript, and css files are
     * not crawled.
     */
    private void setDefaultExcludeFromCrawling() {
        String[] EXCLUDE_FROM_CRAWLING_DEFAULT = {
            ".*\\.gif(\\?.*)?$",
            ".*\\.png(\\?.*)?$",
            ".*\\.jpe?g(\\?.*)?$",
            ".*\\.js(\\?.*)?$",
            ".*\\.css(\\?.*)?$"
        };

        for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) {
            String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i];
            try {
                excludeCrawlingURL.add(new RE(pattern));
            } catch (RESyntaxException rese) {
                getLogger().error("Cannot create excluding regular-expression for " +
                    pattern, rese);
            }
        }
    }

    /**
     * Retrieve the links of a URL by requesting it with the link-view query
     * appended and reading one link per line from the response.
     *
     * @param url get the links of this URL.
     * @return list of URL objects, or null if the URL is excluded, has already
     *         been crawled, or does not have the expected link content-type.
     */
    private List getLinks(URL url) {
        ArrayList url_links = null;
        String sURL = url.toString();

        if (!isIncludedURL(sURL) || isExcludedURL(sURL)) {
            return null;
        }

        // don't try to get the links of a URL twice
        if (crawled.contains(sURL)) {
            return null;
        }

        // mark the URL as crawled
        crawled.add(sURL);

        // get the links of the URL
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("Getting links of URL " + sURL);
        }
        BufferedReader br = null;
        try {
            sURL = url.getFile();
            URL links = new URL(url, sURL
                + ((sURL.indexOf("?") == -1) ? "?" : "&")
                + linkViewQuery);
: "&") 423 + linkViewQuery); 424 URLConnection links_url_connection = links.openConnection(); 425 links_url_connection.setRequestProperty("Accept", accept); 426 links_url_connection.setRequestProperty("User-Agent", userAgent); 427 links_url_connection.connect(); 428 InputStream is = links_url_connection.getInputStream(); 429 br = new BufferedReader (new InputStreamReader (is)); 430 431 String contentType = links_url_connection.getContentType(); 432 if (contentType == null) { 433 if (getLogger().isDebugEnabled()) { 434 getLogger().debug("Ignoring " + sURL + " (no content type)"); 435 } 436 return null; 438 } 439 440 int index = contentType.indexOf(';'); 441 if (index != -1) { 442 contentType = contentType.substring(0, index); 443 } 444 445 if (getLogger().isDebugEnabled()) { 446 getLogger().debug("Content-type: " + contentType); 447 } 448 449 if (contentType.equals(linkContentType)) { 450 url_links = new ArrayList (); 451 452 String line; 455 while ((line = br.readLine()) != null) { 456 final URL newUrl = new URL (url, line); 457 final String sNewUrl = newUrl.toString(); 458 459 boolean add_url = true; 460 if (add_url) { 462 add_url &= !url_links.contains(sNewUrl); 463 } 464 465 if (add_url) { 467 add_url &= !crawled.contains(sNewUrl); 468 } 469 470 if (add_url) { 472 add_url &= isIncludedURL(sNewUrl); 473 } 474 475 if (add_url) { 477 add_url &= !isExcludedURL(sNewUrl); 478 } 479 if (add_url) { 480 if (getLogger().isDebugEnabled()) { 481 getLogger().debug("Add URL: " + sNewUrl); 482 } 483 url_links.add(newUrl); 484 } 485 } 486 } 488 } catch (IOException ioe) { 489 getLogger().warn("Problems get links of " + url, ioe); 490 } finally { 491 if (br != null) { 492 try { 493 br.close(); 494 br = null; 495 } catch (IOException ignored) { 496 } 497 } 498 } 499 return url_links; 500 } 501 502 503 509 private boolean isExcludedURL(String url) { 510 if (excludeCrawlingURL == null) { 512 return false; 513 } 514 515 final String s = url; 516 Iterator i = excludeCrawlingURL.iterator(); 517 while (i.hasNext()) { 518 RE pattern = (RE) i.next(); 519 if (pattern.match(s)) { 520 if (getLogger().isDebugEnabled()) { 521 getLogger().debug("Excluded URL " + url); 522 } 523 return true; 524 } 525 } 526 if (getLogger().isDebugEnabled()) { 527 getLogger().debug("Not excluded URL " + url); 528 } 529 return false; 530 } 531 532 533 539 private boolean isIncludedURL(String url) { 540 if (includeCrawlingURL == null) { 542 return true; 543 } 544 545 final String s = url; 546 Iterator i = includeCrawlingURL.iterator(); 547 while (i.hasNext()) { 548 RE pattern = (RE) i.next(); 549 if (pattern.match(s)) { 550 if (getLogger().isDebugEnabled()) { 551 getLogger().debug("Included URL " + url); 552 } 553 return true; 554 } 555 } 556 if (getLogger().isDebugEnabled()) { 557 getLogger().debug("Not included URL " + url); 558 } 559 return false; 560 } 561 562 563 573 public static class CocoonCrawlerIterator implements Iterator { 574 private SimpleCocoonCrawlerImpl cocoonCrawler; 575 576 577 582 CocoonCrawlerIterator(SimpleCocoonCrawlerImpl cocoonCrawler) { 583 this.cocoonCrawler = cocoonCrawler; 584 } 585 586 587 593 public boolean hasNext() { 594 return cocoonCrawler.urlsToProcess.size() > 0 595 || cocoonCrawler.urlsNextDepth.size() > 0; 596 } 597 598 599 602 public Object next() { 603 if (cocoonCrawler.urlsToProcess.size() == 0 604 && cocoonCrawler.urlsNextDepth.size() > 0) { 605 cocoonCrawler.urlsToProcess = cocoonCrawler.urlsNextDepth; 607 cocoonCrawler.urlsNextDepth = new HashSet (); 608 if (cocoonCrawler.depth > 0) { 612 
                    cocoonCrawler.depth--;
                }
            }
            URL theNextUrl = null;
            // find the next URL whose links could be determined
            for (Iterator i = cocoonCrawler.urlsToProcess.iterator();
                 i.hasNext() && theNextUrl == null;) {
                // fetch a URL
                URL url = (URL) i.next();

                // remove it from the to-do list
                i.remove();

                if (cocoonCrawler.depth == -1 || cocoonCrawler.depth > 0) {
                    // get the links of the URL
                    List url_links = cocoonCrawler.getLinks(url);
                    if (url_links != null) {
                        // add the links to the to-do list of the next depth level
                        cocoonCrawler.urlsNextDepth.addAll(url_links);
                        theNextUrl = url;
                    }
                }
            }
            // return the URL that was crawled
            return theNextUrl;
        }

        /**
         * Removing objects is not supported by this iterator.
         */
        public void remove() {
            throw new UnsupportedOperationException("remove is not implemented");
        }
    }
}
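// Usage sketch: in Cocoon this component is normally obtained and configured
// through Avalon, so how the instance is acquired below is an assumption made
// only for illustration; the start URL and depth are placeholders as well.
//
//     SimpleCocoonCrawlerImpl crawler = ...; // configured / looked-up instance
//     crawler.crawl(new URL("http://localhost:8888/"), 3);
//     for (Iterator it = crawler.iterator(); it.hasNext();) {
//         URL page = (URL) it.next();
//         // each returned URL has been fetched and its links queued
//     }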