1 16 package org.apache.cocoon.generation; 17 18 import org.apache.avalon.excalibur.pool.Recyclable; 19 import org.apache.avalon.framework.parameters.Parameters; 20 import org.apache.avalon.framework.configuration.Configurable; 21 import org.apache.avalon.framework.configuration.Configuration; 22 import org.apache.avalon.framework.configuration.ConfigurationException; 23 import org.apache.cocoon.ProcessingException; 24 import org.apache.cocoon.ResourceNotFoundException; 25 import org.apache.cocoon.environment.SourceResolver; 26 import org.apache.cocoon.Constants; 27 import org.apache.commons.lang.StringUtils; 28 import org.apache.regexp.RE; 29 import org.apache.regexp.RESyntaxException; 30 31 import org.xml.sax.SAXException ; 32 import org.xml.sax.helpers.AttributesImpl ; 33 34 import java.io.IOException ; 35 import java.io.InputStream ; 36 import java.io.BufferedReader ; 37 import java.io.InputStreamReader ; 38 import java.net.URLConnection ; 39 import java.net.HttpURLConnection ; 40 import java.net.URL ; 41 import java.util.Map ; 42 import java.util.HashSet ; 43 import java.util.Iterator ; 44 import java.util.List ; 45 import java.util.ArrayList ; 46 47 60 public class LinkStatusGenerator extends ServiceableGenerator 61 implements Recyclable, Configurable { 62 63 64 protected static final String URI = 65 "http://apache.org/cocoon/linkstatus/2.0"; 66 67 68 protected static final String PREFIX = "linkstatus"; 69 70 71 protected static final String TOP_NODE_NAME = "linkstatus"; 72 protected static final String LINK_NODE_NAME = "link"; 73 74 protected static final String HREF_ATTR_NAME = "href"; 75 protected static final String REFERRER_ATTR_NAME = "referrer"; 76 protected static final String CONTENT_ATTR_NAME = "content"; 77 protected static final String STATUS_ATTR_NAME = "status"; 78 protected static final String MESSAGE_ATTR_NAME = "message"; 79 80 protected AttributesImpl attributes; 81 82 90 public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type"; 91 92 100 public final String LINK_CONTENT_TYPE_DEFAULT = "application/x-cocoon-links"; 101 102 111 public final static String LINK_VIEW_QUERY_CONFIG = "link-view-query"; 112 120 public final static String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links"; 121 122 130 public final static String EXCLUDE_CONFIG = "exclude"; 131 132 140 public final static String INCLUDE_CONFIG = "include"; 141 142 150 public final static String USER_AGENT_CONFIG = "user-agent"; 151 157 public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME; 158 159 167 public final static String ACCEPT_CONFIG = "accept"; 168 176 public final static String ACCEPT_DEFAULT = "*/*"; 177 178 private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT; 179 private String linkContentType = LINK_CONTENT_TYPE_DEFAULT; 180 private HashSet excludeCrawlingURL; 181 private HashSet includeCrawlingURL; 182 186 private HashSet crawled; 187 private HashSet linksToProcess; 188 189 192 private static class Link { 193 private URL url; 194 private String referrer; 195 196 public Link(URL url, String referrer) { 197 this.url = url; 198 this.referrer = referrer; 199 } 200 201 public URL getURL() { 202 return url; 203 } 204 205 public String getReferrer() { 206 return referrer; 207 } 208 209 public boolean equals(Link l) { 210 return url.equals(l.getURL()); 211 } 212 } 213 214 238 public void configure(Configuration configuration) 239 throws ConfigurationException { 240 241 Configuration[] children; 242 children = configuration.getChildren(INCLUDE_CONFIG); 243 if (children.length > 0) { 244 includeCrawlingURL = new HashSet (); 245 for (int i = 0; i < children.length; i++) { 246 String pattern = children[i].getValue(); 247 try { 248 String params[] = StringUtils.split(pattern, ", "); 249 for (int index = 0; index < params.length; index++) { 250 String tokenized_pattern = params[index]; 251 this.includeCrawlingURL.add(new RE(tokenized_pattern)); 252 } 253 } catch (RESyntaxException rese) { 254 getLogger().error("Cannot create including regular-expression for " + 255 pattern, rese); 256 } 257 } 258 } 259 260 children = configuration.getChildren(EXCLUDE_CONFIG); 261 if (children.length > 0) { 262 excludeCrawlingURL = new HashSet (); 263 for (int i = 0; i < children.length; i++) { 264 String pattern = children[i].getValue(); 265 try { 266 String params[] = StringUtils.split(pattern, ", "); 267 for (int index = 0; index < params.length; index++) { 268 String tokenized_pattern = params[index]; 269 this.excludeCrawlingURL.add(new RE(tokenized_pattern)); 270 } 271 } catch (RESyntaxException rese) { 272 getLogger().error("Cannot create excluding regular-expression for " + 273 pattern, rese); 274 } 275 } 276 } else { 277 excludeCrawlingURL = new HashSet (); 278 setDefaultExcludeFromCrawling(); 279 } 280 281 Configuration child; 282 String value; 283 child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false); 284 if (child != null) { 285 value = child.getValue(); 286 if (value != null && value.length() > 0) { 287 this.linkContentType = value.trim(); 288 } 289 } 290 child = configuration.getChild(LINK_VIEW_QUERY_CONFIG, false); 291 if (child != null) { 292 value = child.getValue(); 293 if (value != null && value.length() > 0) { 294 this.linkViewQuery = value.trim(); 295 } 296 } 297 314 } 315 316 public void setup(SourceResolver resolver, Map objectModel, String src, Parameters par) 317 throws ProcessingException, SAXException , IOException { 318 319 super.setup(resolver, objectModel, src, par); 320 321 322 this.attributes = new AttributesImpl (); 323 324 } 328 329 337 public void generate() 338 throws SAXException , ProcessingException { 339 try { 340 341 crawled = new HashSet (); 342 linksToProcess = new HashSet (); 343 344 URL root = new URL (source); 345 linksToProcess.add(new Link(root, "")); 346 347 348 if (getLogger().isDebugEnabled()) { 349 getLogger().debug("crawl URL " + root); 350 } 351 352 this.contentHandler.startDocument(); 353 this.contentHandler.startPrefixMapping(PREFIX, URI); 354 355 attributes.clear(); 356 super.contentHandler.startElement(URI, TOP_NODE_NAME, PREFIX + ':' + TOP_NODE_NAME, attributes); 357 358 while (linksToProcess.size() > 0) { 359 Iterator i = linksToProcess.iterator(); 360 361 if (i.hasNext()) { 362 Link link = (Link) i.next(); 364 URL url = link.getURL(); 365 366 linksToProcess.remove(link); 368 369 String new_url_link = processURL(url, link.getReferrer()); 370 371 if (new_url_link != null) { 373 374 List url_links = getLinksFromConnection(new_url_link, url); 375 if (url_links != null) { 376 linksToProcess.addAll(url_links); 378 } 379 } 380 } 381 } 382 383 super.contentHandler.endElement(URI, TOP_NODE_NAME, PREFIX + ':' + TOP_NODE_NAME); 384 this.contentHandler.endPrefixMapping(PREFIX); 385 this.contentHandler.endDocument(); 386 } catch (IOException ioe) { 387 getLogger().warn("Could not read source ", ioe); 388 throw new ResourceNotFoundException("Could not read source ", ioe); 389 } 390 } 391 392 407 private void setDefaultExcludeFromCrawling() { 408 String [] EXCLUDE_FROM_CRAWLING_DEFAULT = { 409 ".*\\.gif(\\?.*)?$", 410 ".*\\.png(\\?.*)?$", 411 ".*\\.jpe?g(\\?.*)?$", 412 ".*\\.js(\\?.*)?$", 413 ".*\\.css(\\?.*)?$" 414 }; 415 416 for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) { 417 String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i]; 418 try { 419 excludeCrawlingURL.add(new RE(pattern)); 420 } catch (RESyntaxException rese) { 421 getLogger().error("Cannot create excluding regular-expression for " + 422 pattern, rese); 423 } 424 } 425 } 426 427 428 439 protected List getLinksFromConnection(String url_link_string, URL url_of_referrer) { 440 List url_links = null; 441 BufferedReader br = null; 442 try { 443 URL url_link = new URL (url_link_string); 444 URLConnection conn = url_link.openConnection(); 445 String content_type = conn.getContentType(); 446 447 if (content_type == null) { 448 getLogger().warn("No content type available for " + String.valueOf(url_link_string)); 449 return url_links; 451 } 452 453 if (getLogger().isDebugEnabled()) { 454 getLogger().debug("Content-type: " + content_type); 455 } 456 457 if (content_type.equals(linkContentType) || 458 content_type.startsWith(linkContentType + ";")) { 459 url_links = new ArrayList (); 460 461 InputStream is = conn.getInputStream(); 462 br = new BufferedReader (new InputStreamReader (is)); 463 464 String line; 467 String referrer = url_of_referrer.toString(); 468 469 while ((line = br.readLine()) != null) { 470 URL new_url = new URL (url_link, line); 471 boolean add_url = true; 472 if (add_url) { 474 add_url &= !url_links.contains(new_url); 475 } 476 477 if (add_url) { 479 add_url &= !crawled.contains(new_url.toString()); 480 } 481 482 Link new_link = new Link(new_url, referrer); 483 if (add_url) { 484 add_url &= !linksToProcess.contains(new_link); 485 } 486 487 if (add_url) { 489 add_url &= isIncludedURL(new_url.toString()); 490 } 491 492 if (add_url) { 493 if (getLogger().isDebugEnabled()) { 494 getLogger().debug("Add URL: " + new_url.toString()); 495 } 496 url_links.add(new_link); 497 } 498 } 499 } 501 } catch (IOException ioe) { 502 getLogger().warn("Problems get links of " + url_link_string, ioe); 503 } finally { 504 if (br != null) { 506 try { 507 br.close(); 508 br = null; 509 } catch (IOException ignored) { 510 } 511 } 512 } 513 return url_links; 514 } 515 516 524 protected String processURL(URL url, String referrer) throws SAXException { 525 526 if (getLogger().isDebugEnabled()) { 527 getLogger().debug("getLinks URL " + url); 528 } 529 530 String result = null; 531 532 if (crawled.contains(url.toString())) { 534 return null; 535 } 536 537 crawled.add(url.toString()); 539 540 attributes.clear(); 541 attributes.addAttribute("", HREF_ATTR_NAME, 542 HREF_ATTR_NAME, "CDATA", url.toString()); 543 attributes.addAttribute("", REFERRER_ATTR_NAME, 544 REFERRER_ATTR_NAME, "CDATA", referrer); 545 546 HttpURLConnection h = null; 548 try { 549 550 URLConnection links_url_connection = url.openConnection(); 551 h = (HttpURLConnection ) links_url_connection; 552 String content_type = links_url_connection.getContentType(); 553 554 attributes.addAttribute("", CONTENT_ATTR_NAME, 555 CONTENT_ATTR_NAME, "CDATA", 556 content_type); 557 558 attributes.addAttribute("", MESSAGE_ATTR_NAME, 559 MESSAGE_ATTR_NAME, "CDATA", 560 h.getResponseMessage()); 561 562 attributes.addAttribute("", STATUS_ATTR_NAME, 563 STATUS_ATTR_NAME, "CDATA", 564 String.valueOf(h.getResponseCode())); 565 } catch (IOException ioe) { 566 attributes.addAttribute("", MESSAGE_ATTR_NAME, 567 MESSAGE_ATTR_NAME, "CDATA", 568 ioe.getMessage()); 569 } finally { 570 if (h != null) { 571 h.disconnect(); 572 } 573 } 574 575 if (!isExcludedURL(url.toString()) && isIncludedURL(url.toString())) { 578 result = url.toExternalForm() 580 + ((url.toExternalForm().indexOf("?") == -1) ? "?" : "&") 581 + linkViewQuery; 582 } 583 584 super.contentHandler.startElement(URI, LINK_NODE_NAME, PREFIX + ':' + LINK_NODE_NAME, attributes); 585 super.contentHandler.endElement(URI, LINK_NODE_NAME, PREFIX + ':' + LINK_NODE_NAME); 586 587 return result; 588 } 589 590 597 private boolean isExcludedURL(String url) { 598 if (excludeCrawlingURL == null) { 600 if (getLogger().isDebugEnabled()) { 601 getLogger().debug("exclude no URL " + url); 602 } 603 return false; 604 } 605 606 final String s = url; 607 Iterator i = excludeCrawlingURL.iterator(); 608 while (i.hasNext()) { 609 RE pattern = (RE) i.next(); 610 if (pattern.match(s)) { 611 if (getLogger().isDebugEnabled()) { 612 getLogger().debug("exclude URL " + url); 613 } 614 return true; 615 } 616 } 617 if (getLogger().isDebugEnabled()) { 618 getLogger().debug("exclude not URL " + url); 619 } 620 return false; 621 } 622 623 624 631 private boolean isIncludedURL(String url) { 632 if (includeCrawlingURL == null) { 634 if (getLogger().isDebugEnabled()) { 635 getLogger().debug("include all URL " + url); 636 } 637 return true; 638 } 639 640 final String s = url; 641 Iterator i = includeCrawlingURL.iterator(); 642 while (i.hasNext()) { 643 RE pattern = (RE) i.next(); 644 if (pattern.match(s)) { 645 if (getLogger().isDebugEnabled()) { 646 getLogger().debug("include URL " + url); 647 } 648 return true; 649 } 650 } 651 if (getLogger().isDebugEnabled()) { 652 getLogger().debug("include not URL " + url); 653 } 654 return false; 655 } 656 657 public void recycle() { 658 super.recycle(); 659 660 this.attributes = null; 661 } 662 } 663 | Popular Tags |