package org.archive.crawler.extractor;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.RobotsHonoringPolicy;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.io.ReplayCharSequence;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.DevUtils;
import org.archive.util.HttpRecorder;
import org.archive.util.TextUtils;

/**
 * Regular-expression based link extractor for HTML content bodies.
 *
 * <p>Tags are located with {@link #RELEVANT_TAG_EXTRACTOR}; the attributes of
 * each tag are then picked apart with {@link #EACH_ATTRIBUTE_EXTRACTOR} and
 * handed to {@code processLink}, {@code processEmbed},
 * {@code processScriptCode} or the CSS extractor depending on the attribute
 * name. Two counters (handled URIs and extracted links) feed
 * {@link #report()}.
 */
public class ExtractorHTML extends Extractor
implements CoreAttributeConstants {

    private static final long serialVersionUID = 5855731422080471017L;

    private static final Logger logger =
        Logger.getLogger(ExtractorHTML.class.getName());

    /**
     * Upper bound on the length of an element name matched by
     * {@link #RELEVANT_TAG_EXTRACTOR}. Read once from the system property
     * {@code <this class name>.maxElementNameLength}; defaults to 1024.
     */
    private static final int MAX_ELEMENT_LENGTH =
        Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
            ".maxElementNameLength", "1024"));

    /**
     * Pattern that finds the tags of interest. Capturing groups:
     * <pre>
     *  1: SCRIPT open tag through the start of the close tag
     *  2: just the SCRIPT open tag
     *  3: STYLE open tag through the start of the close tag
     *  4: just the STYLE open tag
     *  5: any other tag: element name plus attributes
     *  6: element name of group 5
     *  7: set only when the element name is META
     *  8: an HTML comment ("!-- ... --")
     * </pre>
     */
    static final String RELEVANT_TAG_EXTRACTOR =
        "(?is)<(?:((script[^>]*+)>.*?</script)"
        + "|((style[^>]*+)>[^<]*+</style)"
        + "|(((meta)|(?:\\w{1,"+MAX_ELEMENT_LENGTH+"}))\\s+[^>]*+)"
        + "|(!--.*?--))>";

    /**
     * Upper bound on the length of an attribute name matched by
     * {@link #EACH_ATTRIBUTE_EXTRACTOR}. Read once from the system property
     * {@code <this class name>.maxAttributeNameLength}; defaults to 1024.
     */
    private static final int MAX_ATTR_NAME_LENGTH =
        Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
            ".maxAttributeNameLength", "1024"));

    /**
     * Upper bound on the length of an attribute value matched by
     * {@link #EACH_ATTRIBUTE_EXTRACTOR}. Read once from the system property
     * {@code <this class name>.maxAttributeValueLength}; defaults to 16384.
     */
    static final int MAX_ATTR_VAL_LENGTH =
        Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
            ".maxAttributeValueLength", "16384"));

    /**
     * Pattern that matches one {@code name=value} attribute at a time.
     * Capturing groups:
     * <pre>
     *  1: the attribute name, whichever alternative matched
     *  2: HREF
     *  3: ACTION
     *  4: ON[WHATEVER] (event handlers)
     *  5: SRC, LOWSRC, BACKGROUND, CITE, LONGDESC, USEMAP, PROFILE, DATASRC
     *  6: CODEBASE
     *  7: CLASSID, DATA
     *  8: ARCHIVE
     *  9: CODE
     * 10: VALUE
     * 11: STYLE
     * 12: any other attribute name
     * 13: value in double quotes
     * 14: value in single quotes
     * 15: unquoted value
     * </pre>
     */
    static final String EACH_ATTRIBUTE_EXTRACTOR =
        "(?is)\\s((href)|(action)|(on\\w*)"
        +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)"
        +"|(?:usemap)|(?:profile)|(?:datasrc))"
        +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)"
        +"|(value)|(style)|([-\\w]{1,"+MAX_ATTR_NAME_LENGTH+"}))"
        +"\\s*=\\s*"
        +"(?:(?:\"(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:\"|$))"
        +"|(?:'(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:'|$))"
        +"|(\\S{1,"+MAX_ATTR_VAL_LENGTH+"}))";

    /** Pattern a VALUE attribute must fully match to be treated as a URI. */
    static final String LIKELY_URI_PATH =
        "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";
    static final String WHITESPACE = "\\s";
    static final String CLASSEXT =".class";
    static final String APPLET = "applet";
    static final String BASE = "base";
    static final String LINK = "link";
    static final String FRAME = "frame";
    static final String IFRAME = "iframe";

    /** Setting: treat FRAME/IFRAME SRC links as embeds rather than nav links. */
    public static final String ATTR_TREAT_FRAMES_AS_EMBED_LINKS =
        "treat-frames-as-embed-links";

    /** Setting: ignore URIs found in FORM ACTION attributes. */
    public static final String ATTR_IGNORE_FORM_ACTION_URLS =
        "ignore-form-action-urls";

    /** Setting: scan in-page Javascript for likely URIs. */
    public static final String ATTR_EXTRACT_JAVASCRIPT =
        "extract-javascript";

    /** Setting: also extract URI-like strings from VALUE attributes. */
    public static final String ATTR_OVERLY_EAGER_LINK_DETECTION =
        "overly-eager-link-detection";

    /** Setting: skip URIs whose path ends in a typical non-HTML extension. */
    public static final String ATTR_IGNORE_UNEXPECTED_HTML =
        "ignore-unexpected-html";

    /** Number of CrawlURIs whose content was scanned by this extractor. */
    protected long numberOfCURIsHandled = 0;
    /** Number of links and embeds handed on by this extractor. */
    protected long numberOfLinksExtracted = 0;

    /**
     * Creates the extractor with the default description.
     *
     * @param name processor name
     */
    public ExtractorHTML(String name) {
        this(name, "HTML extractor. Extracts links from HTML documents");
    }

    /**
     * Creates the extractor and registers its five expert settings.
     *
     * @param name processor name
     * @param description processor description
     */
    public ExtractorHTML(String name, String description) {
        super(name, description);
        Type t = addElementToDefinition(
            new SimpleType(ATTR_EXTRACT_JAVASCRIPT,
            "If true, in-page Javascript is scanned for strings that " +
            "appear likely to be URIs. This typically finds both valid " +
            "and invalid URIs, and attempts to fetch the invalid URIs " +
            "sometimes generates webmaster concerns over odd crawler " +
            "behavior. Default is true.",
            Boolean.TRUE));
        t.setExpertSetting(true);
        t = addElementToDefinition(
            new SimpleType(ATTR_TREAT_FRAMES_AS_EMBED_LINKS,
            "If true, FRAME/IFRAME SRC-links are treated as embedded " +
            "resources (like IMG, 'E' hop-type), otherwise they are " +
            "treated as navigational links. Default is true.", Boolean.TRUE));
        t.setExpertSetting(true);
        t = addElementToDefinition(
            new SimpleType(ATTR_IGNORE_FORM_ACTION_URLS,
            "If true, URIs appearing as the ACTION attribute in " +
            "HTML FORMs are ignored. Default is false.", Boolean.FALSE));
        t.setExpertSetting(true);
        t = addElementToDefinition(
            new SimpleType(ATTR_OVERLY_EAGER_LINK_DETECTION,
            "If true, strings that look like URIs found in unusual " +
            "places (such as form VALUE attributes) will be extracted. " +
            "This typically finds both valid and invalid URIs, and " +
            "attempts to fetch the invalid URIs sometimes generate " +
            "webmaster concerns over odd crawler behavior. Default " +
            "is true.",
            Boolean.TRUE));
        t.setExpertSetting(true);
        t = addElementToDefinition(
            new SimpleType(ATTR_IGNORE_UNEXPECTED_HTML,
            "If true, URIs which end in typical non-HTML extensions " +
            "(such as .gif) will not be scanned as if it were HTML. " +
            "Default is true.", Boolean.TRUE));
        t.setExpertSetting(true);
    }

    /**
     * Walks every attribute of one tag and dispatches it by attribute name.
     *
     * <p>CLASSID/DATA, ARCHIVE and CODE values are collected and only turned
     * into embeds after all attributes have been seen, so that a CODEBASE
     * attribute appearing anywhere in the tag can be used to resolve them.
     *
     * @param curi CrawlURI being processed
     * @param element element name of the tag
     * @param cs the tag text (element name plus attributes) to scan
     */
    protected void processGeneralTag(CrawlURI curi, CharSequence element,
            CharSequence cs) {
        final boolean framesAsEmbeds = ((Boolean)getUncheckedAttribute(curi,
            ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();
        final boolean ignoreFormActions = ((Boolean)getUncheckedAttribute(curi,
            ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
        final boolean overlyEagerLinkDetection = ((Boolean)getUncheckedAttribute(
            curi, ATTR_OVERLY_EAGER_LINK_DETECTION)).booleanValue();
        final String elementStr = element.toString();

        // Just in case it's an OBJECT or APPLET tag.
        String codebase = null;
        List<String> resources = null;

        Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
        try {
            while (attr.find()) {
                // Exactly one of groups 13/14/15 holds the attribute value.
                final int valueGroup =
                    (attr.start(13) > -1) ? 13 : (attr.start(14) > -1) ? 14 : 15;
                final int start = attr.start(valueGroup);
                final int end = attr.end(valueGroup);
                assert start >= 0: "Start is: " + start + ", " + curi;
                assert end >= 0: "End is :" + end + ", " + curi;
                final CharSequence value =
                    TextUtils.unescapeHtml(cs.subSequence(start, end));

                if (attr.start(2) > -1) {
                    // HREF
                    CharSequence context =
                        Link.elementContext(element, attr.group(2));
                    if (elementStr.equalsIgnoreCase(LINK)) {
                        // <LINK> elements are treated as embeds.
                        processEmbed(curi, value, context);
                    } else {
                        // Other HREFs are navigational links.
                        processLink(curi, value, context);
                    }
                    if (elementStr.equalsIgnoreCase(BASE)) {
                        try {
                            curi.setBaseURI(value.toString());
                        } catch (URIException e) {
                            if (getController() != null) {
                                getController().logUriError(e, curi.getUURI(),
                                    value.toString());
                            } else {
                                logger.info("Failed set base uri: " +
                                    curi + ", " + value.toString() + ": " +
                                    e.getMessage());
                            }
                        }
                    }
                } else if (attr.start(3) > -1) {
                    // ACTION
                    if (!ignoreFormActions) {
                        CharSequence context = Link.elementContext(element,
                            attr.group(3));
                        processLink(curi, value, context);
                    }
                } else if (attr.start(4) > -1) {
                    // ON____ event handler: scan as script code.
                    processScriptCode(curi, value);
                } else if (attr.start(5) > -1) {
                    // SRC and friends.
                    CharSequence context = Link.elementContext(element,
                        attr.group(5));
                    final boolean isFrame = elementStr.equalsIgnoreCase(FRAME)
                        || elementStr.equalsIgnoreCase(IFRAME);
                    final char hopType = (!framesAsEmbeds && isFrame)
                        ? Link.NAVLINK_HOP
                        : Link.EMBED_HOP;
                    processEmbed(curi, value, context, hopType);
                } else if (attr.start(6) > -1) {
                    // CODEBASE: an embed itself, and the base for resources.
                    codebase = value.toString();
                    CharSequence context = Link.elementContext(element,
                        attr.group(6));
                    processEmbed(curi, codebase, context);
                } else if (attr.start(7) > -1) {
                    // CLASSID, DATA
                    if (resources == null) {
                        resources = new ArrayList<String>();
                    }
                    resources.add(value.toString());
                } else if (attr.start(8) > -1) {
                    // ARCHIVE: possibly several whitespace-separated entries.
                    if (resources == null) {
                        resources = new ArrayList<String>();
                    }
                    String[] multi = TextUtils.split(WHITESPACE, value);
                    for (int i = 0; i < multi.length; i++) {
                        resources.add(multi[i]);
                    }
                } else if (attr.start(9) > -1) {
                    // CODE
                    if (resources == null) {
                        resources = new ArrayList<String>();
                    }
                    // An APPLET's CODE may omit the ".class" suffix; add it.
                    if (elementStr.equalsIgnoreCase(APPLET) &&
                            !value.toString().toLowerCase().endsWith(CLASSEXT)) {
                        resources.add(value.toString() + CLASSEXT);
                    } else {
                        resources.add(value.toString());
                    }
                } else if (attr.start(10) > -1) {
                    // VALUE: only followed when it looks like a URI path and
                    // eager detection is enabled.
                    if (overlyEagerLinkDetection
                            && TextUtils.matches(LIKELY_URI_PATH, value)) {
                        CharSequence context = Link.elementContext(element,
                            attr.group(10));
                        processLink(curi, value, context);
                    }
                } else if (attr.start(11) > -1) {
                    // STYLE: inline CSS goes to the CSS extractor.
                    this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
                        curi, value, getController());
                } else if (attr.start(12) > -1) {
                    // Any other attribute: nothing to extract.
                }
            }
        } finally {
            TextUtils.recycleMatcher(attr);
        }

        if (resources != null) {
            processResources(curi, element, codebase, resources);
        }
    }

    /**
     * Turns collected CLASSID/DATA/ARCHIVE/CODE values into embeds, resolving
     * each against the tag's CODEBASE when one was given.
     *
     * @param curi CrawlURI being processed
     * @param element element name, used as the link context
     * @param codebase CODEBASE attribute value, or null if the tag had none
     * @param resources attribute values to add as embeds
     */
    private void processResources(CrawlURI curi, CharSequence element,
            String codebase, List<String> resources) {
        String res = null;
        try {
            UURI codebaseURI = null;
            if (codebase != null) {
                codebaseURI =
                    UURIFactory.getInstance(curi.getUURI(), codebase);
            }
            for (String resource : resources) {
                res = resource;
                res = TextUtils.unescapeHtml(res).toString();
                if (codebaseURI != null) {
                    res = codebaseURI.resolve(res).toString();
                }
                processEmbed(curi, res, element);
            }
        } catch (URIException e) {
            curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
        } catch (IllegalArgumentException e) {
            DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +
                "codebase=" + codebase + " res=" + res + "\n" +
                DevUtils.extraInfo(), e);
        }
    }

    /**
     * Scans script code for likely URIs, if the
     * {@link #ATTR_EXTRACT_JAVASCRIPT} setting is enabled.
     *
     * @param curi CrawlURI being processed
     * @param cs script code to scan
     */
    protected void processScriptCode(CrawlURI curi, CharSequence cs) {
        final boolean extractJavascript = ((Boolean)getUncheckedAttribute(curi,
            ATTR_EXTRACT_JAVASCRIPT)).booleanValue();
        if (extractJavascript) {
            this.numberOfLinksExtracted +=
                ExtractorJS.considerStrings(curi, cs, getController(), false);
        }
    }

    /** Pattern matching values that are {@code javascript:} pseudo-URIs. */
    static final String JAVASCRIPT = "(?i)^javascript:.*";

    /**
     * Handles a navigational link value. A {@code javascript:} value is
     * scanned as script code instead of being added as a link.
     *
     * @param curi CrawlURI being processed
     * @param value attribute value holding the link
     * @param context context the link was found in
     */
    protected void processLink(CrawlURI curi, final CharSequence value,
            CharSequence context) {
        if (TextUtils.matches(JAVASCRIPT, value)) {
            // Skip the 11 characters of the "javascript:" prefix.
            processScriptCode(curi, value.subSequence(11, value.length()));
            return;
        }
        if (logger.isLoggable(Level.FINEST)) {
            logger.finest("link: " + value.toString() + " from " + curi);
        }
        addLinkFromString(curi, value.toString(), context, Link.NAVLINK_HOP);
        this.numberOfLinksExtracted++;
    }

    /**
     * Adds a link relative to the CrawlURI's base. A URIException is reported
     * to the controller when there is one, otherwise logged at INFO.
     *
     * @param curi CrawlURI to add the link to
     * @param uri link text
     * @param context context the link was found in
     * @param hopType hop type to record for the link
     */
    private void addLinkFromString(CrawlURI curi, String uri,
            CharSequence context, char hopType) {
        try {
            curi.createAndAddLinkRelativeToBase(uri, context.toString(),
                hopType);
        } catch (URIException e) {
            if (getController() != null) {
                getController().logUriError(e, curi.getUURI(), uri);
            } else {
                logger.info("Failed createAndAddLinkRelativeToBase " +
                    curi + ", " + uri + ", " + context + ", " + hopType +
                    ": " + e);
            }
        }
    }

    /**
     * Adds an embed with the default {@code Link.EMBED_HOP} hop type.
     *
     * @param curi CrawlURI being processed
     * @param value attribute value holding the embed URI
     * @param context context the embed was found in
     */
    protected final void processEmbed(CrawlURI curi, CharSequence value,
            CharSequence context) {
        processEmbed(curi, value, context, Link.EMBED_HOP);
    }

    /**
     * Adds an embed with an explicit hop type and counts it as extracted.
     *
     * @param curi CrawlURI being processed
     * @param value attribute value holding the embed URI
     * @param context context the embed was found in
     * @param hopType hop type to record for the link
     */
    protected void processEmbed(CrawlURI curi, final CharSequence value,
            CharSequence context, char hopType) {
        if (logger.isLoggable(Level.FINEST)) {
            logger.finest("embed (" + hopType + "): " + value.toString() +
                " from " + curi);
        }
        addLinkFromString(curi, value.toString(), context, hopType);
        this.numberOfLinksExtracted++;
    }

    /**
     * Entry point: extracts links from the recorded content of a CrawlURI.
     *
     * <p>Returns without doing anything unless the URI has HTTP content to
     * process and its content type is text/html or application/xhtml. When
     * {@link #ATTR_IGNORE_UNEXPECTED_HTML} is set, URIs rejected by
     * {@link #isHtmlExpectedHere(CrawlURI)} are skipped as well.
     *
     * @param curi CrawlURI to extract links from
     */
    public void extract(CrawlURI curi) {
        if (!isHttpTransactionContentToProcess(curi) ||
                !(isExpectedMimeType(curi.getContentType(), "text/html")
                   || isExpectedMimeType(curi.getContentType(),
                       "application/xhtml"))) {
            return;
        }

        final boolean ignoreUnexpectedHTML =
            ((Boolean)getUncheckedAttribute(curi,
                ATTR_IGNORE_UNEXPECTED_HTML)).booleanValue();

        if (ignoreUnexpectedHTML) {
            try {
                if (!isHtmlExpectedHere(curi)) {
                    // HTML was not expected (e.g. an IMG SRC target); skip.
                    return;
                }
            } catch (URIException e) {
                // The test itself failed; log and scan the content anyway.
                logger.severe("Failed expectedHTML test: " + e.getMessage());
            }
        }

        this.numberOfCURIsHandled++;

        ReplayCharSequence cs = null;
        try {
            HttpRecorder hr = curi.getHttpRecorder();
            if (hr == null) {
                throw new IOException("Why is recorder null here?");
            }
            cs = hr.getReplayCharSequence();
        } catch (IOException e) {
            curi.addLocalizedError(this.getName(), e,
                "Failed get of replay char sequence " + curi.toString() +
                    " " + e.getMessage());
            logger.log(Level.SEVERE,"Failed get of replay char sequence in " +
                Thread.currentThread().getName(), e);
        }

        if (cs == null) {
            return;
        }

        // cs is known to be non-null from here on; always close it.
        try {
            extract(curi, cs);
            // Mark the URI as fully handled by a link extractor.
            curi.linkExtractorFinished();
        } finally {
            try {
                cs.close();
            } catch (IOException ioe) {
                logger.warning(TextUtils.exceptionToString(
                    "Failed close of ReplayCharSequence.", ioe));
            }
        }
    }

    /**
     * Scans the character sequence tag by tag and dispatches each match.
     *
     * <p>Scanning stops early when the current thread is found interrupted
     * or when {@link #processMeta(CrawlURI, CharSequence)} returns true.
     *
     * @param curi CrawlURI being processed
     * @param cs content to scan
     */
    void extract(CrawlURI curi, CharSequence cs) {
        Matcher tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, cs);
        try {
            while (tags.find()) {
                // Note: Thread.interrupted() also clears the interrupt flag.
                if (Thread.interrupted()) {
                    break;
                }
                if (tags.start(8) >= 0) {
                    // HTML comment: nothing to extract.
                } else if (tags.start(7) >= 0) {
                    // <meta> tag
                    int start = tags.start(5);
                    int end = tags.end(5);
                    assert start >= 0: "Start is: " + start + ", " + curi;
                    assert end >= 0: "End is :" + end + ", " + curi;
                    if (processMeta(curi,
                            cs.subSequence(start, end))) {
                        // processMeta asked for extraction to be abandoned.
                        break;
                    }
                } else if (tags.start(5) >= 0) {
                    // Generic tag: group 6 is the name, group 5 the whole tag.
                    int start5 = tags.start(5);
                    int end5 = tags.end(5);
                    assert start5 >= 0: "Start is: " + start5 + ", " + curi;
                    assert end5 >= 0: "End is :" + end5 + ", " + curi;
                    int start6 = tags.start(6);
                    int end6 = tags.end(6);
                    assert start6 >= 0: "Start is: " + start6 + ", " + curi;
                    assert end6 >= 0: "End is :" + end6 + ", " + curi;
                    processGeneralTag(curi,
                        cs.subSequence(start6, end6),
                        cs.subSequence(start5, end5));
                } else if (tags.start(1) >= 0) {
                    // <script> ... </script>
                    int start = tags.start(1);
                    int end = tags.end(1);
                    assert start >= 0: "Start is: " + start + ", " + curi;
                    assert end >= 0: "End is :" + end + ", " + curi;
                    assert tags.end(2) >= 0: "Tags.end(2) illegal " + tags.end(2) +
                        ", " + curi;
                    processScript(curi, cs.subSequence(start, end),
                        tags.end(2) - start);
                } else if (tags.start(3) >= 0) {
                    // <style> ... </style>
                    int start = tags.start(3);
                    int end = tags.end(3);
                    assert start >= 0: "Start is: " + start + ", " + curi;
                    assert end >= 0: "End is :" + end + ", " + curi;
                    assert tags.end(4) >= 0: "Tags.end(4) illegal " + tags.end(4) +
                        ", " + curi;
                    processStyle(curi, cs.subSequence(start, end),
                        tags.end(4) - start);
                }
            }
        } finally {
            TextUtils.recycleMatcher(tags);
        }
    }

    /** Path extensions for which HTML content is not expected. */
    static final String NON_HTML_PATH_EXTENSION =
        "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+
        "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";

    /**
     * Tests whether HTML is plausible at this URI, judging by its path
     * extension.
     *
     * @param curi CrawlURI to examine
     * @return true when the path is null, has no dot, has an extension longer
     *         than four characters, or has an extension that does not match
     *         {@link #NON_HTML_PATH_EXTENSION}; false otherwise
     * @throws URIException if the path cannot be obtained from the URI
     */
    protected boolean isHtmlExpectedHere(CrawlURI curi) throws URIException {
        String path = curi.getUURI().getPath();
        if (path == null) {
            // No path: no reason to rule HTML out.
            return true;
        }
        int dot = path.lastIndexOf('.');
        if (dot < 0) {
            // No extension at all.
            return true;
        }
        if (dot < (path.length() - 5)) {
            // Extension longer than any entry in NON_HTML_PATH_EXTENSION.
            return true;
        }
        String ext = path.substring(dot + 1);
        return !TextUtils.matches(NON_HTML_PATH_EXTENSION, ext);
    }

    /**
     * Handles a SCRIPT element: its open-tag attributes are processed like
     * any other tag, then the script body is scanned as script code.
     *
     * @param curi CrawlURI being processed
     * @param sequence text from the start of the open tag's element name up
     *        to the start of the close tag
     * @param endOfOpenTag offset within {@code sequence} where the open tag
     *        ends
     */
    protected void processScript(CrawlURI curi, CharSequence sequence,
            int endOfOpenTag) {
        // The first six characters are the element name, "script".
        processGeneralTag(curi, sequence.subSequence(0, "script".length()),
            sequence.subSequence(0, endOfOpenTag));

        processScriptCode(
            curi, sequence.subSequence(endOfOpenTag, sequence.length()));
    }

    /**
     * Handles a META tag: records a robots directive, or adds the target of
     * an http-equiv refresh as a link.
     *
     * @param curi CrawlURI being processed
     * @param cs the META tag text (element name plus attributes)
     * @return true when a robots meta tag containing "nofollow" or "none"
     *         applies (no honoring policy, or one that is neither IGNORE nor
     *         CUSTOM) and extraction should be abandoned; false otherwise
     */
    protected boolean processMeta(CrawlURI curi, CharSequence cs) {
        String name = null;
        String httpEquiv = null;
        String content = null;

        Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
        try {
            while (attr.find()) {
                final int valueGroup =
                    (attr.start(13) > -1) ? 13 : (attr.start(14) > -1) ? 14 : 15;
                final CharSequence value =
                    cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
                final String attrName = attr.group(1);
                if (attrName.equalsIgnoreCase("name")) {
                    name = value.toString();
                } else if (attrName.equalsIgnoreCase("http-equiv")) {
                    httpEquiv = value.toString();
                } else if (attrName.equalsIgnoreCase("content")) {
                    content = value.toString();
                }
            }
        } finally {
            TextUtils.recycleMatcher(attr);
        }

        if ("robots".equalsIgnoreCase(name) && content != null) {
            curi.putString(A_META_ROBOTS, content);
            RobotsHonoringPolicy policy =
                getSettingsHandler().getOrder().getRobotsHonoringPolicy();
            String contentLower = content.toLowerCase();
            final boolean honorRobots = policy == null
                || (!policy.isType(curi, RobotsHonoringPolicy.IGNORE)
                    && !policy.isType(curi, RobotsHonoringPolicy.CUSTOM));
            final boolean forbidsFollow = contentLower.indexOf("nofollow") >= 0
                || contentLower.indexOf("none") >= 0;
            if (honorRobots && forbidsFollow) {
                logger.fine("HTML extraction skipped due to robots meta-tag for: "
                    + curi.toString());
                return true;
            }
        } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {
            // Only content of the form "<delay>; url=<target>" names a link.
            // A bare delay such as "5" has no '=' and must not be queued as
            // if the delay itself were a relative URI.
            final int urlIndex = content.indexOf("=") + 1;
            if (urlIndex > 0) {
                String refreshUri = content.substring(urlIndex);
                try {
                    curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
                        Link.REFER_HOP);
                } catch (URIException e) {
                    if (getController() != null) {
                        getController().logUriError(e, curi.getUURI(),
                            refreshUri);
                    } else {
                        logger.info("Failed createAndAddLinkRelativeToBase " +
                            curi + ", " + cs + ", " + refreshUri + ": " + e);
                    }
                }
            }
        }
        return false;
    }

    /**
     * Handles a STYLE element: its open-tag attributes are processed like any
     * other tag, then the style body is handed to the CSS extractor.
     *
     * @param curi CrawlURI being processed
     * @param sequence text from the start of the open tag's element name up
     *        to the start of the close tag
     * @param endOfOpenTag offset within {@code sequence} where the open tag
     *        ends
     */
    protected void processStyle(CrawlURI curi, CharSequence sequence,
            int endOfOpenTag) {
        // The element name "style" is five characters long; taking six would
        // drag the following '>' or whitespace into the element name.
        processGeneralTag(curi, sequence.subSequence(0, "style".length()),
            sequence.subSequence(0, endOfOpenTag));

        this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
            curi, sequence.subSequence(endOfOpenTag, sequence.length()),
            getController());
    }

    /**
     * Builds a short text report of this processor's counters.
     *
     * @return report naming the processor, its function, the number of
     *         CrawlURIs handled and the number of links extracted
     */
    public String report() {
        StringBuilder ret = new StringBuilder();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorHTML\n");
        ret.append("  Function:          Link extraction on HTML documents\n");
        ret.append("  CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
        ret.append("  Links extracted:   " + this.numberOfLinksExtracted +
            "\n\n");
        return ret.toString();
    }
}