package org.archive.crawler.extractor;

import java.util.logging.Logger;

import org.archive.crawler.datamodel.CrawlURI;

/**
 * Aggressive HTML extractor. Subclasses {@link ExtractorHTML} and does
 * everything its parent does, except in regard to javascript blocks: it
 * first processes JS as its parent does, then reruns the JS content
 * treating it as HTML (which may cause many false positives), and
 * finishes by applying heuristics against the script code looking for
 * possible URIs.
 */
public class AggressiveExtractorHTML
extends ExtractorHTML {

    private static final long serialVersionUID = 3586060081186247087L;

    static Logger logger =
        Logger.getLogger(AggressiveExtractorHTML.class.getName());

    /**
     * @param name Name of this processor module.
     */
    public AggressiveExtractorHTML(String name) {
        super(name, "Aggressive HTML extractor. Subclasses ExtractorHTML " +
            " so does all that it does, except in regard to javascript " +
            " blocks. Here " +
            " it first processes as JS as its parent does, but then it " +
            " reruns through the JS treating it as HTML (May cause many " +
            " false positives). It finishes by applying heuristics " +
            " against script code looking for possible URIs. ");
    }

    /**
     * Process a script block: first as javascript (parent behavior),
     * then rerun the script body through the general-tag HTML scan so
     * any markup-like content inside the script is also mined for URIs.
     *
     * @param curi CrawlURI being processed.
     * @param sequence Full script element text, open tag included.
     * @param endOfOpenTag Index just past the end of the open tag.
     */
    @Override
    protected void processScript(CrawlURI curi, CharSequence sequence,
            int endOfOpenTag) {
        super.processScript(curi, sequence, endOfOpenTag);
        // Rerun the script body as though it were HTML. The first six
        // characters stand in for the element name; the remainder (past
        // the open tag) is scanned as tag/attribute content.
        processGeneralTag(curi, sequence.subSequence(0, 6),
            sequence.subSequence(endOfOpenTag, sequence.length()));
    }

    /**
     * Summarize this processor's activity.
     *
     * @return Human-readable report of URIs handled and links extracted.
     */
    @Override
    public String report() {
        // StringBuilder suffices: the buffer is method-local, never shared.
        StringBuilder ret = new StringBuilder(256);
        // Fix: previously misreported itself as ExtractorHTML2 (copy-paste).
        ret.append("Processor: org.archive.crawler.extractor." +
            "AggressiveExtractorHTML\n");
        ret.append("  Function:          Link extraction on HTML documents " +
            "(including embedded CSS)\n");
        ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
        return ret.toString();
    }
}