1 26 27 package org.archive.crawler.extractor; 28 29 import java.util.Collection ; 30 import java.util.logging.Level ; 31 import java.util.logging.Logger ; 32 import java.util.regex.Matcher ; 33 34 import org.apache.commons.httpclient.URIException; 35 import org.archive.crawler.datamodel.CoreAttributeConstants; 36 import org.archive.crawler.datamodel.CrawlURI; 37 import org.archive.crawler.settings.SimpleType; 38 import org.archive.util.TextUtils; 39 40 57 58 public class ExtractorImpliedURI extends Extractor implements CoreAttributeConstants { 59 60 private static final long serialVersionUID = 8579045413127769497L; 61 62 private static Logger LOGGER = 63 Logger.getLogger(ExtractorImpliedURI.class.getName()); 64 65 66 public static final String ATTR_TRIGGER_REGEXP = "trigger-regexp"; 67 68 public static final String ATTR_BUILD_PATTERN = "build-pattern"; 69 70 private long numberOfCURIsHandled = 0; 73 private long numberOfLinksExtracted = 0; 74 75 80 public ExtractorImpliedURI(String name) { 81 super(name, "Implied URI Extractor. Finds URIs implied by other " + 82 "URIs according to regex/replacement patterns. Should " + 83 "appear after most other extractors."); 84 85 addElementToDefinition( 86 new SimpleType(ATTR_TRIGGER_REGEXP, 87 "Triggering regular expression. When a discovered URI " + 88 "matches this pattern, the 'implied' URI will be " + 89 "built. The capturing groups of this expression are " + 90 "available for the build replacement pattern.", "")); 91 addElementToDefinition( 92 new SimpleType(ATTR_BUILD_PATTERN, 93 "Replacement pattern to build 'implied' URI, using " + 94 "captured groups of trigger expression.", "")); 95 } 96 97 102 public void extract(CrawlURI curi) { 103 104 this.numberOfCURIsHandled++; 105 Collection <Link> links = curi.getOutLinks(); 107 Link[] sourceLinks = links.toArray(new Link[links.size()]); 108 for (Link wref: sourceLinks) { 109 String implied = extractImplied( 110 wref.getDestination(), 111 (String )getUncheckedAttribute(curi,ATTR_TRIGGER_REGEXP), 112 (String )getUncheckedAttribute(curi,ATTR_BUILD_PATTERN)); 113 if (implied!=null) { 114 try { 115 curi.createAndAddLink( 116 implied, 117 Link.SPECULATIVE_MISC, 118 Link.SPECULATIVE_HOP); 119 numberOfLinksExtracted++; 120 } catch (URIException e) { 121 LOGGER.log(Level.FINE, "bad URI", e); 122 } 123 } 124 } 125 } 126 127 136 protected static String extractImplied(CharSequence uri, String trigger, String build) { 137 if(trigger.length()==0) { 138 return null; 140 } 141 Matcher m = TextUtils.getMatcher(trigger, uri); 142 if(m.matches()) { 143 String result = m.replaceFirst(build); 144 TextUtils.recycleMatcher(m); 145 return result; 146 } 147 return null; 148 } 149 150 public String report() { 151 StringBuffer ret = new StringBuffer (); 152 ret.append("Processor: "+ExtractorImpliedURI.class.getName()+"\n"); 153 ret.append(" Function: Extracts links inside other URIs\n"); 154 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n"); 155 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n"); 156 157 return ret.toString(); 158 } 159 } 160 | Popular Tags |