1 26 27 package org.archive.crawler.extractor; 28 29 import java.util.ArrayList ; 30 import java.util.Collection ; 31 import java.util.List ; 32 import java.util.logging.Level ; 33 import java.util.logging.Logger ; 34 import java.util.regex.Matcher ; 35 36 import org.apache.commons.codec.DecoderException; 37 import org.apache.commons.httpclient.URIException; 38 import org.archive.crawler.datamodel.CoreAttributeConstants; 39 import org.archive.crawler.datamodel.CrawlURI; 40 import org.archive.net.LaxURLCodec; 41 import org.archive.net.UURI; 42 import org.archive.util.TextUtils; 43 44 57 58 public class ExtractorURI extends Extractor implements CoreAttributeConstants { 59 60 private static final long serialVersionUID = -6273897743240970822L; 61 62 private static Logger LOGGER = 63 Logger.getLogger(ExtractorURI.class.getName()); 64 65 static final String ABS_HTTP_URI_PATTERN = "^https?://[^\\s<>]*$"; 66 67 private long numberOfCURIsHandled = 0; 70 private long numberOfLinksExtracted = 0; 71 72 77 public ExtractorURI(String name) { 78 super(name, "URI Extractor. Extracts links inside other " + 79 "discovered URIs. Should appear last among extractors."); 80 } 81 82 87 public void extract(CrawlURI curi) { 88 89 this.numberOfCURIsHandled++; 90 Collection <Link> links = curi.getOutLinks(); 92 Link[] sourceLinks = links.toArray(new Link[links.size()]); 93 for (Link wref: sourceLinks) { 94 extractLink(curi,wref); 95 } 96 } 97 98 104 protected void extractLink(CrawlURI curi, Link wref) { 105 UURI source = UURI.from(wref.getDestination()); 106 if(source == null) { 107 return; 109 } 110 List <String > found = extractQueryStringLinks(source); 111 for (String uri : found) { 112 try { 113 curi.createAndAddLink( 114 uri, 115 Link.SPECULATIVE_MISC, 116 Link.SPECULATIVE_HOP); 117 numberOfLinksExtracted++; 118 } catch (URIException e) { 119 LOGGER.log(Level.FINE, "bad URI", e); 120 } 121 } 122 124 } 125 126 134 protected static List <String > extractQueryStringLinks(UURI source) { 135 List <String > results = new ArrayList <String >(); 136 String decodedQuery; 137 try { 138 decodedQuery = source.getQuery(); 139 } catch (URIException e1) { 140 return results; 142 } 143 if(decodedQuery==null) { 144 return results; 145 } 146 Matcher m = TextUtils.getMatcher(ABS_HTTP_URI_PATTERN,decodedQuery); 148 if(m.matches()) { 149 TextUtils.recycleMatcher(m); 150 results.add(decodedQuery); 151 } 152 String rawQuery = new String (source.getRawQuery()); 154 String [] params = rawQuery.split("&"); 155 for (String param : params) { 156 String [] keyVal = param.split("="); 157 if(keyVal.length==2) { 158 String candidate; 159 try { 160 candidate = LaxURLCodec.DEFAULT.decode(keyVal[1]); 161 } catch (DecoderException e) { 162 continue; 163 } 164 m.reset(candidate); 166 if(m.matches()) { 167 results.add(candidate); 168 } 169 } 170 } 171 return results; 172 } 173 174 public String report() { 175 StringBuffer ret = new StringBuffer (); 176 ret.append("Processor: "+ExtractorURI.class.getName()+"\n"); 177 ret.append(" Function: Extracts links inside other URIs\n"); 178 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n"); 179 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n"); 180 181 return ret.toString(); 182 } 183 } 184 | Popular Tags |