1 24 package org.archive.crawler.extractor; 25 26 import java.util.logging.Logger ; 27 28 import org.apache.commons.httpclient.Header; 29 import org.apache.commons.httpclient.HttpMethod; 30 import org.apache.commons.httpclient.URIException; 31 import org.archive.crawler.datamodel.CoreAttributeConstants; 32 import org.archive.crawler.datamodel.CrawlURI; 33 import org.archive.crawler.framework.Processor; 34 35 39 public class ExtractorHTTP extends Processor 40 implements CoreAttributeConstants { 41 42 private static final long serialVersionUID = 8499072198570554647L; 43 44 private static final Logger LOGGER = 45 Logger.getLogger(ExtractorHTTP.class.getName()); 46 protected long numberOfCURIsHandled = 0; 47 protected long numberOfLinksExtracted = 0; 48 49 public ExtractorHTTP(String name) { 50 super(name, 51 "HTTP extractor. Extracts URIs from HTTP response headers."); 52 } 53 54 public void innerProcess(CrawlURI curi) { 55 if (!curi.isHttpTransaction() || curi.getFetchStatus() <= 0) { 56 return; 58 } 59 numberOfCURIsHandled++; 60 HttpMethod method = (HttpMethod)curi.getObject(A_HTTP_TRANSACTION); 61 addHeaderLink(curi, method.getResponseHeader("Location")); 62 addHeaderLink(curi, method.getResponseHeader("Content-Location")); 63 } 64 65 protected void addHeaderLink(CrawlURI curi, Header loc) { 66 if (loc == null) { 67 return; 69 } 70 try { 72 curi.createAndAddLink(loc.getValue(), loc.getName() + ":", 73 Link.REFER_HOP); 74 numberOfLinksExtracted++; 75 } catch (URIException e) { 76 if (getController() != null) { 79 getController().logUriError(e, curi.getUURI(), loc.getValue()); 80 } else { 81 LOGGER.info(curi + ", " + loc.getValue() + ": " + 82 e.getMessage()); 83 } 84 } 85 86 } 87 88 public String report() { 89 StringBuffer ret = new StringBuffer (); 90 ret.append("Processor: org.archive.crawler.extractor.ExtractorHTTP\n"); 91 ret.append(" Function: " + 92 "Extracts URIs from HTTP response headers\n"); 93 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n"); 94 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n"); 95 return ret.toString(); 96 } 97 } 98 | Popular Tags |