1 26 27 package org.archive.crawler.extractor; 28 29 import java.io.IOException ; 30 import java.util.logging.Logger ; 31 import java.util.regex.Matcher ; 32 33 import org.apache.commons.httpclient.URIException; 34 import org.archive.crawler.datamodel.CoreAttributeConstants; 35 import org.archive.crawler.datamodel.CrawlURI; 36 import org.archive.crawler.framework.CrawlController; 37 import org.archive.io.ReplayCharSequence; 38 import org.archive.util.TextUtils; 39 40 48 49 public class ExtractorXML extends Extractor implements CoreAttributeConstants { 50 51 private static final long serialVersionUID = 3101230586822401584L; 52 53 private static Logger logger = 54 Logger.getLogger(ExtractorXML.class.getName()); 55 56 private static String ESCAPED_AMP = "&"; 57 58 static final String XML_URI_EXTRACTOR = 59 "(?i)[\"\'>]\\s*(http:[^\\s\"\'<>]+)\\s*[\"\'<]"; 60 63 private long numberOfCURIsHandled = 0; 64 private long numberOfLinksExtracted = 0; 65 66 69 public ExtractorXML(String name) { 70 super(name, "XML Extractor. Extracts links from XML/RSS."); 71 } 72 73 76 public void extract(CrawlURI curi) { 77 if (!isHttpTransactionContentToProcess(curi)) { 78 return; 79 } 80 String mimeType = curi.getContentType(); 81 if (mimeType == null) { 82 return; 83 } 84 if ((mimeType.toLowerCase().indexOf("xml") < 0) 85 && (!curi.toString().toLowerCase().endsWith(".rss")) 86 && (!curi.toString().toLowerCase().endsWith(".xml"))) { 87 return; 88 } 89 this.numberOfCURIsHandled++; 90 91 ReplayCharSequence cs = null; 92 try { 93 cs = curi.getHttpRecorder().getReplayCharSequence(); 94 } catch (IOException e) { 95 logger.severe("Failed getting ReplayCharSequence: " + e.getMessage()); 96 } 97 if (cs == null) { 98 logger.severe("Failed getting ReplayCharSequence: " + 99 curi.toString()); 100 return; 101 } 102 try { 103 this.numberOfLinksExtracted += processXml(curi, cs, 104 getController()); 105 curi.linkExtractorFinished(); 107 } finally { 108 if (cs != null) { 109 try { 110 cs.close(); 111 } catch (IOException ioe) { 112 logger.warning(TextUtils.exceptionToString( 113 "Failed close of ReplayCharSequence.", ioe)); 114 } 115 } 116 } 117 } 118 119 public static long processXml(CrawlURI curi, CharSequence cs, 120 CrawlController controller) { 121 long foundLinks = 0; 122 Matcher uris = null; 123 String xmlUri; 124 uris = TextUtils.getMatcher(XML_URI_EXTRACTOR, cs); 125 while (uris.find()) { 126 xmlUri = uris.group(1); 127 xmlUri = TextUtils.replaceAll(ESCAPED_AMP, xmlUri, "&"); 129 foundLinks++; 130 try { 131 curi.createAndAddLink(xmlUri,Link.SPECULATIVE_MISC, 135 Link.SPECULATIVE_HOP); 136 } catch (URIException e) { 137 if (controller != null) { 140 controller.logUriError(e, curi.getUURI(), xmlUri); 141 } else { 142 logger.info(curi + ", " + xmlUri + ": " + 143 e.getMessage()); 144 } 145 } 146 } 147 TextUtils.recycleMatcher(uris); 148 return foundLinks; 149 } 150 151 public String report() { 152 StringBuffer ret = new StringBuffer (); 153 ret.append("Processor: org.archive.crawler.extractor.ExtractorXML\n"); 154 ret.append(" Function: Link extraction on XML/RSS\n"); 155 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n"); 156 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n"); 157 158 return ret.toString(); 159 } 160 } 161 | Popular Tags |