package org.archive.crawler.extractor;

import java.io.IOException;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.io.ReplayCharSequence;
import org.archive.net.UURI;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;

/**
 * Extractor that discovers links inside Cascading Style Sheets.
 *
 * <p>Style text is scanned with {@link #CSS_URI_EXTRACTOR}, which matches
 * <code>url(...)</code> references and <code>@import</code> rules. Every
 * match is added to the {@link CrawlURI} as an embedded link
 * ({@code Link.EMBED_MISC} / {@code Link.EMBED_HOP}).
 *
 * <p>The instance keeps two running counters (URIs handled, links found)
 * that are exposed through {@link #report()}.
 */
public class ExtractorCSS extends Extractor implements CoreAttributeConstants {

    private static final long serialVersionUID = -1540252885329424902L;

    private static final Logger logger =
        Logger.getLogger("org.archive.crawler.extractor.ExtractorCSS");

    /**
     * HTML-escaped ampersand. Occurrences inside an extracted URI are
     * replaced by a plain <code>&amp;</code> before the link is created.
     * (Previously this constant was a bare "&amp;" character, which made
     * the replacement a no-op.)
     */
    private static final String ESCAPED_AMP = "&amp;";

    /**
     * Regex for a CSS backslash escape: a backslash followed by a comma,
     * single quote, double quote, parenthesis or whitespace character.
     * Group 1 captures the escaped character so the backslash can be
     * dropped with a <code>$1</code> replacement.
     */
    static final String CSS_BACKSLASH_ESCAPE = "\\\\([,'\"\\(\\)\\s])";

    /**
     * Case-insensitive regex locating URIs in style text. It matches either
     * <code>@import </code> (optionally followed by <code>url(</code>) or a
     * bare <code>url(</code>, then optional whitespace and an optional
     * opening quote (group 1). Group 2 is the URI itself: a non-quote
     * character plus, reluctantly, up to {@link UURI#MAX_URL_LENGTH} further
     * characters. The match must end with the same quote as group 1,
     * optional whitespace, and a <code>)</code> or <code>;</code>.
     */
    static final String CSS_URI_EXTRACTOR =
        "(?i)(?:@import (?:url[(]|)|url[(])\\s*([\\\"\']?)" +
        "([^\\\"\'].{0,"+UURI.MAX_URL_LENGTH+"}?)\\1\\s*[);]";

    /** Number of CrawlURIs whose content was scanned by this instance. */
    private long numberOfCURIsHandled = 0;

    /** Total of the link counts returned by {@link #processStyleCode}. */
    private long numberOfLinksExtracted = 0;

    /**
     * Creates the extractor.
     *
     * @param name name handed to the superclass constructor
     */
    public ExtractorCSS(String name) {
        super(name, "CSS Extractor. Extracts links from Cascading Style" +
            " Sheets (.css).");
    }

    /**
     * Scans the recorded content of <code>curi</code> for CSS links.
     *
     * <p>Nothing is done unless
     * {@code isHttpTransactionContentToProcess(curi)} is true, the content
     * type is non-null, and either the content type contains "css" or the
     * URI string ends with ".css" (both compared in lower case).
     *
     * @param curi the CrawlURI whose content should be examined
     */
    public void extract(CrawlURI curi) {
        if (!isHttpTransactionContentToProcess(curi)) {
            return;
        }
        String mimeType = curi.getContentType();
        if (mimeType == null) {
            return;
        }
        boolean cssMime = mimeType.toLowerCase().indexOf("css") >= 0;
        if (!cssMime && !curi.toString().toLowerCase().endsWith(".css")) {
            return;
        }
        this.numberOfCURIsHandled++;

        ReplayCharSequence cs = null;
        try {
            cs = curi.getHttpRecorder().getReplayCharSequence();
        } catch (IOException e) {
            logger.severe("Failed getting ReplayCharSequence: " + e.getMessage());
        }
        if (cs == null) {
            logger.warning("Failed getting ReplayCharSequence: " +
                curi.toString());
            return;
        }

        try {
            this.numberOfLinksExtracted +=
                processStyleCode(curi, cs, getController());
            // Only reached when processStyleCode returned normally.
            curi.linkExtractorFinished();
        } finally {
            // cs is known to be non-null here; always release it.
            try {
                cs.close();
            } catch (IOException ioe) {
                logger.warning(TextUtils.exceptionToString(
                    "Failed close of ReplayCharSequence.", ioe));
            }
        }
    }

    /**
     * Finds every URI in the given style text and adds it to
     * <code>curi</code> as an embedded link relative to its base.
     *
     * <p>Each candidate has HTML-escaped ampersands and CSS backslash
     * escapes undone first. A {@link URIException} raised while creating a
     * link is passed to {@code controller.logUriError} when a controller is
     * supplied, otherwise logged at INFO level; the candidate is still
     * counted. A {@link StackOverflowError} during matching is handed to
     * {@code DevUtils.warnHandle} and ends the scan early.
     *
     * @param curi CrawlURI that receives the discovered links
     * @param cs style text to scan
     * @param controller used to log URI errors; may be <code>null</code>
     * @return number of URI candidates matched, including any that failed
     *         link creation
     */
    public static long processStyleCode(CrawlURI curi, CharSequence cs,
            CrawlController controller) {
        long foundLinks = 0;
        Matcher uris = null;
        try {
            uris = TextUtils.getMatcher(CSS_URI_EXTRACTOR, cs);
            while (uris.find()) {
                String cssUri = uris.group(2);
                // Undo HTML escaping of ampersands.
                cssUri = TextUtils.replaceAll(ESCAPED_AMP, cssUri, "&");
                // Drop the backslash from CSS-escaped characters.
                cssUri = TextUtils.replaceAll(CSS_BACKSLASH_ESCAPE, cssUri,
                    "$1");
                foundLinks++;
                try {
                    curi.createAndAddLinkRelativeToBase(cssUri,
                        Link.EMBED_MISC, Link.EMBED_HOP);
                } catch (URIException e) {
                    if (controller != null) {
                        controller.logUriError(e, curi.getUURI(), cssUri);
                    } else {
                        logger.info(curi + ", " + cssUri + ": " +
                            e.getMessage());
                    }
                }
            }
        } catch (StackOverflowError e) {
            DevUtils.warnHandle(e, "ExtractorCSS StackOverflowError");
        } finally {
            // uris is still null if getMatcher itself failed; skipping the
            // recycle then avoids a secondary failure hiding the first one.
            if (uris != null) {
                TextUtils.recycleMatcher(uris);
            }
        }
        return foundLinks;
    }

    /**
     * Builds a short plain-text summary of this processor's counters.
     *
     * @return report text listing the CrawlURIs handled and links extracted
     */
    public String report() {
        StringBuilder ret = new StringBuilder();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorCSS\n");
        ret.append(" Function: Link extraction on Cascading Style Sheets (.css)\n");
        ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");

        return ret.toString();
    }
}