1 26 27 package org.archive.extractor; 28 29 import java.util.regex.Matcher ; 30 31 import org.apache.commons.httpclient.URIException; 32 import org.archive.crawler.extractor.Link; 33 import org.archive.net.UURIFactory; 34 import org.archive.util.DevUtils; 35 import org.archive.util.TextUtils; 36 37 54 55 public class RegexpCSSLinkExtractor extends CharSequenceLinkExtractor { 56 57 60 private static String ESCAPED_AMP = "&"; 61 static final String CSS_BACKSLASH_ESCAPE = "\\\\([,'\"\\(\\)\\s])"; 65 66 protected Matcher uris; 67 68 73 static final String CSS_URI_EXTRACTOR = 74 "(?:@import (?:url[(]|)|url[(])\\s*([\\\"\']?)([^\\\"\'].*?)\\1\\s*[);]"; 75 76 protected boolean findNextLink() { 77 if (uris == null) { 78 uris = TextUtils.getMatcher(CSS_URI_EXTRACTOR, sourceContent); 79 } 82 String cssUri; 83 try { 84 while (uris.find()) { 85 cssUri = uris.group(2); 86 cssUri = TextUtils.replaceAll(ESCAPED_AMP, cssUri, "&"); 88 cssUri = TextUtils.replaceAll(CSS_BACKSLASH_ESCAPE, cssUri, "$1"); 90 try { 92 Link link = new Link(source, UURIFactory.getInstance(base, 93 cssUri), Link.EMBED_MISC, Link.EMBED_HOP); 94 next.addLast(link); 95 } catch (URIException e) { 96 extractErrorListener.noteExtractError(e, source, cssUri); 97 } 98 return true; 99 } 100 } catch (StackOverflowError e) { 101 DevUtils.warnHandle(e, "RegexpCSSLinkExtractor StackOverflowError"); 102 } 103 return false; 104 } 105 106 public void reset() { 107 super.reset(); 108 TextUtils.recycleMatcher(uris); 109 uris = null; 110 } 111 112 protected static CharSequenceLinkExtractor newDefaultInstance() { 113 return new RegexpCSSLinkExtractor(); 114 } 115 } 116 | Popular Tags |