package org.archive.crawler.extractor;

import java.io.IOException;
import java.util.Locale;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.io.ReplayCharSequence;
import org.archive.net.UURI;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;

/**
 * Speculative link extractor for JavaScript content.
 *
 * <p>Scans the replayed response body for quoted string literals and treats
 * every literal that looks like a URI as an outlink of type
 * {@code Link.JS_MISC} reached via {@code Link.SPECULATIVE_HOP}.
 */
public class ExtractorJS extends Extractor implements CoreAttributeConstants {

    private static final long serialVersionUID = -2231962381454717720L;

    private static final Logger LOGGER =
        Logger.getLogger("org.archive.crawler.extractor.ExtractorJS");

    /** A literal ampersand. */
    static final String AMP = "&";

    /**
     * The HTML-escaped form of an ampersand. It is used as the pattern when
     * unescaping candidate URIs, so it must differ from {@link #AMP};
     * otherwise the replacement in {@code considerStrings} is a no-op.
     */
    static final String ESCAPED_AMP = "&amp;";

    /** Regular expression matching a single whitespace character. */
    static final String WHITESPACE = "\\s";

    /**
     * Matches a quoted run of non-whitespace characters.
     *
     * <p>Group 1 is the opening delimiter: a single or double quote,
     * optionally preceded by up to eight backslashes. Group 2 is the quoted
     * content, at most {@code UURI.MAX_URL_LENGTH} non-whitespace characters,
     * matched reluctantly. The match ends at the next occurrence of exactly
     * the same delimiter (back-reference to group 1).
     */
    static final String JAVASCRIPT_STRING_EXTRACTOR =
        "(\\\\{0,8}+(?:\"|\'))(\\S{0,"+UURI.MAX_URL_LENGTH+"}?)(?:\\1)";

    /**
     * Decides whether a whole string looks like a URI.
     *
     * <p>The string must start with a word character or with up to two dots
     * followed by a slash, contain at least one dot or slash, end with a word
     * character or a slash, and contain no whitespace and no angle brackets.
     */
    static final String STRING_URI_DETECTOR =
        "(?:\\w|[\\.]{0,2}/)[\\S&&[^<>]]*(?:\\.|/)[\\S&&[^<>]]*(?:\\w|/)";

    /** Number of CrawlURIs this instance has run extraction on. */
    protected long numberOfCURIsHandled = 0;

    /**
     * Number of links found by {@link #extract(CrawlURI)}. The counter is
     * static, so it is shared by all instances, and it is updated without
     * synchronization.
     */
    protected static long numberOfLinksExtracted = 0;

    /**
     * Creates the extractor.
     *
     * @param name name of this processor, passed through to the superclass
     */
    public ExtractorJS(String name) {
        super(name, "JavaScript extractor. Link extraction on JavaScript" +
            " files (.js).");
    }

    /**
     * Extracts speculative links from {@code curi} when its content appears
     * to be JavaScript.
     *
     * <p>Nothing is done when the URI has no HTTP content to process, when
     * its content type is unknown, or when neither the content type, the URI
     * itself, nor the via context suggests JavaScript.
     *
     * @param curi the CrawlURI whose recorded response is examined
     */
    public void extract(CrawlURI curi) {
        if (!isHttpTransactionContentToProcess(curi)) {
            return;
        }
        String contentType = curi.getContentType();
        if (contentType == null) {
            return;
        }
        if (!isJavaScriptCandidate(curi, contentType)) {
            return;
        }

        this.numberOfCURIsHandled++;

        ReplayCharSequence cs = null;
        try {
            cs = curi.getHttpRecorder().getReplayCharSequence();
        } catch (IOException e) {
            curi.addLocalizedError(this.getName(), e,
                "Failed get of replay char sequence.");
        }
        if (cs == null) {
            LOGGER.warning("Failed getting ReplayCharSequence: " +
                curi.toString());
            return;
        }

        try {
            try {
                numberOfLinksExtracted += considerStrings(curi, cs,
                    getController(), true);
            } catch (StackOverflowError e) {
                // A stack overflow while scanning is reported and extraction
                // for this URI simply stops; the URI is still marked as
                // finished below.
                DevUtils.warnHandle(e, "ExtractorJS StackOverflowError");
            }
            curi.linkExtractorFinished();
        } finally {
            // cs is known to be non-null here: the null case returned above.
            try {
                cs.close();
            } catch (IOException ioe) {
                LOGGER.warning(TextUtils.exceptionToString(
                    "Failed close of ReplayCharSequence.", ioe));
            }
        }
    }

    /**
     * Tells whether {@code curi} should be scanned as JavaScript.
     *
     * <p>Any one of the following is sufficient: the content type contains
     * "javascript", "jscript" or "ecmascript"; the URI ends with ".js"
     * (ignoring case); or the via context starts with "script" (ignoring
     * case).
     *
     * <p>Lower-casing uses a fixed locale so that the comparison does not
     * depend on the JVM default locale (e.g. the Turkish dotless i would
     * otherwise break the "script" match).
     *
     * @param curi the CrawlURI under consideration
     * @param contentType the non-null content type of {@code curi}
     * @return true if the content should be scanned for links
     */
    private static boolean isJavaScriptCandidate(CrawlURI curi,
            String contentType) {
        if (contentType.indexOf("javascript") >= 0
                || contentType.indexOf("jscript") >= 0
                || contentType.indexOf("ecmascript") >= 0) {
            return true;
        }
        if (curi.toString().toLowerCase(Locale.ENGLISH).endsWith(".js")) {
            return true;
        }
        Object viaContext = curi.getViaContext();
        return viaContext != null
            && viaContext.toString().toLowerCase(Locale.ENGLISH)
                .startsWith("script");
    }

    /**
     * Scans {@code cs} for quoted strings and adds every string that looks
     * like a URI as a speculative link on {@code curi}.
     *
     * <p>A quoted string that is not itself URI-like is scanned recursively,
     * so that strings nested inside other strings are also considered.
     * Strings that look like URIs but cannot be turned into links are logged
     * and otherwise ignored; they are still included in the returned count.
     *
     * @param curi the CrawlURI that receives the discovered links
     * @param cs the character sequence to scan
     * @param controller used to log URI errors; may be null, in which case
     *        errors go to this class's logger
     * @param handlingJSFile if true, links are resolved relative to the via
     *        URI, otherwise relative to the base URI
     * @return the number of URI-like strings found
     */
    public static long considerStrings(CrawlURI curi, CharSequence cs,
            CrawlController controller, boolean handlingJSFile) {
        long foundLinks = 0;
        Matcher strings =
            TextUtils.getMatcher(JAVASCRIPT_STRING_EXTRACTOR, cs);
        while (strings.find()) {
            // Group 2 is the text between the matching quotes.
            CharSequence subsequence =
                cs.subSequence(strings.start(2), strings.end(2));
            Matcher uri =
                TextUtils.getMatcher(STRING_URI_DETECTOR, subsequence);
            if (uri.matches()) {
                String string = uri.group();
                // Undo HTML escaping of ampersands in the candidate URI.
                string = TextUtils.replaceAll(ESCAPED_AMP, string, AMP);
                foundLinks++;
                try {
                    if (handlingJSFile) {
                        curi.createAndAddLinkRelativeToVia(string,
                            Link.JS_MISC, Link.SPECULATIVE_HOP);
                    } else {
                        curi.createAndAddLinkRelativeToBase(string,
                            Link.JS_MISC, Link.SPECULATIVE_HOP);
                    }
                } catch (URIException e) {
                    if (controller != null) {
                        controller.logUriError(e, curi.getUURI(), string);
                    } else {
                        LOGGER.info(curi + ", " + string + ": " +
                            e.getMessage());
                    }
                }
            } else {
                // Not a URI as a whole: look for strings nested inside it.
                foundLinks += considerStrings(curi, subsequence,
                    controller, handlingJSFile);
            }
            TextUtils.recycleMatcher(uri);
        }
        TextUtils.recycleMatcher(strings);
        return foundLinks;
    }

    /**
     * Builds a short textual report of this processor's counters.
     *
     * @return the report text
     */
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorJS\n");
        ret.append(" Function: Link extraction on JavaScript code\n");
        ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");

        return ret.toString();
    }
}