1 24 package org.archive.extractor; 25 26 import java.util.LinkedList ; 27 import java.util.regex.Matcher ; 28 import java.util.regex.Pattern ; 29 30 import org.apache.commons.httpclient.URIException; 31 import org.archive.crawler.extractor.Link; 32 import org.archive.net.UURI; 33 import org.archive.net.UURIFactory; 34 import org.archive.util.TextUtils; 35 36 43 public class RegexpJSLinkExtractor extends CharSequenceLinkExtractor { 44 47 static final String AMP = "&"; 48 static final String ESCAPED_AMP = "&"; 49 static final String WHITESPACE = "\\s"; 50 51 static final Pattern JAVASCRIPT_STRING_EXTRACTOR = Pattern.compile( 55 "(\\\\{0,8}+(?:\"|\'))(.+?)(?:\\1)"); 56 57 static final Pattern STRING_URI_DETECTOR = Pattern.compile( 61 "(?:\\w|[\\.]{0,2}/)[\\S&&[^<>]]*(?:\\.|/)[\\S&&[^<>]]*(?:\\w|/)"); 62 63 Matcher strings; 64 LinkedList <Matcher > matcherStack = new LinkedList <Matcher >(); 65 66 protected boolean findNextLink() { 67 if(strings==null) { 68 strings = JAVASCRIPT_STRING_EXTRACTOR.matcher(sourceContent); 69 } 70 while(strings!=null) { 71 while(strings.find()) { 72 CharSequence subsequence = 73 sourceContent.subSequence(strings.start(2), strings.end(2)); 74 Matcher uri = STRING_URI_DETECTOR.matcher(subsequence); 75 if ((subsequence.length() <= UURI.MAX_URL_LENGTH) && uri.matches()) { 76 String string = uri.group(); 77 string = TextUtils.replaceAll(ESCAPED_AMP, string, AMP); 78 try { 79 Link link = new Link(source, UURIFactory.getInstance( 80 source, string), Link.JS_MISC, Link.SPECULATIVE_HOP); 81 next.add(link); 82 return true; 83 } catch (URIException e) { 84 extractErrorListener.noteExtractError(e,source,string); 85 } 86 } else { 87 matcherStack.addFirst(strings); 89 strings = JAVASCRIPT_STRING_EXTRACTOR.matcher(subsequence); 91 } 92 } 93 strings = (Matcher ) (matcherStack.isEmpty() ? null : matcherStack.removeFirst()); 95 } 96 return false; 97 } 98 99 100 103 public void reset() { 104 super.reset(); 105 matcherStack.clear(); 106 strings = null; 107 } 108 109 protected static CharSequenceLinkExtractor newDefaultInstance() { 110 return new RegexpJSLinkExtractor(); 111 } 112 } 113 | Popular Tags |