1 18 19 package org.apache.jmeter.protocol.http.parser; 20 21 import java.net.MalformedURLException ; 22 import java.net.URL ; 23 import java.util.Iterator ; 24 25 import org.apache.jorphan.logging.LoggingManager; 26 import org.apache.log.Logger; 27 28 import org.apache.oro.text.regex.MatchResult; 33 import org.apache.oro.text.regex.Pattern; 34 import org.apache.oro.text.regex.PatternMatcherInput; 35 import org.apache.oro.text.regex.Perl5Compiler; 36 import org.apache.oro.text.regex.Perl5Matcher; 37 import org.apache.oro.text.regex.MalformedPatternException; 38 39 75 class RegexpHTMLParser extends HTMLParser 76 { 77 78 85 private static final String VALUE= 86 "\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\"'\\s>\\\\][^\\s>]*)(?=[\\s>]))"; 87 89 92 private static final String SEP= 93 "\\s(?:[^>]*\\s)?"; 94 95 99 private static final String REGEXP= 100 "<(?:" 101 + "!--.*?-->" 102 + "|BASE"+SEP+"HREF"+VALUE 103 + "|(?:IMG|SCRIPT|FRAME|IFRAME)"+SEP+"SRC"+VALUE 104 + "|APPLET"+SEP+"CODE(?:BASE)?"+VALUE 105 + "|(?:EMBED|OBJECT)"+SEP+"(?:SRC|CODEBASE)"+VALUE 106 + "|(?:BODY|TABLE|TR|TD)"+SEP+"BACKGROUND"+VALUE 107 + "|INPUT(?:"+SEP+"(?:SRC"+VALUE+"|TYPE\\s*=\\s*(?:\"image\"|'image'|image(?=[\\s>])))){2,}" 108 + "|LINK(?:"+SEP+"(?:HREF"+VALUE+"|REL\\s*=\\s*(?:\"stylesheet\"|'stylesheet'|stylesheet(?=[\\s>])))){2,}" 109 + ")"; 110 111 private static final int NUM_BASE_GROUPS= 3; 113 114 117 static Pattern pattern; 118 119 122 private static ThreadLocal localMatcher= new ThreadLocal () 123 { 124 protected Object initialValue() 125 { 126 return new Perl5Matcher(); 127 } 128 }; 129 130 133 private static ThreadLocal localInput= new ThreadLocal () 134 { 135 protected Object initialValue() 136 { 137 return new PatternMatcherInput(new char[0]); 138 } 139 }; 140 141 142 transient private static Logger log; 143 144 protected boolean isReusable() 145 { 146 return true; 147 } 148 149 152 protected RegexpHTMLParser() { 153 super(); 154 155 log= LoggingManager.getLoggerForClass(); 158 159 try 161 { 162 Perl5Compiler c= new Perl5Compiler(); 163 pattern= 164 c.compile( 165 REGEXP, 166 Perl5Compiler.CASE_INSENSITIVE_MASK 167 | Perl5Compiler.SINGLELINE_MASK 168 | Perl5Compiler.READ_ONLY_MASK); 169 } 170 catch (MalformedPatternException mpe) 171 { 172 log.error( 173 "Internal error compiling regular expression in ParseRegexp."); 174 log.error("MalformedPatternException - " + mpe); 175 throw new Error (mpe.toString()); } 177 } 178 179 182 public Iterator getEmbeddedResourceURLs(byte[] html, URL baseUrl, URLCollection urls) 183 { 184 185 Perl5Matcher matcher= (Perl5Matcher)localMatcher.get(); 186 PatternMatcherInput input= (PatternMatcherInput)localInput.get(); 187 input.setInput(new String (html)); 191 while (matcher.contains(input, pattern)) 192 { 193 MatchResult match= matcher.getMatch(); 194 String s; 195 if (log.isDebugEnabled()) 196 log.debug("match groups " + match.groups()); 197 for (int g=1; g <= NUM_BASE_GROUPS && g <= match.groups(); g++) 199 { 200 s= match.group(g); 201 if (s != null) 202 { 203 if (log.isDebugEnabled()) 204 { 205 log.debug("new baseUrl: " + s + " - " + baseUrl.toString()); 206 } 207 try 208 { 209 baseUrl= new URL (baseUrl, s); 210 } 211 catch (MalformedURLException e) 212 { 213 if (log.isDebugEnabled()) 216 { 217 log.debug( 218 "Can't build base URL from RL " 219 + s 220 + " in page " 221 + baseUrl, 222 e); 223 } 224 } 225 } 226 } 227 for (int g= NUM_BASE_GROUPS+1; g <= match.groups(); g++) 228 { 229 s= match.group(g); 230 if (log.isDebugEnabled()) 231 { 232 log.debug("group " + g + " - " + match.group(g)); 233 } 234 if (s != null) 235 { 236 urls.addURL(s,baseUrl); 237 } 238 } 239 } 240 return urls.iterator(); 241 } 242 } 243 | Popular Tags |