1 24 package org.archive.extractor; 25 26 import java.util.ArrayList ; 27 import java.util.Iterator ; 28 import java.util.LinkedList ; 29 import java.util.logging.Level ; 30 import java.util.logging.Logger ; 31 import java.util.regex.Matcher ; 32 33 import org.apache.commons.httpclient.URIException; 34 import org.archive.crawler.extractor.Link; 35 import org.archive.net.UURI; 36 import org.archive.net.UURIFactory; 37 import org.archive.util.DevUtils; 38 import org.archive.util.TextUtils; 39 40 41 49 public class RegexpHTMLLinkExtractor extends CharSequenceLinkExtractor { 50 private static Logger logger = 51 Logger.getLogger(RegexpHTMLLinkExtractor.class.getName()); 52 53 boolean honorRobots = true; 54 boolean extractInlineCss = true; 55 boolean extractInlineJs = true; 56 57 protected LinkedList <Link> next = new LinkedList <Link>(); 58 protected Matcher tags; 59 60 63 protected boolean findNextLink() { 64 if (tags == null) { 65 tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, sourceContent); 66 } 67 while(tags.find()) { 68 if(Thread.interrupted()){ 69 break; 71 } 72 if (tags.start(8) > 0) { 73 } else if (tags.start(7) > 0) { 76 int start = tags.start(5); 78 int end = tags.end(5); 79 processMeta(sourceContent.subSequence(start, end)); 80 } else if (tags.start(5) > 0) { 81 int start5 = tags.start(5); 83 int end5 = tags.end(5); 84 int start6 = tags.start(6); 85 int end6 = tags.end(6); 86 processGeneralTag(sourceContent.subSequence(start6, end6), 87 sourceContent.subSequence(start5, end5)); 88 } else if (tags.start(1) > 0) { 89 int start = tags.start(1); 91 int end = tags.end(1); 92 processScript(sourceContent.subSequence(start, end), 93 tags.end(2) - start); 94 } else if (tags.start(3) > 0){ 95 int start = tags.start(3); 97 int end = tags.end(3); 98 processStyle(sourceContent.subSequence(start, end), 99 tags.end(4) - start); 100 } 101 if(!next.isEmpty()) { 102 return true; 104 } 105 } 106 return false; 108 } 109 110 131 static final String RELEVANT_TAG_EXTRACTOR = 132 "(?is)<(?:((script[^>]*+)>.*?</script)|((style[^>]*+)>[^<]*+</style)|(((meta)|(?:\\w+))\\s+[^>]*+)|(!--.*?--))>"; 133 134 static final String EACH_ATTRIBUTE_EXTRACTOR = 138 "(?is)\\s((href)|(action)|(on\\w*)" 139 +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)" 140 +"|(?:usemap)|(?:profile)|(?:datasrc)|(?:for))" 141 +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)" 142 +"|(value)|([-\\w]+))" 143 +"\\s*=\\s*" 144 +"(?:(?:\"(.*?)(?:\"|$))" 145 +"|(?:'(.*?)(?:'|$))" 146 +"|(\\S+))"; 147 166 167 static final String LIKELY_URI_PATH = 172 "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)"; 173 static final String ESCAPED_AMP = "&"; 174 static final String AMP ="&"; 175 static final String WHITESPACE = "\\s"; 176 static final String CLASSEXT =".class"; 177 static final String APPLET = "applet"; 178 static final String BASE = "base"; 179 static final String LINK = "link"; 180 181 protected boolean processGeneralTag(CharSequence element, CharSequence cs) { 182 183 Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs); 184 185 String codebase = null; 187 ArrayList <String > resources = null; 188 long tally = next.size(); 189 190 while (attr.find()) { 191 int valueGroup = 192 (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 13 : 14; 193 int start = attr.start(valueGroup); 194 int end = attr.end(valueGroup); 195 CharSequence value = cs.subSequence(start, end); 196 if (attr.start(2) > -1) { 197 CharSequence context = Link.elementContext(element, attr.group(2)); 199 if(element.toString().equalsIgnoreCase(LINK)) { 200 processEmbed(value, context); 202 } else { 203 if (element.toString().equalsIgnoreCase(BASE)) { 204 try { 205 base = UURIFactory.getInstance(value.toString()); 206 } catch (URIException e) { 207 extractErrorListener.noteExtractError(e,source,value); 208 } 209 } 210 processLink(value, context); 212 } 213 } else if (attr.start(3) > -1) { 214 CharSequence context = Link.elementContext(element, attr.group(3)); 216 processLink(value, context); 217 } else if (attr.start(4) > -1) { 218 processScriptCode(value); } else if (attr.start(5) > -1) { 221 CharSequence context = Link.elementContext(element, attr.group(5)); 223 processEmbed(value, context); 224 } else if (attr.start(6) > -1) { 225 codebase = TextUtils.replaceAll(ESCAPED_AMP, value, AMP); 228 CharSequence context = Link.elementContext(element,attr.group(6)); 229 processEmbed(codebase, context); 230 } else if (attr.start(7) > -1) { 231 if (resources == null) { 233 resources = new ArrayList <String >(); 234 } 235 resources.add(value.toString()); 236 } else if (attr.start(8) > -1) { 237 if (resources==null) { 239 resources = new ArrayList <String >(); 240 } 241 String [] multi = TextUtils.split(WHITESPACE, value); 242 for(int i = 0; i < multi.length; i++ ) { 243 resources.add(multi[i]); 244 } 245 } else if (attr.start(9) > -1) { 246 if (resources==null) { 248 resources = new ArrayList <String >(); 249 } 250 if (element.toString().toLowerCase().equals(APPLET) && 253 !value.toString().toLowerCase().endsWith(CLASSEXT)) { 254 resources.add(value.toString() + CLASSEXT); 255 } else { 256 resources.add(value.toString()); 257 } 258 259 } else if (attr.start(10) > -1) { 260 if(TextUtils.matches(LIKELY_URI_PATH, value)) { 262 CharSequence context = Link.elementContext(element, attr.group(10)); 263 processLink(value, context); 264 } 265 266 } else if (attr.start(11) > -1) { 267 } 273 } 274 TextUtils.recycleMatcher(attr); 275 276 if (resources == null) { 278 return (tally-next.size())>0; 279 } 280 Iterator iter = resources.iterator(); 281 UURI codebaseURI = null; 282 String res = null; 283 try { 284 if (codebase != null) { 285 codebaseURI = UURIFactory.getInstance(base, codebase); 287 } 288 while(iter.hasNext()) { 289 res = iter.next().toString(); 290 res = TextUtils.replaceAll(ESCAPED_AMP, res, AMP); 292 if (codebaseURI != null) { 293 res = codebaseURI.resolve(res).toString(); 294 } 295 processEmbed(res, element); } 297 } catch (URIException e) { 298 extractErrorListener.noteExtractError(e,source,codebase); 299 } catch (IllegalArgumentException e) { 300 DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" + 301 "codebase=" + codebase + " res=" + res + "\n" + 302 DevUtils.extraInfo(), e); 303 } 304 return (tally-next.size())>0; 305 } 306 307 310 protected void processScriptCode(CharSequence cs) { 311 RegexpJSLinkExtractor.extract(cs, source, base, next, 312 extractErrorListener); 313 } 314 315 static final String JAVASCRIPT = "(?i)^javascript:.*"; 316 317 321 protected void processLink(CharSequence value, CharSequence context) { 322 String link = TextUtils.replaceAll(ESCAPED_AMP, value, "&"); 323 324 if(TextUtils.matches(JAVASCRIPT, link)) { 325 processScriptCode(value.subSequence(11, value.length())); 326 } else { 327 addLinkFromString(link, context,Link.NAVLINK_HOP); 328 } 329 } 330 331 335 private void addLinkFromString(String uri, CharSequence context, char hopType) { 336 try { 337 Link link = new Link(source, UURIFactory.getInstance( 338 base, uri), context, hopType); 339 next.addLast(link); 340 } catch (URIException e) { 341 extractErrorListener.noteExtractError(e,source,uri); 342 } 343 } 344 345 protected long processEmbed(CharSequence value, CharSequence context) { 346 String embed = TextUtils.replaceAll(ESCAPED_AMP, value, "&"); 347 addLinkFromString(embed, context,Link.EMBED_HOP); 348 return 1; 349 } 350 351 static final String NON_HTML_PATH_EXTENSION = 352 "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+ 353 "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)"; 354 355 protected void processScript(CharSequence sequence, int endOfOpenTag) { 356 processGeneralTag(sequence.subSequence(0,6), 359 sequence.subSequence(0,endOfOpenTag)); 360 361 processScriptCode(sequence.subSequence(endOfOpenTag, sequence.length())); 364 } 365 366 protected void processMeta(CharSequence cs) { 367 Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs); 368 369 String name = null; 370 String httpEquiv = null; 371 String content = null; 372 373 while (attr.find()) { 374 int valueGroup = 375 (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 13 : 14; 376 CharSequence value = 377 cs.subSequence(attr.start(valueGroup), attr.end(valueGroup)); 378 if (attr.group(1).equalsIgnoreCase("name")) { 379 name = value.toString(); 380 } else if (attr.group(1).equalsIgnoreCase("http-equiv")) { 381 httpEquiv = value.toString(); 382 } else if (attr.group(1).equalsIgnoreCase("content")) { 383 content = value.toString(); 384 } 385 } 387 TextUtils.recycleMatcher(attr); 388 389 if("robots".equalsIgnoreCase(name) && content != null ) { 391 if (getHonorRobots()) { 392 String contentLower = content.toLowerCase(); 393 if ((contentLower.indexOf("nofollow") >= 0 394 || contentLower.indexOf("none") >= 0)) { 395 logger.fine("HTML extraction skipped due to robots meta-tag for: " 398 + source); 399 cancelFurtherExtraction(); 400 return; 401 } 402 } 403 } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) { 404 String refreshUri = content.substring(content.indexOf("=") + 1); 405 try { 406 Link refreshLink = new Link(source, UURIFactory.getInstance(base,refreshUri), Link.elementContext("meta",httpEquiv),Link.REFER_HOP); 407 next.addLast(refreshLink); 408 } catch (URIException e) { 409 extractErrorListener.noteExtractError(e,source,refreshUri); 410 } 411 } 412 } 413 414 417 private boolean getHonorRobots() { 418 return honorRobots; 419 } 420 421 424 private void cancelFurtherExtraction() { 425 tags.reset(""); 428 } 429 430 434 protected void processStyle(CharSequence sequence, 435 int endOfOpenTag) 436 { 437 processGeneralTag(sequence.subSequence(0,6), 439 sequence.subSequence(0,endOfOpenTag)); 440 441 RegexpCSSLinkExtractor.extract(sequence.subSequence(endOfOpenTag, 443 sequence.length()), source, base, next, extractErrorListener); 444 } 445 446 449 public void reset() { 450 super.reset(); 451 TextUtils.recycleMatcher(tags); 452 tags = null; 453 } 454 455 protected static CharSequenceLinkExtractor newDefaultInstance() { 456 return new RegexpHTMLLinkExtractor(); 457 } 458 } 459 460 | Popular Tags |