1 23 package org.archive.crawler.extractor; 24 25 import java.util.ArrayList ; 26 import java.util.Collection ; 27 import java.util.Iterator ; 28 import java.util.LinkedList ; 29 import java.util.List ; 30 import java.util.logging.Level ; 31 import java.util.logging.Logger ; 32 33 import org.apache.commons.httpclient.URIException; 34 import org.apache.commons.lang.StringEscapeUtils; 35 import org.archive.crawler.datamodel.CoreAttributeConstants; 36 import org.archive.crawler.datamodel.CrawlURI; 37 import org.archive.crawler.datamodel.RobotsHonoringPolicy; 38 import org.archive.net.UURI; 39 import org.archive.net.UURIFactory; 40 import org.archive.util.DevUtils; 41 import org.archive.util.TextUtils; 42 43 import au.id.jericho.lib.html.Attribute; 44 import au.id.jericho.lib.html.Attributes; 45 import au.id.jericho.lib.html.Element; 46 import au.id.jericho.lib.html.FormControl; 47 import au.id.jericho.lib.html.FormControlType; 48 import au.id.jericho.lib.html.FormField; 49 import au.id.jericho.lib.html.FormFields; 50 import au.id.jericho.lib.html.HTMLElementName; 51 import au.id.jericho.lib.html.Source; 52 import au.id.jericho.lib.html.StartTagType; 53 54 74 public class JerichoExtractorHTML extends ExtractorHTML implements 75 CoreAttributeConstants { 76 77 private static final long serialVersionUID = 1684681316546343615L; 78 79 private Logger logger = Logger.getLogger(this.getClass().getName()); 80 81 protected long numberOfFormsProcessed = 0; 82 83 public JerichoExtractorHTML(String name) { 84 this(name, "Jericho-HTML extractor. Extracts links from HTML " + 85 "documents using Jericho HTML Parser. Offers same " + 86 "basic functionality as ExtractorHTML but better " + 87 "handles broken HTML and extraction of default " + 88 "values from HTML forms. A word of warning: the used " + 89 "parser, the Jericho HTML Parser, reads the whole " + 90 "document into memory for " + 91 "parsing - thus this extractor has an inherent OOME risk. " + 92 "This OOME risk can be reduced/eleminated by limiting the " + 93 "size of documents to be parsed (i.e. using " + 94 "NotExceedsDocumentLengthTresholdDecideRule). "); 95 } 96 97 public JerichoExtractorHTML(String name, String description) { 98 super(name, description); 99 } 100 101 private static List <Attribute> findOnAttributes(Attributes attributes) { 102 List <Attribute> result = new LinkedList <Attribute>(); 103 for (Iterator attrIter = attributes.iterator(); attrIter.hasNext();) { 104 Attribute attr = (Attribute) attrIter.next(); 105 if (attr.getKey().startsWith("on")) 106 result.add(attr); 107 } 108 return result; 109 } 110 111 protected void processGeneralTag(CrawlURI curi, Element element, 112 Attributes attributes) { 113 Attribute attr; 114 String attrValue; 115 List attrList; 116 String elementName = element.getName(); 117 118 String codebase = null; 120 ArrayList <String > resources = null; 121 122 final boolean framesAsEmbeds = ((Boolean ) getUncheckedAttribute(curi, 123 ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue(); 124 125 final boolean ignoreFormActions = ((Boolean ) getUncheckedAttribute( 126 curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue(); 127 128 final boolean overlyEagerLinkDetection = 129 ((Boolean )getUncheckedAttribute( 130 curi, ATTR_OVERLY_EAGER_LINK_DETECTION)).booleanValue(); 131 132 if (((attr = attributes.get("href")) != null) && 134 ((attrValue = attr.getValue()) != null)) { 135 CharSequence context = Link.elementContext(elementName, attr 136 .getKey()); 137 if ("link".equals(elementName)) { 138 processEmbed(curi, attrValue, context); 140 } else { 141 processLink(curi, attrValue, context); 143 } 144 if ("base".equals(elementName)) { 145 try { 146 curi.setBaseURI(attrValue); 147 } catch (URIException e) { 148 if (getController() != null) { 149 getController().logUriError(e, curi.getUURI(), 152 attrValue); 153 } else { 154 logger.info("Failed set base uri: " + curi + ", " 155 + attrValue + ": " + e.getMessage()); 156 } 157 } 158 } 159 } 160 else if (((attr = attributes.get("action")) != null) && 162 ((attrValue = attr.getValue()) != null)) { 163 if (!ignoreFormActions) { 164 CharSequence context = Link.elementContext(elementName, attr 165 .getKey()); 166 processLink(curi, attrValue, context); 167 } 168 } 169 else if ((attrList = findOnAttributes(attributes)).size() != 0) { 171 for (Iterator attrIter = attrList.iterator(); attrIter.hasNext();) { 172 attr = (Attribute) attrIter.next(); 173 CharSequence valueSegment = attr.getValueSegment(); 174 if (valueSegment != null) 175 processScriptCode(curi, valueSegment); 176 177 } 178 } 179 else if ((((attr = attributes.get("src")) != null) 181 || ((attr = attributes.get("lowsrc")) != null) 182 || ((attr = attributes.get("background")) != null) 183 || ((attr = attributes.get("cite")) != null) 184 || ((attr = attributes.get("longdesc")) != null) 185 || ((attr = attributes.get("usemap")) != null) 186 || ((attr = attributes.get("profile")) != null) 187 || ((attr = attributes.get("datasrc")) != null)) && 188 ((attrValue = attr.getValue()) != null)) { 189 190 final char hopType; 191 CharSequence context = Link.elementContext(elementName, attr 192 .getKey()); 193 194 if (!framesAsEmbeds 195 && ("frame".equals(elementName) || "iframe" 196 .equals(elementName))) 197 hopType = Link.NAVLINK_HOP; 198 else 199 hopType = Link.EMBED_HOP; 200 201 processEmbed(curi, attrValue, context, hopType); 202 } 203 else if (((attr = attributes.get("codebase")) != null) && 205 ((attrValue = attr.getValue()) != null)) { 206 codebase = StringEscapeUtils.unescapeHtml(attrValue); 207 CharSequence context = Link.elementContext(elementName, attr 208 .getKey()); 209 processEmbed(curi, codebase, context); 210 } 211 else if ((((attr = attributes.get("classid")) != null) 213 || ((attr = attributes.get("data")) != null)) && 214 ((attrValue = attr.getValue()) != null)) { 215 if (resources == null) 216 resources = new ArrayList <String >(); 217 resources.add(attrValue); 218 } 219 else if (((attr = attributes.get("archive")) != null) && 221 ((attrValue = attr.getValue()) != null)) { 222 if (resources == null) 223 resources = new ArrayList <String >(); 224 String [] multi = TextUtils.split(WHITESPACE, attrValue); 225 for (int i = 0; i < multi.length; i++) { 226 resources.add(multi[i]); 227 } 228 } 229 else if (((attr = attributes.get("code")) != null) && 231 ((attrValue = attr.getValue()) != null)) { 232 if (resources == null) 233 resources = new ArrayList <String >(); 234 if (APPLET.equals(elementName) && !attrValue.endsWith(CLASSEXT)) { 237 resources.add(attrValue + CLASSEXT); 238 } else { 239 resources.add(attrValue); 240 } 241 } 242 else if (((attr = attributes.get("value")) != null) && 244 ((attrValue = attr.getValue()) != null)) { 245 if (TextUtils.matches(LIKELY_URI_PATH, attrValue) 246 && overlyEagerLinkDetection) { 247 CharSequence context = Link.elementContext(elementName, attr 248 .getKey()); 249 processLink(curi, attrValue, context); 250 } 251 252 } 253 else if (((attr = attributes.get("style")) != null) && 255 ((attrValue = attr.getValue()) != null)) { 256 this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi, 259 attrValue, getController()); 260 } 261 262 if (resources == null) 264 return; 265 266 Iterator <String > iter = resources.iterator(); 267 UURI codebaseURI = null; 268 String res = null; 269 try { 270 if (codebase != null) { 271 codebaseURI = UURIFactory.getInstance(curi.getUURI(), codebase); 273 } 274 while (iter.hasNext()) { 275 res = iter.next(); 276 res = StringEscapeUtils.unescapeHtml(res); 277 if (codebaseURI != null) { 278 res = codebaseURI.resolve(res).toString(); 279 } 280 processEmbed(curi, res, element); } 283 } catch (URIException e) { 284 curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase); 285 } catch (IllegalArgumentException e) { 286 DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" 287 + "codebase=" + codebase + " res=" + res + "\n" 288 + DevUtils.extraInfo(), e); 289 } 290 } 291 292 protected boolean processMeta(CrawlURI curi, Element element) { 293 String name = element.getAttributeValue("name"); 294 String httpEquiv = element.getAttributeValue("http-equiv"); 295 String content = element.getAttributeValue("content"); 296 297 if ("robots".equals(name) && content != null) { 298 curi.putString(A_META_ROBOTS, content); 299 RobotsHonoringPolicy policy = getSettingsHandler().getOrder() 300 .getRobotsHonoringPolicy(); 301 String contentLower = content.toLowerCase(); 302 if ((policy == null || (!policy.isType(curi, 303 RobotsHonoringPolicy.IGNORE) && !policy.isType(curi, 304 RobotsHonoringPolicy.CUSTOM))) 305 && (contentLower.indexOf("nofollow") >= 0 || contentLower 306 .indexOf("none") >= 0)) { 307 logger.fine("HTML extraction skipped due to robots meta-tag " + 310 "for: " + curi.toString()); 311 return true; 312 } 313 } 314 if ("refresh".equals(httpEquiv) && content != null) { 315 String refreshUri = content.substring(content.indexOf("=") + 1); 316 try { 317 curi.createAndAddLinkRelativeToBase(refreshUri, "meta", 318 Link.REFER_HOP); 319 } catch (URIException e) { 320 if (getController() != null) { 321 getController().logUriError(e, curi.getUURI(), refreshUri); 322 } else { 323 logger.info("Failed createAndAddLinkRelativeToBase " + curi 324 + ", " + element.toString() + ", " + refreshUri 325 + ": " + e); 326 } 327 } 328 } 329 return false; 330 } 331 332 protected void processScript(CrawlURI curi, Element element) { 333 processGeneralTag(curi, element, element.getAttributes()); 336 337 processScriptCode(curi, element.getContent()); 340 341 } 342 343 protected void processStyle(CrawlURI curi, Element element) { 344 processGeneralTag(curi, element, element.getAttributes()); 346 347 this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi, 349 element.getContent(), getController()); 350 } 351 352 protected void processForm(CrawlURI curi, Element element) { 353 String action = element.getAttributeValue("action"); 354 String name = element.getAttributeValue("name"); 355 String queryURL = ""; 356 357 final boolean ignoreFormActions = ((Boolean ) getUncheckedAttribute( 358 curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue(); 359 360 if (ignoreFormActions) 361 return; 362 363 numberOfFormsProcessed++; 364 365 FormFields formFields = element.findFormFields(); 367 for (Iterator fieldsIter = formFields.iterator(); fieldsIter.hasNext();) { 368 FormField formField = (FormField) fieldsIter.next(); 370 371 for (Iterator controlIter = formField.getFormControls().iterator(); 373 controlIter.hasNext();) { 374 FormControl formControl = (FormControl) controlIter.next(); 375 376 String controlName = formControl.getName(); 378 379 Collection controlValues; 381 if (!(formControl.getFormControlType() == 382 FormControlType.SUBMIT)) { 383 controlValues = formControl.getValues(); 384 } else { 385 controlValues = formControl.getPredefinedValues(); 386 } 387 388 if (controlValues.size() > 0) { 389 for (Iterator valueIter = controlValues.iterator(); 391 valueIter.hasNext();) { 392 String value = (String ) valueIter.next(); 393 queryURL += "&" + controlName + "=" + value; 394 } 395 } else { 396 queryURL += "&" + controlName + "="; 397 } 398 } 399 } 400 401 if (action == null) { 403 queryURL = queryURL.replaceFirst("&", "?"); 404 } else { 405 if (!action.contains("?")) 406 queryURL = queryURL.replaceFirst("&", "?"); 407 queryURL = action + queryURL; 408 } 409 410 CharSequence context = Link.elementContext(element.getName(), 411 "name=" + name); 412 processLink(curi, queryURL, context); 413 414 } 415 416 424 void extract(CrawlURI curi, CharSequence cs) { 425 Source source = new Source(cs); 426 List elements = source.findAllElements(StartTagType.NORMAL); 427 for (Iterator elementIter = elements.iterator(); 428 elementIter.hasNext();) { 429 Element element = (Element) elementIter.next(); 430 String elementName = element.getName(); 431 Attributes attributes; 432 if (elementName.equals(HTMLElementName.META)) { 433 if (processMeta(curi, element)) { 434 break; 436 } 437 } else if (elementName.equals(HTMLElementName.SCRIPT)) { 438 processScript(curi, element); 439 } else if (elementName.equals(HTMLElementName.STYLE)) { 440 processStyle(curi, element); 441 } else if (elementName.equals(HTMLElementName.FORM)) { 442 processForm(curi, element); 443 } else if (!(attributes = element.getAttributes()).isEmpty()) { 444 processGeneralTag(curi, element, attributes); 445 } 446 } 447 } 448 449 454 public String report() { 455 StringBuffer ret = new StringBuffer (); 456 ret.append("Processor: org.archive.crawler.extractor.JerichoExtractorHTML\n"); 457 ret.append(" Function: Link extraction on HTML documents\n"); 458 ret.append(" CrawlURIs handled: " + this.numberOfCURIsHandled + "\n"); 459 ret.append(" Forms processed: " + this.numberOfFormsProcessed + "\n"); 460 ret.append(" Links extracted: " + this.numberOfLinksExtracted + "\n\n"); 461 return ret.toString(); 462 } 463 } 464 | Popular Tags |