JerichoExtractorHTML


1   /* JerichoExtractorHTML
2    * 
3    * Copyright (C) 2006 Olaf Freyer
4    *
5    * This file is part of the Heritrix web crawler (crawler.archive.org).
6    *
7    * Heritrix is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU Lesser Public License as published by
9    * the Free Software Foundation; either version 2.1 of the License, or
10   * any later version.
11   *
12   * Heritrix is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU Lesser Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser Public License
18   * along with Heritrix; if not, write to the Free Software
19   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20   *
21   * $Id: JerichoExtractorHTML.java,v 1.3.2.1 2007/01/13 01:31:16 stack-sf Exp $
22   */
23  package org.archive.crawler.extractor;
24  
25  import java.util.ArrayList  ;
26  import java.util.Collection  ;
27  import java.util.Iterator  ;
28  import java.util.LinkedList  ;
29  import java.util.List  ;
30  import java.util.logging.Level  ;
31  import java.util.logging.Logger  ;
32  
33  import org.apache.commons.httpclient.URIException;
34  import org.apache.commons.lang.StringEscapeUtils;
35  import org.archive.crawler.datamodel.CoreAttributeConstants;
36  import org.archive.crawler.datamodel.CrawlURI;
37  import org.archive.crawler.datamodel.RobotsHonoringPolicy;
38  import org.archive.net.UURI;
39  import org.archive.net.UURIFactory;
40  import org.archive.util.DevUtils;
41  import org.archive.util.TextUtils;
42  
43  import au.id.jericho.lib.html.Attribute;
44  import au.id.jericho.lib.html.Attributes;
45  import au.id.jericho.lib.html.Element;
46  import au.id.jericho.lib.html.FormControl;
47  import au.id.jericho.lib.html.FormControlType;
48  import au.id.jericho.lib.html.FormField;
49  import au.id.jericho.lib.html.FormFields;
50  import au.id.jericho.lib.html.HTMLElementName;
51  import au.id.jericho.lib.html.Source;
52  import au.id.jericho.lib.html.StartTagType;
53  
54  /**
55   * Improved link-extraction from an HTML content-body using jericho-html parser.
56   * This extractor extends ExtractorHTML and mimics its workflow - but has some
57   * substantial differences when it comes to internal implementation. Instead
58   * of heavily relying upon java regular expressions it uses a real html parser
59   * library - namely Jericho HTML Parser (http://jerichohtml.sourceforge.net).
60   * Using this parser it can better handle broken html (i.e. missing quotes)
61   * and also offer improved extraction of HTML form URLs (not only extract
62   * the action of a form, but also its default values).
63   * Unfortunately this parser also has one major drawback - it has to read the
64   * whole document into memory for parsing, thus has an inherent OOME risk.
65   * This OOME risk can be reduced/eliminated by limiting the size of documents
66   * to be parsed (i.e. using NotExceedsDocumentLengthTresholdDecideRule).
67   * Also note that this extractor seems to have a lower overall memory 
68   * consumption compared to ExtractorHTML. (still to be confirmed on a larger 
69   * scale crawl) 
70   * 
71   * @author Olaf Freyer
72   * @version $Date: 2007/01/13 01:31:16 $ $Revision: 1.3.2.1 $
73   */
74  public class JerichoExtractorHTML extends ExtractorHTML implements
75          CoreAttributeConstants {
76  
77      private static final long serialVersionUID = 1684681316546343615L;
78  
79      private Logger   logger = Logger.getLogger(this.getClass().getName());
80  
81      protected long numberOfFormsProcessed = 0;
82  
83      public JerichoExtractorHTML(String   name) {
84          this(name, "Jericho-HTML extractor. Extracts links from HTML " +
85                  "documents using Jericho HTML Parser. Offers same " + 
86                  "basic functionality as ExtractorHTML but better " +
87                  "handles broken HTML and extraction of default " +
88                  "values from HTML forms. A word of warning: the used " +
89                  "parser, the Jericho HTML Parser, reads the whole " +
90                  "document into memory for " +
91                  "parsing - thus this extractor has an inherent OOME risk. " +
92                  "This OOME risk can be reduced/eleminated by limiting the " +
93                  "size of documents to be parsed (i.e. using " +
94                  "NotExceedsDocumentLengthTresholdDecideRule). ");
95      }
96  
97      public JerichoExtractorHTML(String   name, String   description) {
98          super(name, description);
99      }
100 
101     private static List  <Attribute> findOnAttributes(Attributes attributes) {
102         List  <Attribute> result = new LinkedList  <Attribute>();
103         for (Iterator   attrIter = attributes.iterator(); attrIter.hasNext();) {
104             Attribute attr = (Attribute) attrIter.next();
105             if (attr.getKey().startsWith("on"))
106                 result.add(attr);
107         }
108         return result;
109     }
110 
111     protected void processGeneralTag(CrawlURI curi, Element element,
112             Attributes attributes) {
113         Attribute attr;
114         String   attrValue;
115         List   attrList;
116         String   elementName = element.getName();
117 
118         // Just in case it's an OBJECT or APPLET tag
119         String   codebase = null;
120         ArrayList  <String  > resources = null;
121 
122         final boolean framesAsEmbeds = ((Boolean  ) getUncheckedAttribute(curi,
123                 ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();
124 
125         final boolean ignoreFormActions = ((Boolean  ) getUncheckedAttribute(
126                 curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
127 
128         final boolean overlyEagerLinkDetection =
129             ((Boolean  )getUncheckedAttribute(
130                 curi, ATTR_OVERLY_EAGER_LINK_DETECTION)).booleanValue();
131 
132         // HREF
133         if (((attr = attributes.get("href")) != null) &&
134             ((attrValue = attr.getValue()) != null)) {
135             CharSequence   context = Link.elementContext(elementName, attr
136                     .getKey());
137             if ("link".equals(elementName)) {
138                 // <LINK> elements treated as embeds (css, ico, etc)
139                 processEmbed(curi, attrValue, context);
140             } else {
141                 // other HREFs treated as links
142                 processLink(curi, attrValue, context);
143             }
144             if ("base".equals(elementName)) {
145                 try {
146                     curi.setBaseURI(attrValue);
147                 } catch (URIException e) {
148                     if (getController() != null) {
149                         // Controller can be null: e.g. when running
150                         // ExtractorTool.
151                         getController().logUriError(e, curi.getUURI(),
152                                 attrValue);
153                     } else {
154                         logger.info("Failed set base uri: " + curi + ", "
155                                 + attrValue + ": " + e.getMessage());
156                     }
157                 }
158             }
159         }
160         // ACTION
161         else if (((attr = attributes.get("action")) != null) &&
162                  ((attrValue = attr.getValue()) != null)) {
163             if (!ignoreFormActions) {
164                 CharSequence   context = Link.elementContext(elementName, attr
165                         .getKey());
166                 processLink(curi, attrValue, context);
167             }
168         }
169         // ON_
170         else if ((attrList = findOnAttributes(attributes)).size() != 0) {
171             for (Iterator   attrIter = attrList.iterator(); attrIter.hasNext();) {
172                 attr = (Attribute) attrIter.next();
173                 CharSequence   valueSegment = attr.getValueSegment();
174                 if (valueSegment != null)
175                     processScriptCode(curi, valueSegment);
176 
177             }
178         }
179         // SRC atc.
180         else if ((((attr = attributes.get("src")) != null)
181                 || ((attr = attributes.get("lowsrc")) != null)
182                 || ((attr = attributes.get("background")) != null)
183                 || ((attr = attributes.get("cite")) != null)
184                 || ((attr = attributes.get("longdesc")) != null)
185                 || ((attr = attributes.get("usemap")) != null)
186                 || ((attr = attributes.get("profile")) != null)
187                 || ((attr = attributes.get("datasrc")) != null)) &&
188                    ((attrValue = attr.getValue()) != null)) {
189 
190             final char hopType;
191             CharSequence   context = Link.elementContext(elementName, attr
192                     .getKey());
193 
194             if (!framesAsEmbeds
195                     && ("frame".equals(elementName) || "iframe"
196                             .equals(elementName)))
197                 hopType = Link.NAVLINK_HOP;
198             else
199                 hopType = Link.EMBED_HOP;
200 
201             processEmbed(curi, attrValue, context, hopType);
202         }
203         // CODEBASE
204         else if (((attr = attributes.get("codebase")) != null) &&
205                  ((attrValue = attr.getValue()) != null)) {
206             codebase = StringEscapeUtils.unescapeHtml(attrValue);
207             CharSequence   context = Link.elementContext(elementName, attr
208                     .getKey());
209             processEmbed(curi, codebase, context);
210         }
211         // CLASSID DATA
212         else if ((((attr = attributes.get("classid")) != null)
213                 || ((attr = attributes.get("data")) != null)) &&
214                    ((attrValue = attr.getValue()) != null)) {
215             if (resources == null)
216                 resources = new ArrayList  <String  >();
217             resources.add(attrValue);
218         }
219         // ARCHIVE
220         else if (((attr = attributes.get("archive")) != null) &&
221                  ((attrValue = attr.getValue()) != null)) {
222             if (resources == null)
223                 resources = new ArrayList  <String  >();
224             String  [] multi = TextUtils.split(WHITESPACE, attrValue);
225             for (int i = 0; i < multi.length; i++) {
226                 resources.add(multi[i]);
227             }
228         }
229         // CODE
230         else if (((attr = attributes.get("code")) != null) &&
231                  ((attrValue = attr.getValue()) != null)) {
232             if (resources == null)
233                 resources = new ArrayList  <String  >();
234             // If element is applet and code value does not end with
235             // '.class' then append '.class' to the code value.
236             if (APPLET.equals(elementName) && !attrValue.endsWith(CLASSEXT)) {
237                 resources.add(attrValue + CLASSEXT);
238             } else {
239                 resources.add(attrValue);
240             }
241         }
242         // VALUE
243         else if (((attr = attributes.get("value")) != null) &&
244                  ((attrValue = attr.getValue()) != null)) {
245             if (TextUtils.matches(LIKELY_URI_PATH, attrValue)
246                     && overlyEagerLinkDetection) {
247                 CharSequence   context = Link.elementContext(elementName, attr
248                         .getKey());
249                 processLink(curi, attrValue, context);
250             }
251 
252         }
253         // STYLE
254         else if (((attr = attributes.get("style")) != null) &&
255                  ((attrValue = attr.getValue()) != null)) {
256             // STYLE inline attribute
257             // then, parse for URIs
258             this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi,
259                     attrValue, getController());
260         }
261 
262         // handle codebase/resources
263         if (resources == null)
264             return;
265 
266         Iterator  <String  > iter = resources.iterator();
267         UURI codebaseURI = null;
268         String   res = null;
269         try {
270             if (codebase != null) {
271                 // TODO: Pass in the charset.
272                 codebaseURI = UURIFactory.getInstance(curi.getUURI(), codebase);
273             }
274             while (iter.hasNext()) {
275                 res = iter.next();
276                 res = StringEscapeUtils.unescapeHtml(res);
277                 if (codebaseURI != null) {
278                     res = codebaseURI.resolve(res).toString();
279                 }
280                 processEmbed(curi, res, element); // TODO: include attribute
281                                                     // too
282             }
283         } catch (URIException e) {
284             curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
285         } catch (IllegalArgumentException   e) {
286             DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n"
287                     + "codebase=" + codebase + " res=" + res + "\n"
288                     + DevUtils.extraInfo(), e);
289         }
290     }
291 
292     protected boolean processMeta(CrawlURI curi, Element element) {
293         String   name = element.getAttributeValue("name");
294         String   httpEquiv = element.getAttributeValue("http-equiv");
295         String   content = element.getAttributeValue("content");
296 
297         if ("robots".equals(name) && content != null) {
298             curi.putString(A_META_ROBOTS, content);
299             RobotsHonoringPolicy policy = getSettingsHandler().getOrder()
300                     .getRobotsHonoringPolicy();
301             String   contentLower = content.toLowerCase();
302             if ((policy == null || (!policy.isType(curi,
303                     RobotsHonoringPolicy.IGNORE) && !policy.isType(curi,
304                     RobotsHonoringPolicy.CUSTOM)))
305                     && (contentLower.indexOf("nofollow") >= 0 || contentLower
306                             .indexOf("none") >= 0)) {
307                 // if 'nofollow' or 'none' is specified and the
308                 // honoring policy is not IGNORE or CUSTOM, end html extraction
309                 logger.fine("HTML extraction skipped due to robots meta-tag " +
310                     "for: " + curi.toString());
311                 return true;
312             }
313         }
314         if ("refresh".equals(httpEquiv) && content != null) {
315             String   refreshUri = content.substring(content.indexOf("=") + 1);
316             try {
317                 curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
318                         Link.REFER_HOP);
319             } catch (URIException e) {
320                 if (getController() != null) {
321                     getController().logUriError(e, curi.getUURI(), refreshUri);
322                 } else {
323                     logger.info("Failed createAndAddLinkRelativeToBase " + curi
324                             + ", " + element.toString() + ", " + refreshUri
325                             + ": " + e);
326                 }
327             }
328         }
329         return false;
330     }
331 
332     protected void processScript(CrawlURI curi, Element element) {
333         // first, get attributes of script-open tag
334         // as per any other tag
335         processGeneralTag(curi, element, element.getAttributes());
336 
337         // then, apply best-effort string-analysis heuristics
338         // against any code present (false positives are OK)
339         processScriptCode(curi, element.getContent());
340 
341     }
342 
343     protected void processStyle(CrawlURI curi, Element element) {
344         // First, get attributes of script-open tag as per any other tag.
345         processGeneralTag(curi, element, element.getAttributes());
346 
347         // then, parse for URIs
348         this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi,
349                 element.getContent(), getController());
350     }
351 
352     protected void processForm(CrawlURI curi, Element element) {
353         String   action = element.getAttributeValue("action");
354         String   name = element.getAttributeValue("name");
355         String   queryURL = "";
356 
357         final boolean ignoreFormActions = ((Boolean  ) getUncheckedAttribute(
358                 curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
359 
360         if (ignoreFormActions)
361             return;
362 
363         numberOfFormsProcessed++;
364 
365         // get all form fields
366         FormFields formFields = element.findFormFields();
367         for (Iterator   fieldsIter = formFields.iterator(); fieldsIter.hasNext();) {
368             // for each form field
369             FormField formField = (FormField) fieldsIter.next();
370 
371             // for each form control
372             for (Iterator   controlIter = formField.getFormControls().iterator();
373                 controlIter.hasNext();) {
374                 FormControl formControl = (FormControl) controlIter.next();
375 
376                 // get name of control element (and URLEncode it)
377                 String   controlName = formControl.getName();
378 
379                 // retrieve list of values - submit needs special handling
380                 Collection   controlValues;
381                 if (!(formControl.getFormControlType() ==
382                         FormControlType.SUBMIT)) {
383                     controlValues = formControl.getValues();
384                 } else {
385                     controlValues = formControl.getPredefinedValues();
386                 }
387 
388                 if (controlValues.size() > 0) {
389                     // for each value set
390                     for (Iterator   valueIter = controlValues.iterator();
391                             valueIter.hasNext();) {
392                         String   value = (String  ) valueIter.next();
393                         queryURL += "&" + controlName + "=" + value;
394                     }
395                 } else {
396                     queryURL += "&" + controlName + "=";
397                 }
398             }
399         }
400 
401         // clean up url
402         if (action == null) {
403             queryURL = queryURL.replaceFirst("&", "?");
404         } else {
405             if (!action.contains("?"))
406                 queryURL = queryURL.replaceFirst("&", "?");
407             queryURL = action + queryURL;
408         }
409 
410         CharSequence   context = Link.elementContext(element.getName(),
411             "name=" + name);
412         processLink(curi, queryURL, context);
413 
414     }
415 
416     /**
417      * Run extractor. This method is package visible to ease testing.
418      * 
419      * @param curi
420      *            CrawlURI we're processing.
421      * @param cs
422      *            Sequence from underlying ReplayCharSequence.
423      */
424     void extract(CrawlURI curi, CharSequence   cs) {
425         Source source = new Source(cs);
426         List   elements = source.findAllElements(StartTagType.NORMAL);
427         for (Iterator   elementIter = elements.iterator();
428                 elementIter.hasNext();) {
429             Element element = (Element) elementIter.next();
430             String   elementName = element.getName();
431             Attributes attributes;
432             if (elementName.equals(HTMLElementName.META)) {
433                 if (processMeta(curi, element)) {
434                     // meta tag included NOFOLLOW; abort processing
435                     break;
436                 }
437             } else if (elementName.equals(HTMLElementName.SCRIPT)) {
438                 processScript(curi, element);
439             } else if (elementName.equals(HTMLElementName.STYLE)) {
440                 processStyle(curi, element);
441             } else if (elementName.equals(HTMLElementName.FORM)) {
442                 processForm(curi, element);
443             } else if (!(attributes = element.getAttributes()).isEmpty()) {
444                 processGeneralTag(curi, element, attributes);
445             }
446         }
447     }
448 
449     /*
450      * (non-Javadoc)
451      * 
452      * @see org.archive.crawler.framework.Processor#report()
453      */
454     public String   report() {
455         StringBuffer   ret = new StringBuffer  ();
456         ret.append("Processor: org.archive.crawler.extractor.JerichoExtractorHTML\n");
457         ret.append("  Function:          Link extraction on HTML documents\n");
458         ret.append("  CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
459         ret.append("  Forms processed:   " + this.numberOfFormsProcessed + "\n");
460         ret.append("  Links extracted:   " + this.numberOfLinksExtracted + "\n\n");
461         return ret.toString();
462     }
463 }
464
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags