KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > extractor > JerichoExtractorHTML


1 /* JerichoExtractorHTML
2  *
3  * Copyright (C) 2006 Olaf Freyer
4  *
5  * This file is part of the Heritrix web crawler (crawler.archive.org).
6  *
7  * Heritrix is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU Lesser Public License as published by
9  * the Free Software Foundation; either version 2.1 of the License, or
10  * any later version.
11  *
12  * Heritrix is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU Lesser Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser Public License
18  * along with Heritrix; if not, write to the Free Software
19  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20  *
21  * $Id: JerichoExtractorHTML.java,v 1.3.2.1 2007/01/13 01:31:16 stack-sf Exp $
22  */

23 package org.archive.crawler.extractor;
24
25 import java.util.ArrayList JavaDoc;
26 import java.util.Collection JavaDoc;
27 import java.util.Iterator JavaDoc;
28 import java.util.LinkedList JavaDoc;
29 import java.util.List JavaDoc;
30 import java.util.logging.Level JavaDoc;
31 import java.util.logging.Logger JavaDoc;
32
33 import org.apache.commons.httpclient.URIException;
34 import org.apache.commons.lang.StringEscapeUtils;
35 import org.archive.crawler.datamodel.CoreAttributeConstants;
36 import org.archive.crawler.datamodel.CrawlURI;
37 import org.archive.crawler.datamodel.RobotsHonoringPolicy;
38 import org.archive.net.UURI;
39 import org.archive.net.UURIFactory;
40 import org.archive.util.DevUtils;
41 import org.archive.util.TextUtils;
42
43 import au.id.jericho.lib.html.Attribute;
44 import au.id.jericho.lib.html.Attributes;
45 import au.id.jericho.lib.html.Element;
46 import au.id.jericho.lib.html.FormControl;
47 import au.id.jericho.lib.html.FormControlType;
48 import au.id.jericho.lib.html.FormField;
49 import au.id.jericho.lib.html.FormFields;
50 import au.id.jericho.lib.html.HTMLElementName;
51 import au.id.jericho.lib.html.Source;
52 import au.id.jericho.lib.html.StartTagType;
53
54 /**
55  * Improved link-extraction from an HTML content-body using jericho-html parser.
56  * This extractor extends ExtractorHTML and mimics its workflow - but has some
57  * substantial differences when it comes to internal implementation. Instead
58  * of heavily relying upon java regular expressions it uses a real html parser
59  * library - namely Jericho HTML Parser (http://jerichohtml.sourceforge.net).
60  * Using this parser it can better handle broken html (i.e. missing quotes)
61  * and also offer improved extraction of HTML form URLs (not only extract
62  * the action of a form, but also its default values).
63  * Unfortunately this parser also has one major drawback - it has to read the
64  * whole document into memory for parsing, thus has an inherent OOME risk.
65  * This OOME risk can be reduced/eliminated by limiting the size of documents
66  * to be parsed (i.e. using NotExceedsDocumentLengthTresholdDecideRule).
67  * Also note that this extractor seems to have a lower overall memory
68  * consumption compared to ExtractorHTML. (still to be confirmed on a larger
69  * scale crawl)
70  *
71  * @author Olaf Freyer
72  * @version $Date: 2007/01/13 01:31:16 $ $Revision: 1.3.2.1 $
73  */

74 public class JerichoExtractorHTML extends ExtractorHTML implements
75         CoreAttributeConstants {
76
77     private static final long serialVersionUID = 1684681316546343615L;
78
79     private Logger JavaDoc logger = Logger.getLogger(this.getClass().getName());
80
81     protected long numberOfFormsProcessed = 0;
82
/**
 * Creates the extractor with the stock description text, which includes
 * the warning about the parser reading whole documents into memory.
 *
 * @param name name handed on to the two-argument constructor
 */
public JerichoExtractorHTML(String name) {
    this(name,
            "Jericho-HTML extractor. "
            + "Extracts links from HTML documents using Jericho HTML Parser. "
            + "Offers same basic functionality as ExtractorHTML but better "
            + "handles broken HTML and extraction of default values from "
            + "HTML forms. "
            + "A word of warning: the used parser, the Jericho HTML Parser, "
            + "reads the whole document into memory for parsing - thus this "
            + "extractor has an inherent OOME risk. "
            + "This OOME risk can be reduced/eleminated by limiting the size "
            + "of documents to be parsed (i.e. using "
            + "NotExceedsDocumentLengthTresholdDecideRule). ");
}

/**
 * Creates the extractor with a caller-supplied description.
 *
 * @param name name passed to the superclass constructor
 * @param description description passed to the superclass constructor
 */
public JerichoExtractorHTML(String name, String description) {
    super(name, description);
}
100
101     private static List JavaDoc<Attribute> findOnAttributes(Attributes attributes) {
102         List JavaDoc<Attribute> result = new LinkedList JavaDoc<Attribute>();
103         for (Iterator JavaDoc attrIter = attributes.iterator(); attrIter.hasNext();) {
104             Attribute attr = (Attribute) attrIter.next();
105             if (attr.getKey().startsWith("on"))
106                 result.add(attr);
107         }
108         return result;
109     }
110
111     protected void processGeneralTag(CrawlURI curi, Element element,
112             Attributes attributes) {
113         Attribute attr;
114         String JavaDoc attrValue;
115         List JavaDoc attrList;
116         String JavaDoc elementName = element.getName();
117
118         // Just in case it's an OBJECT or APPLET tag
119
String JavaDoc codebase = null;
120         ArrayList JavaDoc<String JavaDoc> resources = null;
121
122         final boolean framesAsEmbeds = ((Boolean JavaDoc) getUncheckedAttribute(curi,
123                 ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();
124
125         final boolean ignoreFormActions = ((Boolean JavaDoc) getUncheckedAttribute(
126                 curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
127
128         final boolean overlyEagerLinkDetection =
129             ((Boolean JavaDoc)getUncheckedAttribute(
130                 curi, ATTR_OVERLY_EAGER_LINK_DETECTION)).booleanValue();
131
132         // HREF
133
if (((attr = attributes.get("href")) != null) &&
134             ((attrValue = attr.getValue()) != null)) {
135             CharSequence JavaDoc context = Link.elementContext(elementName, attr
136                     .getKey());
137             if ("link".equals(elementName)) {
138                 // <LINK> elements treated as embeds (css, ico, etc)
139
processEmbed(curi, attrValue, context);
140             } else {
141                 // other HREFs treated as links
142
processLink(curi, attrValue, context);
143             }
144             if ("base".equals(elementName)) {
145                 try {
146                     curi.setBaseURI(attrValue);
147                 } catch (URIException e) {
148                     if (getController() != null) {
149                         // Controller can be null: e.g. when running
150
// ExtractorTool.
151
getController().logUriError(e, curi.getUURI(),
152                                 attrValue);
153                     } else {
154                         logger.info("Failed set base uri: " + curi + ", "
155                                 + attrValue + ": " + e.getMessage());
156                     }
157                 }
158             }
159         }
160         // ACTION
161
else if (((attr = attributes.get("action")) != null) &&
162                  ((attrValue = attr.getValue()) != null)) {
163             if (!ignoreFormActions) {
164                 CharSequence JavaDoc context = Link.elementContext(elementName, attr
165                         .getKey());
166                 processLink(curi, attrValue, context);
167             }
168         }
169         // ON_
170
else if ((attrList = findOnAttributes(attributes)).size() != 0) {
171             for (Iterator JavaDoc attrIter = attrList.iterator(); attrIter.hasNext();) {
172                 attr = (Attribute) attrIter.next();
173                 CharSequence JavaDoc valueSegment = attr.getValueSegment();
174                 if (valueSegment != null)
175                     processScriptCode(curi, valueSegment);
176
177             }
178         }
179         // SRC atc.
180
else if ((((attr = attributes.get("src")) != null)
181                 || ((attr = attributes.get("lowsrc")) != null)
182                 || ((attr = attributes.get("background")) != null)
183                 || ((attr = attributes.get("cite")) != null)
184                 || ((attr = attributes.get("longdesc")) != null)
185                 || ((attr = attributes.get("usemap")) != null)
186                 || ((attr = attributes.get("profile")) != null)
187                 || ((attr = attributes.get("datasrc")) != null)) &&
188                    ((attrValue = attr.getValue()) != null)) {
189
190             final char hopType;
191             CharSequence JavaDoc context = Link.elementContext(elementName, attr
192                     .getKey());
193
194             if (!framesAsEmbeds
195                     && ("frame".equals(elementName) || "iframe"
196                             .equals(elementName)))
197                 hopType = Link.NAVLINK_HOP;
198             else
199                 hopType = Link.EMBED_HOP;
200
201             processEmbed(curi, attrValue, context, hopType);
202         }
203         // CODEBASE
204
else if (((attr = attributes.get("codebase")) != null) &&
205                  ((attrValue = attr.getValue()) != null)) {
206             codebase = StringEscapeUtils.unescapeHtml(attrValue);
207             CharSequence JavaDoc context = Link.elementContext(elementName, attr
208                     .getKey());
209             processEmbed(curi, codebase, context);
210         }
211         // CLASSID DATA
212
else if ((((attr = attributes.get("classid")) != null)
213                 || ((attr = attributes.get("data")) != null)) &&
214                    ((attrValue = attr.getValue()) != null)) {
215             if (resources == null)
216                 resources = new ArrayList JavaDoc<String JavaDoc>();
217             resources.add(attrValue);
218         }
219         // ARCHIVE
220
else if (((attr = attributes.get("archive")) != null) &&
221                  ((attrValue = attr.getValue()) != null)) {
222             if (resources == null)
223                 resources = new ArrayList JavaDoc<String JavaDoc>();
224             String JavaDoc[] multi = TextUtils.split(WHITESPACE, attrValue);
225             for (int i = 0; i < multi.length; i++) {
226                 resources.add(multi[i]);
227             }
228         }
229         // CODE
230
else if (((attr = attributes.get("code")) != null) &&
231                  ((attrValue = attr.getValue()) != null)) {
232             if (resources == null)
233                 resources = new ArrayList JavaDoc<String JavaDoc>();
234             // If element is applet and code value does not end with
235
// '.class' then append '.class' to the code value.
236
if (APPLET.equals(elementName) && !attrValue.endsWith(CLASSEXT)) {
237                 resources.add(attrValue + CLASSEXT);
238             } else {
239                 resources.add(attrValue);
240             }
241         }
242         // VALUE
243
else if (((attr = attributes.get("value")) != null) &&
244                  ((attrValue = attr.getValue()) != null)) {
245             if (TextUtils.matches(LIKELY_URI_PATH, attrValue)
246                     && overlyEagerLinkDetection) {
247                 CharSequence JavaDoc context = Link.elementContext(elementName, attr
248                         .getKey());
249                 processLink(curi, attrValue, context);
250             }
251
252         }
253         // STYLE
254
else if (((attr = attributes.get("style")) != null) &&
255                  ((attrValue = attr.getValue()) != null)) {
256             // STYLE inline attribute
257
// then, parse for URIs
258
this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi,
259                     attrValue, getController());
260         }
261
262         // handle codebase/resources
263
if (resources == null)
264             return;
265
266         Iterator JavaDoc<String JavaDoc> iter = resources.iterator();
267         UURI codebaseURI = null;
268         String JavaDoc res = null;
269         try {
270             if (codebase != null) {
271                 // TODO: Pass in the charset.
272
codebaseURI = UURIFactory.getInstance(curi.getUURI(), codebase);
273             }
274             while (iter.hasNext()) {
275                 res = iter.next();
276                 res = StringEscapeUtils.unescapeHtml(res);
277                 if (codebaseURI != null) {
278                     res = codebaseURI.resolve(res).toString();
279                 }
280                 processEmbed(curi, res, element); // TODO: include attribute
281
// too
282
}
283         } catch (URIException e) {
284             curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
285         } catch (IllegalArgumentException JavaDoc e) {
286             DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n"
287                     + "codebase=" + codebase + " res=" + res + "\n"
288                     + DevUtils.extraInfo(), e);
289         }
290     }
291
292     protected boolean processMeta(CrawlURI curi, Element element) {
293         String JavaDoc name = element.getAttributeValue("name");
294         String JavaDoc httpEquiv = element.getAttributeValue("http-equiv");
295         String JavaDoc content = element.getAttributeValue("content");
296
297         if ("robots".equals(name) && content != null) {
298             curi.putString(A_META_ROBOTS, content);
299             RobotsHonoringPolicy policy = getSettingsHandler().getOrder()
300                     .getRobotsHonoringPolicy();
301             String JavaDoc contentLower = content.toLowerCase();
302             if ((policy == null || (!policy.isType(curi,
303                     RobotsHonoringPolicy.IGNORE) && !policy.isType(curi,
304                     RobotsHonoringPolicy.CUSTOM)))
305                     && (contentLower.indexOf("nofollow") >= 0 || contentLower
306                             .indexOf("none") >= 0)) {
307                 // if 'nofollow' or 'none' is specified and the
308
// honoring policy is not IGNORE or CUSTOM, end html extraction
309
logger.fine("HTML extraction skipped due to robots meta-tag " +
310                     "for: " + curi.toString());
311                 return true;
312             }
313         }
314         if ("refresh".equals(httpEquiv) && content != null) {
315             String JavaDoc refreshUri = content.substring(content.indexOf("=") + 1);
316             try {
317                 curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
318                         Link.REFER_HOP);
319             } catch (URIException e) {
320                 if (getController() != null) {
321                     getController().logUriError(e, curi.getUURI(), refreshUri);
322                 } else {
323                     logger.info("Failed createAndAddLinkRelativeToBase " + curi
324                             + ", " + element.toString() + ", " + refreshUri
325                             + ": " + e);
326                 }
327             }
328         }
329         return false;
330     }
331
332     protected void processScript(CrawlURI curi, Element element) {
333         // first, get attributes of script-open tag
334
// as per any other tag
335
processGeneralTag(curi, element, element.getAttributes());
336
337         // then, apply best-effort string-analysis heuristics
338
// against any code present (false positives are OK)
339
processScriptCode(curi, element.getContent());
340
341     }
342
343     protected void processStyle(CrawlURI curi, Element element) {
344         // First, get attributes of script-open tag as per any other tag.
345
processGeneralTag(curi, element, element.getAttributes());
346
347         // then, parse for URIs
348
this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi,
349                 element.getContent(), getController());
350     }
351
352     protected void processForm(CrawlURI curi, Element element) {
353         String JavaDoc action = element.getAttributeValue("action");
354         String JavaDoc name = element.getAttributeValue("name");
355         String JavaDoc queryURL = "";
356
357         final boolean ignoreFormActions = ((Boolean JavaDoc) getUncheckedAttribute(
358                 curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
359
360         if (ignoreFormActions)
361             return;
362
363         numberOfFormsProcessed++;
364
365         // get all form fields
366
FormFields formFields = element.findFormFields();
367         for (Iterator JavaDoc fieldsIter = formFields.iterator(); fieldsIter.hasNext();) {
368             // for each form field
369
FormField formField = (FormField) fieldsIter.next();
370
371             // for each form control
372
for (Iterator JavaDoc controlIter = formField.getFormControls().iterator();
373                 controlIter.hasNext();) {
374                 FormControl formControl = (FormControl) controlIter.next();
375
376                 // get name of control element (and URLEncode it)
377
String JavaDoc controlName = formControl.getName();
378
379                 // retrieve list of values - submit needs special handling
380
Collection JavaDoc controlValues;
381                 if (!(formControl.getFormControlType() ==
382                         FormControlType.SUBMIT)) {
383                     controlValues = formControl.getValues();
384                 } else {
385                     controlValues = formControl.getPredefinedValues();
386                 }
387
388                 if (controlValues.size() > 0) {
389                     // for each value set
390
for (Iterator JavaDoc valueIter = controlValues.iterator();
391                             valueIter.hasNext();) {
392                         String JavaDoc value = (String JavaDoc) valueIter.next();
393                         queryURL += "&" + controlName + "=" + value;
394                     }
395                 } else {
396                     queryURL += "&" + controlName + "=";
397                 }
398             }
399         }
400
401         // clean up url
402
if (action == null) {
403             queryURL = queryURL.replaceFirst("&", "?");
404         } else {
405             if (!action.contains("?"))
406                 queryURL = queryURL.replaceFirst("&", "?");
407             queryURL = action + queryURL;
408         }
409
410         CharSequence JavaDoc context = Link.elementContext(element.getName(),
411             "name=" + name);
412         processLink(curi, queryURL, context);
413
414     }
415
416     /**
417      * Run extractor. This method is package visible to ease testing.
418      *
419      * @param curi
420      * CrawlURI we're processing.
421      * @param cs
422      * Sequence from underlying ReplayCharSequence.
423      */

424     void extract(CrawlURI curi, CharSequence JavaDoc cs) {
425         Source source = new Source(cs);
426         List JavaDoc elements = source.findAllElements(StartTagType.NORMAL);
427         for (Iterator JavaDoc elementIter = elements.iterator();
428                 elementIter.hasNext();) {
429             Element element = (Element) elementIter.next();
430             String JavaDoc elementName = element.getName();
431             Attributes attributes;
432             if (elementName.equals(HTMLElementName.META)) {
433                 if (processMeta(curi, element)) {
434                     // meta tag included NOFOLLOW; abort processing
435
break;
436                 }
437             } else if (elementName.equals(HTMLElementName.SCRIPT)) {
438                 processScript(curi, element);
439             } else if (elementName.equals(HTMLElementName.STYLE)) {
440                 processStyle(curi, element);
441             } else if (elementName.equals(HTMLElementName.FORM)) {
442                 processForm(curi, element);
443             } else if (!(attributes = element.getAttributes()).isEmpty()) {
444                 processGeneralTag(curi, element, attributes);
445             }
446         }
447     }
448
449     /*
450      * (non-Javadoc)
451      *
452      * @see org.archive.crawler.framework.Processor#report()
453      */

454     public String JavaDoc report() {
455         StringBuffer JavaDoc ret = new StringBuffer JavaDoc();
456         ret.append("Processor: org.archive.crawler.extractor.JerichoExtractorHTML\n");
457         ret.append(" Function: Link extraction on HTML documents\n");
458         ret.append(" CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
459         ret.append(" Forms processed: " + this.numberOfFormsProcessed + "\n");
460         ret.append(" Links extracted: " + this.numberOfLinksExtracted + "\n\n");
461         return ret.toString();
462     }
463 }
464
Popular Tags