ExtractorHTML


1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * SimpleHTMLExtractor.java
20   * Created on Jun 5, 2003
21   *
22   * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/extractor/ExtractorHTML.java,v 1.79.2.1 2007/01/13 01:31:16 stack-sf Exp $
23   */
24  package org.archive.crawler.extractor;
25  
26  import java.io.IOException  ;
27  import java.util.ArrayList  ;
28  import java.util.Iterator  ;
29  import java.util.logging.Level  ;
30  import java.util.logging.Logger  ;
31  import java.util.regex.Matcher  ;
32  
33  import org.apache.commons.httpclient.URIException;
34  import org.archive.crawler.datamodel.CoreAttributeConstants;
35  import org.archive.crawler.datamodel.CrawlURI;
36  import org.archive.crawler.datamodel.RobotsHonoringPolicy;
37  import org.archive.crawler.settings.SimpleType;
38  import org.archive.crawler.settings.Type;
39  import org.archive.io.ReplayCharSequence;
40  import org.archive.net.UURI;
41  import org.archive.net.UURIFactory;
42  import org.archive.util.DevUtils;
43  import org.archive.util.HttpRecorder;
44  import org.archive.util.TextUtils;
45  
46  /**
47   * Basic link-extraction, from an HTML content-body,
48   * using regular expressions.
49   *
50   * @author gojomo
51   *
52   */
53  public class ExtractorHTML extends Extractor
54  implements CoreAttributeConstants {
55  
56      private static final long serialVersionUID = 5855731422080471017L;
57  
58      private static Logger   logger =
59          Logger.getLogger(ExtractorHTML.class.getName());
60  
61      /**
62       * Compiled relevant tag extractor.
63       *
64       * <p>
65       * This pattern extracts either:
66       * <li> (1) whole &lt;script&gt;...&lt;/script&gt; or
67       * <li> (2) &lt;style&gt;...&lt;/style&gt; or
68       * <li> (3) &lt;meta ...&gt; or
69       * <li> (4) any other open-tag with at least one attribute
70       * (eg matches "&lt;a HREF='boo'&gt;" but not "&lt;/a&gt;" or "&lt;br&gt;")
71       * <p>
72       * groups:
73       * <li> 1: SCRIPT SRC=foo&gt;boo&lt;/SCRIPT
74       * <li> 2: just script open tag
75       * <li> 3: STYLE TYPE=moo&gt;zoo&lt;/STYLE
76       * <li> 4: just style open tag
77       * <li> 5: entire other tag, without '<' '>'
78       * <li> 6: element
79       * <li> 7: META
80       * <li> 8: !-- comment --
81       */
82  // version w/ less unnecessary backtracking
83        private static final int MAX_ELEMENT_LENGTH =
84            Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
85                ".maxElementNameLength", "1024"));
86        
87        static final String   RELEVANT_TAG_EXTRACTOR =
88            "(?is)<(?:((script[^>]*+)>.*?</script)" + // 1, 2
89            "|((style[^>]*+)>[^<]*+</style)" + // 3, 4
90            "|(((meta)|(?:\\w{1,"+MAX_ELEMENT_LENGTH+"}))\\s+[^>]*+)" + // 5, 6, 7
91            "|(!--.*?--))>"; // 8 
92  
93  //    version w/ problems with unclosed script tags 
94  //    static final String RELEVANT_TAG_EXTRACTOR =
95  //    "(?is)<(?:((script.*?)>.*?</script)|((style.*?)>.*?</style)|(((meta)|(?:\\w+))\\s+.*?)|(!--.*?--))>";
96  
97  
98        
99  //    // this pattern extracts 'href' or 'src' attributes from
100 //    // any open-tag innards matched by the above
101 //    static Pattern RELEVANT_ATTRIBUTE_EXTRACTOR = Pattern.compile(
102 //     "(?is)(\\w+)(?:\\s+|(?:\\s.*?\\s))(?:(href)|(src))\\s*=(?:(?:\\s*\"(.+?)\")|(?:\\s*'(.+?)')|(\\S+))");
103 //
104 //    // this pattern extracts 'robots' attributes
105 //    static Pattern ROBOTS_ATTRIBUTE_EXTRACTOR = Pattern.compile(
106 //     "(?is)(\\w+)\\s+.*?(?:(robots))\\s*=(?:(?:\\s*\"(.+)\")|(?:\\s*'(.+)')|(\\S+))");
107 
108       private static final int MAX_ATTR_NAME_LENGTH =
109           Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
110               ".maxAttributeNameLength", "1024")); // 1K; 
111       
112       static final int MAX_ATTR_VAL_LENGTH = 
113           Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
114               ".maxAttributeValueLength", "16384")); // 16K; 
115       
116     // TODO: perhaps cut to near MAX_URI_LENGTH
117     
118     // this pattern extracts attributes from any open-tag innards
119     // matched by the above. attributes known to be URIs of various
120     // sorts are matched specially
121     static final String   EACH_ATTRIBUTE_EXTRACTOR =
122       "(?is)\\s((href)|(action)|(on\\w*)" // 1, 2, 3, 4 
123      +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)" // ...
124      +"|(?:usemap)|(?:profile)|(?:datasrc))" // 5
125      +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)" // 6, 7, 8, 9
126      +"|(value)|(style)|([-\\w]{1,"+MAX_ATTR_NAME_LENGTH+"}))" // 10, 11, 12
127      +"\\s*=\\s*"
128      +"(?:(?:\"(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:\"|$))" // 13
129      +"|(?:'(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:'|$))" // 14
130      +"|(\\S{1,"+MAX_ATTR_VAL_LENGTH+"}))"; // 15
131     // groups:
132     // 1: attribute name
133     // 2: HREF - single URI relative to doc base, or occasionally javascript:
134     // 3: ACTION - single URI relative to doc base, or occasionally javascript:
135     // 4: ON[WHATEVER] - script handler
136     // 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE, or DATASRC
137     //    single URI relative to doc base
138     // 6: CODEBASE - a single URI relative to doc base, affecting other
139     //    attributes
140     // 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied)
141     // 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE
142     //    (if supplied)
143     // 9: CODE - a single URI relative to the CODEBASE (is specified).
144     // 10: VALUE - often includes a uri path on forms
145     // 11: STYLE - inline attribute style info
146     // 12: any other attribute
147     // 13: double-quote delimited attr value
148     // 14: single-quote delimited attr value
149     // 15: space-delimited attr value
150 
151 
152     // much like the javascript likely-URI extractor, but
153     // without requiring quotes -- this can indicate whether
154     // an HTML tag attribute that isn't definitionally a
155     // URI might be one anyway, as in form-tag VALUE attributes
156     static final String   LIKELY_URI_PATH =
157      "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";
158     static final String   WHITESPACE = "\\s";
159     static final String   CLASSEXT =".class";
160     static final String   APPLET = "applet";
161     static final String   BASE = "base";
162     static final String   LINK = "link";
163     static final String   FRAME = "frame";
164     static final String   IFRAME = "iframe";
165 
166     public static final String   ATTR_TREAT_FRAMES_AS_EMBED_LINKS =
167         "treat-frames-as-embed-links";
168     
169     public static final String   ATTR_IGNORE_FORM_ACTION_URLS =
170         "ignore-form-action-urls";
171     
172     /** whether to try finding links in Javscript; default true */
173     public static final String   ATTR_EXTRACT_JAVASCRIPT =
174         "extract-javascript";
175 
176     public static final String   ATTR_OVERLY_EAGER_LINK_DETECTION =
177         "overly-eager-link-detection";
178     
179     public static final String   ATTR_IGNORE_UNEXPECTED_HTML = 
180         "ignore-unexpected-html";
181 
182     
183     protected long numberOfCURIsHandled = 0;
184     protected long numberOfLinksExtracted = 0;
185 
186     public ExtractorHTML(String   name) {
187         this(name, "HTML extractor. Extracts links from HTML documents");
188     }
189     
190     public ExtractorHTML(String   name, String   description) {
191         super(name, description);
192         Type t = addElementToDefinition(
193             new SimpleType(ATTR_EXTRACT_JAVASCRIPT,
194             "If true, in-page Javascript is scanned for strings that " +
195             "appear likely to be URIs. This typically finds both valid " +
196             "and invalid URIs, and attempts to fetch the invalid URIs " +
197             "sometimes generates webmaster concerns over odd crawler " +
198             "behavior. Default is true.",
199             Boolean.TRUE));
200         t.setExpertSetting(true);
201         t = addElementToDefinition(
202             new SimpleType(ATTR_TREAT_FRAMES_AS_EMBED_LINKS,
203             "If true, FRAME/IFRAME SRC-links are treated as embedded " +
204             "resources (like IMG, 'E' hop-type), otherwise they are " +
205             "treated as navigational links. Default is true.", Boolean.TRUE));
206         t.setExpertSetting(true);
207         t = addElementToDefinition(
208             new SimpleType(ATTR_IGNORE_FORM_ACTION_URLS,
209             "If true, URIs appearing as the ACTION attribute in " +
210             "HTML FORMs are ignored. Default is false.", Boolean.FALSE));
211         t.setExpertSetting(true);
212         t = addElementToDefinition(
213             new SimpleType(ATTR_OVERLY_EAGER_LINK_DETECTION,
214             "If true, strings that look like URIs found in unusual " +
215             "places (such as form VALUE attributes) will be extracted. " +
216             "This typically finds both valid and invalid URIs, and " +
217             "attempts to fetch the invalid URIs sometimes generate " +
218             "webmaster concerns over odd crawler behavior. Default " +
219             "is true.",
220             Boolean.TRUE));
221         t.setExpertSetting(true);
222         t = addElementToDefinition(
223             new SimpleType(ATTR_IGNORE_UNEXPECTED_HTML,
224             "If true, URIs which end in typical non-HTML extensions " +
225             "(such as .gif) will not be scanned as if it were HTML. " +
226             "Default is true.", Boolean.TRUE));
227         t.setExpertSetting(true);
228     }
229 
230     protected void processGeneralTag(CrawlURI curi, CharSequence   element,
231             CharSequence   cs) {
232 
233         Matcher   attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
234 
235         // Just in case it's an OBJECT or APPLET tag
236         String   codebase = null;
237         ArrayList  <String  > resources = null;
238         
239         final boolean framesAsEmbeds = ((Boolean  )getUncheckedAttribute(curi,
240             ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();
241 
242         final boolean ignoreFormActions = ((Boolean  )getUncheckedAttribute(curi,
243                 ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
244         
245         final boolean overlyEagerLinkDetection = ((Boolean  )getUncheckedAttribute
246                 (curi, ATTR_OVERLY_EAGER_LINK_DETECTION)).booleanValue();
247         
248         final String   elementStr = element.toString();
249 
250         while (attr.find()) {
251             int valueGroup =
252                 (attr.start(13) > -1) ? 13 : (attr.start(14) > -1) ? 14 : 15;
253             int start = attr.start(valueGroup);
254             int end = attr.end(valueGroup);
255             assert start >= 0: "Start is: " + start + ", " + curi;
256             assert end >= 0: "End is :" + end + ", " + curi;
257             CharSequence   value = cs.subSequence(start, end);
258             value = TextUtils.unescapeHtml(value);
259             if (attr.start(2) > -1) {
260                 // HREF
261                 CharSequence   context =
262                     Link.elementContext(element, attr.group(2));
263                 if(elementStr.equalsIgnoreCase(LINK)) {
264                     // <LINK> elements treated as embeds (css, ico, etc)
265                     processEmbed(curi, value, context);
266                 } else {
267                     // other HREFs treated as links
268                     processLink(curi, value, context);
269                 }
270                 if (elementStr.equalsIgnoreCase(BASE)) {
271                     try {
272                         curi.setBaseURI(value.toString());
273                     } catch (URIException e) {
274                         if (getController() != null) {
275                             // Controller can be null: e.g. when running
276                             // ExtractorTool.
277                             getController().logUriError(e, curi.getUURI(),
278                                 value.toString());
279                         } else {
280                             logger.info("Failed set base uri: " +
281                                 curi + ", " + value.toString() + ": " +
282                                 e.getMessage());
283                         }
284                     }
285                 }
286             } else if (attr.start(3) > -1) {
287                 // ACTION
288                 if (!ignoreFormActions) {
289                     CharSequence   context = Link.elementContext(element,
290                         attr.group(3));
291                     processLink(curi, value, context);
292                 }
293             } else if (attr.start(4) > -1) {
294                 // ON____
295                 processScriptCode(curi, value); // TODO: context?
296             } else if (attr.start(5) > -1) {
297                 // SRC etc.
298                 CharSequence   context = Link.elementContext(element,
299                     attr.group(5));
300                 
301                 // true, if we expect another HTML page instead of an image etc.
302                 final char hopType;
303                 
304                 if(!framesAsEmbeds
305                     && (elementStr.equalsIgnoreCase(FRAME) || elementStr
306                         .equalsIgnoreCase(IFRAME))) {
307                     hopType = Link.NAVLINK_HOP;
308                 } else {
309                     hopType = Link.EMBED_HOP;
310                 }
311                 processEmbed(curi, value, context, hopType);
312             } else if (attr.start(6) > -1) {
313                 // CODEBASE
314                 codebase = (value instanceof String  )?
315                     (String  )value: value.toString();
316                 CharSequence   context = Link.elementContext(element,
317                     attr.group(6));
318                 processEmbed(curi, codebase, context);
319             } else if (attr.start(7) > -1) {
320                 // CLASSID, DATA
321                 if (resources == null) {
322                     resources = new ArrayList  <String  >();
323                 }
324                 resources.add(value.toString());
325             } else if (attr.start(8) > -1) {
326                 // ARCHIVE
327                 if (resources==null) {
328                     resources = new ArrayList  <String  >();
329                 }
330                 String  [] multi = TextUtils.split(WHITESPACE, value);
331                 for(int i = 0; i < multi.length; i++ ) {
332                     resources.add(multi[i]);
333                 }
334             } else if (attr.start(9) > -1) {
335                 // CODE
336                 if (resources==null) {
337                     resources = new ArrayList  <String  >();
338                 }
339                 // If element is applet and code value does not end with
340                 // '.class' then append '.class' to the code value.
341                 if (elementStr.equalsIgnoreCase(APPLET) &&
342                         !value.toString().toLowerCase().endsWith(CLASSEXT)) {
343                     resources.add(value.toString() + CLASSEXT);
344                 } else {
345                     resources.add(value.toString());
346                 }
347             } else if (attr.start(10) > -1) {
348                 // VALUE, with possibility of URI
349                 if (overlyEagerLinkDetection 
350                         && TextUtils.matches(LIKELY_URI_PATH, value)) {
351                     CharSequence   context = Link.elementContext(element,
352                         attr.group(10));
353                     processLink(curi,value, context);
354                 }
355 
356             } else if (attr.start(11) > -1) {
357                 // STYLE inline attribute
358                 // then, parse for URIs
359                 this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
360                     curi, value, getController());
361                 
362             } else if (attr.start(12) > -1) {
363                 // any other attribute
364                 // ignore for now
365                 // could probe for path- or script-looking strings, but
366                 // those should be vanishingly rare in other attributes,
367                 // and/or symptomatic of page bugs
368             }
369         }
370         TextUtils.recycleMatcher(attr);
371 
372         // handle codebase/resources
373         if (resources == null) {
374             return;
375         }
376         Iterator   iter = resources.iterator();
377         UURI codebaseURI = null;
378         String   res = null;
379         try {
380             if (codebase != null) {
381                 // TODO: Pass in the charset.
382                 codebaseURI = UURIFactory.
383                     getInstance(curi.getUURI(), codebase);
384             }
385             while(iter.hasNext()) {
386                 res = iter.next().toString();
387                 res = (String  ) TextUtils.unescapeHtml(res);
388                 if (codebaseURI != null) {
389                     res = codebaseURI.resolve(res).toString();
390                 }
391                 processEmbed(curi, res, element); // TODO: include attribute too
392             }
393         } catch (URIException e) {
394             curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
395         } catch (IllegalArgumentException   e) {
396             DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +
397                 "codebase=" + codebase + " res=" + res + "\n" +
398                 DevUtils.extraInfo(), e);
399         }
400     }
401 
402     /**
403      * Extract the (java)script source in the given CharSequence. 
404      * 
405      * @param curi source CrawlURI
406      * @param cs CharSequence of javascript code
407      */
408     protected void processScriptCode(CrawlURI curi, CharSequence   cs) {
409         if((Boolean  )getUncheckedAttribute(curi, ATTR_EXTRACT_JAVASCRIPT)) {
410             this.numberOfLinksExtracted +=
411                 ExtractorJS.considerStrings(curi, cs, getController(), false);
412         } // else do nothing
413     }
414 
415     static final String   JAVASCRIPT = "(?i)^javascript:.*";
416 
417     /**
418      * Handle generic HREF cases.
419      * 
420      * @param curi
421      * @param value
422      * @param context
423      */
424     protected void processLink(CrawlURI curi, final CharSequence   value,
425             CharSequence   context) {
426         if (TextUtils.matches(JAVASCRIPT, value)) {
427             processScriptCode(curi, value. subSequence(11, value.length()));
428         } else {    
429             if (logger.isLoggable(Level.FINEST)) {
430                 logger.finest("link: " + value.toString() + " from " + curi);
431             }
432             addLinkFromString(curi,
433                 (value instanceof String  )?
434                     (String  )value: value.toString(),
435                 context, Link.NAVLINK_HOP);
436             this.numberOfLinksExtracted++;
437         }
438     }
439 
440     private void addLinkFromString(CrawlURI curi, String   uri,
441             CharSequence   context, char hopType) {
442         try {
443             // We do a 'toString' on context because its a sequence from
444             // the underlying ReplayCharSequence and the link its about
445             // to become a part of is expected to outlive the current
446             // ReplayCharSequence.
447             curi.createAndAddLinkRelativeToBase(uri, context.toString(),
448                 hopType);
449         } catch (URIException e) {
450             if (getController() != null) {
451                 getController().logUriError(e, curi.getUURI(), uri);
452             } else {
453                 logger.info("Failed createAndAddLinkRelativeToBase " +
454                     curi + ", " + uri + ", " + context + ", " + hopType +
455                     ": " + e);
456             }
457         }
458     }
459 
460     protected final void processEmbed(CrawlURI curi, CharSequence   value,
461             CharSequence   context) {
462         processEmbed(curi, value, context, Link.EMBED_HOP);
463     }
464 
465     protected void processEmbed(CrawlURI curi, final CharSequence   value,
466             CharSequence   context, char hopType) {
467         if (logger.isLoggable(Level.FINEST)) {
468             logger.finest("embed (" + hopType + "): " + value.toString() +
469                 " from " + curi);
470         }
471         addLinkFromString(curi,
472             (value instanceof String  )?
473                 (String  )value: value.toString(),
474             context, hopType);
475         this.numberOfLinksExtracted++;
476     }
477 
478     public void extract(CrawlURI curi) {
479         if (!isHttpTransactionContentToProcess(curi) ||
480                 ! (isExpectedMimeType(curi.getContentType(), "text/html")
481                    || isExpectedMimeType(curi.getContentType(), "application/xhtml"))) {
482             return;
483         }
484 
485         final boolean ignoreUnexpectedHTML =
486              ((Boolean  )getUncheckedAttribute(curi, 
487                  ATTR_IGNORE_UNEXPECTED_HTML)).booleanValue();        
488 
489         if (ignoreUnexpectedHTML) {
490             try {
491                 if(!isHtmlExpectedHere(curi)) {
492                     // HTML was not expected (eg a GIF was expected) so ignore
493                     // (as if a soft 404)
494                     return;
495                 }
496             } catch (URIException e) {
497                 logger.severe("Failed expectedHTML test: " + e.getMessage());
498             }
499         }
500 
501         this.numberOfCURIsHandled++;
502 
503         ReplayCharSequence cs = null;
504         
505         try {
506            HttpRecorder hr = curi.getHttpRecorder();
507            if (hr == null) {
508                throw new IOException  ("Why is recorder null here?");
509            }
510            cs = hr.getReplayCharSequence();
511         } catch (IOException   e) {
512             curi.addLocalizedError(this.getName(), e,
513                 "Failed get of replay char sequence " + curi.toString() +
514                     " " + e.getMessage());
515             logger.log(Level.SEVERE,"Failed get of replay char sequence in " +
516                 Thread.currentThread().getName(), e);
517         }
518         
519         if (cs == null) {
520             return;
521         }
522 
523         // We have a ReplayCharSequence open.  Wrap all in finally so we
524         // for sure close it before we leave.
525         try {
526             // Extract all links from the charsequence
527             extract(curi, cs);
528             // Set flag to indicate that link extraction is completed.
529             curi.linkExtractorFinished();
530         } finally {
531             if (cs != null) {
532                 try {
533                     cs.close();
534                 } catch (IOException   ioe) {
535                     logger.warning(TextUtils.exceptionToString(
536                         "Failed close of ReplayCharSequence.", ioe));
537                 }
538             }
539         }
540     }
541 
542     /**
543      * Run extractor.
544      * This method is package visible to ease testing.
545      * @param curi CrawlURI we're processing.
546      * @param cs Sequence from underlying ReplayCharSequence. This
547      * is TRANSIENT data. Make a copy if you want the data to live outside
548      * of this extractors' lifetime.
549      */
550     void extract(CrawlURI curi, CharSequence   cs) {
551         Matcher   tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, cs);
552         while(tags.find()) {
553             if(Thread.interrupted()){
554                 break;
555             }
556             if (tags.start(8) > 0) {
557                 // comment match
558                 // for now do nothing
559             } else if (tags.start(7) > 0) {
560                 // <meta> match
561                 int start = tags.start(5);
562                 int end = tags.end(5);
563                 assert start >= 0: "Start is: " + start + ", " + curi;
564                 assert end >= 0: "End is :" + end + ", " + curi;
565                 if (processMeta(curi,
566                     cs.subSequence(start, end))) {
567 
568                     // meta tag included NOFOLLOW; abort processing
569                     break;
570                 }
571             } else if (tags.start(5) > 0) {
572                 // generic <whatever> match
573                 int start5 = tags.start(5);
574                 int end5 = tags.end(5);
575                 assert start5 >= 0: "Start is: " + start5 + ", " + curi;
576                 assert end5 >= 0: "End is :" + end5 + ", " + curi;
577                 int start6 = tags.start(6);
578                 int end6 = tags.end(6);
579                 assert start6 >= 0: "Start is: " + start6 + ", " + curi;
580                 assert end6 >= 0: "End is :" + end6 + ", " + curi;
581                 processGeneralTag(curi,
582                     cs.subSequence(start6, end6),
583                     cs.subSequence(start5, end5));
584 
585             } else if (tags.start(1) > 0) {
586                 // <script> match
587                 int start = tags.start(1);
588                 int end = tags.end(1);
589                 assert start >= 0: "Start is: " + start + ", " + curi;
590                 assert end >= 0: "End is :" + end + ", " + curi;
591                 assert tags.end(2) >= 0: "Tags.end(2) illegal " + tags.end(2) +
592                     ", " + curi;
593                 processScript(curi, cs.subSequence(start, end),
594                     tags.end(2) - start);
595 
596             } else if (tags.start(3) > 0){
597                 // <style... match
598                 int start = tags.start(3);
599                 int end = tags.end(3);
600                 assert start >= 0: "Start is: " + start + ", " + curi;
601                 assert end >= 0: "End is :" + end + ", " + curi;
602                 assert tags.end(4) >= 0: "Tags.end(4) illegal " + tags.end(4) +
603                     ", " + curi;
604                 processStyle(curi, cs.subSequence(start, end),
605                     tags.end(4) - start);
606             }
607         }
608         TextUtils.recycleMatcher(tags);
609     }
610 
611 
612     static final String   NON_HTML_PATH_EXTENSION =
613         "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+
614         "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";
615 
616     /**
617      * Test whether this HTML is so unexpected (eg in place of a GIF URI)
618      * that it shouldn't be scanned for links.
619      *
620      * @param curi CrawlURI to examine.
621      * @return True if HTML is acceptable/expected here
622      * @throws URIException
623      */
624     protected boolean isHtmlExpectedHere(CrawlURI curi) throws URIException {
625         String   path = curi.getUURI().getPath();
626         if(path==null) {
627             // no path extension, HTML is fine
628             return true;
629         }
630         int dot = path.lastIndexOf('.');
631         if (dot < 0) {
632             // no path extension, HTML is fine
633             return true;
634         }
635         if(dot<(path.length()-5)) {
636             // extension too long to recognize, HTML is fine
637             return true;
638         }
639         String   ext = path.substring(dot+1);
640         return ! TextUtils.matches(NON_HTML_PATH_EXTENSION, ext);
641     }
642 
643     protected void processScript(CrawlURI curi, CharSequence   sequence,
644             int endOfOpenTag) {
645         // first, get attributes of script-open tag
646         // as per any other tag
647         processGeneralTag(curi,sequence.subSequence(0,6),
648             sequence.subSequence(0,endOfOpenTag));
649 
650         // then, apply best-effort string-analysis heuristics
651         // against any code present (false positives are OK)
652         processScriptCode(
653             curi, sequence.subSequence(endOfOpenTag, sequence.length()));
654     }
655 
656     /**
657      * Process metadata tags.
658      * @param curi CrawlURI we're processing.
659      * @param cs Sequence from underlying ReplayCharSequence. This
660      * is TRANSIENT data. Make a copy if you want the data to live outside
661      * of this extractors' lifetime.
662      * @return True robots exclusion metatag.
663      */
664     protected boolean processMeta(CrawlURI curi, CharSequence   cs) {
665         Matcher   attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
666         String   name = null;
667         String   httpEquiv = null;
668         String   content = null;
669         while (attr.find()) {
670             int valueGroup =
671                 (attr.start(13) > -1) ? 13 : (attr.start(14) > -1) ? 14 : 15;
672             CharSequence   value =
673                 cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
674             if (attr.group(1).equalsIgnoreCase("name")) {
675                 name = value.toString();
676             } else if (attr.group(1).equalsIgnoreCase("http-equiv")) {
677                 httpEquiv = value.toString();
678             } else if (attr.group(1).equalsIgnoreCase("content")) {
679                 content = value.toString();
680             }
681             // TODO: handle other stuff
682         }
683         TextUtils.recycleMatcher(attr);
684 
685         // Look for the 'robots' meta-tag
686         if("robots".equalsIgnoreCase(name) && content != null ) {
687             curi.putString(A_META_ROBOTS, content);
688             RobotsHonoringPolicy policy =
689                 getSettingsHandler().getOrder().getRobotsHonoringPolicy();
690             String   contentLower = content.toLowerCase();
691             if ((policy == null
692                 || (!policy.isType(curi, RobotsHonoringPolicy.IGNORE)
693                     && !policy.isType(curi, RobotsHonoringPolicy.CUSTOM)))
694                 && (contentLower.indexOf("nofollow") >= 0
695                     || contentLower.indexOf("none") >= 0)) {
696                 // if 'nofollow' or 'none' is specified and the
697                 // honoring policy is not IGNORE or CUSTOM, end html extraction
698                 logger.fine("HTML extraction skipped due to robots meta-tag for: "
699                                 + curi.toString());
700                 return true;
701             }
702         } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {
703             String   refreshUri = content.substring(content.indexOf("=") + 1);
704             try {
705                 curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
706                     Link.REFER_HOP);
707             } catch (URIException e) {
708                 if (getController() != null) {
709                     getController().logUriError(e, curi.getUURI(), refreshUri);
710                 } else {
711                     logger.info("Failed createAndAddLinkRelativeToBase " +
712                         curi + ", " + cs + ", " + refreshUri + ": " + e);
713                 }
714             }
715         }
716         return false;
717     }
718 
719     /**
720      * Process style text.
721      * @param curi CrawlURI we're processing.
722      * @param sequence Sequence from underlying ReplayCharSequence. This
723      * is TRANSIENT data. Make a copy if you want the data to live outside
724      * of this extractors' lifetime.
725      * @param endOfOpenTag
726      */
727     protected void processStyle(CrawlURI curi, CharSequence   sequence,
728             int endOfOpenTag) {
729         // First, get attributes of script-open tag as per any other tag.
730         processGeneralTag(curi, sequence.subSequence(0,6),
731             sequence.subSequence(0,endOfOpenTag));
732 
733         // then, parse for URIs
734         this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
735             curi, sequence.subSequence(endOfOpenTag,sequence.length()),
736                 getController());
737     }
738     
739 
740 
741     /* (non-Javadoc)
742      * @see org.archive.crawler.framework.Processor#report()
743      */
744     public String   report() {
745         StringBuffer   ret = new StringBuffer  ();
746         ret.append("Processor: org.archive.crawler.extractor.ExtractorHTML\n");
747         ret.append("  Function:          Link extraction on HTML documents\n");
748         ret.append("  CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
749         ret.append("  Links extracted:   " + this.numberOfLinksExtracted +
750             "\n\n");
751         return ret.toString();
752     }
753 }
754 
755
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags