RegexpHTMLLinkExtractor


1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * SimpleHTMLExtractor.java
20   * Created on Jun 5, 2003
21   *
22   * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/extractor/RegexpHTMLLinkExtractor.java,v 1.7.12.1 2007/01/13 01:31:31 stack-sf Exp $
23   */
24  package org.archive.extractor;
25  
26  import java.util.ArrayList  ;
27  import java.util.Iterator  ;
28  import java.util.LinkedList  ;
29  import java.util.logging.Level  ;
30  import java.util.logging.Logger  ;
31  import java.util.regex.Matcher  ;
32  
33  import org.apache.commons.httpclient.URIException;
34  import org.archive.crawler.extractor.Link;
35  import org.archive.net.UURI;
36  import org.archive.net.UURIFactory;
37  import org.archive.util.DevUtils;
38  import org.archive.util.TextUtils;
39  
40  
41  /**
42   * Basic link-extraction, from an HTML content-body,
43   * using regular expressions.
44   *
45   * ROUGH DRAFT IN PROGRESS / incomplete... untested...
46   *
47   * @author gojomo
48   */
49  public class RegexpHTMLLinkExtractor extends CharSequenceLinkExtractor {
50      private static Logger   logger =
51          Logger.getLogger(RegexpHTMLLinkExtractor.class.getName());
52  
53      boolean honorRobots = true;
54      boolean extractInlineCss = true;
55      boolean extractInlineJs = true;
56  
57      protected LinkedList  <Link> next = new LinkedList  <Link>();
58      protected Matcher   tags;
59  
60      /* (non-Javadoc)
61       * @see org.archive.extractor.CharSequenceLinkExtractor#findNextLink()
62       */
63      protected boolean findNextLink() {
64          if (tags == null) {
65              tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, sourceContent);
66          }
67          while(tags.find()) {
68              if(Thread.interrupted()){
69                  // TODO: throw an exception, perhaps, rather than just clear & break?
70                  break;
71              }
72              if (tags.start(8) > 0) {
73                  // comment match
74                  // for now do nothing
75              } else if (tags.start(7) > 0) {
76                  // <meta> match
77                  int start = tags.start(5);
78                  int end = tags.end(5);
79                  processMeta(sourceContent.subSequence(start, end));
80              } else if (tags.start(5) > 0) {
81                  // generic <whatever> match
82                  int start5 = tags.start(5);
83                  int end5 = tags.end(5);
84                  int start6 = tags.start(6);
85                  int end6 = tags.end(6);
86                  processGeneralTag(sourceContent.subSequence(start6, end6),
87                          sourceContent.subSequence(start5, end5));
88              } else if (tags.start(1) > 0) {
89                  // <script> match
90                  int start = tags.start(1);
91                  int end = tags.end(1);
92                  processScript(sourceContent.subSequence(start, end),
93                      tags.end(2) - start);
94              } else if (tags.start(3) > 0){
95                  // <style... match
96                  int start = tags.start(3);
97                  int end = tags.end(3);
98                  processStyle(sourceContent.subSequence(start, end),
99                      tags.end(4) - start);
100             }
101             if(!next.isEmpty()) {
102                 // at least one link found
103                 return true;
104             }
105         }
106         // no relevant tags found
107         return false;
108     }
109 
    /**
     * Regular expression source (case-insensitive, dot matches newlines)
     * that finds the tags this extractor cares about. It is a plain String;
     * findNextLink() turns it into a matcher via TextUtils.getMatcher().
     *
     * <p>
     * This pattern extracts either:
     * <ul>
     * <li> (1) whole &lt;script&gt;...&lt;/script&gt; or
     * <li> (2) &lt;style&gt;...&lt;/style&gt; or
     * <li> (3) &lt;meta ...&gt; or
     * <li> (4) any other open-tag whose name is followed by whitespace,
     * i.e. normally one with at least one attribute
     * (eg matches "&lt;a HREF='boo'&gt;" but not "&lt;/a&gt;" or "&lt;br&gt;") or
     * <li> (5) an HTML comment, &lt;!-- ... --&gt;
     * </ul>
     * <p>
     * groups:
     * <ul>
     * <li> 1: SCRIPT SRC=foo&gt;boo&lt;/SCRIPT
     * <li> 2: just script open tag
     * <li> 3: STYLE TYPE=moo&gt;zoo&lt;/STYLE
     * <li> 4: just style open tag
     * <li> 5: entire other tag, without '&lt;' '&gt;'
     * <li> 6: element
     * <li> 7: META
     * <li> 8: !-- comment --
     * </ul>
     */
    static final String   RELEVANT_TAG_EXTRACTOR =
          "(?is)<(?:((script[^>]*+)>.*?</script)|((style[^>]*+)>[^<]*+</style)|(((meta)|(?:\\w+))\\s+[^>]*+)|(!--.*?--))>";
133 
    // This pattern extracts attributes from any open-tag innards
    // matched by RELEVANT_TAG_EXTRACTOR. Attributes known to be URIs of
    // various sorts are matched specially, each in its own group.
    // Note that an attribute name is only recognised when it is preceded
    // by a whitespace character.
    static final String   EACH_ATTRIBUTE_EXTRACTOR =
      "(?is)\\s((href)|(action)|(on\\w*)"
     +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)"
     +"|(?:usemap)|(?:profile)|(?:datasrc)|(?:for))"
     +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)"
     +"|(value)|([-\\w]+))"
     +"\\s*=\\s*"
     +"(?:(?:\"(.*?)(?:\"|$))"
     +"|(?:'(.*?)(?:'|$))"
     +"|(\\S+))";
    // groups:
    // 1: attribute name (whichever of groups 2-11 matched)
    // 2: HREF - single URI relative to doc base, or occasionally javascript:
    // 3: ACTION - single URI relative to doc base, or occasionally javascript:
    // 4: ON[WHATEVER] - script handler
    // 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE,DATASRC, or FOR
    //    single URI relative to doc base
    // 6: CODEBASE - a single URI relative to doc base, affecting other
    //    attributes
    // 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied)
    // 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE
    //    (if supplied)
    // 9: CODE - a single URI relative to the CODEBASE (if specified).
    // 10: VALUE - often includes a uri path on forms
    // 11: any other attribute
    // 12: double-quote delimited attr value (closing quote optional at end of input)
    // 13: single-quote delimited attr value (closing quote optional at end of input)
    // 14: unquoted attr value, running up to the next whitespace
    // Exactly one of groups 12-14 participates in any match.
165 
166 
    // Much like the javascript likely-URI extractor, but
    // without requiring quotes -- this can indicate whether
    // an HTML tag attribute that isn't definitionally a
    // URI might be one anyway, as in form-tag VALUE attributes.
    // Shape: up to two leading dots, then text containing at least one
    // ".something" segment, with no whitespace or quote characters.
    static final String   LIKELY_URI_PATH =
     "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";
    // HTML-escaped ampersand as it appears in attribute values, and the
    // literal it is replaced with before a value is used as a URI
    static final String   ESCAPED_AMP = "&amp;";
    static final String   AMP ="&";
    // regex used to split the multi-valued ARCHIVE attribute
    static final String   WHITESPACE = "\\s";
    // suffix appended to an APPLET's CODE value when it is missing
    static final String   CLASSEXT =".class";
    // element names that processGeneralTag() treats specially
    static final String   APPLET = "applet";
    static final String   BASE = "base";
    static final String   LINK = "link";
180 
181     protected boolean processGeneralTag(CharSequence   element, CharSequence   cs) {
182 
183         Matcher   attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
184 
185         // Just in case it's an OBJECT or APPLET tag
186         String   codebase = null;
187         ArrayList  <String  > resources = null;
188         long tally = next.size();
189 
190         while (attr.find()) {
191             int valueGroup =
192                 (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 13 : 14;
193             int start = attr.start(valueGroup);
194             int end = attr.end(valueGroup);
195             CharSequence   value = cs.subSequence(start, end);
196             if (attr.start(2) > -1) {
197                 // HREF
198                 CharSequence   context = Link.elementContext(element, attr.group(2));
199                 if(element.toString().equalsIgnoreCase(LINK)) {
200                     // <LINK> elements treated as embeds (css, ico, etc)
201                     processEmbed(value, context);
202                 } else {
203                     if (element.toString().equalsIgnoreCase(BASE)) {
204                         try {
205                             base = UURIFactory.getInstance(value.toString());
206                         } catch (URIException e) {
207                             extractErrorListener.noteExtractError(e,source,value);
208                         }
209                     }
210                     // other HREFs treated as links
211                     processLink(value, context);
212                 }
213             } else if (attr.start(3) > -1) {
214                 // ACTION
215                 CharSequence   context = Link.elementContext(element, attr.group(3));
216                 processLink(value, context);
217             } else if (attr.start(4) > -1) {
218                 // ON____
219                 processScriptCode(value); // TODO: context?
220             } else if (attr.start(5) > -1) {
221                 // SRC etc.
222                 CharSequence   context = Link.elementContext(element, attr.group(5));
223                 processEmbed(value, context);
224             } else if (attr.start(6) > -1) {
225                 // CODEBASE
226                 // TODO: more HTML deescaping?
227                 codebase = TextUtils.replaceAll(ESCAPED_AMP, value, AMP);
228                 CharSequence   context = Link.elementContext(element,attr.group(6));
229                 processEmbed(codebase, context);
230             } else if (attr.start(7) > -1) {
231                 // CLASSID, DATA
232                 if (resources == null) {
233                     resources = new ArrayList  <String  >();
234                 }
235                 resources.add(value.toString());
236             } else if (attr.start(8) > -1) {
237                 // ARCHIVE
238                 if (resources==null) {
239                     resources = new ArrayList  <String  >();
240                 }
241                 String  [] multi = TextUtils.split(WHITESPACE, value);
242                 for(int i = 0; i < multi.length; i++ ) {
243                     resources.add(multi[i]);
244                 }
245             } else if (attr.start(9) > -1) {
246                 // CODE
247                 if (resources==null) {
248                     resources = new ArrayList  <String  >();
249                 }
250                 // If element is applet and code value does not end with
251                 // '.class' then append '.class' to the code value.
252                 if (element.toString().toLowerCase().equals(APPLET) &&
253                         !value.toString().toLowerCase().endsWith(CLASSEXT)) {
254                     resources.add(value.toString() + CLASSEXT);
255                 } else {
256                     resources.add(value.toString());
257                 }
258 
259             } else if (attr.start(10) > -1) {
260                 // VALUE
261                 if(TextUtils.matches(LIKELY_URI_PATH, value)) {
262                     CharSequence   context = Link.elementContext(element, attr.group(10));
263                     processLink(value, context);
264                 }
265 
266             } else if (attr.start(11) > -1) {
267                 // any other attribute
268                 // ignore for now
269                 // could probe for path- or script-looking strings, but
270                 // those should be vanishingly rare in other attributes,
271                 // and/or symptomatic of page bugs
272             }
273         }
274         TextUtils.recycleMatcher(attr);
275 
276         // handle codebase/resources
277         if (resources == null) {
278             return (tally-next.size())>0;
279         }
280         Iterator   iter = resources.iterator();
281         UURI codebaseURI = null;
282         String   res = null;
283         try {
284             if (codebase != null) {
285                 // TODO: Pass in the charset.
286                 codebaseURI = UURIFactory.getInstance(base, codebase);
287             }
288             while(iter.hasNext()) {
289                 res = iter.next().toString();
290                 // TODO: more HTML deescaping?
291                 res = TextUtils.replaceAll(ESCAPED_AMP, res, AMP);
292                 if (codebaseURI != null) {
293                     res = codebaseURI.resolve(res).toString();
294                 }
295                 processEmbed(res, element); // TODO: include attribute too
296             }
297         } catch (URIException e) {
298             extractErrorListener.noteExtractError(e,source,codebase);
299         } catch (IllegalArgumentException   e) {
300             DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +
301                 "codebase=" + codebase + " res=" + res + "\n" +
302                 DevUtils.extraInfo(), e);
303         }
304         return (tally-next.size())>0;
305     }
306 
307     /**
308      * @param cs
309      */
310     protected void processScriptCode(CharSequence   cs) {
311         RegexpJSLinkExtractor.extract(cs, source, base, next,
312                 extractErrorListener);
313     }
314 
    // Case-insensitive pattern for values that begin with the "javascript:"
    // pseudo-scheme; processLink() uses it to send such values to script
    // analysis instead of treating them as URIs.
    static final String   JAVASCRIPT = "(?i)^javascript:.*";
316 
317     /**
318      * @param value
319      * @param context
320      */
321     protected void processLink(CharSequence   value, CharSequence   context) {
322         String   link = TextUtils.replaceAll(ESCAPED_AMP, value, "&");
323 
324         if(TextUtils.matches(JAVASCRIPT, link)) {
325             processScriptCode(value.subSequence(11, value.length()));
326         } else {
327             addLinkFromString(link, context,Link.NAVLINK_HOP);
328         }
329     }
330 
331     /**
332      * @param uri
333      * @param context
334      */
335     private void addLinkFromString(String   uri, CharSequence   context, char hopType) {
336         try {
337             Link link = new Link(source, UURIFactory.getInstance(
338                     base, uri), context, hopType);
339             next.addLast(link);
340         } catch (URIException e) {
341            extractErrorListener.noteExtractError(e,source,uri);
342         }
343     }
344 
345     protected long processEmbed(CharSequence   value, CharSequence   context) {
346         String   embed = TextUtils.replaceAll(ESCAPED_AMP, value, "&");
347         addLinkFromString(embed, context,Link.EMBED_HOP);
348         return 1;
349     }
350 
    // Case-insensitive alternation of file extensions for common image,
    // video and audio formats.
    // NOTE(review): not referenced anywhere in this class -- presumably
    // used elsewhere in the package; confirm before removing.
    static final String   NON_HTML_PATH_EXTENSION =
        "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+
        "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";
354 
355     protected void processScript(CharSequence   sequence, int endOfOpenTag) {
356         // first, get attributes of script-open tag
357         // as per any other tag
358         processGeneralTag(sequence.subSequence(0,6),
359             sequence.subSequence(0,endOfOpenTag));
360 
361         // then, apply best-effort string-analysis heuristics
362         // against any code present (false positives are OK)
363         processScriptCode(sequence.subSequence(endOfOpenTag, sequence.length()));
364     }
365 
366     protected void processMeta(CharSequence   cs) {
367         Matcher   attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
368 
369         String   name = null;
370         String   httpEquiv = null;
371         String   content = null;
372 
373         while (attr.find()) {
374             int valueGroup =
375                 (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 13 : 14;
376             CharSequence   value =
377                 cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
378             if (attr.group(1).equalsIgnoreCase("name")) {
379                 name = value.toString();
380             } else if (attr.group(1).equalsIgnoreCase("http-equiv")) {
381                 httpEquiv = value.toString();
382             } else if (attr.group(1).equalsIgnoreCase("content")) {
383                 content = value.toString();
384             }
385             // TODO: handle other stuff
386         }
387         TextUtils.recycleMatcher(attr);
388         
389         // Look for the 'robots' meta-tag
390         if("robots".equalsIgnoreCase(name) && content != null ) {
391             if (getHonorRobots())  {
392             String   contentLower = content.toLowerCase();
393                 if ((contentLower.indexOf("nofollow") >= 0
394                         || contentLower.indexOf("none") >= 0)) {
395                     // if 'nofollow' or 'none' is specified and we
396                     // are honoring robots, end html extraction
397                     logger.fine("HTML extraction skipped due to robots meta-tag for: "
398                                     + source);
399                     cancelFurtherExtraction();
400                     return;
401                 }
402             }
403         } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {
404             String   refreshUri = content.substring(content.indexOf("=") + 1);
405             try {
406                 Link refreshLink = new Link(source, UURIFactory.getInstance(base,refreshUri), Link.elementContext("meta",httpEquiv),Link.REFER_HOP);
407                 next.addLast(refreshLink);
408             } catch (URIException e) {
409                 extractErrorListener.noteExtractError(e,source,refreshUri);
410             }
411         }
412     }
413 
414     /**
415      * @return whether to honor internal robots directives (eg meta robots)
416      */
417     private boolean getHonorRobots() {
418         return honorRobots;
419     }
420 
421     /**
422      * Ensure no further Links are extracted (by setting matcher up to fail)
423      */
424     private void cancelFurtherExtraction() {
425         // java 1.5 only:
426         // tags.region(tags.regionEnd(),tags.regionEnd());
427         tags.reset(""); 
428     }
429 
430     /**
431      * @param sequence
432      * @param endOfOpenTag
433      */
434     protected void processStyle(CharSequence   sequence,
435             int endOfOpenTag)
436     {
437         // First, get attributes of script-open tag as per any other tag.
438         processGeneralTag(sequence.subSequence(0,6),
439             sequence.subSequence(0,endOfOpenTag));
440 
441         // then, parse for URIs
442         RegexpCSSLinkExtractor.extract(sequence.subSequence(endOfOpenTag,
443                 sequence.length()), source, base, next, extractErrorListener);
444     }
445 
446     /**
447      * Discard all state. Another setup() is required to use again.
448      */
449     public void reset() {
450         super.reset();
451         TextUtils.recycleMatcher(tags);
452         tags = null;
453     }
454 
455     protected static CharSequenceLinkExtractor newDefaultInstance() {
456         return new RegexpHTMLLinkExtractor();
457     }
458 }
459 
460
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags