KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > extractor > RegexpHTMLLinkExtractor


1 /* Copyright (C) 2003 Internet Archive.
2  *
3  * This file is part of the Heritrix web crawler (crawler.archive.org).
4  *
5  * Heritrix is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser Public License as published by
7  * the Free Software Foundation; either version 2.1 of the License, or
8  * any later version.
9  *
10  * Heritrix is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser Public License
16  * along with Heritrix; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  *
19  * RegexpHTMLLinkExtractor.java
20  * Created on Jun 5, 2003
21  *
22  * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/extractor/RegexpHTMLLinkExtractor.java,v 1.7.12.1 2007/01/13 01:31:31 stack-sf Exp $
23  */

24 package org.archive.extractor;
25
26 import java.util.ArrayList JavaDoc;
27 import java.util.Iterator JavaDoc;
28 import java.util.LinkedList JavaDoc;
29 import java.util.logging.Level JavaDoc;
30 import java.util.logging.Logger JavaDoc;
31 import java.util.regex.Matcher JavaDoc;
32
33 import org.apache.commons.httpclient.URIException;
34 import org.archive.crawler.extractor.Link;
35 import org.archive.net.UURI;
36 import org.archive.net.UURIFactory;
37 import org.archive.util.DevUtils;
38 import org.archive.util.TextUtils;
39
40
41 /**
42  * Basic link-extraction, from an HTML content-body,
43  * using regular expressions.
44  *
45  * ROUGH DRAFT IN PROGRESS / incomplete... untested...
46  *
47  * @author gojomo
48  */

49 public class RegexpHTMLLinkExtractor extends CharSequenceLinkExtractor {
50     private static Logger JavaDoc logger =
51         Logger.getLogger(RegexpHTMLLinkExtractor.class.getName());
52
53     boolean honorRobots = true;
54     boolean extractInlineCss = true;
55     boolean extractInlineJs = true;
56
57     protected LinkedList JavaDoc<Link> next = new LinkedList JavaDoc<Link>();
58     protected Matcher JavaDoc tags;
59
60     /* (non-Javadoc)
61      * @see org.archive.extractor.CharSequenceLinkExtractor#findNextLink()
62      */

63     protected boolean findNextLink() {
64         if (tags == null) {
65             tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, sourceContent);
66         }
67         while(tags.find()) {
68             if(Thread.interrupted()){
69                 // TODO: throw an exception, perhaps, rather than just clear & break?
70
break;
71             }
72             if (tags.start(8) > 0) {
73                 // comment match
74
// for now do nothing
75
} else if (tags.start(7) > 0) {
76                 // <meta> match
77
int start = tags.start(5);
78                 int end = tags.end(5);
79                 processMeta(sourceContent.subSequence(start, end));
80             } else if (tags.start(5) > 0) {
81                 // generic <whatever> match
82
int start5 = tags.start(5);
83                 int end5 = tags.end(5);
84                 int start6 = tags.start(6);
85                 int end6 = tags.end(6);
86                 processGeneralTag(sourceContent.subSequence(start6, end6),
87                         sourceContent.subSequence(start5, end5));
88             } else if (tags.start(1) > 0) {
89                 // <script> match
90
int start = tags.start(1);
91                 int end = tags.end(1);
92                 processScript(sourceContent.subSequence(start, end),
93                     tags.end(2) - start);
94             } else if (tags.start(3) > 0){
95                 // <style... match
96
int start = tags.start(3);
97                 int end = tags.end(3);
98                 processStyle(sourceContent.subSequence(start, end),
99                     tags.end(4) - start);
100             }
101             if(!next.isEmpty()) {
102                 // at least one link found
103
return true;
104             }
105         }
106         // no relevant tags found
107
return false;
108     }
109
110     /**
111      * Compiled relevant tag extractor.
112      *
113      * <p>
114      * This pattern extracts either:
115      * <li> (1) whole &lt;script&gt;...&lt;/script&gt; or
116      * <li> (2) &lt;style&gt;...&lt;/style&gt; or
117      * <li> (3) &lt;meta ...&gt; or
118      * <li> (4) any other open-tag with at least one attribute
119      * (eg matches "&lt;a HREF='boo'&gt;" but not "&lt;/a&gt;" or "&lt;br&gt;")
120      * <p>
121      * groups:
122      * <li> 1: SCRIPT SRC=foo&gt;boo&lt;/SCRIPT
123      * <li> 2: just script open tag
124      * <li> 3: STYLE TYPE=moo&gt;zoo&lt;/STYLE
125      * <li> 4: just style open tag
126      * <li> 5: entire other tag, without '<' '>'
127      * <li> 6: element
128      * <li> 7: META
129      * <li> 8: !-- comment --
130      */

131     static final String JavaDoc RELEVANT_TAG_EXTRACTOR =
132           "(?is)<(?:((script[^>]*+)>.*?</script)|((style[^>]*+)>[^<]*+</style)|(((meta)|(?:\\w+))\\s+[^>]*+)|(!--.*?--))>";
133
134     // this pattern extracts attributes from any open-tag innards
135
// matched by the above. attributes known to be URIs of various
136
// sorts are matched specially
137
static final String JavaDoc EACH_ATTRIBUTE_EXTRACTOR =
138       "(?is)\\s((href)|(action)|(on\\w*)"
139      +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)"
140      +"|(?:usemap)|(?:profile)|(?:datasrc)|(?:for))"
141      +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)"
142      +"|(value)|([-\\w]+))"
143      +"\\s*=\\s*"
144      +"(?:(?:\"(.*?)(?:\"|$))"
145      +"|(?:'(.*?)(?:'|$))"
146      +"|(\\S+))";
147     // groups:
148
// 1: attribute name
149
// 2: HREF - single URI relative to doc base, or occasionally javascript:
150
// 3: ACTION - single URI relative to doc base, or occasionally javascript:
151
// 4: ON[WHATEVER] - script handler
152
// 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE,DATASRC, or FOR
153
// single URI relative to doc base
154
// 6: CODEBASE - a single URI relative to doc base, affecting other
155
// attributes
156
// 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied)
157
// 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE
158
// (if supplied)
159
// 9: CODE - a single URI relative to the CODEBASE (is specified).
160
// 10: VALUE - often includes a uri path on forms
161
// 11: any other attribute
162
// 12: double-quote delimited attr value
163
// 13: single-quote delimited attr value
164
// 14: space-delimited attr value
165

166
167     // much like the javascript likely-URI extractor, but
168
// without requiring quotes -- this can indicate whether
169
// an HTML tag attribute that isn't definitionally a
170
// URI might be one anyway, as in form-tag VALUE attributes
171
static final String JavaDoc LIKELY_URI_PATH =
172      "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";
173     static final String JavaDoc ESCAPED_AMP = "&amp;";
174     static final String JavaDoc AMP ="&";
175     static final String JavaDoc WHITESPACE = "\\s";
176     static final String JavaDoc CLASSEXT =".class";
177     static final String JavaDoc APPLET = "applet";
178     static final String JavaDoc BASE = "base";
179     static final String JavaDoc LINK = "link";
180
181     protected boolean processGeneralTag(CharSequence JavaDoc element, CharSequence JavaDoc cs) {
182
183         Matcher JavaDoc attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
184
185         // Just in case it's an OBJECT or APPLET tag
186
String JavaDoc codebase = null;
187         ArrayList JavaDoc<String JavaDoc> resources = null;
188         long tally = next.size();
189
190         while (attr.find()) {
191             int valueGroup =
192                 (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 13 : 14;
193             int start = attr.start(valueGroup);
194             int end = attr.end(valueGroup);
195             CharSequence JavaDoc value = cs.subSequence(start, end);
196             if (attr.start(2) > -1) {
197                 // HREF
198
CharSequence JavaDoc context = Link.elementContext(element, attr.group(2));
199                 if(element.toString().equalsIgnoreCase(LINK)) {
200                     // <LINK> elements treated as embeds (css, ico, etc)
201
processEmbed(value, context);
202                 } else {
203                     if (element.toString().equalsIgnoreCase(BASE)) {
204                         try {
205                             base = UURIFactory.getInstance(value.toString());
206                         } catch (URIException e) {
207                             extractErrorListener.noteExtractError(e,source,value);
208                         }
209                     }
210                     // other HREFs treated as links
211
processLink(value, context);
212                 }
213             } else if (attr.start(3) > -1) {
214                 // ACTION
215
CharSequence JavaDoc context = Link.elementContext(element, attr.group(3));
216                 processLink(value, context);
217             } else if (attr.start(4) > -1) {
218                 // ON____
219
processScriptCode(value); // TODO: context?
220
} else if (attr.start(5) > -1) {
221                 // SRC etc.
222
CharSequence JavaDoc context = Link.elementContext(element, attr.group(5));
223                 processEmbed(value, context);
224             } else if (attr.start(6) > -1) {
225                 // CODEBASE
226
// TODO: more HTML deescaping?
227
codebase = TextUtils.replaceAll(ESCAPED_AMP, value, AMP);
228                 CharSequence JavaDoc context = Link.elementContext(element,attr.group(6));
229                 processEmbed(codebase, context);
230             } else if (attr.start(7) > -1) {
231                 // CLASSID, DATA
232
if (resources == null) {
233                     resources = new ArrayList JavaDoc<String JavaDoc>();
234                 }
235                 resources.add(value.toString());
236             } else if (attr.start(8) > -1) {
237                 // ARCHIVE
238
if (resources==null) {
239                     resources = new ArrayList JavaDoc<String JavaDoc>();
240                 }
241                 String JavaDoc[] multi = TextUtils.split(WHITESPACE, value);
242                 for(int i = 0; i < multi.length; i++ ) {
243                     resources.add(multi[i]);
244                 }
245             } else if (attr.start(9) > -1) {
246                 // CODE
247
if (resources==null) {
248                     resources = new ArrayList JavaDoc<String JavaDoc>();
249                 }
250                 // If element is applet and code value does not end with
251
// '.class' then append '.class' to the code value.
252
if (element.toString().toLowerCase().equals(APPLET) &&
253                         !value.toString().toLowerCase().endsWith(CLASSEXT)) {
254                     resources.add(value.toString() + CLASSEXT);
255                 } else {
256                     resources.add(value.toString());
257                 }
258
259             } else if (attr.start(10) > -1) {
260                 // VALUE
261
if(TextUtils.matches(LIKELY_URI_PATH, value)) {
262                     CharSequence JavaDoc context = Link.elementContext(element, attr.group(10));
263                     processLink(value, context);
264                 }
265
266             } else if (attr.start(11) > -1) {
267                 // any other attribute
268
// ignore for now
269
// could probe for path- or script-looking strings, but
270
// those should be vanishingly rare in other attributes,
271
// and/or symptomatic of page bugs
272
}
273         }
274         TextUtils.recycleMatcher(attr);
275
276         // handle codebase/resources
277
if (resources == null) {
278             return (tally-next.size())>0;
279         }
280         Iterator JavaDoc iter = resources.iterator();
281         UURI codebaseURI = null;
282         String JavaDoc res = null;
283         try {
284             if (codebase != null) {
285                 // TODO: Pass in the charset.
286
codebaseURI = UURIFactory.getInstance(base, codebase);
287             }
288             while(iter.hasNext()) {
289                 res = iter.next().toString();
290                 // TODO: more HTML deescaping?
291
res = TextUtils.replaceAll(ESCAPED_AMP, res, AMP);
292                 if (codebaseURI != null) {
293                     res = codebaseURI.resolve(res).toString();
294                 }
295                 processEmbed(res, element); // TODO: include attribute too
296
}
297         } catch (URIException e) {
298             extractErrorListener.noteExtractError(e,source,codebase);
299         } catch (IllegalArgumentException JavaDoc e) {
300             DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +
301                 "codebase=" + codebase + " res=" + res + "\n" +
302                 DevUtils.extraInfo(), e);
303         }
304         return (tally-next.size())>0;
305     }
306
307     /**
308      * @param cs
309      */

310     protected void processScriptCode(CharSequence JavaDoc cs) {
311         RegexpJSLinkExtractor.extract(cs, source, base, next,
312                 extractErrorListener);
313     }
314
315     static final String JavaDoc JAVASCRIPT = "(?i)^javascript:.*";
316
317     /**
318      * @param value
319      * @param context
320      */

321     protected void processLink(CharSequence JavaDoc value, CharSequence JavaDoc context) {
322         String JavaDoc link = TextUtils.replaceAll(ESCAPED_AMP, value, "&");
323
324         if(TextUtils.matches(JAVASCRIPT, link)) {
325             processScriptCode(value.subSequence(11, value.length()));
326         } else {
327             addLinkFromString(link, context,Link.NAVLINK_HOP);
328         }
329     }
330
331     /**
332      * @param uri
333      * @param context
334      */

335     private void addLinkFromString(String JavaDoc uri, CharSequence JavaDoc context, char hopType) {
336         try {
337             Link link = new Link(source, UURIFactory.getInstance(
338                     base, uri), context, hopType);
339             next.addLast(link);
340         } catch (URIException e) {
341            extractErrorListener.noteExtractError(e,source,uri);
342         }
343     }
344
345     protected long processEmbed(CharSequence JavaDoc value, CharSequence JavaDoc context) {
346         String JavaDoc embed = TextUtils.replaceAll(ESCAPED_AMP, value, "&");
347         addLinkFromString(embed, context,Link.EMBED_HOP);
348         return 1;
349     }
350
351     static final String JavaDoc NON_HTML_PATH_EXTENSION =
352         "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+
353         "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";
354
355     protected void processScript(CharSequence JavaDoc sequence, int endOfOpenTag) {
356         // first, get attributes of script-open tag
357
// as per any other tag
358
processGeneralTag(sequence.subSequence(0,6),
359             sequence.subSequence(0,endOfOpenTag));
360
361         // then, apply best-effort string-analysis heuristics
362
// against any code present (false positives are OK)
363
processScriptCode(sequence.subSequence(endOfOpenTag, sequence.length()));
364     }
365
366     protected void processMeta(CharSequence JavaDoc cs) {
367         Matcher JavaDoc attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
368
369         String JavaDoc name = null;
370         String JavaDoc httpEquiv = null;
371         String JavaDoc content = null;
372
373         while (attr.find()) {
374             int valueGroup =
375                 (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 13 : 14;
376             CharSequence JavaDoc value =
377                 cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
378             if (attr.group(1).equalsIgnoreCase("name")) {
379                 name = value.toString();
380             } else if (attr.group(1).equalsIgnoreCase("http-equiv")) {
381                 httpEquiv = value.toString();
382             } else if (attr.group(1).equalsIgnoreCase("content")) {
383                 content = value.toString();
384             }
385             // TODO: handle other stuff
386
}
387         TextUtils.recycleMatcher(attr);
388         
389         // Look for the 'robots' meta-tag
390
if("robots".equalsIgnoreCase(name) && content != null ) {
391             if (getHonorRobots()) {
392             String JavaDoc contentLower = content.toLowerCase();
393                 if ((contentLower.indexOf("nofollow") >= 0
394                         || contentLower.indexOf("none") >= 0)) {
395                     // if 'nofollow' or 'none' is specified and we
396
// are honoring robots, end html extraction
397
logger.fine("HTML extraction skipped due to robots meta-tag for: "
398                                     + source);
399                     cancelFurtherExtraction();
400                     return;
401                 }
402             }
403         } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {
404             String JavaDoc refreshUri = content.substring(content.indexOf("=") + 1);
405             try {
406                 Link refreshLink = new Link(source, UURIFactory.getInstance(base,refreshUri), Link.elementContext("meta",httpEquiv),Link.REFER_HOP);
407                 next.addLast(refreshLink);
408             } catch (URIException e) {
409                 extractErrorListener.noteExtractError(e,source,refreshUri);
410             }
411         }
412     }
413
414     /**
415      * @return whether to honor internal robots directives (eg meta robots)
416      */

417     private boolean getHonorRobots() {
418         return honorRobots;
419     }
420
421     /**
422      * Ensure no further Links are extracted (by setting matcher up to fail)
423      */

424     private void cancelFurtherExtraction() {
425         // java 1.5 only:
426
// tags.region(tags.regionEnd(),tags.regionEnd());
427
tags.reset("");
428     }
429
430     /**
431      * @param sequence
432      * @param endOfOpenTag
433      */

434     protected void processStyle(CharSequence JavaDoc sequence,
435             int endOfOpenTag)
436     {
437         // First, get attributes of script-open tag as per any other tag.
438
processGeneralTag(sequence.subSequence(0,6),
439             sequence.subSequence(0,endOfOpenTag));
440
441         // then, parse for URIs
442
RegexpCSSLinkExtractor.extract(sequence.subSequence(endOfOpenTag,
443                 sequence.length()), source, base, next, extractErrorListener);
444     }
445
446     /**
447      * Discard all state. Another setup() is required to use again.
448      */

449     public void reset() {
450         super.reset();
451         TextUtils.recycleMatcher(tags);
452         tags = null;
453     }
454
455     protected static CharSequenceLinkExtractor newDefaultInstance() {
456         return new RegexpHTMLLinkExtractor();
457     }
458 }
459
460
Popular Tags