KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > extractor > ExtractorHTML


1 /* Copyright (C) 2003 Internet Archive.
2  *
3  * This file is part of the Heritrix web crawler (crawler.archive.org).
4  *
5  * Heritrix is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser Public License as published by
7  * the Free Software Foundation; either version 2.1 of the License, or
8  * any later version.
9  *
10  * Heritrix is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser Public License
16  * along with Heritrix; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  *
19  * SimpleHTMLExtractor.java
20  * Created on Jun 5, 2003
21  *
22  * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/extractor/ExtractorHTML.java,v 1.79.2.1 2007/01/13 01:31:16 stack-sf Exp $
23  */

24 package org.archive.crawler.extractor;
25
26 import java.io.IOException JavaDoc;
27 import java.util.ArrayList JavaDoc;
28 import java.util.Iterator JavaDoc;
29 import java.util.logging.Level JavaDoc;
30 import java.util.logging.Logger JavaDoc;
31 import java.util.regex.Matcher JavaDoc;
32
33 import org.apache.commons.httpclient.URIException;
34 import org.archive.crawler.datamodel.CoreAttributeConstants;
35 import org.archive.crawler.datamodel.CrawlURI;
36 import org.archive.crawler.datamodel.RobotsHonoringPolicy;
37 import org.archive.crawler.settings.SimpleType;
38 import org.archive.crawler.settings.Type;
39 import org.archive.io.ReplayCharSequence;
40 import org.archive.net.UURI;
41 import org.archive.net.UURIFactory;
42 import org.archive.util.DevUtils;
43 import org.archive.util.HttpRecorder;
44 import org.archive.util.TextUtils;
45
46 /**
47  * Basic link-extraction, from an HTML content-body,
48  * using regular expressions.
49  *
50  * @author gojomo
51  *
52  */

53 public class ExtractorHTML extends Extractor
54 implements CoreAttributeConstants {
55
56     private static final long serialVersionUID = 5855731422080471017L;
57
58     private static Logger JavaDoc logger =
59         Logger.getLogger(ExtractorHTML.class.getName());
60
61     /**
62      * Compiled relevant tag extractor.
63      *
64      * <p>
65      * This pattern extracts either:
66      * <li> (1) whole &lt;script&gt;...&lt;/script&gt; or
67      * <li> (2) &lt;style&gt;...&lt;/style&gt; or
68      * <li> (3) &lt;meta ...&gt; or
69      * <li> (4) any other open-tag with at least one attribute
70      * (eg matches "&lt;a HREF='boo'&gt;" but not "&lt;/a&gt;" or "&lt;br&gt;")
71      * <p>
72      * groups:
73      * <li> 1: SCRIPT SRC=foo&gt;boo&lt;/SCRIPT
74      * <li> 2: just script open tag
75      * <li> 3: STYLE TYPE=moo&gt;zoo&lt;/STYLE
76      * <li> 4: just style open tag
77      * <li> 5: entire other tag, without '<' '>'
78      * <li> 6: element
79      * <li> 7: META
80      * <li> 8: !-- comment --
81      */

82 // version w/ less unnecessary backtracking
83
private static final int MAX_ELEMENT_LENGTH =
84           Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
85               ".maxElementNameLength", "1024"));
86       
87       static final String JavaDoc RELEVANT_TAG_EXTRACTOR =
88           "(?is)<(?:((script[^>]*+)>.*?</script)" + // 1, 2
89
"|((style[^>]*+)>[^<]*+</style)" + // 3, 4
90
"|(((meta)|(?:\\w{1,"+MAX_ELEMENT_LENGTH+"}))\\s+[^>]*+)" + // 5, 6, 7
91
"|(!--.*?--))>"; // 8
92

93 // version w/ problems with unclosed script tags
94
// static final String RELEVANT_TAG_EXTRACTOR =
95
// "(?is)<(?:((script.*?)>.*?</script)|((style.*?)>.*?</style)|(((meta)|(?:\\w+))\\s+.*?)|(!--.*?--))>";
96

97
98       
99 // // this pattern extracts 'href' or 'src' attributes from
100
// // any open-tag innards matched by the above
101
// static Pattern RELEVANT_ATTRIBUTE_EXTRACTOR = Pattern.compile(
102
// "(?is)(\\w+)(?:\\s+|(?:\\s.*?\\s))(?:(href)|(src))\\s*=(?:(?:\\s*\"(.+?)\")|(?:\\s*'(.+?)')|(\\S+))");
103
//
104
// // this pattern extracts 'robots' attributes
105
// static Pattern ROBOTS_ATTRIBUTE_EXTRACTOR = Pattern.compile(
106
// "(?is)(\\w+)\\s+.*?(?:(robots))\\s*=(?:(?:\\s*\"(.+)\")|(?:\\s*'(.+)')|(\\S+))");
107

108       private static final int MAX_ATTR_NAME_LENGTH =
109           Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
110               ".maxAttributeNameLength", "1024")); // 1K;
111

112       static final int MAX_ATTR_VAL_LENGTH =
113           Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
114               ".maxAttributeValueLength", "16384")); // 16K;
115

116     // TODO: perhaps cut to near MAX_URI_LENGTH
117

118     // this pattern extracts attributes from any open-tag innards
119
// matched by the above. attributes known to be URIs of various
120
// sorts are matched specially
121
static final String JavaDoc EACH_ATTRIBUTE_EXTRACTOR =
122       "(?is)\\s((href)|(action)|(on\\w*)" // 1, 2, 3, 4
123
+"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)" // ...
124
+"|(?:usemap)|(?:profile)|(?:datasrc))" // 5
125
+"|(codebase)|((?:classid)|(?:data))|(archive)|(code)" // 6, 7, 8, 9
126
+"|(value)|(style)|([-\\w]{1,"+MAX_ATTR_NAME_LENGTH+"}))" // 10, 11, 12
127
+"\\s*=\\s*"
128      +"(?:(?:\"(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:\"|$))" // 13
129
+"|(?:'(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:'|$))" // 14
130
+"|(\\S{1,"+MAX_ATTR_VAL_LENGTH+"}))"; // 15
131
// groups:
132
// 1: attribute name
133
// 2: HREF - single URI relative to doc base, or occasionally javascript:
134
// 3: ACTION - single URI relative to doc base, or occasionally javascript:
135
// 4: ON[WHATEVER] - script handler
136
// 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE, or DATASRC
137
// single URI relative to doc base
138
// 6: CODEBASE - a single URI relative to doc base, affecting other
139
// attributes
140
// 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied)
141
// 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE
142
// (if supplied)
143
// 9: CODE - a single URI relative to the CODEBASE (is specified).
144
// 10: VALUE - often includes a uri path on forms
145
// 11: STYLE - inline attribute style info
146
// 12: any other attribute
147
// 13: double-quote delimited attr value
148
// 14: single-quote delimited attr value
149
// 15: space-delimited attr value
150

151
152     // much like the javascript likely-URI extractor, but
153
// without requiring quotes -- this can indicate whether
154
// an HTML tag attribute that isn't definitionally a
155
// URI might be one anyway, as in form-tag VALUE attributes
156
static final String JavaDoc LIKELY_URI_PATH =
157      "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";
158     static final String JavaDoc WHITESPACE = "\\s";
159     static final String JavaDoc CLASSEXT =".class";
160     static final String JavaDoc APPLET = "applet";
161     static final String JavaDoc BASE = "base";
162     static final String JavaDoc LINK = "link";
163     static final String JavaDoc FRAME = "frame";
164     static final String JavaDoc IFRAME = "iframe";
165
166     public static final String JavaDoc ATTR_TREAT_FRAMES_AS_EMBED_LINKS =
167         "treat-frames-as-embed-links";
168     
169     public static final String JavaDoc ATTR_IGNORE_FORM_ACTION_URLS =
170         "ignore-form-action-urls";
171     
172     /** whether to try finding links in Javscript; default true */
173     public static final String JavaDoc ATTR_EXTRACT_JAVASCRIPT =
174         "extract-javascript";
175
176     public static final String JavaDoc ATTR_OVERLY_EAGER_LINK_DETECTION =
177         "overly-eager-link-detection";
178     
179     public static final String JavaDoc ATTR_IGNORE_UNEXPECTED_HTML =
180         "ignore-unexpected-html";
181
182     
183     protected long numberOfCURIsHandled = 0;
184     protected long numberOfLinksExtracted = 0;
185
186     public ExtractorHTML(String JavaDoc name) {
187         this(name, "HTML extractor. Extracts links from HTML documents");
188     }
189     
190     public ExtractorHTML(String JavaDoc name, String JavaDoc description) {
191         super(name, description);
192         Type t = addElementToDefinition(
193             new SimpleType(ATTR_EXTRACT_JAVASCRIPT,
194             "If true, in-page Javascript is scanned for strings that " +
195             "appear likely to be URIs. This typically finds both valid " +
196             "and invalid URIs, and attempts to fetch the invalid URIs " +
197             "sometimes generates webmaster concerns over odd crawler " +
198             "behavior. Default is true.",
199             Boolean.TRUE));
200         t.setExpertSetting(true);
201         t = addElementToDefinition(
202             new SimpleType(ATTR_TREAT_FRAMES_AS_EMBED_LINKS,
203             "If true, FRAME/IFRAME SRC-links are treated as embedded " +
204             "resources (like IMG, 'E' hop-type), otherwise they are " +
205             "treated as navigational links. Default is true.", Boolean.TRUE));
206         t.setExpertSetting(true);
207         t = addElementToDefinition(
208             new SimpleType(ATTR_IGNORE_FORM_ACTION_URLS,
209             "If true, URIs appearing as the ACTION attribute in " +
210             "HTML FORMs are ignored. Default is false.", Boolean.FALSE));
211         t.setExpertSetting(true);
212         t = addElementToDefinition(
213             new SimpleType(ATTR_OVERLY_EAGER_LINK_DETECTION,
214             "If true, strings that look like URIs found in unusual " +
215             "places (such as form VALUE attributes) will be extracted. " +
216             "This typically finds both valid and invalid URIs, and " +
217             "attempts to fetch the invalid URIs sometimes generate " +
218             "webmaster concerns over odd crawler behavior. Default " +
219             "is true.",
220             Boolean.TRUE));
221         t.setExpertSetting(true);
222         t = addElementToDefinition(
223             new SimpleType(ATTR_IGNORE_UNEXPECTED_HTML,
224             "If true, URIs which end in typical non-HTML extensions " +
225             "(such as .gif) will not be scanned as if it were HTML. " +
226             "Default is true.", Boolean.TRUE));
227         t.setExpertSetting(true);
228     }
229
230     protected void processGeneralTag(CrawlURI curi, CharSequence JavaDoc element,
231             CharSequence JavaDoc cs) {
232
233         Matcher JavaDoc attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
234
235         // Just in case it's an OBJECT or APPLET tag
236
String JavaDoc codebase = null;
237         ArrayList JavaDoc<String JavaDoc> resources = null;
238         
239         final boolean framesAsEmbeds = ((Boolean JavaDoc)getUncheckedAttribute(curi,
240             ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();
241
242         final boolean ignoreFormActions = ((Boolean JavaDoc)getUncheckedAttribute(curi,
243                 ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
244         
245         final boolean overlyEagerLinkDetection = ((Boolean JavaDoc)getUncheckedAttribute
246                 (curi, ATTR_OVERLY_EAGER_LINK_DETECTION)).booleanValue();
247         
248         final String JavaDoc elementStr = element.toString();
249
250         while (attr.find()) {
251             int valueGroup =
252                 (attr.start(13) > -1) ? 13 : (attr.start(14) > -1) ? 14 : 15;
253             int start = attr.start(valueGroup);
254             int end = attr.end(valueGroup);
255             assert start >= 0: "Start is: " + start + ", " + curi;
256             assert end >= 0: "End is :" + end + ", " + curi;
257             CharSequence JavaDoc value = cs.subSequence(start, end);
258             value = TextUtils.unescapeHtml(value);
259             if (attr.start(2) > -1) {
260                 // HREF
261
CharSequence JavaDoc context =
262                     Link.elementContext(element, attr.group(2));
263                 if(elementStr.equalsIgnoreCase(LINK)) {
264                     // <LINK> elements treated as embeds (css, ico, etc)
265
processEmbed(curi, value, context);
266                 } else {
267                     // other HREFs treated as links
268
processLink(curi, value, context);
269                 }
270                 if (elementStr.equalsIgnoreCase(BASE)) {
271                     try {
272                         curi.setBaseURI(value.toString());
273                     } catch (URIException e) {
274                         if (getController() != null) {
275                             // Controller can be null: e.g. when running
276
// ExtractorTool.
277
getController().logUriError(e, curi.getUURI(),
278                                 value.toString());
279                         } else {
280                             logger.info("Failed set base uri: " +
281                                 curi + ", " + value.toString() + ": " +
282                                 e.getMessage());
283                         }
284                     }
285                 }
286             } else if (attr.start(3) > -1) {
287                 // ACTION
288
if (!ignoreFormActions) {
289                     CharSequence JavaDoc context = Link.elementContext(element,
290                         attr.group(3));
291                     processLink(curi, value, context);
292                 }
293             } else if (attr.start(4) > -1) {
294                 // ON____
295
processScriptCode(curi, value); // TODO: context?
296
} else if (attr.start(5) > -1) {
297                 // SRC etc.
298
CharSequence JavaDoc context = Link.elementContext(element,
299                     attr.group(5));
300                 
301                 // true, if we expect another HTML page instead of an image etc.
302
final char hopType;
303                 
304                 if(!framesAsEmbeds
305                     && (elementStr.equalsIgnoreCase(FRAME) || elementStr
306                         .equalsIgnoreCase(IFRAME))) {
307                     hopType = Link.NAVLINK_HOP;
308                 } else {
309                     hopType = Link.EMBED_HOP;
310                 }
311                 processEmbed(curi, value, context, hopType);
312             } else if (attr.start(6) > -1) {
313                 // CODEBASE
314
codebase = (value instanceof String JavaDoc)?
315                     (String JavaDoc)value: value.toString();
316                 CharSequence JavaDoc context = Link.elementContext(element,
317                     attr.group(6));
318                 processEmbed(curi, codebase, context);
319             } else if (attr.start(7) > -1) {
320                 // CLASSID, DATA
321
if (resources == null) {
322                     resources = new ArrayList JavaDoc<String JavaDoc>();
323                 }
324                 resources.add(value.toString());
325             } else if (attr.start(8) > -1) {
326                 // ARCHIVE
327
if (resources==null) {
328                     resources = new ArrayList JavaDoc<String JavaDoc>();
329                 }
330                 String JavaDoc[] multi = TextUtils.split(WHITESPACE, value);
331                 for(int i = 0; i < multi.length; i++ ) {
332                     resources.add(multi[i]);
333                 }
334             } else if (attr.start(9) > -1) {
335                 // CODE
336
if (resources==null) {
337                     resources = new ArrayList JavaDoc<String JavaDoc>();
338                 }
339                 // If element is applet and code value does not end with
340
// '.class' then append '.class' to the code value.
341
if (elementStr.equalsIgnoreCase(APPLET) &&
342                         !value.toString().toLowerCase().endsWith(CLASSEXT)) {
343                     resources.add(value.toString() + CLASSEXT);
344                 } else {
345                     resources.add(value.toString());
346                 }
347             } else if (attr.start(10) > -1) {
348                 // VALUE, with possibility of URI
349
if (overlyEagerLinkDetection
350                         && TextUtils.matches(LIKELY_URI_PATH, value)) {
351                     CharSequence JavaDoc context = Link.elementContext(element,
352                         attr.group(10));
353                     processLink(curi,value, context);
354                 }
355
356             } else if (attr.start(11) > -1) {
357                 // STYLE inline attribute
358
// then, parse for URIs
359
this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
360                     curi, value, getController());
361                 
362             } else if (attr.start(12) > -1) {
363                 // any other attribute
364
// ignore for now
365
// could probe for path- or script-looking strings, but
366
// those should be vanishingly rare in other attributes,
367
// and/or symptomatic of page bugs
368
}
369         }
370         TextUtils.recycleMatcher(attr);
371
372         // handle codebase/resources
373
if (resources == null) {
374             return;
375         }
376         Iterator JavaDoc iter = resources.iterator();
377         UURI codebaseURI = null;
378         String JavaDoc res = null;
379         try {
380             if (codebase != null) {
381                 // TODO: Pass in the charset.
382
codebaseURI = UURIFactory.
383                     getInstance(curi.getUURI(), codebase);
384             }
385             while(iter.hasNext()) {
386                 res = iter.next().toString();
387                 res = (String JavaDoc) TextUtils.unescapeHtml(res);
388                 if (codebaseURI != null) {
389                     res = codebaseURI.resolve(res).toString();
390                 }
391                 processEmbed(curi, res, element); // TODO: include attribute too
392
}
393         } catch (URIException e) {
394             curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
395         } catch (IllegalArgumentException JavaDoc e) {
396             DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +
397                 "codebase=" + codebase + " res=" + res + "\n" +
398                 DevUtils.extraInfo(), e);
399         }
400     }
401
402     /**
403      * Extract the (java)script source in the given CharSequence.
404      *
405      * @param curi source CrawlURI
406      * @param cs CharSequence of javascript code
407      */

408     protected void processScriptCode(CrawlURI curi, CharSequence JavaDoc cs) {
409         if((Boolean JavaDoc)getUncheckedAttribute(curi, ATTR_EXTRACT_JAVASCRIPT)) {
410             this.numberOfLinksExtracted +=
411                 ExtractorJS.considerStrings(curi, cs, getController(), false);
412         } // else do nothing
413
}
414
415     static final String JavaDoc JAVASCRIPT = "(?i)^javascript:.*";
416
417     /**
418      * Handle generic HREF cases.
419      *
420      * @param curi
421      * @param value
422      * @param context
423      */

424     protected void processLink(CrawlURI curi, final CharSequence JavaDoc value,
425             CharSequence JavaDoc context) {
426         if (TextUtils.matches(JAVASCRIPT, value)) {
427             processScriptCode(curi, value. subSequence(11, value.length()));
428         } else {
429             if (logger.isLoggable(Level.FINEST)) {
430                 logger.finest("link: " + value.toString() + " from " + curi);
431             }
432             addLinkFromString(curi,
433                 (value instanceof String JavaDoc)?
434                     (String JavaDoc)value: value.toString(),
435                 context, Link.NAVLINK_HOP);
436             this.numberOfLinksExtracted++;
437         }
438     }
439
440     private void addLinkFromString(CrawlURI curi, String JavaDoc uri,
441             CharSequence JavaDoc context, char hopType) {
442         try {
443             // We do a 'toString' on context because its a sequence from
444
// the underlying ReplayCharSequence and the link its about
445
// to become a part of is expected to outlive the current
446
// ReplayCharSequence.
447
curi.createAndAddLinkRelativeToBase(uri, context.toString(),
448                 hopType);
449         } catch (URIException e) {
450             if (getController() != null) {
451                 getController().logUriError(e, curi.getUURI(), uri);
452             } else {
453                 logger.info("Failed createAndAddLinkRelativeToBase " +
454                     curi + ", " + uri + ", " + context + ", " + hopType +
455                     ": " + e);
456             }
457         }
458     }
459
460     protected final void processEmbed(CrawlURI curi, CharSequence JavaDoc value,
461             CharSequence JavaDoc context) {
462         processEmbed(curi, value, context, Link.EMBED_HOP);
463     }
464
465     protected void processEmbed(CrawlURI curi, final CharSequence JavaDoc value,
466             CharSequence JavaDoc context, char hopType) {
467         if (logger.isLoggable(Level.FINEST)) {
468             logger.finest("embed (" + hopType + "): " + value.toString() +
469                 " from " + curi);
470         }
471         addLinkFromString(curi,
472             (value instanceof String JavaDoc)?
473                 (String JavaDoc)value: value.toString(),
474             context, hopType);
475         this.numberOfLinksExtracted++;
476     }
477
478     public void extract(CrawlURI curi) {
479         if (!isHttpTransactionContentToProcess(curi) ||
480                 ! (isExpectedMimeType(curi.getContentType(), "text/html")
481                    || isExpectedMimeType(curi.getContentType(), "application/xhtml"))) {
482             return;
483         }
484
485         final boolean ignoreUnexpectedHTML =
486              ((Boolean JavaDoc)getUncheckedAttribute(curi,
487                  ATTR_IGNORE_UNEXPECTED_HTML)).booleanValue();
488
489         if (ignoreUnexpectedHTML) {
490             try {
491                 if(!isHtmlExpectedHere(curi)) {
492                     // HTML was not expected (eg a GIF was expected) so ignore
493
// (as if a soft 404)
494
return;
495                 }
496             } catch (URIException e) {
497                 logger.severe("Failed expectedHTML test: " + e.getMessage());
498             }
499         }
500
501         this.numberOfCURIsHandled++;
502
503         ReplayCharSequence cs = null;
504         
505         try {
506            HttpRecorder hr = curi.getHttpRecorder();
507            if (hr == null) {
508                throw new IOException JavaDoc("Why is recorder null here?");
509            }
510            cs = hr.getReplayCharSequence();
511         } catch (IOException JavaDoc e) {
512             curi.addLocalizedError(this.getName(), e,
513                 "Failed get of replay char sequence " + curi.toString() +
514                     " " + e.getMessage());
515             logger.log(Level.SEVERE,"Failed get of replay char sequence in " +
516                 Thread.currentThread().getName(), e);
517         }
518         
519         if (cs == null) {
520             return;
521         }
522
523         // We have a ReplayCharSequence open. Wrap all in finally so we
524
// for sure close it before we leave.
525
try {
526             // Extract all links from the charsequence
527
extract(curi, cs);
528             // Set flag to indicate that link extraction is completed.
529
curi.linkExtractorFinished();
530         } finally {
531             if (cs != null) {
532                 try {
533                     cs.close();
534                 } catch (IOException JavaDoc ioe) {
535                     logger.warning(TextUtils.exceptionToString(
536                         "Failed close of ReplayCharSequence.", ioe));
537                 }
538             }
539         }
540     }
541
542     /**
543      * Run extractor.
544      * This method is package visible to ease testing.
545      * @param curi CrawlURI we're processing.
546      * @param cs Sequence from underlying ReplayCharSequence. This
547      * is TRANSIENT data. Make a copy if you want the data to live outside
548      * of this extractors' lifetime.
549      */

550     void extract(CrawlURI curi, CharSequence JavaDoc cs) {
551         Matcher JavaDoc tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, cs);
552         while(tags.find()) {
553             if(Thread.interrupted()){
554                 break;
555             }
556             if (tags.start(8) > 0) {
557                 // comment match
558
// for now do nothing
559
} else if (tags.start(7) > 0) {
560                 // <meta> match
561
int start = tags.start(5);
562                 int end = tags.end(5);
563                 assert start >= 0: "Start is: " + start + ", " + curi;
564                 assert end >= 0: "End is :" + end + ", " + curi;
565                 if (processMeta(curi,
566                     cs.subSequence(start, end))) {
567
568                     // meta tag included NOFOLLOW; abort processing
569
break;
570                 }
571             } else if (tags.start(5) > 0) {
572                 // generic <whatever> match
573
int start5 = tags.start(5);
574                 int end5 = tags.end(5);
575                 assert start5 >= 0: "Start is: " + start5 + ", " + curi;
576                 assert end5 >= 0: "End is :" + end5 + ", " + curi;
577                 int start6 = tags.start(6);
578                 int end6 = tags.end(6);
579                 assert start6 >= 0: "Start is: " + start6 + ", " + curi;
580                 assert end6 >= 0: "End is :" + end6 + ", " + curi;
581                 processGeneralTag(curi,
582                     cs.subSequence(start6, end6),
583                     cs.subSequence(start5, end5));
584
585             } else if (tags.start(1) > 0) {
586                 // <script> match
587
int start = tags.start(1);
588                 int end = tags.end(1);
589                 assert start >= 0: "Start is: " + start + ", " + curi;
590                 assert end >= 0: "End is :" + end + ", " + curi;
591                 assert tags.end(2) >= 0: "Tags.end(2) illegal " + tags.end(2) +
592                     ", " + curi;
593                 processScript(curi, cs.subSequence(start, end),
594                     tags.end(2) - start);
595
596             } else if (tags.start(3) > 0){
597                 // <style... match
598
int start = tags.start(3);
599                 int end = tags.end(3);
600                 assert start >= 0: "Start is: " + start + ", " + curi;
601                 assert end >= 0: "End is :" + end + ", " + curi;
602                 assert tags.end(4) >= 0: "Tags.end(4) illegal " + tags.end(4) +
603                     ", " + curi;
604                 processStyle(curi, cs.subSequence(start, end),
605                     tags.end(4) - start);
606             }
607         }
608         TextUtils.recycleMatcher(tags);
609     }
610
611
612     static final String JavaDoc NON_HTML_PATH_EXTENSION =
613         "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+
614         "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";
615
616     /**
617      * Test whether this HTML is so unexpected (eg in place of a GIF URI)
618      * that it shouldn't be scanned for links.
619      *
620      * @param curi CrawlURI to examine.
621      * @return True if HTML is acceptable/expected here
622      * @throws URIException
623      */

624     protected boolean isHtmlExpectedHere(CrawlURI curi) throws URIException {
625         String JavaDoc path = curi.getUURI().getPath();
626         if(path==null) {
627             // no path extension, HTML is fine
628
return true;
629         }
630         int dot = path.lastIndexOf('.');
631         if (dot < 0) {
632             // no path extension, HTML is fine
633
return true;
634         }
635         if(dot<(path.length()-5)) {
636             // extension too long to recognize, HTML is fine
637
return true;
638         }
639         String JavaDoc ext = path.substring(dot+1);
640         return ! TextUtils.matches(NON_HTML_PATH_EXTENSION, ext);
641     }
642
643     protected void processScript(CrawlURI curi, CharSequence JavaDoc sequence,
644             int endOfOpenTag) {
645         // first, get attributes of script-open tag
646
// as per any other tag
647
processGeneralTag(curi,sequence.subSequence(0,6),
648             sequence.subSequence(0,endOfOpenTag));
649
650         // then, apply best-effort string-analysis heuristics
651
// against any code present (false positives are OK)
652
processScriptCode(
653             curi, sequence.subSequence(endOfOpenTag, sequence.length()));
654     }
655
656     /**
657      * Process metadata tags.
658      * @param curi CrawlURI we're processing.
659      * @param cs Sequence from underlying ReplayCharSequence. This
660      * is TRANSIENT data. Make a copy if you want the data to live outside
661      * of this extractors' lifetime.
662      * @return True robots exclusion metatag.
663      */

664     protected boolean processMeta(CrawlURI curi, CharSequence JavaDoc cs) {
665         Matcher JavaDoc attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
666         String JavaDoc name = null;
667         String JavaDoc httpEquiv = null;
668         String JavaDoc content = null;
669         while (attr.find()) {
670             int valueGroup =
671                 (attr.start(13) > -1) ? 13 : (attr.start(14) > -1) ? 14 : 15;
672             CharSequence JavaDoc value =
673                 cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
674             if (attr.group(1).equalsIgnoreCase("name")) {
675                 name = value.toString();
676             } else if (attr.group(1).equalsIgnoreCase("http-equiv")) {
677                 httpEquiv = value.toString();
678             } else if (attr.group(1).equalsIgnoreCase("content")) {
679                 content = value.toString();
680             }
681             // TODO: handle other stuff
682
}
683         TextUtils.recycleMatcher(attr);
684
685         // Look for the 'robots' meta-tag
686
if("robots".equalsIgnoreCase(name) && content != null ) {
687             curi.putString(A_META_ROBOTS, content);
688             RobotsHonoringPolicy policy =
689                 getSettingsHandler().getOrder().getRobotsHonoringPolicy();
690             String JavaDoc contentLower = content.toLowerCase();
691             if ((policy == null
692                 || (!policy.isType(curi, RobotsHonoringPolicy.IGNORE)
693                     && !policy.isType(curi, RobotsHonoringPolicy.CUSTOM)))
694                 && (contentLower.indexOf("nofollow") >= 0
695                     || contentLower.indexOf("none") >= 0)) {
696                 // if 'nofollow' or 'none' is specified and the
697
// honoring policy is not IGNORE or CUSTOM, end html extraction
698
logger.fine("HTML extraction skipped due to robots meta-tag for: "
699                                 + curi.toString());
700                 return true;
701             }
702         } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {
703             String JavaDoc refreshUri = content.substring(content.indexOf("=") + 1);
704             try {
705                 curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
706                     Link.REFER_HOP);
707             } catch (URIException e) {
708                 if (getController() != null) {
709                     getController().logUriError(e, curi.getUURI(), refreshUri);
710                 } else {
711                     logger.info("Failed createAndAddLinkRelativeToBase " +
712                         curi + ", " + cs + ", " + refreshUri + ": " + e);
713                 }
714             }
715         }
716         return false;
717     }
718
719     /**
720      * Process style text.
721      * @param curi CrawlURI we're processing.
722      * @param sequence Sequence from underlying ReplayCharSequence. This
723      * is TRANSIENT data. Make a copy if you want the data to live outside
724      * of this extractors' lifetime.
725      * @param endOfOpenTag
726      */

727     protected void processStyle(CrawlURI curi, CharSequence JavaDoc sequence,
728             int endOfOpenTag) {
729         // First, get attributes of script-open tag as per any other tag.
730
processGeneralTag(curi, sequence.subSequence(0,6),
731             sequence.subSequence(0,endOfOpenTag));
732
733         // then, parse for URIs
734
this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
735             curi, sequence.subSequence(endOfOpenTag,sequence.length()),
736                 getController());
737     }
738     
739
740
741     /* (non-Javadoc)
742      * @see org.archive.crawler.framework.Processor#report()
743      */

744     public String JavaDoc report() {
745         StringBuffer JavaDoc ret = new StringBuffer JavaDoc();
746         ret.append("Processor: org.archive.crawler.extractor.ExtractorHTML\n");
747         ret.append(" Function: Link extraction on HTML documents\n");
748         ret.append(" CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
749         ret.append(" Links extracted: " + this.numberOfLinksExtracted +
750             "\n\n");
751         return ret.toString();
752     }
753 }
754
755
Popular Tags