KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > extractor > AggressiveExtractorHTML


1 /*
2  * AggressiveExtractorHTML
3  *
4  * $Id: AggressiveExtractorHTML.java,v 1.1.18.1 2007/01/13 01:31:15 stack-sf Exp $
5  *
6  * Created on Jan 6, 2004
7  *
8  * Copyright (C) 2004 Internet Archive.
9  *
10  * This file is part of the Heritrix web crawler (crawler.archive.org).
11  *
12  * Heritrix is free software; you can redistribute it and/or modify
13  * it under the terms of the GNU Lesser Public License as published by
14  * the Free Software Foundation; either version 2.1 of the License, or
15  * any later version.
16  *
17  * Heritrix is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20  * GNU Lesser Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser Public License
23  * along with Heritrix; if not, write to the Free Software
24  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  */

26
27 package org.archive.crawler.extractor;
28
29 import java.util.logging.Logger JavaDoc;
30
31 import org.archive.crawler.datamodel.CrawlURI;
32
33 /**
34  * Extended version of ExtractorHTML with more aggressive javascript link
35  * extraction where javascript code is parsed first with general HTML tags
36  * regexp, and then by the javascript speculative link regexp.
37  *
38  * @author Igor Ranitovic
39  *
40  */

41 public class AggressiveExtractorHTML
42 extends ExtractorHTML {
43
44     private static final long serialVersionUID = 3586060081186247087L;
45
46     static Logger JavaDoc logger =
47         Logger.getLogger(AggressiveExtractorHTML.class.getName());
48     
49     public AggressiveExtractorHTML(String JavaDoc name) {
50         super(name, "Aggressive HTML extractor. Subclasses ExtractorHTML " +
51                 " so does all that it does, except in regard to javascript " +
52                 " blocks. Here " +
53                 " it first processes as JS as its parent does, but then it " +
54                 " reruns through the JS treating it as HTML (May cause many " +
55                 " false positives). It finishes by applying heuristics " +
56                 " against script code looking for possible URIs. ");
57     }
58
59     protected void processScript(CrawlURI curi, CharSequence JavaDoc sequence,
60             int endOfOpenTag) {
61         super.processScript(curi,sequence,endOfOpenTag);
62         // then, process entire javascript code as html code
63
// this may cause a lot of false positves
64
processGeneralTag(curi, sequence.subSequence(0,6),
65             sequence.subSequence(endOfOpenTag, sequence.length()));
66     }
67
68     /* (non-Javadoc)
69      * @see org.archive.crawler.framework.Processor#report()
70      */

71     public String JavaDoc report() {
72         StringBuffer JavaDoc ret = new StringBuffer JavaDoc(256);
73         ret.append("Processor: org.archive.crawler.extractor.ExtractorHTML2\n");
74         ret.append(" Function: Link extraction on HTML documents " +
75             "(including embedded CSS)\n");
76         ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
77         ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
78         return ret.toString();
79     }
80 }
81
Popular Tags