// KickJava   Java API By Example, From Geeks To Geeks.
// Java > Open Source Codes > org > archive > crawler > extractor > ExtractorHTTP

/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * SimpleHTTPExtractor.java
 * Created on Jul 3, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/extractor/ExtractorHTTP.java,v 1.20.16.1 2007/01/13 01:31:16 stack-sf Exp $
 */

package org.archive.crawler.extractor;

import java.util.logging.Logger;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;

35 /**
36  * Extracts URIs from HTTP response headers.
37  * @author gojomo
38  */

39 public class ExtractorHTTP extends Processor
40 implements CoreAttributeConstants {
41
42     private static final long serialVersionUID = 8499072198570554647L;
43
44     private static final Logger JavaDoc LOGGER =
45         Logger.getLogger(ExtractorHTTP.class.getName());
46     protected long numberOfCURIsHandled = 0;
47     protected long numberOfLinksExtracted = 0;
48
49     public ExtractorHTTP(String JavaDoc name) {
50         super(name,
51             "HTTP extractor. Extracts URIs from HTTP response headers.");
52     }
53
54     public void innerProcess(CrawlURI curi) {
55         if (!curi.isHttpTransaction() || curi.getFetchStatus() <= 0) {
56             // If not http or if an error status code, skip.
57
return;
58         }
59         numberOfCURIsHandled++;
60         HttpMethod method = (HttpMethod)curi.getObject(A_HTTP_TRANSACTION);
61         addHeaderLink(curi, method.getResponseHeader("Location"));
62         addHeaderLink(curi, method.getResponseHeader("Content-Location"));
63     }
64
65     protected void addHeaderLink(CrawlURI curi, Header loc) {
66         if (loc == null) {
67             // If null, return without adding anything.
68
return;
69         }
70         // TODO: consider possibility of multiple headers
71
try {
72             curi.createAndAddLink(loc.getValue(), loc.getName() + ":",
73                 Link.REFER_HOP);
74             numberOfLinksExtracted++;
75         } catch (URIException e) {
76             // There may not be a controller (e.g. If we're being run
77
// by the extractor tool).
78
if (getController() != null) {
79                 getController().logUriError(e, curi.getUURI(), loc.getValue());
80             } else {
81                 LOGGER.info(curi + ", " + loc.getValue() + ": " +
82                     e.getMessage());
83             }
84         }
85
86     }
87
88     public String JavaDoc report() {
89         StringBuffer JavaDoc ret = new StringBuffer JavaDoc();
90         ret.append("Processor: org.archive.crawler.extractor.ExtractorHTTP\n");
91         ret.append(" Function: " +
92             "Extracts URIs from HTTP response headers\n");
93         ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
94         ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
95         return ret.toString();
96     }
97 }
// Popular Tags