KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > postprocessor > CrawlStateUpdater


1 /* CrawlStateUpdater
2  *
3  * Created on Jun 5, 2003
4  *
5  * Copyright (C) 2003 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.crawler.postprocessor;
24
25
26 import java.util.logging.Logger JavaDoc;
27
28 import org.apache.commons.httpclient.URIException;
29 import org.archive.crawler.datamodel.CoreAttributeConstants;
30 import org.archive.crawler.datamodel.CrawlHost;
31 import org.archive.crawler.datamodel.CrawlServer;
32 import org.archive.crawler.datamodel.CrawlURI;
33 import org.archive.crawler.datamodel.FetchStatusCodes;
34 import org.archive.crawler.framework.Processor;
35 import org.archive.crawler.framework.Frontier.FrontierGroup;
36
37
38 /**
39  * A step, late in the processing of a CrawlURI, for updating the per-host
40  * information that may have been affected by the fetch. This will initially
41  * be robots and ip address info; it could include other per-host stats that
42  * would affect the crawl (like total pages visited at the site) as well.
43  *
44  * @author gojomo
45  * @version $Date: 2007/01/13 01:31:24 $, $Revision: 1.11.4.1 $
46  */

47 public class CrawlStateUpdater extends Processor implements
48         CoreAttributeConstants, FetchStatusCodes {
49
50     private static final long serialVersionUID = -1072728147960180091L;
51
52     private static final Logger JavaDoc logger =
53         Logger.getLogger(CrawlStateUpdater.class.getName());
54
55     public CrawlStateUpdater(String JavaDoc name) {
56         super(name, "Crawl state updater");
57     }
58
59     protected void innerProcess(CrawlURI curi) {
60         // Tally per-server, per-host, per-frontier-class running totals
61
CrawlServer server =
62             getController().getServerCache().getServerFor(curi);
63         if (server != null) {
64             server.getSubstats().tally(curi);
65         }
66         CrawlHost host =
67             getController().getServerCache().getHostFor(curi);
68         if (host != null) {
69             host.getSubstats().tally(curi);
70         }
71         FrontierGroup group =
72             getController().getFrontier().getGroup(curi);
73         group.getSubstats().tally(curi);
74         
75         String JavaDoc scheme = curi.getUURI().getScheme().toLowerCase();
76         if (scheme.equals("http") || scheme.equals("https") &&
77                 server != null) {
78             // Update connection problems counter
79
if(curi.getFetchStatus() == S_CONNECT_FAILED) {
80                 server.incrementConsecutiveConnectionErrors();
81             } else if (curi.getFetchStatus() > 0){
82                 server.resetConsecutiveConnectionErrors();
83             }
84
85             // Update robots info
86
try {
87                 if (curi.getUURI().getPath() != null &&
88                         curi.getUURI().getPath().equals("/robots.txt")) {
89                     // Update server with robots info
90
server.updateRobots(curi);
91                 }
92             }
93             catch (URIException e) {
94                 logger.severe("Failed get path on " + curi.getUURI());
95             }
96         }
97     }
98 }
99
Popular Tags