CrawlStateUpdater


1   /* CrawlStateUpdater
2    *
3    * Created on Jun 5, 2003
4    *
5    * Copyright (C) 2003 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.postprocessor;
24  
25  
26  import java.util.logging.Logger  ;
27  
28  import org.apache.commons.httpclient.URIException;
29  import org.archive.crawler.datamodel.CoreAttributeConstants;
30  import org.archive.crawler.datamodel.CrawlHost;
31  import org.archive.crawler.datamodel.CrawlServer;
32  import org.archive.crawler.datamodel.CrawlURI;
33  import org.archive.crawler.datamodel.FetchStatusCodes;
34  import org.archive.crawler.framework.Processor;
35  import org.archive.crawler.framework.Frontier.FrontierGroup;
36  
37  
38  /**
39   * A step, late in the processing of a CrawlURI, for updating the per-host
40   * information that may have been affected by the fetch. This will initially
41   * be robots and ip address info; it could include other per-host stats that
42   * would affect the crawl (like total pages visited at the site) as well.
43   *
44   * @author gojomo
45   * @version $Date: 2007/01/13 01:31:24 $, $Revision: 1.11.4.1 $
46   */
47  public class CrawlStateUpdater extends Processor implements
48          CoreAttributeConstants, FetchStatusCodes {
49  
50      private static final long serialVersionUID = -1072728147960180091L;
51  
52      private static final Logger   logger =
53          Logger.getLogger(CrawlStateUpdater.class.getName());
54  
55      public CrawlStateUpdater(String   name) {
56          super(name, "Crawl state updater");
57      }
58  
59      protected void innerProcess(CrawlURI curi) {
60          // Tally per-server, per-host, per-frontier-class running totals
61          CrawlServer server =
62              getController().getServerCache().getServerFor(curi);
63          if (server != null) {
64              server.getSubstats().tally(curi);
65          }
66          CrawlHost host = 
67              getController().getServerCache().getHostFor(curi);
68          if (host != null) {
69              host.getSubstats().tally(curi);
70          } 
71          FrontierGroup group = 
72              getController().getFrontier().getGroup(curi);
73          group.getSubstats().tally(curi);
74          
75          String   scheme = curi.getUURI().getScheme().toLowerCase();
76          if (scheme.equals("http") || scheme.equals("https") &&
77                  server != null) {
78              // Update connection problems counter
79              if(curi.getFetchStatus() == S_CONNECT_FAILED) {
80                  server.incrementConsecutiveConnectionErrors();
81              } else if (curi.getFetchStatus() > 0){
82                  server.resetConsecutiveConnectionErrors();
83              }
84  
85              // Update robots info
86              try {
87                  if (curi.getUURI().getPath() != null &&
88                          curi.getUURI().getPath().equals("/robots.txt")) {
89                      // Update server with robots info
90                      server.updateRobots(curi);
91                  }
92              }
93              catch (URIException e) {
94                  logger.severe("Failed get path on " + curi.getUURI());
95              }
96          }
97      }
98  }
99
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags