/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * CrawlServer.java
 * Created on Apr 17, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/datamodel/CrawlServer.java,v 1.38.4.1 2007/01/13 01:31:08 stack-sf Exp $
 */

package org.archive.crawler.datamodel;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.Checksum;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.credential.CredentialAvatar;
import org.archive.crawler.framework.Checkpointer;
import org.archive.crawler.framework.ToeThread;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.io.ReplayInputStream;
import org.archive.net.UURIFactory;
/**
 * Represents a single remote "server".
 *
 * A server is a service on a host. There might be more than one service on a
 * host, differentiated by a port number.
 *
 * @author gojomo
 */

public class CrawlServer implements Serializable,
        CrawlSubstats.HasCrawlSubstats {

    private static final long serialVersionUID = -989714570750970369L;

    public static final long ROBOTS_NOT_FETCHED = -1;

    /** Only check whether a robots fetch is perhaps superfluous
     * after this many tries. */
    public static final long MIN_ROBOTS_RETRIES = 2;

    private final String server; // actually host+port in the https case
    private int port;
    private transient SettingsHandler settingsHandler;
    private RobotsExclusionPolicy robots;
    long robotsFetched = ROBOTS_NOT_FETCHED;
    boolean validRobots = false;
    Checksum robotstxtChecksum;
    CrawlSubstats substats = new CrawlSubstats();

    // How many consecutive connection errors have been encountered;
    // used to drive an exponentially increasing retry timeout or a
    // decision to 'freeze' an entire class (queue) of URIs.
    protected int consecutiveConnectionErrors = 0;

    /**
     * Set of credential avatars.
     */
    private transient Set<CredentialAvatar> avatars = null;

    /**
     * Creates a new CrawlServer object.
     *
     * @param h the host string for the server.
     */
    public CrawlServer(String h) {
        // TODO: possibly check for illegal host string
        server = h;
        int colonIndex = server.lastIndexOf(":");
        if (colonIndex < 0) {
            port = -1;
        } else {
            try {
                port = Integer.parseInt(server.substring(colonIndex + 1));
            } catch (NumberFormatException e) {
                port = -1;
            }
        }
    }
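
    // Usage sketch (illustrative; not part of the original source). The host
    // string passed to the constructor may carry an explicit port, which is
    // parsed out; a missing or unparseable port yields -1:
    //
    //   CrawlServer plain = new CrawlServer("example.com");         // getPort() == -1
    //   CrawlServer withPort = new CrawlServer("example.com:8443"); // getPort() == 8443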

    /** Get the robots exclusion policy for this server.
     *
     * @return the robots exclusion policy for this server.
     */
    public RobotsExclusionPolicy getRobots() {
        return robots;
    }

    /** Set the robots exclusion policy for this server.
     *
     * @param policy the policy to set.
     */
    public void setRobots(RobotsExclusionPolicy policy) {
        robots = policy;
    }

    public String toString() {
        return "CrawlServer(" + server + ")";
    }
    /** Update the robots exclusion policy.
     *
     * @param curi the crawl URI containing the fetched robots.txt
     */
    public void updateRobots(CrawlURI curi) {
        RobotsHonoringPolicy honoringPolicy =
            settingsHandler.getOrder().getRobotsHonoringPolicy();

        robotsFetched = System.currentTimeMillis();

        boolean gotSomething = curi.getFetchStatus() > 0
                && curi.isHttpTransaction();
        if (!gotSomething && curi.getFetchAttempts() < MIN_ROBOTS_RETRIES) {
            // robots.txt lookup failed; no reason to consider IGNORE yet
            validRobots = false;
            return;
        }

        CrawlerSettings settings = getSettings(curi);
        int type = honoringPolicy.getType(settings);
        if (type == RobotsHonoringPolicy.IGNORE) {
            // IGNORE = ALLOWALL
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            return;
        }

        if (!gotSomething) {
            // robots.txt lookup failed and policy is not IGNORE
            validRobots = false;
            return;
        }

        if (!curi.is2XXSuccess()) {
            // Not-found, or anything but a status code in the 2xx range, is
            // treated as giving access to all of a site's content.
            // This is the prevailing practice of Google, since 4xx
            // responses on robots.txt are usually indicative of a
            // misconfiguration or blanket-block, not an intentional
            // indicator of partial blocking.
            // TODO: consider handling server errors, redirects differently
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            return;
        }

        ReplayInputStream contentBodyStream = null;
        try {
            try {
                BufferedReader reader;
                if (type == RobotsHonoringPolicy.CUSTOM) {
                    reader = new BufferedReader(new StringReader(honoringPolicy
                            .getCustomRobots(settings)));
                } else {
                    contentBodyStream = curi.getHttpRecorder()
                            .getRecordedInput().getContentReplayInputStream();

                    contentBodyStream.setToResponseBodyStart();
                    reader = new BufferedReader(new InputStreamReader(
                            contentBodyStream));
                }
                robots = RobotsExclusionPolicy.policyFor(settings,
                        reader, honoringPolicy);
                validRobots = true;
            } finally {
                if (contentBodyStream != null) {
                    contentBodyStream.close();
                }
            }
        } catch (IOException e) {
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            curi.addLocalizedError(getName(), e,
                    "robots.txt parsing IOException");
        }
    }
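
    // Summary of updateRobots() outcomes, derived from the branches above:
    //   fetch failed, attempts < MIN_ROBOTS_RETRIES -> validRobots stays false (retry)
    //   honoring policy IGNORE                      -> ALLOWALL, validRobots true
    //   fetch failed, retries exhausted             -> validRobots false
    //   non-2xx response                            -> ALLOWALL, validRobots true
    //   2xx response                                -> parse the body (or the operator's
    //                                                  CUSTOM rules); an IOException while
    //                                                  parsing falls back to ALLOWALL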

    /**
     * @return Returns the time when robots.txt was fetched.
     */
    public long getRobotsFetchedTime() {
        return robotsFetched;
    }

    /**
     * @return The server string, which may include a port number.
     */
    public String getName() {
        return server;
    }

    /** Get the port number for this server.
     *
     * @return the port number, or -1 if not known (uses default for protocol)
     */
    public int getPort() {
        return port;
    }

    /**
     * Called when the object is being deserialized.
     * In addition to the default java deserialization, this method
     * re-establishes the references to the settings handler and robots
     * honoring policy.
     *
     * @param stream the stream to deserialize from.
     * @throws IOException if I/O errors occur
     * @throws ClassNotFoundException If the class for an object being restored
     * cannot be found.
     */
    private void readObject(ObjectInputStream stream)
            throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        Thread t = Thread.currentThread();
        if (t instanceof Checkpointer.CheckpointingThread) {
            settingsHandler = ((Checkpointer.CheckpointingThread)t)
                .getController().getSettingsHandler();
        } else if (t instanceof ToeThread) {
            settingsHandler = ((ToeThread) Thread.currentThread())
                .getController().getSettingsHandler();
        } else {
            // TODO: log differently? (if we don't throw here,
            // an NPE is inevitable)
            throw new RuntimeException("CrawlServer must deserialize " +
                    "in a ToeThread or CheckpointingThread");
        }
        postDeserialize();
    }

    private void postDeserialize() {
        if (this.robots != null) {
            RobotsHonoringPolicy honoringPolicy =
                settingsHandler.getOrder().getRobotsHonoringPolicy();
            this.robots.honoringPolicy = honoringPolicy;
        }
    }

    /** Get the settings handler.
     *
     * @return the settings handler.
     */
    public SettingsHandler getSettingsHandler() {
        return this.settingsHandler;
    }

    /** Get the settings object in effect for this server.
     *
     * @param curi
     * @return the settings object in effect for this server, or null if the
     * URI's host could not be resolved.
     */
    private CrawlerSettings getSettings(CandidateURI curi) {
        try {
            return this.settingsHandler.
                getSettings(curi.getUURI().getReferencedHost(),
                    curi.getUURI());
        } catch (URIException e) {
            return null;
        }
    }

    /** Set the settings handler to be used by this server.
     *
     * @param settingsHandler the settings handler to be used by this server.
     */
    public void setSettingsHandler(SettingsHandler settingsHandler) {
        this.settingsHandler = settingsHandler;
    }

    public void incrementConsecutiveConnectionErrors() {
        this.consecutiveConnectionErrors++;
    }

    public void resetConsecutiveConnectionErrors() {
        this.consecutiveConnectionErrors = 0;
    }

    /**
     * @return Credential avatars for this server. Returns null if none.
     */
    public Set getCredentialAvatars() {
        return this.avatars;
    }

    /**
     * @return True if there are avatars attached to this instance.
     */
    public boolean hasCredentialAvatars() {
        return this.avatars != null && this.avatars.size() > 0;
    }

    /**
     * Add an avatar.
     *
     * @param ca Credential avatar to add to the set of avatars.
     */
    public void addCredentialAvatar(CredentialAvatar ca) {
        if (this.avatars == null) {
            this.avatars = new HashSet<CredentialAvatar>();
        }
        this.avatars.add(ca);
    }

    /**
     * If true, valid robots.txt information has been retrieved. If false,
     * either no attempt has been made to fetch robots.txt or the attempt
     * failed.
     *
     * @return Returns the validRobots flag.
     */
    public boolean isValidRobots() {
        return validRobots;
    }

    /**
     * Get the key to use when doing lookups on server instances.
     *
     * @param cauri CandidateURI we're to get a server key for.
     * @return String to use as server key.
     * @throws URIException
     */
    public static String getServerKey(CandidateURI cauri)
    throws URIException {
        // TODO: evaluate if this is really necessary -- why not
        // make the server of a dns CandidateURI the looked-up domain,
        // also simplifying FetchDNS?
        String key = cauri.getUURI().getAuthorityMinusUserinfo();
        if (key == null) {
            // Fallback for cases where getAuthority() fails (e.g. 'dns:'.
            // DNS UURIs have the 'domain' in the 'path' parameter, not
            // in the authority).
            key = cauri.getUURI().getCurrentHierPath();
            if (key != null && !key.matches("[-_\\w\\.:]+")) {
                // Not just word chars, dots, colons, dashes and
                // underscores; throw away
                key = null;
            }
        }
        if (key != null &&
                cauri.getUURI().getScheme().equals(UURIFactory.HTTPS)) {
            // If https and no port specified, add the default https port to
            // distinguish the https server from an http server without a port.
            if (!key.matches(".+:[0-9]+")) {
                key += ":" + UURIFactory.HTTPS_PORT;
            }
        }
        return key;
    }
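
    // Illustrative key derivations (not from the original source; assumes
    // UURIFactory.HTTPS_PORT is the standard 443):
    //   http://example.com/page      -> "example.com"
    //   http://example.com:8080/page -> "example.com:8080"
    //   https://example.com/page     -> "example.com:443"
    //   dns:example.com              -> "example.com" (via the hier-path fallback)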

    /* (non-Javadoc)
     * @see org.archive.crawler.datamodel.CrawlSubstats.HasCrawlSubstats#getSubstats()
     */
    public CrawlSubstats getSubstats() {
        return substats;
    }
}