/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * CrawlServer.java
 * Created on Apr 17, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/datamodel/CrawlServer.java,v 1.38.4.1 2007/01/13 01:31:08 stack-sf Exp $
 */

package org.archive.crawler.datamodel;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.Checksum;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.credential.CredentialAvatar;
import org.archive.crawler.framework.Checkpointer;
import org.archive.crawler.framework.ToeThread;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.io.ReplayInputStream;
import org.archive.net.UURIFactory;
/**
 * Represents a single remote "server".
 *
 * A server is a service on a host. There might be more than one service on a
 * host, differentiated by a port number.
 *
 * @author gojomo
 */

public class CrawlServer implements Serializable,
        CrawlSubstats.HasCrawlSubstats {

    private static final long serialVersionUID = -989714570750970369L;

    public static final long ROBOTS_NOT_FETCHED = -1;

    /** Only check whether a robots fetch is perhaps superfluous
     * after this many tries. */
    public static final long MIN_ROBOTS_RETRIES = 2;

    private final String server; // actually host+port in the https case
    private int port;
    private transient SettingsHandler settingsHandler;
    private RobotsExclusionPolicy robots;
    long robotsFetched = ROBOTS_NOT_FETCHED;
    boolean validRobots = false;
    Checksum robotstxtChecksum;
    CrawlSubstats substats = new CrawlSubstats();

    // How many consecutive connection errors have been encountered;
    // used to drive an exponentially increasing retry timeout or a
    // decision to 'freeze' an entire class (queue) of URIs.
    protected int consecutiveConnectionErrors = 0;

    /**
     * Set of credential avatars.
     */
    private transient Set<CredentialAvatar> avatars = null;

    /**
     * Creates a new CrawlServer object.
     *
     * @param h the host string for the server.
     */
    public CrawlServer(String h) {
        // TODO: possibly check for illegal host string
        server = h;
        int colonIndex = server.lastIndexOf(":");
        if (colonIndex < 0) {
            port = -1;
        } else {
            try {
                port = Integer.parseInt(server.substring(colonIndex + 1));
            } catch (NumberFormatException e) {
                port = -1;
            }
        }
    }
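
    // Usage sketch (illustrative; not part of the original source). The host
    // string passed to the constructor may carry an explicit port, which is
    // parsed out; a missing or unparseable port yields -1:
    //
    //   CrawlServer plain = new CrawlServer("example.com");         // getPort() == -1
    //   CrawlServer withPort = new CrawlServer("example.com:8443"); // getPort() == 8443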

    /** Get the robots exclusion policy for this server.
     *
     * @return the robots exclusion policy for this server.
     */
    public RobotsExclusionPolicy getRobots() {
        return robots;
    }

    /** Set the robots exclusion policy for this server.
     *
     * @param policy the policy to set.
     */
    public void setRobots(RobotsExclusionPolicy policy) {
        robots = policy;
    }

    public String toString() {
        return "CrawlServer(" + server + ")";
    }
    /** Update the robots exclusion policy.
     *
     * @param curi the crawl URI containing the fetched robots.txt
     */
    public void updateRobots(CrawlURI curi) {
        RobotsHonoringPolicy honoringPolicy =
            settingsHandler.getOrder().getRobotsHonoringPolicy();

        robotsFetched = System.currentTimeMillis();

        boolean gotSomething = curi.getFetchStatus() > 0
                && curi.isHttpTransaction();
        if (!gotSomething && curi.getFetchAttempts() < MIN_ROBOTS_RETRIES) {
            // robots.txt lookup failed; no reason to consider IGNORE yet
            validRobots = false;
            return;
        }

        CrawlerSettings settings = getSettings(curi);
        int type = honoringPolicy.getType(settings);
        if (type == RobotsHonoringPolicy.IGNORE) {
            // IGNORE = ALLOWALL
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            return;
        }

        if (!gotSomething) {
            // robots.txt lookup failed and policy is not IGNORE
            validRobots = false;
            return;
        }

        if (!curi.is2XXSuccess()) {
            // Not-found, or anything but a status code in the 2xx range, is
            // treated as giving access to all of a site's content.
            // This is the prevailing practice of Google, since 4xx
            // responses on robots.txt are usually indicative of a
            // misconfiguration or blanket-block, not an intentional
            // indicator of partial blocking.
            // TODO: consider handling server errors, redirects differently
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            return;
        }

        ReplayInputStream contentBodyStream = null;
        try {
            try {
                BufferedReader reader;
                if (type == RobotsHonoringPolicy.CUSTOM) {
                    reader = new BufferedReader(new StringReader(honoringPolicy
                            .getCustomRobots(settings)));
                } else {
                    contentBodyStream = curi.getHttpRecorder()
                            .getRecordedInput().getContentReplayInputStream();

                    contentBodyStream.setToResponseBodyStart();
                    reader = new BufferedReader(new InputStreamReader(
                            contentBodyStream));
                }
                robots = RobotsExclusionPolicy.policyFor(settings,
                        reader, honoringPolicy);
                validRobots = true;
            } finally {
                if (contentBodyStream != null) {
                    contentBodyStream.close();
                }
            }
        } catch (IOException e) {
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            curi.addLocalizedError(getName(), e,
                    "robots.txt parsing IOException");
        }
    }
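
    // Summary of updateRobots() outcomes, derived from the branches above:
    //   fetch failed, attempts < MIN_ROBOTS_RETRIES -> validRobots stays false (retry)
    //   honoring policy IGNORE                      -> ALLOWALL, validRobots true
    //   fetch failed, retries exhausted             -> validRobots false
    //   non-2xx response                            -> ALLOWALL, validRobots true
    //   2xx response                                -> parse the body (or the operator's
    //                                                  CUSTOM rules); an IOException while
    //                                                  parsing falls back to ALLOWALL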

    /**
     * @return Returns the time when robots.txt was fetched.
     */
    public long getRobotsFetchedTime() {
        return robotsFetched;
    }

    /**
     * @return The server string, which may include a port number.
     */
    public String getName() {
        return server;
    }

    /** Get the port number for this server.
     *
     * @return the port number, or -1 if not known (uses default for protocol)
     */
    public int getPort() {
        return port;
    }

    /**
     * Called when the object is being deserialized.
     * In addition to the default java deserialization, this method
     * re-establishes the references to the settings handler and robots
     * honoring policy.
     *
     * @param stream the stream to deserialize from.
     * @throws IOException if I/O errors occur
     * @throws ClassNotFoundException If the class for an object being restored
     * cannot be found.
     */
    private void readObject(ObjectInputStream stream)
            throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        Thread t = Thread.currentThread();
        if (t instanceof Checkpointer.CheckpointingThread) {
            settingsHandler = ((Checkpointer.CheckpointingThread)t)
                .getController().getSettingsHandler();
        } else if (t instanceof ToeThread) {
            settingsHandler = ((ToeThread) Thread.currentThread())
                .getController().getSettingsHandler();
        } else {
            // TODO: log differently? (if we don't throw here,
            // an NPE is inevitable)
            throw new RuntimeException("CrawlServer must deserialize " +
                    "in a ToeThread or CheckpointingThread");
        }
        postDeserialize();
    }

    private void postDeserialize() {
        if (this.robots != null) {
            RobotsHonoringPolicy honoringPolicy =
                settingsHandler.getOrder().getRobotsHonoringPolicy();
            this.robots.honoringPolicy = honoringPolicy;
        }
    }

    /** Get the settings handler.
     *
     * @return the settings handler.
     */
    public SettingsHandler getSettingsHandler() {
        return this.settingsHandler;
    }

    /** Get the settings object in effect for this server.
     *
     * @param curi
     * @return the settings object in effect for this server, or null if the
     * URI's host could not be resolved.
     */
    private CrawlerSettings getSettings(CandidateURI curi) {
        try {
            return this.settingsHandler.
                getSettings(curi.getUURI().getReferencedHost(),
                    curi.getUURI());
        } catch (URIException e) {
            return null;
        }
    }

    /** Set the settings handler to be used by this server.
     *
     * @param settingsHandler the settings handler to be used by this server.
     */
    public void setSettingsHandler(SettingsHandler settingsHandler) {
        this.settingsHandler = settingsHandler;
    }

    public void incrementConsecutiveConnectionErrors() {
        this.consecutiveConnectionErrors++;
    }

    public void resetConsecutiveConnectionErrors() {
        this.consecutiveConnectionErrors = 0;
    }

    /**
     * @return Credential avatars for this server. Returns null if none.
     */
    public Set getCredentialAvatars() {
        return this.avatars;
    }

    /**
     * @return True if there are avatars attached to this instance.
     */
    public boolean hasCredentialAvatars() {
        return this.avatars != null && this.avatars.size() > 0;
    }

    /**
     * Add an avatar.
     *
     * @param ca Credential avatar to add to the set of avatars.
     */
    public void addCredentialAvatar(CredentialAvatar ca) {
        if (this.avatars == null) {
            this.avatars = new HashSet<CredentialAvatar>();
        }
        this.avatars.add(ca);
    }

    /**
     * If true, valid robots.txt information has been retrieved. If false,
     * either no attempt has been made to fetch robots.txt or the attempt
     * failed.
     *
     * @return Returns the validRobots flag.
     */
    public boolean isValidRobots() {
        return validRobots;
    }

    /**
     * Get the key to use when doing lookups on server instances.
     *
     * @param cauri CandidateURI we're to get a server key for.
     * @return String to use as server key.
     * @throws URIException
     */
    public static String getServerKey(CandidateURI cauri)
    throws URIException {
        // TODO: evaluate if this is really necessary -- why not
        // make the server of a dns CandidateURI the looked-up domain,
        // also simplifying FetchDNS?
        String key = cauri.getUURI().getAuthorityMinusUserinfo();
        if (key == null) {
            // Fallback for cases where getAuthority() fails (e.g. 'dns:'.
            // DNS UURIs have the 'domain' in the 'path' parameter, not
            // in the authority).
            key = cauri.getUURI().getCurrentHierPath();
            if (key != null && !key.matches("[-_\\w\\.:]+")) {
                // Not just word chars, dots, colons, dashes and
                // underscores; throw away
                key = null;
            }
        }
        if (key != null &&
                cauri.getUURI().getScheme().equals(UURIFactory.HTTPS)) {
            // If https and no port specified, add the default https port to
            // distinguish the https server from an http server without a port.
            if (!key.matches(".+:[0-9]+")) {
                key += ":" + UURIFactory.HTTPS_PORT;
            }
        }
        return key;
    }
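
    // Illustrative key derivations (not from the original source; assumes
    // UURIFactory.HTTPS_PORT is the standard 443):
    //   http://example.com/page      -> "example.com"
    //   http://example.com:8080/page -> "example.com:8080"
    //   https://example.com/page     -> "example.com:443"
    //   dns:example.com              -> "example.com" (via the hier-path fallback)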

    /* (non-Javadoc)
     * @see org.archive.crawler.datamodel.CrawlSubstats.HasCrawlSubstats#getSubstats()
     */
    public CrawlSubstats getSubstats() {
        return substats;
    }
}