KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > datamodel > RobotsExclusionPolicy


1 /* Copyright (C) 2003 Internet Archive.
2  *
3  * This file is part of the Heritrix web crawler (crawler.archive.org).
4  *
5  * Heritrix is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser Public License as published by
7  * the Free Software Foundation; either version 2.1 of the License, or
8  * any later version.
9  *
10  * Heritrix is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser Public License
16  * along with Heritrix; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  *
19  * RobotsExclusionPolicy.java
20  * Created on Apr 17, 2003
21  *
22  * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/datamodel/RobotsExclusionPolicy.java,v 1.17.4.1 2007/01/13 01:31:09 stack-sf Exp $
23  */

24 package org.archive.crawler.datamodel;
25
26 import java.io.BufferedReader JavaDoc;
27 import java.io.IOException JavaDoc;
28 import java.io.ObjectInputStream JavaDoc;
29 import java.io.ObjectOutputStream JavaDoc;
30 import java.io.Serializable JavaDoc;
31 import java.util.ArrayList JavaDoc;
32 import java.util.HashMap JavaDoc;
33 import java.util.Iterator JavaDoc;
34 import java.util.LinkedList JavaDoc;
35 import java.util.List JavaDoc;
36 import java.util.logging.Level JavaDoc;
37 import java.util.logging.Logger JavaDoc;
38
39 import org.apache.commons.httpclient.URIException;
40 import org.archive.crawler.settings.CrawlerSettings;
41
42 /**
43  * RobotsExclusionPolicy represents the actual policy adopted with
44  * respect to a specific remote server, usually constructed from
45  * consulting the robots.txt, if any, the server provided.
46  *
47  * (The similarly named RobotsHonoringPolicy, on the other hand,
48  * describes the strategy used by the crawler to determine to what
49  * extent it respects exclusion rules.)
50  *
51  * The expiration of policies after a suitable amount of time has
52  * elapsed since last fetch is handled outside this class, in
53  * CrawlServer itself.
54  *
55  * @author gojomo
56  *
57  */

58 public class RobotsExclusionPolicy implements Serializable JavaDoc {
59
60     private static final long serialVersionUID = 6323907991237383113L;
61
62     private static final Logger JavaDoc logger =
63         Logger.getLogger(RobotsExclusionPolicy.class.getName());
64
65     private final static int NORMAL_TYPE = 0;
66     private final static int ALLOWALL_TYPE = 1;
67     private final static int DENYALL_TYPE = 2;
68     private transient int type = NORMAL_TYPE;
69
70     public static RobotsExclusionPolicy ALLOWALL =
71         new RobotsExclusionPolicy(ALLOWALL_TYPE);
72     public static RobotsExclusionPolicy DENYALL =
73         new RobotsExclusionPolicy(DENYALL_TYPE);
74
75     private LinkedList JavaDoc<String JavaDoc> userAgents = null;
76     private HashMap JavaDoc<String JavaDoc,List JavaDoc<String JavaDoc>> disallows = null;
77     transient RobotsHonoringPolicy honoringPolicy = null;
78
79     private String JavaDoc lastUsedUserAgent = null;
80     private List JavaDoc<String JavaDoc> userAgentsToTest = null;
81
82     /**
83      * @param settings
84      * @param reader
85      * @param honoringPolicy
86      * @return Robot exclusion policy.
87      * @throws IOException
88      */

89     public static RobotsExclusionPolicy policyFor(CrawlerSettings settings,
90             BufferedReader JavaDoc reader, RobotsHonoringPolicy honoringPolicy)
91     throws IOException JavaDoc {
92         LinkedList JavaDoc<String JavaDoc> userAgents = new LinkedList JavaDoc<String JavaDoc>();
93         HashMap JavaDoc<String JavaDoc,List JavaDoc<String JavaDoc>> disallows
94          = new HashMap JavaDoc<String JavaDoc,List JavaDoc<String JavaDoc>>();
95         Robotstxt.parse(reader, userAgents, disallows);
96         return (disallows.isEmpty())?
97             ALLOWALL:
98             new RobotsExclusionPolicy(settings, userAgents, disallows,
99                 honoringPolicy);
100     }
101
102
103
104     /**
105      * @param settings
106      * @param u
107      * @param d
108      * @param honoringPolicy
109      */

110     public RobotsExclusionPolicy(CrawlerSettings settings, LinkedList JavaDoc<String JavaDoc> u,
111             HashMap JavaDoc<String JavaDoc,List JavaDoc<String JavaDoc>> d,
112             RobotsHonoringPolicy honoringPolicy) {
113         userAgents = u;
114         disallows = d;
115         this.honoringPolicy = honoringPolicy;
116
117         if(honoringPolicy == null) return;
118
119         // If honoring policy is most favored user agent, all rules should be checked
120
if(honoringPolicy.isType(settings, RobotsHonoringPolicy.MOST_FAVORED)) {
121             userAgentsToTest = userAgents;
122
123         // IF honoring policy is most favored of set, then make a list with only the set as members
124
} else if(honoringPolicy.isType(settings, RobotsHonoringPolicy.MOST_FAVORED_SET)) {
125             userAgentsToTest = new ArrayList JavaDoc<String JavaDoc>();
126             Iterator JavaDoc userAgentSet = honoringPolicy.getUserAgents(settings).iterator();
127             while(userAgentSet.hasNext()) {
128                 String JavaDoc userAgent = (String JavaDoc) userAgentSet.next();
129
130                 Iterator JavaDoc iter = userAgents.iterator();
131                 while ( iter.hasNext() ) {
132                     String JavaDoc ua = (String JavaDoc)iter.next();
133                     if (userAgent.indexOf(ua)>-1) {
134                         userAgentsToTest.add(ua);
135                         break;
136                     }
137                 }
138             }
139         }
140     }
141
142     public RobotsExclusionPolicy(int type) {
143         this(null, null, null, null);
144         this.type = type;
145     }
146
147     public boolean disallows(CrawlURI curi, String JavaDoc userAgent) {
148         if (this == ALLOWALL)
149             return false;
150         if (this == DENYALL)
151             return true;
152
153         // In the common case with policy=Classic, the useragent is remembered from uri to uri on
154
// the same server
155
if((honoringPolicy.isType(curi, RobotsHonoringPolicy.CLASSIC)
156                 || honoringPolicy.isType(curi, RobotsHonoringPolicy.CUSTOM))
157             && (lastUsedUserAgent == null
158             || !lastUsedUserAgent.equals(userAgent))) {
159
160             lastUsedUserAgent = userAgent;
161             userAgentsToTest = new ArrayList JavaDoc<String JavaDoc>();
162             Iterator JavaDoc iter = userAgents.iterator();
163             String JavaDoc lowerCaseUserAgent = userAgent.toLowerCase();
164             while ( iter.hasNext() ) {
165                 String JavaDoc ua = (String JavaDoc)iter.next();
166                 // ua in below is already lowercase. See Robotstxt.java line 60.
167
if (lowerCaseUserAgent.indexOf(ua)>-1) {
168                     userAgentsToTest.add(ua);
169                     break; // consider no more sections
170
}
171             }
172         }
173
174         boolean disallow = false;
175         boolean examined = false;
176         String JavaDoc ua = null;
177
178         // Go thru list of all user agents we might act as
179
Iterator JavaDoc uas = userAgentsToTest.iterator();
180         while(uas.hasNext() && examined == false) {
181             disallow = false;
182             ua = (String JavaDoc) uas.next();
183             Iterator JavaDoc dis = ((List JavaDoc) disallows.get(ua)).iterator();
184
185             // Check if the current user agent is allowed to crawl
186
while(dis.hasNext() && examined == false && disallow == false) {
187                 String JavaDoc disallowedPath = (String JavaDoc) dis.next();
188                 if(disallowedPath.length() == 0) {
189                     // blanket allow
190
examined = true;
191                     disallow = false;
192                     break;
193                 }
194                 try {
195                     String JavaDoc p = curi.getUURI().getPathQuery();
196                     if (p != null && p.startsWith(disallowedPath) ) {
197                         // the user agent tested isn't allowed to get this uri
198
disallow = true;
199                     }
200                 }
201                 catch (URIException e) {
202                     logger.log(Level.SEVERE,"Failed getPathQuery from " + curi, e);
203                 }
204             }
205             if(disallow == false) {
206                 // the user agent tested is allowed
207
examined = true;
208             }
209         }
210
211         // Are we supposed to masquerade as the user agent to which restrictions
212
// we follow?
213
if(honoringPolicy.shouldMasquerade(curi) && ua != null && !ua.equals("")) {
214             curi.setUserAgent(ua);
215         }
216         return disallow;
217     }
218
219     // Methods for object serialization.
220

221     /** If object is DENYALL or ALLOWALL, only the object identity and type
222      * is written in the serialization stream.
223      *
224      * @param stream the serialization stream.
225      * @throws IOException
226      */

227     private void writeObject(ObjectOutputStream JavaDoc stream) throws IOException JavaDoc {
228         stream.writeInt(type);
229         if (type == NORMAL_TYPE) {
230             stream.defaultWriteObject();
231         }
232     }
233
234     /** If object is DENYALL or ALLOWALL, only the object identity and type
235      * is read from the serialization stream.
236      *
237      * @param stream the serialization stream.
238      * @throws IOException
239      * @throws ClassNotFoundException
240      */

241     private void readObject(ObjectInputStream JavaDoc stream)
242             throws IOException JavaDoc, ClassNotFoundException JavaDoc {
243         type = stream.readInt();
244         if (type == NORMAL_TYPE) {
245             stream.defaultReadObject();
246         }
247     }
248
249     /** If object is DENYALL or ALLOWALL, the object is replaced by constants
250      * so that check for object equality works.
251      * @return Object.
252      */

253     private Object JavaDoc readResolve() {
254         if (type == NORMAL_TYPE) {
255             return this;
256         } else if (type == ALLOWALL_TYPE) {
257             return ALLOWALL;
258         } else if (type == DENYALL_TYPE) {
259             return DENYALL;
260         }
261         return null;
262     }
263
264 }
265
Popular Tags