KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > websphinx > RobotExclusion


/*
 * WebSphinx web-crawling toolkit
 *
 * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

package websphinx;

import java.io.BufferedInputStream;
import java.io.PushbackInputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Vector;

42 public class RobotExclusion {
43
44     String JavaDoc myUserAgent;
45         // user agent (name) of this crawler, all lower-case
46
Hashtable JavaDoc entries = new Hashtable JavaDoc ();
47         // maps from a website ("host:port") to String[]
48

49     /**
50      * Make a RobotExclusion object.
51      * @param userAgent name of the robot using this object, as shown
52      * in the User-Agent header fields of its HTTP requests. Use
53      * null for anonymous robots.
54      */

55     public RobotExclusion (String JavaDoc userAgent) {
56         myUserAgent = userAgent.toLowerCase ();
57     }
58
59     /**
60      * Check whether a URL is disallowed by robots.txt.
61      * @param url URL to test
62      * @return true if url's Web site denies robot access to the url
63      */

64     public boolean disallowed (URL JavaDoc url) {
65         if (!url.getProtocol().startsWith ("http"))
66             // only HTTP URLs are protected by robots.txt
67
return false;
68
69         String JavaDoc website = getWebSite (url);
70         String JavaDoc[] rules = (String JavaDoc[])entries.get (website);
71
72         if (rules == null) {
73             rules = getRobotsTxt (website, myUserAgent);
74             entries.put (website, rules);
75         }
76
77         String JavaDoc path = url.getFile ();
78         for (int i=0; i<rules.length; ++i) {
79             if (path.startsWith (rules[i])) {
80                 //System.err.println ("disallowed by rule " + rules[i]);
81
return true;
82             }
83             //System.err.println ("allowed by rule " + rules[i]);
84
}
85         return false;
86     }
87
88     /**
89      * Clear the cache of robots.txt entries.
90      */

91     public void clear () {
92         entries.clear ();
93     }
94
95     /*
96      * Implementation
97      *
98      */

99
100     String JavaDoc getWebSite (URL JavaDoc url) {
101         String JavaDoc hostname = url.getHost ();
102         int port = url.getPort ();
103         return port != -1 ? hostname + ":" + port : hostname;
104     }
105
106     Vector JavaDoc rulebuf = new Vector JavaDoc ();
107     String JavaDoc[] getRobotsTxt (String JavaDoc website, String JavaDoc userAgent) {
108         try {
109             URL JavaDoc robotstxtURL = new URL JavaDoc ("http://" + website + "/robots.txt");
110             URLConnection JavaDoc uc = Access.getAccess ().openConnection (robotstxtURL);
111             PushbackInputStream JavaDoc in = new PushbackInputStream JavaDoc (new BufferedInputStream JavaDoc (uc.getInputStream ()));
112
113             rulebuf.setSize (0);
114
115             boolean relevant = false, specific = false;
116             String JavaDoc lastFieldName = null;
117             while (readField (in)) {
118                 //System.err.println (fieldName + ":" + fieldValue);
119

120                 if (fieldName == null) { // end of record
121
if (specific)
122                         break; // while loop
123
relevant = false;
124                 }
125                 else if (fieldName.equals ("user-agent")) {
126                     if (lastFieldName != null && lastFieldName.equals ("disallow")) {
127                         // end of record
128
if (specific)
129                             break; // while loop
130
relevant = false;
131                     }
132
133                     if (userAgent != null && userAgent.indexOf (fieldValue.toLowerCase()) != -1) {
134                         relevant = true;
135                         specific = true;
136                         rulebuf.setSize (0);
137                     }
138                     else if (fieldValue.equals ("*")) {
139                         relevant = true;
140                         rulebuf.setSize (0);
141                     }
142                 }
143                 else if (relevant && fieldName.equals ("disallow")) {
144                     rulebuf.addElement (fieldValue);
145                 }
146                 else { // end of record
147
if (specific)
148                         break; // while loop
149
relevant = false;
150                 }
151                 lastFieldName = fieldName;
152             }
153
154             in.close ();
155
156             String JavaDoc[] rules = new String JavaDoc[rulebuf.size ()];
157             rulebuf.copyInto (rules);
158             return rules;
159         } catch (Exception JavaDoc e) {
160             // debugging only
161
// System.err.println ("RobotExclusion: error while retrieving " + website + "/robots.txt:");
162
// e.printStackTrace ();
163
return new String JavaDoc[0];
164         }
165     }
166
167     String JavaDoc fieldName, fieldValue;
168     static final int MAX_LINE_LENGTH = 1024;
169     StringBuffer JavaDoc linebuf = new StringBuffer JavaDoc ();
170
171     // Reads one line from the input stream, parsing it into
172
// fieldName and fieldValue. Field name is lower case;
173
// whitespace is stripped at both ends of name and value.
174
// e.g., User-agent: Webcrawler
175
// is parsed into fieldName="user-agent" and fieldValue="Webcrawler".
176
// Field-less lines are parsed as fieldName=null and fieldValue=null.
177
// Returns true if a line was read, false on end-of-file.
178
boolean readField (PushbackInputStream JavaDoc in) throws Exception JavaDoc {
179         fieldName = null;
180         fieldValue = null;
181         linebuf.setLength (0);
182
183         int c;
184         int n = 0;
185         boolean saw_eoln = false;
186         while (true) {
187             c = in.read ();
188             if (c == -1)
189                 break;
190             else if (c == '\r' || c == '\n')
191                 saw_eoln = true;
192             else if (saw_eoln) {
193                 in.unread (c);
194                 break;
195             }
196             else {
197                 linebuf.append ((char)c);
198             }
199
200             ++n;
201             if (n == MAX_LINE_LENGTH)
202                 break;
203         }
204
205         //System.err.println (linebuf);
206

207         if (n == 0)
208             return false;
209
210         // extract fields from line and return
211
String JavaDoc line = linebuf.toString ();
212         int colon = line.indexOf (':');
213         if (colon == -1) {
214             fieldName = null;
215             fieldValue = null;
216         }
217         else {
218             fieldName = line.substring (0, colon).trim ().toLowerCase ();
219             fieldValue = line.substring (colon+1).trim ();
220         }
221         return true;
222     }
223
224     public static void main (String JavaDoc argv[]) throws Exception JavaDoc {
225         RobotExclusion robot = new RobotExclusion (argv[0]);
226
227         for (int i=1; i<argv.length; ++i) {
228             System.out.println (argv[i] + ": "
229                     + (!robot.disallowed (new URL JavaDoc (argv[i])) ? "OK" : "disallowed"));
230         }
231         System.in.read ();
232     }
233 }
234
Popular Tags