1 32 33 package websphinx; 34 35 import java.net.URL ; 36 import java.net.URLConnection ; 37 import java.util.Hashtable ; 38 import java.io.PushbackInputStream ; 39 import java.io.BufferedInputStream ; 40 import java.util.Vector ; 41 42 public class RobotExclusion { 43 44 String myUserAgent; 45 Hashtable entries = new Hashtable (); 47 49 55 public RobotExclusion (String userAgent) { 56 myUserAgent = userAgent.toLowerCase (); 57 } 58 59 64 public boolean disallowed (URL url) { 65 if (!url.getProtocol().startsWith ("http")) 66 return false; 68 69 String website = getWebSite (url); 70 String [] rules = (String [])entries.get (website); 71 72 if (rules == null) { 73 rules = getRobotsTxt (website, myUserAgent); 74 entries.put (website, rules); 75 } 76 77 String path = url.getFile (); 78 for (int i=0; i<rules.length; ++i) { 79 if (path.startsWith (rules[i])) { 80 return true; 82 } 83 } 85 return false; 86 } 87 88 91 public void clear () { 92 entries.clear (); 93 } 94 95 99 100 String getWebSite (URL url) { 101 String hostname = url.getHost (); 102 int port = url.getPort (); 103 return port != -1 ? hostname + ":" + port : hostname; 104 } 105 106 Vector rulebuf = new Vector (); 107 String [] getRobotsTxt (String website, String userAgent) { 108 try { 109 URL robotstxtURL = new URL ("http://" + website + "/robots.txt"); 110 URLConnection uc = Access.getAccess ().openConnection (robotstxtURL); 111 PushbackInputStream in = new PushbackInputStream (new BufferedInputStream (uc.getInputStream ())); 112 113 rulebuf.setSize (0); 114 115 boolean relevant = false, specific = false; 116 String lastFieldName = null; 117 while (readField (in)) { 118 120 if (fieldName == null) { if (specific) 122 break; relevant = false; 124 } 125 else if (fieldName.equals ("user-agent")) { 126 if (lastFieldName != null && lastFieldName.equals ("disallow")) { 127 if (specific) 129 break; relevant = false; 131 } 132 133 if (userAgent != null && userAgent.indexOf (fieldValue.toLowerCase()) != -1) { 134 relevant = true; 135 specific = true; 136 rulebuf.setSize (0); 137 } 138 else if (fieldValue.equals ("*")) { 139 relevant = true; 140 rulebuf.setSize (0); 141 } 142 } 143 else if (relevant && fieldName.equals ("disallow")) { 144 rulebuf.addElement (fieldValue); 145 } 146 else { if (specific) 148 break; relevant = false; 150 } 151 lastFieldName = fieldName; 152 } 153 154 in.close (); 155 156 String [] rules = new String [rulebuf.size ()]; 157 rulebuf.copyInto (rules); 158 return rules; 159 } catch (Exception e) { 160 return new String [0]; 164 } 165 } 166 167 String fieldName, fieldValue; 168 static final int MAX_LINE_LENGTH = 1024; 169 StringBuffer linebuf = new StringBuffer (); 170 171 boolean readField (PushbackInputStream in) throws Exception { 179 fieldName = null; 180 fieldValue = null; 181 linebuf.setLength (0); 182 183 int c; 184 int n = 0; 185 boolean saw_eoln = false; 186 while (true) { 187 c = in.read (); 188 if (c == -1) 189 break; 190 else if (c == '\r' || c == '\n') 191 saw_eoln = true; 192 else if (saw_eoln) { 193 in.unread (c); 194 break; 195 } 196 else { 197 linebuf.append ((char)c); 198 } 199 200 ++n; 201 if (n == MAX_LINE_LENGTH) 202 break; 203 } 204 205 207 if (n == 0) 208 return false; 209 210 String line = linebuf.toString (); 212 int colon = line.indexOf (':'); 213 if (colon == -1) { 214 fieldName = null; 215 fieldValue = null; 216 } 217 else { 218 fieldName = line.substring (0, colon).trim ().toLowerCase (); 219 fieldValue = line.substring (colon+1).trim (); 220 } 221 return true; 222 } 223 224 public static void main (String argv[]) throws Exception { 225 RobotExclusion robot = new RobotExclusion (argv[0]); 226 227 for (int i=1; i<argv.length; ++i) { 228 System.out.println (argv[i] + ": " 229 + (!robot.disallowed (new URL (argv[i])) ? "OK" : "disallowed")); 230 } 231 System.in.read (); 232 } 233 } 234 | Popular Tags |