import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Minimal robots.txt parser. Only {@code User-agent} and {@code Disallow}
 * lines are interpreted; every other line is skipped without comment.
 */
public class Robotstxt {
    /** Whole-line, case-insensitive match for a {@code User-agent:} line. */
    private static final Pattern USER_AGENT_LINE =
        Pattern.compile("(?i)^User-agent:.*");

    /** Whole-line, case-insensitive match for a {@code Disallow:} line. */
    private static final Pattern DISALLOW_LINE =
        Pattern.compile("(?i)Disallow:.*");

    /** Offset of the value on a {@code User-agent:} line (11). */
    private static final int USER_AGENT_VALUE_START = "User-agent:".length();

    /** Offset of the value on a {@code Disallow:} line (9). */
    private static final int DISALLOW_VALUE_START = "Disallow:".length();

    /**
     * Reads robots.txt content line by line and fills the two supplied
     * collections.
     *
     * <p>Blank lines and lines starting with {@code #} are skipped, and
     * anything from the first {@code #} on a line is discarded. Consecutive
     * {@code User-agent} lines share one disallow list; a new list is started
     * when a {@code User-agent} line follows a list that already has entries.
     * The agent {@code *} is stored under the empty string and is appended to
     * {@code userAgents} after all named agents. Agent names are lower-cased
     * with the JVM default locale. A later {@code User-agent} line with an
     * already-seen name replaces that name's mapping in {@code disallows}.
     *
     * <p>The reader is closed before this method returns, whether parsing
     * finished normally or a read failed.
     *
     * @param reader source of the robots.txt text; when {@code null} nothing
     *        is read and {@code false} is returned
     * @param userAgents receives agent names in order of appearance, with the
     *        catch-all entry ({@code ""}) last when present
     * @param disallows receives, per agent name, the list of disallowed path
     *        prefixes exactly as written (an empty {@code Disallow:} yields
     *        {@code ""})
     * @return {@code true} if a {@code Disallow} line appeared before any
     *         {@code User-agent} line, {@code false} otherwise
     * @throws IOException if reading from or closing {@code reader} fails
     */
    public static boolean parse(BufferedReader reader,
            final LinkedList<String> userAgents,
            final Map<String, List<String>> disallows)
            throws IOException {
        if (reader == null) {
            return false;
        }

        boolean hasErrors = false;
        ArrayList<String> current = null;
        String catchall = null;

        try {
            String read;
            while ((read = reader.readLine()) != null) {
                read = read.trim();
                if (read.length() == 0 || read.startsWith("#")) {
                    continue;
                }

                // Drop a trailing comment, then any whitespace before it.
                int commentIndex = read.indexOf('#');
                if (commentIndex > -1) {
                    read = read.substring(0, commentIndex);
                }
                read = read.trim();

                if (USER_AGENT_LINE.matcher(read).matches()) {
                    String ua = read.substring(USER_AGENT_VALUE_START)
                        .trim().toLowerCase();
                    // Agents listed back to back share one rule list; only
                    // start a fresh list once the current one has entries.
                    if (current == null || !current.isEmpty()) {
                        current = new ArrayList<String>();
                    }
                    if (ua.equals("*")) {
                        ua = "";
                        catchall = ua;
                    } else {
                        userAgents.addLast(ua);
                    }
                    disallows.put(ua, current);
                } else if (DISALLOW_LINE.matcher(read).matches()) {
                    if (current == null) {
                        // Rule with no preceding User-agent line.
                        hasErrors = true;
                    } else {
                        current.add(
                            read.substring(DISALLOW_VALUE_START).trim());
                    }
                }
            }
        } finally {
            // Previously the reader was closed only after a clean EOF, so a
            // failing readLine() leaked it.
            reader.close();
        }

        if (catchall != null) {
            userAgents.addLast(catchall);
        }
        return hasErrors;
    }

    /**
     * Empty entry point; performs no work.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
    }
}