1 package net.javacoding.jspider.core.util.html; 2 3 import java.io.*; 4 import java.util.ArrayList ; 5 6 9 public class RobotsTXTLineSet { 10 11 public static final String USER_AGENT="user-agent:"; 12 13 protected String userAgent; 14 protected RobotsTXTLine[] lines; 15 16 RobotsTXTLineSet(String userAgent, RobotsTXTLine[] lines) { 17 this.userAgent = userAgent; 18 this.lines = lines; 19 } 20 21 public String getUserAgent ( ) { 22 return userAgent; 23 } 24 25 public RobotsTXTLine[] getLines ( ) { 26 return lines; 27 } 28 29 public static RobotsTXTLineSet findLineSet ( InputStream is, String spiderUserAgent) throws IOException { 30 BufferedReader br = new BufferedReader(new InputStreamReader(is)); 31 return findLineSet(br, spiderUserAgent); 32 } 33 34 public static RobotsTXTLineSet findLineSet(BufferedReader br, String spiderUserAgent) throws IOException { 35 String userAgent = findUserAgent ( br, spiderUserAgent ); 36 if ( userAgent == null ) { 37 return null; 38 } else { 39 RobotsTXTLine[] lines = parseRules ( br ); 40 return new RobotsTXTLineSet(userAgent, lines); 41 } 42 } 43 44 private static String findUserAgent ( BufferedReader br, String spiderUserAgent ) throws IOException { 45 if ( spiderUserAgent == null ) { 46 spiderUserAgent = ""; 47 } 48 String spiderUserAgentLowerCase = spiderUserAgent.toLowerCase(); 49 String line = br.readLine(); 50 while (line != null) { 51 line = line.trim(); 52 if (line.toLowerCase().startsWith(USER_AGENT)) { 53 String userAgent = line.substring(USER_AGENT.length() + 1).trim(); 54 if (userAgent.equals("*") || spiderUserAgentLowerCase.indexOf(userAgent.toLowerCase()) > -1) { 55 return userAgent; 56 } 57 } 58 line = br.readLine(); 59 } 60 return null; 61 } 62 63 private static RobotsTXTLine[] parseRules ( BufferedReader br ) throws IOException { 64 ArrayList al = new ArrayList (); 65 String line = br.readLine(); 66 while (line != null && (line.toLowerCase().indexOf(USER_AGENT) == -1)) { 67 RobotsTXTLine robotsTXTline = RobotsTXTLine.parse(line); 68 if ( robotsTXTline != null && robotsTXTline.getType() == RobotsTXTLine.ROBOTSTXT_RULE_DISALLOW ) { 69 al.add ( robotsTXTline ); 70 } 71 line = br.readLine(); 72 } 73 return (RobotsTXTLine[]) al.toArray(new RobotsTXTLine[al.size()]); 74 } 75 76 } 77 | Popular Tags |