package net.nutch.protocol.http;

import java.io.*;
import java.net.URL;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.logging.Level;
import java.util.logging.Logger;

import net.nutch.util.LogFormatter;
import net.nutch.util.NutchConf;

import net.nutch.protocol.*;

/**
 * HTTP implementation of {@link Protocol}.
 *
 * <p>{@link #getContent(String)} checks the URL against
 * <code>RobotRulesParser</code>, waits until the target host may be
 * contacted, issues the request, and follows up to
 * <code>http.redirect.max</code> redirects.
 *
 * <p>Access to a host is throttled per resolved {@link InetAddress}: at most
 * <code>fetcher.threads.per.host</code> requests may be outstanding at once,
 * and after the last outstanding request finishes the address stays blocked
 * for <code>fetcher.server.delay</code> seconds.  The three bookkeeping
 * structures below are only ever touched while holding the monitor of
 * <code>BLOCKED_ADDR_TO_TIME</code>.
 */
public class Http implements Protocol {

  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.net.Http");

  static {
    if (NutchConf.getBoolean("http.verbose", false))
      LOG.setLevel(Level.FINE);
  }

  static final int BUFFER_SIZE = 8 * 1024;

  /** Maximum number of redirects followed for a single request. */
  private static final int MAX_REDIRECTS =
    NutchConf.getInt("http.redirect.max", 3);

  static String PROXY_HOST = NutchConf.get("http.proxy.host");
  static int PROXY_PORT = NutchConf.getInt("http.proxy.port", 8080);
  /** True when a non-empty proxy host is configured. */
  static boolean PROXY = (PROXY_HOST != null && PROXY_HOST.length() > 0);

  /** Value of <code>http.timeout</code>; <code>main</code> overrides it
   * with its <code>-timeout</code> argument multiplied by 1000. */
  static int TIMEOUT = NutchConf.getInt("http.timeout", 10000);
  static int MAX_CONTENT = NutchConf.getInt("http.content.limit", 64 * 1024);

  /** Number of times a thread will sleep waiting for a blocked host before
   * giving up with {@link RetryLater}. */
  static int MAX_DELAYS = NutchConf.getInt("http.max.delays", 3);
  static int MAX_THREADS_PER_HOST =
    NutchConf.getInt("fetcher.threads.per.host", 1);

  // Must be initialised after LOG: getAgentString() logs through it.
  static String AGENT_STRING = getAgentString();

  /** <code>fetcher.server.delay</code> (seconds) converted to
   * milliseconds. */
  static long SERVER_DELAY =
    (long) (NutchConf.getFloat("fetcher.server.delay", 1.0f) * 1000);

  static {
    LOG.info("http.proxy.host = " + PROXY_HOST);
    LOG.info("http.proxy.port = " + PROXY_PORT);

    LOG.info("http.timeout = " + TIMEOUT);
    LOG.info("http.content.limit = " + MAX_CONTENT);
    LOG.info("http.agent = " + AGENT_STRING);

    LOG.info("fetcher.server.delay = " + SERVER_DELAY);
    LOG.info("http.max.delays = " + MAX_DELAYS);
  }

  /**
   * Maps a blocked {@link InetAddress} to a {@link Long}.  The value is
   * <code>0</code> while the address has reached MAX_THREADS_PER_HOST
   * outstanding requests; otherwise it is the time (milliseconds since the
   * epoch) at which the block expires.  Also serves as the lock guarding
   * all three structures.
   */
  private static HashMap BLOCKED_ADDR_TO_TIME = new HashMap();

  /** Maps an {@link InetAddress} to the {@link Integer} number of requests
   * currently outstanding against it. */
  private static HashMap THREADS_PER_HOST_COUNT = new HashMap();

  /** Addresses with a timed block, most recently blocked first; the entry
   * that expires soonest is therefore at the tail. */
  private static LinkedList BLOCKED_ADDR_QUEUE = new LinkedList();

  private RobotRulesParser robotRules = new RobotRulesParser();

  /**
   * Resolves the host of <code>url</code> and waits until a request may be
   * sent to it, then registers the calling thread as an outstanding request
   * for that address.  Every successful call must be paired with
   * {@link #unblockAddr(InetAddress)}.
   *
   * @param url the URL about to be fetched
   * @return the resolved address, to be handed to <code>unblockAddr</code>
   * @throws HttpException if the host name cannot be resolved
   * @throws RetryLater if the address is still blocked after MAX_DELAYS
   *         waits, or if the thread is interrupted while waiting
   */
  private static InetAddress blockAddr(URL url) throws ProtocolException {
    InetAddress addr;
    try {
      addr = InetAddress.getByName(url.getHost());
    } catch (UnknownHostException e) {
      throw new HttpException(e);
    }

    int delays = 0;
    while (true) {
      cleanExpiredServerBlocks();

      Long time;
      synchronized (BLOCKED_ADDR_TO_TIME) {
        time = (Long) BLOCKED_ADDR_TO_TIME.get(addr);
        if (time == null) {                       // address is not blocked
          Integer counter = (Integer) THREADS_PER_HOST_COUNT.get(addr);
          int count = (counter == null) ? 0 : counter.intValue();

          count++;
          THREADS_PER_HOST_COUNT.put(addr, new Integer(count));

          if (count >= MAX_THREADS_PER_HOST) {
            // Limit reached: block the address with the "in use" marker.
            BLOCKED_ADDR_TO_TIME.put(addr, new Long(0));
          }
          return addr;
        }
      }

      if (delays == MAX_DELAYS)
        throw new RetryLater(url, "Exceeded http.max.delays: retry later.");

      long done = time.longValue();
      long now = System.currentTimeMillis();
      long sleep = 0;
      if (done == 0) {
        // In use by other threads: no expiry known, wait one full delay.
        sleep = SERVER_DELAY;
      } else if (now < done) {
        // Timed block: wait exactly until it expires.
        sleep = done - now;
      }

      try {
        Thread.sleep(sleep);
      } catch (InterruptedException e) {
        // Do not swallow the interrupt: restore the flag for the caller and
        // stop waiting instead of silently starting another sleep.
        Thread.currentThread().interrupt();
        throw new RetryLater(url,
                             "Interrupted while waiting for host: retry later.");
      }
      delays++;
    }
  }

  /**
   * Removes every timed block that has expired.  Scans from the tail of
   * BLOCKED_ADDR_QUEUE and stops at the first entry that is still in the
   * future.
   */
  private static void cleanExpiredServerBlocks() {
    synchronized (BLOCKED_ADDR_TO_TIME) {
      while (!BLOCKED_ADDR_QUEUE.isEmpty()) {
        InetAddress addr = (InetAddress) BLOCKED_ADDR_QUEUE.getLast();
        long time = ((Long) BLOCKED_ADDR_TO_TIME.get(addr)).longValue();
        if (time <= System.currentTimeMillis()) {
          BLOCKED_ADDR_TO_TIME.remove(addr);
          BLOCKED_ADDR_QUEUE.removeLast();
        } else {
          break;
        }
      }
    }
  }

  /**
   * Releases one outstanding request previously registered by
   * {@link #blockAddr(URL)}.  When it was the last one, the address is
   * blocked until SERVER_DELAY milliseconds from now and queued for expiry;
   * otherwise only the outstanding-request counter is decremented.
   *
   * @param addr the address returned by the matching <code>blockAddr</code>
   */
  private static void unblockAddr(InetAddress addr) {
    synchronized (BLOCKED_ADDR_TO_TIME) {
      int addrCount = ((Integer) THREADS_PER_HOST_COUNT.get(addr)).intValue();
      if (addrCount == 1) {
        THREADS_PER_HOST_COUNT.remove(addr);
        BLOCKED_ADDR_QUEUE.addFirst(addr);
        BLOCKED_ADDR_TO_TIME.put
          (addr, new Long(System.currentTimeMillis() + SERVER_DELAY));
      } else {
        THREADS_PER_HOST_COUNT.put(addr, new Integer(addrCount - 1));
      }
    }
  }

  /**
   * Fetches <code>urlString</code>, following redirects.
   *
   * @param urlString the URL to fetch
   * @return the content of the first response with status 200
   * @throws ResourceGone if the URL is disallowed by
   *         <code>RobotRulesParser</code> or the server answers 410
   * @throws RetryLater if the host stays blocked (see <code>blockAddr</code>)
   * @throws HttpError for any other non-redirect status code
   * @throws HttpException on I/O failure, an unresolvable host, a redirect
   *         without a <code>Location</code> header, or more than
   *         <code>http.redirect.max</code> redirects
   */
  public Content getContent(String urlString) throws ProtocolException {
    try {
      URL url = new URL(urlString);

      int redirects = 0;
      while (true) {

        // Re-checked on every iteration, so redirect targets are covered too.
        if (!RobotRulesParser.isAllowed(url))
          throw new ResourceGone(url, "Blocked by robots.txt");

        InetAddress addr = blockAddr(url);
        HttpResponse response;
        try {
          // Receives both the originally requested URL string and the URL
          // actually being fetched on this iteration.
          response = new HttpResponse(urlString, url);
        } finally {
          unblockAddr(addr);
        }

        int code = response.getCode();

        if (code == 200) {
          return response.toContent();

        } else if (code == 410) {
          throw new ResourceGone(url, "Http: " + code);

        } else if (code >= 300 && code < 400) {
          if (redirects == MAX_REDIRECTS)
            throw new HttpException("Too many redirects: " + urlString);

          String location = response.getHeader("Location");
          if (location == null) {
            // new URL(url, null) would fail with a message-less
            // MalformedURLException; report the real cause instead.
            throw new HttpException("Redirect (" + code
                                    + ") without Location header: " + url);
          }
          url = new URL(url, location);
          redirects++;
          LOG.fine("redirect to " + url);

        } else {
          throw new HttpError(code);
        }
      }
    } catch (IOException e) {
      throw new HttpException(e);
    }
  }

  /** Returns true when <code>s</code> is neither null nor empty. */
  private static boolean hasText(String s) {
    return (s != null) && (s.length() != 0);
  }

  /**
   * Builds the agent string from the <code>http.agent.*</code> properties,
   * in the form <code>name/version (description; url; email)</code>.  The
   * version and each parenthesised part are omitted when not configured;
   * the parentheses are omitted when all three parts are missing or empty.
   *
   * <p>A missing or blank <code>http.agent.name</code> is logged as severe
   * but not otherwise handled: a null name is still appended and shows up
   * as the text <code>null</code>.
   *
   * @return the assembled agent string
   */
  private static String getAgentString() {
    String agentName = NutchConf.get("http.agent.name");
    String agentVersion = NutchConf.get("http.agent.version");
    String agentDesc = NutchConf.get("http.agent.description");
    String agentURL = NutchConf.get("http.agent.url");
    String agentEmail = NutchConf.get("http.agent.email");

    if ((agentName == null) || (agentName.trim().length() == 0))
      LOG.severe("No User-Agent string set (http.agent.name)!");

    StringBuffer buf = new StringBuffer();

    buf.append(agentName);
    if (agentVersion != null) {
      buf.append("/");
      buf.append(agentVersion);
    }

    // Decide presence once so that the separators agree with what is
    // actually appended; an empty value must not leave a dangling "; ".
    boolean hasDesc = hasText(agentDesc);
    boolean hasURL = hasText(agentURL);
    boolean hasEmail = hasText(agentEmail);

    if (hasDesc || hasURL || hasEmail) {
      buf.append(" (");

      if (hasDesc) {
        buf.append(agentDesc);
        if (hasURL || hasEmail)
          buf.append("; ");
      }

      if (hasURL) {
        buf.append(agentURL);
        if (hasEmail)
          buf.append("; ");
      }

      if (hasEmail)
        buf.append(agentEmail);

      buf.append(")");
    }
    return buf.toString();
  }

  /**
   * Command-line entry point: fetches one URL and prints its content type,
   * its <code>Content-Length</code> value and its content to standard
   * output.
   *
   * <p>Usage: <code>Http [-verbose] [-timeout N] url</code>.  The value
   * given to <code>-timeout</code> is multiplied by 1000 before being stored
   * in TIMEOUT.  The URL must be the last argument.
   */
  public static void main(String[] args) throws Exception {
    boolean verbose = false;
    String url = null;

    String usage = "Usage: Http [-verbose] [-timeout N] url";

    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }

    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-timeout")) {
        if (i + 1 >= args.length) {               // option without a value
          System.err.println(usage);
          System.exit(-1);
        }
        TIMEOUT = Integer.parseInt(args[++i]) * 1000;
      } else if (args[i].equals("-verbose")) {
        verbose = true;
      } else if (i != args.length - 1) {
        System.err.println(usage);
        System.exit(-1);
      } else {
        url = args[i];
      }
    }

    if (url == null) {                            // only options were given
      System.err.println(usage);
      System.exit(-1);
    }

    Http http = new Http();

    if (verbose) {
      LOG.setLevel(Level.FINE);
    }

    Content content = http.getContent(url);

    System.out.println("Content Type: " + content.getContentType());
    System.out.println("Content Length: " + content.get("Content-Length"));
    System.out.println("Content:");
    String text = new String(content.getContent());
    System.out.println(text);
  }

}