KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > protocol > http > Http


1 /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.protocol.http;
5
6 import java.io.*;
7 import java.net.URL JavaDoc;
8 import java.net.InetAddress JavaDoc;
9 import java.net.UnknownHostException JavaDoc;
10 import java.util.HashMap JavaDoc;
11 import java.util.LinkedList JavaDoc;
12 import java.util.logging.Level JavaDoc;
13 import java.util.logging.Logger JavaDoc;
14
15 import net.nutch.util.LogFormatter;
16 import net.nutch.util.NutchConf;
17
18 import net.nutch.protocol.*;
19
20 /** An implementation of the Http protocol. */
21 public class Http implements Protocol {
22
23   public static final Logger JavaDoc LOG =
24     LogFormatter.getLogger("net.nutch.net.Http");
25
26   static {
27     if (NutchConf.getBoolean("http.verbose", false))
28       LOG.setLevel(Level.FINE);
29   }
30
31   static final int BUFFER_SIZE = 8 * 1024;
32
33   private static final int MAX_REDIRECTS =
34     NutchConf.getInt("http.redirect.max", 3);
35
36   static String JavaDoc PROXY_HOST = NutchConf.get("http.proxy.host");
37   static int PROXY_PORT = NutchConf.getInt("http.proxy.port",8080);
38   static boolean PROXY = (PROXY_HOST != null && PROXY_HOST.length() > 0);
39   
40   static int TIMEOUT = NutchConf.getInt("http.timeout", 10000);
41   static int MAX_CONTENT= NutchConf.getInt("http.content.limit",64*1024);
42
43   static int MAX_DELAYS= NutchConf.getInt("http.max.delays",3);
44   static int MAX_THREADS_PER_HOST =
45     NutchConf.getInt("fetcher.threads.per.host", 1);
46
47   static String JavaDoc AGENT_STRING = getAgentString();
48
49   static long SERVER_DELAY =
50     (long)(NutchConf.getFloat("fetcher.server.delay", 1.0f) * 1000);
51
52   static {
53     LOG.info("http.proxy.host = " + PROXY_HOST);
54     LOG.info("http.proxy.port = " + PROXY_PORT);
55
56     LOG.info("http.timeout = " + TIMEOUT);
57     LOG.info("http.content.limit = " + MAX_CONTENT);
58     LOG.info("http.agent = " + AGENT_STRING);
59
60     LOG.info("fetcher.server.delay = " + SERVER_DELAY);
61     LOG.info("http.max.delays = " + MAX_DELAYS);
62   }
63
64   /** Maps from InetAddress to a Long naming the time it should be unblocked.
65    * The Long is zero while the address is in use, then set to now+wait when
66    * a request finishes. This way only one thread at a time accesses an
67    * address. */

68   private static HashMap JavaDoc BLOCKED_ADDR_TO_TIME = new HashMap JavaDoc();
69     
70   /** Maps an address to the number of threads accessing that address. */
71   private static HashMap JavaDoc THREADS_PER_HOST_COUNT = new HashMap JavaDoc();
72
73   /** Queue of blocked InetAddress. This contains all of the non-zero entries
74    * from BLOCKED_ADDR_TO_TIME, ordered by increasing time. */

75   private static LinkedList JavaDoc BLOCKED_ADDR_QUEUE = new LinkedList JavaDoc();
76
77   private RobotRulesParser robotRules = new RobotRulesParser();
78
79   private static InetAddress JavaDoc blockAddr(URL JavaDoc url) throws ProtocolException {
80     InetAddress JavaDoc addr;
81     try {
82       addr = InetAddress.getByName(url.getHost());
83     } catch (UnknownHostException JavaDoc e) {
84       throw new HttpException(e);
85     }
86     
87     int delays = 0;
88     while (true) {
89       cleanExpiredServerBlocks(); // free held addresses
90

91       Long JavaDoc time;
92       synchronized (BLOCKED_ADDR_TO_TIME) {
93         time = (Long JavaDoc) BLOCKED_ADDR_TO_TIME.get(addr);
94         if (time == null) { // address is free
95

96           // get # of threads already accessing this addr
97
Integer JavaDoc counter = (Integer JavaDoc)THREADS_PER_HOST_COUNT.get(addr);
98           int count = (counter == null) ? 0 : counter.intValue();
99           
100           count++; // increment & store
101
THREADS_PER_HOST_COUNT.put(addr, new Integer JavaDoc(count));
102           
103           if (count >= MAX_THREADS_PER_HOST) {
104             BLOCKED_ADDR_TO_TIME.put(addr, new Long JavaDoc(0)); // block it
105
}
106           return addr;
107         }
108       }
109
110       if (delays == MAX_DELAYS)
111         throw new RetryLater(url, "Exceeded http.max.delays: retry later.");
112
113       long done = time.longValue();
114       long now = System.currentTimeMillis();
115       long sleep = 0;
116       if (done == 0) { // address is still in use
117
sleep = SERVER_DELAY; // wait at least delay
118

119       } else if (now < done) { // address is on hold
120
sleep = done - now; // wait until its free
121
}
122
123       try {
124         Thread.sleep(sleep);
125       } catch (InterruptedException JavaDoc e) {}
126       delays++;
127     }
128   }
129   
130   private static void cleanExpiredServerBlocks() {
131     synchronized (BLOCKED_ADDR_TO_TIME) {
132       while (!BLOCKED_ADDR_QUEUE.isEmpty()) {
133         InetAddress JavaDoc addr = (InetAddress JavaDoc)BLOCKED_ADDR_QUEUE.getLast();
134         long time = ((Long JavaDoc)BLOCKED_ADDR_TO_TIME.get(addr)).longValue();
135         if (time <= System.currentTimeMillis()) {
136           BLOCKED_ADDR_TO_TIME.remove(addr);
137           BLOCKED_ADDR_QUEUE.removeLast();
138         } else {
139           break;
140         }
141       }
142     }
143   }
144
145   private static void unblockAddr(InetAddress JavaDoc addr) {
146     synchronized (BLOCKED_ADDR_TO_TIME) {
147       int addrCount = ((Integer JavaDoc)THREADS_PER_HOST_COUNT.get(addr)).intValue();
148       if (addrCount == 1) {
149         THREADS_PER_HOST_COUNT.remove(addr);
150         BLOCKED_ADDR_QUEUE.addFirst(addr);
151         BLOCKED_ADDR_TO_TIME.put
152           (addr, new Long JavaDoc(System.currentTimeMillis()+SERVER_DELAY));
153       }
154       else {
155         THREADS_PER_HOST_COUNT.put(addr, new Integer JavaDoc(addrCount - 1));
156       }
157     }
158   }
159
160   public Content getContent(String JavaDoc urlString) throws ProtocolException {
161     try {
162       URL JavaDoc url = new URL JavaDoc(urlString);
163
164       int redirects = 0;
165       while (true) {
166         
167         if (!RobotRulesParser.isAllowed(url))
168           throw new ResourceGone(url, "Blocked by robots.txt");
169         
170         InetAddress JavaDoc addr = blockAddr(url);
171         HttpResponse response;
172         try {
173           response = new HttpResponse(urlString, url); // make a request
174
} finally {
175           unblockAddr(addr);
176         }
177         
178         int code = response.getCode();
179         
180         if (code == 200) { // got a good response
181
return response.toContent(); // return it
182

183         } else if (code == 410) { // page is gone
184
throw new ResourceGone(url, "Http: " + code);
185
186         } else if (code >= 300 && code < 400) { // handle redirect
187
if (redirects == MAX_REDIRECTS)
188             throw new HttpException("Too many redirects: " + urlString);
189           url = new URL JavaDoc(url, response.getHeader("Location"));
190           redirects++;
191           LOG.fine("redirect to " + url);
192           
193         } else { // convert to exception
194
throw new HttpError(code);
195         }
196       }
197     } catch (IOException e) {
198       throw new HttpException(e);
199     }
200   }
201
202   private static String JavaDoc getAgentString() {
203     String JavaDoc agentName = NutchConf.get("http.agent.name");
204     String JavaDoc agentVersion = NutchConf.get("http.agent.version");
205     String JavaDoc agentDesc = NutchConf.get("http.agent.description");
206     String JavaDoc agentURL = NutchConf.get("http.agent.url");
207     String JavaDoc agentEmail = NutchConf.get("http.agent.email");
208
209     if ( (agentName == null) || (agentName.trim().length() == 0) )
210       LOG.severe("No User-Agent string set (http.agent.name)!");
211
212     StringBuffer JavaDoc buf= new StringBuffer JavaDoc();
213
214     buf.append(agentName);
215     if (agentVersion != null) {
216       buf.append("/");
217       buf.append(agentVersion);
218     }
219     if ( ((agentDesc != null) && (agentDesc.length() != 0))
220          || ((agentEmail != null) && (agentEmail.length() != 0))
221          || ((agentURL != null) && (agentURL.length() != 0)) ) {
222       buf.append(" (");
223
224       if ((agentDesc != null) && (agentDesc.length() != 0)) {
225         buf.append(agentDesc);
226         if ( (agentURL != null) || (agentEmail != null) )
227           buf.append("; ");
228       }
229
230       if ((agentURL != null) && (agentURL.length() != 0)) {
231         buf.append(agentURL);
232         if (agentEmail != null)
233           buf.append("; ");
234       }
235
236       if ((agentEmail != null) && (agentEmail.length() != 0))
237         buf.append(agentEmail);
238
239       buf.append(")");
240     }
241     return buf.toString();
242   }
243
244   /** For debugging. */
245   public static void main(String JavaDoc[] args) throws Exception JavaDoc {
246     boolean verbose = false;
247     String JavaDoc url = null;
248
249     String JavaDoc usage = "Usage: Http [-verbose] [-timeout N] url";
250
251     if (args.length == 0) {
252       System.err.println(usage);
253       System.exit(-1);
254     }
255       
256
257     for (int i = 0; i < args.length; i++) { // parse command line
258
if (args[i].equals("-timeout")) { // found -timeout option
259
TIMEOUT = Integer.parseInt(args[++i]) * 1000;
260       } else if (args[i].equals("-verbose")) { // found -verbose option
261
verbose = true;
262       } else if (i != args.length-1) {
263         System.err.println(usage);
264         System.exit(-1);
265       } else // root is required parameter
266
url = args[i];
267     }
268
269     Http http = new Http();
270
271     if (verbose) {
272       LOG.setLevel(Level.FINE);
273     }
274
275     Content content = http.getContent(url);
276
277     System.out.println("Content Type: " + content.getContentType());
278     System.out.println("Content Length: " + content.get("Content-Length"));
279     System.out.println("Content:");
280     String JavaDoc text = new String JavaDoc(content.getContent());
281     System.out.println(text);
282
283   }
284
285 }
286
Popular Tags