HttpResponse


1   /* Copyright (c) 2004 The Nutch Organization.  All rights reserved.   */
2   /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3   
4   package net.nutch.protocol.http;
5   
6   import java.io.BufferedInputStream  ;
7   import java.io.ByteArrayOutputStream  ;
8   import java.io.EOFException  ;
9   import java.io.IOException  ;
10  import java.io.InputStream  ;
11  import java.io.OutputStream  ;
12  import java.io.PushbackInputStream  ;
13  import java.net.InetAddress  ;
14  import java.net.InetSocketAddress  ;
15  import java.net.Socket  ;
16  import java.net.URL  ;
17  import java.util.Map  ;
18  import java.util.TreeMap  ;
19  import java.util.Properties  ;
20  import java.util.logging.Level  ;
21  
22  import net.nutch.protocol.Content;
23  import net.nutch.protocol.ProtocolException;
24  
25  import net.nutch.util.GZIPUtils;
26  
27  /** An HTTP response. */
28  public class HttpResponse {
29    private String   orig;
30    private String   base;
31    private byte[] content;
32    private int code;
33    private Properties   headers = new Properties  ();
34  
35    /** Returns the response code. */
36    public int getCode() { return code; }
37  
38    /** Returns the value of a named header. */
39    public String   getHeader(String   name) {
40      return (String  )headers.get(name);
41    }
42  
43    public byte[] getContent() { return content; }
44  
45    public Content toContent() {
46      String   contentType = getHeader("Content-Type");
47      if (contentType == null)
48        contentType = "";
49      return new Content(orig, base, content, contentType, headers);
50    }
51  
52    public HttpResponse(URL   url) throws ProtocolException, IOException   {
53      this(url.toString(), url);
54    }
55  
56    public HttpResponse(String   orig, URL   url)
57      throws ProtocolException, IOException   {
58      
59      this.orig = orig;
60      this.base = url.toString();
61  
62      if (!"http".equals(url.getProtocol()))
63        throw new HttpException("Not an HTTP url:" + url);
64  
65      if (Http.LOG.isLoggable(Level.FINE))
66        Http.LOG.fine("fetching " + url);
67  
68      String   path = "".equals(url.getFile()) ? "/" : url.getFile();
69  
70      // some servers will redirect a request with a host line like
71      // "Host: <hostname>:80" to "http://<hpstname>/<orig_path>"- they
72      // don't want the :80...
73  
74      String   host = url.getHost();
75      int port;
76      String   portString;
77      if (url.getPort() == -1) {
78        port= 80;
79        portString= "";
80      } else {
81        port= url.getPort();
82        portString= ":" + port;
83      }
84      Socket   socket = null;
85  
86      try {
87        socket = new Socket  ();                    // create the socket
88        socket.setSoTimeout(Http.TIMEOUT);
89  
90  
91        // connect
92        String   sockHost = Http.PROXY ? Http.PROXY_HOST : host;
93        int sockPort = Http.PROXY ? Http.PROXY_PORT : port;
94        InetSocketAddress   sockAddr= new InetSocketAddress  (sockHost, sockPort);
95        socket.connect(sockAddr, Http.TIMEOUT);
96  
97        // make request
98        OutputStream   req = socket.getOutputStream();
99  
100       StringBuffer   reqStr = new StringBuffer  ("GET ");
101       if(Http.PROXY){
102         reqStr.append(url.getProtocol()+"://"+host+portString+path);
103       } else {
104         reqStr.append(path);
105       }
106 
107       reqStr.append(" HTTP/1.0\r\n");
108 
109       reqStr.append("Host: ");
110       reqStr.append(host);
111       reqStr.append(portString);
112       reqStr.append("\r\n");
113 
114       reqStr.append("Accept-Encoding: x-gzip, gzip\r\n");
115 
116       if ((Http.AGENT_STRING == null) || (Http.AGENT_STRING.length() == 0)) {
117         Http.LOG.severe("User-agent is not set!");
118       } else {
119         reqStr.append("User-Agent: ");
120         reqStr.append(Http.AGENT_STRING);
121         reqStr.append("\r\n");
122       }
123 
124       reqStr.append("\r\n");
125       byte[] reqBytes= reqStr.toString().getBytes();
126 
127       req.write(reqBytes);
128       req.flush();
129         
130       PushbackInputStream   in =                  // process response
131         new PushbackInputStream  (
132           new BufferedInputStream  (socket.getInputStream(), Http.BUFFER_SIZE), 
133           Http.BUFFER_SIZE) ;
134 
135       StringBuffer   line = new StringBuffer  ();
136 
137       boolean haveSeenNonContinueStatus= false;
138       while (!haveSeenNonContinueStatus) {
139         // parse status code line
140         this.code = parseStatusLine(in, line); 
141         // parse headers
142         headers.putAll(parseHeaders(in, line));
143         haveSeenNonContinueStatus= code != 100; // 100 is "Continue"
144       }
145 
146       readPlainContent(in);
147 
148       String   contentEncoding= getHeader("Content-Encoding");
149       if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
150         Http.LOG.fine("uncompressing....");
151         byte[] compressed = content;
152 
153         content = GZIPUtils.unzipBestEffort(compressed, Http.MAX_CONTENT);
154 
155         if (content == null)
156           throw new HttpException("unzipBestEffort returned null");
157 
158         if (Http.LOG.isLoggable(Level.FINE))
159           Http.LOG.fine("fetched " + compressed.length
160                         + " bytes of compressed content (expanded to "
161                         + content.length + " bytes) from " + url);
162       } else {
163         if (Http.LOG.isLoggable(Level.FINE))
164           Http.LOG.fine("fetched " + content.length + " bytes from " + url);
165       }
166 
167     } finally {
168       if (socket != null)
169         socket.close();
170     }
171 
172   }
173 
174   private void readPlainContent(InputStream   in) 
175     throws HttpException, IOException   {
176 
177     int contentLength = Integer.MAX_VALUE;    // get content length
178     String   contentLengthString = (String  )headers.get("Content-Length");
179     if (contentLengthString != null) {
180       contentLengthString = contentLengthString.trim();
181       try {
182         contentLength = Integer.parseInt(contentLengthString);
183       } catch (NumberFormatException   e) {
184         throw new HttpException("bad content length: "+contentLengthString);
185       }
186     }
187     if (Http.MAX_CONTENT >= 0
188       && contentLength > Http.MAX_CONTENT)   // limit download size
189       contentLength  = Http.MAX_CONTENT;
190 
191     ByteArrayOutputStream   out = new ByteArrayOutputStream  (Http.BUFFER_SIZE);
192     byte[] bytes = new byte[Http.BUFFER_SIZE];
193     int length = 0;                           // read content
194     for (int i = in.read(bytes); i != -1; i = in.read(bytes)) {
195 
196       out.write(bytes, 0, i);
197       length += i;
198       if (length >= contentLength)
199         break;
200     }
201     content = out.toByteArray();
202   }
203 
204   private void readChunkedContent(PushbackInputStream   in,  
205                                   StringBuffer   line) 
206     throws HttpException, IOException   {
207     boolean doneChunks= false;
208     int contentBytesRead= 0;
209     byte[] bytes = new byte[Http.BUFFER_SIZE];
210     ByteArrayOutputStream   out = new ByteArrayOutputStream  (Http.BUFFER_SIZE);
211 
212     while (!doneChunks) {
213       Http.LOG.fine("Http: starting chunk");
214 
215       readLine(in, line, false);
216 
217       String   chunkLenStr;
218       // LOG.fine("chunk-header: '" + line + "'");
219 
220       int pos= line.indexOf(";");
221       if (pos < 0) {
222         chunkLenStr= line.toString();
223       } else {
224         chunkLenStr= line.substring(0, pos);
225         // LOG.fine("got chunk-ext: " + line.substring(pos+1));
226       }
227       chunkLenStr= chunkLenStr.trim();
228       int chunkLen;
229       try {
230         chunkLen= Integer.parseInt(chunkLenStr, 16);
231       } catch (NumberFormatException   e){ 
232         throw new HttpException("bad chunk length: "+line.toString());
233       }
234 
235       if (chunkLen == 0) {
236         doneChunks= true;
237         break;
238       }
239 
240       if ( (contentBytesRead + chunkLen) > Http.MAX_CONTENT )
241         chunkLen= Http.MAX_CONTENT - contentBytesRead;
242 
243       // read one chunk
244       int chunkBytesRead= 0;
245       while (chunkBytesRead < chunkLen) {
246 
247         int toRead= (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ?
248                     (chunkLen - chunkBytesRead) : Http.BUFFER_SIZE;
249         int len= in.read(bytes, 0, toRead);
250 
251         if (len == -1) 
252           throw new HttpException("chunk eof after " + contentBytesRead
253                                       + " bytes in successful chunks"
254                                       + " and " + chunkBytesRead 
255                                       + " in current chunk");
256 
257         // DANGER!!! Will printed GZIPed stuff right to your
258         // terminal!
259         // LOG.fine("read: " +  new String(bytes, 0, len));
260 
261         out.write(bytes, 0, len);
262         chunkBytesRead+= len;  
263       }
264 
265       readLine(in, line, false);
266 
267     }
268 
269     if (!doneChunks) {
270       if (contentBytesRead != Http.MAX_CONTENT) 
271         throw new HttpException("chunk eof: !doneChunk && didn't max out");
272       return;
273     }
274 
275     content = out.toByteArray();
276     parseHeaders(in, line);
277 
278   }
279 
280   private int parseStatusLine(PushbackInputStream   in, StringBuffer   line)
281     throws IOException  , HttpException {
282     readLine(in, line, false);
283 
284     int codeStart = line.indexOf(" ");
285     int codeEnd = line.indexOf(" ", codeStart+1);
286 
287     // handle lines with no plaintext result code, ie:
288     // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
289     if (codeEnd == -1) 
290       codeEnd= line.length();
291 
292     int code;
293     try {
294       code= Integer.parseInt(line.substring(codeStart+1, codeEnd));
295     } catch (NumberFormatException   e) {
296       throw new HttpException("bad status line '" + line 
297                               + "': " + e.getMessage(), e);
298     }
299 
300     return code;
301   }
302 
303 
304   private void processHeaderLine(StringBuffer   line, TreeMap   headers)
305     throws IOException  , HttpException {
306     int colonIndex = line.indexOf(":");       // key is up to colon
307     if (colonIndex == -1) {
308       int i;
309       for (i= 0; i < line.length(); i++)
310         if (!Character.isWhitespace(line.charAt(i)))
311           break;
312       if (i == line.length())
313         return;
314       throw new HttpException("No colon in header:" + line);
315     }
316     String   key = line.substring(0, colonIndex);
317 
318     int valueStart = colonIndex+1;            // skip whitespace
319     while (valueStart < line.length()) {
320       int c = line.charAt(valueStart);
321       if (c != ' ' && c != '\t')
322         break;
323       valueStart++;
324     }
325     String   value = line.substring(valueStart);
326 
327     headers.put(key, value);
328   }
329 
330   private Map   parseHeaders(PushbackInputStream   in, StringBuffer   line)
331     throws IOException  , HttpException {
332     TreeMap   headers = new TreeMap  (String.CASE_INSENSITIVE_ORDER);
333     return parseHeaders(in, line, headers);
334   }
335 
336   // Adds headers to an existing TreeMap
337   private Map   parseHeaders(PushbackInputStream   in, StringBuffer   line,
338                            TreeMap   headers)
339     throws IOException  , HttpException {
340     while (readLine(in, line, true) != 0) {
341 
342       // handle HTTP responses with missing blank line after headers
343       int pos;
344       if ( ((pos= line.indexOf("<!DOCTYPE")) != -1) 
345            || ((pos= line.indexOf("<HTML")) != -1) 
346            || ((pos= line.indexOf("<html")) != -1) ) {
347 
348         in.unread(line.substring(pos).getBytes("UTF-8"));
349         line.setLength(pos);
350 
351         try {
352           processHeaderLine(line, headers);
353         } catch (Exception   e) {
354           // fixme:
355           e.printStackTrace();
356         }
357 
358         return headers;
359       }
360 
361       processHeaderLine(line, headers);
362     }
363     return headers;
364   }
365 
366   private static int readLine(PushbackInputStream   in, StringBuffer   line,
367                       boolean allowContinuedLine)
368     throws IOException   {
369     line.setLength(0);
370     for (int c = in.read(); c != -1; c = in.read()) {
371       switch (c) {
372         case '\r':
373           if (peek(in) == '\n') {
374             in.read();
375           }
376         case '\n': 
377           if (line.length() > 0) {
378             // at EOL -- check for continued line if the current
379             // (possibly continued) line wasn't blank
380             if (allowContinuedLine) 
381               switch (peek(in)) {
382                 case ' ' : case '\t':                   // line is continued
383                   in.read();
384                   continue;
385               }
386           }
387           return line.length();      // else complete
388         default :
389           line.append((char)c);
390       }
391     }
392     throw new EOFException  ();
393   }
394 
395   private static int peek(PushbackInputStream   in) throws IOException   {
396     int value = in.read();
397     in.unread(value);
398     return value;
399   }
400 
401 }
402
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags