1 2 3 4 package net.nutch.protocol.http; 5 6 import java.io.BufferedInputStream ; 7 import java.io.ByteArrayOutputStream ; 8 import java.io.EOFException ; 9 import java.io.IOException ; 10 import java.io.InputStream ; 11 import java.io.OutputStream ; 12 import java.io.PushbackInputStream ; 13 import java.net.InetAddress ; 14 import java.net.InetSocketAddress ; 15 import java.net.Socket ; 16 import java.net.URL ; 17 import java.util.Map ; 18 import java.util.TreeMap ; 19 import java.util.Properties ; 20 import java.util.logging.Level ; 21 22 import net.nutch.protocol.Content; 23 import net.nutch.protocol.ProtocolException; 24 25 import net.nutch.util.GZIPUtils; 26 27 28 public class HttpResponse { 29 private String orig; 30 private String base; 31 private byte[] content; 32 private int code; 33 private Properties headers = new Properties (); 34 35 36 public int getCode() { return code; } 37 38 39 public String getHeader(String name) { 40 return (String )headers.get(name); 41 } 42 43 public byte[] getContent() { return content; } 44 45 public Content toContent() { 46 String contentType = getHeader("Content-Type"); 47 if (contentType == null) 48 contentType = ""; 49 return new Content(orig, base, content, contentType, headers); 50 } 51 52 public HttpResponse(URL url) throws ProtocolException, IOException { 53 this(url.toString(), url); 54 } 55 56 public HttpResponse(String orig, URL url) 57 throws ProtocolException, IOException { 58 59 this.orig = orig; 60 this.base = url.toString(); 61 62 if (!"http".equals(url.getProtocol())) 63 throw new HttpException("Not an HTTP url:" + url); 64 65 if (Http.LOG.isLoggable(Level.FINE)) 66 Http.LOG.fine("fetching " + url); 67 68 String path = "".equals(url.getFile()) ? "/" : url.getFile(); 69 70 74 String host = url.getHost(); 75 int port; 76 String portString; 77 if (url.getPort() == -1) { 78 port= 80; 79 portString= ""; 80 } else { 81 port= url.getPort(); 82 portString= ":" + port; 83 } 84 Socket socket = null; 85 86 try { 87 socket = new Socket (); socket.setSoTimeout(Http.TIMEOUT); 89 90 91 String sockHost = Http.PROXY ? Http.PROXY_HOST : host; 93 int sockPort = Http.PROXY ? Http.PROXY_PORT : port; 94 InetSocketAddress sockAddr= new InetSocketAddress (sockHost, sockPort); 95 socket.connect(sockAddr, Http.TIMEOUT); 96 97 OutputStream req = socket.getOutputStream(); 99 100 StringBuffer reqStr = new StringBuffer ("GET "); 101 if(Http.PROXY){ 102 reqStr.append(url.getProtocol()+"://"+host+portString+path); 103 } else { 104 reqStr.append(path); 105 } 106 107 reqStr.append(" HTTP/1.0\r\n"); 108 109 reqStr.append("Host: "); 110 reqStr.append(host); 111 reqStr.append(portString); 112 reqStr.append("\r\n"); 113 114 reqStr.append("Accept-Encoding: x-gzip, gzip\r\n"); 115 116 if ((Http.AGENT_STRING == null) || (Http.AGENT_STRING.length() == 0)) { 117 Http.LOG.severe("User-agent is not set!"); 118 } else { 119 reqStr.append("User-Agent: "); 120 reqStr.append(Http.AGENT_STRING); 121 reqStr.append("\r\n"); 122 } 123 124 reqStr.append("\r\n"); 125 byte[] reqBytes= reqStr.toString().getBytes(); 126 127 req.write(reqBytes); 128 req.flush(); 129 130 PushbackInputStream in = new PushbackInputStream ( 132 new BufferedInputStream (socket.getInputStream(), Http.BUFFER_SIZE), 133 Http.BUFFER_SIZE) ; 134 135 StringBuffer line = new StringBuffer (); 136 137 boolean haveSeenNonContinueStatus= false; 138 while (!haveSeenNonContinueStatus) { 139 this.code = parseStatusLine(in, line); 141 headers.putAll(parseHeaders(in, line)); 143 haveSeenNonContinueStatus= code != 100; } 145 146 readPlainContent(in); 147 148 String contentEncoding= getHeader("Content-Encoding"); 149 if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { 150 Http.LOG.fine("uncompressing...."); 151 byte[] compressed = content; 152 153 content = GZIPUtils.unzipBestEffort(compressed, Http.MAX_CONTENT); 154 155 if (content == null) 156 throw new HttpException("unzipBestEffort returned null"); 157 158 if (Http.LOG.isLoggable(Level.FINE)) 159 Http.LOG.fine("fetched " + compressed.length 160 + " bytes of compressed content (expanded to " 161 + content.length + " bytes) from " + url); 162 } else { 163 if (Http.LOG.isLoggable(Level.FINE)) 164 Http.LOG.fine("fetched " + content.length + " bytes from " + url); 165 } 166 167 } finally { 168 if (socket != null) 169 socket.close(); 170 } 171 172 } 173 174 private void readPlainContent(InputStream in) 175 throws HttpException, IOException { 176 177 int contentLength = Integer.MAX_VALUE; String contentLengthString = (String )headers.get("Content-Length"); 179 if (contentLengthString != null) { 180 contentLengthString = contentLengthString.trim(); 181 try { 182 contentLength = Integer.parseInt(contentLengthString); 183 } catch (NumberFormatException e) { 184 throw new HttpException("bad content length: "+contentLengthString); 185 } 186 } 187 if (Http.MAX_CONTENT >= 0 188 && contentLength > Http.MAX_CONTENT) contentLength = Http.MAX_CONTENT; 190 191 ByteArrayOutputStream out = new ByteArrayOutputStream (Http.BUFFER_SIZE); 192 byte[] bytes = new byte[Http.BUFFER_SIZE]; 193 int length = 0; for (int i = in.read(bytes); i != -1; i = in.read(bytes)) { 195 196 out.write(bytes, 0, i); 197 length += i; 198 if (length >= contentLength) 199 break; 200 } 201 content = out.toByteArray(); 202 } 203 204 private void readChunkedContent(PushbackInputStream in, 205 StringBuffer line) 206 throws HttpException, IOException { 207 boolean doneChunks= false; 208 int contentBytesRead= 0; 209 byte[] bytes = new byte[Http.BUFFER_SIZE]; 210 ByteArrayOutputStream out = new ByteArrayOutputStream (Http.BUFFER_SIZE); 211 212 while (!doneChunks) { 213 Http.LOG.fine("Http: starting chunk"); 214 215 readLine(in, line, false); 216 217 String chunkLenStr; 218 220 int pos= line.indexOf(";"); 221 if (pos < 0) { 222 chunkLenStr= line.toString(); 223 } else { 224 chunkLenStr= line.substring(0, pos); 225 } 227 chunkLenStr= chunkLenStr.trim(); 228 int chunkLen; 229 try { 230 chunkLen= Integer.parseInt(chunkLenStr, 16); 231 } catch (NumberFormatException e){ 232 throw new HttpException("bad chunk length: "+line.toString()); 233 } 234 235 if (chunkLen == 0) { 236 doneChunks= true; 237 break; 238 } 239 240 if ( (contentBytesRead + chunkLen) > Http.MAX_CONTENT ) 241 chunkLen= Http.MAX_CONTENT - contentBytesRead; 242 243 int chunkBytesRead= 0; 245 while (chunkBytesRead < chunkLen) { 246 247 int toRead= (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ? 248 (chunkLen - chunkBytesRead) : Http.BUFFER_SIZE; 249 int len= in.read(bytes, 0, toRead); 250 251 if (len == -1) 252 throw new HttpException("chunk eof after " + contentBytesRead 253 + " bytes in successful chunks" 254 + " and " + chunkBytesRead 255 + " in current chunk"); 256 257 261 out.write(bytes, 0, len); 262 chunkBytesRead+= len; 263 } 264 265 readLine(in, line, false); 266 267 } 268 269 if (!doneChunks) { 270 if (contentBytesRead != Http.MAX_CONTENT) 271 throw new HttpException("chunk eof: !doneChunk && didn't max out"); 272 return; 273 } 274 275 content = out.toByteArray(); 276 parseHeaders(in, line); 277 278 } 279 280 private int parseStatusLine(PushbackInputStream in, StringBuffer line) 281 throws IOException , HttpException { 282 readLine(in, line, false); 283 284 int codeStart = line.indexOf(" "); 285 int codeEnd = line.indexOf(" ", codeStart+1); 286 287 if (codeEnd == -1) 290 codeEnd= line.length(); 291 292 int code; 293 try { 294 code= Integer.parseInt(line.substring(codeStart+1, codeEnd)); 295 } catch (NumberFormatException e) { 296 throw new HttpException("bad status line '" + line 297 + "': " + e.getMessage(), e); 298 } 299 300 return code; 301 } 302 303 304 private void processHeaderLine(StringBuffer line, TreeMap headers) 305 throws IOException , HttpException { 306 int colonIndex = line.indexOf(":"); if (colonIndex == -1) { 308 int i; 309 for (i= 0; i < line.length(); i++) 310 if (!Character.isWhitespace(line.charAt(i))) 311 break; 312 if (i == line.length()) 313 return; 314 throw new HttpException("No colon in header:" + line); 315 } 316 String key = line.substring(0, colonIndex); 317 318 int valueStart = colonIndex+1; while (valueStart < line.length()) { 320 int c = line.charAt(valueStart); 321 if (c != ' ' && c != '\t') 322 break; 323 valueStart++; 324 } 325 String value = line.substring(valueStart); 326 327 headers.put(key, value); 328 } 329 330 private Map parseHeaders(PushbackInputStream in, StringBuffer line) 331 throws IOException , HttpException { 332 TreeMap headers = new TreeMap (String.CASE_INSENSITIVE_ORDER); 333 return parseHeaders(in, line, headers); 334 } 335 336 private Map parseHeaders(PushbackInputStream in, StringBuffer line, 338 TreeMap headers) 339 throws IOException , HttpException { 340 while (readLine(in, line, true) != 0) { 341 342 int pos; 344 if ( ((pos= line.indexOf("<!DOCTYPE")) != -1) 345 || ((pos= line.indexOf("<HTML")) != -1) 346 || ((pos= line.indexOf("<html")) != -1) ) { 347 348 in.unread(line.substring(pos).getBytes("UTF-8")); 349 line.setLength(pos); 350 351 try { 352 processHeaderLine(line, headers); 353 } catch (Exception e) { 354 e.printStackTrace(); 356 } 357 358 return headers; 359 } 360 361 processHeaderLine(line, headers); 362 } 363 return headers; 364 } 365 366 private static int readLine(PushbackInputStream in, StringBuffer line, 367 boolean allowContinuedLine) 368 throws IOException { 369 line.setLength(0); 370 for (int c = in.read(); c != -1; c = in.read()) { 371 switch (c) { 372 case '\r': 373 if (peek(in) == '\n') { 374 in.read(); 375 } 376 case '\n': 377 if (line.length() > 0) { 378 if (allowContinuedLine) 381 switch (peek(in)) { 382 case ' ' : case '\t': in.read(); 384 continue; 385 } 386 } 387 return line.length(); default : 389 line.append((char)c); 390 } 391 } 392 throw new EOFException (); 393 } 394 395 private static int peek(PushbackInputStream in) throws IOException { 396 int value = in.read(); 397 in.unread(value); 398 return value; 399 } 400 401 } 402 | Popular Tags |