KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > matuschek > http > HttpTool


1 package net.matuschek.http;
2
3 /*************************************************
4     Copyright (c) 2001/2002 by Daniel Matuschek
5 *************************************************/

6
7 import java.io.BufferedInputStream JavaDoc;
8 import java.io.BufferedWriter JavaDoc;
9 import java.io.IOException JavaDoc;
10 import java.io.InputStream JavaDoc;
11 import java.io.OutputStreamWriter JavaDoc;
12 import java.net.InetAddress JavaDoc;
13 import java.net.URL JavaDoc;
14 import java.net.UnknownHostException JavaDoc;
15 import java.text.ParseException JavaDoc;
16 import java.text.SimpleDateFormat JavaDoc;
17 import java.util.Date JavaDoc;
18 import java.util.Locale JavaDoc;
19 import java.util.TimeZone JavaDoc;
20 import java.util.Properties JavaDoc;
21
22 import net.matuschek.http.connection.HttpConnection;
23 import net.matuschek.http.connection.HttpsHelper;
24 import net.matuschek.http.cookie.Cookie;
25 import net.matuschek.http.cookie.CookieException;
26 import net.matuschek.http.cookie.CookieManager;
27 import net.matuschek.http.cookie.MemoryCookieManager;
28 import net.matuschek.util.Base64;
29 import net.matuschek.util.ByteBuffer;
30 import net.matuschek.util.ChunkedInputStream;
31 import net.matuschek.util.LimitedBandwidthStream;
32
33 import org.apache.log4j.Category;
34
35 /**
36  * Class for retrieving documents from HTTP servers.
37  *
38  * <p>The main purpose of this class is to retrieve a document
39  * from an HTTP server. </p>
40  *
41  * <p>For many purposes the Java URLInputStream is good for this,
42  * but if you want to have full control over the HTTP headers
43  * (both request and response headers), HttpTool is the answer. </p>
44  *
45  * <p>Also it defines a callback interface to inform a client about
46  * the state of the current download operation. </p>
47  *
48  * <p>It is possible to abort a download after getting the
49  * HTTP response headers from the server (e.g. if a document of
50  * this Content-Type is useless for your application or the document
51  * is too big or whatever you like) </p>
52  *
53  * <p>HttpTool is reusable. You should initialize it once and use
54  * it for every download operation.</p>
55  *
56  * @author Daniel Matuschek
57  * @version $Id: HttpTool.java,v 1.28 2004/03/26 20:28:44 matuschd Exp $
58  */

59 public class HttpTool {
60
61   /** Carriage return */
62   final static byte CR = 13;
63
64   /** Line feed */
65   final static byte LF = 10;
66
67   /** used HTTP version */
68   final static String JavaDoc HTTP_VERSION="HTTP/1.1";
69
70   /* Status constants */
71
72   /** HTTP connection will be established */
73   public final static int STATUS_CONNECTING=0;
74   /** HTTP connection was established, but no data where retrieved */
75   public final static int STATUS_CONNECTED=1;
76   /** data will be retrieved now */
77   public final static int STATUS_RETRIEVING=2;
78   /** download finished */
79   public final static int STATUS_DONE=3;
80   /** download could not be finished because a DownloadRule denied it */
81   public final static int STATUS_DENIEDBYRULE=4;
82
83   /** default HTTP port */
84   private final static int DEFAULT_HTTPPORT = 80;
85
86   /** default HTTPS port */
87   private final static int DEFAULT_HTTPSPORT = 443;
88
89   /** default agent name */
90   private final static String JavaDoc AGENTNAME =
91     "JoBo/1.4beta "
92     +"(http://www.matuschek.net/jobo.html)";
93
94   /**
95    * default update interval for calls of the callback interfaces
96    * (in bytes)
97    */

98   private final static int DEFAULT_UPDATEINTERVAL =1024;
99
100   /** default socket timeout in seconds */
101   private final static int DEFAULT_SOCKETTIMEOUT=20;
102
103   /** HTTP AgentName header */
104   private String JavaDoc agentName = AGENTNAME;
105   
106   /** HTTP Referer header */
107   private String JavaDoc referer = null;
108
109   /** HTTP From header */
110   private String JavaDoc fromAddress = null;
111
112   /** Date of the HTTP If-Modified-Since header */
113   private Date JavaDoc modifyDate = null;
114   
115   /**
116    * maximal used bandwidth in bytes per second
117    * 0 disables bandwidth limitations
118    */

119   private int bandwidth = 0;
120
121   /** proxy address */
122   private InetAddress JavaDoc proxyAddr = null;
123
124   /** proxy port number */
125   private int proxyPort = 0;
126
127   /** textual description of the proxy (format host:port) */
128   private String JavaDoc proxyDescr="";
129
130   /** timeout for getting data in seconds */
131   private int socketTimeout = DEFAULT_SOCKETTIMEOUT;
132
133   /** HttpTool should accept and use cookies */
134   private boolean cookiesEnabled = true;
135
136   /** Log4J Category object for logging */
137   private Category log = null;
138
139   /** Authentication infos */
140   private Properties JavaDoc userInfos = new Properties JavaDoc();
141   
142   /** @link dependency */
143   /*#HttpDoc lnkHttpDoc;*/
144
145
146   /**
147    * defines after how many bytes read from the web
148    * server the Callback interface will be called
149    * (default updates after one kilobyte)
150    */

151   private int updateInterval = DEFAULT_UPDATEINTERVAL;
152   
153   /**
154    * callback interface that will be used after n bytes are
155    * read from the web server to update the state of the current
156    * retrieve operation to the application
157    */

158   private HttpToolCallback callback=null;
159
160   /**
161    * DownloadRuleSet tells the HttpTool, if it should download
162    * the whole file after getting the headers
163    */

164   private DownloadRuleSet downloadRules = null;
165
166   /**
167    * The cookie manager will be used to store cookies
168    */

169   private CookieManager cookieManager = null;
170
171   /**
172    * The DateFormat instance will be used to format If-Modified-Since requests
173    */

174   static SimpleDateFormat JavaDoc df;
175
176   private NTLMAuthorization ntlmAuthorization = null;
177     
178   /*
179    * Initialize df to a formatter for timezone "GMT" and locale Locale.US
180    * without changing the default timezone. If-Modified-Since requests need
181    * to be in that format.
182    */

183   static {
184     TimeZone JavaDoc local = TimeZone.getDefault();
185     TimeZone JavaDoc gmt = TimeZone.getTimeZone("GMT");
186     TimeZone.setDefault(gmt);
187     df = new SimpleDateFormat JavaDoc("EEE, dd MMM yyyy HH:mm:ss z", Locale.US);
188     TimeZone.setDefault(local);
189   }
190
191
192   /**
193    * Initializes HttpTool with a new CookieManager (that will not contain
194    * any cookie).
195    * Enables logging
196    */

197   public HttpTool() {
198     this.cookieManager = new MemoryCookieManager();
199     log = Category.getInstance(getClass().getName());
200   }
201
202
203   /**
204    * Sets the Referer: HTTP header
205    * @param referer value for the Referer header
206    */

207   public void setReferer(String JavaDoc referer) {
208     this.referer = referer;
209   }
210   
211   /**
212    * Sets the User-Agent: HTTP header
213    * @param name name of the user agent (may contain spaces)
214    */

215   public void setAgentName(String JavaDoc name) {
216     this.agentName = name;
217   }
218   
219   /**
220    * Gets the current setting of the User-Agent HTTP header
221    * @return the User-Agent name
222    */

223   public String JavaDoc getAgentName() {
224     return agentName;
225   }
226
227   /**
228    * <b>Insiders BugFix</b>
229    * This method finishes the MemoryCleanupManager.
230    */

231   public void finish() {
232       if (cookieManager != null) {
233           cookieManager.finish();
234       }
235   }
236
237   /**
238    * Sets the DownloadRules for this object <br />
239    * A download rule uses the HTTP return headers to decide if the
240    * download should be finished.
241    * @param rule a DownloadRule
242    */

243   public void setDownloadRuleSet(DownloadRuleSet rules) {
244     this.downloadRules=rules;
245   }
246
247
248   /**
249    * Gets the DownloadRules for this object
250    * @return a DownloadRuleSet
251    */

252   public DownloadRuleSet getDownloadRuleSet() {
253     return this.downloadRules;
254   }
255
256
257   /**
258    * Gets the timeout for getting data in seconds
259    * @return the value of sockerTimeout
260    * @see #setTimeout(int)
261    */

262   public int getTimeout() {
263     return this.socketTimeout;
264   }
265
266
267   /**
268    * Sets the timeout for getting data. If HttpTool can't read
269    * data from a remote web server after this number of seconds
270    * it will stop the download of the current file
271    * @param timeout Timeout in seconds
272    */

273   public void setTimeout(int timeout) {
274     this.socketTimeout = timeout;
275   }
276
277
278   /**
279    * Enable/disable cookies
280    * @param enable if true, HTTP cookies will be enabled, if false
281    * HttpTool will not use cookies
282    */

283   public void setEnableCookies(boolean enable) {
284     this.cookiesEnabled=enable;
285   }
286
287   /**
288    * Get the status of the cookie engine
289    * @return true, if HTTP cookies are enabled, false otherwise
290    */

291   public boolean getEnableCookies() {
292     return this.cookiesEnabled;
293   }
294
295
296   /**
297    * sets a proxy to use
298    * @param proxyDescr the Proxy definition in the format host:port
299    */

300   public void setProxy(String JavaDoc proxyDescr)
301     throws HttpException
302   {
303     proxyAddr=null;
304     proxyPort=0;
305     String JavaDoc proxyHost = null;
306
307     if ((proxyDescr != null) &&
308     (! proxyDescr.equals(""))) {
309       int pos = proxyDescr.indexOf(":");
310       if (pos > 0) {
311     try {
312       String JavaDoc port = proxyDescr.substring(pos+1);
313       proxyHost = proxyDescr.substring(0,pos);
314       proxyPort = Integer.parseInt(port);
315       proxyAddr = InetAddress.getByName(proxyHost);
316     } catch (NumberFormatException JavaDoc e) {
317       throw new HttpException("Proxy definition incorrect, "+
318                   "port not numeric: "+
319                   proxyDescr);
320     } catch (UnknownHostException JavaDoc e) {
321       throw new HttpException("Host not found: "+proxyHost);
322     }
323       } else {
324     throw new HttpException("Proxy definition incorrect, "+
325                 "fomat must be host:port: "+
326                 proxyDescr);
327       }
328     }
329     this.proxyDescr=proxyDescr;
330   }
331
332
333   /**
334    * Gets a textual representation of the current proxy settings
335    * @return return the proxy settings in the format host:port
336    */

337   public String JavaDoc getProxy() {
338     return proxyDescr;
339   }
340
341
342   /**
343    * Set the value of the "If-Modified-Since" header
344    * Usually, this is null and HttpTool will retrieve every
345    * document. Setting this to a date will retrieve only
346    * documents that were modified since this time
347    */

348   public void setIfModifiedSince(Date JavaDoc modifyDate) {
349     this.modifyDate=modifyDate;
350   }
351
352
353   /**
354    * Returns the date used for the "If-Modified-Since" header
355    * @return a Date object if the "If-Modified-Since" header is set,
356    * null otherwise
357    */

358   public Date JavaDoc getIfModifiedSince() {
359     return this.modifyDate;
360   }
361
362
363   /**
364    * Sets the content From: HTTP header
365    * @param fromAdress an email adress (e.g. some@where.com)
366    */

367   public void setFromAddress(String JavaDoc fromAddress) {
368     this.fromAddress=fromAddress;
369   }
370
371   
372   /**
373    * Gets the current callback object
374    * @return the defined HttpToolCallback object
375    */

376   public HttpToolCallback getCallback() {
377     return callback;
378   }
379
380
381   /**
382    * Get the value of bandwidth.
383    * @return value of bandwidth.
384    */

385   public int getBandwidth() {
386     return bandwidth;
387   }
388   
389
390   /**
391    * Set the value of bandwidth.
392    * @param bandwith Value to assign to bandwidth.
393    */

394   public void setBandwidth(int bandwidth) {
395     this.bandwidth = bandwidth;
396   }
397   
398
399   /**
400    * Sets a callback object
401    *
402    * If set this object will be used to inform about the current
403    * status of the download. HttpTool will call methods of this
404    * object while retrieving a document.
405    *
406    * @param callback a callback object
407    * @see HttpToolCallback
408    */

409   public void setCallback(HttpToolCallback callback) {
410     this.callback = callback;
411   }
412
413
414   /**
415    * Gets the current update interval
416    * @return the update interval in bytes
417    * @see #setUpdateInterval(int)
418    */

419   public int getUpdateInterval() {
420     return updateInterval;
421   }
422
423
424   /**
425    * Sets the callback update interval
426    *
427    * This setting is used if a callback object is defined. Then after
428    * reading this number of bytes, the method
429    * <code>setHttpToolDocCurrentSize</code> will be called.
430    * You should not set this to a value smaller then 1000 unless your
431    * bandwidth is very small, because it will slow down downloads.
432    *
433    * @param updateInterval update interval in bytes
434    *
435    * @see HttpToolCallbackInterface#setHttpToolDocCurrentSize(int)
436    */

437   public void setUpdateInterval(int updateInterval) {
438     if (updateInterval > 0) {
439       this.updateInterval = updateInterval;
440     } else {
441       throw new IllegalArgumentException JavaDoc("updateInterval must be > 0 (was "+
442                      updateInterval+")");
443     }
444   }
445
446   /**
447    * Sets the CookieManager for this HttpTool
448    * By default a MemoryCookieManager will be used, but you can
449    * use this method to use your own CookieManager implementation
450    *
451    * @param cm an object that implements the CookieManager interface
452    */

453   public void setCookieManager(CookieManager cm) {
454     this.cookieManager = cm;
455   }
456
457
458   /**
459    * Gets the CookieManager used by this HttpTool
460    *
461    * @return the CookieManager that will be used by this HttpTool
462    */

463   public CookieManager getCookieManager() {
464     return this.cookieManager;
465   }
466
467
468   /**
469    * Delete all cookies
470    */

471   public void clearCookies() {
472     if (cookieManager != null) {
473       cookieManager.clear();
474     }
475   }
476
477
478   /**
479    * Retrieves a document from the given URL.
480    * If Cookies are enabled it will use the CookieManager to set Cookies
481    * it got from former retrieveDocument operations.
482    *
483    * @param u the URL to retrieve (only http:// supported yet)
484    * @param method HttpConstants.GET for a GET request, HttpConstants.POST
485    * for a POST request
486    * @param parameters additional parameters. Will be added to the URL if
487    * this is a GET request, posted if it is a POST request
488    * @return a HttpDoc if a document was retrieved, null otherwise
489    *
490    * @see HttpConstants
491    */

492   public HttpDoc retrieveDocument(URL JavaDoc u, int method, String JavaDoc parameters) throws HttpException {
493     DocAndConnection docAndConnection = retrieveDocumentInternal(u, method, parameters, null, null);
494     HttpDoc doc = docAndConnection != null ? docAndConnection.httpDoc : null;
495     if (doc != null && doc.getHttpCode() == 401) {
496         String JavaDoc authProtName = NTLMAuthorization.WWW_AUTHENTICATE_HEADER;
497         String JavaDoc authProtValue = doc.getHeaderValue(authProtName);
498         if (authProtValue == null) {
499             authProtName = NTLMAuthorization.PROXY_AUTHENTICATE_HEADER;
500             authProtValue = doc.getHeaderValue(authProtName);
501         }
502         if (authProtValue.indexOf(NTLMAuthorization.NTLM_TAG)>=0 ||
503             authProtValue.indexOf("Negotiate")>=0) {
504             
505             try {
506                 // STEP 1 - send NTLM-Request
507
NTLMAuthorization authorization = (NTLMAuthorization) ntlmAuthorization.clone();
508                 authorization.setHost(u.getHost());
509                 // log.info("NTLM-Authentication: " + authorization);
510
String JavaDoc auth = authorization.getRequest();
511                 docAndConnection = retrieveDocumentInternal(u, method, parameters, null, auth);
512                 
513                 // STEP 2 - receive NTLM-Nonce
514
doc = docAndConnection.httpDoc;
515                 authProtValue = doc.getHeaderValue(authProtName);
516                 authorization.extractNonce(authProtValue);
517                 
518                 // STEP 3 - send NTLM-Response
519
auth = authorization.getResponse();
520                 docAndConnection = retrieveDocumentInternal(u, method, parameters, docAndConnection.httpConnection, auth);
521                 if (docAndConnection != null) {
522                     doc = docAndConnection.httpDoc;
523                     if (docAndConnection.httpConnection != null) {
524                         docAndConnection.httpConnection.close();
525                     }
526                 } else {
527                     doc = null; // BUGFIX (Not modified files return null)
528
}
529                 
530             } catch (Exception JavaDoc e) {
531                 log.error("NTLM-Authentication Error: " + e.getMessage());
532                 throw new HttpException(e.getMessage());
533             }
534         }
535     }
536     return doc;
537   }
538    
539    /**
540     * Internal structure to keep connection after retrieval of doc.
541     */

542    protected class DocAndConnection {
543         HttpDoc httpDoc;
544         HttpConnection httpConnection;
545    }
546    
547   /**
548    * Same like method without parameter httpConnection, but this
549    * method uses the passed connection.
550    * @param u
551    * @param method
552    * @param parameters
553    * @param httpConnection (Use this connection)
554    * @return DocAndConnection
555    * @throws HttpException
556    */

557   protected DocAndConnection retrieveDocumentInternal(URL JavaDoc u, int method, String JavaDoc parameters,
558              HttpConnection httpConn, String JavaDoc ntlmAuthorizationInfo)
559     throws HttpException
560   {
561     String JavaDoc host = null;
562     InetAddress JavaDoc addr = null;
563     String JavaDoc path = null;
564     String JavaDoc requestPath = null;
565     String JavaDoc protocol = null;
566     String JavaDoc userinfo = null;
567     boolean chunkedEncoding = false;
568     boolean secureConnection = false;
569     ChunkedInputStream chunkStream=null;
570
571     // Content-Length
572
int docSize = -1;
573       
574     int port = 0;
575     HttpDoc doc = new HttpDoc();
576     int i = 0;
577
578     // set document URL
579
doc.setURL(u);
580
581     // document buffer
582
ByteBuffer buff = new ByteBuffer();
583
584     // the connection to the HTTP server
585
// HttpConnection httpConn = null;
586

587     InputStream JavaDoc is = null;
588     BufferedWriter JavaDoc bwrite = null;
589
590     // get host
591
host = u.getHost();
592     if (host == null) {
593       throw new HttpException("no host part in URL found");
594     }
595
596     // get address, if not using a proxy
597
// if the client runs behind a proxy it is possible, that name
598
// resolution for the internet is not possible
599
if(! useProxy()) {
600       try {
601     addr = InetAddress.getByName(host);
602       } catch (UnknownHostException JavaDoc e) {
603     addr = null;
604       }
605       if (addr == null) {
606     throw new HttpException("host part (" + host + ") does not resolve");
607       }
608     }
609
610     // get path
611
path = u.getFile();
612     if (path.equals("")) {
613       path = "/";
614     }
615     // replace spaces
616
path=path.replaceAll(" ","%20");
617
618     // get protocol and port
619
port = u.getPort();
620     protocol = u.getProtocol().toLowerCase();
621     if (protocol.equals("http")) {
622       if (port == -1) {
623     port = DEFAULT_HTTPPORT;
624       }
625     } else if (protocol.equals("https")) {
626       if (port == -1) {
627     port = DEFAULT_HTTPSPORT;
628       }
629       secureConnection=true;
630     } else {
631       throw new HttpException("protocol " + protocol + " not supported");
632     }
633
634     // if using the proxy, request path is the whole URL, otherwise only
635
// the path part of the URL
636
if (useProxy() && (! secureConnection)) {
637       requestPath="http://"+host+path;
638     } else {
639       requestPath=path;
640     }
641
642     // get user info
643
userinfo = u.getUserInfo();
644     if (userinfo != null) {
645      if (userinfo.equals("")) {
646       userinfo=null;
647      } else {
648       // Store user info for this host
649
userInfos.setProperty(host,userinfo);
650      }
651     } else {
652      // do we hae a stored user info?
653
userinfo=userInfos.getProperty(host);
654     }
655
656
657     if (callback != null) {
658       callback.setHttpToolDocUrl(u.toString());
659       callback.setHttpToolStatus(STATUS_CONNECTING);
660     }
661
662     // okay, we got all needed information, try to connect to the host
663
try {
664         if (httpConn == null) {
665           // connect and initialize streams
666
// timeout is stored in seconds in HttpTool, but
667
// HttpConnection uses milliseconds
668
if (secureConnection) {
669             HttpsHelper helper = new HttpsHelper(proxyAddr,proxyPort,useProxy());
670             httpConn = helper.createHttpsConnection(host,port);
671           } else {
672             if (useProxy()) {
673               httpConn = HttpConnection.createConnection(proxyAddr,
674                                      proxyPort,
675                                      socketTimeout*1000);
676             } else {
677               httpConn = HttpConnection.createConnection(addr,
678                                      port,
679                                      socketTimeout*1000);
680             }
681           }
682         }
683     
684       is = new LimitedBandwidthStream(
685         new BufferedInputStream JavaDoc(httpConn.getInputStream(), 256),
686                     bandwidth);
687       bwrite = new BufferedWriter JavaDoc(
688              new OutputStreamWriter JavaDoc(httpConn.getOutputStream()));
689
690       if (callback != null) {
691     callback.setHttpToolStatus(STATUS_CONNECTED);
692       }
693
694
695       // write HTTP request
696
// get or post ?
697
if (method == HttpConstants.GET) {
698     bwrite.write("GET ");
699     bwrite.write(requestPath);
700     if ((parameters != null)
701         && (! parameters.equals(""))) {
702       bwrite.write("?");
703       bwrite.write(parameters);
704     }
705
706       } else if (method == HttpConstants.POST) {
707     bwrite.write("POST " + requestPath);
708       } else {
709     throw new HttpException("HTTP method " + method + " not supported");
710       }
711
712       // last part of request line
713
bwrite.write(" ");
714       bwrite.write(HTTP_VERSION);
715       bwrite.write("\r\n");
716
717       // Referer header only if defined
718
if (referer != null) {
719     bwrite.write("Referer: " + referer + "\r\n");
720       }
721
722       // if cookies are enabled, write a Cookie: header
723
if (cookiesEnabled) {
724     String JavaDoc cookieString = cookieManager.cookiesForURL(u);
725     if (cookieString != null) {
726       bwrite.write("Cookie: ");
727       bwrite.write(cookieString);
728       bwrite.write("\r\n");
729       log.debug("Cookie request header: "+cookieString);
730     }
731       }
732
733       // Write other headers
734
bwrite.write("Host: " + host + "\r\n");
735       bwrite.write("User-Agent: " + agentName + "\r\n");
736       bwrite.write("Accept: */*\r\n");
737       if (ntlmAuthorizationInfo == null) {
738         bwrite.write("Connection: close\r\n");
739       } else {
740         bwrite.write("Connection: keep-alive\r\n");
741       }
742
743       // Write "From:" header only if a fromAddress is defined
744
if (fromAddress != null) {
745     bwrite.write("From: "+fromAddress+"\r\n");
746       }
747
748       // if we have username and password, lets write an Authorization
749
// header
750
if (userinfo != null) {
751     // special hack to support usernames with "@"
752
// TO DO: find a better solution for this problem
753
userinfo = userinfo.replace('%','@');
754     bwrite.write("Authorization: Basic ");
755     bwrite.write(Base64.encode(userinfo));
756     bwrite.write("\r\n");
757         log.debug(userinfo);
758         
759       }
760       
761       if (ntlmAuthorizationInfo != null) {
762         bwrite.write("Authorization: NTLM ");
763         bwrite.write(ntlmAuthorizationInfo);
764         bwrite.write("\r\n");
765       }
766       
767
768       // if there is a "If-Modified-Since" date, also write this header
769
if (modifyDate != null) {
770     String JavaDoc dateStr = df.format(modifyDate);
771
772     bwrite.write("If-Modified-Since: ");
773     bwrite.write(dateStr);
774     bwrite.write("\r\n");
775     log.debug("If-Modified-Since header: "+dateStr);
776       }
777
778       // for a POST request we also need a content-length header
779
if (method == HttpConstants.POST) {
780     bwrite.write("Content-Type: application/x-www-form-urlencoded\r\n");
781     bwrite.write("Content-Length: "+parameters.length()+"\r\n");
782       }
783
784       // finished headers
785
bwrite.write("\r\n");
786       // if this is a POST request, we have to add the POST parameters
787
if (method == HttpConstants.POST) {
788     bwrite.write(parameters);
789       }
790       bwrite.flush();
791       
792       if (callback != null) {
793     callback.setHttpToolStatus(STATUS_RETRIEVING);
794       }
795
796       // read the first line (HTTP return code)
797
while ((i = is.read()) != 10) {
798     if (i == -1) {
799       throw new HttpException("Could not get HTTP return code "+
800                   "(buffer content is "+buff.toString()+")");
801     }
802     buff.append((byte)i);
803       }
804
805       String JavaDoc httpCode = lineString(buff.getContent());
806       buff.clean();
807       doc.setHttpCode(httpCode);
808
809
810       // read the HTTP headers
811
boolean finishedHeaders = false;
812       while (!finishedHeaders) {
813     i = is.read();
814     if (i == -1) {
815       throw new HttpException("Could not read HTTP headers");
816     }
817     if (i >= 32) {
818       buff.append((byte)i);
819     }
820     // HTTP header processing
821
if (i == LF) {
822       String JavaDoc line = lineString(buff.getContent());
823       
824       buff.clean();
825       // empty line means "end of headers"
826
if (line.trim().equals("")) {
827         finishedHeaders = true;
828       } else {
829         HttpHeader head = new HttpHeader(line);
830         doc.addHeader(head);
831
832         if (cookiesEnabled
833         && head.isSetCookie()) {
834           try {
835         Cookie cookie = new Cookie(head.toLine(),u);
836         cookieManager.add(cookie);
837         log.debug("Got a cookie "+cookie);
838           } catch (CookieException e) {
839         log.info("Could not interpret cookie: "+e.getMessage());
840           }
841         }
842
843         // Content chunked ?
844
if (head.getName().equalsIgnoreCase("Transfer-Encoding")
845         && head.getValue().equalsIgnoreCase("chunked")) {
846           chunkedEncoding = true;
847         }
848
849       }
850     }
851       }
852       buff.clean();
853
854       // if there is a DownloadRule, ask if we should download
855
// the data
856
if (downloadRules != null) {
857         // if it is not allowed to download this URL, close socket
858
// and return a null document
859
boolean isNotModified = false;
860         if (modifyDate != null) {
861             HttpHeader lastModifiedHeader = doc.getHttpHeader("Last-Modified");
862             if (lastModifiedHeader != null) {
863                 try {
864                     Date JavaDoc lastModifiedDate = df.parse(lastModifiedHeader.getValue());
865                     if (lastModifiedDate.compareTo(modifyDate) <= 0) {
866                         isNotModified = true;
867                     }
868                 } catch (ParseException JavaDoc e) {}
869             }
870         }
871         
872         if (! downloadRules.downloadAllowed(doc.getHttpHeader()) || isNotModified) {
873           if (doc.isNotModified()) {
874             log.info("If-Not-Modified successfull for: " + u);
875           } else if (isNotModified) {
876             log.info("Header indicates not modified for: " + u);
877           } else {
878             log.info("Download not allowed by download rule.");
879           }
880           // Close connection
881
httpConn.close(); httpConn = null;
882     
883           if (callback != null) {
884             callback.setHttpToolStatus(STATUS_DENIEDBYRULE);
885           }
886           return null;
887         }
888       }
889
890       
891       // if we got encoding "chunked", use the ChunkedInputStream
892
if (chunkedEncoding) {
893     chunkStream = new ChunkedInputStream(is);
894       }
895       
896
897       // did we got an Content-Length header ?
898
HttpHeader contentLength = doc.getHeader(HttpHeader.CONTENT_LENGTH);
899       if (contentLength != null) {
900     
901     try {
902       docSize = Integer.parseInt(contentLength.getValue());
903     } catch (NumberFormatException JavaDoc e) {
904       log.error("Got a malformed Content-Length header from the server");
905       docSize = -1;
906     }
907
908     // send information to callback
909
if (callback != null) {
910       callback.setHttpToolDocSize(docSize);
911     }
912
913     // initialize the byte buffer with the given document size
914
// there is no need to increase the buffer size dynamically
915
if (docSize > 0) {
916       buff.setSize(docSize);
917     }
918       }
919
920       // read data
921
boolean finished = false;
922       int count=0;
923
924       while (! finished) {
925     
926     if (chunkedEncoding) {
927       i = chunkStream.read();
928     } else {
929       i = is.read();
930     }
931     
932     if (i == -1) {
933       // this should only happen on HTTP/1.0 responses
934
// without a Content-Length header
935
finished = true;
936     } else {
937       buff.append((byte)i);
938       count++;
939     }
940
941
942     // finished ?
943
// there are other tests then wait until read gives us a -1:
944

945     // if there was a Content-Length header stop after reading the
946
// given number of bytes
947
if (count == docSize) {
948       finished = true;
949     }
950     
951     // if it is a chunked stream we should use the isDone method
952
// to look if we reached the end
953
if (chunkedEncoding) {
954       if (chunkStream.isDone()) {
955         finished=true;
956       }
957     }
958     
959
960     // should we call the callback interface ?
961
if (callback != null) {
962       if (((buff.length() % updateInterval) == 0)
963           || finished) {
964         callback.setHttpToolDocCurrentSize(buff.length());
965       }
966     }
967
968     
969       }
970       
971       doc.setContent(buff.getContent());
972
973       if (ntlmAuthorizationInfo == null) {
974         // close everything
975
// bwrite.close();
976
// is.close();
977
httpConn.close(); httpConn = null;
978       }
979       
980       if (callback != null) {
981     callback.setHttpToolStatus(STATUS_DONE);
982       }
983
984     } catch (IOException JavaDoc e) {
985       throw new HttpException(e.getMessage());
986     }
987
988     DocAndConnection docAndConnection = new DocAndConnection();
989     docAndConnection.httpDoc = doc;
990     docAndConnection.httpConnection = httpConn;
991     
992     return docAndConnection;
993   }
994
995
996
997   /**
998    * should I use a proxy ?
999    * @return true if a proxy was configured, false otherwise
1000   */

1001  protected boolean useProxy() {
1002    return (proxyAddr != null);
1003  }
1004
1005
1006  /**
1007   * convert an array of bytes to a String. if the last byte is an CR
1008   * it will be ignored
1009   */

1010  protected String JavaDoc lineString(byte[] b) {
1011    if (b.length == 0) {
1012      return "";
1013    }
1014
1015    if (b[b.length-1] != CR) {
1016      return new String JavaDoc(b);
1017    } else {
1018      return new String JavaDoc(b,0,b.length-1);
1019    }
1020  }
1021
1022public void setNtlmAuthorization(NTLMAuthorization ntlmAuthorization) {
1023    this.ntlmAuthorization = ntlmAuthorization;
1024}
1025
1026public NTLMAuthorization getNtlmAuthorization() {
1027    return ntlmAuthorization;
1028}
1029
1030}
1031
Popular Tags