KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > weblech > spider > URLGetter


1 /*
2  * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
3  *
4  * Copyright (c) 2001 Brian Pitcher
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */

24
25 // $Header: /cvsroot/weblech/weblech/src/weblech/spider/URLGetter.java,v 1.2 2002/06/02 08:02:45 weblech Exp $
26

27 package weblech.spider;
28
29 import org.apache.log4j.Category;
30
31 import java.net.HttpURLConnection JavaDoc;
32 import java.net.URL JavaDoc;
33 import java.net.Authenticator JavaDoc;
34 import java.io.*;
35
36 import weblech.util.Log4j;
37
38 public class URLGetter
39 {
40     private final static Category _logClass = Category.getInstance(URLGetter.class);
41
42     static
43     {
44         Log4j.init();
45     }
46
47     private int failureCount = 0;
48
49     private final SpiderConfig config;
50
51     public URLGetter(SpiderConfig config)
52     {
53         _logClass.debug("URLGetter()");
54         this.config = config;
55
56         Authenticator.setDefault(new DumbAuthenticator(config.getBasicAuthUser(), config.getBasicAuthPassword()));
57     }
58
59     public URLObject getURL(URLToDownload url)
60     {
61         _logClass.debug("getURL(" + url + ")");
62
63         if(failureCount > 10)
64         {
65             _logClass.warn("Lots of failures recently, waiting 5 seconds before attempting download");
66             try { Thread.sleep(5 * 1000); } catch(InterruptedException JavaDoc e) { };
67             failureCount = 0;
68         }
69
70         URL JavaDoc requestedURL = url.getURL();
71         URL JavaDoc referer = url.getReferer();
72
73         try
74         {
75             _logClass.debug("Creating HTTP connection to " + requestedURL);
76             HttpURLConnection JavaDoc conn = (HttpURLConnection JavaDoc) requestedURL.openConnection();
77             if(referer != null)
78             {
79                 _logClass.debug("Setting Referer header to " + referer);
80                 conn.setRequestProperty("Referer", referer.toExternalForm());
81             }
82
83             if(config.getUserAgent() != null)
84             {
85                 _logClass.debug("Setting User-Agent to " + config.getUserAgent());
86                 conn.setRequestProperty("User-Agent", config.getUserAgent());
87             }
88
89             conn.setUseCaches(false);
90
91             _logClass.debug("Opening URL");
92             long startTime = System.currentTimeMillis();
93             conn.connect();
94
95             String JavaDoc resp = conn.getResponseMessage();
96             _logClass.debug("Remote server response: " + resp);
97
98             String JavaDoc respStr = conn.getHeaderField(0);
99             _logClass.info("Server response: " + respStr);
100
101             for(int i = 1; ; i++)
102             {
103                 String JavaDoc key = conn.getHeaderFieldKey(i);
104                 if(key == null)
105                 {
106                     break;
107                 }
108                 String JavaDoc value = conn.getHeaderField(key);
109                 _logClass.debug("Received header " + key + ": " + value);
110             }
111
112             _logClass.debug("Getting buffered input stream from remote connection");
113             BufferedInputStream remoteBIS = new BufferedInputStream(conn.getInputStream());
114             ByteArrayOutputStream baos = new ByteArrayOutputStream(10240);
115             byte[] buf = new byte[1024];
116             int bytesRead = 0;
117             while(bytesRead >= 0)
118             {
119                 baos.write(buf, 0, bytesRead);
120                 bytesRead = remoteBIS.read(buf);
121             }
122
123             byte[] content = baos.toByteArray();
124             long timeTaken = System.currentTimeMillis() - startTime;
125             if(timeTaken < 100) timeTaken = 500;
126
127             int bytesPerSec = (int) ((double) content.length / ((double)timeTaken / 1000.0));
128             _logClass.info("Downloaded " + content.length + " bytes, " + bytesPerSec + " bytes/sec");
129             if(content.length < conn.getContentLength())
130             {
131                 _logClass.warn("Didn't download full content for URL: " + url);
132                 failureCount++;
133                 return null;
134             }
135             return new URLObject(requestedURL, conn.getContentType(), content, config);
136         }
137     catch(FileNotFoundException fnfe) {
138         _logClass.warn("File not found: " + fnfe.getMessage());
139         return null;
140     }
141         catch(IOException ioe)
142         {
143             _logClass.warn("Caught IO Exception: " + ioe.getMessage(), ioe);
144             failureCount++;
145             return null;
146         }
147     }
148 }
149
Popular Tags