KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > sourceforge > cvsgrab > WebBrowser


1 /*
2  * CVSGrab
3  * Author: Ludovic Claude (ludovicc@users.sourceforge.net)
4  * Distributable under BSD license.
5  */

6 package net.sourceforge.cvsgrab;
7
8 import java.io.BufferedInputStream JavaDoc;
9 import java.io.ByteArrayOutputStream JavaDoc;
10 import java.io.File JavaDoc;
11 import java.io.FileOutputStream JavaDoc;
12 import java.io.IOException JavaDoc;
13 import java.io.InputStream JavaDoc;
14 import java.io.StringReader JavaDoc;
15 import java.net.InetAddress JavaDoc;
16 import java.net.UnknownHostException JavaDoc;
17 import java.util.Iterator JavaDoc;
18 import java.util.Properties JavaDoc;
19 import java.util.StringTokenizer JavaDoc;
20 import java.util.zip.GZIPInputStream JavaDoc;
21
22 import net.sourceforge.cvsgrab.util.PasswordField;
23
24 import org.apache.commons.httpclient.Header;
25 import org.apache.commons.httpclient.HttpClient;
26 import org.apache.commons.httpclient.HttpMethod;
27 import org.apache.commons.httpclient.HttpMethodBase;
28 import org.apache.commons.httpclient.HttpRecoverableException;
29 import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
30 import org.apache.commons.httpclient.NTCredentials;
31 import org.apache.commons.httpclient.URIException;
32 import org.apache.commons.httpclient.UsernamePasswordCredentials;
33 import org.apache.commons.httpclient.cookie.CookiePolicy;
34 import org.apache.commons.httpclient.methods.GetMethod;
35 import org.apache.commons.httpclient.util.URIUtil;
36 import org.apache.xerces.parsers.DOMParser;
37 import org.apache.xerces.xni.parser.XMLInputSource;
38 import org.cyberneko.html.HTMLConfiguration;
39 import org.w3c.dom.Document JavaDoc;
40 import org.xml.sax.SAXNotRecognizedException JavaDoc;
41 import org.xml.sax.SAXNotSupportedException JavaDoc;
42
43 /**
44  * Emulates a web browser
45  *
46  * @author <a HREF="mailto:ludovicc@users.sourceforge.net">Ludovic Claude</a>
47  * @version $Revision: 1.19 $ $Date: 2005/06/25 19:51:33 $
48  * @created on 11 oct. 2003
49  */

50 public class WebBrowser {
51
52     private static WebBrowser _instance = new WebBrowser();
53
54     private HttpClient _client;
55     private DOMParser _htmlParser;
56
57     /**
58      * @return the singleton instance
59      */

60     public static WebBrowser getInstance() {
61         return _instance;
62     }
63
64     public static String JavaDoc forceFinalSlash(String JavaDoc s) {
65         if (!s.endsWith("/")) {
66             return s + "/";
67         }
68         return s;
69     }
70
71     public static String JavaDoc removeFinalSlash(String JavaDoc s) {
72         if (s != null && s.endsWith("/")) {
73             return s.substring(0, s.length()-1);
74         }
75         return s;
76     }
77
78     public static String JavaDoc addQueryParam(String JavaDoc url, String JavaDoc queryParam) {
79         String JavaDoc newUrl = url;
80         if (queryParam != null) {
81             if (newUrl.indexOf('?') > 0) {
82                 newUrl += "&";
83             } else {
84                 newUrl += "?";
85             }
86             newUrl += queryParam;
87         }
88         return newUrl;
89     }
90
91     public static String JavaDoc addQueryParam(String JavaDoc url, String JavaDoc paramName, String JavaDoc paramValue) {
92         String JavaDoc newUrl = url;
93         if (paramName != null && paramValue != null) {
94             if (newUrl.indexOf('?') > 0) {
95                 newUrl += "&";
96             } else {
97                 newUrl += "?";
98             }
99             try {
100                 newUrl += paramName + "=" + URIUtil.encodeQuery(paramValue);
101             } catch (URIException e) {
102                 e.printStackTrace();
103                 throw new RuntimeException JavaDoc("Cannot encode parameter value " + paramValue);
104             }
105         }
106         return newUrl;
107     }
108
109     /**
110      * Extract the query parameters
111      * @param urlQuery The query section of the url. Must be of the form ? (optional) key1=value1&key2=value2
112      * @return the parameters extracted as properties
113      */

114     public static Properties JavaDoc getQueryParams(String JavaDoc urlQuery) {
115         Properties JavaDoc p = new Properties JavaDoc();
116         StringTokenizer JavaDoc st = new StringTokenizer JavaDoc(urlQuery, "?&;");
117         while (st.hasMoreTokens()) {
118             String JavaDoc part = st.nextToken();
119             String JavaDoc key = part.substring(0, part.indexOf('='));
120             String JavaDoc value = part.substring(part.indexOf('=') + 1);
121             p.put(key, value);
122         }
123         return p;
124     }
125
126     /**
127      * Converts the query items to a single query string
128      * @param queryItems The set of (key,value) for the query
129      * @return a query string compatible with the format of url queries
130      */

131     public static String JavaDoc toQueryParams(Properties JavaDoc queryItems) {
132         StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
133         for (Iterator JavaDoc i = queryItems.keySet().iterator(); i.hasNext();) {
134             String JavaDoc key = (String JavaDoc) i.next();
135             String JavaDoc value = queryItems.getProperty(key);
136             sb.append(key);
137             sb.append('=');
138             sb.append(value);
139             if (i.hasNext()) {
140                 sb.append('&');
141             }
142         }
143         return sb.toString();
144     }
145
/**
 * Constructor for WebBrowser.
 *
 * Sets the default cookie policy to <code>CookiePolicy.COMPATIBILITY</code>,
 * creates the http client with a connection timeout of 5000 and configures
 * the NekoHTML based parser: element names in upper case, attribute names in
 * lower case, notification of built-in references, no namespaces.
 */
public WebBrowser() {
    CookiePolicy.setDefaultPolicy(CookiePolicy.COMPATIBILITY);
    _client = new HttpClient();
    _client.setConnectionTimeout(5000);
    _htmlParser = new DOMParser(new HTMLConfiguration());
    try {
        _htmlParser.setProperty("http://cyberneko.org/html/properties/names/elems", "upper");
        _htmlParser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
        _htmlParser.setFeature("http://apache.org/xml/features/scanner/notify-builtin-refs", true);
        _htmlParser.setFeature("http://cyberneko.org/html/features/scanner/notify-builtin-refs", true);
        _htmlParser.setFeature("http://xml.org/sax/features/namespaces", false);
    } catch (SAXNotRecognizedException e) {
        // Not fatal: the parser is kept, with the options applied so far
        e.printStackTrace();
    } catch (SAXNotSupportedException e) {
        // Not fatal: the parser is kept, with the options applied so far
        e.printStackTrace();
    }
}
167
168     /**
169      * Use a proxy to bypass the firewall
170      *
171      * @param proxyHost Host of the proxy
172      * @param proxyPort Port of the proxy
173      * @param proxyNTDomain NT domain for authentification on a MS proxy
174      * @param userName Username (if authentification is required), or null
175      * @param password Password (if authentification is required), or null
176      */

177     public void useProxy(String JavaDoc proxyHost, int proxyPort, final String JavaDoc ntDomain, final String JavaDoc userName, String JavaDoc password) {
178         CVSGrab.getLog().info("Using proxy " + proxyHost + ":" + proxyPort);
179         _client.getHostConfiguration().setProxy(proxyHost, proxyPort);
180         if (userName != null) {
181             if (password == null ) {
182                 PasswordField pwdField = new PasswordField();
183                 try {
184                     password = pwdField.getPassword("Enter the password for the proxy: ");
185                 } catch (IOException JavaDoc ex) {
186                     ex.printStackTrace();
187                 }
188             }
189             if (ntDomain == null) {
190                 CVSGrab.getLog().info("Login on the proxy with user name " + userName);
191                 _client.getState().setProxyCredentials(null, proxyHost,
192                     new UsernamePasswordCredentials(userName, password));
193             } else {
194                 try {
195                     String JavaDoc host = InetAddress.getLocalHost().getHostName();
196                     CVSGrab.getLog().info("Login on the NT proxy with user name " + userName
197                             + ", host " + host + ", NT domain " + ntDomain);
198                     _client.getState().setProxyCredentials(null, proxyHost,
199                         new NTCredentials(userName, password, host, ntDomain));
200                 } catch (UnknownHostException JavaDoc ex) {
201                     ex.printStackTrace();
202                 }
203             }
204         }
205     }
206
207     /**
208      * Use authentification for the web server
209      *
210      * @param userName The username to use on the web server
211      * @param password The password to use on the web server
212      */

213     public void useWebAuthentification(final String JavaDoc userName, String JavaDoc password) {
214         CVSGrab.getLog().info("Login on the web server with user name " + userName + " and password " + password);
215         if (password == null ) {
216             PasswordField pwdField = new PasswordField();
217             try {
218                 password = pwdField.getPassword("Enter the password for the web server: ");
219             } catch (IOException JavaDoc ex) {
220                 ex.printStackTrace();
221             }
222         }
223         _client.getState().setCredentials(null, null,
224           new UsernamePasswordCredentials(userName, password));
225     }
226
227     /**
228      * Allow simultaneous connections on different threads.
229      */

230     public void useMultithreading() {
231         _client = new HttpClient(new MultiThreadedHttpConnectionManager());
232     }
233
234     /**
235      * Execute a http method
236      *
237      * @param method The method
238      * @param url The url called by the method, only useful for error reporting
239      * @return the last http method executed (after following redirects)
240      */

241     public HttpMethod executeMethod(HttpMethod method, String JavaDoc url) {
242         int statusCode = -1;
243         int attempt = 0;
244
245         try {
246             method.setRequestHeader("User-Agent", "cvsgrab (http://cvsgrab.sourceforge.net)");
247             method.setRequestHeader("Cache-Control", "no-cache");
248             method.setRequestHeader("Accept-Encoding", "gzip");
249
250             // We will retry up to 3 times.
251
while ((statusCode == -1) && (attempt < 3)) {
252                 try {
253                     // execute the method.
254
statusCode = _client.executeMethod(method);
255                     CVSGrab.getLog().trace("Executed method " + url + " with status code " + statusCode);
256                 } catch (HttpRecoverableException e) {
257                     CVSGrab.getLog().warn("A recoverable exception occurred, retrying. " + e.getMessage());
258                 } catch (IOException JavaDoc e) {
259                     CVSGrab.getLog().error("Failed to download file " + url);
260                     e.printStackTrace();
261                     throw new RuntimeException JavaDoc("Failed to download file " + url);
262                 }
263             }
264
265             // Check that we didn't run out of retries.
266
if (statusCode == -1) {
267                 CVSGrab.getLog().error("Failed to recover from exception.");
268                 throw new RuntimeException JavaDoc("Error when reading " + url);
269             }
270
271             if (statusCode >= 400) {
272                 CVSGrab.getLog().debug("Page not found (error " + statusCode + ")");
273                 throw new RuntimeException JavaDoc("Error " + statusCode + " when reading " + url);
274             }
275
276             // Tests for redirects
277
if ((statusCode >= 300) && (statusCode < 400)) {
278                 Header locationHeader = method.getResponseHeader("location");
279
280                 if (locationHeader != null) {
281                     String JavaDoc redirectLocation = locationHeader.getValue();
282
283                     method.releaseConnection();
284                     CVSGrab.getLog().debug("Redirect to " + redirectLocation);
285
286                     HttpMethod redirectMethod = new GetMethod(redirectLocation);
287
288                     executeMethod(redirectMethod, redirectLocation);
289
290                     return redirectMethod;
291                 } else {
292                     // The response is invalid and did not provide the new location for
293
// the resource. Report an error or possibly handle the response
294
// like a 404 Not Found error.
295
CVSGrab.getLog().error("Page not found");
296                     throw new RuntimeException JavaDoc("Error when reading " + url);
297                 }
298             }
299         } catch (RuntimeException JavaDoc e) {
300             method.releaseConnection();
301             throw e;
302         }
303
304         return method;
305     }
306
307     /**
308      * Gets the response from a method that has been executed
309      *
310      * @param method The method
311      * @param url The url called by the method, only useful for error reporting
312      */

313     public String JavaDoc getResponse(HttpMethod method, String JavaDoc url) {
314         HttpMethod lastMethod = executeMethod(method, url);
315         String JavaDoc response = null;
316         try {
317             // Gzip support by Ralf Stoffels (rstoffels)
318
String JavaDoc contentEncoding = null;
319             if (lastMethod.getResponseHeader("Content-Encoding") != null) {
320                 contentEncoding = lastMethod.getResponseHeader("Content-Encoding").getValue();
321             }
322             if (contentEncoding != null && contentEncoding.toLowerCase().indexOf("gzip") >= 0) {
323                 try {
324                     InputStream JavaDoc inStream = lastMethod.getResponseBodyAsStream();
325                     if (inStream != null) {
326                         inStream = new GZIPInputStream JavaDoc(lastMethod.getResponseBodyAsStream());
327                         if (inStream != null) {
328                             response = getResponseContent(lastMethod, inStream);
329                         }
330                     }
331                 }
332                 catch (IOException JavaDoc e) {
333                     CVSGrab.getLog().error("I/O failure reading response body", e);
334                 }
335             } else {
336                 try {
337                     response = getResponseContent(lastMethod, lastMethod.getResponseBodyAsStream());
338                 } catch (IOException JavaDoc e) {
339                     CVSGrab.getLog().error("I/O failure reading response body", e);
340                 }
341             }
342         } finally {
343             lastMethod.releaseConnection();
344         }
345         return response;
346     }
347
348     private String JavaDoc getResponseContent(HttpMethod lastMethod, InputStream JavaDoc inStream) throws IOException JavaDoc {
349         String JavaDoc response;
350         ByteArrayOutputStream JavaDoc outstream = new ByteArrayOutputStream JavaDoc();
351         byte[] buffer = new byte[4096];
352         int len;
353         while ((len = inStream.read(buffer)) > 0) {
354             outstream.write(buffer, 0, len);
355         }
356         outstream.close();
357         response = new String JavaDoc(outstream.toByteArray(),
358                 ((HttpMethodBase) lastMethod).getResponseCharSet());
359         return response;
360     }
361
362     /**
363      * Execute the method and gets the response as a xml document.
364      *
365      * @param method The method
366      * @param url The url called by the method, only useful for error reporting
367      */

368     public Document JavaDoc getDocument(String JavaDoc url) throws Exception JavaDoc {
369         // Safety: some web sites will block if the url tries to open certain paths
370
if (url.endsWith("/browse/")) {
371             if (url.indexOf("netbeans.org") >= 0) {
372                 throw new Exception JavaDoc("This url " + url + " doesn't work on Netbeans.org");
373             }
374             if (url.indexOf("dev.java.net") >= 0) {
375                 throw new Exception JavaDoc("This url " + url + " doesn't work on dev.java.net");
376             }
377         }
378         return getDocument(new GetMethod(url), url);
379     }
380     
381     /**
382      * Execute the method and gets the response as a xml document.
383      *
384      * @param method The method
385      * @param url The url called by the method, only useful for error reporting
386      */

387     public Document JavaDoc getDocument(HttpMethod method, String JavaDoc url) throws Exception JavaDoc {
388         String JavaDoc response = getResponse(method, url);
389         return getDocumentFromSource(response);
390     }
391
392     public Document JavaDoc getDocumentFromSource(String JavaDoc docSource) throws Exception JavaDoc {
393         // Hack to kill namespaces in xhtml
394
int pos = 0;
395         do {
396            pos = docSource.indexOf("xmlns", pos);
397            if (pos > 0) {
398                int eq = docSource.indexOf('=', pos);
399                int lt = docSource.indexOf('<', pos);
400                int gt = docSource.indexOf('>', pos);
401                if (eq > 0 && eq < gt && gt < lt) {
402                    docSource = docSource.substring(0, pos) + docSource.substring(gt);
403                }
404            }
405         } while (pos > 0);
406
407         XMLInputSource source = new XMLInputSource(null, null, null, new StringReader JavaDoc(docSource), null);
408
409         _htmlParser.parse(source);
410
411         Document JavaDoc doc = _htmlParser.getDocument();
412         return doc;
413     }
414
415     public void loadFile(String JavaDoc url, File JavaDoc destFile) throws Exception JavaDoc {
416         loadFile(new GetMethod(url), destFile, url);
417     }
418     
419     public void loadFile(HttpMethod method, File JavaDoc destFile, String JavaDoc url) throws Exception JavaDoc {
420         HttpMethod lastMethod = executeMethod(method, url);
421         String JavaDoc contentEncoding = null;
422         if (lastMethod.getResponseHeader("Content-Encoding") != null) {
423             contentEncoding = lastMethod.getResponseHeader("Content-Encoding").getValue();
424         }
425         try {
426             FileOutputStream JavaDoc out = null;
427             InputStream JavaDoc in = new BufferedInputStream JavaDoc(lastMethod.getResponseBodyAsStream());
428             if (contentEncoding != null && contentEncoding.toLowerCase().indexOf("gzip") >= 0) {
429                 in = new GZIPInputStream JavaDoc(lastMethod.getResponseBodyAsStream());
430             }
431             try {
432                 out = new FileOutputStream JavaDoc(destFile);
433
434                 byte[] buffer = new byte[8 * 1024];
435                 int count = 0;
436                 do {
437                     out.write(buffer, 0, count);
438                     count = in.read(buffer, 0, buffer.length);
439                 } while (count != -1);
440             } finally {
441                 if (out != null) {
442                     out.close();
443                 }
444                 if (in != null) {
445                     in.close();
446                 }
447             }
448         } finally {
449             lastMethod.releaseConnection();
450         }
451     }
452
453 }
454
Popular Tags