1 24 25 27 package weblech.spider; 28 29 import org.apache.log4j.Category; 30 31 import java.net.HttpURLConnection ; 32 import java.net.URL ; 33 import java.net.Authenticator ; 34 import java.io.*; 35 36 import weblech.util.Log4j; 37 38 public class URLGetter 39 { 40 private final static Category _logClass = Category.getInstance(URLGetter.class); 41 42 static 43 { 44 Log4j.init(); 45 } 46 47 private int failureCount = 0; 48 49 private final SpiderConfig config; 50 51 public URLGetter(SpiderConfig config) 52 { 53 _logClass.debug("URLGetter()"); 54 this.config = config; 55 56 Authenticator.setDefault(new DumbAuthenticator(config.getBasicAuthUser(), config.getBasicAuthPassword())); 57 } 58 59 public URLObject getURL(URLToDownload url) 60 { 61 _logClass.debug("getURL(" + url + ")"); 62 63 if(failureCount > 10) 64 { 65 _logClass.warn("Lots of failures recently, waiting 5 seconds before attempting download"); 66 try { Thread.sleep(5 * 1000); } catch(InterruptedException e) { }; 67 failureCount = 0; 68 } 69 70 URL requestedURL = url.getURL(); 71 URL referer = url.getReferer(); 72 73 try 74 { 75 _logClass.debug("Creating HTTP connection to " + requestedURL); 76 HttpURLConnection conn = (HttpURLConnection ) requestedURL.openConnection(); 77 if(referer != null) 78 { 79 _logClass.debug("Setting Referer header to " + referer); 80 conn.setRequestProperty("Referer", referer.toExternalForm()); 81 } 82 83 if(config.getUserAgent() != null) 84 { 85 _logClass.debug("Setting User-Agent to " + config.getUserAgent()); 86 conn.setRequestProperty("User-Agent", config.getUserAgent()); 87 } 88 89 conn.setUseCaches(false); 90 91 _logClass.debug("Opening URL"); 92 long startTime = System.currentTimeMillis(); 93 conn.connect(); 94 95 String resp = conn.getResponseMessage(); 96 _logClass.debug("Remote server response: " + resp); 97 98 String respStr = conn.getHeaderField(0); 99 _logClass.info("Server response: " + respStr); 100 101 for(int i = 1; ; i++) 102 { 103 String key = conn.getHeaderFieldKey(i); 104 if(key == null) 105 { 106 break; 107 } 108 String value = conn.getHeaderField(key); 109 _logClass.debug("Received header " + key + ": " + value); 110 } 111 112 _logClass.debug("Getting buffered input stream from remote connection"); 113 BufferedInputStream remoteBIS = new BufferedInputStream(conn.getInputStream()); 114 ByteArrayOutputStream baos = new ByteArrayOutputStream(10240); 115 byte[] buf = new byte[1024]; 116 int bytesRead = 0; 117 while(bytesRead >= 0) 118 { 119 baos.write(buf, 0, bytesRead); 120 bytesRead = remoteBIS.read(buf); 121 } 122 123 byte[] content = baos.toByteArray(); 124 long timeTaken = System.currentTimeMillis() - startTime; 125 if(timeTaken < 100) timeTaken = 500; 126 127 int bytesPerSec = (int) ((double) content.length / ((double)timeTaken / 1000.0)); 128 _logClass.info("Downloaded " + content.length + " bytes, " + bytesPerSec + " bytes/sec"); 129 if(content.length < conn.getContentLength()) 130 { 131 _logClass.warn("Didn't download full content for URL: " + url); 132 failureCount++; 133 return null; 134 } 135 return new URLObject(requestedURL, conn.getContentType(), content, config); 136 } 137 catch(FileNotFoundException fnfe) { 138 _logClass.warn("File not found: " + fnfe.getMessage()); 139 return null; 140 } 141 catch(IOException ioe) 142 { 143 _logClass.warn("Caught IO Exception: " + ioe.getMessage(), ioe); 144 failureCount++; 145 return null; 146 } 147 } 148 } 149 | Popular Tags |