|                                                                                                              1
 24
 25
 27  package weblech.spider;
 28
 29  import org.apache.log4j.Category;
 30
 31  import java.net.HttpURLConnection
  ; 32  import java.net.URL
  ; 33  import java.net.Authenticator
  ; 34  import java.io.*;
 35
 36  import weblech.util.Log4j;
 37
 38  public class URLGetter
 39  {
 40      private final static Category _logClass = Category.getInstance(URLGetter.class);
 41
 42      static
 43      {
 44          Log4j.init();
 45      }
 46
 47      private int failureCount = 0;
 48
 49      private final SpiderConfig config;
 50
 51      public URLGetter(SpiderConfig config)
 52      {
 53          _logClass.debug("URLGetter()");
 54          this.config = config;
 55
 56          Authenticator.setDefault(new DumbAuthenticator(config.getBasicAuthUser(), config.getBasicAuthPassword()));
 57      }
 58
 59      public URLObject getURL(URLToDownload url)
 60      {
 61          _logClass.debug("getURL(" + url + ")");
 62
 63          if(failureCount > 10)
 64          {
 65              _logClass.warn("Lots of failures recently, waiting 5 seconds before attempting download");
 66              try { Thread.sleep(5 * 1000); } catch(InterruptedException
  e) { }; 67              failureCount = 0;
 68          }
 69
 70          URL
  requestedURL = url.getURL(); 71          URL
  referer = url.getReferer(); 72
 73          try
 74          {
 75              _logClass.debug("Creating HTTP connection to " + requestedURL);
 76              HttpURLConnection
  conn = (HttpURLConnection  ) requestedURL.openConnection(); 77              if(referer != null)
 78              {
 79                  _logClass.debug("Setting Referer header to " + referer);
 80                  conn.setRequestProperty("Referer", referer.toExternalForm());
 81              }
 82
 83              if(config.getUserAgent() != null)
 84              {
 85                  _logClass.debug("Setting User-Agent to " + config.getUserAgent());
 86                  conn.setRequestProperty("User-Agent", config.getUserAgent());
 87              }
 88
 89              conn.setUseCaches(false);
 90
 91              _logClass.debug("Opening URL");
 92              long startTime = System.currentTimeMillis();
 93              conn.connect();
 94
 95              String
  resp = conn.getResponseMessage(); 96              _logClass.debug("Remote server response: " + resp);
 97
 98              String
  respStr = conn.getHeaderField(0); 99              _logClass.info("Server response: " + respStr);
 100
 101             for(int i = 1; ; i++)
 102             {
 103                 String
  key = conn.getHeaderFieldKey(i); 104                 if(key == null)
 105                 {
 106                     break;
 107                 }
 108                 String
  value = conn.getHeaderField(key); 109                 _logClass.debug("Received header " + key + ": " + value);
 110             }
 111
 112             _logClass.debug("Getting buffered input stream from remote connection");
 113             BufferedInputStream remoteBIS = new BufferedInputStream(conn.getInputStream());
 114             ByteArrayOutputStream baos = new ByteArrayOutputStream(10240);
 115             byte[] buf = new byte[1024];
 116             int bytesRead = 0;
 117             while(bytesRead >= 0)
 118             {
 119                 baos.write(buf, 0, bytesRead);
 120                 bytesRead = remoteBIS.read(buf);
 121             }
 122
 123             byte[] content = baos.toByteArray();
 124             long timeTaken = System.currentTimeMillis() - startTime;
 125             if(timeTaken < 100) timeTaken = 500;
 126
 127             int bytesPerSec = (int) ((double) content.length / ((double)timeTaken / 1000.0));
 128             _logClass.info("Downloaded " + content.length + " bytes, " + bytesPerSec + " bytes/sec");
 129             if(content.length < conn.getContentLength())
 130             {
 131                 _logClass.warn("Didn't download full content for URL: " + url);
 132                 failureCount++;
 133                 return null;
 134             }
 135             return new URLObject(requestedURL, conn.getContentType(), content, config);
 136         }
 137     catch(FileNotFoundException fnfe) {
 138         _logClass.warn("File not found: " + fnfe.getMessage());
 139         return null;
 140     }
 141         catch(IOException ioe)
 142         {
 143             _logClass.warn("Caught IO Exception: " + ioe.getMessage(), ioe);
 144             failureCount++;
 145             return null;
 146         }
 147     }
 148 }
 149
                                                                                                                                                                                                             |                                                                       
 
 
 
 
 
                                                                                   Popular Tags                                                                                                                                                                                              |