1 24 package org.archive.crawler.fetcher; 25 26 import java.io.ByteArrayInputStream ; 27 import java.io.ByteArrayOutputStream ; 28 import java.io.IOException ; 29 import java.io.InputStream ; 30 import java.net.InetAddress ; 31 import java.net.UnknownHostException ; 32 import java.util.logging.Level ; 33 import java.util.logging.Logger ; 34 import java.util.regex.Matcher ; 35 36 import org.apache.commons.httpclient.URIException; 37 import org.archive.crawler.datamodel.CoreAttributeConstants; 38 import org.archive.crawler.datamodel.CrawlHost; 39 import org.archive.crawler.datamodel.CrawlURI; 40 import org.archive.crawler.datamodel.FetchStatusCodes; 41 import org.archive.crawler.framework.Processor; 42 import org.archive.crawler.settings.SimpleType; 43 import org.archive.util.ArchiveUtils; 44 import org.archive.util.HttpRecorder; 45 import org.archive.util.InetAddressUtil; 46 import org.xbill.DNS.ARecord; 47 import org.xbill.DNS.DClass; 48 import org.xbill.DNS.Lookup; 49 import org.xbill.DNS.Record; 50 import org.xbill.DNS.ResolverConfig; 51 import org.xbill.DNS.TextParseException; 52 import org.xbill.DNS.Type; 53 54 55 62 public class FetchDNS extends Processor 63 implements CoreAttributeConstants, FetchStatusCodes { 64 private static final long serialVersionUID = 4686199203459704426L; 65 66 private Logger logger = Logger.getLogger(this.getClass().getName()); 67 68 private short ClassType = DClass.IN; 70 private short TypeType = Type.A; 71 protected InetAddress serverInetAddr = null; 72 73 private static final String ATTR_ACCEPT_NON_DNS_RESOLVES = 74 "accept-non-dns-resolves"; 75 private static final Boolean DEFAULT_ACCEPT_NON_DNS_RESOLVES = 76 Boolean.FALSE; 77 private static final long DEFAULT_TTL_FOR_NON_DNS_RESOLVES 78 = 6 * 60 * 60; 80 private byte [] reusableBuffer = new byte[1024]; 81 82 87 public FetchDNS(String name) { 88 super(name, "DNS Fetcher. Handles DNS lookups."); 89 org.archive.crawler.settings.Type e = 90 addElementToDefinition(new SimpleType(ATTR_ACCEPT_NON_DNS_RESOLVES, 91 "If a DNS lookup fails, whether or not to fallback to " + 92 "InetAddress resolution, which may use local 'hosts' files " + 93 "or other mechanisms.", DEFAULT_ACCEPT_NON_DNS_RESOLVES)); 94 e.setExpertSetting(true); 95 e = addElementToDefinition(new SimpleType(FetchHTTP.ATTR_SHA1_CONTENT, 96 "Whether or not to perform an on-the-fly SHA1 hash of" + 97 "retrieved content-bodies.", 98 FetchHTTP.DEFAULT_SHA1_CONTENT)); 99 e.setExpertSetting(true); 100 } 101 102 protected void innerProcess(CrawlURI curi) { 103 if (!curi.getUURI().getScheme().equals("dns")) { 104 return; 106 } 107 Record[] rrecordSet = null; String dnsName = null; 109 try { 110 dnsName = curi.getUURI().getReferencedHost(); 111 } catch (URIException e) { 112 logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e); 113 } 114 115 if(dnsName == null) { 116 curi.setFetchStatus(S_UNFETCHABLE_URI); 117 return; 118 } 119 120 CrawlHost targetHost = null; 123 if (getController() != null && 124 getController().getServerCache() != null) { 125 targetHost = getController().getServerCache().getHostFor(dnsName); 126 } else { 127 targetHost = new CrawlHost(dnsName); 129 } 130 if (isQuadAddress(curi, dnsName, targetHost)) { 131 return; 133 } 134 135 curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis()); 137 138 try { 141 rrecordSet = (new Lookup(dnsName, TypeType, ClassType)).run(); 142 } catch (TextParseException e) { 143 rrecordSet = null; 144 } 145 curi.setContentType("text/dns"); 146 if (rrecordSet != null) { 147 if (logger.isLoggable(Level.FINE)) { 148 logger.fine("Found recordset for " + dnsName); 149 } 150 storeDNSRecord(curi, dnsName, targetHost, rrecordSet); 151 } else { 152 if (logger.isLoggable(Level.FINE)) { 153 logger.fine("Failed find of recordset for " + dnsName); 154 } 155 if (((Boolean )getUncheckedAttribute(null, 156 ATTR_ACCEPT_NON_DNS_RESOLVES)).booleanValue()) { 157 InetAddress address = null; 159 try { 160 address = InetAddress.getByName(dnsName); 161 } catch (UnknownHostException e1) { 162 address = null; 163 } 164 if (address != null) { 165 targetHost.setIP(address, DEFAULT_TTL_FOR_NON_DNS_RESOLVES); 166 curi.setFetchStatus(S_GETBYNAME_SUCCESS); 167 if (logger.isLoggable(Level.FINE)) { 168 logger.fine("Found address for " + dnsName + 169 " using native dns."); 170 } 171 } else { 172 if (logger.isLoggable(Level.FINE)) { 173 logger.fine("Failed find of address for " + dnsName + 174 " using native dns."); 175 } 176 setUnresolvable(curi, targetHost); 177 } 178 } else { 179 setUnresolvable(curi, targetHost); 180 } 181 } 182 curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis()); 183 } 184 185 protected void storeDNSRecord(final CrawlURI curi, final String dnsName, 186 final CrawlHost targetHost, final Record[] rrecordSet) { 187 ARecord arecord = getFirstARecord(rrecordSet); 190 if (arecord == null) { 191 throw new NullPointerException ("Got null arecord for " + 192 dnsName); 193 } 194 targetHost.setIP(arecord.getAddress(), arecord.getTTL()); 195 try { 196 recordDNS(curi, rrecordSet); 197 curi.setFetchStatus(S_DNS_SUCCESS); 198 curi.putString(A_DNS_SERVER_IP_LABEL, ResolverConfig.getCurrentConfig().server()); 199 } catch (IOException e) { 200 logger.log(Level.SEVERE, "Failed store of DNS Record for " + 201 curi.toString(), e); 202 setUnresolvable(curi, targetHost); 203 } 204 } 205 206 protected boolean isQuadAddress(final CrawlURI curi, final String dnsName, 207 final CrawlHost targetHost) { 208 boolean result = false; 209 Matcher matcher = InetAddressUtil.IPV4_QUADS.matcher(dnsName); 210 if (matcher == null || !matcher.matches()) { 212 return result; 213 } 214 215 result = true; 216 if (logger.isLoggable(Level.WARNING)) { 219 logger.warning("Unnecessary DNS CrawlURI created: " + curi); 220 } 221 try { 222 targetHost.setIP(InetAddress.getByAddress(dnsName, new byte[] { 223 (byte) (new Integer (matcher.group(1)).intValue()), 224 (byte) (new Integer (matcher.group(2)).intValue()), 225 (byte) (new Integer (matcher.group(3)).intValue()), 226 (byte) (new Integer (matcher.group(4)).intValue()) }), 227 CrawlHost.IP_NEVER_EXPIRES); curi.setFetchStatus(S_DNS_SUCCESS); 229 } catch (UnknownHostException e) { 230 logger.log(Level.SEVERE, "Should never be " + e.getMessage(), e); 231 setUnresolvable(curi, targetHost); 232 } 233 return result; 234 } 235 236 protected void recordDNS(final CrawlURI curi, final Record[] rrecordSet) 237 throws IOException { 238 final byte[] dnsRecord = 239 getDNSRecord(curi.getLong(A_FETCH_BEGAN_TIME), rrecordSet); 240 HttpRecorder rec = HttpRecorder.getHttpRecorder(); 241 boolean sha1Content = ((Boolean )getUncheckedAttribute(curi, 243 FetchHTTP.ATTR_SHA1_CONTENT)).booleanValue(); 244 if(sha1Content) { 245 rec.getRecordedInput().setSha1Digest(); 246 } else { 247 rec.getRecordedInput().setDigest(null); 248 } 249 curi.setHttpRecorder(rec); 250 InputStream is = curi.getHttpRecorder().inputWrap( 251 new ByteArrayInputStream (dnsRecord)); 252 try { 255 while (is.read(this.reusableBuffer) != -1) { 256 continue; 257 } 258 } finally { 259 is.close(); 260 rec.closeRecorders(); 261 } 262 curi.setContentSize(dnsRecord.length); 263 curi.setContentDigest(FetchHTTP.SHA1, 264 rec.getRecordedInput().getDigestValue()); 265 } 266 267 protected byte [] getDNSRecord(final long fetchStart, 268 final Record[] rrecordSet) 269 throws IOException { 270 ByteArrayOutputStream baos = new ByteArrayOutputStream (); 271 byte[] fetchDate = ArchiveUtils.get14DigitDate(fetchStart).getBytes(); 273 baos.write(fetchDate); 274 baos.write("\n".getBytes()); 276 int recordLength = fetchDate.length + 1; 277 if (rrecordSet != null) { 278 for (int i = 0; i < rrecordSet.length; i++) { 279 byte[] record = rrecordSet[i].toString().getBytes(); 280 recordLength += record.length; 281 baos.write(record); 282 baos.write("\n".getBytes()); 284 recordLength += 1; 285 } 286 } 287 return baos.toByteArray(); 288 } 289 290 protected void setUnresolvable(CrawlURI curi, CrawlHost host) { 291 host.setIP(null, 0); 292 curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE); 293 } 294 295 protected ARecord getFirstARecord(Record[] rrecordSet) { 296 ARecord arecord = null; 297 if (rrecordSet == null || rrecordSet.length == 0) { 298 if (logger.isLoggable(Level.FINEST)) { 299 logger.finest("rrecordSet is null or zero length: " + 300 rrecordSet); 301 } 302 return arecord; 303 } 304 for (int i = 0; i < rrecordSet.length; i++) { 305 if (rrecordSet[i].getType() != Type.A) { 306 if (logger.isLoggable(Level.FINEST)) { 307 logger.finest("Record " + Integer.toString(i) + 308 " is not A type but " + rrecordSet[i].getType()); 309 } 310 continue; 311 } 312 arecord = (ARecord) rrecordSet[i]; 313 break; 314 } 315 return arecord; 316 } 317 } | Popular Tags |