/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * FetchDNS
 * Created on Jun 5, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/fetcher/FetchDNS.java,v 1.29.4.1 2007/01/13 01:31:17 stack-sf Exp $
 */

package org.archive.crawler.fetcher;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.util.ArchiveUtils;
import org.archive.util.HttpRecorder;
import org.archive.util.InetAddressUtil;
import org.xbill.DNS.ARecord;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;
import org.xbill.DNS.Record;
import org.xbill.DNS.ResolverConfig;
import org.xbill.DNS.TextParseException;
import org.xbill.DNS.Type;

/**
 * Processor to resolve 'dns:' URIs.
 *
 * TODO: Refactor to use org.archive.util.DNSJavaUtils.
 *
 * @author multiple
 */
public class FetchDNS extends Processor
implements CoreAttributeConstants, FetchStatusCodes {
    private static final long serialVersionUID = 4686199203459704426L;

    private Logger logger = Logger.getLogger(this.getClass().getName());

    // Defaults.
    private short ClassType = DClass.IN;
    private short TypeType = Type.A;
    protected InetAddress serverInetAddr = null;

    private static final String ATTR_ACCEPT_NON_DNS_RESOLVES =
        "accept-non-dns-resolves";
    private static final Boolean DEFAULT_ACCEPT_NON_DNS_RESOLVES =
        Boolean.FALSE;
    private static final long DEFAULT_TTL_FOR_NON_DNS_RESOLVES
        = 6 * 60 * 60; // 6 hrs

    private byte[] reusableBuffer = new byte[1024];

    /**
     * Create a new instance of FetchDNS.
     *
     * @param name the name of this attribute.
     */
    public FetchDNS(String name) {
        super(name, "DNS Fetcher. Handles DNS lookups.");
        org.archive.crawler.settings.Type e =
            addElementToDefinition(new SimpleType(ATTR_ACCEPT_NON_DNS_RESOLVES,
                "If a DNS lookup fails, whether or not to fall back to " +
                "InetAddress resolution, which may use local 'hosts' files " +
                "or other mechanisms.", DEFAULT_ACCEPT_NON_DNS_RESOLVES));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(FetchHTTP.ATTR_SHA1_CONTENT,
            "Whether or not to perform an on-the-fly SHA1 hash of " +
            "retrieved content-bodies.",
            FetchHTTP.DEFAULT_SHA1_CONTENT));
        e.setExpertSetting(true);
    }

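    /**
     * Resolve the host referenced by a 'dns:' CrawlURI.
     * Non-dns schemes are ignored. Names that are already dotted-quad IPs are
     * short-circuited via isQuadAddress(); otherwise a dnsjava lookup is run
     * and, if that fails and accept-non-dns-resolves is true, a plain
     * InetAddress.getByName() lookup is tried as a fallback.
     *
     * @param curi the 'dns:' CrawlURI to process.
     */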
    protected void innerProcess(CrawlURI curi) {
        if (!curi.getUURI().getScheme().equals("dns")) {
            // Only handles dns
            return;
        }
        Record[] rrecordSet = null; // Retrieved dns records
        String dnsName = null;
        try {
            dnsName = curi.getUURI().getReferencedHost();
        } catch (URIException e) {
            logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e);
        }

        if (dnsName == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            return;
        }

        // Make sure we're in "normal operating mode", e.g. a cache +
        // controller exist to assist us.
        CrawlHost targetHost = null;
        if (getController() != null &&
                getController().getServerCache() != null) {
            targetHost = getController().getServerCache().getHostFor(dnsName);
        } else {
            // Standalone operation (mostly for test cases/potential other uses)
            targetHost = new CrawlHost(dnsName);
        }
        if (isQuadAddress(curi, dnsName, targetHost)) {
            // We're done processing.
            return;
        }

        // Do actual DNS lookup.
        curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());

        // Try to get the records for this host (assume domain name).
        // TODO: Bug #935119 concerns potential hang here.
        try {
            rrecordSet = (new Lookup(dnsName, TypeType, ClassType)).run();
        } catch (TextParseException e) {
            rrecordSet = null;
        }
        curi.setContentType("text/dns");
        if (rrecordSet != null) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Found recordset for " + dnsName);
            }
            storeDNSRecord(curi, dnsName, targetHost, rrecordSet);
        } else {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Failed find of recordset for " + dnsName);
            }
            if (((Boolean)getUncheckedAttribute(null,
                    ATTR_ACCEPT_NON_DNS_RESOLVES)).booleanValue()) {
                // Do lookup that bypasses dnsjava.
                InetAddress address = null;
                try {
                    address = InetAddress.getByName(dnsName);
                } catch (UnknownHostException e1) {
                    address = null;
                }
                if (address != null) {
                    targetHost.setIP(address, DEFAULT_TTL_FOR_NON_DNS_RESOLVES);
                    curi.setFetchStatus(S_GETBYNAME_SUCCESS);
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Found address for " + dnsName +
                            " using native dns.");
                    }
                } else {
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Failed find of address for " + dnsName +
                            " using native dns.");
                    }
                    setUnresolvable(curi, targetHost);
                }
            } else {
                setUnresolvable(curi, targetHost);
            }
        }
        curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
    }

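    /**
     * Update the target host with IP and TTL from the first A record, write
     * the full record set via recordDNS(), and mark the CrawlURI a DNS
     * success (or unresolvable if the write fails).
     *
     * @param curi CrawlURI being processed.
     * @param dnsName hostname that was looked up.
     * @param targetHost CrawlHost to update with the resolved IP.
     * @param rrecordSet records returned by the lookup.
     */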
    protected void storeDNSRecord(final CrawlURI curi, final String dnsName,
            final CrawlHost targetHost, final Record[] rrecordSet) {
        // Get TTL and IP info from the first A record (there may be
        // multiple, e.g. www.washington.edu) then update the CrawlHost.
        ARecord arecord = getFirstARecord(rrecordSet);
        if (arecord == null) {
            throw new NullPointerException("Got null arecord for " +
                dnsName);
        }
        targetHost.setIP(arecord.getAddress(), arecord.getTTL());
        try {
            recordDNS(curi, rrecordSet);
            curi.setFetchStatus(S_DNS_SUCCESS);
            curi.putString(A_DNS_SERVER_IP_LABEL,
                ResolverConfig.getCurrentConfig().server());
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Failed store of DNS Record for " +
                curi.toString(), e);
            setUnresolvable(curi, targetHost);
        }
    }

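    /**
     * Check whether the passed name is already a dotted-quad IPv4 address.
     * If it is, the host IP is set directly (never expiring) and no DNS
     * lookup is needed.
     *
     * @param curi CrawlURI being processed.
     * @param dnsName hostname or IP string taken from the CrawlURI.
     * @param targetHost CrawlHost to update if the name is an IP.
     * @return true if dnsName was a dotted-quad IP and was handled here.
     */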
    protected boolean isQuadAddress(final CrawlURI curi, final String dnsName,
            final CrawlHost targetHost) {
        boolean result = false;
        Matcher matcher = InetAddressUtil.IPV4_QUADS.matcher(dnsName);
        // If it's an IP, no need to do a lookup.
        if (matcher == null || !matcher.matches()) {
            return result;
        }

        result = true;
        // Ideally this branch would never be reached: no CrawlURI
        // would be created for numerical IPs.
        if (logger.isLoggable(Level.WARNING)) {
            logger.warning("Unnecessary DNS CrawlURI created: " + curi);
        }
        try {
            targetHost.setIP(InetAddress.getByAddress(dnsName, new byte[] {
                    (byte) (new Integer(matcher.group(1)).intValue()),
                    (byte) (new Integer(matcher.group(2)).intValue()),
                    (byte) (new Integer(matcher.group(3)).intValue()),
                    (byte) (new Integer(matcher.group(4)).intValue()) }),
                    CrawlHost.IP_NEVER_EXPIRES); // Never expire numeric IPs.
            curi.setFetchStatus(S_DNS_SUCCESS);
        } catch (UnknownHostException e) {
            logger.log(Level.SEVERE, "Should never be " + e.getMessage(), e);
            setUnresolvable(curi, targetHost);
        }
        return result;
    }

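    /**
     * Write the serialized record set through an HttpRecorder so the DNS
     * response is captured like any other downloaded content, optionally
     * computing a SHA1 digest over it.
     *
     * @param curi CrawlURI being processed.
     * @param rrecordSet records to serialize and record.
     * @throws IOException on failure writing to the recorder.
     */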
    protected void recordDNS(final CrawlURI curi, final Record[] rrecordSet)
    throws IOException {
        final byte[] dnsRecord =
            getDNSRecord(curi.getLong(A_FETCH_BEGAN_TIME), rrecordSet);
        HttpRecorder rec = HttpRecorder.getHttpRecorder();
        // Shall we get a digest on the content downloaded?
        boolean sha1Content = ((Boolean)getUncheckedAttribute(curi,
            FetchHTTP.ATTR_SHA1_CONTENT)).booleanValue();
        if (sha1Content) {
            rec.getRecordedInput().setSha1Digest();
        } else {
            rec.getRecordedInput().setDigest(null);
        }
        curi.setHttpRecorder(rec);
        InputStream is = curi.getHttpRecorder().inputWrap(
                new ByteArrayInputStream(dnsRecord));
        // Reading from the wrapped stream, behind the scenes, will write
        // files into scratch space.
        try {
            while (is.read(this.reusableBuffer) != -1) {
                continue;
            }
        } finally {
            is.close();
            rec.closeRecorders();
        }
        curi.setContentSize(dnsRecord.length);
        curi.setContentDigest(FetchHTTP.SHA1,
            rec.getRecordedInput().getDigestValue());
    }

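    /**
     * Serialize the record set as a 14-digit timestamp followed by one
     * textual resource record per line.
     *
     * @param fetchStart time the lookup began, used for the leading timestamp.
     * @param rrecordSet records to serialize (may be null).
     * @return the serialized bytes.
     * @throws IOException on failure writing into the buffer.
     */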
    protected byte[] getDNSRecord(final long fetchStart,
            final Record[] rrecordSet)
    throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // Start the record with a 14-digit date per RFC 2540.
        byte[] fetchDate = ArchiveUtils.get14DigitDate(fetchStart).getBytes();
        baos.write(fetchDate);
        // Don't forget the newline.
        baos.write("\n".getBytes());
        int recordLength = fetchDate.length + 1;
        if (rrecordSet != null) {
            for (int i = 0; i < rrecordSet.length; i++) {
                byte[] record = rrecordSet[i].toString().getBytes();
                recordLength += record.length;
                baos.write(record);
                // Add the newline between records back in.
                baos.write("\n".getBytes());
                recordLength += 1;
            }
        }
        return baos.toByteArray();
    }

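    /**
     * Mark the host as having no known IP and flag the CrawlURI as a
     * domain-unresolvable failure.
     *
     * @param curi CrawlURI being processed.
     * @param host CrawlHost whose IP is cleared.
     */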
    protected void setUnresolvable(CrawlURI curi, CrawlHost host) {
        host.setIP(null, 0);
        curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE);
    }

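    /**
     * Return the first A record in the passed record set, or null if the
     * set is null, empty, or contains no A records.
     *
     * @param rrecordSet records to scan.
     * @return first A record found, else null.
     */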
    protected ARecord getFirstARecord(Record[] rrecordSet) {
        ARecord arecord = null;
        if (rrecordSet == null || rrecordSet.length == 0) {
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("rrecordSet is null or zero length: " +
                    rrecordSet);
            }
            return arecord;
        }
        for (int i = 0; i < rrecordSet.length; i++) {
            if (rrecordSet[i].getType() != Type.A) {
                if (logger.isLoggable(Level.FINEST)) {
                    logger.finest("Record " + Integer.toString(i) +
                        " is not A type but " + rrecordSet[i].getType());
                }
                continue;
            }
            arecord = (ARecord) rrecordSet[i];
            break;
        }
        return arecord;
    }
}