KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > fetcher > FetchFTP


1 /* FetchFTP.java
2  *
3  * $Id: FetchFTP.java,v 1.9 2006/09/05 22:39:52 paul_jack Exp $
4  *
5  * Created on Jun 5, 2003
6  *
7  * Copyright (C) 2003 Internet Archive.
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  */

25 package org.archive.crawler.fetcher;
26
27
28 import java.io.IOException JavaDoc;
29 import java.io.UnsupportedEncodingException JavaDoc;
30 import java.net.Socket JavaDoc;
31 import java.net.URLEncoder JavaDoc;
32 import java.util.logging.Level JavaDoc;
33 import java.util.logging.Logger JavaDoc;
34 import java.util.regex.Matcher JavaDoc;
35 import java.util.regex.Pattern JavaDoc;
36
37 import javax.management.AttributeNotFoundException JavaDoc;
38
39 import org.apache.commons.httpclient.URIException;
40 import org.apache.commons.net.ftp.FTPCommand;
41 import org.archive.crawler.datamodel.CrawlURI;
42 import org.archive.crawler.datamodel.CoreAttributeConstants;
43 import org.archive.crawler.datamodel.FetchStatusCodes;
44 import org.archive.crawler.extractor.Link;
45 import static org.archive.crawler.extractor.Link.NAVLINK_HOP;
46 import static org.archive.crawler.extractor.Link.NAVLINK_MISC;
47 import org.archive.crawler.framework.Processor;
48 import org.archive.crawler.settings.SimpleType;
49 import org.archive.io.RecordingInputStream;
50 import org.archive.io.ReplayCharSequence;
51 import org.archive.net.ClientFTP;
52 import org.archive.net.FTPException;
53 import org.archive.net.UURI;
54 import org.archive.util.ArchiveUtils;
55 import org.archive.util.HttpRecorder;
56
57
58 /**
59  * Fetches documents and directory listings using FTP. This class will also
60  * try to extract FTP "links" from directory listings. For this class to
61  * archive a directory listing, the remote FTP server must support the NLIST
62  * command. Most modern FTP servers should.
63  *
64  * @author pjack
65  *
66  */

67 public class FetchFTP extends Processor implements CoreAttributeConstants {
68
69     
70     /** Serialization ID; robust against trivial API changes. */
71     private static final long serialVersionUID =
72      ArchiveUtils.classnameBasedUID(FetchFTP.class,1);
73
74     /** Logger for this class. */
75     private static Logger JavaDoc logger = Logger.getLogger(FetchFTP.class.getName());
76
77     /** Pattern for matching directory entries. */
78     private static Pattern JavaDoc DIR =
79      Pattern.compile("(.+)$", Pattern.MULTILINE);
80
81     
82     /** The name for the <code>username</code> attribute. */
83     final public static String JavaDoc ATTR_USERNAME = "username";
84    
85     /** The description for the <code>username</code> attribute. */
86     final private static String JavaDoc DESC_USERNAME = "The username to send to " +
87      "FTP servers. By convention, the default value of \"anonymous\" is " +
88      "used for publicly available FTP sites.";
89     
90     /** The default value for the <code>username</code> attribute. */
91     final private static String JavaDoc DEFAULT_USERNAME = "anonymous";
92
93
94     /** The name for the <code>password</code> attribute. */
95     final public static String JavaDoc ATTR_PASSWORD = "password";
96    
97     /** The description for the <code>password</code> attribute. */
98     final private static String JavaDoc DESC_PASSWORD = "The password to send to " +
99     "FTP servers. By convention, anonymous users send their email address " +
100     "in this field.";
101     
102     /** The default value for the <code>password</code> attribute. */
103     final private static String JavaDoc DEFAULT_PASSWORD = "";
104
105     
106     /** The name for the <code>extract-from-dirs</code> attribute. */
107     final private static String JavaDoc ATTR_EXTRACT = "extract-from-dirs";
108     
109     /** The description for the <code>extract-from-dirs</code> attribute. */
110     final private static String JavaDoc DESC_EXTRACT = "Set to true to extract "
111      + "further URIs from FTP directories. Default is true.";
112     
113     /** The default value for the <code>extract-from-dirs</code> attribute. */
114     final private static boolean DEFAULT_EXTRACT = true;
115
116     
117     /** The name for the <code>extract-parent</code> attribute. */
118     final private static String JavaDoc ATTR_EXTRACT_PARENT = "extract_parent";
119     
120     /** The description for the <code>extract-parent</code> attribute. */
121     final private static String JavaDoc DESC_EXTRACT_PARENT = "Set to true to extract "
122      + "the parent URI from all FTP URIs. Default is true.";
123     
124     /** The default value for the <code>extract-parent</code> attribute. */
125     final private static boolean DEFAULT_EXTRACT_PARENT = true;
126     
127     
128     /** The name for the <code>max-length-bytes</code> attribute. */
129     final public static String JavaDoc ATTR_MAX_LENGTH = "max-length-bytes";
130     
131     /** The description for the <code>max-length-bytes</code> attribute. */
132     final private static String JavaDoc DESC_MAX_LENGTH =
133         "Maximum length in bytes to fetch.\n" +
134         "Fetch is truncated at this length. A value of 0 means no limit.";
135     
136     /** The default value for the <code>max-length-bytes</code> attribute. */
137     final private static long DEFAULT_MAX_LENGTH = 0;
138
139     
140     /** The name for the <code>fetch-bandwidth</code> attribute. */
141     final public static String JavaDoc ATTR_BANDWIDTH = "fetch-bandwidth";
142     
143     /** The description for the <code>fetch-bandwidth</code> attribute. */
144     final private static String JavaDoc DESC_BANDWIDTH = "";
145     
146     /** The default value for the <code>fetch-bandwidth</code> attribute. */
147     final private static int DEFAULT_BANDWIDTH = 0;
148     
149     
150     /** The name for the <code>timeout-seconds</code> attribute. */
151     final public static String JavaDoc ATTR_TIMEOUT = "timeout-seconds";
152     
153     /** The description for the <code>timeout-seconds</code> attribute. */
154     final private static String JavaDoc DESC_TIMEOUT = "If the fetch is not "
155      + "completed in this number of seconds, give up (and retry later).";
156     
157     /** The default value for the <code>timeout-seconds</code> attribute. */
158     final private static int DEFAULT_TIMEOUT = 1200;
159     
160
161     /**
162      * Constructs a new <code>FetchFTP</code>.
163      *
164      * @param name the name of this processor
165      */

166     public FetchFTP(String JavaDoc name) {
167         super(name, "FTP Fetcher.");
168         add(ATTR_USERNAME, DESC_USERNAME, DEFAULT_USERNAME);
169         add(ATTR_PASSWORD, DESC_PASSWORD, DEFAULT_PASSWORD);
170         add(ATTR_EXTRACT, DESC_EXTRACT, DEFAULT_EXTRACT);
171         add(ATTR_EXTRACT_PARENT, DESC_EXTRACT_PARENT, DEFAULT_EXTRACT_PARENT);
172         add(ATTR_MAX_LENGTH, DESC_MAX_LENGTH, DEFAULT_MAX_LENGTH);
173         add(ATTR_BANDWIDTH, DESC_BANDWIDTH, DEFAULT_BANDWIDTH);
174         add(ATTR_TIMEOUT, DESC_TIMEOUT, DEFAULT_TIMEOUT);
175     }
176
177     
178     /**
179      * Convenience method for adding an attribute.
180      *
181      * @param name The name of the attribute
182      * @param desc The description of the attribute
183      * @param def The default value for the attribute
184      */

185     private void add(String JavaDoc name, String JavaDoc desc, Object JavaDoc def) {
186         SimpleType st = new SimpleType(name, desc, def);
187         addElementToDefinition(st);
188     }
189     
190     
191     /**
192      * Convenience method for extracting an attribute.
193      * If a value for the specified name cannot be found,
194      * a warning is written to the log and the specified
195      * default value is returned instead.
196      *
197      * @param context The context for the attribute fetch
198      * @param name The name of the attribute to fetch
199      * @param def The value to return if the attribute isn't found
200      * @return The value of that attribute
201      */

202     private Object JavaDoc get(Object JavaDoc context, String JavaDoc name, Object JavaDoc def) {
203         try {
204             return getAttribute(context, name);
205         } catch (AttributeNotFoundException JavaDoc e) {
206             logger.warning("Attribute not found (using default): " + name);
207             return def;
208         }
209     }
210     
211
212     /**
213      * Processes the given URI. If the given URI is not an FTP URI, then
214      * this method does nothing. Otherwise an attempt is made to connect
215      * to the FTP server.
216      *
217      * <p>If the connection is successful, an attempt will be made to CD to
218      * the path specified in the URI. If the remote CD command succeeds,
219      * then it is assumed that the URI represents a directory. If the
220      * CD command fails, then it is assumed that the URI represents
221      * a file.
222      *
223      * <p>For directories, the directory listing will be fetched using
224      * the FTP LIST command, and saved to the HttpRecorder. If the
225      * <code>extract.from.dirs</code> attribute is set to true, then
226      * the files in the fetched list will be added to the curi as
227      * extracted FTP links. (It was easier to do that here, rather
228      * than writing a separate FTPExtractor.)
229      *
230      * <p>For files, the file will be fetched using the FTP RETR
231      * command, and saved to the HttpRecorder.
232      *
233      * <p>All file transfers (including directory listings) occur using
234      * Binary mode transfer. Also, the local passive transfer mode
235      * is always used, to play well with firewalls.
236      *
237      * @param curi the curi to process
238      * @throws InterruptedException if the thread is interrupted during
239      * processing
240      */

241     public void innerProcess(CrawlURI curi) throws InterruptedException JavaDoc {
242         if (!curi.getUURI().getScheme().equals("ftp")) {
243             return;
244         }
245         
246         curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());
247         HttpRecorder recorder = HttpRecorder.getHttpRecorder();
248         ClientFTP client = new ClientFTP();
249         
250         try {
251             fetch(curi, client, recorder);
252         } catch (FTPException e) {
253             logger.log(Level.SEVERE, "FTP server reported problem.", e);
254             curi.setFetchStatus(e.getReplyCode());
255         } catch (IOException JavaDoc e) {
256             logger.log(Level.SEVERE, "IO Error during FTP fetch.", e);
257             curi.setFetchStatus(FetchStatusCodes.S_CONNECT_LOST);
258         } finally {
259             disconnect(client);
260             curi.setContentSize(recorder.getRecordedInput().getSize());
261             curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
262         }
263     }
264
265
266     /**
267      * Fetches a document from an FTP server.
268      *
269      * @param curi the URI of the document to fetch
270      * @param client the FTPClient to use for the fetch
271      * @param recorder the recorder to preserve the document in
272      * @throws IOException if a network or protocol error occurs
273      * @throws InterruptedException if the thread is interrupted
274      */

275     private void fetch(CrawlURI curi, ClientFTP client, HttpRecorder recorder)
276     throws IOException JavaDoc, InterruptedException JavaDoc {
277         // Connect to the FTP server.
278
UURI uuri = curi.getUURI();
279         int port = uuri.getPort();
280         if (port == -1) {
281             port = 21;
282         }
283         client.connectStrict(uuri.getHost(), port);
284         
285         // Authenticate.
286
String JavaDoc[] auth = getAuth(curi);
287         client.loginStrict(auth[0], auth[1]);
288         
289         // The given resource may or may not be a directory.
290
// To figure out which is which, execute a CD command to
291
// the UURI's path. If CD works, it's a directory.
292
boolean dir = client.changeWorkingDirectory(uuri.getPath());
293         if (dir) {
294             curi.setContentType("text/plain");
295         }
296         
297         // TODO: A future version of this class could use the system string to
298
// set up custom directory parsing if the FTP server doesn't support
299
// the nlist command.
300
if (logger.isLoggable(Level.FINE)) {
301             String JavaDoc system = client.getSystemName();
302             logger.fine(system);
303         }
304         
305         // Get a data socket. This will either be the result of a NLIST
306
// command for a directory, or a RETR command for a file.
307
int command = dir ? FTPCommand.NLST : FTPCommand.RETR;
308         String JavaDoc path = dir ? "." : uuri.getPath();
309         client.enterLocalPassiveMode();
310         client.setBinary();
311         Socket JavaDoc socket = client.openDataConnection(command, path);
312         curi.setFetchStatus(client.getReplyCode());
313
314         // Save the streams in the CURI, where downstream processors
315
// expect to find them.
316
try {
317             saveToRecorder(curi, socket, recorder);
318         } finally {
319             recorder.close();
320             close(socket);
321         }
322
323         curi.setFetchStatus(200);
324         if (dir) {
325             extract(curi, recorder);
326         }
327         addParent(curi);
328     }
329     
330     
331     /**
332      * Saves the given socket to the given recorder.
333      *
334      * @param curi the curi that owns the recorder
335      * @param socket the socket whose streams to save
336      * @param recorder the recorder to save them to
337      * @throws IOException if a network or file error occurs
338      * @throws InterruptedException if the thread is interrupted
339      */

340     private void saveToRecorder(CrawlURI curi,
341             Socket JavaDoc socket, HttpRecorder recorder)
342     throws IOException JavaDoc, InterruptedException JavaDoc {
343         curi.setHttpRecorder(recorder);
344         recorder.markContentBegin();
345         recorder.inputWrap(socket.getInputStream());
346         recorder.outputWrap(socket.getOutputStream());
347
348         // Read the remote file/dir listing in its entirety.
349
long softMax = 0;
350         long hardMax = getMaxLength(curi);
351         long timeout = (long)getTimeout(curi) * 1000;
352         int maxRate = getFetchBandwidth(curi);
353         RecordingInputStream input = recorder.getRecordedInput();
354         input.readFullyOrUntil(softMax, hardMax, timeout, maxRate);
355     }
356     
357     
358     /**
359      * Extract FTP links in a directory listing.
360      * The listing must already be saved to the given recorder.
361      *
362      * @param curi The curi to save extracted links to
363      * @param recorder The recorder containing the directory listing
364      */

365     private void extract(CrawlURI curi, HttpRecorder recorder) {
366         if (!getExtractFromDirs(curi)) {
367             return;
368         }
369         
370         ReplayCharSequence seq = null;
371         try {
372             seq = recorder.getReplayCharSequence();
373             extract(curi, seq);
374         } catch (IOException JavaDoc e) {
375             logger.log(Level.SEVERE, "IO error during extraction.", e);
376         } catch (RuntimeException JavaDoc e) {
377             logger.log(Level.SEVERE, "IO error during extraction.", e);
378         } finally {
379             close(seq);
380         }
381     }
382     
383     
384     /**
385      * Extracts FTP links in a directory listing.
386      *
387      * @param curi The curi to save extracted links to
388      * @param dir The directory listing to extract links from
389      * @throws URIException if an extracted link is invalid
390      */

391     private void extract(CrawlURI curi, ReplayCharSequence dir) {
392         logger.log(Level.FINEST, "Extracting URIs from FTP directory.");
393         Matcher JavaDoc matcher = DIR.matcher(dir);
394         while (matcher.find()) {
395             String JavaDoc file = matcher.group(1);
396             addExtracted(curi, file);
397         }
398     }
399
400
401     /**
402      * Adds an extracted filename to the curi. A new URI will be formed
403      * by taking the given curi (which should represent the directory the
404      * file lives in) and appending the file.
405      *
406      * @param curi the curi to store the discovered link in
407      * @param file the filename of the discovered link
408      */

409     private void addExtracted(CrawlURI curi, String JavaDoc file) {
410         try {
411             file = URLEncoder.encode(file, "UTF-8");
412         } catch (UnsupportedEncodingException JavaDoc e) {
413             throw new AssertionError JavaDoc(e);
414         }
415         if (logger.isLoggable(Level.FINEST)) {
416             logger.log(Level.FINEST, "Found " + file);
417         }
418         String JavaDoc base = curi.toString();
419         if (base.endsWith("/")) {
420             base = base.substring(0, base.length() - 1);
421         }
422         try {
423             UURI n = new UURI(base + "/" + file, true);
424             Link link = new Link(curi.getUURI(), n, NAVLINK_MISC, NAVLINK_HOP);
425             curi.addOutLink(link);
426         } catch (URIException e) {
427             logger.log(Level.WARNING, "URI error during extraction.", e);
428         }
429     }
430     
431
432     /**
433      * Extracts the parent URI from the given curi, then adds that parent
434      * URI as a discovered link to the curi.
435      *
436      * <p>If the <code>extract-parent</code> attribute is false, then this
437      * method does nothing. Also, if the path of the given curi is
438      * <code>/</code>, then this method does nothing.
439      *
440      * <p>Otherwise the parent is determined by eliminated the lowest part
441      * of the URI's path. Eg, the parent of <code>ftp://foo.com/one/two</code>
442      * is <code>ftp://foo.com/one</code>.
443      *
444      * @param curi the curi whose parent to add
445      */

446     private void addParent(CrawlURI curi) {
447         if (!getExtractParent(curi)) {
448             return;
449         }
450         UURI uuri = curi.getUURI();
451         try {
452             if (uuri.getPath().equals("/")) {
453                 // There's no parent to add.
454
return;
455             }
456             String JavaDoc scheme = uuri.getScheme();
457             String JavaDoc auth = uuri.getEscapedAuthority();
458             String JavaDoc path = uuri.getEscapedCurrentHierPath();
459             UURI parent = new UURI(scheme + "://" + auth + path, false);
460
461             Link link = new Link(uuri, parent, NAVLINK_MISC, NAVLINK_HOP);
462             curi.addOutLink(link);
463         } catch (URIException e) {
464             logger.log(Level.WARNING, "URI error during extraction.", e);
465         }
466     }
467     
468     
469     /**
470      * Returns the <code>extract.from.dirs</code> attribute for this
471      * <code>FetchFTP</code> and the given curi.
472      *
473      * @param curi the curi whose attribute to return
474      * @return that curi's <code>extract.from.dirs</code>
475      */

476     public boolean getExtractFromDirs(CrawlURI curi) {
477         return (Boolean JavaDoc)get(curi, ATTR_EXTRACT, DEFAULT_EXTRACT);
478     }
479     
480     
481     /**
482      * Returns the <code>extract.parent</code> attribute for this
483      * <code>FetchFTP</code> and the given curi.
484      *
485      * @param curi the curi whose attribute to return
486      * @return that curi's <code>extract-parent</code>
487      */

488     public boolean getExtractParent(CrawlURI curi) {
489         return (Boolean JavaDoc)get(curi, ATTR_EXTRACT_PARENT, DEFAULT_EXTRACT_PARENT);
490     }
491
492
493     /**
494      * Returns the <code>timeout-seconds</code> attribute for this
495      * <code>FetchFTP</code> and the given curi.
496      *
497      * @param curi the curi whose attribute to return
498      * @return that curi's <code>timeout-seconds</code>
499      */

500     public int getTimeout(CrawlURI curi) {
501         return (Integer JavaDoc)get(curi, ATTR_TIMEOUT, DEFAULT_TIMEOUT);
502     }
503
504
505     /**
506      * Returns the <code>max-length-bytes</code> attribute for this
507      * <code>FetchFTP</code> and the given curi.
508      *
509      * @param curi the curi whose attribute to return
510      * @return that curi's <code>max-length-bytes</code>
511      */

512     public long getMaxLength(CrawlURI curi) {
513         return (Long JavaDoc)get(curi, ATTR_MAX_LENGTH, DEFAULT_MAX_LENGTH);
514     }
515
516
517     /**
518      * Returns the <code>fetch-bandwidth</code> attribute for this
519      * <code>FetchFTP</code> and the given curi.
520      *
521      * @param curi the curi whose attribute to return
522      * @return that curi's <code>fetch-bandwidth</code>
523      */

524     public int getFetchBandwidth(CrawlURI curi) {
525         return (Integer JavaDoc)get(curi, ATTR_BANDWIDTH, DEFAULT_BANDWIDTH);
526     }
527
528
529     /**
530      * Returns the username and password for the given URI. This method
531      * always returns an array of length 2. The first element in the returned
532      * array is the username for the URI, and the second element is the
533      * password.
534      *
535      * <p>If the URI itself contains the username and password (i.e., it looks
536      * like <code>ftp://username:password@host/path</code>) then that username
537      * and password are returned.
538      *
539      * <p>Otherwise the settings system is probed for the <code>username</code>
540      * and <code>password</code> attributes for this <code>FTPFetch</code>
541      * and the given <code>curi</code> context. The values of those
542      * attributes are then returned.
543      *
544      * @param curi the curi whose username and password to return
545      * @return an array containing the username and password
546      */

547     private String JavaDoc[] getAuth(CrawlURI curi) {
548         String JavaDoc[] result = new String JavaDoc[2];
549         UURI uuri = curi.getUURI();
550         String JavaDoc userinfo;
551         try {
552             userinfo = uuri.getUserinfo();
553         } catch (URIException e) {
554             assert false;
555             logger.finest("getUserinfo raised URIException.");
556             userinfo = null;
557         }
558         if (userinfo != null) {
559             int p = userinfo.indexOf(':');
560             if (p > 0) {
561                 result[0] = userinfo.substring(0,p);
562                 result[1] = userinfo.substring(p + 1);
563                 return result;
564             }
565         }
566         result[0] = (String JavaDoc)get(curi, ATTR_USERNAME, DEFAULT_USERNAME);
567         result[1] = (String JavaDoc)get(curi, ATTR_PASSWORD, DEFAULT_PASSWORD);
568         return result;
569     }
570     
571     
572     /**
573      * Determines the password for the given URI. If the URI itself contains
574      * a password, then that password is returned. Otherwise the settings
575      * system is probed for the <code>password</code> attribute, and the value
576      * for that attribute is returned.
577      *
578      * @param curi the curi whose password to return
579      * @return that password
580      */

581     public String JavaDoc determinePassword(CrawlURI curi) {
582         return (String JavaDoc)get(curi, ATTR_PASSWORD, DEFAULT_PASSWORD);
583     }
584
585
586     /**
587      * Quietly closes the given socket.
588      *
589      * @param socket the socket to close
590      */

591     private static void close(Socket JavaDoc socket) {
592         try {
593             socket.close();
594         } catch (IOException JavaDoc e) {
595             logger.log(Level.WARNING, "IO error closing socket.", e);
596         }
597     }
598
599
600     /**
601      * Quietly closes the given sequence.
602      * If an IOException is raised, this method logs it as a warning.
603      *
604      * @param seq the sequence to close
605      */

606     private static void close(ReplayCharSequence seq) {
607         if (seq == null) {
608             return;
609         }
610         try {
611             seq.close();
612         } catch (IOException JavaDoc e) {
613             logger.log(Level.WARNING, "IO error closing ReplayCharSequence.",
614              e);
615         }
616     }
617
618     
619     /**
620      * Quietly disconnects from the given FTP client.
621      * If an IOException is raised, this method logs it as a warning.
622      *
623      * @param client the client to disconnect
624      */

625     private static void disconnect(ClientFTP client) {
626         if (client.isConnected()) try {
627             client.disconnect();
628         } catch (IOException JavaDoc e) {
629             if (logger.isLoggable(Level.WARNING)) {
630                 logger.warning("Could not disconnect from FTP client: "
631                  + e.getMessage());
632             }
633         }
634     }
635
636
637 }
638
Popular Tags