1 25 package org.archive.crawler.fetcher; 26 27 28 import java.io.IOException ; 29 import java.io.UnsupportedEncodingException ; 30 import java.net.Socket ; 31 import java.net.URLEncoder ; 32 import java.util.logging.Level ; 33 import java.util.logging.Logger ; 34 import java.util.regex.Matcher ; 35 import java.util.regex.Pattern ; 36 37 import javax.management.AttributeNotFoundException ; 38 39 import org.apache.commons.httpclient.URIException; 40 import org.apache.commons.net.ftp.FTPCommand; 41 import org.archive.crawler.datamodel.CrawlURI; 42 import org.archive.crawler.datamodel.CoreAttributeConstants; 43 import org.archive.crawler.datamodel.FetchStatusCodes; 44 import org.archive.crawler.extractor.Link; 45 import static org.archive.crawler.extractor.Link.NAVLINK_HOP; 46 import static org.archive.crawler.extractor.Link.NAVLINK_MISC; 47 import org.archive.crawler.framework.Processor; 48 import org.archive.crawler.settings.SimpleType; 49 import org.archive.io.RecordingInputStream; 50 import org.archive.io.ReplayCharSequence; 51 import org.archive.net.ClientFTP; 52 import org.archive.net.FTPException; 53 import org.archive.net.UURI; 54 import org.archive.util.ArchiveUtils; 55 import org.archive.util.HttpRecorder; 56 57 58 67 public class FetchFTP extends Processor implements CoreAttributeConstants { 68 69 70 71 private static final long serialVersionUID = 72 ArchiveUtils.classnameBasedUID(FetchFTP.class,1); 73 74 75 private static Logger logger = Logger.getLogger(FetchFTP.class.getName()); 76 77 78 private static Pattern DIR = 79 Pattern.compile("(.+)$", Pattern.MULTILINE); 80 81 82 83 final public static String ATTR_USERNAME = "username"; 84 85 86 final private static String DESC_USERNAME = "The username to send to " + 87 "FTP servers. By convention, the default value of \"anonymous\" is " + 88 "used for publicly available FTP sites."; 89 90 91 final private static String DEFAULT_USERNAME = "anonymous"; 92 93 94 95 final public static String ATTR_PASSWORD = "password"; 96 97 98 final private static String DESC_PASSWORD = "The password to send to " + 99 "FTP servers. By convention, anonymous users send their email address " + 100 "in this field."; 101 102 103 final private static String DEFAULT_PASSWORD = ""; 104 105 106 107 final private static String ATTR_EXTRACT = "extract-from-dirs"; 108 109 110 final private static String DESC_EXTRACT = "Set to true to extract " 111 + "further URIs from FTP directories. Default is true."; 112 113 114 final private static boolean DEFAULT_EXTRACT = true; 115 116 117 118 final private static String ATTR_EXTRACT_PARENT = "extract_parent"; 119 120 121 final private static String DESC_EXTRACT_PARENT = "Set to true to extract " 122 + "the parent URI from all FTP URIs. Default is true."; 123 124 125 final private static boolean DEFAULT_EXTRACT_PARENT = true; 126 127 128 129 final public static String ATTR_MAX_LENGTH = "max-length-bytes"; 130 131 132 final private static String DESC_MAX_LENGTH = 133 "Maximum length in bytes to fetch.\n" + 134 "Fetch is truncated at this length. A value of 0 means no limit."; 135 136 137 final private static long DEFAULT_MAX_LENGTH = 0; 138 139 140 141 final public static String ATTR_BANDWIDTH = "fetch-bandwidth"; 142 143 144 final private static String DESC_BANDWIDTH = ""; 145 146 147 final private static int DEFAULT_BANDWIDTH = 0; 148 149 150 151 final public static String ATTR_TIMEOUT = "timeout-seconds"; 152 153 154 final private static String DESC_TIMEOUT = "If the fetch is not " 155 + "completed in this number of seconds, give up (and retry later)."; 156 157 158 final private static int DEFAULT_TIMEOUT = 1200; 159 160 161 166 public FetchFTP(String name) { 167 super(name, "FTP Fetcher."); 168 add(ATTR_USERNAME, DESC_USERNAME, DEFAULT_USERNAME); 169 add(ATTR_PASSWORD, DESC_PASSWORD, DEFAULT_PASSWORD); 170 add(ATTR_EXTRACT, DESC_EXTRACT, DEFAULT_EXTRACT); 171 add(ATTR_EXTRACT_PARENT, DESC_EXTRACT_PARENT, DEFAULT_EXTRACT_PARENT); 172 add(ATTR_MAX_LENGTH, DESC_MAX_LENGTH, DEFAULT_MAX_LENGTH); 173 add(ATTR_BANDWIDTH, DESC_BANDWIDTH, DEFAULT_BANDWIDTH); 174 add(ATTR_TIMEOUT, DESC_TIMEOUT, DEFAULT_TIMEOUT); 175 } 176 177 178 185 private void add(String name, String desc, Object def) { 186 SimpleType st = new SimpleType(name, desc, def); 187 addElementToDefinition(st); 188 } 189 190 191 202 private Object get(Object context, String name, Object def) { 203 try { 204 return getAttribute(context, name); 205 } catch (AttributeNotFoundException e) { 206 logger.warning("Attribute not found (using default): " + name); 207 return def; 208 } 209 } 210 211 212 241 public void innerProcess(CrawlURI curi) throws InterruptedException { 242 if (!curi.getUURI().getScheme().equals("ftp")) { 243 return; 244 } 245 246 curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis()); 247 HttpRecorder recorder = HttpRecorder.getHttpRecorder(); 248 ClientFTP client = new ClientFTP(); 249 250 try { 251 fetch(curi, client, recorder); 252 } catch (FTPException e) { 253 logger.log(Level.SEVERE, "FTP server reported problem.", e); 254 curi.setFetchStatus(e.getReplyCode()); 255 } catch (IOException e) { 256 logger.log(Level.SEVERE, "IO Error during FTP fetch.", e); 257 curi.setFetchStatus(FetchStatusCodes.S_CONNECT_LOST); 258 } finally { 259 disconnect(client); 260 curi.setContentSize(recorder.getRecordedInput().getSize()); 261 curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis()); 262 } 263 } 264 265 266 275 private void fetch(CrawlURI curi, ClientFTP client, HttpRecorder recorder) 276 throws IOException , InterruptedException { 277 UURI uuri = curi.getUURI(); 279 int port = uuri.getPort(); 280 if (port == -1) { 281 port = 21; 282 } 283 client.connectStrict(uuri.getHost(), port); 284 285 String [] auth = getAuth(curi); 287 client.loginStrict(auth[0], auth[1]); 288 289 boolean dir = client.changeWorkingDirectory(uuri.getPath()); 293 if (dir) { 294 curi.setContentType("text/plain"); 295 } 296 297 if (logger.isLoggable(Level.FINE)) { 301 String system = client.getSystemName(); 302 logger.fine(system); 303 } 304 305 int command = dir ? FTPCommand.NLST : FTPCommand.RETR; 308 String path = dir ? "." : uuri.getPath(); 309 client.enterLocalPassiveMode(); 310 client.setBinary(); 311 Socket socket = client.openDataConnection(command, path); 312 curi.setFetchStatus(client.getReplyCode()); 313 314 try { 317 saveToRecorder(curi, socket, recorder); 318 } finally { 319 recorder.close(); 320 close(socket); 321 } 322 323 curi.setFetchStatus(200); 324 if (dir) { 325 extract(curi, recorder); 326 } 327 addParent(curi); 328 } 329 330 331 340 private void saveToRecorder(CrawlURI curi, 341 Socket socket, HttpRecorder recorder) 342 throws IOException , InterruptedException { 343 curi.setHttpRecorder(recorder); 344 recorder.markContentBegin(); 345 recorder.inputWrap(socket.getInputStream()); 346 recorder.outputWrap(socket.getOutputStream()); 347 348 long softMax = 0; 350 long hardMax = getMaxLength(curi); 351 long timeout = (long)getTimeout(curi) * 1000; 352 int maxRate = getFetchBandwidth(curi); 353 RecordingInputStream input = recorder.getRecordedInput(); 354 input.readFullyOrUntil(softMax, hardMax, timeout, maxRate); 355 } 356 357 358 365 private void extract(CrawlURI curi, HttpRecorder recorder) { 366 if (!getExtractFromDirs(curi)) { 367 return; 368 } 369 370 ReplayCharSequence seq = null; 371 try { 372 seq = recorder.getReplayCharSequence(); 373 extract(curi, seq); 374 } catch (IOException e) { 375 logger.log(Level.SEVERE, "IO error during extraction.", e); 376 } catch (RuntimeException e) { 377 logger.log(Level.SEVERE, "IO error during extraction.", e); 378 } finally { 379 close(seq); 380 } 381 } 382 383 384 391 private void extract(CrawlURI curi, ReplayCharSequence dir) { 392 logger.log(Level.FINEST, "Extracting URIs from FTP directory."); 393 Matcher matcher = DIR.matcher(dir); 394 while (matcher.find()) { 395 String file = matcher.group(1); 396 addExtracted(curi, file); 397 } 398 } 399 400 401 409 private void addExtracted(CrawlURI curi, String file) { 410 try { 411 file = URLEncoder.encode(file, "UTF-8"); 412 } catch (UnsupportedEncodingException e) { 413 throw new AssertionError (e); 414 } 415 if (logger.isLoggable(Level.FINEST)) { 416 logger.log(Level.FINEST, "Found " + file); 417 } 418 String base = curi.toString(); 419 if (base.endsWith("/")) { 420 base = base.substring(0, base.length() - 1); 421 } 422 try { 423 UURI n = new UURI(base + "/" + file, true); 424 Link link = new Link(curi.getUURI(), n, NAVLINK_MISC, NAVLINK_HOP); 425 curi.addOutLink(link); 426 } catch (URIException e) { 427 logger.log(Level.WARNING, "URI error during extraction.", e); 428 } 429 } 430 431 432 446 private void addParent(CrawlURI curi) { 447 if (!getExtractParent(curi)) { 448 return; 449 } 450 UURI uuri = curi.getUURI(); 451 try { 452 if (uuri.getPath().equals("/")) { 453 return; 455 } 456 String scheme = uuri.getScheme(); 457 String auth = uuri.getEscapedAuthority(); 458 String path = uuri.getEscapedCurrentHierPath(); 459 UURI parent = new UURI(scheme + "://" + auth + path, false); 460 461 Link link = new Link(uuri, parent, NAVLINK_MISC, NAVLINK_HOP); 462 curi.addOutLink(link); 463 } catch (URIException e) { 464 logger.log(Level.WARNING, "URI error during extraction.", e); 465 } 466 } 467 468 469 476 public boolean getExtractFromDirs(CrawlURI curi) { 477 return (Boolean )get(curi, ATTR_EXTRACT, DEFAULT_EXTRACT); 478 } 479 480 481 488 public boolean getExtractParent(CrawlURI curi) { 489 return (Boolean )get(curi, ATTR_EXTRACT_PARENT, DEFAULT_EXTRACT_PARENT); 490 } 491 492 493 500 public int getTimeout(CrawlURI curi) { 501 return (Integer )get(curi, ATTR_TIMEOUT, DEFAULT_TIMEOUT); 502 } 503 504 505 512 public long getMaxLength(CrawlURI curi) { 513 return (Long )get(curi, ATTR_MAX_LENGTH, DEFAULT_MAX_LENGTH); 514 } 515 516 517 524 public int getFetchBandwidth(CrawlURI curi) { 525 return (Integer )get(curi, ATTR_BANDWIDTH, DEFAULT_BANDWIDTH); 526 } 527 528 529 547 private String [] getAuth(CrawlURI curi) { 548 String [] result = new String [2]; 549 UURI uuri = curi.getUURI(); 550 String userinfo; 551 try { 552 userinfo = uuri.getUserinfo(); 553 } catch (URIException e) { 554 assert false; 555 logger.finest("getUserinfo raised URIException."); 556 userinfo = null; 557 } 558 if (userinfo != null) { 559 int p = userinfo.indexOf(':'); 560 if (p > 0) { 561 result[0] = userinfo.substring(0,p); 562 result[1] = userinfo.substring(p + 1); 563 return result; 564 } 565 } 566 result[0] = (String )get(curi, ATTR_USERNAME, DEFAULT_USERNAME); 567 result[1] = (String )get(curi, ATTR_PASSWORD, DEFAULT_PASSWORD); 568 return result; 569 } 570 571 572 581 public String determinePassword(CrawlURI curi) { 582 return (String )get(curi, ATTR_PASSWORD, DEFAULT_PASSWORD); 583 } 584 585 586 591 private static void close(Socket socket) { 592 try { 593 socket.close(); 594 } catch (IOException e) { 595 logger.log(Level.WARNING, "IO error closing socket.", e); 596 } 597 } 598 599 600 606 private static void close(ReplayCharSequence seq) { 607 if (seq == null) { 608 return; 609 } 610 try { 611 seq.close(); 612 } catch (IOException e) { 613 logger.log(Level.WARNING, "IO error closing ReplayCharSequence.", 614 e); 615 } 616 } 617 618 619 625 private static void disconnect(ClientFTP client) { 626 if (client.isConnected()) try { 627 client.disconnect(); 628 } catch (IOException e) { 629 if (logger.isLoggable(Level.WARNING)) { 630 logger.warning("Could not disconnect from FTP client: " 631 + e.getMessage()); 632 } 633 } 634 } 635 636 637 } 638 | Popular Tags |