1 package net.matuschek.jobo; 2 3 6 7 import java.io.File ; 8 import java.net.URL ; 9 10 import net.matuschek.getopt.GetOpt; 11 import net.matuschek.http.HttpDocToFile; 12 import net.matuschek.http.SystemOutHttpToolCallback; 13 import net.matuschek.spider.WebRobot; 14 15 import org.apache.log4j.Category; 16 import org.apache.log4j.PropertyConfigurator; 17 import org.apache.log4j.BasicConfigurator; 18 19 25 public class JoBo { 26 27 protected static Category log = Category.getInstance(""); 28 29 public static void printUsage() { 30 final String usageInfo = 31 "command line options: \n"+ 32 " [-r http://...] set start referer (default \"-\")\n"+ 33 " [-d maxdepth] set maximal search depth (default 0)\n"+ 34 " [-o] allow walk to other hosts (default no)\n"+ 35 " [-s directory] directory to store retrieved documents\n"+ 36 " (default \".\")\n"+ 37 " [-m minsize] store only files larger then this size in bytes\n"+ 38 " (default 0)\n"+ 39 " [-a agentName] set user agent name\n"+ 40 " (default \"JoBo\")\n"+ 41 " [-i] ignore robots.txt\n"+ 42 " [-w seconds] wait n seconds after retrieving a file to limit\n"+ 43 " load on the remote server (default 60)\n"+ 44 " [-v] verbose mode, useful, if something is wrong\n"+ 45 " with the XML configuration\n"+ 46 " url start URL"; 47 48 System.out.println(usageInfo+"\n\n"); 49 } 50 51 52 55 public static void initializeLogging() { 56 final String configfile = "logging.conf"; 57 58 59 File f=new File (configfile); 60 if (f.exists()) { 61 PropertyConfigurator.configure(configfile); 63 } else { 64 BasicConfigurator.configure(); 65 } 66 } 67 68 69 70 public static void main(String [] argv) 71 throws Exception 72 { 73 String basedir="."; 74 int minSize=0; 75 76 initializeLogging(); 77 78 if (argv.length<1) { 79 printUsage(); 80 return; 81 } 82 83 GetOpt opt = new GetOpt(argv); 85 String option = null; 86 87 JoBoBase jobobase = JoBoBase.createFromXML(); 88 WebRobot robby = jobobase.getRobot(); 89 90 option=opt.getOptionString("r"); 92 if (option != null) { 93 robby.setStartReferer(option); 94 } 95 96 option=opt.getOptionString("d"); 98 if (option != null) { 99 try { 100 int maxDepth=Integer.parseInt(option); 101 robby.setMaxDepth(maxDepth); 102 } catch (NumberFormatException e) { 103 System.out.println("Wrong number for maxDepth: "+option); 104 } 105 } 106 107 if (opt.getOptionBoolean("o")) { 109 robby.setWalkToOtherHosts(true); 110 } 111 112 option=opt.getOptionString("s"); 114 if (option != null) { 115 basedir=option; 116 } 117 118 option=opt.getOptionString("m"); 120 if (option != null) { 121 try { 122 minSize=Integer.parseInt(option); 123 } catch (NumberFormatException e) {} 124 } 125 126 option=opt.getOptionString("a"); 128 if (option != null) { 129 robby.setAgentName(option); 130 } 131 132 if (opt.getOptionBoolean("i")) { 134 robby.setIgnoreRobotsTxt(true); 135 } 136 137 option=opt.getOptionString("w"); 139 if (option != null) { 140 try { 141 int waitTime=Integer.parseInt(option); 142 robby.setSleepTime(waitTime*1000); 143 } catch (NumberFormatException e) {} 144 } 145 146 if (opt.getOptionBoolean("?")) { 148 printUsage(); 149 return; 150 } 151 152 URL u = new URL (argv[argv.length-1]); 153 154 HttpDocToFile docStore=new HttpDocToFile(basedir); 155 docStore.setMinFileSize(minSize); 156 157 SystemOutHttpToolCallback statusInfo = new SystemOutHttpToolCallback(); 158 159 robby.setStartURL(u); 160 robby.setDocManager(docStore); 161 robby.setHttpToolCallback(statusInfo); 162 163 robby.run(); 164 165 } 166 } 167 | Popular Tags |