package net.nutch.tools;

import java.io.*;
import java.net.*;
import java.util.*;
import java.text.*;
import java.util.logging.*;

import net.nutch.io.*;
import net.nutch.db.*;
import net.nutch.fs.*;
import net.nutch.util.*;
import net.nutch.fetcher.*;
import net.nutch.indexer.*;

/**
 * Command-line driver that performs a complete crawl by invoking the other
 * Nutch tools in sequence: it creates a web db, injects the root URLs, runs
 * <code>depth</code> rounds of generate/fetch/update, then regenerates a
 * single segment, fetches it again, and indexes, de-duplicates and merges
 * the result.
 */
public class CrawlTool {
  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.tools.CrawlTool");

  /** Command-line synopsis printed when the arguments are unusable. */
  private static final String USAGE =
    "Usage: CrawlTool (-local | -ndfs <nameserver:port>) <root_url_file> [-dir d] [-threads n] [-depth i] [-showThreadID]";

  static {
    // Register the crawl-specific configuration resource with NutchConf.
    NutchConf.addConfResource("crawl-tool.xml");
  }

  /**
   * Returns the current time formatted as <code>yyyyMMddHHmmss</code>; used
   * to build the default crawl directory name.
   */
  private static String getDate() {
    return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
  }

  /**
   * Returns the path of the entry in <code>segmentsDir</code> whose name is
   * lexicographically greatest.
   * NOTE(review): presumably segment names are timestamps, which would make
   * this the most recently generated segment -- confirm against FetchListTool.
   *
   * @param nfs          file system used to list the directory
   * @param segmentsDir  directory that holds the segments
   * @return path of the entry with the greatest name
   * @throws IOException if the directory cannot be listed or contains no
   *                     entries (previously this surfaced as a
   *                     NullPointerException)
   */
  private static String getLatestSegment(NutchFileSystem nfs, String segmentsDir) throws IOException {
    File[] allSegmentFiles = nfs.listFiles(new File(segmentsDir));
    if (allSegmentFiles == null || allSegmentFiles.length == 0) {
      throw new IOException("No segments found in " + segmentsDir);
    }

    File bestSegment = allSegmentFiles[0];
    for (int i = 1; i < allSegmentFiles.length; i++) {
      if (bestSegment.getName().compareTo(allSegmentFiles[i].getName()) < 0) {
        bestSegment = allSegmentFiles[i];
      }
    }
    return bestSegment.getPath();
  }

  /**
   * Returns an argument array for a sub-tool with the file system selector
   * placed in front of <code>items</code>.
   *
   * @param fs          "-ndfs" or "-local"; for any other value
   *                    <code>items</code> is returned unchanged
   * @param nameserver  name server argument, only used when fs is "-ndfs"
   * @param items       the tool-specific arguments
   * @return a new array starting with the file system arguments, or
   *         <code>items</code> itself when fs is not recognized
   */
  private static String[] prependFileSystem(String fs, String nameserver, String[] items) {
    String[] results = null;
    if ("-ndfs".equals(fs)) {
      results = new String[items.length + 2];
      results[0] = fs;
      results[1] = nameserver;
      System.arraycopy(items, 0, results, 2, items.length);
    } else if ("-local".equals(fs)) {
      results = new String[items.length + 1];
      results[0] = fs;
      System.arraycopy(items, 0, results, 1, items.length);
    } else {
      results = items;
    }
    return results;
  }

  /**
   * Runs a full crawl.
   *
   * <p>Arguments: <code>(-local | -ndfs &lt;nameserver:port&gt;)
   * &lt;root_url_file&gt; [-dir d] [-threads n] [-depth i]
   * [-showThreadID]</code>. Prints the usage line and returns without
   * touching the file system contents when the arguments are incomplete.
   *
   * @throws RuntimeException if the crawl directory already exists
   * @throws Exception        anything thrown by the invoked tools
   */
  public static void main(String args[]) throws Exception {
    if (args.length < 1) {
      System.out.println(USAGE);
      return;
    }

    String fs = "-local";
    String nameserver = "";
    if ("-ndfs".equals(args[0])) {
      if (args.length < 2) {                      // "-ndfs" needs a name server
        System.out.println(USAGE);
        return;
      }
      fs = "-ndfs";
      nameserver = args[1];
    }
    NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0);
    try {
      String rootUrlFile = null;
      String dir = new File("crawl-" + getDate()).getCanonicalFile().getName();
      int threads = NutchConf.getInt("fetcher.threads.fetch", 10);
      int depth = 5;

      for (int i = 0; i < args.length; i++) {
        String arg = args[i];
        if ("-dir".equals(arg) || "-threads".equals(arg) || "-depth".equals(arg)) {
          if (i + 1 >= args.length) {             // option given without a value
            System.out.println(USAGE);
            return;
          }
          String value = args[++i];
          if ("-dir".equals(arg)) {
            dir = value;
          } else if ("-threads".equals(arg)) {
            threads = Integer.parseInt(value);
          } else {
            depth = Integer.parseInt(value);
          }
        } else if ("-showThreadID".equals(arg)) {
          // Recognized so it is not mistaken for the root URL file; the
          // value is not forwarded to any tool by this class.
        } else if (arg != null) {
          // NOTE(review): null entries are skipped -- presumably
          // NutchFileSystem.parseArgs nulls the arguments it consumed; confirm.
          rootUrlFile = arg;
        }
      }

      // Fail before creating anything: without this check the db would be
      // created and the injector would then be handed a null url file.
      if (rootUrlFile == null) {
        System.out.println(USAGE);
        return;
      }

      if (nfs.exists(new File(dir))) {
        throw new RuntimeException (dir + " already exists.");
      }

      LOG.info("crawl started in: " + dir);
      LOG.info("rootUrlFile = " + rootUrlFile);
      LOG.info("threads = " + threads);
      LOG.info("depth = " + depth);

      String db = new File(dir + "/db").getCanonicalPath();
      String segments = new File(dir + "/segments").getCanonicalPath();

      // create the db and seed it with the root urls
      WebDBAdminTool.main(prependFileSystem(fs, nameserver, new String[] { db, "-create" }));
      WebDBInjector.main(prependFileSystem(fs, nameserver, new String[] { db, "-urlfile", rootUrlFile }));

      // one generate / fetch / update round per depth level
      for (int i = 0; i < depth; i++) {
        FetchListTool.main(prependFileSystem(fs, nameserver, new String[] { db, segments }));
        String roundSegment = getLatestSegment(nfs, segments);
        Fetcher.main(prependFileSystem(fs, nameserver, new String[] { "-threads", "" + threads, roundSegment }));
        UpdateDatabaseTool.main(prependFileSystem(fs, nameserver, new String[] { db, roundSegment }));
      }

      // Discard the per-round segments and build a single new one from the db.
      // NOTE(review): "-adddays Integer.MAX_VALUE" presumably makes every page
      // in the db eligible for the fetch list -- confirm against FetchListTool.
      FileUtil.fullyDelete(nfs, new File(segments));

      FetchListTool.main(prependFileSystem(fs, nameserver, new String[] { db, segments, "-adddays", "" + Integer.MAX_VALUE }));

      String segment = getLatestSegment(nfs, segments);

      // fetch the regenerated segment
      Fetcher.main(prependFileSystem(fs, nameserver, new String[] { "-threads", "" + threads, segment }));

      // index, remove duplicates, and merge into <dir>/index
      File workDir = new File(dir, "workdir");
      IndexSegment.main(prependFileSystem(fs, nameserver, new String[] { segment, "-dir", workDir.getPath() }));
      DeleteDuplicates.main(prependFileSystem(fs, nameserver, new String[] { segments }));
      IndexMerger.main(prependFileSystem(fs, nameserver, new String[] { new File(dir + "/index").getCanonicalPath(), segment }));

      LOG.info("crawl finished: " + dir);
    } finally {
      nfs.close();
    }
  }
}