// Extracted from KickJava (Java API By Example): net.nutch.tools.CrawlTool

/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.tools;

import java.io.*;
import java.net.*;
import java.text.*;
import java.util.*;
import java.util.logging.*;

import net.nutch.db.*;
import net.nutch.fetcher.*;
import net.nutch.fs.*;
import net.nutch.indexer.*;
import net.nutch.io.*;
import net.nutch.util.*;
/*
 */

21 public class CrawlTool {
22   public static final Logger LOG =
23     LogFormatter.getLogger("net.nutch.tools.CrawlTool");
24
25   static {
26     NutchConf.addConfResource("crawl-tool.xml");
27   }
28
29   /** Returns a string representing the current date and time that also sorts
30    * lexicographically by date. */

31   private static String JavaDoc getDate() {
32     return new SimpleDateFormat("yyyyMMddHHmmss").format
33       (new Date(System.currentTimeMillis()));
34   }
35
36   /** Returns the pathname of the latest segment in a segments directory. */
37   private static String JavaDoc getLatestSegment(NutchFileSystem nfs, String JavaDoc segmentsDir) throws IOException {
38       File bestSegment = null;
39       File[] allSegmentFiles = nfs.listFiles(new File(segmentsDir));
40       for (int i = 0; i < allSegmentFiles.length; i++) {
41           String JavaDoc name = allSegmentFiles[i].getName();
42           if (bestSegment == null || bestSegment.getName().compareTo(name) < 0) {
43               bestSegment = allSegmentFiles[i];
44           }
45       }
46       return bestSegment.getPath();
47   }
48
49   /**
50    * Useful in constructing a command-line for other tools
51    */

52   private static String JavaDoc[] prependFileSystem(String JavaDoc fs, String JavaDoc nameserver, String JavaDoc[] items) {
53       String JavaDoc[] results = null;
54       if ("-ndfs".equals(fs)) {
55           results = new String JavaDoc[items.length + 2];
56           results[0] = fs;
57           results[1] = nameserver;
58           System.arraycopy(items, 0, results, 2, items.length);
59       } else if ("-local".equals(fs)) {
60           results = new String JavaDoc[items.length + 1];
61           results[0] = fs;
62           System.arraycopy(items, 0, results, 1, items.length);
63       } else {
64           results = items;
65       }
66       return results;
67   }
68
69   /* Perform complete crawling and indexing given a set of root urls. */
70   public static void main(String JavaDoc args[]) throws Exception JavaDoc {
71     if (args.length < 1) {
72       System.out.println("Usage: CrawlTool (-local | -ndfs <nameserver:port>) <root_url_file> [-dir d] [-threads n] [-depth i] [-showThreadID]");
73       return;
74     }
75
76     String JavaDoc fs = "-local";
77     String JavaDoc nameserver = "";
78     if ("-ndfs".equals(args[0])) {
79         fs = "-ndfs";
80         nameserver = args[1];
81     }
82     NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0);
83     try {
84         String JavaDoc rootUrlFile = null;
85         String JavaDoc dir = new File("crawl-" + getDate()).getCanonicalFile().getName();
86         int threads = NutchConf.getInt("fetcher.threads.fetch", 10);
87         int depth = 5;
88         boolean showThreadID = false;
89
90         for (int i = 0; i < args.length; i++) {
91             if ("-dir".equals(args[i])) {
92                 dir = args[i+1];
93                 i++;
94             } else if ("-threads".equals(args[i])) {
95                 threads = Integer.parseInt(args[i+1]);
96                 i++;
97             } else if ("-depth".equals(args[i])) {
98                 depth = Integer.parseInt(args[i+1]);
99                 i++;
100             } else if ("-showThreadID".equals(args[i])) {
101                 showThreadID = true;
102             } else if (args[i] != null) {
103                 rootUrlFile = args[i];
104             }
105         }
106
107         if (nfs.exists(new File(dir))) {
108             throw new RuntimeException JavaDoc(dir + " already exists.");
109         }
110
111         LOG.info("crawl started in: " + dir);
112         LOG.info("rootUrlFile = " + rootUrlFile);
113         LOG.info("threads = " + threads);
114         LOG.info("depth = " + depth);
115
116         String JavaDoc db = new File(dir + "/db").getCanonicalPath();
117         String JavaDoc segments = new File(dir + "/segments").getCanonicalPath();
118
119         // initialize the web database
120
WebDBAdminTool.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { db, "-create"}));
121         WebDBInjector.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { db, "-urlfile", rootUrlFile }));
122
123         for (int i = 0; i < depth; i++) {
124             // generate a new segment
125
FetchListTool.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { db, segments } ));
126             String JavaDoc segment = getLatestSegment(nfs, segments);
127             Fetcher.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { "-threads", ""+threads, segment } ));
128             UpdateDatabaseTool.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { db, segment } ));
129         }
130
131         // Re-fetch everything to get the complete set of incoming anchor texts
132
// associated with each page. We should fix this, so that we can update
133
// the previously fetched segments with the anchors that are now in the
134
// database, but until that algorithm is written, we re-fetch.
135

136         // delete all the old segment data
137
FileUtil.fullyDelete(nfs, new File(segments));
138
139         // generate a single segment containing all pages in the db
140
FetchListTool.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { db, segments, "-adddays", "" + Integer.MAX_VALUE } ));
141
142         String JavaDoc segment = getLatestSegment(nfs, segments);
143
144         // re-fetch everything
145
Fetcher.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { "-threads", ""+threads, segment } ));
146
147         // index, dedup & merge
148
File workDir = new File(dir, "workdir");
149         IndexSegment.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { segment, "-dir", workDir.getPath() } ));
150         DeleteDuplicates.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { segments }));
151         IndexMerger.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { new File(dir + "/index").getCanonicalPath(), segment } ));
152
153         LOG.info("crawl finished: " + dir);
154     } finally {
155         nfs.close();
156     }
157   }
158 }
// (site navigation footer removed from extracted source)