// Extracted from KickJava (Java API By Example): net.nutch.tools.CrawlTool

/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.tools;

import java.io.*;
import java.net.*;
import java.text.*;
import java.util.*;
import java.util.logging.*;

import net.nutch.db.*;
import net.nutch.fetcher.*;
import net.nutch.fs.*;
import net.nutch.indexer.*;
import net.nutch.io.*;
import net.nutch.util.*;
/*
 */

21 public class CrawlTool {
22   public static final Logger LOG =
23     LogFormatter.getLogger("net.nutch.tools.CrawlTool");
24
25   static {
26     NutchConf.addConfResource("crawl-tool.xml");
27   }
28
29   /** Returns a string representing the current date and time that also sorts
30    * lexicographically by date. */

31   private static String JavaDoc getDate() {
32     return new SimpleDateFormat("yyyyMMddHHmmss").format
33       (new Date(System.currentTimeMillis()));
34   }
35
36   /** Returns the pathname of the latest segment in a segments directory. */
37   private static String JavaDoc getLatestSegment(NutchFileSystem nfs, String JavaDoc segmentsDir) throws IOException {
38       File bestSegment = null;
39       File[] allSegmentFiles = nfs.listFiles(new File(segmentsDir));
40       for (int i = 0; i < allSegmentFiles.length; i++) {
41           String JavaDoc name = allSegmentFiles[i].getName();
42           if (bestSegment == null || bestSegment.getName().compareTo(name) < 0) {
43               bestSegment = allSegmentFiles[i];
44           }
45       }
46       return bestSegment.getPath();
47   }
48
49   /**
50    * Useful in constructing a command-line for other tools
51    */

52   private static String JavaDoc[] prependFileSystem(String JavaDoc fs, String JavaDoc nameserver, String JavaDoc[] items) {
53       String JavaDoc[] results = null;
54       if ("-ndfs".equals(fs)) {
55           results = new String JavaDoc[items.length + 2];
56           results[0] = fs;
57           results[1] = nameserver;
58           System.arraycopy(items, 0, results, 2, items.length);
59       } else if ("-local".equals(fs)) {
60           results = new String JavaDoc[items.length + 1];
61           results[0] = fs;
62           System.arraycopy(items, 0, results, 1, items.length);
63       } else {
64           results = items;
65       }
66       return results;
67   }
68
69   /* Perform complete crawling and indexing given a set of root urls. */
70   public static void main(String JavaDoc args[]) throws Exception JavaDoc {
71     if (args.length < 1) {
72       System.out.println("Usage: CrawlTool (-local | -ndfs <nameserver:port>) <root_url_file> [-dir d] [-threads n] [-depth i] [-showThreadID]");
73       return;
74     }
75
76     String JavaDoc fs = "-local";
77     String JavaDoc nameserver = "";
78     if ("-ndfs".equals(args[0])) {
79         fs = "-ndfs";
80         nameserver = args[1];
81     }
82     NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0);
83     try {
84         String JavaDoc rootUrlFile = null;
85         String JavaDoc dir = new File("crawl-" + getDate()).getCanonicalFile().getName();
86         int threads = NutchConf.getInt("fetcher.threads.fetch", 10);
87         int depth = 5;
88         boolean showThreadID = false;
89
90         for (int i = 0; i < args.length; i++) {
91             if ("-dir".equals(args[i])) {
92                 dir = args[i+1];
93                 i++;
94             } else if ("-threads".equals(args[i])) {
95                 threads = Integer.parseInt(args[i+1]);
96                 i++;
97             } else if ("-depth".equals(args[i])) {
98                 depth = Integer.parseInt(args[i+1]);
99                 i++;
100             } else if ("-showThreadID".equals(args[i])) {
101                 showThreadID = true;
102             } else if (args[i] != null) {
103                 rootUrlFile = args[i];
104             }
105         }
106
107         if (nfs.exists(new File(dir))) {
108             throw new RuntimeException JavaDoc(dir + " already exists.");
109         }
110
111         LOG.info("crawl started in: " + dir);
112         LOG.info("rootUrlFile = " + rootUrlFile);
113         LOG.info("threads = " + threads);
114         LOG.info("depth = " + depth);
115
116         String JavaDoc db = new File(dir + "/db").getCanonicalPath();
117         String JavaDoc segments = new File(dir + "/segments").getCanonicalPath();
118
119         // initialize the web database
120
WebDBAdminTool.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { db, "-create"}));
121         WebDBInjector.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { db, "-urlfile", rootUrlFile }));
122
123         for (int i = 0; i < depth; i++) {
124             // generate a new segment
125
FetchListTool.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { db, segments } ));
126             String JavaDoc segment = getLatestSegment(nfs, segments);
127             Fetcher.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { "-threads", ""+threads, segment } ));
128             UpdateDatabaseTool.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { db, segment } ));
129         }
130
131         // Re-fetch everything to get the complete set of incoming anchor texts
132
// associated with each page. We should fix this, so that we can update
133
// the previously fetched segments with the anchors that are now in the
134
// database, but until that algorithm is written, we re-fetch.
135

136         // delete all the old segment data
137
FileUtil.fullyDelete(nfs, new File(segments));
138
139         // generate a single segment containing all pages in the db
140
FetchListTool.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { db, segments, "-adddays", "" + Integer.MAX_VALUE } ));
141
142         String JavaDoc segment = getLatestSegment(nfs, segments);
143
144         // re-fetch everything
145
Fetcher.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { "-threads", ""+threads, segment } ));
146
147         // index, dedup & merge
148
File workDir = new File(dir, "workdir");
149         IndexSegment.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { segment, "-dir", workDir.getPath() } ));
150         DeleteDuplicates.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { segments }));
151         IndexMerger.main(prependFileSystem(fs, nameserver, new String JavaDoc[] { new File(dir + "/index").getCanonicalPath(), segment } ));
152
153         LOG.info("crawl finished: " + dir);
154     } finally {
155         nfs.close();
156     }
157   }
158 }
// (site navigation footer removed from extracted source)