KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > tools > WebDBAdminTool


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.tools;
5
6 import java.io.*;
7 import java.util.*;
8 import java.util.logging.*;
9
10 import net.nutch.db.*;
11 import net.nutch.fs.*;
12 import net.nutch.util.*;
13 import net.nutch.linkdb.*;
14 import net.nutch.pagedb.*;
15 import net.nutch.pagedb.*;
16
17 /******************************************
18  * The WebDBAdminTool is for Nutch administrators
19  * who need special access to the webdb. It allows
20  * for finer editing of the stored values.
21  *
22  * @author Mike Cafarella
23  ******************************************/

24 public class WebDBAdminTool {
25     public static final Logger LOG = LogFormatter.getLogger("net.nutch.tools.WebDBAdminTool");
26
27     IWebDBReader reader;
28
29     public WebDBAdminTool(IWebDBReader reader) {
30         this.reader = reader;
31     }
32
33     /**
34      * Emit the webdb to 2 text files.
35      */

36     public void textDump(String JavaDoc dumpName) throws IOException {
37         //
38
// First the pages
39
//
40
PrintStream out = new PrintStream(new BufferedOutputStream(new FileOutputStream(new File(dumpName + ".pages"))));
41         try {
42             for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
43                 Page p = (Page) e.nextElement();
44                 out.println(p.toTabbedString());
45             }
46         } finally {
47             out.close();
48         }
49
50         //
51
// Then the links
52
//
53
out = new PrintStream(new BufferedOutputStream(new FileOutputStream(new File(dumpName + ".links"))));
54         try {
55             for (Enumeration e = reader.links(); e.hasMoreElements(); ) {
56                 Link l = (Link) e.nextElement();
57                 out.println(l.toTabbedString());
58             }
59         } finally {
60             out.close();
61         }
62     }
63
64     /**
65      * Emit the top K-rated Pages.
66      */

67     public void emitTopK(int k) throws IOException {
68         // Create a sorted list
69
SortedSet topSet = new TreeSet(new Comparator() {
70             public int compare(Object JavaDoc o1, Object JavaDoc o2) {
71                 Page p1 = (Page) o1;
72                 Page p2 = (Page) o2;
73                 if (p1.getScore() < p2.getScore()) {
74                     return -1;
75                 } else if (p1.getScore() == p2.getScore()) {
76                     return 0;
77                 } else {
78                     return 1;
79                 }
80             }
81         }
82             );
83
84         // Find the top k elts
85
Page lowestPage = null;
86         for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
87             Page curPage = (Page) e.nextElement();
88                     
89             if (topSet.size() < k) {
90                 topSet.add(curPage);
91                 lowestPage = (Page) topSet.first();
92             } else if (lowestPage.getScore() < curPage.getScore()) {
93                 topSet.remove(lowestPage);
94                 topSet.add(curPage);
95                 lowestPage = (Page) topSet.first();
96             }
97         }
98             
99         // Print them out
100
int i = 0;
101         for (Iterator it = topSet.iterator(); it.hasNext(); i++) {
102             LOG.info("Page " + i + ": " + (Page) it.next());
103         }
104     }
105
106     /**
107      * Emit each page's score and link data
108      */

109     public void scoreDump() throws IOException {
110         for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
111             Page p = (Page) e.nextElement();
112             Link links[] = reader.getLinks(p.getURL());
113             int numLinks = 0;
114             if (links != null) {
115                 numLinks = links.length;
116             }
117
118             LOG.info(p.getURL() + "\t" + p.getScore() + "\t" + numLinks);
119         }
120     }
121
122     /**
123      * This tool performs a number of generic db management tasks.
124      * Right now, it only emits the text-format database.
125      */

126     public static void main(String JavaDoc argv[]) throws FileNotFoundException, IOException {
127         if (argv.length < 2) {
128             System.out.println("Usage: java net.nutch.tools.WebDBAdminTool (-local | -ndfs <namenode:port>) db [-create] [-textdump dumpPrefix] [-scoredump] [-top k]");
129             return;
130         }
131
132         boolean create = false;
133         String JavaDoc command = null, dumpName = null;
134         int k = 0;
135         int i = 0;
136         NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
137         File root = new File(argv[i++]);
138         for (; i < argv.length; i++) {
139             if ("-create".equals(argv[i])) {
140                 command = argv[i];
141                 create = true;
142             } else if ("-textdump".equals(argv[i])) {
143                 command = argv[i];
144                 i++;
145                 dumpName = argv[i];
146             } else if ("-top".equals(argv[i])) {
147                 command = argv[i];
148                 i++;
149                 k = Integer.parseInt(argv[i]);
150             } else if ("-scoredump".equals(argv[i])) {
151                 command = argv[i];
152             }
153         }
154
155         //
156
// For db creation
157
//
158
if ("-create".equals(command)) {
159             WebDBWriter.createWebDB(nfs, root);
160             LOG.info("Created webdb at " + nfs + "," + root);
161             nfs.close();
162             return;
163         }
164
165         //
166
// For other functions
167
//
168
IWebDBReader reader = new WebDBReader(nfs, root);
169         try {
170             WebDBAdminTool admin = new WebDBAdminTool(reader);
171             if ("-textdump".equals(command)) {
172                 admin.textDump(dumpName);
173             } else if ("-top".equals(command)) {
174                 admin.emitTopK(k);
175             } else if ("-scoredump".equals(command)) {
176                 admin.scoreDump();
177             }
178         } finally {
179             reader.close();
180             nfs.close();
181         }
182     }
183 }
184
Popular Tags