package net.nutch.tools;

import java.io.*;
import java.util.*;
import java.util.logging.*;

import net.nutch.db.*;
import net.nutch.fs.*;
import net.nutch.util.*;
import net.nutch.linkdb.*;
import net.nutch.pagedb.*;
import net.nutch.pagedb.*;

/**
 * Command-line administration utility for a web database.
 *
 * <p>Wraps an {@link IWebDBReader} and offers three read-only reports: a text
 * dump of every page and link, the <i>k</i> highest-scoring pages, and a
 * per-page score/link-count listing. {@link #main(String[])} additionally
 * supports creating a new, empty database.
 */
public class WebDBAdminTool {
    public static final Logger LOG = LogFormatter.getLogger("net.nutch.tools.WebDBAdminTool");

    /** Command-line synopsis printed whenever the arguments cannot be used. */
    private static final String USAGE = "Usage: java net.nutch.tools.WebDBAdminTool (-local | -ndfs <namenode:port>) db [-create] [-textdump dumpPrefix] [-scoredump] [-top k]";

    /**
     * Orders {@link RankedPage} entries by ascending score, breaking ties by
     * encounter order. The tie-break matters: a {@link TreeSet} treats entries
     * that compare as 0 as duplicates, so comparing on score alone would
     * silently drop every page whose score equals one already in the set.
     */
    private static final Comparator BY_SCORE_THEN_ORDER = new Comparator() {
        public int compare(Object o1, Object o2) {
            RankedPage a = (RankedPage) o1;
            RankedPage b = (RankedPage) o2;
            int byScore = Double.compare(a.score, b.score);
            if (byScore != 0) {
                return byScore;
            }
            if (a.order < b.order) {
                return -1;
            }
            if (a.order > b.order) {
                return 1;
            }
            return 0;
        }
    };

    /** A page paired with its score and the position at which it was enumerated. */
    private static final class RankedPage {
        final Page page;
        final double score;
        final long order;

        RankedPage(Page page, long order) {
            this.page = page;
            this.score = page.getScore();
            this.order = order;
        }
    }

    IWebDBReader reader;

    /**
     * Creates a tool that reports on the database exposed by the given reader.
     * The reader is not closed by this class; the caller owns it.
     *
     * @param reader source of pages and links
     */
    public WebDBAdminTool(IWebDBReader reader) {
        this.reader = reader;
    }

    /**
     * Writes every page to <code>dumpName + ".pages"</code> and every link to
     * <code>dumpName + ".links"</code>, one record per line, each line being
     * the record's <code>toTabbedString()</code> form.
     *
     * @param dumpName path prefix shared by the two output files
     * @throws IOException if an output file cannot be opened, if reading the
     *         database fails, or if writing to an output file fails
     */
    public void textDump(String dumpName) throws IOException {
        File pagesFile = new File(dumpName + ".pages");
        PrintStream out = openDump(pagesFile);
        try {
            for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
                Page p = (Page) e.nextElement();
                out.println(p.toTabbedString());
            }
            failOnWriteError(out, pagesFile);
        } finally {
            out.close();
        }

        File linksFile = new File(dumpName + ".links");
        out = openDump(linksFile);
        try {
            for (Enumeration e = reader.links(); e.hasMoreElements(); ) {
                Link l = (Link) e.nextElement();
                out.println(l.toTabbedString());
            }
            failOnWriteError(out, linksFile);
        } finally {
            out.close();
        }
    }

    /** Opens a buffered print stream that writes to the given file. */
    private static PrintStream openDump(File target) throws IOException {
        return new PrintStream(new BufferedOutputStream(new FileOutputStream(target)));
    }

    /**
     * Flushes the stream and converts any write failure into an exception.
     * PrintStream never throws on write; it only records an error flag, so
     * without this check a truncated dump would be reported as a success.
     */
    private static void failOnWriteError(PrintStream out, File target) throws IOException {
        if (out.checkError()) {
            throw new IOException("Error writing dump file " + target);
        }
    }

    /**
     * Logs the <code>k</code> highest-scoring pages, lowest score first.
     * Pages with equal scores are all eligible; among equals, earlier-enumerated
     * pages are listed first. Only <code>k</code> entries are held in memory at
     * a time.
     *
     * @param k maximum number of pages to report; nothing is reported when
     *        <code>k</code> is zero or negative
     * @throws IOException if reading the database fails
     */
    public void emitTopK(int k) throws IOException {
        if (k <= 0) {
            // Nothing can be selected; also avoids probing an empty set below.
            return;
        }

        SortedSet topSet = new TreeSet(BY_SCORE_THEN_ORDER);
        long seen = 0;
        for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
            RankedPage candidate = new RankedPage((Page) e.nextElement(), seen++);

            if (topSet.size() < k) {
                topSet.add(candidate);
            } else {
                // Set is full: replace the current minimum only when strictly beaten.
                RankedPage lowest = (RankedPage) topSet.first();
                if (lowest.score < candidate.score) {
                    topSet.remove(lowest);
                    topSet.add(candidate);
                }
            }
        }

        int i = 0;
        for (Iterator it = topSet.iterator(); it.hasNext(); i++) {
            LOG.info("Page " + i + ": " + ((RankedPage) it.next()).page);
        }
    }

    /**
     * Logs one line per page containing its URL, its score and the number of
     * links the reader returns for that URL, separated by tabs. A
     * <code>null</code> link array is counted as zero links.
     *
     * @throws IOException if reading the database fails
     */
    public void scoreDump() throws IOException {
        for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
            Page p = (Page) e.nextElement();
            Link links[] = reader.getLinks(p.getURL());
            int numLinks = 0;
            if (links != null) {
                numLinks = links.length;
            }

            LOG.info(p.getURL() + "\t" + p.getScore() + "\t" + numLinks);
        }
    }

    /**
     * Command-line entry point. Selects a filesystem, then runs the last
     * command given among <code>-create</code>, <code>-textdump prefix</code>,
     * <code>-top k</code> and <code>-scoredump</code> against the named
     * database. Prints the usage line and exits when the arguments are
     * incomplete or no command is given.
     *
     * @param argv command-line arguments; see the usage line
     * @throws FileNotFoundException if a dump file cannot be created
     * @throws IOException if the filesystem or database cannot be accessed
     */
    public static void main(String argv[]) throws FileNotFoundException, IOException {
        if (argv.length < 2) {
            System.out.println(USAGE);
            return;
        }

        String command = null, dumpName = null;
        int k = 0;
        int i = 0;
        // NOTE(review): i is passed by value, so reading the db name from
        // argv[0] below relies on parseArgs consuming the filesystem
        // arguments from argv in place -- confirm against NutchFileSystem.
        NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
        try {
            File root = new File(argv[i++]);
            for (; i < argv.length; i++) {
                if ("-create".equals(argv[i])) {
                    command = argv[i];
                } else if ("-textdump".equals(argv[i])) {
                    if (!hasOperand(argv, i)) {
                        System.out.println(USAGE);
                        return;
                    }
                    command = argv[i];
                    i++;
                    dumpName = argv[i];
                } else if ("-top".equals(argv[i])) {
                    if (!hasOperand(argv, i)) {
                        System.out.println(USAGE);
                        return;
                    }
                    command = argv[i];
                    i++;
                    try {
                        k = Integer.parseInt(argv[i]);
                    } catch (NumberFormatException nfe) {
                        System.out.println("Invalid value for -top: " + argv[i]);
                        System.out.println(USAGE);
                        return;
                    }
                } else if ("-scoredump".equals(argv[i])) {
                    command = argv[i];
                }
            }

            if (command == null) {
                // No recognised command: say so instead of opening the db for nothing.
                System.out.println(USAGE);
                return;
            }

            if ("-create".equals(command)) {
                WebDBWriter.createWebDB(nfs, root);
                LOG.info("Created webdb at " + nfs + "," + root);
                return;
            }

            IWebDBReader reader = new WebDBReader(nfs, root);
            try {
                WebDBAdminTool admin = new WebDBAdminTool(reader);
                if ("-textdump".equals(command)) {
                    admin.textDump(dumpName);
                } else if ("-top".equals(command)) {
                    admin.emitTopK(k);
                } else if ("-scoredump".equals(command)) {
                    admin.scoreDump();
                }
            } finally {
                reader.close();
            }
        } finally {
            // Runs on every exit path, including early returns and failures
            // while opening or closing the reader.
            nfs.close();
        }
    }

    /** Returns true when the option at <code>index</code> is followed by a non-null operand. */
    private static boolean hasOperand(String argv[], int index) {
        return index + 1 < argv.length && argv[index + 1] != null;
    }
}