package net.nutch.util;

import java.io.*;
import java.util.*;

import net.nutch.db.*;
import net.nutch.fs.*;

/**
 * Accumulates page scores into a fixed-size histogram and prints an
 * estimated score distribution, intended to help choose a fetchlist
 * cutoff score.
 *
 * <p>The histogram has 2001 buckets:</p>
 * <ul>
 *   <li>buckets 0..999 hold scores in [0, 1), each bucket 0.001 wide;</li>
 *   <li>buckets 1000..2000 hold scores &gt;= 1 on a logarithmic scale,
 *       compressed so that <code>Float.MAX_VALUE</code> lands in the last
 *       bucket.</li>
 * </ul>
 *
 * <p>Because only bucket counts are kept, all reported values (min, max,
 * average, percentile cutoffs) are reconstructed from the lower edge of
 * each bucket and are therefore estimates.</p>
 */
public class ScoreStats {
    /** 1 / ln(10): converts a natural logarithm into a base-ten logarithm. */
    private final static double INVERTED_LOG_BASE_TEN = (1.0 / Math.log(10));

    /** 1000 / log10(Float.MAX_VALUE): maps the full float range onto 1000 log buckets. */
    private final static double EXP_127_MODIFIER = (1000.0 / (Math.log(Float.MAX_VALUE) * INVERTED_LOG_BASE_TEN));

    /** Multiplier that turns ln(score) directly into a log-bucket offset. */
    private final static double RANGE_COMPRESSOR = INVERTED_LOG_BASE_TEN * EXP_127_MODIFIER;

    /** Number of linear buckets covering scores in [0, 1). */
    private final static int LINEAR_BUCKETS = 1000;

    /** Total number of buckets (linear buckets plus log buckets 1000..2000). */
    private final static int BUCKET_COUNT = 2001;

    private final static String USAGE =
        "Usage: java net.nutch.util.ScoreStats [-real (-local | -ndfs <namenode:port>) <db>] [-simulated <numScores> <min> <max> [seed]]";

    /** Number of scores added so far. */
    long totalScores = 0;

    /** Histogram counts, indexed as described in the class comment. */
    long buckets[] = new long[BUCKET_COUNT];

    /**
     * Creates an empty score histogram.
     */
    public ScoreStats() {
    }

    /**
     * Adds one score to the histogram.
     *
     * @param score the score to record; must be a non-negative number.
     *              Positive infinity is counted in the topmost bucket.
     * @throws IllegalArgumentException if <code>score</code> is NaN or
     *         negative, since such a value has no bucket
     */
    public void addScore(float score) {
        if (Float.isNaN(score) || score < 0) {
            throw new IllegalArgumentException("Score must be a non-negative number, got " + score);
        }
        buckets[bucketIndexFor(score)]++;
        totalScores++;
    }

    /**
     * Maps a non-negative, non-NaN score to its bucket index.
     */
    private static int bucketIndexFor(float score) {
        if (score < 1) {
            return (int) Math.floor(score * 1000);
        }
        double logOffset = Math.floor(Math.log(score) * RANGE_COMPRESSOR);
        // Clamp before narrowing: for an infinite score the offset is
        // infinite, which would cast to Integer.MAX_VALUE and overflow
        // once the linear-bucket base is added.
        double capped = Math.min(logOffset, BUCKET_COUNT - 1 - LINEAR_BUCKETS);
        return LINEAR_BUCKETS + (int) capped;
    }

    /**
     * Returns the score represented by the lower edge of the given bucket
     * (the inverse of {@link #bucketIndexFor(float)}, up to bucket width).
     */
    private static double reconstructScore(int bucketIndex) {
        if (bucketIndex < LINEAR_BUCKETS) {
            return bucketIndex / 1000.0;
        }
        return Math.exp((bucketIndex - LINEAR_BUCKETS) / RANGE_COMPRESSOR);
    }

    /**
     * Prints the estimated distribution: one line per decile up to the
     * 90th percentile, then one line per percentile from the 90th to the
     * 99th, followed by the estimated minimum, maximum and average score.
     *
     * <p>The summary statistics cover every recorded score, including
     * those above the last printed percentile line. If no scores have
     * been recorded, a short notice is printed instead.</p>
     *
     * @param pout the stream to print to
     */
    public void emitDistribution(PrintStream pout) {
        pout.println("***** Estimated Score Distribution *****");
        pout.println(" (to choose a fetchlist cutoff score)");
        pout.println();

        if (totalScores == 0) {
            // Nothing to summarise; avoids printing sentinel min/max
            // values and a NaN average.
            pout.println("No scores recorded.");
            return;
        }

        // Number of samples that make up one decile / one percentile.
        double decileChunk = totalScores / 10.0;
        double percentileChunk = totalScores / 100.0;

        double grandTotal = 0;
        double minScore = Double.POSITIVE_INFINITY;
        // Not Double.MIN_VALUE: that is the smallest *positive* double and
        // would beat a genuine maximum of 0.0.
        double maxScore = Double.NEGATIVE_INFINITY;
        long scoresSoFar = 0;
        int decileCount = 0, percentileCount = 0;
        boolean reportComplete = false;

        for (int i = 0; i < buckets.length; i++) {
            scoresSoFar += buckets[i];
            double reconstructedValue = reconstructScore(i);

            // Running min / max / sum over every populated bucket.
            if (buckets[i] > 0) {
                grandTotal += (reconstructedValue * buckets[i]);
                if (minScore > reconstructedValue) {
                    minScore = reconstructedValue;
                }
                if (maxScore < reconstructedValue) {
                    maxScore = reconstructedValue;
                }
            }

            // Emit a line whenever the samples seen so far reach the next
            // decile/percentile break. Once the top percentile has been
            // reported, keep looping only to finish the statistics.
            if (!reportComplete
                && scoresSoFar >= ((decileCount * decileChunk) + (percentileCount * percentileChunk))) {

                long remaining = totalScores - scoresSoFar;
                double precisePercentile = ((int) Math.round((remaining / (totalScores * 1.0)) * 10000)) / 100.0;

                String equalityOperator = ">=";
                if (remaining == 0) {
                    equalityOperator = ">";
                }

                pout.println(precisePercentile + "% (" + remaining + ") have score " + equalityOperator + " " + reconstructedValue);

                // A single bucket may carry us across several breaks, so
                // advance the counters until they are ahead of the data.
                while (decileCount < 9 && scoresSoFar >= (decileCount * decileChunk) + (percentileCount * percentileChunk)) {
                    decileCount++;
                }
                if (decileCount >= 9) {
                    while (percentileCount < 10 && scoresSoFar >= (decileCount * decileChunk) + (percentileCount * percentileChunk)) {
                        percentileCount++;
                    }
                }

                if (percentileCount >= 10) {
                    reportComplete = true;
                }
            }
        }

        pout.println();
        pout.println();
        pout.println("Min score is " + minScore);
        pout.println("Max score is " + maxScore);
        pout.println("Average score is " + (grandTotal / totalScores));
    }

    /**
     * Command-line entry point. Either reads every page score from a web
     * database (<code>-real</code>) or generates uniformly distributed
     * random scores (<code>-simulated</code>), then prints the resulting
     * distribution to standard output.
     *
     * @param argv command-line arguments; see the usage message
     * @throws IOException if reading the web database fails
     */
    public static void main(String argv[]) throws IOException {
        if (argv.length < 1) {
            System.out.println(USAGE);
            return;
        }

        NutchFileSystem nfs = null;
        File root = null;
        long seed = new Random().nextLong();
        boolean simulated = false;
        int numScores = 0;
        float min = 0, max = 0;

        if ("-real".equals(argv[0])) {
            if (argv.length < 2) {
                System.out.println(USAGE);
                return;
            }
            nfs = NutchFileSystem.parseArgs(argv, 1);
            // NOTE(review): parseArgs appears to consume the filesystem
            // flags from argv so that the db path ends up at argv[1];
            // guard against the path being absent afterwards.
            if (argv[1] == null) {
                System.out.println(USAGE);
                return;
            }
            root = new File(argv[1]);
        } else if ("-simulated".equals(argv[0])) {
            if (argv.length < 4) {
                System.out.println(USAGE);
                return;
            }
            simulated = true;
            try {
                numScores = Integer.parseInt(argv[1]);
                min = Float.parseFloat(argv[2]);
                max = Float.parseFloat(argv[3]);
                if (argv.length > 4) {
                    seed = Long.parseLong(argv[4]);
                }
            } catch (NumberFormatException nfe) {
                System.out.println("Invalid numeric argument: " + nfe.getMessage());
                System.out.println(USAGE);
                return;
            }
            // Written as negated >= so that NaN bounds are rejected too.
            if (!(min >= 0) || !(max >= 0)) {
                System.out.println("<min> and <max> must be non-negative numbers");
                return;
            }
        } else {
            // Without a recognised command there is nothing to read, so
            // stop here instead of falling through to the database scan.
            System.out.println("No command specified");
            System.out.println(USAGE);
            return;
        }

        System.out.println("Using seed: " + seed);
        ScoreStats ss = new ScoreStats();
        if (simulated) {
            Random r = new Random(seed);
            for (int i = 0; i < numScores; i++) {
                float newScore = min + (r.nextFloat() * (max - min));
                ss.addScore(newScore);
            }
        } else {
            IWebDBReader reader = new WebDBReader(nfs, root);
            try {
                for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
                    Page p = (Page) e.nextElement();
                    ss.addScore(p.getScore());
                }
            } finally {
                reader.close();
            }
        }

        ss.emitDistribution(System.out);
    }
}