KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > util > ScoreStats


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.util;
5
6 import java.io.*;
7 import java.util.*;
8
9 import net.nutch.db.*;
10 import net.nutch.fs.*;
11
12 /*************************************************************
13  * When we generate a fetchlist, we need to choose a "cutoff"
14  * score, such that any scores above that cutoff will be included
15  * in the fetchlist. Any scores below will not be. (It is too
16  * hard to do the obvious thing, which is to sort the list of all
17  * pages by score, and pick the top K.)
18  *
19  * We need a good way to choose that cutoff. ScoreStats is used
20  * during LinkAnalysis to track the distribution of scores that
21  * we compute. We bucketize the scorespace into 2000 buckets.
22  * the first 1000 are equally-spaced counts for the range 0..1.0
23  * (non-inclusive). The 2nd buckets are logarithmically spaced
24  * between 1 and Float.MAX_VALUE.
25  *
26  * If the score is < 1, then choose a bucket by (score / 1000) and
27  * choosing the incrementing the resulting slot.
28  *
29  * If the score is >1, then take the base-10 log, and take the
30  * integer floor. This should be an int no greater than 9. This
31  * is the hundreds-place digit for the index. (Since '1' is in
32  * the thousands-place.) Next, find where the score appears in
33  * the range between floor(log(score)), and ceiling(log(score)).
34  * The percentage of the distance between these two values is
35  * reflected in the final two digits for the index.
36  *
37  * @author Mike Cafarella
38  ***************************************************************/

39 public class ScoreStats {
40     private final static double INVERTED_LOG_BASE_TEN = (1.0 / Math.log(10));
41     private final static double EXP_127_MODIFIER = (1000.0 / (Math.log(Float.MAX_VALUE) * INVERTED_LOG_BASE_TEN));
42
43     private final static double RANGE_COMPRESSOR = INVERTED_LOG_BASE_TEN * EXP_127_MODIFIER;
44     long totalScores = 0;
45
46     //
47
// For bucketizing score counts
48
//
49
long buckets[] = new long[2001];
50
51     /**
52      */

53     public ScoreStats() {
54     }
55
56     /**
57      * Increment the counter in the right place. We keep
58      * 2000 different buckets. Half of them are <1, and
59      * half are >1.
60      
61      * Dies when it tries to fill bucket "1132"
62      */

63     public void addScore(float score) {
64         if (score < 1) {
65             int index = (int) Math.floor(score * 1000);
66             buckets[index]++;
67         } else {
68             // Here we need to find the floor'ed base-10 logarithm.
69
int index = (int) Math.floor(Math.log(score) * RANGE_COMPRESSOR);
70             index += 1000;
71             buckets[index]++;
72         }
73         totalScores++;
74     }
75
76     /**
77      * Print out the distribution, with greater specificity
78      * for percentiles 90th - 100th.
79      */

80     public void emitDistribution(PrintStream pout) {
81         pout.println("***** Estimated Score Distribution *****");
82         pout.println(" (to choose a fetchlist cutoff score)");
83         pout.println();
84
85         // Figure out how big each percentile chunk is.
86
double decileChunk = totalScores / 10.0;
87         double percentileChunk = totalScores / 100.0;
88
89         // Now, emit everything
90
double grandTotal = 0, minScore = Double.MAX_VALUE, maxScore = Double.MIN_VALUE;
91         long scoresSoFar = 0;
92         int decileCount = 0, percentileCount = 0;
93
94         // Go through all the sample buckets
95
for (int i = 0; i < buckets.length; i++) {
96             //
97
// Always increment the
98
// seen-sample counter by the number of samples
99
// in the current bucket.
100
//
101
scoresSoFar += buckets[i];
102
103             // From the bucket index, recreate the
104
// original score (as best we can)
105
double reconstructedValue = 0.0;
106             if (i < 1000) {
107                 reconstructedValue = i / 1000.0;
108             } else {
109                 int localIndex = i - 1000;
110                 reconstructedValue = Math.exp(localIndex / RANGE_COMPRESSOR);
111             }
112
113             // Keep running stats on min, max, avg scores
114
grandTotal += (reconstructedValue * buckets[i]);
115             if (buckets[i] > 0) {
116                 if (minScore > reconstructedValue) {
117                     minScore = reconstructedValue;
118                 }
119                 if (maxScore < reconstructedValue) {
120                     maxScore = reconstructedValue;
121                 }
122             }
123
124             //
125
// If the number of samples we've seen so far is
126
// GTE the predicted percentile break, then we want to
127
// emit a println().
128
//
129
if (scoresSoFar >= ((decileCount * decileChunk) + (percentileCount * percentileChunk))) {
130
131                 // Compute what percentile of the items
132
// we've reached
133
double precisePercentile = ((int) Math.round(((totalScores - scoresSoFar) / (totalScores * 1.0)) * 10000)) / 100.0;
134
135                 // Emit
136
String JavaDoc equalityOperator = ">=";
137                 if ((totalScores - scoresSoFar) == 0) {
138                     equalityOperator = ">";
139                 }
140
141                 pout.println(precisePercentile + "% (" + (totalScores - scoresSoFar) + ") have score " + equalityOperator + " " + reconstructedValue);
142
143                 // Bump our decile and percentile counters.
144
// We may have to bump multiple times if
145
// a single bucket carried us across several
146
// boundaries.
147
while (decileCount < 9 && scoresSoFar >= (decileCount * decileChunk) + (percentileCount * percentileChunk)) {
148                     decileCount++;
149                 }
150                 if (decileCount >= 9) {
151                     while (percentileCount < 10 && scoresSoFar >= (decileCount * decileChunk) + (percentileCount * percentileChunk)) {
152                         percentileCount++;
153                     }
154                 }
155
156                 // If we've reached the top percentile, then we're done!
157
if (percentileCount >= 10) {
158                     break;
159                 }
160             }
161         }
162
163         pout.println();
164         pout.println();
165         pout.println("Min score is " + minScore);
166         pout.println("Max score is " + maxScore);
167         pout.println("Average score is " + (grandTotal / scoresSoFar));
168     }
169
170     /**
171      */

172     public static void main(String JavaDoc argv[]) throws IOException {
173         if (argv.length < 1) {
174             System.out.println("Usage: java net.nutch.util.ScoreStats [-real (-local | -ndfs <namenode:port>) <db>] [-simulated <numScores> <min> <max> [seed]]");
175             return;
176         }
177
178         NutchFileSystem nfs = null;
179         File root = null;
180         long seed = new Random().nextLong();
181         boolean simulated = false;
182         int numScores = 0;
183         float min = 0, max = 0;
184
185         if ("-real".equals(argv[0])) {
186             nfs = NutchFileSystem.parseArgs(argv, 1);
187             root = new File(argv[1]);
188         } else if ("-simulated".equals(argv[0])) {
189             simulated = true;
190             numScores = Integer.parseInt(argv[1]);
191             min = Float.parseFloat(argv[2]);
192             max = Float.parseFloat(argv[3]);
193             if (argv.length > 4) {
194                 seed = Long.parseLong(argv[4]);
195             }
196         } else {
197             System.out.println("No command specified");
198         }
199
200         System.out.println("Using seed: " + seed);
201         ScoreStats ss = new ScoreStats();
202         if (simulated) {
203             Random r = new Random(seed);
204             for (int i = 0; i < numScores; i++) {
205                 float newScore = min + (r.nextFloat() * (max - min));
206                 ss.addScore(newScore);
207             }
208         } else {
209             IWebDBReader reader = new WebDBReader(nfs, root);
210             try {
211                 for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
212                     Page p = (Page) e.nextElement();
213                     ss.addScore(p.getScore());
214                 }
215             } finally {
216                 reader.close();
217             }
218         }
219         
220         ss.emitDistribution(System.out);
221     }
222 }
223
224
Popular Tags