KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > indexer > HighFreqTerms


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.indexer;
5
6 import org.apache.lucene.util.PriorityQueue;
7 import org.apache.lucene.store.Directory;
8 import org.apache.lucene.store.FSDirectory;
9 import org.apache.lucene.index.IndexReader;
10 import org.apache.lucene.index.Term;
11 import org.apache.lucene.index.TermEnum;
12
13 import java.io.OutputStreamWriter JavaDoc;
14
15 /** Lists the most frequent terms in an index. */
16 public class HighFreqTerms {
17   public static int numTerms = 100;
18
19   private static class TermFreq {
20     TermFreq(Term t, int df) {
21       term = t;
22       docFreq = df;
23     }
24     int docFreq;
25     Term term;
26   }
27
28   private static class TermFreqQueue extends PriorityQueue {
29     TermFreqQueue(int size) {
30       initialize(size);
31     }
32
33     protected final boolean lessThan(Object JavaDoc a, Object JavaDoc b) {
34       TermFreq termInfoA = (TermFreq)a;
35       TermFreq termInfoB = (TermFreq)b;
36       return termInfoA.docFreq < termInfoB.docFreq;
37     }
38   }
39
40   public static void main(String JavaDoc[] args) throws Exception JavaDoc {
41     IndexReader reader = null;
42     boolean noFreqs = false;
43     int count = 100;
44     String JavaDoc usage = "HighFreqTerms [-count <n>] [-nofreqs] <index dir>";
45
46     if (args.length == 0) {
47       System.err.println(usage);
48       System.exit(-1);
49     }
50
51     for (int i = 0; i < args.length; i++) { // parse command line
52
if (args[i].equals("-count")) { // found -count option
53
count = Integer.parseInt(args[++i]);
54       } else if (args[i].equals("-nofreqs")) { // found -nofreqs option
55
noFreqs = true;
56       } else {
57         reader = IndexReader.open(args[i]);
58       }
59     }
60
61     TermFreqQueue tiq = new TermFreqQueue(count);
62     TermEnum terms = reader.terms();
63       
64     int minFreq = 0;
65     while (terms.next()) {
66       if (terms.docFreq() > minFreq) {
67         tiq.put(new TermFreq(terms.term(), terms.docFreq()));
68         if (tiq.size() >= count) { // if tiq overfull
69
tiq.pop(); // remove lowest in tiq
70
minFreq = ((TermFreq)tiq.top()).docFreq; // reset minFreq
71
}
72       }
73     }
74
75     OutputStreamWriter JavaDoc out = new OutputStreamWriter JavaDoc(System.out, "UTF-8");
76     while (tiq.size() != 0) {
77       TermFreq termInfo = (TermFreq)tiq.pop();
78       out.write(termInfo.term.toString());
79       if (!noFreqs) {
80         out.write(" ");
81         out.write(Integer.toString(termInfo.docFreq));
82       }
83       out.write("\n");
84     }
85
86     out.flush();
87     reader.close();
88   }
89
90 }
91
92
Popular Tags