1 2 3 4 package net.nutch.indexer; 5 6 import org.apache.lucene.util.PriorityQueue; 7 import org.apache.lucene.store.Directory; 8 import org.apache.lucene.store.FSDirectory; 9 import org.apache.lucene.index.IndexReader; 10 import org.apache.lucene.index.Term; 11 import org.apache.lucene.index.TermEnum; 12 13 import java.io.OutputStreamWriter ; 14 15 16 public class HighFreqTerms { 17 public static int numTerms = 100; 18 19 private static class TermFreq { 20 TermFreq(Term t, int df) { 21 term = t; 22 docFreq = df; 23 } 24 int docFreq; 25 Term term; 26 } 27 28 private static class TermFreqQueue extends PriorityQueue { 29 TermFreqQueue(int size) { 30 initialize(size); 31 } 32 33 protected final boolean lessThan(Object a, Object b) { 34 TermFreq termInfoA = (TermFreq)a; 35 TermFreq termInfoB = (TermFreq)b; 36 return termInfoA.docFreq < termInfoB.docFreq; 37 } 38 } 39 40 public static void main(String [] args) throws Exception { 41 IndexReader reader = null; 42 boolean noFreqs = false; 43 int count = 100; 44 String usage = "HighFreqTerms [-count <n>] [-nofreqs] <index dir>"; 45 46 if (args.length == 0) { 47 System.err.println(usage); 48 System.exit(-1); 49 } 50 51 for (int i = 0; i < args.length; i++) { if (args[i].equals("-count")) { count = Integer.parseInt(args[++i]); 54 } else if (args[i].equals("-nofreqs")) { noFreqs = true; 56 } else { 57 reader = IndexReader.open(args[i]); 58 } 59 } 60 61 TermFreqQueue tiq = new TermFreqQueue(count); 62 TermEnum terms = reader.terms(); 63 64 int minFreq = 0; 65 while (terms.next()) { 66 if (terms.docFreq() > minFreq) { 67 tiq.put(new TermFreq(terms.term(), terms.docFreq())); 68 if (tiq.size() >= count) { tiq.pop(); minFreq = ((TermFreq)tiq.top()).docFreq; } 72 } 73 } 74 75 OutputStreamWriter out = new OutputStreamWriter (System.out, "UTF-8"); 76 while (tiq.size() != 0) { 77 TermFreq termInfo = (TermFreq)tiq.pop(); 78 out.write(termInfo.term.toString()); 79 if (!noFreqs) { 80 out.write(" "); 81 out.write(Integer.toString(termInfo.docFreq)); 82 } 83 out.write("\n"); 84 } 85 86 out.flush(); 87 reader.close(); 88 } 89 90 } 91 92 | Popular Tags |