KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > contineo > core > text > analyze > WordRanker


1 package org.contineo.core.text.analyze;
2
3 import java.util.ArrayList JavaDoc;
4 import java.util.Collection JavaDoc;
5 import java.util.Enumeration JavaDoc;
6 import java.util.Hashtable JavaDoc;
7
8 /**
9  * Provides functionality like getting the top most words in a document.
10  * Created on 24. Juli 2003, 22:28
11  * @author Michael Scholz
12  */

13 public abstract class WordRanker {
14  
15     protected Hashtable JavaDoc stoptable = new Hashtable JavaDoc();
16
17     /**
18      * @uml.property name="wordtable"
19      * @uml.associationEnd
20      */

21     protected Hashtable JavaDoc<String JavaDoc, WordEntry> wordtable = new Hashtable JavaDoc<String JavaDoc, WordEntry>();
22
23     protected long wordcount = 0;
24     protected int minlen = 2;
25
26     
27     protected Entry getTopWord(Hashtable JavaDoc table) {
28         Entry entry = new Entry();
29         Enumeration JavaDoc enum1 = table.keys();
30         int topvalue = -1;
31         String JavaDoc topword = "";
32         String JavaDoc topOriginWord = "";
33         while (enum1.hasMoreElements()) {
34             String JavaDoc key = (String JavaDoc)enum1.nextElement();
35             WordEntry termEntry = (WordEntry)table.get(key);
36             int val = termEntry.getValue();
37             if (val > topvalue) {
38                 topvalue = val;
39                 topword = key;
40                 topOriginWord = termEntry.getOriginWord();
41             }
42         }
43         entry.setWord(topword);
44         entry.setNumber(topvalue);
45         entry.setOriginWord(topOriginWord);
46         return entry;
47     }
48     
49     /**
50      * Sets the minimum length of words which should be analyzed.
51      * @param len
52      */

53     public void setMinLen(int len) {
54         minlen = len;
55     }
56     
57     /**
58      * Returns the top words of an analyzed document.
59      * @param hits - Number of top words to be returned.
60      * @return
61      */

62     public Collection JavaDoc getTopWords(int hits) {
63         Hashtable JavaDoc table = new Hashtable JavaDoc<String JavaDoc, WordEntry>(wordtable);
64         Collection JavaDoc<Entry> coll = new ArrayList JavaDoc<Entry>(hits);
65         if (hits > table.size())
66             hits = table.size();
67         for (int i = 0; i < hits; i++) {
68             Entry e = getTopWord(table);
69             if (!e.getWord().equals("")) {
70                 coll.add(e);
71                 table.remove(e.getWord());
72             }
73         }
74         return coll;
75     }
76     
77     /**
78      * @return Number of entries in the hitlist containing the topwords.
79      */

80     public int relevantWords() {
81         return wordtable.size();
82     }
83     
84     /**
85      * @return Number of words in the analyzed document.
86      */

87     public long getWordCount() {
88         return wordcount;
89     }
90 }
Popular Tags