KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > contineo > core > text > analyze > WordTable


1 package org.contineo.core.text.analyze;
2
3 import java.text.BreakIterator JavaDoc;
4 import java.util.Hashtable JavaDoc;
5
6 /**
7  * Analyses a set of provided words and builds a word table with each unique word and it's significance.
8  * Created on 23. Juli 2003, 19:31
9  * @author Michael Scholz
10  */

11 public class WordTable {
12     
13     public final static AnalyseResult fillWordTable(BreakIterator JavaDoc boundary, StringBuffer JavaDoc source, Hashtable JavaDoc stopwords, int minlen, Stemmer stemmer) {
14         int start = boundary.first();
15         long wordcount = 0;
16         String JavaDoc prestem = "";
17         String JavaDoc preword = "";
18         AnalyseResult result = new AnalyseResult();
19         Hashtable JavaDoc<String JavaDoc, WordEntry> wordtable = new Hashtable JavaDoc<String JavaDoc, WordEntry>(source.length()/6);
20         for (int end = boundary.next();end != BreakIterator.DONE;start = end, end = boundary.next()) {
21             String JavaDoc word = source.substring(start,end).trim();
22             char next = ' ';
23             try {
24                 next = source.charAt(end);
25             } catch (Exception JavaDoc e) {
26             }
27             if (word.length() > minlen) {
28                 String JavaDoc stem = stemmer.stem(word.toLowerCase());
29                 WordEntry entry = new WordEntry();
30                 if (word.length() >= minlen && !stopwords.containsKey(word) && !stopwords.containsKey(stem)) {
31                     wordcount++;
32                     if (wordtable.containsKey(stem)) {
33                         //value = Integer.parseInt((String)wordtable.get(stem));
34
entry = (WordEntry)wordtable.get(stem);
35                         entry.incValue();
36                         if (word.length() < entry.getOriginWord().length() && next != (char)45)
37                             entry.setOriginWord(word);
38                     } else {
39                         entry.incValue();
40                         entry.setOriginWord(word);
41                     }
42                     wordtable.put(stem,entry);
43                     if (!prestem.equals("")) {
44                         WordEntry gentry = new WordEntry();
45                         String JavaDoc groupstem = prestem + " " + stem;
46                         String JavaDoc groupword = preword + " " + word;
47                         if (wordtable.containsKey(groupstem)) {
48                             //value = Integer.parseInt((String)wordtable.get(stem));
49
gentry = (WordEntry)wordtable.get(groupstem);
50                             gentry.incValue();
51                             if (groupword.length() < gentry.getOriginWord().length() && next != (char)45)
52                                 gentry.setOriginWord(groupword);
53                         } else {
54                             gentry.setOriginWord(groupword);
55                         }
56                         wordtable.put(groupstem, gentry);
57                         
58                         try {
59                             WordEntry tempEntry = (WordEntry)wordtable.get(stem);
60                             if (tempEntry.getValue() > 0) {
61                                 tempEntry.decValue();
62                                 wordtable.put(stem,tempEntry);
63                             }
64                             tempEntry = (WordEntry)wordtable.get(prestem);
65                             if (tempEntry.getValue() > 0) {
66                                 tempEntry.decValue();
67                                 wordtable.put(prestem,tempEntry);
68                             }
69                         } catch (Exception JavaDoc e) {
70                         }
71                     }
72                     prestem = stem;
73                     preword = word;
74                 }
75             } else if (word.length() > 1 || word.equals(".") || word.equals("!") ||
76                     word.equals("?") || word.equals(",") || word.equals(";")) {
77                 prestem = "";
78                 preword = "";
79             }
80         }
81         result.setWordCount(wordcount);
82         result.setWordTable(wordtable);
83         return result;
84     }
85     
86 }
Popular Tags