KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > contineo > core > text > AnalyzeText


1 package org.contineo.core.text;
2
3 import java.util.Collection JavaDoc;
4 import java.util.Iterator JavaDoc;
5
6 import org.contineo.core.text.analyze.Analyzer;
7 import org.contineo.core.text.analyze.AnalyzerFactory;
8 import org.contineo.core.text.analyze.Entry;
9 import org.contineo.documan.Term;
10 import org.contineo.documan.dao.TermDAO;
11
12 /**
13  * Class for analysing texts like extracting keywords from a given text.
14  * Created on 24.03.2004
15  * @author Michael Scholz
16  */

17 public class AnalyzeText {
18
19     /**
20      *
21      */

22     public AnalyzeText() {
23     }
24
25     /**
26      * This method selects 20 keywords of a given text in a specified language and stores
27      * these keywords in a database.
28      * @param menuid MenuId of the document the text is from.
29      * @param text Text of a document.
30      * @param language Identified language of the text.
31      */

32     public void storeTerms(int menuid, String JavaDoc text, String JavaDoc language) {
33         TermDAO termDao = new TermDAO();
34         Analyzer analyzer = AnalyzerFactory.create(language);
35         analyzer.analyze(text);
36         long words = analyzer.getWordCount();
37         Collection JavaDoc terms = analyzer.getTopWords(20);
38         Iterator JavaDoc iter = terms.iterator();
39         while (iter.hasNext()) {
40             Entry entry = (Entry)iter.next();
41             Term term = new Term();
42             term.setMenuid(menuid);
43             term.setStem(entry.getWord());
44             term.setValue(entry.getNumber()*1000/words);
45             term.setWordCount(entry.getNumber());
46             term.setOriginWord(entry.getOriginWord());
47             termDao.store(term);
48         }
49     }
50     
51     /**
52      * This method extracts a specified number of keywords and appends them to a String
53      * @param count Number of keywords.
54      * @param text Given text of a document.
55      * @param language Identified language of the text.
56      * @return String of keywords like "Information Retrieval, DMS, CMS"
57      */

58     public String JavaDoc getTerms(int count, String JavaDoc text, String JavaDoc language) {
59         StringBuffer JavaDoc result = new StringBuffer JavaDoc();
60         Analyzer analyzer = AnalyzerFactory.create(language);
61         analyzer.analyze(text);
62         Collection JavaDoc terms = analyzer.getTopWords(count);
63         Iterator JavaDoc iter = terms.iterator();
64         int temp = 0;
65         while (iter.hasNext() && temp < count) {
66             Entry entry = (Entry)iter.next();
67             if (temp > 0)
68                 result.append(", ");
69             result.append(entry.getOriginWord());
70             temp++;
71         }
72         return result.toString();
73     }
74 }
75
Popular Tags