KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > analysis > nl > DutchAnalyzer


package org.apache.lucene.analysis.nl;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

import java.io.File;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import java.util.Map;

32 /**
33  * Analyzer for Dutch language. Supports an external list of stopwords (words that
34  * will not be indexed at all), an external list of exclusions (word that will
35  * not be stemmed, but indexed) and an external list of word-stem pairs that overrule
36  * the algorithm (dictionary stemming).
37  * A default set of stopwords is used unless an alternative list is specified, the
38  * exclusion list is empty by default.
39  *
40  * @author Edwin de Jonge
41  */

42 public class DutchAnalyzer extends Analyzer {
43   /**
44    * List of typical Dutch stopwords.
45    */

46   public final static String JavaDoc[] DUTCH_STOP_WORDS =
47       {
48         "de", "en", "van", "ik", "te", "dat", "die", "in", "een",
49         "hij", "het", "niet", "zijn", "is", "was", "op", "aan", "met", "als", "voor", "had",
50         "er", "maar", "om", "hem", "dan", "zou", "of", "wat", "mijn", "men", "dit", "zo",
51         "door", "over", "ze", "zich", "bij", "ook", "tot", "je", "mij", "uit", "der", "daar",
52         "haar", "naar", "heb", "hoe", "heeft", "hebben", "deze", "u", "want", "nog", "zal",
53         "me", "zij", "nu", "ge", "geen", "omdat", "iets", "worden", "toch", "al", "waren",
54         "veel", "meer", "doen", "toen", "moet", "ben", "zonder", "kan", "hun", "dus",
55         "alles", "onder", "ja", "eens", "hier", "wie", "werd", "altijd", "doch", "wordt",
56         "wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets",
57         "uw", "iemand", "geweest", "andere"
58       };
59
60
61   /**
62    * Contains the stopwords used with the StopFilter.
63    */

64   private Set JavaDoc stoptable = new HashSet JavaDoc();
65
66   /**
67    * Contains words that should be indexed but not stemmed.
68    */

69   private Set JavaDoc excltable = new HashSet JavaDoc();
70
71   private Map JavaDoc _stemdict = new HashMap JavaDoc();
72
73
74   /**
75    * Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS}).
76    */

77   public DutchAnalyzer() {
78     stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
79     _stemdict.put("fiets", "fiets"); //otherwise fiet
80
_stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
81
_stemdict.put("ei", "eier");
82     _stemdict.put("kind", "kinder");
83   }
84
85   /**
86    * Builds an analyzer with the given stop words.
87    *
88    * @param stopwords
89    */

90   public DutchAnalyzer(String JavaDoc[] stopwords) {
91     stoptable = StopFilter.makeStopSet(stopwords);
92   }
93
94   /**
95    * Builds an analyzer with the given stop words.
96    *
97    * @param stopwords
98    */

99   public DutchAnalyzer(HashSet JavaDoc stopwords) {
100     stoptable = stopwords;
101   }
102
103   /**
104    * Builds an analyzer with the given stop words.
105    *
106    * @param stopwords
107    */

108   public DutchAnalyzer(File JavaDoc stopwords) {
109     stoptable = new HashSet JavaDoc(WordlistLoader.getWordtable(stopwords).keySet());
110   }
111
112   /**
113    * Builds an exclusionlist from an array of Strings.
114    *
115    * @param exclusionlist
116    */

117   public void setStemExclusionTable(String JavaDoc[] exclusionlist) {
118     excltable = StopFilter.makeStopSet(exclusionlist);
119   }
120
121   /**
122    * Builds an exclusionlist from a Hashtable.
123    */

124   public void setStemExclusionTable(HashSet JavaDoc exclusionlist) {
125     excltable = exclusionlist;
126   }
127
128   /**
129    * Builds an exclusionlist from the words contained in the given file.
130    */

131   public void setStemExclusionTable(File JavaDoc exclusionlist) {
132     excltable = new HashSet JavaDoc(WordlistLoader.getWordtable(exclusionlist).keySet());
133   }
134
135   /**
136    * Reads a stemdictionary file , that overrules the stemming algorithm
137    * This is a textfile that contains per line
138    * word\tstem
139    * i.e: tabseperated
140    */

141   public void setStemDictionary(File JavaDoc stemdict) {
142     _stemdict = WordlistLoader.getStemDict(stemdict);
143   }
144
145   /**
146    * Creates a TokenStream which tokenizes all the text in the provided TextReader.
147    *
148    * @return A TokenStream build from a StandardTokenizer filtered with StandardFilter,
149    * StopFilter, DutchStemFilter
150    */

151   public TokenStream tokenStream(String JavaDoc fieldName, Reader JavaDoc reader) {
152     TokenStream result = new StandardTokenizer(reader);
153     result = new StandardFilter(result);
154     result = new StopFilter(result, stoptable);
155     result = new DutchStemFilter(result, excltable, _stemdict);
156     return result;
157   }
158 }
159
Popular Tags