KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > analysis > el > GreekAnalyzer


1 package org.apache.lucene.analysis.el;
2
3 /**
4  * Copyright 2005 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
19 import org.apache.lucene.analysis.Analyzer;
20 import org.apache.lucene.analysis.StopFilter;
21 import org.apache.lucene.analysis.TokenStream;
22 import org.apache.lucene.analysis.standard.StandardTokenizer;
23
24 import java.io.Reader JavaDoc;
25 import java.util.HashSet JavaDoc;
26 import java.util.Hashtable JavaDoc;
27 import java.util.Set JavaDoc;
28
29 /**
30  * Analyzer for the Greek language. Supports an external list of stopwords (words
31  * that will not be indexed at all).
32  * A default set of stopwords is used unless an alternative list is specified.
33  *
34  * @author Panagiotis Astithas, past@ebs.gr
35  */

36 public final class GreekAnalyzer extends Analyzer
37 {
38     // the letters are indexes to the charset array (see GreekCharsets.java)
39
private static char A = 6;
40     private static char B = 7;
41     private static char G = 8;
42     private static char D = 9;
43     private static char E = 10;
44     private static char Z = 11;
45     private static char H = 12;
46     private static char TH = 13;
47     private static char I = 14;
48     private static char K = 15;
49     private static char L = 16;
50     private static char M = 17;
51     private static char N = 18;
52     private static char KS = 19;
53     private static char O = 20;
54     private static char P = 21;
55     private static char R = 22;
56     private static char S = 24; // skip final sigma
57
private static char T = 25;
58     private static char Y = 26;
59     private static char F = 27;
60     private static char X = 28;
61     private static char PS = 29;
62     private static char W = 30;
63
64     /**
65      * List of typical Greek stopwords.
66      */

67     private static char[][] GREEK_STOP_WORDS = {
68         {O},
69         {H},
70         {T, O},
71         {O, I},
72         {T, A},
73         {T, O, Y},
74         {T, H, S},
75         {T, W, N},
76         {T, O, N},
77         {T, H, N},
78         {K, A, I},
79         {K, I},
80         {K},
81         {E, I, M, A, I},
82         {E, I, S, A, I},
83         {E, I, N, A, I},
84         {E, I, M, A, S, T, E},
85         {E, I, S, T, E},
86         {S, T, O},
87         {S, T, O, N},
88         {S, T, H},
89         {S, T, H, N},
90         {M, A},
91         {A, L, L, A},
92         {A, P, O},
93         {G, I, A},
94         {P, R, O, S},
95         {M, E},
96         {S, E},
97         {W, S},
98         {P, A, R, A},
99         {A, N, T, I},
100         {K, A, T, A},
101         {M, E, T, A},
102         {TH, A},
103         {N, A},
104         {D, E},
105         {D, E, N},
106         {M, H},
107         {M, H, N},
108         {E, P, I},
109         {E, N, W},
110         {E, A, N},
111         {A, N},
112         {T, O, T, E},
113         {P, O, Y},
114         {P, W, S},
115         {P, O, I, O, S},
116         {P, O, I, A},
117         {P, O, I, O},
118         {P, O, I, O, I},
119         {P, O, I, E, S},
120         {P, O, I, W, N},
121         {P, O, I, O, Y, S},
122         {A, Y, T, O, S},
123         {A, Y, T, H},
124         {A, Y, T, O},
125         {A, Y, T, O, I},
126         {A, Y, T, W, N},
127         {A, Y, T, O, Y, S},
128         {A, Y, T, E, S},
129         {A, Y, T, A},
130         {E, K, E, I, N, O, S},
131         {E, K, E, I, N, H},
132         {E, K, E, I, N, O},
133         {E, K, E, I, N, O, I},
134         {E, K, E, I, N, E, S},
135         {E, K, E, I, N, A},
136         {E, K, E, I, N, W, N},
137         {E, K, E, I, N, O, Y, S},
138         {O, P, W, S},
139         {O, M, W, S},
140         {I, S, W, S},
141         {O, S, O},
142         {O, T, I}
143     };
144
145     /**
146      * Contains the stopwords used with the StopFilter.
147      */

148     private Set JavaDoc stopSet = new HashSet JavaDoc();
149
150     /**
151      * Charset for Greek letters.
152      * Represents encoding for 24 lowercase Greek letters.
153      * Predefined charsets can be taken from GreekCharSets class
154      */

155     private char[] charset;
156
157     public GreekAnalyzer() {
158         charset = GreekCharsets.UnicodeGreek;
159         stopSet = StopFilter.makeStopSet(
160                     makeStopWords(GreekCharsets.UnicodeGreek));
161     }
162
163     /**
164      * Builds an analyzer.
165      */

166     public GreekAnalyzer(char[] charset)
167     {
168         this.charset = charset;
169         stopSet = StopFilter.makeStopSet(makeStopWords(charset));
170     }
171
172     /**
173      * Builds an analyzer with the given stop words.
174      */

175     public GreekAnalyzer(char[] charset, String JavaDoc[] stopwords)
176     {
177         this.charset = charset;
178         stopSet = StopFilter.makeStopSet(stopwords);
179     }
180
181     // Takes greek stop words and translates them to a String array, using
182
// the given charset
183
private static String JavaDoc[] makeStopWords(char[] charset)
184     {
185         String JavaDoc[] res = new String JavaDoc[GREEK_STOP_WORDS.length];
186         for (int i = 0; i < res.length; i++)
187         {
188             char[] theStopWord = GREEK_STOP_WORDS[i];
189             // translate the word,using the charset
190
StringBuffer JavaDoc theWord = new StringBuffer JavaDoc();
191             for (int j = 0; j < theStopWord.length; j++)
192             {
193                 theWord.append(charset[theStopWord[j]]);
194             }
195             res[i] = theWord.toString();
196         }
197         return res;
198     }
199
200     /**
201      * Builds an analyzer with the given stop words.
202      */

203     public GreekAnalyzer(char[] charset, Hashtable JavaDoc stopwords)
204     {
205         this.charset = charset;
206         stopSet = new HashSet JavaDoc(stopwords.keySet());
207     }
208
209     /**
210      * Creates a TokenStream which tokenizes all the text in the provided Reader.
211      *
212      * @return A TokenStream build from a StandardTokenizer filtered with
213      * GreekLowerCaseFilter and StopFilter
214      */

215     public TokenStream tokenStream(String JavaDoc fieldName, Reader JavaDoc reader)
216     {
217         TokenStream result = new StandardTokenizer(reader);
218         result = new GreekLowerCaseFilter(result, charset);
219         result = new StopFilter(result, stopSet);
220         return result;
221     }
222 }
223
Popular Tags