GreekAnalyzer


1   package org.apache.lucene.analysis.el;
2   
3   /**
4    * Copyright 2005 The Apache Software Foundation
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  import org.apache.lucene.analysis.Analyzer;
20  import org.apache.lucene.analysis.StopFilter;
21  import org.apache.lucene.analysis.TokenStream;
22  import org.apache.lucene.analysis.standard.StandardTokenizer;
23  
24  import java.io.Reader  ;
25  import java.util.HashSet  ;
26  import java.util.Hashtable  ;
27  import java.util.Set  ;
28  
29  /**
30   * Analyzer for the Greek language. Supports an external list of stopwords (words
31   * that will not be indexed at all).
32   * A default set of stopwords is used unless an alternative list is specified.
33   *
34   * @author  Panagiotis Astithas, past@ebs.gr
35   */
36  public final class GreekAnalyzer extends Analyzer
37  {
38      // the letters are indexes to the charset array (see GreekCharsets.java)
39      private static char A = 6;
40      private static char B = 7;
41      private static char G = 8;
42      private static char D = 9;
43      private static char E = 10;
44      private static char Z = 11;
45      private static char H = 12;
46      private static char TH = 13;
47      private static char I = 14;
48      private static char K = 15;
49      private static char L = 16;
50      private static char M = 17;
51      private static char N = 18;
52      private static char KS = 19;
53      private static char O = 20;
54      private static char P = 21;
55      private static char R = 22;
56      private static char S = 24; // skip final sigma
57      private static char T = 25;
58      private static char Y = 26;
59      private static char F = 27;
60      private static char X = 28;
61      private static char PS = 29;
62      private static char W = 30;
63  
64      /**
65       * List of typical Greek stopwords.
66       */
67      private static char[][] GREEK_STOP_WORDS = {
68          {O},
69          {H},
70          {T, O},
71          {O, I},
72          {T, A},
73          {T, O, Y},
74          {T, H, S},
75          {T, W, N},
76          {T, O, N},
77          {T, H, N},
78          {K, A, I},
79          {K, I},
80          {K},
81          {E, I, M, A, I},
82          {E, I, S, A, I},
83          {E, I, N, A, I},
84          {E, I, M, A, S, T, E},
85          {E, I, S, T, E},
86          {S, T, O},
87          {S, T, O, N},
88          {S, T, H},
89          {S, T, H, N},
90          {M, A},
91          {A, L, L, A},
92          {A, P, O},
93          {G, I, A},
94          {P, R, O, S},
95          {M, E},
96          {S, E},
97          {W, S},
98          {P, A, R, A},
99          {A, N, T, I},
100         {K, A, T, A},
101         {M, E, T, A},
102         {TH, A},
103         {N, A},
104         {D, E},
105         {D, E, N},
106         {M, H},
107         {M, H, N},
108         {E, P, I},
109         {E, N, W},
110         {E, A, N},
111         {A, N},
112         {T, O, T, E},
113         {P, O, Y},
114         {P, W, S},
115         {P, O, I, O, S},
116         {P, O, I, A},
117         {P, O, I, O},
118         {P, O, I, O, I},
119         {P, O, I, E, S},
120         {P, O, I, W, N},
121         {P, O, I, O, Y, S},
122         {A, Y, T, O, S},
123         {A, Y, T, H},
124         {A, Y, T, O},
125         {A, Y, T, O, I},
126         {A, Y, T, W, N},
127         {A, Y, T, O, Y, S},
128         {A, Y, T, E, S},
129         {A, Y, T, A},
130         {E, K, E, I, N, O, S},
131         {E, K, E, I, N, H},
132         {E, K, E, I, N, O},
133         {E, K, E, I, N, O, I},
134         {E, K, E, I, N, E, S},
135         {E, K, E, I, N, A},
136         {E, K, E, I, N, W, N},
137         {E, K, E, I, N, O, Y, S},
138         {O, P, W, S},
139         {O, M, W, S},
140         {I, S, W, S},
141         {O, S, O},
142         {O, T, I}
143     };
144 
145     /**
146      * Contains the stopwords used with the StopFilter.
147      */
148     private Set   stopSet = new HashSet  ();
149 
150     /**
151      * Charset for Greek letters.
152      * Represents encoding for 24 lowercase Greek letters.
153      * Predefined charsets can be taken from GreekCharSets class
154      */
155     private char[] charset;
156 
157     public GreekAnalyzer() {
158         charset = GreekCharsets.UnicodeGreek;
159         stopSet = StopFilter.makeStopSet(
160                     makeStopWords(GreekCharsets.UnicodeGreek));
161     }
162 
163     /**
164      * Builds an analyzer.
165      */
166     public GreekAnalyzer(char[] charset)
167     {
168         this.charset = charset;
169         stopSet = StopFilter.makeStopSet(makeStopWords(charset));
170     }
171 
172     /**
173      * Builds an analyzer with the given stop words.
174      */
175     public GreekAnalyzer(char[] charset, String  [] stopwords)
176     {
177         this.charset = charset;
178         stopSet = StopFilter.makeStopSet(stopwords);
179     }
180 
181     // Takes greek stop words and translates them to a String array, using
182     // the given charset
183     private static String  [] makeStopWords(char[] charset)
184     {
185         String  [] res = new String  [GREEK_STOP_WORDS.length];
186         for (int i = 0; i < res.length; i++)
187         {
188             char[] theStopWord = GREEK_STOP_WORDS[i];
189             // translate the word,using the charset
190             StringBuffer   theWord = new StringBuffer  ();
191             for (int j = 0; j < theStopWord.length; j++)
192             {
193                 theWord.append(charset[theStopWord[j]]);
194             }
195             res[i] = theWord.toString();
196         }
197         return res;
198     }
199 
200     /**
201      * Builds an analyzer with the given stop words.
202      */
203     public GreekAnalyzer(char[] charset, Hashtable   stopwords)
204     {
205         this.charset = charset;
206         stopSet = new HashSet  (stopwords.keySet());
207     }
208 
209     /**
210      * Creates a TokenStream which tokenizes all the text in the provided Reader.
211      *
212      * @return  A TokenStream build from a StandardTokenizer filtered with
213      *                  GreekLowerCaseFilter and StopFilter
214      */
215     public TokenStream tokenStream(String   fieldName, Reader   reader)
216     {
217         TokenStream result = new StandardTokenizer(reader);
218         result = new GreekLowerCaseFilter(result, charset);
219         result = new StopFilter(result, stopSet);
220         return result;
221     }
222 }
223
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags