KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > analysis > el > GreekCharsets


1 package org.apache.lucene.analysis.el;
2
3 /**
4  * Copyright 2005 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
/**
 * GreekCharsets holds the Greek letter tables for three encoding schemes (Unicode,
 * ISO-8859-7 and Microsoft Windows CP1253) together with a {@code toLowerCase()}
 * implementation for Greek characters in those schemes.
 * <p>
 * Each table lists the lower case letters first (positions 0-35) followed by the
 * upper case letters (positions 36-68), accented letters included. Another encoding
 * scheme (see RFC 1947) can be supported by adding a new table and the matching
 * logic in {@link #toLowerCase(char, char[])}.
 *
 * @author Panagiotis Astithas, past@ebs.gr
 */
public class GreekCharsets
{
    /** Unicode Greek charset: 36 lower case letters followed by 33 upper case letters. */
    public static char[] UnicodeGreek = {
        // lower case
        '\u0390',
        '\u03AC', '\u03AD', '\u03AE', '\u03AF', '\u03B0',
        '\u03B1', '\u03B2', '\u03B3', '\u03B4', '\u03B5',
        '\u03B6', '\u03B7', '\u03B8', '\u03B9', '\u03BA',
        '\u03BB', '\u03BC', '\u03BD', '\u03BE', '\u03BF',
        '\u03C0', '\u03C1', '\u03C2', '\u03C3', '\u03C4',
        '\u03C5', '\u03C6', '\u03C7', '\u03C8', '\u03C9',
        '\u03CA', '\u03CB', '\u03CC', '\u03CD', '\u03CE',
        // upper case
        '\u0386', '\u0388', '\u0389', '\u038A', '\u038C', '\u038E', '\u038F',
        '\u0391', '\u0392', '\u0393', '\u0394', '\u0395',
        '\u0396', '\u0397', '\u0398', '\u0399', '\u039A',
        '\u039B', '\u039C', '\u039D', '\u039E', '\u039F',
        '\u03A0', '\u03A1',
        '\u03A3', '\u03A4', '\u03A5', '\u03A6', '\u03A7', '\u03A8', '\u03A9',
        '\u03AA', '\u03AB'
    };

    /** ISO-8859-7 charset (ELOT-928): 36 lower case letters followed by 33 upper case letters. */
    public static char[] ISO = {
        // lower case
        0xc0,
        0xdc, 0xdd, 0xde, 0xdf, 0xe0,
        0xe1, 0xe2, 0xe3, 0xe4, 0xe5,
        0xe6, 0xe7, 0xe8, 0xe9, 0xea,
        0xeb, 0xec, 0xed, 0xee, 0xef,
        0xf0, 0xf1, 0xf2, 0xf3, 0xf4,
        0xf5, 0xf6, 0xf7, 0xf8, 0xf9,
        0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
        // upper case
        0xb6, 0xb8, 0xb9, 0xba, 0xbc, 0xbe, 0xbf,
        0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
        0xc6, 0xc7, 0xc8, 0xc9, 0xca,
        0xcb, 0xcc, 0xcd, 0xce, 0xcf,
        0xd0, 0xd1,
        0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9,
        0xda, 0xdb
    };

    /**
     * CP1253 charset: 36 lower case letters followed by 33 upper case letters.
     * The table differs from {@link #ISO} only in its first upper case entry
     * (0xa2 here, 0xb6 there).
     */
    public static char[] CP1253 = {
        // lower case
        0xc0,
        0xdc, 0xdd, 0xde, 0xdf, 0xe0,
        0xe1, 0xe2, 0xe3, 0xe4, 0xe5,
        0xe6, 0xe7, 0xe8, 0xe9, 0xea,
        0xeb, 0xec, 0xed, 0xee, 0xef,
        0xf0, 0xf1, 0xf2, 0xf3, 0xf4,
        0xf5, 0xf6, 0xf7, 0xf8, 0xf9,
        0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
        // upper case
        0xa2, 0xb8, 0xb9, 0xba, 0xbc, 0xbe, 0xbf,
        0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
        0xc6, 0xc7, 0xc8, 0xc9, 0xca,
        0xcb, 0xcc, 0xcd, 0xce, 0xcf,
        0xd0, 0xd1,
        0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9,
        0xda, 0xdb
    };

    /**
     * Converts a Greek letter to its lower case, unaccented form within the given charset.
     * <p>
     * Besides folding case, accented and diaeresis variants are mapped to their plain
     * base letter, and 'small final sigma' is mapped to 'small sigma'. The result is
     * always expressed in the same charset as the input. Characters that are not covered
     * by the Greek rules of the selected charset are passed to
     * {@link Character#toLowerCase(char)}.
     *
     * @param letter  the character to convert, encoded in {@code charset}
     * @param charset one of {@link #UnicodeGreek}, {@link #ISO} or {@link #CP1253}; the
     *                table is recognised by reference identity, so any other array
     *                (including a copy of one of these) selects plain
     *                {@link Character#toLowerCase(char)}
     * @return the lower case, unaccented character
     */
    public static char toLowerCase(char letter, char[] charset)
    {
        if (charset == UnicodeGreek) {
            return unicodeToLowerCase(letter);
        }
        if (charset == ISO) {
            // Capital alpha with acute is 0xb6 in ISO-8859-7
            return singleByteToLowerCase(letter, (char) 0xb6);
        }
        if (charset == CP1253) {
            // Capital alpha with acute is 0xa2 in CP1253
            return singleByteToLowerCase(letter, (char) 0xa2);
        }
        return Character.toLowerCase(letter);
    }

    /** Lower-cases and strips accents from a letter encoded in Unicode. */
    private static char unicodeToLowerCase(char letter)
    {
        // Lower case, not accented letters
        if (letter >= '\u03B1' && letter <= '\u03C9') {
            // Special case 'small final sigma', where we return 'small sigma'
            return letter == '\u03C2' ? '\u03C3' : letter;
        }
        // Upper case, not accented letters sit exactly 32 positions below their lower case
        if (letter >= '\u0391' && letter <= '\u03A9') {
            return (char) (letter + 32);
        }
        // Accented letters, lower case first and upper case second on each line
        switch (letter) {
            // alpha with acute
            case '\u03AC':
            case '\u0386':
                return '\u03B1';
            // epsilon with acute
            case '\u03AD':
            case '\u0388':
                return '\u03B5';
            // eta with acute
            case '\u03AE':
            case '\u0389':
                return '\u03B7';
            // iota with acute, with diaeresis, with acute and diaeresis
            case '\u03AF':
            case '\u03CA':
            case '\u0390':
            case '\u038A':
            case '\u03AA':
                return '\u03B9';
            // upsilon with acute, with diaeresis, with acute and diaeresis
            case '\u03CD':
            case '\u03CB':
            case '\u03B0':
            case '\u038E':
            case '\u03AB':
                return '\u03C5';
            // omicron with acute
            case '\u03CC':
            case '\u038C':
                return '\u03BF';
            // omega with acute
            case '\u03CE':
            case '\u038F':
                return '\u03C9';
            default:
                return Character.toLowerCase(letter);
        }
    }

    /**
     * Lower-cases and strips accents from a letter encoded in ISO-8859-7 or CP1253.
     * The two tables share every Greek letter position used here except capital alpha
     * with acute, whose code is supplied by the caller.
     */
    private static char singleByteToLowerCase(char letter, char upperAlphaAcute)
    {
        // Lower case, not accented letters
        if (letter >= 0xe1 && letter <= 0xf9) {
            // Special case 'small final sigma', where we return 'small sigma'
            return letter == 0xf2 ? (char) 0xf3 : letter;
        }
        // Upper case, not accented letters sit exactly 32 positions below their lower case
        if (letter >= 0xc1 && letter <= 0xd9) {
            return (char) (letter + 32);
        }
        // Capital alpha with acute (charset specific)
        if (letter == upperAlphaAcute) {
            return 0xe1;
        }
        // Remaining accented letters, lower case first and upper case second
        switch (letter) {
            // alpha with acute (lower case)
            case 0xdc:
                return 0xe1;
            // epsilon with acute
            case 0xdd:
            case 0xb8:
                return 0xe5;
            // eta with acute
            case 0xde:
            case 0xb9:
                return 0xe7;
            // iota with acute, with diaeresis, with acute and diaeresis.
            // The result must be the single-byte small iota (0xe9), not the
            // Unicode code point U+03B9, so the output stays in the input charset.
            case 0xdf:
            case 0xfa:
            case 0xc0:
            case 0xba:
            case 0xda:
                return 0xe9;
            // upsilon with acute, with diaeresis, with acute and diaeresis
            case 0xfd:
            case 0xfb:
            case 0xe0:
            case 0xbe:
            case 0xdb:
                return 0xf5;
            // omicron with acute
            case 0xfc:
            case 0xbc:
                return 0xef;
            // omega with acute
            case 0xfe:
            case 0xbf:
                return 0xf9;
            default:
                return Character.toLowerCase(letter);
        }
    }
}
482
Popular Tags