KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > analysis > ru > RussianCharsets


1 package org.apache.lucene.analysis.ru;
2 /**
3  * Copyright 2004 The Apache Software Foundation
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */

17
18 /**
19  * RussianCharsets class contains encoding schemes (charsets) and a toLowerCase() method implementation
20  * for Russian characters in Unicode, KOI8 and CP1251.
21  * Each encoding scheme contains lowercase (positions 0-31) and uppercase (position 32-63) characters.
22  * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
23  * and adding logic to toLowerCase() method for that charset.
24  *
25  * @author Boris Okner, b.okner@rogers.com
26  * @version $Id: RussianCharsets.java 150998 2004-08-16 20:30:46Z dnaber $
27  */

public class RussianCharsets
{
    /**
     * Unicode code points of the 32 Russian letters U+0430..U+044F (lowercase,
     * positions 0-31) followed by U+0410..U+042F (uppercase, positions 32-63).
     * The reference is final because {@link #toLowerCase(char, char[])}
     * recognises a charset by array identity.
     */
    public static final char[] UnicodeRussian = {
        // lower case
        '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0435', '\u0436', '\u0437',
        '\u0438', '\u0439', '\u043A', '\u043B', '\u043C', '\u043D', '\u043E', '\u043F',
        '\u0440', '\u0441', '\u0442', '\u0443', '\u0444', '\u0445', '\u0446', '\u0447',
        '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F',
        // upper case
        '\u0410', '\u0411', '\u0412', '\u0413', '\u0414', '\u0415', '\u0416', '\u0417',
        '\u0418', '\u0419', '\u041A', '\u041B', '\u041C', '\u041D', '\u041E', '\u041F',
        '\u0420', '\u0421', '\u0422', '\u0423', '\u0424', '\u0425', '\u0426', '\u0427',
        '\u0428', '\u0429', '\u042A', '\u042B', '\u042C', '\u042D', '\u042E', '\u042F'
    };

    /**
     * KOI8 charset: lowercase codes (all within 0xC0..0xDF) at positions 0-31,
     * followed by the matching uppercase codes (each exactly 32 higher, within
     * 0xE0..0xFF) at positions 32-63.
     */
    public static final char[] KOI8 = {
        // lower case
        0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda,
        0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
        0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde,
        0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1,
        // upper case
        0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa,
        0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
        0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe,
        0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1
    };

    /**
     * CP1251 charset: lowercase codes 0xE0..0xFF at positions 0-31, followed by
     * uppercase codes 0xC0..0xDF at positions 32-63.
     */
    public static final char[] CP1251 = {
        // lower case
        0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
        0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
        0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
        // upper case
        0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
        0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
        0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
        0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF
    };

    /**
     * Converts a character to lower case according to the given charset.
     *
     * The charset is identified by reference: it must be one of
     * {@link #UnicodeRussian}, {@link #KOI8} or {@link #CP1251} for the
     * charset-specific mapping to apply. For any other array (including
     * <code>null</code> or a copy of one of the tables), and for any letter
     * outside the selected charset's letter range, the result is
     * {@link Character#toLowerCase(char)}.
     *
     * @param letter  the character code to convert, expressed in the given charset
     * @param charset one of the charset tables declared in this class
     * @return the lowercase character code in the same charset
     */
    public static char toLowerCase(char letter, char[] charset)
    {
        if (charset == UnicodeRussian)
        {
            if (letter >= '\u0430' && letter <= '\u044F')
            {
                // already lower case
                return letter;
            }
            if (letter >= '\u0410' && letter <= '\u042F')
            {
                // the lowercase block starts 32 code points after the uppercase block
                return (char) (letter + 32);
            }
        }
        else if (charset == KOI8)
        {
            // In KOI8 the uppercase letters are the HIGHER codes, so subtract.
            if (letter >= 0xe0 && letter <= 0xff)
            {
                return (char) (letter - 32);
            }
            if (letter >= 0xc0 && letter <= 0xdf)
            {
                // already lower case
                return letter;
            }
        }
        else if (charset == CP1251)
        {
            if (letter >= 0xC0 && letter <= 0xDF)
            {
                return (char) (letter + 32);
            }
            if (letter >= 0xE0 && letter <= 0xFF)
            {
                // already lower case
                return letter;
            }
        }

        // Unknown charset, or a character outside the charset's letter range.
        return Character.toLowerCase(letter);
    }
}
280
Popular Tags