KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > analysis > ISOLatin1AccentFilter


1 package org.apache.lucene.analysis;
2
3 /**
4  * Copyright 2005 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
19 /**
20  * A filter that replaces accented characters in the ISO Latin 1 character set
21  * (ISO-8859-1) by their unaccented equivalent. The case will not be altered.
22  * <p>
23  * For instance, '&agrave;' will be replaced by 'a'.
24  * <p>
25  */

26 public class ISOLatin1AccentFilter extends TokenFilter {
27     public ISOLatin1AccentFilter(TokenStream input) {
28         super(input);
29     }
30
31     public final Token next() throws java.io.IOException JavaDoc {
32         final Token t = input.next();
33         if (t == null)
34             return null;
35         // Return a token with filtered characters.
36
return new Token(removeAccents(t.termText()), t.startOffset(), t.endOffset(), t.type());
37     }
38
39     /**
40      * To replace accented characters in a String by unaccented equivalents.
41      */

42     public final static String JavaDoc removeAccents(String JavaDoc input) {
43         final StringBuffer JavaDoc output = new StringBuffer JavaDoc();
44         for (int i = 0; i < input.length(); i++) {
45             switch (input.charAt(i)) {
46                 case '\u00C0' : // À
47
case '\u00C1' : // ?
48
case '\u00C2' : // Â
49
case '\u00C3' : // Ã
50
case '\u00C4' : // Ä
51
case '\u00C5' : // Å
52
output.append("A");
53                     break;
54                 case '\u00C6' : // Æ
55
output.append("AE");
56                     break;
57                 case '\u00C7' : // Ç
58
output.append("C");
59                     break;
60                 case '\u00C8' : // È
61
case '\u00C9' : // É
62
case '\u00CA' : // Ê
63
case '\u00CB' : // Ë
64
output.append("E");
65                     break;
66                 case '\u00CC' : // Ì
67
case '\u00CD' : // ?
68
case '\u00CE' : // Î
69
case '\u00CF' : // ?
70
output.append("I");
71                     break;
72                 case '\u00D0' : // ?
73
output.append("D");
74                     break;
75                 case '\u00D1' : // Ñ
76
output.append("N");
77                     break;
78                 case '\u00D2' : // Ò
79
case '\u00D3' : // Ó
80
case '\u00D4' : // Ô
81
case '\u00D5' : // Õ
82
case '\u00D6' : // Ö
83
case '\u00D8' : // Ø
84
output.append("O");
85                     break;
86                 case '\u0152' : // Œ
87
output.append("OE");
88                     break;
89                 case '\u00DE' : // Þ
90
output.append("TH");
91                     break;
92                 case '\u00D9' : // Ù
93
case '\u00DA' : // Ú
94
case '\u00DB' : // Û
95
case '\u00DC' : // Ü
96
output.append("U");
97                     break;
98                 case '\u00DD' : // ?
99
case '\u0178' : // Ÿ
100
output.append("Y");
101                     break;
102                 case '\u00E0' : // à
103
case '\u00E1' : // á
104
case '\u00E2' : // â
105
case '\u00E3' : // ã
106
case '\u00E4' : // ä
107
case '\u00E5' : // å
108
output.append("a");
109                     break;
110                 case '\u00E6' : // æ
111
output.append("ae");
112                     break;
113                 case '\u00E7' : // ç
114
output.append("c");
115                     break;
116                 case '\u00E8' : // è
117
case '\u00E9' : // é
118
case '\u00EA' : // ê
119
case '\u00EB' : // ë
120
output.append("e");
121                     break;
122                 case '\u00EC' : // ì
123
case '\u00ED' : // í
124
case '\u00EE' : // î
125
case '\u00EF' : // ï
126
output.append("i");
127                     break;
128                 case '\u00F0' : // ð
129
output.append("d");
130                     break;
131                 case '\u00F1' : // ñ
132
output.append("n");
133                     break;
134                 case '\u00F2' : // ò
135
case '\u00F3' : // ó
136
case '\u00F4' : // ô
137
case '\u00F5' : // õ
138
case '\u00F6' : // ö
139
case '\u00F8' : // ø
140
output.append("o");
141                     break;
142                 case '\u0153' : // œ
143
output.append("oe");
144                     break;
145                 case '\u00DF' : // ß
146
output.append("ss");
147                     break;
148                 case '\u00FE' : // þ
149
output.append("th");
150                     break;
151                 case '\u00F9' : // ù
152
case '\u00FA' : // ú
153
case '\u00FB' : // û
154
case '\u00FC' : // ü
155
output.append("u");
156                     break;
157                 case '\u00FD' : // ý
158
case '\u00FF' : // ÿ
159
output.append("y");
160                     break;
161                 default :
162                     output.append(input.charAt(i));
163                     break;
164             }
165         }
166         return output.toString();
167     }
168 }
Popular Tags