KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > javabb > lucene > analysis > SpecialCharFilter


1 /*
2  * Copyright 28/03/2005 - Vicinity - www.vicinity.com.br All rights reserveds
3  */

4 package org.javabb.lucene.analysis;
5
6
7 import java.io.IOException JavaDoc;
8 import java.util.regex.Matcher JavaDoc;
9 import java.util.regex.Pattern JavaDoc;
10
11 import org.apache.lucene.analysis.Token;
12 import org.apache.lucene.analysis.TokenFilter;
13 import org.apache.lucene.analysis.TokenStream;
14
15
16 /**
17  * Replaces special chars in token terms. Special chars include only accented
18  * letters like "á", "Ó", etc., and "c" with a cedilla. Note that matching is
19  * case insensitive and the replacement is always a lower-case letter.
20  *
21  * @author Marcos Silva Pereira - marcos.pereira@vicinity.com.br
22  * @version $Id$
23  */

24 class SpecialCharFilter extends TokenFilter {
25
26     private static final String JavaDoc[] REPLACES;
27     private static final Pattern JavaDoc[] PATTERNS;
28
29     static {
30
31         REPLACES = new String JavaDoc[]{"a", "e", "i", "o", "u", "c"};
32
33         PATTERNS = new Pattern JavaDoc[REPLACES.length];
34
35         // pre compile patterns
36
PATTERNS[0] = Pattern.compile("[âãáàä]", Pattern.CASE_INSENSITIVE);
37         PATTERNS[1] = Pattern.compile("[éèêë]", Pattern.CASE_INSENSITIVE);
38         PATTERNS[2] = Pattern.compile("[íìîï]", Pattern.CASE_INSENSITIVE);
39         PATTERNS[3] = Pattern.compile("[óòôõö]", Pattern.CASE_INSENSITIVE);
40         PATTERNS[4] = Pattern.compile("[úùûü]", Pattern.CASE_INSENSITIVE);
41         PATTERNS[5] = Pattern.compile("ç", Pattern.CASE_INSENSITIVE);
42
43     }
44
45     /**
46      * @param in
47      */

48     public SpecialCharFilter ( TokenStream in ) {
49
50         super(in);
51
52     }
53
54     /**
55      * @see org.apache.lucene.analysis.TokenStream#next()
56      */

57     public Token next() throws IOException JavaDoc {
58
59         Token t = input.next();
60
61         if (t == null) {
62
63             return null;
64
65         }
66
67         String JavaDoc termText = replaceSpecial(t.termText());
68         Token token = new Token(termText, t.startOffset(), t.endOffset());
69
70         return token;
71
72     }
73
74     private String JavaDoc replaceSpecial( String JavaDoc text ) {
75
76         String JavaDoc result = text;
77
78         for (int i = 0; i < PATTERNS.length; i++) {
79
80             Matcher JavaDoc matcher = PATTERNS[i].matcher(result);
81             result = matcher.replaceAll(REPLACES[i]);
82
83         }
84
85         return result;
86
87     }
88 }
89
Popular Tags