KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > javabb > lucene > analysis > PortugueseAnalyzer


1 /*
2  * Copyright 28/03/2005 - Vicinity - www.vicinity.com.br All rights reserveds
3  */

4 package org.javabb.lucene.analysis;
5
6
7 import java.io.Reader JavaDoc;
8 import java.util.HashSet JavaDoc;
9 import java.util.Set JavaDoc;
10
11 import org.apache.lucene.analysis.Analyzer;
12 import org.apache.lucene.analysis.LowerCaseFilter;
13 import org.apache.lucene.analysis.StopFilter;
14 import org.apache.lucene.analysis.TokenStream;
15 import org.apache.lucene.analysis.standard.StandardFilter;
16 import org.apache.lucene.analysis.standard.StandardTokenizer;
17
18
19 /**
20  * <p>
21  * Lucene Analyzer for brazilian portuguese language. This does not do stemmer
22  * or others advanceds processing, only remove portuguese {@link #STOP_WORDS}
23  * and avoid especial characters, like, but not only, "á", "ç", "õ", etc. Indeed,
24  * any accentuated characters is parse to a similar non accentuated characterer.
25  * For instance, "á" is parsed to "a". Further more, this analyzer does all
26  * works made by a StandardAnalyzer.
27  * <p>
28  * Stop words can be assigned using:
29  * <ul>
30  * <li>{@link #PortugueseAnalyzer() Empty constructor}, which will use
31  * {@link #STOP_WORDS default stop words} set;</li>
32  * <li>{@link #PortugueseAnalyzer(String[]) String-array-based-constructor},
33  * to determine your own stop words set;</li>
34  * <li>{@link #PortugueseAnalyzer(String) String-comma-delimeted-constructor},
35  * which will {@link java.lang.String#split(java.lang.String) split} a String
36  * using comma like delimeter. In other words, if you provide "stop, words" the
37  * stop words set will be {stop, word}</li>
38  * </ul>
39  *
40  * @author Marcos Silva Pereira - marcos.pereira@vicinity.com.br
41  *
42  * @since 06/12/2004
43  *
44  * @version $Id$
45  *
46  * @see br.com.vicinity.lucene.analysis.SpecialCharFilter
47  * @see org.apache.lucene.analysis.standard.StandardAnalyzer
48  */

49 public class PortugueseAnalyzer extends Analyzer {
50
51     /**
52      * <code>STOP_WORDS</code> contains a default set of stop words.
53      */

54     public static final String JavaDoc[] STOP_WORDS = new String JavaDoc[] { "0", "1", "2",
55             "3", "4", "5", "6", "7", "8", "9", "a", "ainda", "alem", "algum",
56             "alguma", "alguns", "ali", "além", "ambas", "ambos", "ano", "anos",
57             "antes", "ao", "aonde", "aos", "apenas", "apos", "aquela",
58             "aquele", "aqueles", "as", "assim", "ato", "até", "b", "bem",
59             "boa", "bom", "c", "cada", "cargo", "carta", "casa", "com", "como",
60             "consta", "contra", "contudo", "cuja", "cujas", "cujo", "cujos",
61             "d", "da", "daquele", "dar", "das", "data", "de", "dela", "dele",
62             "deles", "demais", "depois", "desde", "desta", "deste", "deu",
63             "dia", "dias", "dispoe", "dispoem", "dito", "diversa", "diversas",
64             "diversos", "diz", "do", "dois", "dos", "dr", "duas", "durante",
65             "e", "ela", "elas", "ele", "eles", "em", "enfim", "entao", "entre",
66             "então", "era", "eram", "essa", "essas", "esse", "esses", "esta",
67             "estas", "estava", "este", "estes", "f", "fazer", "fez", "ficou",
68             "fim", "foi", "foram", "fr", "g", "gente", "geral", "h", "ha",
69             "havia", "hoje", "há", "i", "isso", "isto", "j", "já", "k", "l",
70             "lhe", "lhes", "logo", "lugar", "m", "maior", "mais", "mas", "me",
71             "mediante", "menos", "mesma", "mesmas", "mesmo", "mesmos", "muito",
72             "muitos", "n", "na", "nao", "nas", "nem", "nesse", "nesta",
73             "neste", "no", "nome", "nos", "nossa", "nosso", "nossos", "nova",
74             "novo", "não", "nós", "o", "onde", "ordem", "os", "ou", "outra",
75             "outras", "outro", "outros", "p", "para", "parte", "pela", "pelas",
76             "pelo", "pelos", "perante", "pois", "por", "porque", "portanto",
77             "porém", "pouco", "propios", "proprio", "q", "quais", "qual",
78             "qualquer", "quando", "quanto", "que", "quem", "quer", "r", "rua",
79             "s", "se", "segundo", "seja", "sem", "sempre", "sendo", "ser",
80             "seu", "seus", "sob", "sobre", "sua", "suas", "são", "só", "sôbre",
81             "t", "tal", "tambem", "também", "tanto", "tem", "tendo", "ter",
82             "teu", "teus", "teve", "tinha", "tinham", "toda", "todas", "todo",
83             "todos", "três", "tua", "tuas", "tudo", "tão", "u", "um", "uma",
84             "umas", "uns", "v", "veio", "vem", "vez", "vê", "w", "x", "y", "z",
85             "à", "às", "é", "êle" };
86
87     private Set JavaDoc stopWords = new HashSet JavaDoc();
88
89     /**
90      * Construct a analyzer with {@link #STOP_WORDS default stop words} set
91      */

92     public PortugueseAnalyzer () {
93
94         this(STOP_WORDS);
95
96     }
97
98     /**
99      * @param stopWords
100      */

101     public PortugueseAnalyzer ( String JavaDoc[] stopWords ) {
102
103         this.stopWords = StopFilter.makeStopSet(stopWords);
104
105     }
106
107     /**
108      * @param words
109      */

110     public PortugueseAnalyzer ( String JavaDoc words ) {
111
112         this(makeArray(words));
113
114     }
115
116     /**
117      * @param fieldName
118      * @param reader
119      *
120      * @return
121      *
122      * @see StandardAnalyzer#tokenStream(java.lang.String, java.io.Reader)
123      */

124     public TokenStream tokenStream( String JavaDoc fieldName, Reader JavaDoc reader ) {
125
126         TokenStream result = new StandardTokenizer(reader);
127
128         result = new StandardFilter(result);
129         result = new LowerCaseFilter(result);
130         result = new StopFilter(result, stopWords);
131         result = new SpecialCharFilter(result);
132
133         return result;
134
135     }
136
137     /**
138      * Construct a array from a String with comma separated words.
139      *
140      * @param words a String with comma separated words
141      *
142      * @return a array with all words from <tt>words</tt>
143      */

144     private static String JavaDoc[] makeArray( String JavaDoc words ) {
145
146         String JavaDoc[] split = words.split(", ");
147         String JavaDoc[] result = new String JavaDoc[split.length];
148
149         for (int i = 0; i < split.length; i++) {
150
151             result[i] = split[i].trim().toLowerCase();
152
153         }
154
155         return result;
156
157     }
158
159 }
160
Popular Tags