KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > analysis > br > BrazilianAnalyzer


1 package org.apache.lucene.analysis.br;
2
3 /**
4  * Copyright 2004-2005 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
19 import org.apache.lucene.analysis.Analyzer;
20 import org.apache.lucene.analysis.LowerCaseFilter;
21 import org.apache.lucene.analysis.StopFilter;
22 import org.apache.lucene.analysis.TokenStream;
23 import org.apache.lucene.analysis.WordlistLoader;
24 import org.apache.lucene.analysis.standard.StandardFilter;
25 import org.apache.lucene.analysis.standard.StandardTokenizer;
26 import java.io.File JavaDoc;
27 import java.io.IOException JavaDoc;
28 import java.io.Reader JavaDoc;
29 import java.util.Hashtable JavaDoc;
30 import java.util.HashSet JavaDoc;
31 import java.util.Set JavaDoc;
32
33 /**
34  * Analyzer for Brazilian language. Supports an external list of stopwords (words that
35  * will not be indexed at all) and an external list of exclusions (word that will
36  * not be stemmed, but indexed).
37  *
38  * @author João Kramer
39  */

40 public final class BrazilianAnalyzer extends Analyzer {
41
42     /**
43      * List of typical Brazilian stopwords.
44      */

45     public final static String JavaDoc[] BRAZILIAN_STOP_WORDS = {
46       "a","ainda","alem","ambas","ambos","antes",
47       "ao","aonde","aos","apos","aquele","aqueles",
48       "as","assim","com","como","contra","contudo",
49       "cuja","cujas","cujo","cujos","da","das","de",
50       "dela","dele","deles","demais","depois","desde",
51       "desta","deste","dispoe","dispoem","diversa",
52       "diversas","diversos","do","dos","durante","e",
53       "ela","elas","ele","eles","em","entao","entre",
54       "essa","essas","esse","esses","esta","estas",
55       "este","estes","ha","isso","isto","logo","mais",
56       "mas","mediante","menos","mesma","mesmas","mesmo",
57       "mesmos","na","nas","nao","nas","nem","nesse","neste",
58       "nos","o","os","ou","outra","outras","outro","outros",
59       "pelas","pelas","pelo","pelos","perante","pois","por",
60       "porque","portanto","proprio","propios","quais","qual",
61       "qualquer","quando","quanto","que","quem","quer","se",
62       "seja","sem","sendo","seu","seus","sob","sobre","sua",
63       "suas","tal","tambem","teu","teus","toda","todas","todo",
64       "todos","tua","tuas","tudo","um","uma","umas","uns"};
65
66
67     /**
68      * Contains the stopwords used with the StopFilter.
69      */

70     private Set JavaDoc stoptable = new HashSet JavaDoc();
71     
72     /**
73      * Contains words that should be indexed but not stemmed.
74      */

75     private Set JavaDoc excltable = new HashSet JavaDoc();
76
77     /**
78      * Builds an analyzer with the default stop words ({@link #BRAZILIAN_STOP_WORDS}).
79      */

80     public BrazilianAnalyzer() {
81         stoptable = StopFilter.makeStopSet( BRAZILIAN_STOP_WORDS );
82     }
83
84     /**
85      * Builds an analyzer with the given stop words.
86      */

87     public BrazilianAnalyzer( String JavaDoc[] stopwords ) {
88         stoptable = StopFilter.makeStopSet( stopwords );
89     }
90
91     /**
92      * Builds an analyzer with the given stop words.
93      */

94     public BrazilianAnalyzer( Hashtable JavaDoc stopwords ) {
95         stoptable = new HashSet JavaDoc(stopwords.keySet());
96     }
97
98     /**
99      * Builds an analyzer with the given stop words.
100      */

101     public BrazilianAnalyzer( File JavaDoc stopwords ) throws IOException JavaDoc {
102         stoptable = WordlistLoader.getWordSet( stopwords );
103     }
104
105     /**
106      * Builds an exclusionlist from an array of Strings.
107      */

108     public void setStemExclusionTable( String JavaDoc[] exclusionlist ) {
109         excltable = StopFilter.makeStopSet( exclusionlist );
110     }
111     /**
112      * Builds an exclusionlist from a Hashtable.
113      */

114     public void setStemExclusionTable( Hashtable JavaDoc exclusionlist ) {
115         excltable = new HashSet JavaDoc(exclusionlist.keySet());
116     }
117     /**
118      * Builds an exclusionlist from the words contained in the given file.
119      */

120     public void setStemExclusionTable( File JavaDoc exclusionlist ) throws IOException JavaDoc {
121         excltable = WordlistLoader.getWordSet( exclusionlist );
122     }
123
124     /**
125      * Creates a TokenStream which tokenizes all the text in the provided Reader.
126      *
127      * @return A TokenStream build from a StandardTokenizer filtered with
128      * StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter.
129      */

130     public final TokenStream tokenStream(String JavaDoc fieldName, Reader JavaDoc reader) {
131         TokenStream result = new StandardTokenizer( reader );
132         result = new StandardFilter( result );
133         result = new StopFilter( result, stoptable );
134         result = new BrazilianStemFilter( result, excltable );
135         // Convert to lowercase after stemming!
136
result = new LowerCaseFilter( result );
137         return result;
138     }
139 }
140
141
Popular Tags