KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > analysis > br > BrazilianAnalyzer


1 package org.apache.lucene.analysis.br;
2
3 /**
4  * Copyright 2004-2005 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
19 import org.apache.lucene.analysis.Analyzer;
20 import org.apache.lucene.analysis.LowerCaseFilter;
21 import org.apache.lucene.analysis.StopFilter;
22 import org.apache.lucene.analysis.TokenStream;
23 import org.apache.lucene.analysis.WordlistLoader;
24 import org.apache.lucene.analysis.standard.StandardFilter;
25 import org.apache.lucene.analysis.standard.StandardTokenizer;
26 import java.io.File JavaDoc;
27 import java.io.IOException JavaDoc;
28 import java.io.Reader JavaDoc;
29 import java.util.Hashtable JavaDoc;
30 import java.util.HashSet JavaDoc;
31 import java.util.Set JavaDoc;
32
33 /**
34  * Analyzer for Brazilian language. Supports an external list of stopwords (words that
35  * will not be indexed at all) and an external list of exclusions (word that will
36  * not be stemmed, but indexed).
37  *
38  * @author João Kramer
39  */

40 public final class BrazilianAnalyzer extends Analyzer {
41
42     /**
43      * List of typical Brazilian stopwords.
44      */

45     public final static String JavaDoc[] BRAZILIAN_STOP_WORDS = {
46       "a","ainda","alem","ambas","ambos","antes",
47       "ao","aonde","aos","apos","aquele","aqueles",
48       "as","assim","com","como","contra","contudo",
49       "cuja","cujas","cujo","cujos","da","das","de",
50       "dela","dele","deles","demais","depois","desde",
51       "desta","deste","dispoe","dispoem","diversa",
52       "diversas","diversos","do","dos","durante","e",
53       "ela","elas","ele","eles","em","entao","entre",
54       "essa","essas","esse","esses","esta","estas",
55       "este","estes","ha","isso","isto","logo","mais",
56       "mas","mediante","menos","mesma","mesmas","mesmo",
57       "mesmos","na","nas","nao","nas","nem","nesse","neste",
58       "nos","o","os","ou","outra","outras","outro","outros",
59       "pelas","pelas","pelo","pelos","perante","pois","por",
60       "porque","portanto","proprio","propios","quais","qual",
61       "qualquer","quando","quanto","que","quem","quer","se",
62       "seja","sem","sendo","seu","seus","sob","sobre","sua",
63       "suas","tal","tambem","teu","teus","toda","todas","todo",
64       "todos","tua","tuas","tudo","um","uma","umas","uns"};
65
66
67     /**
68      * Contains the stopwords used with the StopFilter.
69      */

70     private Set JavaDoc stoptable = new HashSet JavaDoc();
71     
72     /**
73      * Contains words that should be indexed but not stemmed.
74      */

75     private Set JavaDoc excltable = new HashSet JavaDoc();
76
77     /**
78      * Builds an analyzer with the default stop words ({@link #BRAZILIAN_STOP_WORDS}).
79      */

80     public BrazilianAnalyzer() {
81         stoptable = StopFilter.makeStopSet( BRAZILIAN_STOP_WORDS );
82     }
83
84     /**
85      * Builds an analyzer with the given stop words.
86      */

87     public BrazilianAnalyzer( String JavaDoc[] stopwords ) {
88         stoptable = StopFilter.makeStopSet( stopwords );
89     }
90
91     /**
92      * Builds an analyzer with the given stop words.
93      */

94     public BrazilianAnalyzer( Hashtable JavaDoc stopwords ) {
95         stoptable = new HashSet JavaDoc(stopwords.keySet());
96     }
97
98     /**
99      * Builds an analyzer with the given stop words.
100      */

101     public BrazilianAnalyzer( File JavaDoc stopwords ) throws IOException JavaDoc {
102         stoptable = WordlistLoader.getWordSet( stopwords );
103     }
104
105     /**
106      * Builds an exclusionlist from an array of Strings.
107      */

108     public void setStemExclusionTable( String JavaDoc[] exclusionlist ) {
109         excltable = StopFilter.makeStopSet( exclusionlist );
110     }
111     /**
112      * Builds an exclusionlist from a Hashtable.
113      */

114     public void setStemExclusionTable( Hashtable JavaDoc exclusionlist ) {
115         excltable = new HashSet JavaDoc(exclusionlist.keySet());
116     }
117     /**
118      * Builds an exclusionlist from the words contained in the given file.
119      */

120     public void setStemExclusionTable( File JavaDoc exclusionlist ) throws IOException JavaDoc {
121         excltable = WordlistLoader.getWordSet( exclusionlist );
122     }
123
124     /**
125      * Creates a TokenStream which tokenizes all the text in the provided Reader.
126      *
127      * @return A TokenStream build from a StandardTokenizer filtered with
128      * StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter.
129      */

130     public final TokenStream tokenStream(String JavaDoc fieldName, Reader JavaDoc reader) {
131         TokenStream result = new StandardTokenizer( reader );
132         result = new StandardFilter( result );
133         result = new StopFilter( result, stoptable );
134         result = new BrazilianStemFilter( result, excltable );
135         // Convert to lowercase after stemming!
136
result = new LowerCaseFilter( result );
137         return result;
138     }
139 }
140
141
Popular Tags