package org.apache.lucene.analysis.cz;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

import java.io.*;
import java.util.Hashtable;
import java.util.HashSet;
import java.util.Set;

/**
 * Analyzer for the Czech language. Supports an external list of stopwords
 * (words that will not be indexed at all). A default set of Czech stopwords
 * is used unless an alternative list is specified.
 */
public final class CzechAnalyzer extends Analyzer {

  /**
   * List of typical Czech stopwords (written as Unicode escapes so the source
   * file stays pure ASCII).
   */
  public final static String[] CZECH_STOP_WORDS = {
    "a","s","k","o","i","u","v","z","dnes","cz","t\u00edmto","bude\u0161","budem",
    "byli","jse\u0161","m\u016fj","sv\u00fdm","ta","tomto","tohle","tuto","tyto",
    "jej","zda","pro\u010d","m\u00e1te","tato","kam","tohoto","kdo","kte\u0159\u00ed",
    "mi","n\u00e1m","tom","tomuto","m\u00edt","nic","proto","kterou","byla",
    "toho","proto\u017ee","asi","ho","na\u0161i","napi\u0161te","re","co\u017e","t\u00edm",
    "tak\u017ee","sv\u00fdch","jej\u00ed","sv\u00fdmi","jste","aj","tu","tedy","teto",
    "bylo","kde","ke","prav\u00e9","ji","nad","nejsou","\u010di","pod","t\u00e9ma",
    "mezi","p\u0159es","ty","pak","v\u00e1m","ani","kdy\u017e","v\u0161ak","neg","jsem",
    "tento","\u010dl\u00e1nku","\u010dl\u00e1nky","aby","jsme","p\u0159ed","pta","jejich",
    "byl","je\u0161t\u011b","a\u017e","bez","tak\u00e9","pouze","prvn\u00ed","va\u0161e","kter\u00e1",
    "n\u00e1s","nov\u00fd","tipy","pokud","m\u016f\u017ee","strana","jeho","sv\u00e9","jin\u00e9",
    "zpr\u00e1vy","nov\u00e9","nen\u00ed","v\u00e1s","jen","podle","zde","u\u017e","b\u00fdt","v\u00edce",
    "bude","ji\u017e","ne\u017e","kter\u00fd","by","kter\u00e9","co","nebo","ten","tak",
    "m\u00e1","p\u0159i","od","po","jsou","jak","dal\u0161\u00ed","ale","si","se","ve",
    "to","jako","za","zp\u011bt","ze","do","pro","je","na","atd","atp",
    "jakmile","p\u0159i\u010dem\u017e","j\u00e1","on","ona","ono","oni","ony","my","vy",
    "j\u00ed","ji","m\u011b","mne","jemu","tomu","t\u011bm","t\u011bmu","n\u011bmu","n\u011bmu\u017e",
    "jeho\u017e","j\u00ed\u017e","jeliko\u017e","je\u017e","jako\u017e","na\u010de\u017e",
  };

  /**
   * Contains the stopwords used with the {@link StopFilter}.
   */
  private Set stoptable;

  /**
   * Builds an analyzer with the default Czech stop words ({@link #CZECH_STOP_WORDS}).
   */
  public CzechAnalyzer() {
    stoptable = StopFilter.makeStopSet( CZECH_STOP_WORDS );
  }

  /**
   * Builds an analyzer with the given stop words.
   */
  public CzechAnalyzer( String[] stopwords ) {
    stoptable = StopFilter.makeStopSet( stopwords );
  }

  /**
   * Builds an analyzer whose stop words are the keys of the given Hashtable.
   */
  public CzechAnalyzer( Hashtable stopwords ) {
    stoptable = new HashSet(stopwords.keySet());
  }

  /**
   * Builds an analyzer with the given stop word set.
   */
  public CzechAnalyzer( HashSet stopwords ) {
    stoptable = stopwords;
  }

  /**
   * Builds an analyzer with stop words read from the given file, one word per line.
   */
  public CzechAnalyzer( File stopwords ) throws IOException {
    stoptable = WordlistLoader.getWordSet( stopwords );
  }

  /**
   * Loads the stopword table from a resource stream (file, database...),
   * replacing any previously configured stop words.
   *
   * @param wordfile stream containing the wordlist, one word per line;
   *                 if null, an empty stopword table is installed
   * @param encoding character encoding of the stream (e.g. "windows-1250",
   *                 "ISO-8859-2"); if null, the platform default is used
   */
  public void loadStopWords( InputStream wordfile, String encoding ) {
    if ( wordfile == null ) {
      stoptable = new HashSet();
      return;
    }
    try {
      stoptable = new HashSet();

      InputStreamReader isr;
      if (encoding == null)
        isr = new InputStreamReader(wordfile);
      else
        isr = new InputStreamReader(wordfile, encoding);

      LineNumberReader lnr = new LineNumberReader(isr);
      String word;
      while ( ( word = lnr.readLine() ) != null ) {
        stoptable.add(word);
      }
    } catch ( IOException e ) {
      // On an I/O error the table is set to null; tokenStream() will then fail
      // until stop words are successfully loaded again.
      stoptable = null;
    }
  }

  /**
   * Creates a TokenStream which tokenizes all the text in the provided Reader.
   *
   * @return a TokenStream built from a StandardTokenizer filtered with
   *         StandardFilter, LowerCaseFilter, and StopFilter
   */
  public final TokenStream tokenStream( String fieldName, Reader reader ) {
    TokenStream result = new StandardTokenizer( reader );
    result = new StandardFilter( result );
    result = new LowerCaseFilter( result );
    result = new StopFilter( result, stoptable );
    return result;
  }
}
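
// ---------------------------------------------------------------------------
// Usage sketch (not part of the original Lucene source): a minimal example of
// driving CzechAnalyzer directly, assuming the Lucene 2.x TokenStream API that
// the class above is written against (TokenStream.next() returning a Token,
// Token.termText()). The class name, the field name "content", and the sample
// sentence are illustrative only.
// ---------------------------------------------------------------------------
class CzechAnalyzerUsageExample {
  public static void main(String[] args) throws IOException {
    // Analyze a short Czech sentence with the default stop word set.
    Analyzer analyzer = new CzechAnalyzer();
    TokenStream stream = analyzer.tokenStream(
        "content", new StringReader("Toto je uk\u00e1zkov\u00e1 v\u011bta o Lucene."));

    org.apache.lucene.analysis.Token token;
    while ( ( token = stream.next() ) != null ) {
      // Stop words such as "je" and "o" are dropped by StopFilter; the
      // remaining tokens have already been lower-cased by LowerCaseFilter.
      System.out.println(token.termText());
    }
    stream.close();
  }
}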