1 package org.apache.lucene.analysis.ru; 2 3 18 19 import junit.framework.TestCase; 20 21 import java.io.*; 22 23 import org.apache.lucene.analysis.TokenStream; 24 import org.apache.lucene.analysis.Token; 25 26 32 33 public class TestRussianAnalyzer extends TestCase 34 { 35 private InputStreamReader inWords; 36 37 private InputStreamReader sampleUnicode; 38 39 private Reader inWordsKOI8; 40 41 private Reader sampleKOI8; 42 43 private Reader inWords1251; 44 45 private Reader sample1251; 46 47 private File dataDir; 48 49 protected void setUp() throws Exception 50 { 51 dataDir = new File(System.getProperty("dataDir")); 52 } 53 54 public void testUnicode() throws IOException 55 { 56 RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian); 57 inWords = 58 new InputStreamReader( 59 new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUnicode.txt")), 60 "Unicode"); 61 62 sampleUnicode = 63 new InputStreamReader( 64 new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resUnicode.htm")), 65 "Unicode"); 66 67 TokenStream in = ra.tokenStream("all", inWords); 68 69 RussianLetterTokenizer sample = 70 new RussianLetterTokenizer( 71 sampleUnicode, 72 RussianCharsets.UnicodeRussian); 73 74 for (;;) 75 { 76 Token token = in.next(); 77 78 if (token == null) 79 { 80 break; 81 } 82 83 Token sampleToken = sample.next(); 84 assertEquals( 85 "Unicode", 86 token.termText(), 87 sampleToken == null 88 ? null 89 : sampleToken.termText()); 90 } 91 92 inWords.close(); 93 sampleUnicode.close(); 94 } 95 96 public void testKOI8() throws IOException 97 { 98 RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8); 100 inWordsKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testKOI8.txt")), "iso-8859-1"); 102 103 sampleKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resKOI8.htm")), "iso-8859-1"); 104 105 TokenStream in = ra.tokenStream("all", inWordsKOI8); 106 RussianLetterTokenizer sample = 107 new RussianLetterTokenizer( 108 sampleKOI8, 109 RussianCharsets.KOI8); 110 111 for (;;) 112 { 113 Token token = in.next(); 114 115 if (token == null) 116 { 117 break; 118 } 119 120 Token sampleToken = sample.next(); 121 assertEquals( 122 "KOI8", 123 token.termText(), 124 sampleToken == null 125 ? null 126 : sampleToken.termText()); 127 128 } 129 130 inWordsKOI8.close(); 131 sampleKOI8.close(); 132 } 133 134 public void test1251() throws IOException 135 { 136 inWords1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/test1251.txt")), "iso-8859-1"); 138 139 sample1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/res1251.htm")), "iso-8859-1"); 140 141 RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251); 142 TokenStream in = ra.tokenStream("", inWords1251); 143 RussianLetterTokenizer sample = 144 new RussianLetterTokenizer( 145 sample1251, 146 RussianCharsets.CP1251); 147 148 for (;;) 149 { 150 Token token = in.next(); 151 152 if (token == null) 153 { 154 break; 155 } 156 157 Token sampleToken = sample.next(); 158 assertEquals( 159 "1251", 160 token.termText(), 161 sampleToken == null 162 ? null 163 : sampleToken.termText()); 164 165 } 166 167 inWords1251.close(); 168 sample1251.close(); 169 } 170 } 171 | Popular Tags |